[DSP] Section 3
[REF | AudioSignalProcessingForML]
7. Time-domain audio features
Time-domain features
- Amplitude envelope (AE)
- Root-Mean-Square energy (RMS)
- Zero-crossing rate (ZCR)
- …
Amplitude Envelope
- Max amplitude value of all samples in a frame
- $AE_t = \max_{k=tK}^{(t+1)K-1} s(k)$
- Gives rough idea of loudness
- Sensitive to outliers
- Onset detection, music genre classification
Root-mean-square energy
- RMS of all samples in a frame
- $RMS_t = \sqrt{{1 \over K} \sum_{k=tK}^{(t+1)K-1} s(k)^2}$
- Indicator of loudness
- Less sensitive to outliers than AE
- Audio segmentation, music genre classification
Zero crossing rate
- Number of times a signal crosses the horizontal axis
- $ZCR_t = {1 \over 2} \sum_{k=tK}^{(t+1)K-1} | sgn(s(k)) - sgn(s(k+1)) |$
- $sgn(s(k))$: Sign function
- $s(k)>0 \rightarrow +1$
- $s(k)<0 \rightarrow -1$
- $s(k)=0 \rightarrow 0$
Zero crossing rate applications
- Recognition of percussive vs pitched sounds
- Monophonic pitch estimation
- Voiced/unvoiced decision for speech signals
8. Implementing the amplitude envelope
Amplitude envelope
1
2
3
4
5
6
import os
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
1
2
3
4
5
# build the paths of the three example clips inside the audio directory
audio_files = "./raw/8_audio"
debussy_file, redhot_file, duke_file = (
    os.path.join(audio_files, clip_name)
    for clip_name in ("debussy.wav", "redhot.wav", "duke.wav")
)
1
# embed an audio player for the Debussy clip in the notebook output
ipd.Audio(debussy_file)
1
# embed an audio player for the RHCP clip in the notebook output
ipd.Audio(redhot_file)
1
# embed an audio player for the Duke clip in the notebook output
ipd.Audio(duke_file)
1
2
3
# decode each clip; librosa.load returns a (samples, sampling rate) pair
debussy, sr = librosa.load(debussy_file) # sr, mono ...
# the sampling rate is kept from the first load only and discarded for the other two
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)
1
# echo the decoded samples so the notebook prints the array
debussy
1
2
array([-0.01742554, -0.03567505, -0.04995728, ..., 0.00912476,
0.00866699, 0.00964355], dtype=float32)
1
debussy.size # total number of samples in the decoded clip
1
661500
1
2
3
# duration of 1 sample
sample_duration = 1 / sr
print(f"Duration of 1 sample is: {sample_duration:.6f} secpnds")
1
Duration of 1 sample is: 0.000045 secpnds
1
2
3
# total clip length in seconds: number of samples times the time per sample
duration = len(debussy) * sample_duration
print(f"Duration of signal is: {duration:.2f} seconds")
1
Duration of signal is: 30.00 seconds
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# visualise the waveforms
# one figure with three stacked axes that share both axes, so the clips are
# drawn on the same time and amplitude scale
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)
librosa.display.waveshow(debussy, ax=ax[0])
ax[0].set_title("Debussy")
librosa.display.waveshow(redhot, ax=ax[1])
ax[1].set_title("RHCP")
librosa.display.waveshow(duke, ax=ax[2])
ax[2].set_title("Duke")
# the y axis is shared, so this limit applies to all three panels
plt.ylim((-1, 1))
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
FRAME_SIZE = 1024  # number of samples in one analysis frame
HOP_LENGTH = 512  # step in samples between the starts of consecutive frames
# calculate the amplitude envelope
def amplitude_envelope(signal, frame_size, hop_length):
    """Return the amplitude envelope: the maximum sample value of each frame.

    A frame starts every ``hop_length`` samples and spans ``frame_size``
    samples. Frames near the end of the signal are shorter because they are
    cut off at the last sample.

    Args:
        signal: 1-D sequence of samples (NumPy array or any array-like).
        frame_size: Number of samples per frame; must be positive.
        hop_length: Number of samples between frame starts; must be positive.

    Returns:
        NumPy array holding one value per frame (empty for an empty signal).

    Raises:
        ValueError: If ``frame_size`` or ``hop_length`` is not positive.
    """
    if frame_size <= 0 or hop_length <= 0:
        raise ValueError("frame_size and hop_length must be positive")
    samples = np.asarray(signal)
    # ndarray.max reduces the frame in native code; the builtin max() would
    # iterate over the samples one by one in Python
    frame_peaks = [
        samples[start:start + frame_size].max()
        for start in range(0, len(samples), hop_length)
    ]
    return np.array(frame_peaks)
def fancy_amplitude_envelope(signal, frame_size, hop_length):
    """Return the per-frame maximum of ``signal`` using a list comprehension.

    A frame starts every ``hop_length`` samples and spans ``frame_size``
    samples; the last frames are cut off at the end of the signal.

    Args:
        signal: 1-D sequence of samples (NumPy array or any array-like).
        frame_size: Number of samples per frame; must be positive.
        hop_length: Number of samples between frame starts; must be positive.

    Returns:
        NumPy array holding one value per frame (empty for an empty signal).

    Raises:
        ValueError: If ``frame_size`` or ``hop_length`` is not positive.
    """
    if frame_size <= 0 or hop_length <= 0:
        raise ValueError("frame_size and hop_length must be positive")
    # converting first lets plain sequences be passed in, not only arrays
    samples = np.asarray(signal)
    return np.array([
        samples[start:start + frame_size].max()
        for start in range(0, samples.size, hop_length)
    ])
1
2
# amplitude envelope of the Debussy clip, one value per frame
ae_debussy = amplitude_envelope(debussy, FRAME_SIZE, HOP_LENGTH)
len(ae_debussy)  # number of frames produced
1
1292
1
# same clip and framing, computed with the list-comprehension variant
fancy_ae_debussy = fancy_amplitude_envelope(debussy, FRAME_SIZE, HOP_LENGTH)
1
# element-wise comparison: True only if both implementations agree on every frame
(ae_debussy == fancy_ae_debussy).all()
1
True
1
2
# amplitude envelopes of the two remaining clips, using the same framing
ae_redhot, ae_duke = (
    amplitude_envelope(clip, FRAME_SIZE, HOP_LENGTH) for clip in (redhot, duke)
)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# visualise amplitude envelope for all audio files
# time stamp of every frame, used as the x coordinate of the envelope curves
frames = range(ae_debussy.size)
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)
panels = (
    (debussy, ae_debussy, "Debussy"),
    (redhot, ae_redhot, "RHCP"),
    (duke, ae_duke, "Duke"),
)
# one panel per clip: waveform first, envelope drawn over it in red
for axis, (clip, envelope, title) in zip(ax, panels):
    librosa.display.waveshow(clip, ax=axis)
    axis.plot(t, envelope, color="r")
    axis.set_title(title)
plt.ylim((-1, 1))
plt.show()
9. RMS energy and zero-crossing rate
1
2
3
4
5
6
7
8
9
# load audio files
# same directory and clips as in the amplitude-envelope section
audio_files = "./raw/8_audio"
debussy_file = os.path.join(audio_files, "debussy.wav")
redhot_file = os.path.join(audio_files, "redhot.wav")
duke_file = os.path.join(audio_files, "duke.wav")
# only the samples are kept; the second value returned by librosa.load is discarded
debussy, _ = librosa.load(debussy_file)
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)
1
# NOTE(review): no signal is passed here, so this call is expected to fail
# instead of computing anything; presumably the cell was only meant to look up
# the function's documentation (e.g. help(librosa.feature.rms)) — confirm
librosa.feature.rms()
1
2
3
4
5
6
7
# Extract RMSE with librosa
FRAME_LENGTH = 1024  # number of samples in one analysis frame
HOP_LENGTH = 512  # step in samples between the starts of consecutive frames
# [0] keeps the first row of the result — presumably librosa returns shape
# (1, n_frames); confirm against the librosa documentation
rms_debussy = librosa.feature.rms(y=debussy, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
rms_redhot = librosa.feature.rms(y=redhot, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
rms_duke = librosa.feature.rms(y=duke, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# plot the RMSE for all the music pieces
# time stamp of every frame, used as the x coordinate of the RMS curves
frames = range(rms_debussy.size)
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)
panels = (
    (debussy, rms_debussy, "Debussy"),
    (redhot, rms_redhot, "RHCP"),
    (duke, rms_duke, "Duke"),
)
# one panel per clip: waveform first, RMS energy drawn over it in red
for axis, (clip, energy, title) in zip(ax, panels):
    librosa.display.waveshow(clip, ax=axis)
    axis.plot(t, energy, color="r")
    axis.set_title(title)
plt.ylim((-1, 1))
plt.show()
1
2
3
4
5
6
def rms(signal, frames_length, hop_length):
    """Return the root-mean-square energy of each frame of ``signal``.

    A frame starts every ``hop_length`` samples and covers up to
    ``frames_length`` samples. The sum of squares is always divided by
    ``frames_length``, including for the shorter frames at the end of the
    signal.
    """
    frame_starts = range(0, len(signal), hop_length)
    energies = [
        np.sqrt(np.sum(signal[start:start + frames_length]**2) / frames_length)
        for start in frame_starts
    ]
    return np.array(energies)
1
2
3
# RMS energy from the hand-written rms(), with the same framing as the librosa call
rms1_debussy = rms(debussy, FRAME_LENGTH, HOP_LENGTH)
rms1_redhot = rms(redhot, FRAME_LENGTH, HOP_LENGTH)
rms1_duke = rms(duke, FRAME_LENGTH, HOP_LENGTH)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# overlay both RMS curves on each waveform: librosa's in red, the manual one in yellow
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)
comparisons = (
    (debussy, rms_debussy, rms1_debussy, "Debussy"),
    (redhot, rms_redhot, rms1_redhot, "RHCP"),
    (duke, rms_duke, rms1_duke, "Duke"),
)
for axis, (clip, librosa_rms, manual_rms, title) in zip(ax, comparisons):
    librosa.display.waveshow(clip, ax=axis)
    axis.plot(t, librosa_rms, color="r")
    axis.plot(t, manual_rms, color="y")
    axis.set_title(title)
plt.ylim((-1, 1))
plt.show()
1
2
3
4
# Zero-crossing rate
# same framing as the RMS extraction; [0] keeps the first row of the result —
# presumably librosa returns shape (1, n_frames); confirm
zcr_debussy = librosa.feature.zero_crossing_rate(y=debussy, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
zcr_redhot = librosa.feature.zero_crossing_rate(y=redhot, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
zcr_duke = librosa.feature.zero_crossing_rate(y=duke, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
1
2
3
4
5
6
7
8
9
10
# visualize ZCR for all the music pieces
# `t` is the frame time axis computed in the RMS plotting cell
plt.figure(figsize=(8, 5))
# scaling the rate by FRAME_LENGTH presumably turns it into a crossing count
# per frame — confirm against librosa's normalisation
plt.plot(t, zcr_debussy*FRAME_LENGTH, color="r")
plt.plot(t, zcr_redhot*FRAME_LENGTH, color="y")
plt.plot(t, zcr_duke*FRAME_LENGTH, color="b")
# legend entries follow the order of the plot calls above
plt.legend(["Debussy", "RHCP", "Duke"])
plt.ylim((0, 500))
plt.show()
1
2
3
4
5
6
# switch to the audio directory of this section; this rebinds audio_files
audio_files = "./raw/9_audio"
voice_file = os.path.join(audio_files, "voice.wav")
noise_file = os.path.join(audio_files, "noise.wav")
# duration=15 presumably limits loading to the first 15 seconds — confirm
voice, _ = librosa.load(voice_file, duration=15)
noise, _ = librosa.load(noise_file, duration=15)
1
# the player is given the file path, not the `voice` array loaded above
ipd.Audio(voice_file)
1
# the player is given the file path, not the `noise` array loaded above
ipd.Audio(noise_file)