[DSP] Section 3

[REF | AudioSignalProcessingForML]

7. Time-domain audio features

Time-domain features

Amplitude envelope (AE)
Root-Mean-Square energy (RMS)
Zero-crossing rate (ZCR)
…

Amplitude Envelope

Max amplitude value of all samples in a frame
$AE_t = \max_{k = tK}^{(t+1)K - 1} s(k)$
Gives rough idea of loudness
Sensitive to outliers
Onset detection, music genre classification

Root-mean-square energy

RMS of all samples in a frame
- $RMS_t = \sqrt{{1 \over K} \sum^{(t+1)K-1}_{k=tK} s(k)^2}$
Indicator of loudness
Less sensitive to outliers than AE
Audio segmentation, music genre classification

Zero crossing rate

Number of times a signal crosses the horizontal axis
- $ZCR_t = {1 \over 2} \sum^{(t+1)K-1}_{k=tK} | sgn(s(k)) - sgn(s(k+1)) |$
- $sgn(s(k))$: Sign function
  - $s(k)>0 \rightarrow +1$
  - $s(k)<0 \rightarrow -1$
  - $s(k)=0 \rightarrow 0$

Zero crossing rate applications

Recognition of percussive vs pitched sounds
Monophonic pitch estimation
Voiced/unvoiced decision for speech signals

8. Implementing the amplitude envelope

Amplitude envelope

  
import os
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt

  
# Build the paths to the three example recordings used in this section.
audio_files = "./raw/8_audio"
debussy_file, redhot_file, duke_file = (
    os.path.join(audio_files, wav_name)
    for wav_name in ("debussy.wav", "redhot.wav", "duke.wav")
)

  
# Display an IPython audio object for each recording (one notebook cell each).
ipd.Audio(debussy_file)

  
ipd.Audio(redhot_file)

  
ipd.Audio(duke_file)

  
# Decode the recordings into sample arrays. librosa.load returns a
# (samples, sampling_rate) pair; the rate is kept only once, as `sr`.
# NOTE(review): relies on librosa's default resampling/mono settings and
# assumes all three files come back at the same rate -- confirm.
debussy, sr = librosa.load(debussy_file)
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)

# echo the decoded Debussy samples
debussy

array([-0.01742554, -0.03567505, -0.04995728, ...,  0.00912476,
        0.00866699,  0.00964355], dtype=float32)

  
debussy.size # total samples

661500

  
# duration of 1 sample
sample_duration = 1 / sr
print(f"Duration of 1 sample is: {sample_duration:.6f} secpnds")

Duration of 1 sample is: 0.000045 secpnds

  
# total playing time in seconds = number of samples x duration of one sample
duration = len(debussy) * sample_duration
print(f"Duration of signal is: {duration:.2f} seconds")

Duration of signal is: 30.00 seconds

  
# visualise the waveforms
# plt.subplots creates the figure itself, so no separate plt.figure call is
# made beforehand (one would only leave behind an empty, unused figure).
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

librosa.display.waveshow(debussy, ax=ax[0])
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].set_title("Duke")

# the y axis is shared, so one limit applies to all three panels
plt.ylim((-1, 1))
plt.show()

  
FRAME_SIZE = 1024
HOP_LENGTH = 512


# calculate the amplitude envelope
def amplitude_envelope(signal, frame_size, hop_length):
    """Return the amplitude envelope of `signal`, one value per frame.

    Frames start every `hop_length` samples and span up to `frame_size`
    samples; frames near the end of the signal are simply shorter. Each
    output value is the maximum sample value of its frame (the signed
    maximum, not the maximum absolute value).

    Args:
        signal: 1-D sequence of samples (a NumPy array in this notebook).
        frame_size: Maximum number of samples per frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        np.ndarray with one entry per frame start, i.e. one entry for every
        index in ``range(0, len(signal), hop_length)``.
    """
    frame_maxima = []
    # calculate AE for each frame
    for start in range(0, len(signal), hop_length):
        # np.max reduces in native code and always propagates NaN, whereas
        # the builtin max gives a NaN result that depends on sample order.
        frame_maxima.append(np.max(signal[start:start + frame_size]))
    return np.array(frame_maxima)

def fancy_amplitude_envelope(signal, frame_size, hop_length):
    """Compute the amplitude envelope of `signal` with one comprehension.

    Frames start every `hop_length` samples and span up to `frame_size`
    samples (shorter at the end of the signal); each output value is the
    maximum sample value of its frame.

    Args:
        signal: NumPy array of samples (its ``size`` attribute is used).
        frame_size: Maximum number of samples per frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        np.ndarray with one maximum per frame start.
    """
    # np.max runs in native code and propagates NaN regardless of sample
    # order, which the builtin max does not.
    return np.array(
        [
            np.max(signal[start:start + frame_size])
            for start in range(0, signal.size, hop_length)
        ]
    )

  
# Amplitude envelope of the Debussy recording; the function returns one value
# per hop, so the length echoed below is the number of frames.
ae_debussy = amplitude_envelope(debussy, FRAME_SIZE, HOP_LENGTH)
len(ae_debussy)

1292

  
# Same computation through the comprehension-based variant ...
fancy_ae_debussy = fancy_amplitude_envelope(debussy, FRAME_SIZE, HOP_LENGTH)

  
# ... and an element-wise check that both implementations agree.
(ae_debussy == fancy_ae_debussy).all()

True

  
# amplitude envelopes of the other two recordings, with the same framing
ae_redhot, ae_duke = (
    amplitude_envelope(track, FRAME_SIZE, HOP_LENGTH) for track in (redhot, duke)
)

  
# visualise amplitude envelope for all audio files
# Frame indices are converted to time stamps so the envelope can be drawn on
# the same time axis as the waveform.
# NOTE(review): `t` is derived from the Debussy envelope only and reused for
# the other two tracks, which assumes all three have the same frame count.
frames = range(0, ae_debussy.size)
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

# each panel: waveform first, then the envelope in red on top of it
librosa.display.waveshow(debussy, ax=ax[0])
ax[0].plot(t, ae_debussy, color="r")
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].plot(t, ae_redhot, color="r")
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].plot(t, ae_duke, color="r")
ax[2].set_title("Duke")

plt.ylim((-1, 1))
plt.show()

9. RMS energy and zero-crossing rate

  
# Paths to the same three recordings, rebuilt for this section.
audio_files = "./raw/8_audio"
debussy_file, redhot_file, duke_file = [
    os.path.join(audio_files, wav_name)
    for wav_name in ("debussy.wav", "redhot.wav", "duke.wav")
]

# Decode the three recordings again; the sampling rate returned by
# librosa.load is discarded here.
debussy, _ = librosa.load(debussy_file)
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)

  
# NOTE(review): called with no arguments here; librosa's documentation says
# rms needs either `y` or `S`, so this cell presumably only served to pull up
# the signature/docstring in the notebook -- confirm before re-running.
librosa.feature.rms()

  
# Extract RMSE with librosa
FRAME_LENGTH = 1024
HOP_LENGTH = 512

# One call per recording with identical framing; the result is indexed with
# [0] to keep its first row.
rms_debussy, rms_redhot, rms_duke = (
    librosa.feature.rms(y=track, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
    for track in (debussy, redhot, duke)
)

  
# plot the RMSE for all the music pieces
# The RMS curve (red) is drawn on top of each waveform, on a time axis
# computed from the frame indices.
# NOTE(review): `t` is derived from the Debussy RMS vector only and reused for
# the other two tracks, which assumes all three have the same frame count.
frames = range(0, rms_debussy.size)
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

librosa.display.waveshow(debussy, ax=ax[0])
ax[0].plot(t, rms_debussy, color="r")
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].plot(t, rms_redhot, color="r")
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].plot(t, rms_duke, color="r")
ax[2].set_title("Duke")

plt.ylim((-1, 1))
plt.show()

  
def rms(signal, frames_length, hop_length):
    """Return the root-mean-square energy of `signal`, one value per frame.

    Frames start every `hop_length` samples and span up to `frames_length`
    samples. The sum of squares is always divided by `frames_length`, even
    for the shorter frames at the end of the signal.

    Args:
        signal: NumPy array of samples.
        frames_length: Nominal number of samples per frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        np.ndarray with one RMS value per frame start.
    """

    def frame_rms(start):
        # energy of one frame, normalised by the nominal frame length
        window = signal[start:start + frames_length]
        return np.sqrt(np.sum(window ** 2) / frames_length)

    return np.array([frame_rms(start) for start in range(0, len(signal), hop_length)])

  
# RMS energy from the hand-written implementation, same framing as librosa's
rms1_debussy, rms1_redhot, rms1_duke = (
    rms(track, FRAME_LENGTH, HOP_LENGTH) for track in (debussy, redhot, duke)
)

  
# Compare the two RMS implementations on top of each waveform:
# librosa's result in red, the hand-written rms() result in yellow.
# `t` is reused from the earlier RMS plotting cell.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

librosa.display.waveshow(debussy, ax=ax[0])
ax[0].plot(t, rms_debussy, color="r")
ax[0].plot(t, rms1_debussy, color="y")
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].plot(t, rms_redhot, color="r")
ax[1].plot(t, rms1_redhot, color="y")
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].plot(t, rms_duke, color="r")
ax[2].plot(t, rms1_duke, color="y")
ax[2].set_title("Duke")

plt.ylim((-1, 1))
plt.show()

  
# Zero-crossing rate
# One librosa call per recording with identical framing; [0] keeps the first
# row of each result.
zcr_debussy, zcr_redhot, zcr_duke = (
    librosa.feature.zero_crossing_rate(
        y=track, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH
    )[0]
    for track in (debussy, redhot, duke)
)

  
# visualize ZCR for all the music pieces
# Each ZCR vector is multiplied by FRAME_LENGTH before plotting.
# NOTE(review): presumably librosa returns a per-frame fraction and the
# scaling turns it into a crossing count per frame -- confirm against the docs.
# `t` is reused from the RMS cells, so all vectors are assumed to have the
# same number of frames.
plt.figure(figsize=(8, 5))

plt.plot(t, zcr_debussy*FRAME_LENGTH, color="r")
plt.plot(t, zcr_redhot*FRAME_LENGTH, color="y")
plt.plot(t, zcr_duke*FRAME_LENGTH, color="b")

# legend entries follow the order of the plot calls above
plt.legend(["Debussy", "RHCP", "Duke"])
plt.ylim((0, 500))
plt.show()

  
# Switch to the voice/noise examples for this part of the section.
audio_files = "./raw/9_audio"
voice_file = os.path.join(audio_files, "voice.wav")
noise_file = os.path.join(audio_files, "noise.wav")

# Both files are loaded with duration=15, i.e. truncated to the same length;
# the returned sampling rate is discarded.
voice, _ = librosa.load(voice_file, duration=15)
noise, _ = librosa.load(noise_file, duration=15)

  
# Display an IPython audio object for each file (one notebook cell each).
# These take the file paths, not the 15-second arrays loaded above.
ipd.Audio(voice_file)

  
ipd.Audio(noise_file)

  
# Zero-crossing rate of the voice and noise recordings.
# The signal is passed as `y=` like in the music ZCR cell; recent librosa
# releases accept it by keyword only, so the positional form fails there.
zcr_voice = librosa.feature.zero_crossing_rate(y=voice, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
zcr_noise = librosa.feature.zero_crossing_rate(y=noise, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

  
# Time stamp of every ZCR frame. The hop length used for extraction is passed
# explicitly, as in the other cells, so the time axis stays correct if
# HOP_LENGTH is ever changed away from librosa's default.
frames = range(len(zcr_voice))
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)

# voice in red, noise in yellow; values are plotted unscaled on a 0..1 axis
plt.figure(figsize=(8, 5))
plt.plot(t, zcr_voice, color="r")
plt.plot(t, zcr_noise, color="y")

plt.legend(["Voice", "Noise"])
plt.ylim((0, 1))
plt.show()

[DSP] Section 3: 오디오 데이터 특징

[DSP] Section 3

7. Time-domain audio features

Time-domain features

Amplitude Envelope

Root-mean-square energy

Zero crossing rate

Zero crossing rate applications

8. Implementing the amplitude envelope

Amplitude envelope

9. RMS energy and zero-crossing rate

Further Reading

[DSP] Section 6: 스펙토그램, MFCCs

[DSP] Section 7: Audio Features

[DSP] Section 1: 소리의 특성