Home [DSP] Section 3: 오디오 데이터 특징
Post
Cancel

[DSP] Section 3: 오디오 데이터 특징

[DSP] Section 3

[REF | AudioSignalProcessingForML]

7. Time-domain audio features

Time-domain features

  • Amplitude envelope (AE)
  • Root-Mean-Square energy (RMS)
  • Zero-crossing rate (ZCR)

Amplitude Envelope

  • Max amplitude value of all samples in a frame
  • $AE_t = \max_{k=tK}^{(t+1)K-1} s(k)$
  • Gives rough idea of loudness
  • Sensitive to outliers
  • Onset detection, music genre classification

Root-mean-square energy

  • RMS of all samples in a frame
    • $RMS_t = \sqrt{\frac{1}{K} \sum_{k=tK}^{(t+1)K-1} s(k)^2}$
  • Indicator of loudness
  • Less sensitive to outliers than AE
  • Audio segmentation, music genre classification

Zero crossing rate

  • Number of times a signal crosses the horizontal axis
    • $ZCR_t = \frac{1}{2} \sum_{k=tK}^{(t+1)K-1} |sgn(s(k)) - sgn(s(k+1))|$
    • $sgn(s(k))$: Sign function
      • $s(k)>0 \rightarrow +1$
      • $s(k)<0 \rightarrow -1$
      • $s(k)=0 \rightarrow 0$

Zero crossing rate applications

  • Recognition of percussive vs pitched sounds
  • Monophonic pitch estimation
  • Voiced/unvoiced decision for speech signals

8. Implementing the amplitude envelope

Amplitude envelope

1
2
3
4
5
6
import os
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
1
2
3
4
5
# load audio files
audio_files = "./raw/8_audio"
debussy_file = os.path.join(audio_files, "debussy.wav")
redhot_file = os.path.join(audio_files, "redhot.wav")
duke_file = os.path.join(audio_files, "duke.wav")
1
ipd.Audio(debussy_file)
1
ipd.Audio(redhot_file)
1
ipd.Audio(duke_file)
1
2
3
debussy, sr = librosa.load(debussy_file) # sr, mono ...
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)
1
debussy
1
2
array([-0.01742554, -0.03567505, -0.04995728, ...,  0.00912476,
        0.00866699,  0.00964355], dtype=float32)
1
debussy.size # total samples
1
661500
1
2
3
# duration of 1 sample
sample_duration = 1 / sr
print(f"Duration of 1 sample is: {sample_duration:.6f} secpnds")
1
Duration of 1 sample is: 0.000045 secpnds
1
2
3
# duration of the audio signal in seconds
duration = sample_duration * len(debussy)
print(f"Duration of signal is: {duration:.2f} seconds")
1
Duration of signal is: 30.00 seconds
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# visualise the waveforms
# plt.subplots() creates its own figure, so no separate plt.figure() call is
# needed (it would only leave behind an extra, empty figure).
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

librosa.display.waveshow(debussy, ax=ax[0])
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].set_title("Duke")

# the y-axis is shared (sharey=True), so this limit applies to all subplots
plt.ylim((-1, 1))
plt.show()
1
<Figure size 1500x1700 with 0 Axes>

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
FRAME_SIZE = 1024
HOP_LENGTH = 512

# calculate the amplitude envelope
def amplitude_envelope(signal, frame_size, hop_length):
    """Return the amplitude envelope of ``signal``, one value per frame.

    Frames start every ``hop_length`` samples and span up to ``frame_size``
    samples; frames near the end of the signal are shorter because the
    slice is truncated at the signal boundary.

    Args:
        signal: 1-D sequence of samples (e.g. a NumPy array).
        frame_size: Maximum number of samples per frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        NumPy array holding the maximum sample value of each frame. A frame
        containing NaN yields NaN, regardless of where the NaN sits.
    """
    envelope = []
    # calculate AE for each frame
    for start in range(0, len(signal), hop_length):
        # np.max reduces the frame in native code, whereas the builtin max
        # iterates over the samples one by one in Python
        envelope.append(np.max(signal[start:start + frame_size]))
    return np.array(envelope)

def fancy_amplitude_envelope(signal, frame_size, hop_length):
    """Return the amplitude envelope of ``signal`` using a list comprehension.

    Frames start every ``hop_length`` samples and span up to ``frame_size``
    samples; the last frames are shorter because slicing stops at the end
    of the signal.

    Args:
        signal: 1-D sequence of samples (NumPy array or plain sequence).
        frame_size: Maximum number of samples per frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        NumPy array holding the maximum sample value of each frame.
    """
    # len() instead of .size so that plain sequences (e.g. lists) work too
    frame_starts = range(0, len(signal), hop_length)
    return np.array([np.max(signal[i:i + frame_size]) for i in frame_starts])
1
2
ae_debussy = amplitude_envelope(debussy, FRAME_SIZE, HOP_LENGTH)
len(ae_debussy)
1
1292
1
fancy_ae_debussy = fancy_amplitude_envelope(debussy, FRAME_SIZE, HOP_LENGTH)
1
(ae_debussy == fancy_ae_debussy).all()
1
True
1
2
ae_redhot = amplitude_envelope(redhot, FRAME_SIZE, HOP_LENGTH)
ae_duke = amplitude_envelope(duke, FRAME_SIZE, HOP_LENGTH)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# visualise amplitude envelope for all audio files
frames = range(0, ae_debussy.size)
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

librosa.display.waveshow(debussy, ax=ax[0])
ax[0].plot(t, ae_debussy, color="r")
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].plot(t, ae_redhot, color="r")
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].plot(t, ae_duke, color="r")
ax[2].set_title("Duke")

plt.ylim((-1, 1))
plt.show()

png

9. RMS energy and zero-crossing rate

1
2
3
4
5
6
7
8
9
# load audio files
audio_files = "./raw/8_audio"
debussy_file = os.path.join(audio_files, "debussy.wav")
redhot_file = os.path.join(audio_files, "redhot.wav")
duke_file = os.path.join(audio_files, "duke.wav")

debussy, _ = librosa.load(debussy_file)
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)
1
librosa.feature.rms()
1
2
3
4
5
6
7
# Extract RMSE with librosa
FRAME_LENGTH = 1024
HOP_LENGTH = 512

rms_debussy = librosa.feature.rms(y=debussy, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
rms_redhot = librosa.feature.rms(y=redhot, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
rms_duke = librosa.feature.rms(y=duke, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# plot the RMS energy (red) on top of the waveform for all the music pieces
frames = range(rms_debussy.size)
t = librosa.frames_to_time(frames, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

# (title, waveform, RMS curve) for each subplot, top to bottom
pieces = [
    ("Debussy", debussy, rms_debussy),
    ("RHCP", redhot, rms_redhot),
    ("Duke", duke, rms_duke),
]
for axis, (title, waveform, rms_curve) in zip(ax, pieces):
    librosa.display.waveshow(waveform, ax=axis)
    axis.plot(t, rms_curve, color="r")
    axis.set_title(title)

plt.ylim((-1, 1))
plt.show()

png

1
2
3
4
5
6
def rms(signal, frames_length, hop_length):
    """Return the root-mean-square energy of ``signal``, one value per frame.

    A frame starts every ``hop_length`` samples and covers up to
    ``frames_length`` samples. The sum of squares is always divided by
    ``frames_length``, including for the shorter frames at the end of the
    signal.
    """
    frame_starts = range(0, len(signal), hop_length)
    energies = [
        np.sqrt(np.sum(signal[start:start + frames_length] ** 2) / frames_length)
        for start in frame_starts
    ]
    return np.array(energies)
1
2
3
rms1_debussy = rms(debussy, FRAME_LENGTH, HOP_LENGTH)
rms1_redhot = rms(redhot, FRAME_LENGTH, HOP_LENGTH)
rms1_duke = rms(duke, FRAME_LENGTH, HOP_LENGTH)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 10), sharex=True, sharey=True)

librosa.display.waveshow(debussy, ax=ax[0])
ax[0].plot(t, rms_debussy, color="r")
ax[0].plot(t, rms1_debussy, color="y")
ax[0].set_title("Debussy")

librosa.display.waveshow(redhot, ax=ax[1])
ax[1].plot(t, rms_redhot, color="r")
ax[1].plot(t, rms1_redhot, color="y")
ax[1].set_title("RHCP")

librosa.display.waveshow(duke, ax=ax[2])
ax[2].plot(t, rms_duke, color="r")
ax[2].plot(t, rms1_duke, color="y")
ax[2].set_title("Duke")

plt.ylim((-1, 1))
plt.show()

png

1
2
3
4
# Zero-crossing rate
zcr_debussy = librosa.feature.zero_crossing_rate(y=debussy, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
zcr_redhot = librosa.feature.zero_crossing_rate(y=redhot, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
zcr_duke = librosa.feature.zero_crossing_rate(y=duke, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
1
2
3
4
5
6
7
8
9
10
# visualize ZCR for all the music pieces
plt.figure(figsize=(8, 5))

plt.plot(t, zcr_debussy*FRAME_LENGTH, color="r")
plt.plot(t, zcr_redhot*FRAME_LENGTH, color="y")
plt.plot(t, zcr_duke*FRAME_LENGTH, color="b")

plt.legend(["Debussy", "RHCP", "Duke"])
plt.ylim((0, 500))
plt.show()

png

1
2
3
4
5
6
audio_files = "./raw/9_audio"
voice_file = os.path.join(audio_files, "voice.wav")
noise_file = os.path.join(audio_files, "noise.wav")

voice, _ = librosa.load(voice_file, duration=15)
noise, _ = librosa.load(noise_file, duration=15)
1
ipd.Audio(voice_file)
1
ipd.Audio(noise_file)
1
2
zcr_voice = librosa.feature.zero_crossing_rate(voice, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
zcr_noise = librosa.feature.zero_crossing_rate(noise, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
1
2
3
4
5
6
7
8
9
10
frames = range(len(zcr_voice))
t = librosa.frames_to_time(frames)

plt.figure(figsize=(8, 5))
plt.plot(t, zcr_voice, color="r")
plt.plot(t, zcr_noise, color="y")

plt.legend(["Voice", "Noise"])
plt.ylim((0, 1))
plt.show()

png

This post is licensed under CC BY 4.0 by the author.

[SP] Linux File

[DSP] Section 4: 푸리에 변환

Comments powered by Disqus.