[DSP] Section 6
[Ref | AudioSignalProcessingForML]
1
2
3
4
5
6
7
8
9
10
11
| import os
import librosa
import librosa.display
import IPython.display as ipd
import numpy as np
import matplotlib.pyplot as plt
# Relative paths of the four example recordings used in the cells below.
scale_file, debussy_file, redhot_file, duke_file = (
    "./audio/scale.wav",
    "./audio/debussy.wav",
    "./audio/redhot.wav",
    "./audio/duke.wav",
)
|
1
2
3
4
| ipd.Audio(scale_file)
# Swap in one of the lines below to listen to a different recording instead:
# ipd.Audio(debussy_file)
# ipd.Audio(redhot_file)
# ipd.Audio(duke_file)
|
1
2
3
4
| scale, sr = librosa.load(scale_file)
# Only the sample rate returned for the scale recording is kept; the rates
# returned for the other files are discarded and `sr` is reused when their
# spectrograms are plotted later on.
debussy, _ = librosa.load(debussy_file)
redhot, _ = librosa.load(redhot_file)
duke, _ = librosa.load(duke_file)
|
1
2
3
4
5
6
7
| FRAME_SIZE = 2048  # passed to the STFT as n_fft
HOP_SIZE = 512  # passed to the STFT as hop_length
S_scale = librosa.stft(scale, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
# The first axis holds FRAME_SIZE / 2 + 1 = 1025 frequency bins, the second
# axis holds one column per frame.
print(f"S_scale shape: {S_scale.shape}")
# Each STFT coefficient is a complex number.
print(f"Type: {type(S_scale[0][0])}")
|
1
2
| S_scale shape: (1025, 342)
Type: <class 'numpy.complex64'>
|
Calculating the spectrogram
1
2
3
4
| Y_scale = np.abs(S_scale)**2  # squared magnitude: complex STFT -> real-valued power
print(f"Y_scale shape: {Y_scale.shape}")
print(f"Type: {type(Y_scale[0][0])}")
|
1
2
| Y_scale shape: (1025, 342)
Type: <class 'numpy.float32'>
|
Visualizing the spectrogram
1
2
3
4
| def plot_spectrogram(Y, sr, hop_length, y_axis="linear"):
plt.figure(figsize=(7, 3))
librosa.display.specshow(Y, sr=sr, hop_length=hop_length, x_axis="time", y_axis=y_axis)
plt.colorbar(format="%+2.f")
|
1
| plot_spectrogram(Y_scale, sr, HOP_SIZE)  # raw power values on the default linear frequency axis
|
Log-Amplitude Spectrogram
1
2
3
| Y_log_scale = librosa.power_to_db(Y_scale)  # convert the power values to decibels
plot_spectrogram(Y_log_scale, sr, HOP_SIZE)
|
Log-Freq. Spectrogram
1
| plot_spectrogram(Y_log_scale, sr, HOP_SIZE, y_axis="log")  # same dB data, logarithmic frequency axis
|
Visualising songs from different genres
1
2
3
4
5
6
7
8
9
10
11
| S_debussy = librosa.stft(debussy, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
S_redhot = librosa.stft(redhot, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
S_duke = librosa.stft(duke, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
Y_debussy = librosa.power_to_db(np.abs(S_debussy)**2)
Y_redhot = librosa.power_to_db(np.abs(S_redhot)**2)
Y_duke = librosa.power_to_db(np.abs(S_duke)**2)
plot_spectrogram(Y_debussy, sr, HOP_SIZE, "log")
plot_spectrogram(Y_redhot, sr, HOP_SIZE, "log")
plot_spectrogram(Y_duke, sr, HOP_SIZE, "log")
|
17. Mel Spectrogram Explained Easily
Psychoacoustic experiment
- 1st sample: C2 - C4 -> (65 - 262Hz)
- 2nd sample: G6 - A6 -> (1568 - 1760Hz)
- Humans perceive frequency logarithmically
Ideal audio feature
- Time-freq. representation
- Perceptually-relevant amplitude representation
- Perceptually-relevant frequency representation
-> Mel spectrograms
Mel-scale
\(m = 2595 \cdot \log_{10}\left(1 + {f \over 700}\right)\) \(f = 700 (10^{m/2595} - 1)\)
- Logarithmic scale
- Equal distances on the scale have same “perceptual” distance
- 1000 Hz = 1000 Mel
- Extract STFT
- Convert amplitude to DBs
- Convert freq. to Mel scale
Convert freq. to Mel scale
- Choose # of mel bands
- Construct mel filter banks
- Apply mel filter banks to spectrogram
How many mel bands?
- Depends on the problem -> it is a hyperparameter
Mel filter banks
- Convert lowest/highest freq. to Mel
- $ m = 2595 \cdot \log_{10}\left(1+{f \over 700}\right) $
- Create # bands equally spaced points
- Convert points back to Hertz
- $ f = 700(10^{m/2595} - 1) $
- Round to nearest freq. bin
- Create triangular filters
Mel filter banks’ shape
- (# bands, framesize / 2 + 1)
Applying mel filter banks to spectrogram
- M = (# bands, framesize / 2 + 1)
- Y = (framesize / 2 +1, # frames)
- Mel spectrogram = MY
- (# bands, # frames)
Mel spectrogram applications
- Audio classification
- Automatic mood recognition
- Music genre classification
- Music instrument classification
- …
18. Mel Spectrogram with Python
1
2
3
| scale_file = "./audio/scale.wav"
ipd.Audio(scale_file)  # presumably rendered as an inline player because it is the cell's last expression
|
1
| scale, sr = librosa.load(scale_file)  # waveform samples and the sample rate they were loaded at
|
Mel filter banks
1
2
3
| filter_banks = librosa.filters.mel(n_fft=2048, sr=22050, n_mels=10)
print(f"filter: {filter_banks.shape}")
|
1
2
3
4
| plt.figure(figsize=(7, 3))
# Show the filter bank matrix: one row per Mel band, linear frequency on the x axis.
librosa.display.specshow(filter_banks, sr=sr, x_axis="linear")
plt.colorbar(format="%+2.f")
plt.show()
|
1
2
3
4
5
6
7
8
9
10
| mel_spectrogram = librosa.feature.melspectrogram(scale, sr=sr, n_fft=2048, hop_length=512, n_mels=90)
print(f"mel_spectrogram shape: {mel_spectrogram.shape}")
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
print(f"log_mel_spectrogram shape: {log_mel_spectrogram.shape}")
plt.figure(figsize=(7, 3))
librosa.display.specshow(log_mel_spectrogram, x_axis="time", y_axis="mel", sr=sr)
plt.colorbar(format="%+.2f")
plt.show()
|
1
2
| mel_spectrogram shape: (90, 342)
log_mel_spectrogram shape: (90, 342)
|
1
2
3
4
5
6
7
8
9
10
| mel_spectrogram = librosa.feature.melspectrogram(scale, sr=sr, n_fft=2048, hop_length=512, n_mels=10)
print(f"mel_spectrogram shape: {mel_spectrogram.shape}")
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
print(f"log_mel_spectrogram shape: {log_mel_spectrogram.shape}")
plt.figure(figsize=(7, 3))
librosa.display.specshow(log_mel_spectrogram, x_axis="time", y_axis="mel", sr=sr)
plt.colorbar(format="%+.2f")
plt.show()
|
1
2
| mel_spectrogram shape: (10, 342)
log_mel_spectrogram shape: (10, 342)
|
Comments powered by Disqus.