Fix CI scan problems by yiluxiangbei · Pull Request #110 · mindspore-lab/mindaudio · GitHub
Fix CI scan problems #110

Merged · 12 commits · Mar 23, 2023
mindaudio/data/augment.py: 173 changes (70 additions, 103 deletions)
@@ -1,31 +1,33 @@
-import numpy as np
import random

import mindspore as ms
import mindspore.dataset.audio as msaudio
+import numpy as np
from mindspore.nn import Conv1d
-from .io import read
-from .spectrum import compute_amplitude, dB_to_amplitude, stft, istft, \
-    _pad_shape
-from .processing import rescale, resample

from .filters import notch_filter
+from .io import read
+from .processing import resample, rescale
+from .spectrum import _pad_shape, compute_amplitude, dB_to_amplitude, istft, stft

__all__ = [
-'frequencymasking',
-'timemasking',
-'reverberate',
-'add_noise',
-'add_reverb',
-'add_babble',
-'drop_freq',
-'speed_perturb',
-'drop_chunk',
-'time_stretch',
-'pitch_shift'
+"frequencymasking",
+"timemasking",
+"reverberate",
+"add_noise",
+"add_reverb",
+"add_babble",
+"drop_freq",
+"speed_perturb",
+"drop_chunk",
+"time_stretch",
+"pitch_shift",
]


-def frequencymasking(waveform, iid_masks=False, frequency_mask_param=0,
-mask_start=0, mask_value=0.0):
+def frequencymasking(
+waveform, iid_masks=False, frequency_mask_param=0, mask_start=0, mask_value=0.0,
+):
"""
Apply masking to a spectrogram in the frequency domain.

@@ -49,15 +51,14 @@ def frequencymasking(waveform, iid_masks=False, frequency_mask_param=0,
>>> orignal = spectrum.spectrogram(waveform)
>>> masked = augment.frequencymasking(orignal, frequency_mask_param=80)
"""
-frequency_masking = msaudio.FrequencyMasking(iid_masks,
-frequency_mask_param,
-mask_start, mask_value)
+frequency_masking = msaudio.FrequencyMasking(iid_masks, frequency_mask_param, mask_start, mask_value)

return frequency_masking(waveform)


-def timemasking(waveform, iid_masks=False, frequency_mask_param=0,
-mask_start=0, mask_value=0.0):
+def timemasking(
+waveform, iid_masks=False, frequency_mask_param=0, mask_start=0, mask_value=0.0,
+):
"""
Apply masking to a spectrogram in the time domain.

@@ -80,8 +81,7 @@ def timemasking(waveform, iid_masks=False, frequency_mask_param=0,
>>> masked = augment.timemasking(orignal, frequency_mask_param=80)

"""
-time_masking = msaudio.TimeMasking(iid_masks, frequency_mask_param,
-mask_start, mask_value)
+time_masking = msaudio.TimeMasking(iid_masks, frequency_mask_param, mask_start, mask_value)

return time_masking(waveform)
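For orientation, a minimal usage sketch of the two masking wrappers, following the docstring examples above. The module paths mindaudio.data.augment, mindaudio.data.io and mindaudio.data.spectrum are assumed from the relative imports in this file, and the wav path is a placeholder:

import mindaudio.data.augment as augment
import mindaudio.data.io as io
import mindaudio.data.spectrum as spectrum

# Read a waveform and compute its spectrogram (placeholder file path).
waveform, sr = io.read("speech.wav")
spec = spectrum.spectrogram(waveform)

# Mask up to 80 consecutive frequency bins, then up to 80 consecutive frames.
masked = augment.frequencymasking(spec, frequency_mask_param=80)
masked = augment.timemasking(masked, frequency_mask_param=80)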

@@ -128,9 +128,7 @@ def reverberate(waveforms, rir_waveform, rescale_amp="avg"):
rir_waveform = np.expand_dims(rir_waveform, -1)

# Compute the average amplitude of the clean
-orig_amplitude = compute_amplitude(
-waveforms, waveforms.shape[1], rescale_amp
-)
+orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1], rescale_amp)

# Compute index of the direct signal, so we can preserve alignment
# value_max = np.max(np.abs(rir_waveform), axis=1, keepdims=True)
@@ -141,12 +139,7 @@ def reverberate(waveforms, rir_waveform, rescale_amp="avg"):
# rir_waveform[mask] = -rir_waveform[mask]

# Use FFT to compute convolution, because of long reverberation filter
-waveforms = convolve1d(
-waveforms=waveforms,
-kernel=rir_waveform,
-use_fft=True,
-rotation_index=direct_index,
-)
+waveforms = convolve1d(waveforms=waveforms, kernel=rir_waveform, use_fft=True, rotation_index=direct_index,)

if len(orig_shape) == 1:
waveforms = np.squeeze(np.squeeze(waveforms, 0), -1)
@@ -158,14 +151,13 @@ def reverberate(waveforms, rir_waveform, rescale_amp="avg"):
lengths = waveforms.shape[1]

# Rescale to the peak amplitude of the clean waveform
-waveforms = rescale(
-waveforms, orig_amplitude, lengths=lengths, amp_type=rescale_amp
-)
+waveforms = rescale(waveforms, orig_amplitude, lengths=lengths, amp_type=rescale_amp)
return waveforms
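A sketch of calling reverberate directly with a recorded room impulse response. The paths are placeholders; as described above, rescale_amp="avg" restores the clean signal's average amplitude after convolution:

import mindaudio.data.augment as augment
import mindaudio.data.io as io

speech, sr = io.read("clean_speech.wav")        # placeholder path
rir, _ = io.read("room_impulse_response.wav")   # placeholder path

# FFT-convolve the speech with the RIR, then rescale to the clean amplitude.
wet = augment.reverberate(speech, rir, rescale_amp="avg")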


-def convolve1d(waveforms, kernel, padding=0, pad_type="constant", stride=1,
-groups=1, use_fft=True, rotation_index=0):
+def convolve1d(
+waveforms, kernel, padding=0, pad_type="constant", stride=1, groups=1, use_fft=True, rotation_index=0,
+):
"""Use mindspore.conv1d to perform 1d padding and convolution.

Args:
@@ -237,9 +229,7 @@ def convolve1d(waveforms, kernel, padding=0, pad_type="constant", stride=1,

else:
# Todo: the implementation can be optimized here.
-conv1d = Conv1d(1, 1, kernel_size=kernel.shape[-1],
-stride=stride, group=groups, padding=0,
-pad_mode='valid')
+conv1d = Conv1d(1, 1, kernel_size=kernel.shape[-1], stride=stride, group=groups, padding=0, pad_mode="valid",)
weight = ms.Tensor(np.expand_dims(kernel, 0))
weight.set_dtype(ms.float32)
conv1d.weight.set_data(weight)
@@ -253,8 +243,7 @@ def convolve1d(waveforms, kernel, padding=0, pad_type="constant", stride=1,
convolved = np.squeeze(convolved, 1)
return convolved
if n_dim == 3:
-return np.transpose(convolved,
-[0, 2, 1])  # transpose back to channel last
+return np.transpose(convolved, [0, 2, 1])  # transpose back to channel last
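The use_fft branch relies on the convolution theorem, which is why it is preferred for long reverberation filters. A standalone NumPy illustration of that identity (the underlying idea only, not the body of convolve1d itself; rotation_index additionally rotates the kernel so the direct-path peak stays aligned):

import numpy as np

rng = np.random.default_rng(0)
signal = rng.standard_normal(1024)
kernel = rng.standard_normal(64)

# Convolution theorem: zero-pad both sequences to the full output length,
# multiply their spectra, and invert. This matches direct linear convolution.
n_out = signal.size + kernel.size - 1
fft_result = np.fft.irfft(np.fft.rfft(signal, n_out) * np.fft.rfft(kernel, n_out), n=n_out)
direct = np.convolve(signal, kernel)
print(np.allclose(fft_result, direct))  # True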


def rms_normalize(samples):
@@ -285,8 +274,7 @@ def caculate_rms(samples):
return rms


-def add_noise(samples, backgroundlist, min_snr_in_db, max_snr_in_db,
-mix_prob=1.0):
+def add_noise(samples, backgroundlist, min_snr_in_db, max_snr_in_db, mix_prob=1.0):
"""
add background noise.

@@ -346,13 +334,11 @@ def add_noise(samples, backgroundlist, min_snr_in_db, max_snr_in_db,
sample_rms = caculate_rms(samples)
snr = np.random.uniform(min_snr_in_db, max_snr_in_db, 1)
background_scale = sample_rms / (10 ** (snr / 20))
-background_noise = (np.expand_dims(background, axis=0)) * (
-np.expand_dims(background_scale, axis=2))
+background_noise = (np.expand_dims(background, axis=0)) * (np.expand_dims(background_scale, axis=2))
samples_added_noise = samples + background_noise

if dimension_of_samples == 1:
-samples_added_noise = samples_added_noise.squeeze(axis=1).squeeze(
-axis=0)
+samples_added_noise = samples_added_noise.squeeze(axis=1).squeeze(axis=0)
elif dimension_of_samples == 2:
samples_added_noise = samples_added_noise.squeeze(axis=1)
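The scaling above follows from the definition SNR_dB = 20 * log10(rms_signal / rms_noise): solving for the noise RMS gives rms_noise = rms_signal / 10**(SNR/20), assuming the background has been RMS-normalized first. A quick standalone check of that arithmetic:

import numpy as np

sample_rms = 0.1     # RMS of the clean samples
snr_db = 20.0        # target signal-to-noise ratio in dB

# Noise RMS required to hit the target SNR, mirroring background_scale above.
noise_rms = sample_rms / (10 ** (snr_db / 20))
print(noise_rms)                                   # 0.01

# Verify: the achieved SNR is the requested one.
achieved = 20 * np.log10(sample_rms / noise_rms)
print(achieved)                                    # 20.0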

@@ -393,8 +379,7 @@ def add_reverb(samples, rirlist, reverb_prob=1.0):
samples = np.expand_dims(samples, axis=2)
elif orig_shapelen == 3:
batch, chanel, times = samples.shape
-samples = np.expand_dims(samples.reshape(batch * chanel, times),
-axis=2)
+samples = np.expand_dims(samples.reshape(batch * chanel, times), axis=2)

rir_path = random.choice(rirlist)
rir_waveform, sr = read(rir_path)
@@ -410,8 +395,7 @@ def add_reverb(samples, rirlist, reverb_prob=1.0):
return res
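A usage sketch for add_reverb with a small list of impulse-response files. Paths are placeholders; as shown above, one RIR is drawn at random with random.choice and, with reverb_prob=1.0, always applied:

import mindaudio.data.augment as augment
import mindaudio.data.io as io

speech, sr = io.read("clean_speech.wav")              # placeholder path
rir_list = ["rir_room_a.wav", "rir_room_b.wav"]       # placeholder RIR paths

reverbed = augment.add_reverb(speech, rir_list, reverb_prob=1.0)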


-def add_babble(waveforms, lengths, speaker_count=3, snr_low=0, snr_high=0,
-mix_prob=1.0):
+def add_babble(waveforms, lengths, speaker_count=3, snr_low=0, snr_high=0, mix_prob=1.0):
"""
Simulate babble noise by mixing the signals in a batch.

@@ -488,9 +472,9 @@ def add_babble(waveforms, lengths, speaker_count=3, snr_low=0, snr_high=0,
return babbled_waveform
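The body of add_babble is not visible in this hunk. The usual construction (for example in SpeechBrain, whose API this mirrors) sums rolled copies of the batch so each utterance is mixed with speaker_count other utterances, then scales the mixture to the requested SNR. A rough sketch of that idea only, not necessarily the exact code in this file:

import numpy as np

def babble_sketch(waveforms, speaker_count=3, snr_db=0.0):
    """Mix each utterance with `speaker_count` other utterances from the batch."""
    # Sum of rolled copies: row i receives rows i+1, ..., i+speaker_count (mod batch).
    noise = sum(np.roll(waveforms, shift=k, axis=0) for k in range(1, speaker_count + 1))

    # Scale the babble so the per-utterance SNR equals snr_db.
    signal_rms = np.sqrt(np.mean(waveforms ** 2, axis=1, keepdims=True))
    noise_rms = np.sqrt(np.mean(noise ** 2, axis=1, keepdims=True)) + 1e-14
    noise = noise * (signal_rms / noise_rms) / (10 ** (snr_db / 20))
    return waveforms + noise

batch = np.random.randn(4, 16000)   # 4 utterances, 1 second at 16 kHz
babbled = babble_sketch(batch, speaker_count=3, snr_db=5.0)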


-def drop_freq(waveforms, drop_freq_low=1e-14, drop_freq_high=1,
-drop_count_low=1, drop_count_high=2,
-drop_width=0.05, drop_prob=1):
+def drop_freq(
+waveforms, drop_freq_low=1e-14, drop_freq_high=1, drop_count_low=1, drop_count_high=2, drop_width=0.05, drop_prob=1,
+):
"""
Drops a random frequency from the signal.To teach models to learn to rely
on all parts of the signal,not just a few frequency bands.
@@ -529,22 +513,17 @@ def drop_freq(waveforms, drop_freq_low=1e-14, drop_freq_high=1,

# Add channels dimension
if len(waveforms.shape) == 1:
-dropped_waveform = np.expand_dims(np.expand_dims(dropped_waveform, 0),
-2)
+dropped_waveform = np.expand_dims(np.expand_dims(dropped_waveform, 0), 2)
elif len(waveforms.shape) == 2:
dropped_waveform = np.expand_dims(dropped_waveform, axis=2)

# Pick number of frequencies to drop
-drop_count = np.random.randint(
-low=drop_count_low, high=drop_count_high + 1, size=(1,)
-)
+drop_count = np.random.randint(low=drop_count_low, high=drop_count_high + 1, size=(1,))
drop_count = drop_count[0]

# Pick a frequency to drop
drop_range = drop_freq_high - drop_freq_low
-drop_frequency = (
-np.random.rand(drop_count) * drop_range + drop_freq_low
-)
+drop_frequency = np.random.rand(drop_count) * drop_range + drop_freq_low

# Filter parameters
filter_length = 101
@@ -556,9 +535,7 @@ def drop_freq(waveforms, drop_freq_low=1e-14, drop_freq_high=1,

# Subtract each frequency
for frequency in drop_frequency:
-notch_kernel = notch_filter(
-frequency, filter_length, drop_width,
-)
+notch_kernel = notch_filter(frequency, filter_length, drop_width,)
drop_filter = convolve1d(drop_filter, notch_kernel, pad)

# Apply filter
@@ -568,13 +545,11 @@ def drop_freq(waveforms, drop_freq_low=1e-14, drop_freq_high=1,
if orig_shapelen == 2:
dropped_waveform = np.squeeze(dropped_waveform, axis=2)
elif orig_shapelen == 1:
-dropped_waveform = np.squeeze(np.squeeze(dropped_waveform, axis=2),
-axis=0)
+dropped_waveform = np.squeeze(np.squeeze(dropped_waveform, axis=2), axis=0)
return dropped_waveform
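A usage sketch for drop_freq with its default ranges. The drop_freq_low/drop_freq_high bounds appear to be normalized frequencies in (0, 1] (see the defaults above); that interpretation and the module path are assumptions:

import numpy as np

import mindaudio.data.augment as augment

waveform = np.random.randn(16000).astype(np.float32)   # 1 s of noise as a stand-in

# Notch out 1-2 random narrow bands (width 0.05); drop_prob=1 always applies it.
notched = augment.drop_freq(
    waveform,
    drop_freq_low=1e-14,
    drop_freq_high=1,
    drop_count_low=1,
    drop_count_high=2,
    drop_width=0.05,
    drop_prob=1,
)
print(notched.shape)   # (16000,), shape is preserved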


-def speed_perturb(waveform, orig_freq, speeds=[90, 100, 110],
-perturb_prob=1.0):
+def speed_perturb(waveform, orig_freq, speeds=[90, 100, 110], perturb_prob=1.0):
"""
Slightly speed up or slow down an audio signal.Resample the audio signal
at a rate that is similar to the original rate, to achieve a slightly
@@ -614,9 +589,18 @@ def speed_perturb(waveform, orig_freq, speeds=[90, 100, 110],
return perturbed_waveform
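Only the signature and return of speed_perturb are visible here. The standard trick (as in the SpeechBrain implementation this API follows) is to resample with a nearby rate so the output length scales by speed/100; 100 leaves the signal unchanged. A sketch of that relationship, with the resampling detail treated as an assumption:

import numpy as np

orig_freq = 16000
speeds = [90, 100, 110]          # percentages of the original rate

# Pick one of the configured speeds at random, as speed_perturb presumably does.
speed = int(np.random.choice(speeds))

# Resampling from orig_freq to orig_freq * speed / 100 rescales the number of
# samples by speed / 100.
new_freq = orig_freq * speed // 100
num_samples_in = 52173
num_samples_out = int(round(num_samples_in * new_freq / orig_freq))
print(new_freq, num_samples_out)   # e.g. 14400, 46956 when speed == 90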


-def drop_chunk(waveforms, lengths, drop_length_low=100, drop_length_high=1000,
-drop_count_low=1, drop_count_high=10,
-drop_start=0, drop_end=None, drop_prob=1, noise_factor=0.0):
+def drop_chunk(
+waveforms,
+lengths,
+drop_length_low=100,
+drop_length_high=1000,
+drop_count_low=1,
+drop_count_high=10,
+drop_start=0,
+drop_end=None,
+drop_prob=1,
+noise_factor=0.0,
+):
"""
This class drops portions of the input signal.Using `drop_chunk` as an
augmentation strategy helps a models learn to rely on all parts of the
@@ -692,7 +676,7 @@ def drop_chunk(waveforms, lengths, drop_length_low=100, drop_length_high=1000,
drop_length_high = min(drop_length_high, drop_range)

# Reading input list
-lengths = (lengths * waveforms.shape[1])
+lengths = lengths * waveforms.shape[1]
batch_size = waveforms.shape[0]
dropped_waveform = waveforms.copy()

@@ -701,27 +685,18 @@ def drop_chunk(waveforms, lengths, drop_length_low=100, drop_length_high=1000,
return dropped_waveform

# Store original amplitude for computing white noise amplitude
-clean_amplitude = compute_amplitude(waveforms,
-np.expand_dims(lengths, axis=1))
+clean_amplitude = compute_amplitude(waveforms, np.expand_dims(lengths, axis=1))

# Pick a number of times to drop
-drop_times = np.random.randint(
-low=drop_count_low,
-high=drop_count_high + 1,
-size=(batch_size,),
-)
+drop_times = np.random.randint(low=drop_count_low, high=drop_count_high + 1, size=(batch_size,),)

# Iterate batch to set mask
for i in range(batch_size):
if drop_times[i] == 0:
continue

# Pick lengths
-length = np.random.randint(
-low=drop_length_low,
-high=drop_length_high + 1,
-size=(drop_times[i],),
-)
+length = np.random.randint(low=drop_length_low, high=drop_length_high + 1, size=(drop_times[i],),)

# Compute range of starting locations
start_min = drop_start
@@ -735,16 +710,14 @@ def drop_chunk(waveforms, lengths, drop_length_low=100, drop_length_high=1000,
start_max = max(0, start_max - length.max())

# Pick starting locations
-start = np.random.randint(
-low=start_min, high=start_max + 1, size=(drop_times[i],),
-)
+start = np.random.randint(low=start_min, high=start_max + 1, size=(drop_times[i],),)

end = start + length

# Update waveform
if not noise_factor:
for j in range(drop_times[i]):
-dropped_waveform[i, start[j]: end[j]] = 0.0
+dropped_waveform[i, start[j] : end[j]] = 0.0
else:
# Uniform distribution of -2 to +2 * avg amplitude should
# preserve the average for normalization
@@ -753,7 +726,7 @@ def drop_chunk(waveforms, lengths, drop_length_low=100, drop_length_high=1000,
# zero-center the noise distribution
noise_vec = np.random.rand(length[j])
noise_vec = 2 * noise_max * noise_vec - noise_max
-dropped_waveform[i, start[j]: end[j]] = noise_vec
+dropped_waveform[i, start[j] : end[j]] = noise_vec

return dropped_waveform
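A usage sketch for drop_chunk. Note that lengths is relative: it is multiplied by waveforms.shape[1] above, so 1.0 means the full utterance. The batch here is synthetic and the module path is assumed:

import numpy as np

import mindaudio.data.augment as augment

# A batch of 4 one-second utterances and their relative lengths (1.0 = full).
waveforms = np.random.randn(4, 16000).astype(np.float32)
lengths = np.ones(4)

# Zero out 1-10 chunks of 100-1000 samples per utterance; with a non-zero
# noise_factor the chunks would instead be filled with scaled white noise.
dropped = augment.drop_chunk(
    waveforms,
    lengths,
    drop_length_low=100,
    drop_length_high=1000,
    drop_count_low=1,
    drop_count_high=10,
    noise_factor=0.0,
)
print(dropped.shape)   # (4, 16000)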

@@ -823,16 +796,14 @@ def _phase_vocoder(matrix, rate, hop_length=None, n_fft=None):
matrix = np.pad(matrix, padding, mode="constant")

for t, step in enumerate(time_steps):
-columns = matrix[..., int(step): int(step + 2)]
+columns = matrix[..., int(step) : int(step + 2)]
alpha = np.mod(step, 1.0)
-mag = (1.0 - alpha) * np.abs(columns[..., 0]) + alpha * np.abs(
-columns[..., 1])
+mag = (1.0 - alpha) * np.abs(columns[..., 0]) + alpha * np.abs(columns[..., 1])
phase_complex = np.cos(phase_acc) + 1j * np.sin(phase_acc)
if mag is not None:
phase_complex *= mag
d_stretch[..., t] = phase_complex
-dphase = np.angle(columns[..., 1]) - np.angle(
-columns[..., 0]) - phi_advance
+dphase = np.angle(columns[..., 1]) - np.angle(columns[..., 0]) - phi_advance
dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))
phase_acc += phi_advance + dphase
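phi_advance and time_steps are defined outside this hunk. In the standard (librosa-style) phase vocoder they are the expected per-hop phase advance of each FFT bin and the fractional frame positions read at the stretch rate; treating that as the intended definition, a short sketch:

import numpy as np

n_fft = 512
hop_length = n_fft // 4
rate = 1.25                    # > 1 speeds up, < 1 slows down
num_frames = 100               # matrix.shape[-1] in the function above

# Expected phase advance per hop for bin k: 2*pi * k/n_fft * hop_length,
# i.e. a linear ramp from 0 to pi*hop_length across the n_fft//2 + 1 bins.
phi_advance = np.linspace(0, np.pi * hop_length, 1 + n_fft // 2)

# Fractional input-frame positions read at the stretch rate; the loop above
# interpolates magnitude between floor(step) and floor(step) + 1.
time_steps = np.arange(0, num_frames, rate)
print(phi_advance.shape, time_steps[:4])   # (257,) and [0. 1.25 2.5 3.75]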

@@ -861,9 +832,5 @@ def pitch_shift(waveforms, sr, n_steps, bins_per_octave=12):
rate = 2.0 ** (-float(n_steps) / bins_per_octave)
waveforms_stretch = time_stretch(waveforms, rate=rate)
# Stretch in time, then resample
-y_shift = resample(
-waveforms_stretch,
-orig_freq=float(sr) / rate,
-new_freq=sr,
-)
+y_shift = resample(waveforms_stretch, orig_freq=float(sr) / rate, new_freq=sr,)
return _pad_shape(y_shift, data_shape=waveforms_stretch.shape[-1])
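A worked example of the rate used above: shifting up by n_steps semitones first slows the audio down via time_stretch (rate < 1 means more frames), then resamples from sr/rate back to sr to undo the duration change, leaving the pitch raised by 1/rate.

sr = 16000
n_steps = 4                       # shift up by 4 semitones
bins_per_octave = 12

rate = 2.0 ** (-float(n_steps) / bins_per_octave)
print(round(rate, 4))             # 0.7937: time_stretch lengthens the audio by 1/rate ~ 1.26x

# Resampling the stretched audio from sr/rate (~20159 Hz) back to sr removes the
# duration change, so the pitch ends up scaled by 1/rate, i.e. +4 semitones.
print(round(sr / rate))           # 20159
print(round(1.0 / rate, 4))       # 1.2599 ~= 2 ** (4 / 12)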