% Read the clean speech signal and listen to it.
[cleanAudio,fs] = audioread("SpeechDFT-16-8-mono-5secs.wav");
sound(cleanAudio,fs)

% Read the washing machine noise recording used throughout this example.
% BUGFIX: the original code re-read the clean speech file here, so "noise"
% was identical to the speech and the "noisy" signal below was just speech
% plus scaled speech. The later sections of this script explicitly mix in
% washing machine noise, so load that recording instead.
noise = audioread("WashingMachine-16-8-mono-1000secs.mp3");
% noise = load("volvo.mat");  % alternative noise source

% Extract a noise segment from a random location in the noise file.
ind = randi(numel(noise) - numel(cleanAudio) + 1, 1, 1);
noiseSegment = noise(ind:ind + numel(cleanAudio) - 1);

% Scale the noise so that the SNR is 0 dB, then add it to the speech.
speechPower = sum(cleanAudio.^2);
noisePower = sum(noiseSegment.^2);
noisyAudio = cleanAudio + sqrt(speechPower/noisePower) * noiseSegment;
sound(noisyAudio,fs)

% Plot the clean and noisy signals against time.
t = (1/fs) * (0:numel(cleanAudio)-1);
subplot(2,1,1)
plot(t,cleanAudio)
title("Clean Audio")
grid on
subplot(2,1,2)
plot(t,noisyAudio)
title("Noisy Audio")
xlabel("Time (s)")
grid on
%% Examine the data set
% This example uses a subset of the Mozilla Common Voice data set [1] to
% train and test the network. The data set contains 48 kHz recordings of
% subjects speaking short sentences. Download the data set and unzip it.
% BUGFIX: the download URL had been mangled into a numeric-IP proxy string;
% restored to the MathWorks support-files location.
url = 'http://ssd.mathworks.com/supportfiles/audio/commonvoice.zip';
downloadFolder = tempdir;
dataFolder = fullfile(downloadFolder,'commonvoice');
if ~exist(dataFolder,'dir')
    disp('Downloading data set (956 MB) ...')
    unzip(url,downloadFolder)
end
%% Create an audioDatastore for the training set. To shorten the example's
% runtime at the cost of result quality, set reduceDataset to true.
adsTrain = audioDatastore(fullfile(dataFolder,'train'),'IncludeSubfolders',true);
reduceDataset = true;
if reduceDataset
    % Keep a random subset of 1000 training files.
    adsTrain = subset(shuffle(adsTrain),1:1000);
end
%% Read the contents of the first file in the datastore.
[audio,adsTrainInfo] = read(adsTrain);

%% Listen to the speech signal.
sound(audio,adsTrainInfo.SampleRate)

%% Plot the speech signal.
figure
sampleTimes = (1/adsTrainInfo.SampleRate) * (0:numel(audio)-1);
plot(sampleTimes,audio)
title("Example Speech Signal")
xlabel("Time (s)")
grid on
%% STFT targets and predictors
% This section shows how to generate the target and predictor signals from
% one training file. First, define the system parameters:
windowLength = 256;                        % samples per analysis window
win = hamming(windowLength,"periodic");    % periodic Hamming analysis window
overlap = round(windowLength*0.75);        % 75% overlap between windows
ffTLength = windowLength;                  % FFT length equals window length
inputFs = 48e3;                            % sample rate of the raw recordings
fs = 8e3;                                  % processing sample rate
numFeatures = ffTLength/2 + 1;             % one-sided spectrum size (129)
numSegments = 8;                           % STFT frames per predictor
%% Create a dsp.SampleRateConverter (DSP System Toolbox) object to convert
% the 48 kHz audio to 8 kHz.
src = dsp.SampleRateConverter("InputSampleRate",inputFs, ...
    "OutputSampleRate",fs, ...
    "Bandwidth",7920);

%% Read the contents of an audio file from the datastore.
audio = read(adsTrain);

%% Make sure the audio length is a multiple of the sample rate converter's
% decimation factor by truncating any trailing remainder.
decimationFactor = inputFs/fs;
numBlocks = floor(numel(audio)/decimationFactor);
audio = audio(1:decimationFactor*numBlocks);

%% Convert the audio signal to 8 kHz.
audio = src(audio);
reset(src)
%% Create a random noise segment from the washing machine noise vector.
% BUGFIX: use "+ 1" in the randi bound so that every valid start offset
% (including the last one) can be selected, and so the bound stays >= 1
% when the two signals have equal length (randi errors on a nonpositive
% bound). This also matches the segment extraction earlier in the script.
randind = randi(numel(noise) - numel(audio) + 1,[1 1]);
noiseSegment = noise(randind : randind + numel(audio) - 1);

%% Add noise to the speech signal such that the SNR is 0 dB, i.e. scale the
% noise segment to have the same power as the speech.
noisePower = sum(noiseSegment.^2);
cleanPower = sum(audio.^2);
noiseSegment = noiseSegment .* sqrt(cleanPower/noisePower);
noisyAudio = audio + noiseSegment;
%% Use stft (Signal Processing Toolbox) to generate magnitude STFT vectors
% from the original and noisy audio signals. Keep the numFeatures rows
% starting at numFeatures-1, as in the original example.
cleanSTFT = stft(audio,'Window',win,'OverlapLength',overlap,'FFTLength',ffTLength);
cleanSTFT = abs(cleanSTFT(numFeatures-1:end,:));
noisySTFT = stft(noisyAudio,'Window',win,'OverlapLength',overlap,'FFTLength',ffTLength);
noisySTFT = abs(noisySTFT(numFeatures-1:end,:));

%% Generate the 8-segment training predictor signals from the noisy STFT.
% The overlap between consecutive predictors is 7 segments. Duplicate the
% first numSegments-1 frames at the front so the first target frame also
% has a full context window.
noisySTFT = [noisySTFT(:,1:numSegments - 1), noisySTFT];
numFrames = size(noisySTFT,2) - numSegments + 1;
stftSegments = zeros(numFeatures,numSegments,numFrames);
for frame = 1:numFrames
    stftSegments(:,:,frame) = noisySTFT(:,frame:frame + numSegments - 1);
end

%% Set the targets and predictors. The last dimension of both corresponds to
% the number of distinct predictor/target pairs generated by the audio file.
% Each predictor is 129-by-8 and each target is 129-by-1.
targets = cleanSTFT;
size(targets)
predictors = stftSegments;
size(predictors)
%% Extract features using tall arrays
% To speed up processing, extract feature sequences from the speech segments
% of all audio files in the datastore using tall arrays. Unlike in-memory
% arrays, tall arrays typically remain unevaluated until you call gather.
% This deferred evaluation enables you to work quickly with large data sets.
% When you eventually request output using gather, MATLAB combines the queued
% calculations where possible and takes the minimum number of passes through
% the data. With Parallel Computing Toolbox, tall arrays can run in the local
% MATLAB session or in a local parallel pool; with MATLAB Parallel Server,
% they can also run on a cluster.
% First, convert the datastore to a tall array.
reset(adsTrain)
T = tall(adsTrain)
% The display above indicates that the number of rows (i.e. the number of
% files in the datastore), M, is not yet known. M is a placeholder until the
% calculation completes.
% Extract the target and predictor magnitude STFTs from the tall table. This
% creates new tall-array variables for the subsequent computations. The
% helper HelperGenerateSpeechDenoisingFeatures performs the steps highlighted
% in the "STFT targets and predictors" section above; cellfun applies it to
% the contents of each audio file in the datastore.
[targets,predictors] = cellfun(@(x)HelperGenerateSpeechDenoisingFeatures(x,noise,src),T,"UniformOutput",false);
%% Evaluate the targets and predictors using gather.
[targets,predictors] = gather(targets,predictors);
%% It is good practice to normalize all features to zero mean and unit
% standard deviation. Compute the mean and standard deviation of the
% predictors and targets separately, and use them to normalize each.
predictors = cat(3,predictors{:});
noisyMean = mean(predictors(:));
noisyStd = std(predictors(:));
predictors = (predictors - noisyMean)/noisyStd;

targets = cat(2,targets{:});
cleanMean = mean(targets(:));
cleanStd = std(targets(:));
targets = (targets - cleanMean)/cleanStd;

%% Reshape predictors and targets to the dimensions expected by the network:
%   predictors -> numFeatures-by-numSegments-by-1-by-numObservations
%   targets    -> 1-by-1-by-numFeatures-by-numObservations
predictors = reshape(predictors,size(predictors,1),size(predictors,2),1,size(predictors,3));
targets = reshape(targets,1,1,size(targets,1),size(targets,2));
%% During training, 1% of the data is held out for validation. Validation is
% useful for detecting when the network is overfitting the training data.
% Randomly split the data into training and validation sets.
shuffledIdx = randperm(size(predictors,4));
numTrain = round(0.99 * size(predictors,4));
trainIdx = shuffledIdx(1:numTrain);
valIdx = shuffledIdx(numTrain+1:end);
trainPredictors = predictors(:,:,:,trainIdx);
trainTargets = targets(:,:,:,trainIdx);
validatePredictors = predictors(:,:,:,valIdx);
validateTargets = targets(:,:,:,valIdx);
%%
% Fully connected denoising network: each 129-by-8 predictor image is passed
% through two 1024-unit hidden layers (with batch normalization and ReLU)
% and regressed onto a 129-element denoised spectral estimate.
layers = [
imageInputLayer([numFeatures,numSegments])
fullyConnectedLayer(1024)
batchNormalizationLayer
reluLayer
fullyConnectedLayer(1024)
batchNormalizationLayer
reluLayer
fullyConnectedLayer(numFeatures)
regressionLayer
];
%%
% Training options: Adam optimizer, 3 epochs, piecewise learning-rate decay
% (x0.9 after every epoch), shuffling each epoch, and validation on the
% held-out split once per epoch.
miniBatchSize = 128;
options = trainingOptions("adam", ...
"MaxEpochs",3, ...
"InitialLearnRate",1e-5,...
"MiniBatchSize",miniBatchSize, ...
"Shuffle","every-epoch", ...
"Plots","training-progress", ...
"Verbose",false, ...
"ValidationFrequency",floor(size(trainPredictors,4)/miniBatchSize), ... once per epoch
"LearnRateSchedule","piecewise", ...
"LearnRateDropFactor",0.9, ...
"LearnRateDropPeriod",1, ...
"ValidationData",{validatePredictors,validateTargets});
%%
% Either train the network, or load a pretrained network from denoisenet.mat
% together with the normalization statistics that were used to train it.
doTraining = true;
if ~doTraining
    pretrained = load("denoisenet.mat");
    denoiseNetFullyConnected = pretrained.denoiseNetFullyConnected;
    cleanMean = pretrained.cleanMean;
    cleanStd = pretrained.cleanStd;
    noisyMean = pretrained.noisyMean;
    noisyStd = pretrained.noisyStd;
else
    denoiseNetFullyConnected = trainNetwork(trainPredictors,trainTargets,layers,options);
end
%%
% Count the weights in the network's fully connected layers.
numWeights = 0;
for k = 1:numel(denoiseNetFullyConnected.Layers)
    thisLayer = denoiseNetFullyConnected.Layers(k);
    if isa(thisLayer,"nnet.cnn.layer.FullyConnectedLayer")
        numWeights = numWeights + numel(thisLayer.Weights);
    end
end
fprintf("The number of weights is %d.\n",numWeights);
%%
layers = [imageInputLayer([numFeatures,numSegments])
convolution2dLayer([9 8],18,"Stride",[1 100],"Padding","same")
batchNormalizationLayer
reluLayer
repmat( ...
[convolution2dLayer([5 1],30,"Stride",[1 100],"Padding","same")
batchNormalizationLayer
% NOTE(review): the remainder of this convolutional-network layer definition
% was lost in this copy (the lines above ended in web-pagination residue).
% The "layers = [...]" expression that starts above is therefore incomplete
% and will not run; restore the missing layers from the original example
% before executing this section.