function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples )
global CALIBRATE Nfmax Nb Sl Sp
global nr_of_hz_bands_per_bark_band centre_of_band_bark
global width_of_band_hz centre_of_band_hz width_of_band_bark
global pow_dens_correction_factor abs_thresh_power
global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances
global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE
global Fs Plot_Frame
% Plot_Frame= 75; % this is the frame whose spectrum will be plotted
FALSE= 0;
TRUE= 1;
NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20;
maxNsamples = max (ref_Nsamples, deg_Nsamples);
Nf = Downsample * 8;
MAX_NUMBER_OF_BAD_INTERVALS = 1000;
start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
number_of_bad_intervals= 0;
there_is_a_bad_frame= FALSE;
Whanning= hann( Nf, 'periodic');
Whanning= Whanning';
D_POW_F = 2;
D_POW_S = 6;
D_POW_T = 2;
A_POW_F = 1;
A_POW_S = 6;
A_POW_T = 2;
D_WEIGHT= 0.1;
A_WEIGHT= 0.0309;
CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500;
samples_to_skip_at_start = 0;
sum_of_5_samples= 0;
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
&& (samples_to_skip_at_start < maxNsamples / 2))
sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start...
+ SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start...
+ SEARCHBUFFER * Downsample + 5)));
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
samples_to_skip_at_start = samples_to_skip_at_start+ 1;
end
end
% fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start);
samples_to_skip_at_end = 0;
sum_of_5_samples= 0;
while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
&& (samples_to_skip_at_end < maxNsamples / 2))
sum_of_5_samples= sum( abs( ref_data( maxNsamples - ...
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
- samples_to_skip_at_end - 4: maxNsamples - ...
SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
- samples_to_skip_at_end)));
if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
samples_to_skip_at_end = samples_to_skip_at_end+ 1;
end
end
% fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end);
start_frame = floor( samples_to_skip_at_start/ (Nf/ 2));
stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ...
+ DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ...
/ (Nf/ 2))- 1;
% number of frames in speech data plus DATAPADDING_MSECS
% fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame);
D_disturbance= zeros( stop_frame+ 1, Nb);
DA_disturbance= zeros( stop_frame+ 1, Nb);
power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ...
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ...
maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
% fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg);
hz_spectrum_ref = zeros( 1, Nf/ 2);
hz_spectrum_deg = zeros( 1, Nf/ 2);
frame_is_bad = zeros( 1, stop_frame + 1);
smeared_frame_is_bad = zeros( 1, stop_frame + 1);
silent = zeros( 1, stop_frame + 1);
pitch_pow_dens_ref = zeros( stop_frame + 1, Nb);
pitch_pow_dens_deg = zeros( stop_frame + 1, Nb);
frame_was_skipped = zeros( 1, stop_frame + 1);
frame_disturbance = zeros( 1, stop_frame + 1);
frame_disturbance_asym_add = zeros( 1, stop_frame + 1);
avg_pitch_pow_dens_ref = zeros( 1, Nb);
avg_pitch_pow_dens_deg = zeros( 1, Nb);
loudness_dens_ref = zeros( 1, Nb);
loudness_dens_deg = zeros( 1, Nb);
deadzone = zeros( 1, Nb);
disturbance_dens = zeros( 1, Nb);
disturbance_dens_asym_add = zeros( 1, Nb);
time_weight = zeros( 1, stop_frame + 1);
total_power_ref = zeros( 1, stop_frame + 1);
% fid= fopen( 'tmp_mat.txt', 'wt');
for frame = 0: stop_frame
start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2);
hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ...
start_sample_ref);
utt = Nutterances;
while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ...
> start_sample_ref))
utt= utt - 1;
end
if (utt >= 1)
delay = Utt_Delay(utt);
else
delay = Utt_Delay(1);
end
start_sample_deg = start_sample_ref + delay;
if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ...
maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000)))
hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ...
start_sample_deg);
else
hz_spectrum_deg( 1: Nf/ 2)= 0;
end
pitch_pow_dens_ref( frame+ 1, :)= freq_warping (...
hz_spectrum_ref, Nb, frame);
%peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
pitch_pow_dens_deg( frame+ 1, :)= freq_warping (...
hz_spectrum_deg, Nb, frame);
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
silent(frame+ 1) = (total_audible_pow_ref < 1E7);
end
% fclose( fid);
avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ...
silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ...
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1);
avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ...
silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ...
Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1);
% fid= fopen( 'tmp_mat.txt', 'wt');
% fprintf( fid, '%f\n', avg_pitch_pow_dens_deg);
% fclose( fid);
if (CALIBRATE== 0)
pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ...
pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
avg_pitch_pow_dens_deg, 1000);
if (Plot_Frame>= 0) % plot pitch_pow_dens_ref
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_ref( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'reference signal bark spectrum with frequency compensation');
subplot( 1, 2, 2);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch_pow_dens_deg( Plot_Frame+ 1, :)));
axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
title( 'degraded signal bark spectrum');
end
end
% tmp1= pitch_pow_dens_ref';
MAX_SCALE = 5.0;
MIN_SCALE = 3e-4;
oldScale = 1;
THRESHOLD_BAD_FRAMES = 30;
for frame = 0: stop_frame
total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);
total_power_ref (1+ frame) = total_audible_pow_ref;
scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3);
if (frame > 0)
scale = 0.2 * oldScale + 0.8 * scale;
end
oldScale = scale;
if (scale > MAX_SCALE)
scale = MAX_SCALE;
elseif (scale < MIN_SCALE)
scale = MIN_SCALE;
end
pitch_pow_dens_deg( 1+ frame, :) = ...
pitch_pow_dens_deg( 1+ frame, :) * scale;
if (frame== Plot_Frame)
figure;
subplot( 1, 2, 1);
plot( centre_of_band_hz, 10* log10( eps+ ...
pitch
评论5