Help needed in matlab code

Status
Not open for further replies.

mzoh

Newbie level 4
Joined
Nov 2, 2017
Messages
6
Helped
0
Reputation
0
Reaction score
0
Trophy points
1
Activity points
63
Hello,

I am working on audio classification using svm and i found this code from link:

https://github.com/mosamdabhi/Voice...n-Speech-Recognition-System-Machine-Learning-

but its gving errors, data set is not shared in this code, i am using other data set, in svmtrain i am getting errors about

thisLab = str2num(Characterstring(1:end-4));
trainedSVM = svmtrain(net_TrainingSet,net_Labels);
error: svmtrain Y and TRAINING must have the same number of rows

i have made some changes in this code like wavread changed my audioread and some other function, please help me out to run this code.



Code:
% Author: Mosam

clc;
clear all;
close all;

listing = dir('C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Train1\*.wav');

for g=1:length(listing);
    names{g} = listing(g).name;
end

labels=[];
TrainingSet=[];
Accuracy=[];

for xx=1:length(listing);
    [x,Fs] = audioread([ 'C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Train1\' names{xx}]);
    
    
    
    %N = length(x); % signal length
    %n = 0:N-1;
    %ts = n*(1/Fs); % time for signal
    
    %wintype = 'rectwin';
    Nlen = 320;
    Nshft = 160;
    
    steng=calsteng(x,Nlen,Nshft);
    
    Characterstring = names{xx};
    Characterstring = Characterstring(1:end-4);
    thisLab = str2num(Characterstring(1:end-4));
    labels = [labels thisLab];
    GroupVariable = labels';
    
    
    
    %labels_row = cellstr(R)
    %y=1/x;
    
    %disp([names{xx} num2str(length(steng))])
    InitDiff = diff(steng(1:length(steng)));
    
    InitDiffthreshold = (1/100)*max(steng);
    max(InitDiff);
    %M = max(find(abs(T) > (t)));
    stengmin = min(find(abs(InitDiff) > (InitDiffthreshold)));
    
    FlippedInitDiff = fliplr(InitDiff);
    max(FlippedInitDiff);
    Thresholdcut = find(abs(FlippedInitDiff) > (InitDiffthreshold), 1 );
    stengmax = length(FlippedInitDiff)-Thresholdcut;
    
    
    factor = length(x)/length(steng);
    
    %i = stengmax;
    xlimupper = factor*stengmax;
    
    %j = stengmin;
    xlimlower = factor*stengmin;
    
    
    ax1 = subplot(3,1,1);
    plot(x); hold on;
    %# vertical line
    hx = plot(xlimlower, 'LineStyle',':', 'Color',[.7 .7 .7]);
   % changedependvar(hx,'x');
    hx = plot(xlimupper, 'LineStyle',':', 'Color',[.7 .7 .7]);
  %  changedependvar(hx,'x');
    hold off;
    
    title(names{xx})
    
    
    
    subplot(3,1,2);
    plot(steng,'r');
    
    
    ax2 = subplot(3,1,3);
    %axis([3500,9625,-1,1]);
    plot(x);
    xlim(ax2,[xlimlower xlimupper]);
    
    Tw = 25;                % analysis frame duration (ms)
    Ts = 10;                % analysis frame shift (ms)
    alpha = 0.97;           % preemphasis coefficient
    M = 20;                 % number of filterbank channels
    C = 12;                 % number of cepstral coefficients
    L = 22;                 % cepstral sine lifter parameter
    LF = 300;               % lower frequency limit (Hz)
    HF = 3700;              % upper frequency limit (Hz)
    wav_file = ([ 'C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Train1\' names{xx}]);   % input audio filename
    
    [ speech, fs ] = audioread( wav_file );
    
    
    % Feature extraction (feature vectors as columns)
%     [ MFCCs, FBEs, frames ] = ...
%         mfcc( speech, fs, Tw, Ts, alpha, @hamming, [LF HF], M, C+1, L );
    [ MFCCs, FBEs, frames ] = ...
                    mfcc( speech, fs, Tw, Ts, alpha, @hamming, [LF HF], M, C+1, L );
    
    % Generate data needed for plotting
    [ Nw, NF ] = size( frames );                % frame length and number of frames
    time_frames = [0:NF-1]*Ts*0.001+0.5*Nw/fs;  % time vector (s) for frames
    time = [ 0:length(speech)-1 ]/fs;           % time vector (s) for signal samples
    logFBEs = 20*log10( FBEs );                 % compute log FBEs for plotting
    logFBEs_floor = max(logFBEs(:))-50;         % get logFBE floor 50 dB below max
    logFBEs( logFBEs<logFBEs_floor ) = logFBEs_floor; % limit logFBE dynamic range
    
    
    % Generate plots
    %figure('Position', [30 30 800 600], 'PaperPositionMode', 'auto', ...
     %   'color', 'w', 'PaperOrientation', 'landscape', 'Visible', 'on' );
    
    ax2 = subplot(3,1,1);
    plot(x);
    xlim(ax2,[xlimlower xlimupper])
    xlabel( 'Time (s)' );
    ylabel( 'Amplitude' );
    title( 'Speech waveform');
    
    subplot(3,1,2);
    imagesc( time_frames, [1:M], logFBEs );
    axis( 'xy' );
    xlim( [ min(time_frames) max(time_frames) ] );
    xlabel( 'Time (s)' );
    ylabel( 'Channel index' );
    title( 'Log (mel) filterbank energies');
    
    subplot(3,1,3);
    imagesc( time_frames, [1:C], MFCCs(2:end,:) ); % HTK's TARGETKIND: MFCC
    %imagesc( time_frames, [1:C+1], MFCCs );       % HTK's TARGETKIND: MFCC_0
    %R = imagesc
    
    axis( 'xy' );
    xlim( [ min(time_frames) max(time_frames) ] )
    xlabel( 'Time (s)' );
    ylabel( 'Cepstrum index' );
    title( 'Mel frequency cepstrum' );
    Energy = MFCCs';
    R1 = Energy;
    R1 = R1(1:15:length(Energy),:);
    
    
    %Energy = MFCCs(4,:);
    %R2 = Energy';
    %R1 = R2;
    
    %Feature1 = var(R1);
    %Feature2 = mean(R1);
    %CovarianceFeat = cov(R1);
    %Feature3 = CovarianceFeat(:)';
    %Feature4 = std(R1);
    %CorrcoefFeat = corrcoef(R1);
    %Feature5 = CorrcoefFeat(:)';
    %Feature6 = mode(R1);
    %Feature7 = median(R1);
    %Feature8 = min(R1);
    %Feature9 = max(R1);
    %NetFeatures = [Feature1 Feature2 Feature3 Feature4 Feature6 Feature7 Feature8 Feature9];
    
    
    
    TrainingSet = [R1; TrainingSet];
    
    
end

labels_rows = GroupVariable(:);
labels_appending_matrix = [];
for i1=1:size(TrainingSet,1)
    labels_appending_matrix = [labels_rows; labels_appending_matrix]; 
end

listing3 = dir('C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Train2\*.wav');

for g3=1:length(listing3); 
    names{g3} = listing3(g3).name;
end

TrainingSet_Train2=[];
labels3=[];


for xx3=1:length(listing3);
    [x3,Fs] = audioread([ 'C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Train2\' names{xx3}]);
    
    
    
    %N = length(x); % signal length
    %n = 0:N-1;
    %ts = n*(1/Fs); % time for signal
    
    %wintype = 'rectwin';
    Nlen = 320;
    Nshft = 160;
    
    steng3=calsteng(x3,Nlen,Nshft);
    
    Characterstring3 = names{xx3};
    Characterstring3 = Characterstring3(1:end-4);
    thisLab3 = str2num([Characterstring3(:)]);
    labels3 = [labels3 thisLab3];
    GroupVariable3 = labels3';
    
    
    InitDiff3 = diff(steng3(1:length(steng3)));
    
    InitDiffthreshold3 = (1/100)*max(steng3);
    max(InitDiff3);
%     M = max(find(abs(T) > (t)));
    stengmin3 = find(abs(InitDiff3) > (InitDiffthreshold3), 1 );
    
    FlippedInitDiff3 = fliplr(InitDiff3);
    max(FlippedInitDiff3);
    Thresholdcut3 = find(abs(FlippedInitDiff3) > (InitDiffthreshold3), 1 );
    stengmax3 = length(FlippedInitDiff3)-Thresholdcut3;
    
    
    factor3 = length(x3)/length(steng3);
    
    %i = stengmax;
    xlimupper3 = factor3*stengmax3;
    
    %j = stengmin;
    xlimlower3 = factor3*stengmin3;
    
    
    ax123 = subplot(3,1,1);
    plot(x3); hold on;
    %# vertical line
    hx3 = plot(xlimlower3, 'LineStyle',':', 'Color',[.7 .7 .7]);
%     changedependvar(hx3,'x');
    hx3 = plot(xlimupper3, 'LineStyle',':', 'Color',[.7 .7 .7]);
   % changedependvar(hx3,'x');
    hold off;
    
    title(names{xx3})
    
    
    
    subplot(3,1,2);
    plot(steng3,'r');
    
    
    ax223 = subplot(3,1,3);
    %axis([3500,9625,-1,1]);
    plot(x3);
    xlim(ax223,[xlimlower3 xlimupper3])
    
    Tw = 25;                % analysis frame duration (ms)
    Ts = 10;                % analysis frame shift (ms)
    alpha = 0.97;           % preemphasis coefficient
    M = 20;                 % number of filterbank channels
    C = 12;                 % number of cepstral coefficients
    L = 22;                 % cepstral sine lifter parameter
    LF = 300;               % lower frequency limit (Hz)
    HF = 3700;              % upper frequency limit (Hz)
    wav_file = ([ 'C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Train2\' names{xx3}]);   % input audio filename
    
    [ speech3, fs3 ] = audioread( wav_file );
    
    
    % Feature extraction (feature vectors as columns)
    [ MFCCs3, FBEs3, frames3 ] = ...
        mfcc( speech3, fs3, Tw, Ts, alpha, @hamming, [LF HF], M, C+1, L );
    
    
    % Generate data needed for plotting
    [ Nw3, NF3 ] = size( frames3 );                % frame length and number of frames
    time_frames3 = [0:NF3-1]*Ts*0.001+0.5*Nw3/fs3;  % time vector (s) for frames
    time3 = [ 0:length(speech3)-1 ]/fs3;           % time vector (s) for signal samples
    logFBEs3 = 20*log10( FBEs3 );                 % compute log FBEs for plotting
    logFBEs_floor3 = max(logFBEs3(:))-50;         % get logFBE floor 50 dB below max
    logFBEs3( logFBEs3<logFBEs_floor3 ) = logFBEs_floor3; % limit logFBE dynamic range
    
    
    % Generate plots
    %figure('Position', [30 30 800 600], 'PaperPositionMode', 'auto', ...
     %   'color', 'w', 'PaperOrientation', 'landscape', 'Visible', 'on' );
    
    ax223 = subplot(3,1,1);
    plot(x3);
    xlim(ax223,[xlimlower3 xlimupper3]);
    xlabel( 'Time (s)' );
    ylabel( 'Amplitude' );
    title( 'Speech waveform');
    
    subplot(3,1,2);
    imagesc( time_frames3, [1:M], logFBEs3 );
    axis( 'xy' );
    xlim( [ min(time_frames3) max(time_frames3) ] );
    xlabel( 'Time (s)' );
    ylabel( 'Channel index' );
    title( 'Log (mel) filterbank energies');
    
    subplot(3,1,3);
    imagesc( time_frames3, [1:C], MFCCs3(2:end,:) ); % HTK's TARGETKIND: MFCC
    %imagesc( time_frames, [1:C+1], MFCCs );       % HTK's TARGETKIND: MFCC_0
    %R = imagesc
    
    axis( 'xy' );
    xlim( [ min(time_frames3) max(time_frames3) ] );
    xlabel( 'Time (s)' );
    ylabel( 'Cepstrum index' );
    title( 'Mel frequency cepstrum' );
    Energy3 = MFCCs3';
    R3 = Energy3;
    R3 = R3(1:15:length(Energy3),:);
    
    %R223 = Energy3';
    
    
    %R123 = R223;
    
    %Feature1_3 = var(R123);
    %Feature2_3 = mean(R123);
    %CovarianceFeat = cov(R123);
    %Feature3_3 = CovarianceFeat(:)';
    %Feature4_3 = std(R123);
    %CorrcoefFeat = corrcoef(R12);
    %Feature5_2 = CorrcoefFeat(:)';
    %Feature6_3 = mode(R123);
    %Feature7_3 = median(R123);
    %Feature8_3 = min(R123);
    %Feature9_3 = max(R123);
    %NetFeatures1_3 = [Feature1_3 Feature2_3 Feature3_3 Feature4_3 Feature6_3 Feature7_3 Feature8_3 Feature9_3];
    
    
    
    TrainingSet_Train2 = [R3; TrainingSet_Train2];
 
    
end


labels_rows3 = GroupVariable3(:);
labels_appending_matrix3 = [];
for i3=1:size(TrainingSet_Train2,1)
    labels_appending_matrix3 = [labels_rows3; labels_appending_matrix3]; 
end

net_TrainingSet = [TrainingSet; TrainingSet_Train2];
net_Labels = [labels_appending_matrix; labels_appending_matrix3];
trainedSVM = svmtrain(net_TrainingSet,net_Labels);


testSet = dir('C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Test*.wav');

for g2=1:length(testSet); 
    names{g2} = testSet(g2).name;
end

TrainingSet_Test=[];
output=[];
labels2=[];
for xx2=1:length(testSet);
    [x2,Fs] = audioread([ 'C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Test\' names{xx2}]);
    
    
    
    %N = length(x); % signal length
    %n = 0:N-1;
    %ts = n*(1/Fs); % time for signal
    
    %wintype = 'rectwin';
    Nlen = 320;
    Nshft = 160;
    
    steng2=calsteng(x2,Nlen,Nshft);
    
    Characterstring2 = names{xx2};
    Characterstring2 = Characterstring2(1:end-4);
    thisLab2 = str2num([Characterstring2(4)]);
    labels2 = [labels2 thisLab2];
    GroupVariable2 = labels2';
    
    
    InitDiff2 = diff(steng2(1:length(steng2)));
    
    InitDiffthreshold2 = (1/100)*max(steng2);
    max(InitDiff2);
    %M = max(find(abs(T) > (t)));
    stengmin2 = min(find(abs(InitDiff2) > (InitDiffthreshold2)));
    
    FlippedInitDiff2 = fliplr(InitDiff2);
    max(FlippedInitDiff2);
    Thresholdcut2 = min(find(abs(FlippedInitDiff2) > (InitDiffthreshold2)));
    stengmax2 = length(FlippedInitDiff2)-Thresholdcut2;
    
    
    factor2 = length(x2)/length(steng2);
    
    %i = stengmax;
    xlimupper2 = factor2*stengmax2;
    
    %j = stengmin;
    xlimlower2 = factor2*stengmin2;
    
    
    ax12 = subplot(3,1,1);
    plot(x2); hold on;
    %# vertical line
    hx2 = plot(xlimlower2, 'LineStyle',':', 'Color',[.7 .7 .7]);
    %changedependvar(hx2,'x');
    hx2 = plot(xlimupper2, 'LineStyle',':', 'Color',[.7 .7 .7]);
    %changedependvar(hx2,'x');
    hold off;
    
    title(names{xx2})
    
    
    
    subplot(3,1,2);
    plot(steng2,'r');
    
    
    ax22 = subplot(3,1,3);
    %axis([3500,9625,-1,1]);
    plot(x2);
    xlim(ax22,[xlimlower2 xlimupper2])
    
    Tw = 25;                % analysis frame duration (ms)
    Ts = 10;                % analysis frame shift (ms)
    alpha = 0.97;           % preemphasis coefficient
    M = 20;                 % number of filterbank channels
    C = 12;                 % number of cepstral coefficients
    L = 22;                 % cepstral sine lifter parameter
    LF = 300;               % lower frequency limit (Hz)
    HF = 3700;              % upper frequency limit (Hz)
    wav_file = ([ 'C:\Users\hp\Downloads\Compressed\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Voice-Based-Digit-Recognition-Speech-Recognition-System-Machine-Learning--master\Test\' names{xx2}]);   % input audio filename
    
    [ speech2, fs2 ] = audioread( wav_file );
    
    
    % Feature extraction (feature vectors as columns)
    [ MFCCs2, FBEs2, frames2 ] = ...
        mfcc( speech2, fs2, Tw, Ts, alpha, @hamming, [LF HF], M, C+1, L );
    
    
    % Generate data needed for plotting
    [ Nw2, NF2 ] = size( frames2 );                % frame length and number of frames
    time_frames2 = [0:NF2-1]*Ts*0.001+0.5*Nw2/fs2;  % time vector (s) for frames
    time2 = [ 0:length(speech2)-1 ]/fs2;           % time vector (s) for signal samples
    logFBEs2 = 20*log10( FBEs2 );                 % compute log FBEs for plotting
    logFBEs_floor2 = max(logFBEs2(:))-50;         % get logFBE floor 50 dB below max
    logFBEs2( logFBEs2<logFBEs_floor2 ) = logFBEs_floor2; % limit logFBE dynamic range
    
    
    % Generate plots
    %figure('Position', [30 30 800 600], 'PaperPositionMode', 'auto', ...
     %   'color', 'w', 'PaperOrientation', 'landscape', 'Visible', 'on' );
    
    ax22 = subplot(3,1,1);
    plot(x2);
    xlim(ax22,[xlimlower2 xlimupper2]);
    xlabel( 'Time (s)' );
    ylabel( 'Amplitude' );
    title( 'Speech waveform');
    
    subplot(3,1,2);
    imagesc( time_frames2, [1:M], logFBEs2 );
    axis( 'xy' );
    xlim( [ min(time_frames2) max(time_frames2) ] );
    xlabel( 'Time (s)' );
    ylabel( 'Channel index' );
    title( 'Log (mel) filterbank energies');
    
    subplot(3,1,3);
    imagesc( time_frames2, [1:C], MFCCs2(2:end,:) ); % HTK's TARGETKIND: MFCC
    %imagesc( time_frames, [1:C+1], MFCCs );       % HTK's TARGETKIND: MFCC_0
    %R = imagesc
    
    axis( 'xy' );
    xlim( [ min(time_frames2) max(time_frames2) ] );
    xlabel( 'Time (s)' );
    ylabel( 'Cepstrum index' );
    title( 'Mel frequency cepstrum' );
     
    Energy2 = MFCCs2'
    %Energy2 = MFCCs2';
    %R2 = Energy2;
    %R2 = R2(1:15:length(Energy2),:);
    
    
    
    
    TrainingSet_Test = [Energy2; TrainingSet_Test];
 
   

    
end

%net_TrainingSet = [TrainingSet; TrainingSet_Train2];


labels_rows2 = GroupVariable2(1,:);
labels_appending_matrix2 = [];
for i2=1:size(TrainingSet_Test,1)
    labels_appending_matrix2 = [labels_rows2; labels_appending_matrix2]; 
end



%trainedSVM = svmtrain(net_TrainingSet,net_Labels);
for i=1:size(TrainingSet_Test,1)
    tmp = svmclassify(trainedSVM,TrainingSet_Test(i,:));
    output = [output tmp];
end
 %TRY=[];
 output = mode(output)
 %acctf = ((output-labels2)./labels2);
 %accsum = sum(((acctf')/i)*100);
 %accuracy = 100 - accsum;
 TRY = [output' labels_appending_matrix2(1,:)];
 EVAL = Evaluate(labels_appending_matrix2(1,:),output'); 
 %stats = confusionmatStats(labels2,output);

%Accuracy = [output-labels]
%[svmstruct,level] = Train_DSVM(TrainingSet,GroupVariable)
 
Last edited by a moderator:

Status
Not open for further replies.
Cookies are required to use this site. You must accept them to continue using the site. Learn more…