Continue to Site

Welcome to EDAboard.com

Welcome to our site! EDAboard.com is an international Electronics Discussion Forum focused on EDA software, circuits, schematics, books, theory, papers, asic, pld, 8051, DSP, Network, RF, Analog Design, PCB, Service Manuals... and a whole lot more! To participate you need to register. Registration is free. Click here to register now.

about this speech recognition using fpga

Status
Not open for further replies.

lgeorge123

Full Member level 2
Joined
Jun 13, 2004
Messages
130
Helped
0
Reputation
0
Reaction score
0
Trophy points
1,296
Location
Hong Kong
Activity points
1,403
iord_altera_avalon_pio_data return value

I found this link about speech recognition using a neural network and an FPGA.
**broken link removed** .

It provides the main code in c as below :
My problem is how to find out the values of normC[12] and mins[12], as I cannot find anyone to ask.


#include <math.h>
#include "system.h"
#include <stdio.h>
#include "sys/alt_irq.h"
#include "altera_avalon_pio_regs.h"

// LCD Stuff
// Escape-sequence bodies for the LCD. None of these strings contains the ESC
// (0x1b) byte itself.
#define ESC_TOP_LEFT "[1;0H"
#define ESC_BOTTOM_LEFT "[2;0H"
#define LCD_CLR "[2J"
#define LCD_CLR_LINE "[K"
//static unsigned char esc = 0x1b; // Integer ASCII value of the ESC character
// ESC followed by the same "[2J" bytes as LCD_CLR. The array has no terminating
// NUL, so it cannot be passed to functions that expect a C string.
unsigned char line[4] = {0x1b, '[', '2', 'J'};

// Stream for the LCD; main() opens it with fopen("/dev/lcd_0", "w") and it is
// NULL when that call fails.
FILE * lcd_fd;

// Just for kicks
#define TRUE 1
#define FALSE 0

// Debugging
// DEBUG is compared against the mode values below in #if (DEBUG == ...) sections
// in main() and classify(); only the matching section is compiled in, and the
// value 0 matches none of them. CEPSTRAL_DEBUG is not tested in the code shown.
#define NNTRAIN_DEBUG 1
#define CEPSTRAL_DEBUG 2
#define NNVOWEL_DEBUG 3
#define NNCLASSIFICATION_DEBUG 4
#define DEBUG 0

// Helpful Altera Defines
// Each name expands to an IORD_ALTERA_AVALON_PIO_DATA() call on the matching
// *_BASE address, so every appearance of the name in an expression evaluates
// that call again.
#define FFT_DONE IORD_ALTERA_AVALON_PIO_DATA(FFTDONE_BASE)
#define FFT_POW IORD_ALTERA_AVALON_PIO_DATA(FFTPOWER_BASE)
#define FFT_EXP IORD_ALTERA_AVALON_PIO_DATA(FFTEXP_BASE)
#define VOWEL IORD_ALTERA_AVALON_PIO_DATA(VOWELID_BASE)
#define TRAIN IORD_ALTERA_AVALON_PIO_DATA(TRAIN_BASE)
#define APP IORD_ALTERA_AVALON_PIO_DATA(APPSWITCHES_BASE)

// MACROS
// LED helpers: each writes `val` to the data register of the named PIO.
#define SET_RED_LED(val) IOWR_ALTERA_AVALON_PIO_DATA(REDLED_BASE, val)
#define SET_GREEN_LED(val) IOWR_ALTERA_AVALON_PIO_DATA(GREENLED_BASE, val)

// Red/green feedback patterns. Wrapped in do { } while (0) so that
// `INVALID_FB();` is a single statement and stays correct as the body of an
// unbraced if/else.
#define INVALID_FB() do {\
    SET_RED_LED(1);\
    SET_GREEN_LED(0);\
} while (0)
#define VALID_FB() do {\
    SET_RED_LED(0);\
    SET_GREEN_LED(1);\
} while (0)
#define REC_FB_OFF() do {\
    SET_RED_LED(0);\
    SET_GREEN_LED(0);\
} while (0)

#define SET_VOWEL_FB(val) IOWR_ALTERA_AVALON_PIO_DATA(VOWELLEDS_BASE, val)
#define FFT_START(val) IOWR_ALTERA_AVALON_PIO_DATA(FFTSTART_BASE, val)
#define FFT_ADDR(val) IOWR_ALTERA_AVALON_PIO_DATA(FFTADDR_BASE, val)

// Absolute value after conversion to signed short / float. The argument is
// parenthesized so the cast applies to the whole expression (ABS(a - b)), not
// just its first operand. The argument is evaluated more than once, so do not
// pass expressions with side effects.
#define ABS(val) (((signed short)(val)) > 0 ? ((signed short)(val)) : (-((signed short)(val))))
#define ABSF(val) (((float)(val)) > 0 ? ((float)(val)) : (-((float)(val))))

// Exchange a and b through a temporary named `tempr`, which the caller must
// declare with a suitable type. Wrapped so that it behaves as one statement.
#define SWAP(a,b) do { tempr=(a);(a)=(b);(b)=tempr; } while (0)

// Possible applications
// Values that main() compares against APP in its switch statement.
#define APP_VOWEL_RECOGNITION 0
#define APP_VOWEL_IDENTIFICATION 1
#define APP_SPEAKER_VERIFICATION 2
#define APP_VERIFY_COMBO 3

// KNN
// Training database for nearestneighbor(): POPULATION_SIZE slots, each holding
// up to TRAINING_SAMPLES vectors of CEPSTRAL_COEFFS values.
#define POPULATION_SIZE 4
#define TRAINING_SAMPLES 1000
#define CEPSTRAL_COEFFS 12
float cepstrumdbase[POPULATION_SIZE][TRAINING_SAMPLES][CEPSTRAL_COEFFS];
// Running per-coefficient sum and sum of squares over every stored sample;
// nearestneighbor() derives the mean and standard deviation from them.
float cepstrumsum[CEPSTRAL_COEFFS];
float cepstrumsumsq[CEPSTRAL_COEFFS];
// Number of samples stored so far in each slot (incremented by addtrain()).
int trainindex[POPULATION_SIZE];
// Total number of samples stored across all slots.
int dbasesize;

// Perceptron
// NOTE(review): ETA and w[] are not referenced by any function shown in this file.
#define ETA (0.1)
#define NUM_WEIGHTS (CEPSTRAL_COEFFS)
float w[NUM_WEIGHTS];

// FFT values
// Filled by main() with the 512 values read back through FFT_ADDR / FFT_POW.
float power[512];

// MFCC stuff
#define PI 3.141592653589793
// NOTE(review): WALPHA / WBETA are only relevant to the commented-out hamming()
// prototype; nothing shown here uses them.
#define WALPHA 0.54
#define WBETA 0.46
// Edge/centre frequencies of the triangular mel filters used by melH(): filter
// i (1..12) spans fc[i-1]..fc[i+1] with its peak at fc[i]. Presumably in Hz,
// matching the fs argument of melshift() -- confirm.
float fc[] = {0,132.83,290.87,478.9,702.61,968.77,1285.4,1662.2,2110.5,2643.8,3278.3,4033.2,4931.4,6000};
// Mel-filtered log spectrum written by melshift() in main().
float xm[12];
// DCT of xm written by dct() in main(); passed to vowelCheck(), classify(),
// addtrain() and nearestneighbor().
float cc[12];

// Array of scaled input MFCCs (scratch buffer overwritten by vowelCheck() and classify())
float tempcc[12];

// Neural net for vowels
// Parameters of the network evaluated by vowelCheck(): 5 first-layer neurons with
// 12 inputs each, feeding a single output.
//
// Input scaling: vowelCheck() maps each coefficient with
// (cc - mins) * normC - 1.0, element by element.
// NOTE(review): these look like per-coefficient minima and 2/(max - min) scale
// factors taken from the training data (min-max scaling to [-1, 1]) -- the
// training code is not part of this file, so confirm against it.
float normC[12] = {0.1256,0.2387,0.3103,0.4629,0.7097,0.7513,0.7551,0.9407,1.0806,1.6268,1.7445,1.8445};
float mins[12] = {35.2227,-3.5023,-2.5545,-3.4447,-1.7989,-1.6293,-1.2367,-1.4114,-1.0041,-0.6698,-0.6260,-0.5228};
// One row of 12 input weights per first-layer neuron.
float inputWeights[5][12] = {
{ -1.2089, 0.2943, 0.7391, 1.6160, -2.1935, -0.7240, -1.3857, -2.3402, -1.9270, -0.9871, -1.2068, 0.4986},
{ 1.7209, 0.7152, 2.4952, -3.2453, 0.0210, -1.6449, 0.9243, 0.8289, 1.4265, 0.3174, -0.7964, 0.0300},
{ 0.2512, 0.2612, 0.4827, 0.1527, 0.4246, 0.1010, 0.3902, 2.6627, 0.1643, -0.9129, 1.3745, -0.7828},
{ -1.3447, 0.2917, -3.1331, -2.6909, 1.1394, 0.3363, 1.7668, -0.8060, 1.5766, -0.6394, 0.7125, -0.5882},
{ -1.7162, -1.0673, -0.9907, 1.6492, 0.4469, 1.5707, -0.6763, -3.7785, 0.7869, -0.3750, -0.2600, -0.2563}
};
// Weight applied to each neuron's tanh output when forming the network output.
float layerWeights[5] = {1.1872,-1.8493,-0.4447,1.7714,-2.0585};
// Bias added to each neuron's weighted sum before tanh.
float layerBias[5] = {1.7583,1.6819,-1.2861,1.9129,1.1419};
// Bias the output accumulator starts from.
float outputBias = -1.2987;
// Scratch storage for the per-neuron weighted sums.
float outputs[5];

// Neural net for verification (2 layer network: 20 first-layer neurons with 12 inputs each, 1 output node)
// Evaluated by classify(). Input scaling follows the same form as the vowel
// network: (cc - mins2) * normC2 - 1.0, element by element.
// NOTE(review): mins2 / normC2 look like per-coefficient minima and
// 2/(max - min) factors from this network's own training set -- confirm against
// the training code, which is not part of this file.
float normC2[12] = { 0.1111, 0.2864, 0.3138, 0.4321, 0.4089, 0.5940, 0.8157, 0.7546, 1.0784, 1.1271, 1.2148, 1.5602};
float mins2[12] = { 31.8608,-1.7785,-3.9666,-3.5837,-3.6057,-1.9164,-1.5185,-1.8338,-1.2547,-1.0079,-1.0161,-0.6993};
// One row of 12 input weights per first-layer neuron.
float inputWeights2[20][12] = {
{ -0.7526, 0.4288,-0.4882,-1.3992, 0.3468, 1.6403, 0.0536, 0.2692,-1.9052,-0.7899,-0.4174, 0.9512},
{ -2.4543, 0.5014, 0.3943, 1.6109, 0.7841,-1.3805,-0.3385, 0.1604, 2.7870, 0.8548,-1.0384, 0.8302},
{ -0.4268,-0.0416, 1.0595, 0.5244, 0.2416,-0.9700,-1.2861, 0.7993,-0.7730, 0.4486,-0.8534, 0.1769},
{ 1.4332,-1.3129,-2.1724, 0.9689,-0.4139, 0.2017, 0.5963, 2.5860, 1.4730,-0.3187,-0.1881,-0.5114},
{ -0.0862,-0.6416, 0.7505, 0.7027, 0.1335,-1.3470, 1.0661,-0.4483, 0.1176, 1.1444, 0.2731,-0.1028},
{ 1.8872,-1.0276, 1.8398,-1.6880, 0.3150, 0.1477, 0.8880,-0.1398, 0.6836, 1.8576,-0.7553,-0.9342},
{ -0.9216, 0.6352, 0.7301,-0.3109,-0.2257,-0.4336, 1.1039, 0.5359, 0.7242, 0.7082,-1.0292, 1.2419},
{ -0.4453,-1.2779,-1.6460, 0.2163,-1.4386, 0.7433,-0.7834, 0.3025,-0.2256,-0.4309, 0.4253, 1.8680},
{ 0.1236,-2.5354, 0.4105,-0.9713,-0.7253,-1.0976,-0.9437,-0.9861,-1.2965, 0.3741,-1.8046, 0.6564},
{ -0.6594,-2.2827, 1.1421, 1.4527, 0.2529, 0.5253,-0.5925,-1.1238,-0.1844,-1.1727, 0.9060,-1.2109},
{ 0.4207,-1.3578,-0.8224, 0.1839,-2.6818,-0.6341, 1.1097, 2.8541,-0.2094, 3.0292,-1.4645,-1.2377},
{ 0.4295, 1.0299,-1.5659, 0.2940,-0.6917,-1.0349, 1.0695, 0.2867,-2.2725, 1.8007, 1.7328,-1.4786},
{ -1.1575,-0.2113,-0.1655,-0.0400, 0.7032,-0.3078, 0.3254,-0.9211, 2.5576, 0.0611, 2.9928,-0.0013},
{ -0.1518,-0.3270, 1.1073, 0.4715,-0.8868, 0.3812,-0.2471,-0.9095,-0.8270,-0.1252, 1.6829, 0.6401},
{ 0.5530,-0.3549,-0.4217, 0.9172, 1.5877,-0.0034,-0.7609, 0.3707,-1.3530,-1.3026, 1.0384, 0.1577},
{ -2.1922, 1.5180, 0.9797, 0.8251, 0.6398, 0.1715, 0.6412, 3.0613, 0.0815, 1.1072, 1.8286, 0.0065},
{ -1.2666,-1.1772, 1.1917,-0.1739,-0.9549, 0.8335,-0.5706, 1.0262, 0.1206,-1.4301,-0.5439, 2.6319},
{ -0.4765, 0.3235,-0.4655, 1.2807, 0.0106,-0.0949,-0.7238, 2.2040, 1.3202, 1.5410,-1.0232, 0.0781},
{ 0.6639, 0.0960,-0.2031, 0.8317,-0.3008,-0.6373,-0.5143, 0.5518,-0.1179,-0.3721, 0.8707, 0.3580},
{ -0.9843, 0.2230, 0.8193,-1.2243, 0.1191,-0.4784,-0.9186, 0.5795,-0.7898,-0.4291,-0.4353,-1.0045}
};
// Weight applied to each neuron's tanh output when forming the network output.
float layerWeights2[20] = {-1.8151,-2.9834,-1.9503,-2.5326,-1.2862,-2.1438, 0.5390, 1.7446, 2.5479,-2.7056, 3.2107,-3.1836, 3.0553,-2.1323, 1.7749, 2.3216,-2.9436,-2.2102,-0.2598, 1.7208};
// Bias added to each neuron's weighted sum before tanh.
float layerBias2[20] = {-1.7852, 2.7261, 1.9236,-2.0655,-0.8399,-1.7943, 0.0652, 0.2180, 1.1294,-0.7624, 0.5999, 0.9947,-0.9433, 0.9724,-0.1905,-1.4673,-1.6549,-1.3535, 1.5971,-2.2682};
// Bias the output accumulator starts from.
float outputBias2 = -1.0170;
// Scratch storage for the per-neuron outputs.
float outputs2[20];

// Utility prototypes
// Print the n values of data to stdout as "[a,b,...]" followed by a newline.
void print_array(float data[], int n);

// MFCC prototypes
// Write the n-point cosine transform of x into y.
void dct(float x[], float y[], int n);
// Response of triangular mel filter i at frequency f (see fc[]).
double melH(int f, int i);
// Reduce the n-point spectrum x to m log10 mel-band values in xm; fs / n is used
// as the frequency step between bins.
void melshift(float x[], int n, float xm[], int m, float fs);
//void hamming(float x[], int n);
//void mfcc(float x[], int n, float cc[], float m, float fs);

// KNN Prototypes
// Reset the training database and its running sums.
void initdbase();
// Append coeffs to database slot speakeridx (ignored once the slot is full).
void addtrain(int speakeridx, float coeffs[]);
// Return the slot index of the closest stored sample, or -1 if none was found.
// A non-zero euclidean selects squared-difference distance, zero selects
// absolute-difference distance.
int nearestneighbor(float coeffs[], unsigned char euclidean);

// Good FFT check (vowel using neural net)
unsigned char vowelCheck(float cc[], int n);

// Speaker verification: two-layer network over the n cepstral coefficients
unsigned char classify(float cc[], int n);

// Fundamental method prototypes
// NOTE(review): no definition of findFundamental() appears in this file.
int findFundamental(float power[]);

// Store cc as a training sample for the vowel selected on the VOWEL input.
// Bits 3..0 select database slots 3..0; if several bits are set the highest one
// wins, and nothing is stored when no bit is set.
static void trainSelectedVowel(float cc[])
{
    // Sample the input once so that every test below sees the same value
    int vowel = VOWEL;

    if (vowel & 0x08)
    {
        addtrain(3, cc);
    }
    else if (vowel & 0x04)
    {
        addtrain(2, cc);
    }
    else if (vowel & 0x02)
    {
        addtrain(1, cc);
    }
    else if (vowel & 0x01)
    {
        addtrain(0, cc);
    }
}

// Program entry point: repeatedly triggers the FFT block, converts the returned
// spectrum to 12 cepstral coefficients (melshift + dct) and runs the application
// currently selected on APP. Never returns.
int main(void)
{
    int temp, i;
    // volatile so the compiler cannot delete the empty settle loop below
    volatile int delay;
    // Named fftExp rather than exp to avoid shadowing exp() from <math.h>
    short fftExp;

    // Lets just initialize the feedback to off to be sure
    SET_VOWEL_FB(0);
    REC_FB_OFF();

    // Make sure start is zero for a while
    FFT_START(0);
    for (delay = 0; delay < 1000000; delay++);

    //open the lcd --- device name from system.h
    lcd_fd = fopen("/dev/lcd_0", "w");
    if (lcd_fd == NULL) printf("Unable to open lcd display\n");

    // Initialize the KNN database
    initdbase();

    while (TRUE) {
        FFT_START(1);
        while (!FFT_DONE); // Wait for the FFT to finish
        FFT_START(0); // Deassert start
        fftExp = FFT_EXP;
        if (fftExp < 62)
        {
            // Read back the spectrum one address at a time
            for (i = 0; i < 512; i++) {
                FFT_ADDR(i);
                power[i] = (float)(short)FFT_POW;
            }

            // shift the spectrum into the mel scale
            melshift(power, 512, xm, 12, 32000);

            // compute the dct of the mel spectrum
            dct(xm, cc, 12);

#if (DEBUG == NNVOWEL_DEBUG)
            if (vowelCheck(cc,12))
            {
                printf("VOWEL\n");
            }
            else
            {
                printf("NOT VOWEL\n");
            }
#endif
#if (DEBUG == NNTRAIN_DEBUG)
            if (vowelCheck(cc,12))
            {
                print_array(cc,12);
            }
#endif
            // Do the selected application
            switch (APP)
            {
            // Vowel checking, light green if vowel, red if not
            case APP_VOWEL_RECOGNITION:
                // Make sure vowel id feedback is off
                SET_VOWEL_FB(0);

                // Check for a vowel, display LED feedback
                if (vowelCheck(cc,12))
                {
                    VALID_FB();
                }
                else
                {
                    INVALID_FB();
                }
                break;

            // Vowel identification, light the nearest neighbor trained vowel
            case APP_VOWEL_IDENTIFICATION:
                // Make sure verification feedback is off
                REC_FB_OFF();

                // If we have a vowel, depending on training button state, train or classify, show feedback
                if (vowelCheck(cc,12))
                {
                    if (TRAIN)
                    {
                        trainSelectedVowel(cc);
                    }
                    else
                    {
                        temp = nearestneighbor(cc, FALSE);
                        // nearestneighbor() returns -1 when nothing matched (for
                        // example an untrained database); shifting by a negative
                        // count is undefined, so show no vowel in that case.
                        SET_VOWEL_FB(temp >= 0 ? (1 << temp) : 0);
                    }
                }
                break;

            case APP_SPEAKER_VERIFICATION:
                // Make sure vowel id feedback is off
                SET_VOWEL_FB(0);

                // If we have a vowel, try to verify speaker is Parker
                if (vowelCheck(cc,12))
                {
                    if (classify(cc,12))
                    {
                        VALID_FB();
                        printf("PARKER\n");
                    }
                    else
                    {
                        INVALID_FB();
                        printf("NOT PARKER\n");
                    }
                }
                break;

            case APP_VERIFY_COMBO:
                // Make sure verification feedback is off
                REC_FB_OFF();

                // If we have a vowel, depending on training button state, train or classify, show feedback
                if (vowelCheck(cc,12))
                {
                    if (TRAIN)
                    {
                        trainSelectedVowel(cc);
                    }
                    else
                    {
                        // classify() must run first: nearestneighbor() normalizes cc in place
                        if (classify(cc,12) && nearestneighbor(cc, TRUE) == 0)
                        {
                            VALID_FB();
                            printf("PARKER\n");
                        }
                        else
                        {
                            INVALID_FB();
                            printf("NOT PARKER\n");
                        }
                    }
                }
                break;

            default:
                // Unknown application selection: leave the feedback untouched
                break;
            }
        }
        else
        {
            // No significant sound is coming in, just turn all feedback off
            SET_VOWEL_FB(0);
            REC_FB_OFF();
        }
    }
}

// Check if this spectrum is a vowel (not noise).
//
// Scales the cepstral coefficients into tempcc, runs them through the 5-neuron
// tanh layer described by inputWeights / layerBias / layerWeights, and applies
// tanh to the biased sum of the neuron outputs.
//
// cc: cepstral coefficients; the scaling loop always reads 12 of them.
// n:  number of inputs fed to each neuron; the weight tables have 12 columns,
//     so n must not exceed 12.
// Returns TRUE when the network output is above 0.4, FALSE otherwise.
unsigned char vowelCheck(float cc[], int n)
{
    int i, j;
    float out;

    // Normalize the ccs
    for (i = 0; i < 12; i++)
    {
        tempcc[i] = (cc[i] - mins[i]) * normC[i] - 1.0;
    }

    // Initialize the output (with the bias we will add)
    out = outputBias;

    // Get the network outputs
    for (i = 0; i < 5; i++)
    {
        // Calculate the net sum of the weighted inputs
        outputs[i] = 0;
        for (j = 0; j < n; j++)
        {
            outputs[i] += tempcc[j] * inputWeights[i][j];
        }

        // The input to the second layer is the weighted transfer fnc of the biased net sum of weighted inputs
        out += tanh(outputs[i] + layerBias[i]) * layerWeights[i];
    }

    // Now we apply the transfer function of the second layer output
    out = tanh(out);

    // We apply an arbitrary cutoff for what is definitely a vowel
    if (out > 0.4)
    {
        return TRUE;
    }
    else
    {
        return FALSE;
    }
}

// Print the n values of data to stdout as "[v0,v1,...]" followed by a newline.
// Each value is formatted with %f; n <= 0 prints "[]".
void print_array(float data[], int n)
{
    int i;

    printf("[");
    for (i = 0; i < n; i++)
    {
        // Print the element itself; passing the array pointer to %f is undefined
        printf("%f", data[i]);
        if (i != n-1)
        {
            printf(",");
        }
    }
    printf("]\n");
}

// shift the n point spectrum x into the mel frequency m point spectrum xm
//
// For each of the m filters, accumulates x[j] weighted by melH() at the bin's
// frequency j * (fs / n), then stores log10 of that sum in xm[i].
//
// x:  input spectrum, n values
// xm: output, m values (this parameter hides the global of the same name)
// fs: divided by n to obtain the frequency step between bins
//
// Note: melH() takes its frequency as an int, so j * deltaf is truncated when
// it is passed. A band whose weighted sum is zero produces log10(0).
void melshift(float x[], int n, float xm[], int m, float fs)
{
    int i, j;

    float deltaf = fs / n;
    for (i = 0; i < m; i++) {
        xm[i] = 0.0;
        for (j = 0; j < n; j++) {
            // Filters are numbered from 1 in melH()
            xm[i] += x[j] * melH(j * deltaf, i + 1);
        }
        xm[i] = log10(xm[i]);
    }
}

// compute the value of the mel triangle filter bank i at frequency f
//
// Filter i is zero outside [fc[i-1], fc[i+1]), rises linearly from 0 at fc[i-1]
// to 1 at fc[i], and falls linearly back to 0 at fc[i+1].
//
// f: frequency, in the same units as the entries of fc[]
// i: filter number; fc[i-1] and fc[i+1] are read, so with the 14-entry fc[]
//    table i must be in 1..12
double melH(int f, int i)
{
    // Outside the triangle
    if (f < fc[i-1] || f >= fc[i+1]) return 0;
    // Rising edge
    if (f < fc[i]) return (f - fc[i-1]) / (fc[i] - fc[i-1]);
    // Falling edge
    else return (f - fc[i+1]) / (fc[i] - fc[i+1]);
}

// O(n2) dct - simple but slow (use for small vectors only)
//
// Writes y[i] = sum over j of x[j] * cos(i * (PI / n) * (j + 0.5)) for
// i = 0..n-1. No scaling factor is applied to the result.
//
// x: input, n values
// y: output, n values
void dct(float x[], float y[], int n)
{
    int i, j;

    double pn = PI / n;
    for (i = 0; i < n; i++) {
        y[i] = 0.0;
        for (j = 0; j < n; j++) {
            y[i] += x[j] * cos(i * pn * (j + 0.5));
        }
    }
}

// KNN initialize
//
// Empties the training database: clears the total and per-slot sample counts,
// zeroes every stored coefficient, and resets the running sum and
// sum-of-squares for each coefficient.
void initdbase()
{
    int i, j, k;

    dbasesize = 0;
    for (i = 0; i < POPULATION_SIZE; i++) {
        trainindex[i] = 0;
        for (j = 0; j < TRAINING_SAMPLES; j++) {
            for (k = 0; k < CEPSTRAL_COEFFS; k++) {
                cepstrumdbase[i][j][k] = 0.0f;
            }
        }
    }
    for (i = 0; i < CEPSTRAL_COEFFS; i++) {
        cepstrumsum[i] = 0.0f;
        cepstrumsumsq[i] = 0.0f;
    }
}

void addtrain(int speakeridx, float coeffs[])
{
int i;

int idx = trainindex[speakeridx];
// don't overwrite the array
if (idx >= TRAINING_SAMPLES) return;

dbasesize++;
for (i=0; i<CEPSTRAL_COEFFS; i++) {
float c = coeffs;
cepstrumdbase[speakeridx][idx] = c;
cepstrumsum += c;
cepstrumsumsq += c*c;
}
trainindex[speakeridx]++;
}

// Find the database slot that holds the stored sample closest to coeffs.
//
// coeffs:    CEPSTRAL_COEFFS values. They are normalized IN PLACE (mean removed,
//            divided by the standard deviation of the whole database), so the
//            caller's array is modified unless the database is empty.
// euclidean: non-zero accumulates squared differences, zero accumulates
//            absolute differences.
//
// Returns the slot index (0 .. POPULATION_SIZE-1) of the closest stored sample,
// or -1 when the database is empty or no sample has a distance below 99999.
int nearestneighbor(float coeffs[], unsigned char euclidean)
{
    int i, j, k;
    float mean[CEPSTRAL_COEFFS];
    float stdev[CEPSTRAL_COEFFS];
    float var, val, norm, diff;
    float dist, bestdist = 99999.0f;
    int closespeaker = -1;
    int samples;

    // Nothing has been trained yet: avoid dividing by a zero sample count
    if (dbasesize <= 0) return -1;

    // The statistics depend only on the coefficient index, so compute them once
    // here rather than again for every stored sample.
    for (k = 0; k < CEPSTRAL_COEFFS; k++) {
        mean[k] = cepstrumsum[k] / dbasesize;
        var = cepstrumsumsq[k] / dbasesize - mean[k]*mean[k];
        // A single sample (or identical samples, or rounding) gives a variance
        // that is zero or slightly negative; fall back to unit scale instead of
        // dividing by zero or taking the square root of a negative number.
        stdev[k] = (var > 0.0f) ? (float)sqrt(var) : 1.0f;

        // normalize coeffs
        coeffs[k] = (coeffs[k] - mean[k]) / stdev[k];
    }

    // compare to each vector in the database
    for (i = 0; i < POPULATION_SIZE; i++) {
        samples = trainindex[i];
        for (j = 0; j < samples; j++) {
            dist = 0.0f;
            for (k = 0; k < CEPSTRAL_COEFFS; k++) {
                // normalize this coeff
                val = cepstrumdbase[i][j][k];
                norm = (val - mean[k]) / stdev[k];
                diff = coeffs[k] - norm;
                if (euclidean)
                    dist += diff*diff;
                else
                    dist += ABSF(diff);
            }
            if (dist < bestdist) {
                bestdist = dist;
                closespeaker = i;
            }
        }
    }

    return closespeaker;
}

// Classify using the two-layer verification network.
//
// Scales the cepstral coefficients into tempcc, runs them through the 20-neuron
// tanh layer described by inputWeights2 / layerBias2 / layerWeights2, and
// applies tanh to the biased sum of the weighted neuron outputs.
//
// cc: cepstral coefficients; the scaling loop always reads 12 of them.
// n:  number of inputs fed to each neuron; the weight tables have 12 columns,
//     so n must not exceed 12.
// Returns TRUE when the network output is above 0.9, FALSE otherwise.
unsigned char classify(float cc[],int n)
{
    int i, j;
    float out;

    // Normalize the ccs
    for (i = 0; i < 12; i++)
    {
        tempcc[i] = (cc[i] - mins2[i]) * normC2[i] - 1.0;
    }

#if (DEBUG == NNCLASSIFICATION_DEBUG)
    print_array(cc,12);
#endif

    // Initialize the output
    out = outputBias2;

    // Get the input layer outputs (20 neurons)
    for (i = 0; i < 20; i++)
    {
        // Calculate the net sum of the weighted inputs (for input layer)
        outputs2[i] = 0;
        for (j = 0; j < n; j++)
        {
            outputs2[i] += tempcc[j] * inputWeights2[i][j];
        }

        // Calculate the biased transferred output for the input layer
        outputs2[i] = tanh(outputs2[i] + layerBias2[i]);
        out += outputs2[i] * layerWeights2[i];
    }

    // Now we apply the transfer function of the second layer output
    out = tanh(out);

#if (DEBUG == NNCLASSIFICATION_DEBUG)
    print_array(tempcc,12);
    printf("Net Value: %f\n",out);
#endif

    // We want to be really sure its me
    if (out > 0.9)
    {
        return TRUE;
    }
    else
    {
        return FALSE;
    }
}
 

Re: iord_altera_avalon_pio_data return value

Hi There,

lgeorge123 said:
It provides the main code in c as below :
My problems are how to find out the values of normc[12] and mins[12] ??? As I can not find them to ask !!!!!

I think the answer to your question is actually in the code you sent with your question:

// Neural net for vowels
float normC[12] = {0.1256,0.2387,0.3103,0.4629,0.7097,0.7513,0.7551,0.9407,1.0806,1.6268,1.7445,1.8445};
float mins[12] = {35.2227,-3.5023,-2.5545,-3.4447,-1.7989,-1.6293,-1.2367,-1.4114,-1.0041,-0.6698,-0.6260,-0.5228};

As you can see, the two are defined as 12-element floating-point arrays, and their values are set by the initializers in the declarations.

I hope this answers your question, and thanks a lot for the link. It is really interesting; I may try to port it to my Xilinx-based FPGA board.

Cheers,
/Farhad Abdolian
 

Status
Not open for further replies.

Part and Inventory Search

Welcome to EDABoard.com

Sponsor

Back
Top