Continue to Site

Welcome to EDAboard.com

Welcome to our site! EDAboard.com is an international Electronics Discussion Forum focused on EDA software, circuits, schematics, books, theory, papers, asic, pld, 8051, DSP, Network, RF, Analog Design, PCB, Service Manuals... and a whole lot more! To participate you need to register. Registration is free. Click here to register now.

Multiply Verilog code does not multiply

Status
Not open for further replies.

promach

Advanced Member level 4
Advanced Member level 4
Joined
Feb 22, 2016
Messages
1,202
Helped
2
Reputation
4
Reaction score
5
Trophy points
1,318
Activity points
11,643
Why does the following multiply Verilog code not multiply?

Besides, in order to debug the code, I need access to the variable "middle_layers", but gtkwave is not giving me access to it. Why ?

I have also tried formal verification using yosys-smtbmc, but surprisingly the code failed even the simplest cover(in_valid) which I do not understand at all.


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
// Signed multiplier, C = A*B, written as a pipelined row adder tree with the
// modified Baugh-Wooley sign handling (see the block comment below).
// NOTE(review): as written the datapath does not compute A*B; each problem is
// flagged with a NOTE(review) comment next to the statement that causes it.
module multiply(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B
 
parameter A_WIDTH = 16; // bit width of in_A
parameter B_WIDTH = 16; // bit width of in_B
 
input clk, reset; // reset is sampled on posedge clk by the registers and also gates out_C combinationally
input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
input signed [(A_WIDTH-1):0] in_A;
input signed [(B_WIDTH-1):0] in_B;
output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
output reg out_valid; // to signify that out_C is valid, multiplication finished
 
/* 
   This signed multiplier code architecture is a combination of row adder tree and 
   modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
   with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
 
   see http://i.imgur.com/NaqjC6G.png or 
   Row Adder Tree Multipliers in http://www.andraka.com/multipli.php or
   http://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10
   regarding the mechanisms within all layers
 
   In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
   closer to the input take up real estate (remember the structure of row adder tree).  As the 
   size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
   placement that does not use long routes involving multiple switch nodes for FPGA.  The result
   is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
 
   For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
   sign extension (implemented as verilog example in http://www.dsprelated.com/showarticle/555.php),
   thus smaller final routed silicon area.
 
   http://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/
 
   All layers are pipelined, so throughput = one result for each clock cycle 
   but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
*/
 
 
// The multiplication of two numbers is equivalent to adding as many copies of one 
// of them, the multiplicand, as the value of the other one, the multiplier.
// Therefore, multiplicand always have the larger width compared to multipliers
 
localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
 
// Operand routing: the wider input becomes the multiplicand. When the widths
// are equal, in_B is the multiplicand and in_A the multiplier.
// Both wires are declared without "signed".
wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;
 
// Number of register layers held in middle_layers.
localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
 
 
/*Binary multiplications and additions for partial products rows*/
 
// first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
// This resulted in a binary tree with faster vertical addition processes as we have 
// lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
 
// intermediate partial product rows additions
// Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
// being re-arranged into binary row adder tree
// such that additions can be done in O(logN) time
 
// middle_layers[layer][row] is one (A_WIDTH+B_WIDTH)-bit word; every layer is
// allocated SMALLER_WIDTH rows even though deeper layers write fewer of them.
//reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0] middle_layers;
reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];
//reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
//reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
 
generate // duplicates the leafs of the binary tree
 
    genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
 
    // One clocked process is generated per layer; "layer" is a constant in each copy.
    // None of these processes looks at in_valid, so the rows update on every clock.
    for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
 
        integer pp_index; // leaf index within each layer of the tree
        integer bit_index; // index of binary string within each leaf (only used by the commented-out loop below)
 
        always @(posedge clk)
        begin
            if(reset) 
            begin
                // synchronous reset: clear every row of this layer
                for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                    middle_layers[layer][pp_index] <= 0;
            end
 
            else begin
            
                if(layer == 0)  // all partial products rows are in first layer
                begin
                
                    // generation of partial products rows
                    // NOTE(review): MULTIPLIPLIER[pp_index] is a single bit, so the
                    // bitwise AND zero-extends it and only bit 0 of MULTIPLICAND
                    // survives. Masking the whole row needs a replicated bit, e.g.
                    // {LARGER_WIDTH{MULTIPLIPLIER[pp_index]}}.
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 
                        (MULTIPLICAND & MULTIPLIPLIER[pp_index]);
                        
                    // see modified baugh-wooley algorithm: http://i.imgur.com/VcgbY4g.png
                    // NOTE(review): the right-hand side of a nonblocking assignment reads
                    // the value currently held in the register, not the row scheduled by
                    // the loop above. This therefore overrides bit LARGER_WIDTH-1 with the
                    // inverse of its previous registered value instead of inverting the
                    // new partial-product bit.
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                       !middle_layers[layer][pp_index][LARGER_WIDTH-1];
                        
                    // NOTE(review): "!" is a logical NOT and yields one bit, so this replaces
                    // the whole last row with 0 or 1 (again computed from the old registered
                    // value). A per-bit inversion would use "~" -- confirm the intent against
                    // the Baugh-Wooley diagram.
                    middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1];
                    // constant '1' terms placed at bit LARGER_WIDTH of the first and last rows
                    middle_layers[layer][0][LARGER_WIDTH] <= 1;
                    middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                end
                
                // adding the partial product rows according to row adder tree architecture
                else begin
                    // NOTE(review): "+" binds tighter than "<<" in Verilog, so this evaluates
                    // (even_row + odd_row) << 1 rather than even_row + (odd_row << 1).
                    // NOTE(review): the shift amount is 1 at every layer; since the layer-0
                    // rows are stored unshifted, deeper layers would presumably need
                    // 2**(layer-1) -- confirm.
                    for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <=
                        middle_layers[layer-1][pp_index<<1] +
                      (middle_layers[layer-1][(pp_index<<1) + 1]) << 1;
                    
                    // bit-level additions using full adders
                    /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1)
                            full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/
                end
            end
        end
    end
 
endgenerate
 
// Result is row 0 of the last layer, forced to 0 while reset is high.
// NOTE(review): the last layer's loop writes SMALLER_WIDTH >> (NUM_OF_INTERMEDIATE_LAYERS-1)
// rows, which is 2 when SMALLER_WIDTH is a power of two, yet only row 0 is used here;
// row 1 (the upper half of the partial products) is never added in.
assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0];
 
 
/*Checking if the final multiplication result is ready or not*/
 
// NOTE(review): for SMALLER_WIDTH <= 2 the width expression below evaluates to [-1:0];
// confirm such small widths are not needed.
reg [($clog2($clog2(SMALLER_WIDTH))-1):0] out_valid_counter; // to track the multiply stages
reg multiply_had_started; // set once in_valid has been accepted, cleared when out_valid is raised
 
// out_valid handshake, evaluated in priority order on each clock:
//   1. reset                      -> clear everything
//   2. counter reached its target -> pulse out_valid, return to idle
//   3. in_valid while idle        -> mark the multiplication as started
//   4. otherwise                  -> out_valid low; count while started
always @(posedge clk)
begin
    if(reset) 
    begin
        multiply_had_started <= 0;
        out_valid <= 0;
        out_valid_counter <= 0;
    end
 
    else if(out_valid_counter == $clog2(SMALLER_WIDTH)-1) begin
        multiply_had_started <= 0;
        out_valid <= 1;
        out_valid_counter <= 0;
    end
    
    else if(in_valid && !multiply_had_started) begin
        multiply_had_started <= 1;
        out_valid <= 0; // for consecutive multiplication
    end
    
    else begin
        out_valid <= 0;
        if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
    end
end
 
 
`ifdef FORMAL
 
// Formal properties (only compiled when FORMAL is defined).
initial assume(in_valid == 0);
// NOTE(review): out_valid and out_valid_counter have no initial value and reset is
// not assumed high at time 0, so the solver is free to violate these two asserts.
initial assert(out_valid == 0);
initial assert(out_valid_counter == 0);
 
// Expected sign of the product, taken from the current input sign bits.
wire sign_bit = in_A[A_WIDTH-1] ^ in_B[B_WIDTH-1];
 
always @(posedge clk)
begin
    if(reset) assert(out_C == 0);
    
    else if(out_valid) begin
        // NOTE(review): both checks use in_A/in_B as they are when out_valid is high,
        // not the operands that were present when the multiplication started.
        assert(out_C == (in_A * in_B));
        // NOTE(review): a zero operand gives a product with sign bit 0 even when the
        // XOR of the operand sign bits is 1 (e.g. -1 * 0).
        assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
    end
end
 
`endif
 
`ifdef FORMAL
 
// Reachability checks: a non-zero operand pair can be presented, and out_valid can be raised.
always @(posedge clk)
begin
    cover(in_valid && (in_A != 0) && (in_B != 0));
    cover(out_valid);
end
 
`endif
 
endmodule
 
 
// Registered 1-bit full adder.
// On each rising edge of clk, sum and cout capture the sum and carry of
// ain + bin + cin; while reset is high they are cleared instead (synchronous reset).
module full_adder(clk, reset, ain, bin, cin, sum, cout);

input  clk;
input  reset;
input  ain;
input  bin;
input  cin;
output reg sum;
output reg cout;

// Intermediate terms of the full-adder equations:
//   propagate : exactly one of ain/bin is set, so a carry-in is passed on
//   carry_gen : both ain and bin are set, so a carry is produced regardless of cin
wire propagate = ain ^ bin;
wire carry_gen = ain & bin;

always @(posedge clk)
begin
    if(reset)
    begin
        sum  <= 1'b0;
        cout <= 1'b0;
    end

    else
    begin
        sum  <= propagate ^ cin;                 // A xor B xor Cin
        cout <= carry_gen | (cin & propagate);   // (A and B) or (Cin and (A xor B))
    end
end

endmodule



epC5U2h.png
 
Last edited:

It does something in ModelSim, though the answer is wrong.

FYI modelsim doesn't add the waveform for the middle_layers array either using add wave -r * (have to add it manually). I believe it defaults to not adding arrays as they may represent large memory arrays which you don't normally want to simulate, it might be a similar problem with the VCD output you are generating from whatever simulator you are running. You'll probably have to explicitly add that signal for VCD output.

- - - Updated - - -

Depending on how you generate your reset in the testbench, I would move the reset over by half a clock cycle, I don't think either edge of your clock sees the reset go active as you use a synchronous reset. In my modelsim sim, I generated the reset pulse on the falling edge.
 
I have the following waveform after modifying the reset signal in the testbench

Note: I am using iverilog, not modelsim.

test_multiply.v


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
// Testbench
// Applies a two-clock synchronous reset to the multiply DUT, then presents a
// single operand pair (8 * 7) and keeps it stable until the simulation ends.
// All testbench signals are dumped to test_multiply.vcd.
module test_multiply;
 
  parameter A_WIDTH=16, B_WIDTH=16;
  
  reg i_clk;
  reg i_reset;
  reg i_ce;
  reg signed[(A_WIDTH-1):0] i_a;
  reg signed[(B_WIDTH-1):0] i_b;
  wire signed[(A_WIDTH+B_WIDTH-1):0]    o_p;
  wire o_valid;

  // Reference product, evaluated at the full result width so it cannot overflow.
  wire signed[(A_WIDTH+B_WIDTH-1):0]    expected_p = i_a * i_b;
  
  // Instantiate design under test
  multiply mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
          
  initial begin
    // Dump waves
    $dumpfile("test_multiply.vcd");
    $dumpvars(0, test_multiply);
    
    // Time-0 values, set before the first clock edge.
    i_clk = 0;
    i_reset = 0;
    i_ce = 0;
    i_a = 0;
    i_b = 0;
 
  end
 
  // 10 time-unit clock period.
  always #5 i_clk = !i_clk;

  // Report every result. Sampling on the falling edge reads o_valid/o_p in the
  // middle of the cycle, after the DUT registers have settled.
  always @(negedge i_clk)
    if (o_valid)
      $display("t=%0t: %0d * %0d -> o_p = %0d (expected %0d)", $time, i_a, i_b, o_p, expected_p);
 
  // Stimulus. Every DUT input is driven with a nonblocking assignment: the
  // update happens after the DUT has sampled its inputs for the current rising
  // edge, so the DUT first sees the new value on the following edge. Blocking
  // assignments here would race with the DUT's posedge-clk processes.
  initial begin
    
    @(posedge i_clk);
    @(posedge i_clk);
 
    $display("Reset flop.");
    i_reset <= 1;
 
    // reset is seen high by the DUT on exactly these two edges
    @(posedge i_clk);
    @(posedge i_clk);
    
    i_reset <= 0;
 
    @(posedge i_clk);
    @(posedge i_clk);
 
    i_ce <= 1;
    i_a <= 16'h8;
    i_b <= 16'h7;
 
 
    #400 $finish;
 
  end
  
endmodule



jsEVFfH.png
 

I get the same results as you, which is definitely not the answer of 56 for 7 * 8 :'(

Or is this that "new math" they teach nowadays in school ;-)
 

Using the suggestions at http://github.com/steveicarus/iverilog/issues/75#issuecomment-129031448 and http://inf-server.inf.uth.gr/~konstadel/resources/Icarus_Verilog_GTKWave_guide.pdf#page=5

I have the following iverilog error :

[phung@archlinux phung]$ iverilog -o multiply test_multiply.v multiply.v
test_multiply.v:36: syntax error
test_multiply.v:36: error: invalid module item.
[phung@archlinux phung]$


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// Testbench
module test_multiply; // drives the multiply DUT with 8 * 7 and attempts to dump each middle_layers word
 
  parameter A_WIDTH=16, B_WIDTH=16; // used for the local signal widths only; not passed to the DUT instance
  
  reg i_clk;
  reg i_reset;
  reg i_ce;
  reg signed[(A_WIDTH-1):0] i_a;
  reg signed[(B_WIDTH-1):0] i_b;
  wire signed[(A_WIDTH+B_WIDTH-1):0]    o_p;
  wire o_valid;
  
  // Instantiate design under test
  multiply mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
  
  
  initial begin
    // Dump waves
    $dumpfile("test_multiply.vcd");
    $dumpvars(0, test_multiply);
    
    i_clk = 0; // time-0 values, set before the first clock edge
    i_reset = 0;
    i_ce = 0;
    i_a = 0;
    i_b = 0;
 
  end
 
  genvar i, j; // array index
  
  generate
    for(i = 0; i < 4; i = i + 1) begin // hard-coded layer count; 4 == $clog2(16)
        for(j = 0; j < 16; j = j + 1) begin // hard-coded row count; 16 matches A_WIDTH/B_WIDTH above
            $dumpvars(0, test_multiply.multiply.middle_layers[i][j]); // NOTE(review): a system task call is a procedural statement and is not a legal module item here (the reported line-36 syntax error); it needs an enclosing "initial". The path also names the module type "multiply" while the instance above is "mul".
        end
    end
  endgenerate
 
  always #5 i_clk = !i_clk; // 10 time-unit clock period
 
  // Stimulus: two-clock reset, then one operand pair held until $finish.
  // NOTE(review): the blocking assignments below execute at the same posedge on
  // which the DUT samples its inputs, so which value the DUT sees on that edge
  // is simulator-dependent.
  initial begin
    
    @(posedge i_clk);
    @(posedge i_clk);
 
    $display("Reset flop.");
 
    i_reset = 1;
 
    @(posedge i_clk);
    @(posedge i_clk);
    
    i_reset = 0;
 
    @(posedge i_clk);
    @(posedge i_clk);
 
    i_ce = 1;
    i_a = 16'h8;
    i_b = 16'h7;
 
 
    #400 $finish;
 
  end
  
endmodule

 

@FvM

I just got my cover(in_valid) passed in FORMAL verification, therefore I have the following waveform to investigate.

i5y3qe5.png


Note that the middle_layers bit length does not match the bitwidth described in line 91 of multiply.v . Why ?


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
module multiply(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B, signed; NOTE(review): the datapath below does not yet compute A*B, see the flagged statements
 
`ifdef FORMAL
parameter A_WIDTH = 4; // width of in_A when FORMAL is defined
parameter B_WIDTH = 4; // width of in_B when FORMAL is defined
 
`else
 
parameter A_WIDTH = 16; // width of in_A otherwise
parameter B_WIDTH = 16; // width of in_B otherwise
`endif
 
input clk, reset; // reset is sampled on posedge clk by the registers and also gates out_C combinationally
input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
input signed [(A_WIDTH-1):0] in_A;
input signed [(B_WIDTH-1):0] in_B;
output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
output reg out_valid; // to signify that out_C is valid, multiplication finished
 
/* 
   This signed multiplier code architecture is a combination of row adder tree and 
   modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
   with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
 
   see http://i.imgur.com/NaqjC6G.png or 
   Row Adder Tree Multipliers in http://www.andraka.com/multipli.php or
   http://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10
   regarding the mechanisms within all layers
 
   In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
   closer to the input take up real estate (remember the structure of row adder tree).  As the 
   size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
   placement that does not use long routes involving multiple switch nodes for FPGA.  The result
   is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
 
   For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
   sign extension (implemented as verilog example in http://www.dsprelated.com/showarticle/555.php),
   thus smaller final routed silicon area.
 
   http://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/
 
   All layers are pipelined, so throughput = one result for each clock cycle 
   but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
*/
 
 
// The multiplication of two numbers is equivalent to adding as many copies of one 
// of them, the multiplicand, as the value of the other one, the multiplier.
// Therefore, multiplicand always have the larger width compared to multipliers
 
localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
 
wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ; // wider input; in_B when the widths are equal; declared unsigned
wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ; // narrower input; in_A when the widths are equal; declared unsigned
 
`ifdef FORMAL
// to keep the values of multiplicand and multiplier before the multiplication finishes 
reg [(LARGER_WIDTH-1):0] MULTIPLICAND_reg; // declared without "signed"
reg [(SMALLER_WIDTH-1):0] MULTIPLIPLIER_reg; // declared without "signed"
 
always @(posedge clk)
begin
    if(reset) begin
        MULTIPLICAND_reg <= 0;
        MULTIPLIPLIER_reg <= 0;
    end
 
    else if(in_valid) begin // re-captured on every clock where in_valid is high
        MULTIPLICAND_reg <= MULTIPLICAND;
        MULTIPLIPLIER_reg <= MULTIPLIPLIER;
    end
end
`endif
 
localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH); // number of register layers held in middle_layers
 
 
/*Binary multiplications and additions for partial products rows*/
 
// first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
// This resulted in a binary tree with faster vertical addition processes as we have 
// lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
 
// intermediate partial product rows additions
// Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
// being re-arranged into binary row adder tree
// such that additions can be done in O(logN) time
 
//reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][(SMALLER_WIDTH-1):0][(A_WIDTH+B_WIDTH-1):0] middle_layers;
reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)]; // middle_layers[layer][row]: one (A_WIDTH+B_WIDTH)-bit word per row
//reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
//reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
 
generate // duplicates the leafs of the binary tree
 
    genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
 
    // One clocked process is generated per layer; "layer" is a constant in each copy.
    // None of these processes looks at in_valid, so the rows update on every clock.
    for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
 
        integer pp_index; // leaf index within each layer of the tree
        integer bit_index; // index of binary string within each leaf (only used by the commented-out loop below)
 
        always @(posedge clk)
        begin
            if(reset) 
            begin
                // synchronous reset: clear every row of this layer
                for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                    middle_layers[layer][pp_index] <= 0;
            end
 
            else begin
            
                if(layer == 0)  // all partial products rows are in first layer
                begin
                
                    // generation of partial products rows
                    // NOTE(review): MULTIPLIPLIER[pp_index] is a single bit, so the
                    // bitwise AND zero-extends it and only bit 0 of MULTIPLICAND
                    // survives. Masking the whole row needs a replicated bit, e.g.
                    // {LARGER_WIDTH{MULTIPLIPLIER[pp_index]}}.
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 
                        (MULTIPLICAND & MULTIPLIPLIER[pp_index]);
                        
                    // see modified baugh-wooley algorithm: http://i.imgur.com/VcgbY4g.png
                    // NOTE(review): the right-hand side of a nonblocking assignment reads
                    // the value currently held in the register, not the row scheduled by
                    // the loop above. This therefore overrides bit LARGER_WIDTH-1 with the
                    // inverse of its previous registered value instead of inverting the
                    // new partial-product bit.
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                       !middle_layers[layer][pp_index][LARGER_WIDTH-1];
                        
                    // NOTE(review): "!" is a logical NOT and yields one bit, so this replaces
                    // the whole last row with 0 or 1 (again computed from the old registered
                    // value). A per-bit inversion would use "~" -- confirm the intent against
                    // the Baugh-Wooley diagram.
                    middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1];
                    // constant '1' terms placed at bit LARGER_WIDTH of the first and last rows
                    middle_layers[layer][0][LARGER_WIDTH] <= 1;
                    middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                end
                
                // adding the partial product rows according to row adder tree architecture
                else begin
                    // NOTE(review): "+" binds tighter than "<<" in Verilog, so this evaluates
                    // (even_row + odd_row) << 1 rather than even_row + (odd_row << 1).
                    // NOTE(review): the shift amount is 1 at every layer; since the layer-0
                    // rows are stored unshifted, deeper layers would presumably need
                    // 2**(layer-1) -- confirm.
                    for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <=
                        middle_layers[layer-1][pp_index<<1] +
                      (middle_layers[layer-1][(pp_index<<1) + 1]) << 1;
                    
                    // bit-level additions using full adders
                    /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1)
                            full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/
                end
            end
        end
    end
 
endgenerate
 
// Result is row 0 of the last layer, forced to 0 while reset is high.
// NOTE(review): the last layer's loop writes SMALLER_WIDTH >> (NUM_OF_INTERMEDIATE_LAYERS-1)
// rows, which is 2 when SMALLER_WIDTH is a power of two, yet only row 0 is used here;
// row 1 (the upper half of the partial products) is never added in.
assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0];
 
 
/*Checking if the final multiplication result is ready or not*/
 
// NOTE(review): for SMALLER_WIDTH <= 2 the width expression below evaluates to [-1:0];
// confirm such small widths are not needed.
reg [($clog2($clog2(SMALLER_WIDTH))-1):0] out_valid_counter; // to track the multiply stages
reg multiply_had_started; // set once in_valid has been accepted, cleared when out_valid is raised
 
// out_valid handshake, evaluated in priority order on each clock:
//   1. reset                      -> clear everything
//   2. counter reached its target -> pulse out_valid, return to idle
//   3. in_valid while idle        -> mark the multiplication as started
//   4. otherwise                  -> out_valid low; count while started
always @(posedge clk)
begin
    if(reset) 
    begin
        multiply_had_started <= 0;
        out_valid <= 0;
        out_valid_counter <= 0;
    end
 
    else if(out_valid_counter == $clog2(SMALLER_WIDTH)-1) begin
        multiply_had_started <= 0;
        out_valid <= 1;
        out_valid_counter <= 0;
    end
    
    else if(in_valid && !multiply_had_started) begin
        multiply_had_started <= 1;
        out_valid <= 0; // for consecutive multiplication
    end
    
    else begin
        out_valid <= 0;
        if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
    end
end
 
 
`ifdef FORMAL
 
// Formal properties (only compiled when FORMAL is defined).
// The proof starts in reset with in_valid low.
initial assume(reset);
initial assume(in_valid == 0);
//initial assert(out_valid == 0);
//initial assert(out_valid_counter == 0);
 
// Expected sign of the product, taken from the captured operand sign bits.
wire sign_bit = MULTIPLICAND_reg[LARGER_WIDTH-1] ^ MULTIPLIPLIER_reg[SMALLER_WIDTH-1];
 
always @(posedge clk)
begin
    if(reset) assert(out_C == 0);
    
    else if(out_valid) begin
        // NOTE(review): both *_reg operands are unsigned, so this reference product
        // is an unsigned multiply of zero-extended values; it differs from the signed
        // product whenever an operand is negative.
        assert(out_C == (MULTIPLICAND_reg * MULTIPLIPLIER_reg));
        // NOTE(review): a zero operand gives a product with sign bit 0 even when the
        // XOR of the operand sign bits is 1 (e.g. -1 * 0).
        assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
    end
end
 
`endif
 
`ifdef FORMAL
 
// Operand pair the cover statement asks the solver to present at the inputs.
localparam user_A = 3;
localparam user_B = 6;
 
// Reachability checks: the chosen operand pair can be presented, and out_valid can be raised.
always @(posedge clk)
begin
    cover(in_valid && (in_A == user_A) && (in_B == user_B));
    cover(out_valid);
end
 
`endif
 
endmodule
 
 
// Registered 1-bit full adder.
// On each rising edge of clk, sum and cout capture the sum and carry of
// ain + bin + cin; while reset is high they are cleared instead (synchronous reset).
module full_adder(clk, reset, ain, bin, cin, sum, cout);

input  clk;
input  reset;
input  ain;
input  bin;
input  cin;
output reg sum;
output reg cout;

// Intermediate terms of the full-adder equations:
//   propagate : exactly one of ain/bin is set, so a carry-in is passed on
//   carry_gen : both ain and bin are set, so a carry is produced regardless of cin
wire propagate = ain ^ bin;
wire carry_gen = ain & bin;

always @(posedge clk)
begin
    if(reset)
    begin
        sum  <= 1'b0;
        cout <= 1'b0;
    end

    else
    begin
        sum  <= propagate ^ cin;                 // A xor B xor Cin
        cout <= carry_gen | (cin & propagate);   // (A and B) or (Cin and (A xor B))
    end
end

endmodule

 

multiply mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
$dumpvars(0, test_multiply.multiply.middle_layers[j]);

error is in red

- - - Updated - - -

Note that the middle_layers bit length does not match the bitwidth described in line 91. Why ?

Line 91: reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];

An ifdef that must be set to FORMAL
`ifdef FORMAL
parameter A_WIDTH = 4;
parameter B_WIDTH = 4;

`else

parameter A_WIDTH = 16;
parameter B_WIDTH = 16;
`endif

You shouldn't even have the ifdef stuff, you should use the Verilog 2001 parameter declaration in the module port and then use the Verilog 2001 named association for the parameter passing. You would then set the FORMAL definition of the widths from outside the code.
 

@ads-ee

Note that the middle_layers bit length does not match the bitwidth described in line 91

You still had not answered why yet.

For the ifdef issue, it is not the reason causing the bitwidth mismatch, right ?

Regarding the red error you pointed out, you did not seem to answer it correctly. See the reply from iverilog developer : https://github.com/steveicarus/iverilog/issues/230#issuecomment-462079234

When I try with vivado builtin simulator, I am surprised that I do not face any bitwidth mismatch issue for the 3D array "middle_layers"
However, I still have logic bug where the multiplication result does not follow the modified baugh-wooley algorithm. Which part of the code in the screenshot below did I code wrongly ?

Note: The last four signals in the waveform plot are for layer=0

ThVRzh4.png
 

@ads-ee

You still had not answered why yet.

For the ifdef issue, it is not the reason causing the bitwidth mismatch, right ?

Yes it is. It is the only thing selecting the parameters A_WIDTH and B_WIDTH in your multiply code. You don't have any defparams in your testbench code to change the parameters to anything else, so the setting of FORMAL will determine the parameter settings. You have A and B inputs of 4-bits and multiplying them will give a result of 8-bits, which is NOT a mismatch in widths. Either that or you are asking some other question incorrectly and I'm answering a different question as your question is poorly written....

You should use Verilog 2001 port syntax instead, given how it is now 2019, 18 years after that standard was introduced. I also would avoid using ifdefs, most if not all uses of them are really abusing them in some form or another. Also as defines are global in nature, they can result in problems elsewhere when someone uses the same `define somewhere else overriding the expected value somewhere else breaking that ifdef.

Regarding the red error you pointed out, you did not seem to answer it correctly. See the reply from iverilog developer : **broken link removed**
Well, you also posted code with a typo, which you didn't do on that other site. I don't go opening arbitrary links, so I only looked at the code you posted, which has test_multiply.multiply.middle_layers. That path does not use the correct instance name of the multiply module, which is mul.

Ever heard of the term GIGO? It stands for garbage in, garbage out... post an incorrect question with errors and you will get a wrong answer. I stopped at the first error I saw and didn't even pay attention to the bit slice stuff. Besides that, I don't use iverilog, and I noticed that according to every release note for it, it does not support a lot of language features of even Verilog 2001 and nothing from SV, so I refuse to waste time with it.
 

The following verilog source code and/or testbench works nicely across commercial simulators, iverilog as well as formal verification tool (yosys-smtbmc)

Please keep the complaint about `ifdef FORMAL until later. I need them to use with yosys-smtbmc which does not support bind command yet.

I am now debugging the generate coding since the multiplication (using modified baugh-wooley algorithm) does not work yet.
See the screenshot at post #8

test_multiply.v


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Testbench
// Applies a two-clock synchronous reset to a 4x4 multiply DUT, then presents a
// single operand pair (3 * 2) and keeps it stable until the simulation ends.
// Besides the testbench signals, every word of the DUT's middle_layers array
// is added to the VCD explicitly.
module test_multiply;
 
  parameter A_WIDTH=4, B_WIDTH=4;
  
  reg i_clk;
  reg i_reset;
  reg i_ce;
  reg signed[(A_WIDTH-1):0] i_a;
  reg signed[(B_WIDTH-1):0] i_b;
  wire signed[(A_WIDTH+B_WIDTH-1):0] o_p;
  wire o_valid;

  // Reference product, evaluated at the full result width so it cannot overflow.
  wire signed[(A_WIDTH+B_WIDTH-1):0] expected_p = i_a * i_b;
  
  // Instantiate design under test.
  // Parameters are bound by name so the binding does not depend on the order
  // in which the DUT declares them.
  multiply #(.A_WIDTH(A_WIDTH), .B_WIDTH(B_WIDTH)) mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
  
  
  initial begin
    // Dump waves
    $dumpfile("test_multiply.vcd");
    $dumpvars(0, test_multiply);
    
    // Time-0 values, set before the first clock edge.
    i_clk = 0;
    i_reset = 0;
    i_ce = 0;
    i_a = 0;
    i_b = 0;
 
  end
 
  // Mirror the DUT's array dimensions so the dump loops cover every word.
  localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
  localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
 
  genvar i, j; // array index
  
  // One "initial $dumpvars" per array word. The generate blocks are named
  // because Verilog-2001 requires a name on generate-for blocks.
  generate
    for(i = 0; i < NUM_OF_INTERMEDIATE_LAYERS; i = i + 1) begin : dump_layer
        for(j = 0; j < SMALLER_WIDTH; j = j + 1) begin : dump_row
            initial $dumpvars(0, test_multiply.mul.middle_layers[i][j]);
        end
    end
  endgenerate
 
  // 10 time-unit clock period.
  always #5 i_clk = !i_clk;

  // Report every result. Sampling on the falling edge reads o_valid/o_p in the
  // middle of the cycle, after the DUT registers have settled.
  always @(negedge i_clk)
    if (o_valid)
      $display("t=%0t: %0d * %0d -> o_p = %0d (expected %0d)", $time, i_a, i_b, o_p, expected_p);
 
  // Stimulus. Every DUT input is driven with a nonblocking assignment: the
  // update happens after the DUT has sampled its inputs for the current rising
  // edge, so the DUT first sees the new value on the following edge. Blocking
  // assignments here would race with the DUT's posedge-clk processes.
  initial begin
    
    @(posedge i_clk);
    @(posedge i_clk);
 
    $display("Reset flop.");
 
    i_reset <= 1;
 
    // reset is seen high by the DUT on exactly these two edges
    @(posedge i_clk);
    @(posedge i_clk);
    
    i_reset <= 0;
 
    @(posedge i_clk);
    @(posedge i_clk);
 
    i_ce <= 1;
    i_a <= 3;
    i_b <= 2;
 
    #50 $finish;
 
  end
  
endmodule




multiply.v


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
module multiply #(parameter A_WIDTH=16, B_WIDTH=16)
(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B
 
`ifdef FORMAL
parameter A_WIDTH = 4;
parameter B_WIDTH = 4;
`endif
 
input clk, reset;
input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
input signed [(A_WIDTH-1):0] in_A;
input signed [(B_WIDTH-1):0] in_B;
output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
output reg out_valid; // to signify that out_C is valid, multiplication finished
 
/* 
   This signed multiplier code architecture is a combination of row adder tree and 
   modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
   with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
 
   see [url]http://i.imgur.com/NaqjC6G.png[/url] or 
   Row Adder Tree Multipliers in [url]http://www.andraka.com/multipli.php[/url] or
   [url]http://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10[/url]
   regarding the mechanisms within all layers
 
   In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
   closer to the input take up real estate (remember the structure of row adder tree).  As the 
   size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
   placement that does not use long routes involving multiple switch nodes for FPGA.  The result
   is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
 
   For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
   sign extension (implemented as verilog example in [url]http://www.dsprelated.com/showarticle/555.php[/url]),
   thus smaller final routed silicon area.
 
   [url]http://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/[/url]
 
   All layers are pipelined, so throughput = one result for each clock cycle 
   but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
*/
 
 
// The multiplication of two numbers is equivalent to adding as many copies of one 
// of them, the multiplicand, as the value of the other one, the multiplier.
// Therefore, multiplicand always have the larger width compared to multipliers
 
localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
 
wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;
 
`ifdef FORMAL
// to keep the values of multiplicand and multiplier before the multiplication finishes 
reg [(LARGER_WIDTH-1):0] MULTIPLICAND_reg;
reg [(SMALLER_WIDTH-1):0] MULTIPLIPLIER_reg;
 
always @(posedge clk)
begin
    if(reset) begin
        MULTIPLICAND_reg <= 0;
        MULTIPLIPLIER_reg <= 0;
    end
 
    else if(in_valid) begin
        MULTIPLICAND_reg <= MULTIPLICAND;
        MULTIPLIPLIER_reg <= MULTIPLIPLIER;
    end
end
`endif
 
localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
 
 
/*Binary multiplications and additions for partial products rows*/
 
// first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
// This resulted in a binary tree with faster vertical addition processes as we have 
// lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
 
// intermediate partial product rows additions
// Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
// being re-arranged into binary row adder tree
// such that additions can be done in O(logN) time
 
//reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][(SMALLER_WIDTH-1):0][(A_WIDTH+B_WIDTH-1):0] middle_layers;
reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];
//reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
//reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
 
generate // duplicates the leafs of the binary tree
 
    genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
 
    for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
 
        integer pp_index; // leaf index within each layer of the tree
        integer bit_index; // index of binary string within each leaf
 
        always @(posedge clk)
        begin
            if(reset) 
            begin
                for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                    middle_layers[layer][pp_index] <= 0;
            end
 
            else begin
            
                if(layer == 0)  // all partial products rows are in first layer
                begin
                
                    // generation of partial products rows
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 
                        (MULTIPLICAND & MULTIPLIPLIER[pp_index]);
                        
                    // see modified baugh-wooley algorithm: [url]http://i.imgur.com/VcgbY4g.png[/url] from
                                        // page 122 of book "Ultra-Low-Voltage Design of Energy-Efficient Digital Circuits"
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                       !middle_layers[layer][pp_index][LARGER_WIDTH-1];
                        
                    middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1];
                    middle_layers[layer][0][LARGER_WIDTH] <= 1;
                    middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                end
                
                // adding the partial product rows according to row adder tree architecture
                else begin
                    for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <=
                        middle_layers[layer-1][pp_index<<1] +
                      (middle_layers[layer-1][(pp_index<<1) + 1]) << 1;
                    
                    // bit-level additions using full adders
                    /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1)
                            full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/
                end
            end
        end
    end
 
endgenerate
 
assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0];
 
 
/*Checking if the final multiplication result is ready or not*/
 
reg [($clog2(NUM_OF_INTERMEDIATE_LAYERS)-1):0] out_valid_counter; // to track the multiply stages
reg multiply_had_started;
 
always @(posedge clk)
begin
    if(reset) 
    begin
        multiply_had_started <= 0;
        out_valid <= 0;
        out_valid_counter <= 0;
    end
 
    else if(out_valid_counter == NUM_OF_INTERMEDIATE_LAYERS-1) begin
        multiply_had_started <= 0;
        out_valid <= 1;
        out_valid_counter <= 0;
    end
    
    else if(in_valid && !multiply_had_started) begin
        multiply_had_started <= 1;
        out_valid <= 0; // for consecutive multiplication
    end
    
    else begin
        out_valid <= 0;
        if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
    end
end
 
 
`ifdef FORMAL
 
initial assume(reset);
initial assume(in_valid == 0);
 
wire sign_bit = MULTIPLICAND_reg[LARGER_WIDTH-1] ^ MULTIPLIPLIER_reg[SMALLER_WIDTH-1];
 
always @(posedge clk)
begin
    if(reset) assert(out_C == 0);
    
    else if(out_valid) begin
        assert(out_C == (MULTIPLICAND_reg * MULTIPLIPLIER_reg));
        assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
    end
end
 
`endif
 
`ifdef FORMAL
 
localparam user_A = 3;
localparam user_B = 2;
 
always @(posedge clk)
begin
    cover(in_valid && (in_A == user_A) && (in_B == user_B));
    cover(out_valid);
end
 
`endif
 
endmodule
 
 
/*module full_adder(clk, reset, ain, bin, cin, sum, cout);
 
input clk, reset;
input ain, bin, cin;
output reg sum, cout;
 
// Full Adder Equations
// Sum = A ⊕ B ⊕ Cin and Cout = (A ⋅ B) + (Cin ⋅ (A ⊕ B))
// where A ⊕ B is equivalent to A XOR B , A ⋅ B is equivalent to A AND B
 
always @(posedge clk)
begin
    if(reset)
    begin
        sum <= 0;
        cout <= 0;
    end
    
    else begin
        sum <= ain^bin^cin;
        cout <= (ain & bin) | (cin & (ain^bin));
        //cout <= (ain * bin) + (cin * (ain - bin)); 
    end
end
 
endmodule*/

 

FYI, ModelSim doesn't add the waveform for the middle_layers array either when using add wave -r * (you have to add it manually). I believe it defaults to not adding arrays because they may represent large memories, which you don't normally want to log during simulation. It might be a similar problem with the VCD output you are generating from whatever simulator you are running; you'll probably have to explicitly add that signal to the VCD dump.
On Icarus, you definitely need to dump each array row explicitly using $dumpvars, though you only have to specify the index/indices parts without having to specify the bitrange. This goes for VCD and for the faster database formats such as FST.

VCS and CVC automatically dump arrays fully without needing to step through them in testbench code--at least for FSDB. Signal names between Icarus and other simulators differ for arrays because Icarus escapes the hierarchy of the signal name where the actual signal name starts, probably because array rows contain '[' and ']' characters, probably to be compliant with what is in the Verilog spec. In order to be able to use the same gtkwave save file regardless of simulator, I wound up hacking my source for Icarus so the names dumped into FST would appear the same as for when I simulate using VCS.

-Tony
 

I have already solved the dumpvars issue for multi-dimensional array.

The problem now is the simulation waveform does not follow the algorithm.

See the screenshot at post #8 and the code at post #10

- - - Updated - - -

If I comment out lines 119 to 125, then the 2D array of vectors, middle_layers, is all zeroes. Why?

The expression MULTIPLICAND & MULTIPLIPLIER[pp_index] seems to be the culprit.

T1SDSN2.png
 

Problem solved : See http://gist.github.com/promach/5f2d9a9494704ed93cf65687c982198c#file-multiply-v

It now gives correct signed multiplication result both in vivado simulation and cover() within formal verification


Code Verilog - [expand]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
// Pipelined signed multiplier: out_C = in_A * in_B (two's complement).
//
// Parameters
//   A_WIDTH, B_WIDTH : operand widths in bits (defaults 16/16, or 4/4 when
//                      FORMAL is defined so the proof stays small).
//                      The narrower operand must be at least 2 bits wide.
// Ports
//   clk, reset : clock and synchronous, active-high reset
//   in_valid   : in_A / in_B are valid on this clock edge
//   in_A, in_B : signed operands
//   out_C      : signed product, A_WIDTH+B_WIDTH bits (forced to 0 during reset)
//   out_valid  : pulses for one cycle when out_C holds the product of the
//                operands accepted NUM_OF_INTERMEDIATE_LAYERS+1 clock edges earlier
module multiply #(
`ifdef FORMAL
    // small operands keep the formal proof tractable
    parameter A_WIDTH = 4, B_WIDTH = 4
`else
    parameter A_WIDTH = 16, B_WIDTH = 16
`endif
)
(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B

input clk, reset;
input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
input signed [(A_WIDTH-1):0] in_A;
input signed [(B_WIDTH-1):0] in_B;
output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
output reg out_valid; // to signify that out_C is valid, multiplication finished

/*
   This signed multiplier code architecture is a combination of row adder tree and
   modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
   with M, N being the length(bitwidth) of the multiplicand and multiplier respectively

   see http://i.imgur.com/NaqjC6G.png or
   Row Adder Tree Multipliers in http://www.andraka.com/multipli.php or
   http://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10
   regarding the mechanisms within all layers

   In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
   closer to the input take up real estate (remember the structure of row adder tree).  As the
   size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
   placement that does not use long routes involving multiple switch nodes for FPGA.  The result
   is the maximum clocking speed degrades quickly as the size of the bitwidth grows.

   For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping
   sign extension (implemented as verilog example in http://www.dsprelated.com/showarticle/555.php),
   thus smaller final routed silicon area.

   http://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/

   Every layer is a register stage that samples on each clock edge.
   NOTE(review): the out_valid handshake below tracks only ONE multiplication at a
   time (in_valid is ignored while a multiplication is in flight), so out_valid does
   not flag back-to-back results even though the datapath itself is pipelined.
*/


// The multiplication of two numbers is equivalent to adding as many copies of one
// of them, the multiplicand, as the value of the other one, the multiplier.
// Therefore, the wider operand is used as the multiplicand, which keeps the number
// of partial-product rows (and so the adder tree depth) as small as possible.

localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
localparam PRODUCT_WIDTH = A_WIDTH + B_WIDTH;

wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
wire [(SMALLER_WIDTH-1):0] MULTIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;

// depth of the binary adder tree that sums SMALLER_WIDTH partial-product rows
localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);

// Modified Baugh-Wooley correction constant.
// Replacing every negatively-weighted partial-product bit -x*2^k by (~x)*2^k - 2^k
// leaves a constant of  -2^(L+S-1) + 2^(L-1) + 2^(S-1)  (L, S = operand widths),
// which modulo 2^(L+S) equals the value below.  For L == S == N this is the
// familiar 2^N + 2^(2N-1).
localparam [(PRODUCT_WIDTH-1):0] ONE = 1;
localparam [(PRODUCT_WIDTH-1):0] BW_CORRECTION = (ONE << (PRODUCT_WIDTH-1))
                                               + (ONE << (LARGER_WIDTH-1))
                                               + (ONE << (SMALLER_WIDTH-1));


/*Binary multiplications and additions for partial products rows*/

// Layer 0 holds the "SMALLER_WIDTH" un-shifted partial-product rows.
// Layer k (k >= 1) holds ceil(SMALLER_WIDTH / 2^k) partial sums, each covering
// 2^k consecutive rows, so the final sum is available after
// NUM_OF_INTERMEDIATE_LAYERS additions, i.e. in O(logN) time.
// Imagine a rhombus of height "SMALLER_WIDTH" and width "LARGER_WIDTH"
// being re-arranged into a binary row adder tree.

reg [(PRODUCT_WIDTH-1):0] middle_layers[NUM_OF_INTERMEDIATE_LAYERS:0][0:(SMALLER_WIDTH-1)];

generate // duplicates the leafs of the binary tree

    genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk

    for(layer=0; layer<=NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers

        if(layer == 0) begin: partial_products

            integer pp_index;  // partial-product row index
            integer bit_index; // bit index inside the last row

            always @(posedge clk)
            begin
                if(reset)
                begin
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 0;
                end

                else begin
                    // plain AND-array rows, zero-extended to the product width
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= MULTIPLIER[pp_index] ? MULTIPLICAND : 0;

                    // The later non-blocking assignments below override single bits
                    // of the rows written above (the last assignment wins).
                    // see modified baugh-wooley algorithm: http://i.imgur.com/VcgbY4g.png from
                    // page 122 of book: Ultra-Low-Voltage Design of Energy-Efficient Digital Circuits

                    // rows 0 .. S-2 : invert the MSB product bit
                    for(pp_index=0; pp_index<(SMALLER_WIDTH-1) ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index][LARGER_WIDTH-1] <=
                        (MULTIPLICAND[LARGER_WIDTH-1] & MULTIPLIER[pp_index]) ? 0:1;

                    // last row : invert every product bit except the MSB
                    for(bit_index=0; bit_index<(LARGER_WIDTH-1) ; bit_index=bit_index+1)
                        middle_layers[layer][SMALLER_WIDTH-1][bit_index] <=
                        (MULTIPLICAND[bit_index] & MULTIPLIER[SMALLER_WIDTH-1]) ? 0:1;
                end
            end
        end

        // adding the partial product rows according to row adder tree architecture
        else begin: adder_tree

            // number of live entries in this layer and in the layer feeding it
            localparam ROWS      = (SMALLER_WIDTH + (1 << layer) - 1) >> layer;
            localparam PREV_ROWS = (SMALLER_WIDTH + (1 << (layer-1)) - 1) >> (layer-1);

            // Each entry of the previous layer already sums 2^(layer-1) rows, so the
            // odd entry of a pair weighs 2^(layer-1) more than the even one.
            localparam ROW_SHIFT = 1 << (layer-1);

            // the Baugh-Wooley constant is folded in once, by the first adder of layer 1
            localparam [(PRODUCT_WIDTH-1):0] ENTRY0_CONSTANT = (layer == 1) ? BW_CORRECTION : 0;

            integer pp_index; // entry index within this layer of the tree

            always @(posedge clk)
            begin
                if(reset)
                begin
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 0;
                end

                else begin
                    for(pp_index=0; pp_index<ROWS ; pp_index=pp_index+1)
                    begin
                        if(((pp_index<<1) + 1) < PREV_ROWS)
                            middle_layers[layer][pp_index] <=
                                middle_layers[layer-1][pp_index<<1] +
                                (middle_layers[layer-1][(pp_index<<1) + 1] << ROW_SHIFT) +
                                ((pp_index == 0) ? ENTRY0_CONSTANT : 0);

                        // unpaired entry (SMALLER_WIDTH not a power of two): pass through
                        else
                            middle_layers[layer][pp_index] <=
                                middle_layers[layer-1][pp_index<<1];
                    end
                end
            end
        end
    end

endgenerate

assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS][0];


/*Checking if the final multiplication result is ready or not*/

// at least one bit, so a single-layer tree (SMALLER_WIDTH == 2) still gets a legal range
localparam COUNTER_WIDTH = (NUM_OF_INTERMEDIATE_LAYERS > 1) ? $clog2(NUM_OF_INTERMEDIATE_LAYERS) : 1;

reg [(COUNTER_WIDTH-1):0] out_valid_counter; // to track the multiply stages
reg multiply_had_started;

always @(posedge clk)
begin
    if(reset)
    begin
        multiply_had_started <= 0;
        out_valid <= 0;
        out_valid_counter <= 0;
    end

    // Qualified with multiply_had_started so that a single-layer tree
    // (NUM_OF_INTERMEDIATE_LAYERS-1 == 0) does not report out_valid while idle.
    else if(multiply_had_started && (out_valid_counter == NUM_OF_INTERMEDIATE_LAYERS-1)) begin
        multiply_had_started <= 0;
        out_valid <= 1;
        out_valid_counter <= 0;
    end

    else if(in_valid && !multiply_had_started) begin
        multiply_had_started <= 1;
        out_valid <= 0; // for consecutive multiplication
    end

    else begin
        out_valid <= 0;
        if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
    end
end


`ifdef FORMAL

// Shadow copies of the operands of the multiplication currently being tracked.
// Captured only when the handshake accepts a new multiplication, so they still
// describe the operands that produced out_C when out_valid is raised.
reg signed [(LARGER_WIDTH-1):0] MULTIPLICAND_reg;
reg signed [(SMALLER_WIDTH-1):0] MULTIPLIER_reg;

always @(posedge clk)
begin
    if(reset) begin
        MULTIPLICAND_reg <= 0;
        MULTIPLIER_reg <= 0;
    end

    else if(in_valid && !multiply_had_started) begin
        MULTIPLICAND_reg <= MULTIPLICAND;
        MULTIPLIER_reg <= MULTIPLIER;
    end
end

initial assume(reset);
initial assume(in_valid == 0);

wire sign_bit = MULTIPLICAND_reg[LARGER_WIDTH-1] ^ MULTIPLIER_reg[SMALLER_WIDTH-1];
wire product_is_zero = (MULTIPLICAND_reg == 0) || (MULTIPLIER_reg == 0);

always @(posedge clk)
begin
    if(reset) assert(out_C == 0);

    else if(out_valid) begin
        assert(out_C == (MULTIPLICAND_reg * MULTIPLIER_reg));

        // a zero product is non-negative whatever the operand signs are
        if(product_is_zero) assert(out_C[A_WIDTH+B_WIDTH-1] == 0);
        else assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
    end
end

localparam user_A = 3;
localparam user_B = -2;

always @(posedge clk)
begin
    cover(in_valid && (in_A == user_A) && (in_B == user_B));
    cover(out_valid);
end

`endif

endmodule



oBkgkZH.png
 

Status
Not open for further replies.

Part and Inventory Search

Welcome to EDABoard.com

Sponsor

Back
Top