promach
Advanced Member level 4
Why the following multiply verilog code does not multiply ?
Besides, in order to debug the code, I need access to the variable "middle_layers", but gtkwave is not giving me access to it. Why ?
I have also tried formal verification using yosys-smtbmc, but surprisingly the code failed even the simplest cover(in_valid) which I do not understand at all.
Besides, in order to debug the code, I need access to the variable "middle_layers", but gtkwave is not giving me access to it. Why ?
I have also tried formal verification using yosys-smtbmc, but surprisingly the code failed even the simplest cover(in_valid) which I do not understand at all.
Code Verilog - [expand] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 module multiply(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B parameter A_WIDTH = 16; parameter B_WIDTH = 16; input clk, reset; input in_valid; // to signify that in_A, in_B are valid, multiplication process can start input signed [(A_WIDTH-1):0] in_A; input signed [(B_WIDTH-1):0] in_B; output signed [(A_WIDTH+B_WIDTH-1):0] out_C; output reg out_valid; // to signify that out_C is valid, multiplication finished /* This signed multiplier code architecture is a combination of row adder tree and modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN) with M, N being the length(bitwidth) of the multiplicand and multiplier respectively see [url]http://i.imgur.com/NaqjC6G.png[/url] or Row Adder Tree Multipliers in [url]http://www.andraka.com/multipli.php[/url] or [url]http://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10[/url] regarding the mechanisms within all layers In terms of fmax consideration: In the case of an adder tree, the adders making up the levels closer to the input take up real estate (remember the structure of row adder tree). As the size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a placement that does not use long routes involving multiple switch nodes for FPGA. The result is the maximum clocking speed degrades quickly as the size of the bitwidth grows. For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping sign extension (implemented as verilog example in [url]http://www.dsprelated.com/showarticle/555.php[/url]), thus smaller final routed silicon area. [url]http://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/[/url] All layers are pipelined, so throughput = one result for each clock cycle but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS */ // The multiplication of two numbers is equivalent to adding as many copies of one // of them, the multiplicand, as the value of the other one, the multiplier. // Therefore, multiplicand always have the larger width compared to multipliers localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH; localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH; wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ; wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ; localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH); /*Binary multiplications and additions for partial products rows*/ // first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH" // This resulted in a binary tree with faster vertical addition processes as we have // lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add // intermediate partial product rows additions // Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH" // being re-arranged into binary row adder tree // such that additions can be done in O(logN) time //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0] middle_layers; reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)]; //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0]; //reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0]; generate // duplicates the leafs of the binary tree genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers integer pp_index; // leaf index within each layer of the tree integer bit_index; // index of binary string within each leaf always @(posedge clk) begin if(reset) begin for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1) middle_layers[layer][pp_index] <= 0; end else begin if(layer == 0) // all partial products rows are in first layer begin // generation of partial products rows for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1) middle_layers[layer][pp_index] <= (MULTIPLICAND & MULTIPLIPLIER[pp_index]); // see modified baugh-wooley algorithm: [url]http://i.imgur.com/VcgbY4g.png[/url] for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1) middle_layers[layer][pp_index][LARGER_WIDTH-1] <= !middle_layers[layer][pp_index][LARGER_WIDTH-1]; middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1]; middle_layers[layer][0][LARGER_WIDTH] <= 1; middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1; end // adding the partial product rows according to row adder tree architecture else begin for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1) middle_layers[layer][pp_index] <= middle_layers[layer-1][pp_index<<1] + (middle_layers[layer-1][(pp_index<<1) + 1]) << 1; // bit-level additions using full adders /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1) for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1) full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/ end end end end endgenerate assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0]; /*Checking if the final multiplication result is ready or not*/ reg [($clog2($clog2(SMALLER_WIDTH))-1):0] out_valid_counter; // to track the multiply stages reg multiply_had_started; always @(posedge clk) begin if(reset) begin multiply_had_started <= 0; out_valid <= 0; out_valid_counter <= 0; end else if(out_valid_counter == $clog2(SMALLER_WIDTH)-1) begin multiply_had_started <= 0; out_valid <= 1; out_valid_counter <= 0; end else if(in_valid && !multiply_had_started) begin multiply_had_started <= 1; out_valid <= 0; // for consecutive multiplication end else begin out_valid <= 0; if(multiply_had_started) out_valid_counter <= out_valid_counter + 1; end end `ifdef FORMAL initial assume(in_valid == 0); initial assert(out_valid == 0); initial assert(out_valid_counter == 0); wire sign_bit = in_A[A_WIDTH-1] ^ in_B[B_WIDTH-1]; always @(posedge clk) begin if(reset) assert(out_C == 0); else if(out_valid) begin assert(out_C == (in_A * in_B)); assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit); end end `endif `ifdef FORMAL always @(posedge clk) begin cover(in_valid && (in_A != 0) && (in_B != 0)); cover(out_valid); end `endif endmodule module full_adder(clk, reset, ain, bin, cin, sum, cout); input clk, reset; input ain, bin, cin; output reg sum, cout; // Full Adder Equations // Sum = A ⊕ B ⊕ Cin and Cout = (A ⋅ B) + (Cin ⋅ (A ⊕ B)) // where A ⊕ B is equivalent to A XOR B , A ⋅ B is equivalent to A AND B always @(posedge clk) begin if(reset) begin sum <= 0; cout <= 0; end else begin sum <= ain^bin^cin; cout <= (ain & bin) | (cin & (ain^bin)); //cout <= (ain * bin) + (cin * (ain - bin)); end end endmodule
Last edited: