+ Post New Thread
Results 1 to 13 of 13
  1. #1
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    mutlplty verilog code does not multiply

    Why the following multiply verilog code does not multiply ?

    Besides, in order to debug the code, I need access to the variable "middle_layers", but gtkwave is not giving me access to it. Why ?

    I have also tried formal verification using yosys-smtbmc, but surprisingly the code failed even the simplest cover(in_valid) which I do not understand at all.

    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    
    module multiply(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B
     
    parameter A_WIDTH = 16;
    parameter B_WIDTH = 16;
     
    input clk, reset;
    input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
    input signed [(A_WIDTH-1):0] in_A;
    input signed [(B_WIDTH-1):0] in_B;
    output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
    output reg out_valid; // to signify that out_C is valid, multiplication finished
     
    /* 
       This signed multiplier code architecture is a combination of row adder tree and 
       modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
       with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
     
       see [url]https://i.imgur.com/NaqjC6G.png[/url] or 
       Row Adder Tree Multipliers in [url]http://www.andraka.com/multipli.php[/url] or
       [url]https://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10[/url]
       regarding the mechanisms within all layers
     
       In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
       closer to the input take up real estate (remember the structure of row adder tree).  As the 
       size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
       placement that does not use long routes involving multiple switch nodes for FPGA.  The result
       is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
     
       For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
       sign extension (implemented as verilog example in [url]https://www.dsprelated.com/showarticle/555.php[/url]),
       thus smaller final routed silicon area.
     
       [url]https://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/[/url]
     
       All layers are pipelined, so throughput = one result for each clock cycle 
       but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
    */
     
     
    // The multiplication of two numbers is equivalent to adding as many copies of one 
    // of them, the multiplicand, as the value of the other one, the multiplier.
    // Therefore, multiplicand always have the larger width compared to multipliers
     
    localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
    localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
     
    wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
    wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;
     
    localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
     
     
    /*Binary multiplications and additions for partial products rows*/
     
    // first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
    // This resulted in a binary tree with faster vertical addition processes as we have 
    // lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
     
    // intermediate partial product rows additions
    // Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
    // being re-arranged into binary row adder tree
    // such that additions can be done in O(logN) time
     
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0] middle_layers;
    reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
    //reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
     
    generate // duplicates the leafs of the binary tree
     
        genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
     
        for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
     
            integer pp_index; // leaf index within each layer of the tree
            integer bit_index; // index of binary string within each leaf
     
            always @(posedge clk)
            begin
                if(reset) 
                begin
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 0;
                end
     
                else begin
                
                    if(layer == 0)  // all partial products rows are in first layer
                    begin
                    
                        // generation of partial products rows
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <= 
                            (MULTIPLICAND & MULTIPLIPLIER[pp_index]);
                            
                        // see modified baugh-wooley algorithm: [url]https://i.imgur.com/VcgbY4g.png[/url]
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                           !middle_layers[layer][pp_index][LARGER_WIDTH-1];
                            
                        middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1];
                        middle_layers[layer][0][LARGER_WIDTH] <= 1;
                        middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                    end
                    
                    // adding the partial product rows according to row adder tree architecture
                    else begin
                        for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <=
                            middle_layers[layer-1][pp_index<<1] +
                          (middle_layers[layer-1][(pp_index<<1) + 1]) << 1;
                        
                        // bit-level additions using full adders
                        /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1)
                                full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/
                    end
                end
            end
        end
     
    endgenerate
     
    assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0];
     
     
    /*Checking if the final multiplication result is ready or not*/
     
    reg [($clog2($clog2(SMALLER_WIDTH))-1):0] out_valid_counter; // to track the multiply stages
    reg multiply_had_started;
     
    always @(posedge clk)
    begin
        if(reset) 
        begin
            multiply_had_started <= 0;
            out_valid <= 0;
            out_valid_counter <= 0;
        end
     
        else if(out_valid_counter == $clog2(SMALLER_WIDTH)-1) begin
            multiply_had_started <= 0;
            out_valid <= 1;
            out_valid_counter <= 0;
        end
        
        else if(in_valid && !multiply_had_started) begin
            multiply_had_started <= 1;
            out_valid <= 0; // for consecutive multiplication
        end
        
        else begin
            out_valid <= 0;
            if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
        end
    end
     
     
    `ifdef FORMAL
     
    initial assume(in_valid == 0);
    initial assert(out_valid == 0);
    initial assert(out_valid_counter == 0);
     
    wire sign_bit = in_A[A_WIDTH-1] ^ in_B[B_WIDTH-1];
     
    always @(posedge clk)
    begin
        if(reset) assert(out_C == 0);
        
        else if(out_valid) begin
            assert(out_C == (in_A * in_B));
            assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
        end
    end
     
    `endif
     
    `ifdef FORMAL
     
    always @(posedge clk)
    begin
        cover(in_valid && (in_A != 0) && (in_B != 0));
        cover(out_valid);
    end
     
    `endif
     
    endmodule
     
     
    module full_adder(clk, reset, ain, bin, cin, sum, cout);
     
    input clk, reset;
    input ain, bin, cin;
    output reg sum, cout;
     
    // Full Adder Equations
    // Sum = A ⊕ B ⊕ Cin and Cout = (A ⋅ B) + (Cin ⋅ (A ⊕ B))
    // where A ⊕ B is equivalent to A XOR B , A ⋅ B is equivalent to A AND B
     
    always @(posedge clk)
    begin
        if(reset)
        begin
            sum <= 0;
            cout <= 0;
        end
        
        else begin
            sum <= ain^bin^cin;
            cout <= (ain & bin) | (cin & (ain^bin));
            //cout <= (ain * bin) + (cin * (ain - bin)); 
        end
    end
     
    endmodule

    Last edited by promach; 9th February 2019 at 01:59.

  2. #2
    Super Moderator
    Points: 30,789, Level: 42
    ads-ee's Avatar
    Join Date
    Sep 2013
    Location
    USA
    Posts
    7,084
    Helped
    1692 / 1692
    Points
    30,789
    Level
    42

    Re: mutlplty verilog code does not multiply

    does something in modelsim, though the answer is wrong.

    FYI modelsim doesn't add the waveform for the middle_layers array either using add wave -r * (have to add it manually). I believe it defaults to not adding arrays as they may represent large memory arrays which you don't normally want to simulate, it might be a similar problem with the VCD output you are generating from whatever simulator you are running. You'll probably have to explicitly add that signal for VCD output.

    - - - Updated - - -

    Depending on how you generate your reset in the testbench, I would move the reset over by half a clock cycle, I don't think either edge of your clock sees the reset go active as you use a synchronous reset. In my modelsim sim, I generated the reset pulse on the falling edge.


    1 members found this post helpful.

  3. #3
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    I have the following waveform after modifying the reset signal in the testbench

    Note: I am using iverilog, not modelsim.

    test_multiply.v

    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    
    // Testbench
    module test_multiply;
     
      parameter A_WIDTH=16, B_WIDTH=16;
      
      reg i_clk;
      reg i_reset;
      reg i_ce;
      reg signed[(A_WIDTH-1):0] i_a;
      reg signed[(B_WIDTH-1):0] i_b;
      wire signed[(A_WIDTH+B_WIDTH-1):0]    o_p;
      wire o_valid;
      
      // Instantiate design under test
      multiply mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
              
      initial begin
        // Dump waves
        $dumpfile("test_multiply.vcd");
        $dumpvars(0, test_multiply);
        
        $display("Reset flop.");
        i_clk = 0;
        i_reset = 0;
        i_ce = 0;
        i_a = 0;
        i_b = 0;
     
      end
     
      always #5 i_clk = !i_clk;
     
      initial begin
        
        @(posedge i_clk);
        @(posedge i_clk);
     
        i_reset = 1;
     
        @(posedge i_clk);
        @(posedge i_clk);
        
        i_reset = 0;
     
        @(posedge i_clk);
        @(posedge i_clk);
     
        i_ce = 1;
        i_a = 16'h8;
        i_b = 16'h7;
     
     
        #400 $finish;
     
      end
      
    endmodule




    •   AltAdvertisement

        
       

  4. #4
    Super Moderator
    Points: 30,789, Level: 42
    ads-ee's Avatar
    Join Date
    Sep 2013
    Location
    USA
    Posts
    7,084
    Helped
    1692 / 1692
    Points
    30,789
    Level
    42

    Re: mutlplty verilog code does not multiply

    I get the same results as you, which is definitely not the answer of 56 for 7 * 8 :'(

    Or is this that "new math" they teach now days in school



  5. #5
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    Using the suggestions at https://github.com/steveicarus/iveri...ment-129031448 and http://inf-server.inf.uth.gr/~konsta...ide.pdf#page=5

    I have the following iverilog error :

    [phung@archlinux phung]$ iverilog -o multiply test_multiply.v multiply.v
    test_multiply.v:36: syntax error
    test_multiply.v:36: error: invalid module item.
    [phung@archlinux phung]$
    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    
    // Testbench
    module test_multiply;
     
      parameter A_WIDTH=16, B_WIDTH=16;
      
      reg i_clk;
      reg i_reset;
      reg i_ce;
      reg signed[(A_WIDTH-1):0] i_a;
      reg signed[(B_WIDTH-1):0] i_b;
      wire signed[(A_WIDTH+B_WIDTH-1):0]    o_p;
      wire o_valid;
      
      // Instantiate design under test
      multiply mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
      
      
      initial begin
        // Dump waves
        $dumpfile("test_multiply.vcd");
        $dumpvars(0, test_multiply);
        
        i_clk = 0;
        i_reset = 0;
        i_ce = 0;
        i_a = 0;
        i_b = 0;
     
      end
     
      genvar i, j; // array index
      
      generate
        for(i = 0; i < 4; i = i + 1) begin
            for(j = 0; j < 16; j = j + 1) begin
                $dumpvars(0, test_multiply.multiply.middle_layers[i][j]);
            end
        end
      endgenerate
     
      always #5 i_clk = !i_clk;
     
      initial begin
        
        @(posedge i_clk);
        @(posedge i_clk);
     
        $display("Reset flop.");
     
        i_reset = 1;
     
        @(posedge i_clk);
        @(posedge i_clk);
        
        i_reset = 0;
     
        @(posedge i_clk);
        @(posedge i_clk);
     
        i_ce = 1;
        i_a = 16'h8;
        i_b = 16'h7;
     
     
        #400 $finish;
     
      end
      
    endmodule



  6. #6
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    @FvM

    I just got my cover(in_valid) passed in FORMAL verification, therefore I have the following waveform to investigate.



    Note that the middle_layers bit length does not match the bitwidth described in line 91 of multiply.v . Why ?

    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    
    module multiply(clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B
     
    `ifdef FORMAL
    parameter A_WIDTH = 4;
    parameter B_WIDTH = 4;
     
    `else
     
    parameter A_WIDTH = 16;
    parameter B_WIDTH = 16;
    `endif
     
    input clk, reset;
    input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
    input signed [(A_WIDTH-1):0] in_A;
    input signed [(B_WIDTH-1):0] in_B;
    output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
    output reg out_valid; // to signify that out_C is valid, multiplication finished
     
    /* 
       This signed multiplier code architecture is a combination of row adder tree and 
       modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
       with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
     
       see [url]https://i.imgur.com/NaqjC6G.png[/url] or 
       Row Adder Tree Multipliers in [url]http://www.andraka.com/multipli.php[/url] or
       [url]https://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10[/url]
       regarding the mechanisms within all layers
     
       In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
       closer to the input take up real estate (remember the structure of row adder tree).  As the 
       size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
       placement that does not use long routes involving multiple switch nodes for FPGA.  The result
       is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
     
       For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
       sign extension (implemented as verilog example in [url]https://www.dsprelated.com/showarticle/555.php[/url]),
       thus smaller final routed silicon area.
     
       [url]https://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/[/url]
     
       All layers are pipelined, so throughput = one result for each clock cycle 
       but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
    */
     
     
    // The multiplication of two numbers is equivalent to adding as many copies of one 
    // of them, the multiplicand, as the value of the other one, the multiplier.
    // Therefore, multiplicand always have the larger width compared to multipliers
     
    localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
    localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
     
    wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
    wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;
     
    `ifdef FORMAL
    // to keep the values of multiplicand and multiplier before the multiplication finishes 
    reg [(LARGER_WIDTH-1):0] MULTIPLICAND_reg;
    reg [(SMALLER_WIDTH-1):0] MULTIPLIPLIER_reg;
     
    always @(posedge clk)
    begin
        if(reset) begin
            MULTIPLICAND_reg <= 0;
            MULTIPLIPLIER_reg <= 0;
        end
     
        else if(in_valid) begin
            MULTIPLICAND_reg <= MULTIPLICAND;
            MULTIPLIPLIER_reg <= MULTIPLIPLIER;
        end
    end
    `endif
     
    localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
     
     
    /*Binary multiplications and additions for partial products rows*/
     
    // first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
    // This resulted in a binary tree with faster vertical addition processes as we have 
    // lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
     
    // intermediate partial product rows additions
    // Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
    // being re-arranged into binary row adder tree
    // such that additions can be done in O(logN) time
     
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][(SMALLER_WIDTH-1):0][(A_WIDTH+B_WIDTH-1):0] middle_layers;
    reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
    //reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
     
    generate // duplicates the leafs of the binary tree
     
        genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
     
        for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
     
            integer pp_index; // leaf index within each layer of the tree
            integer bit_index; // index of binary string within each leaf
     
            always @(posedge clk)
            begin
                if(reset) 
                begin
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 0;
                end
     
                else begin
                
                    if(layer == 0)  // all partial products rows are in first layer
                    begin
                    
                        // generation of partial products rows
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <= 
                            (MULTIPLICAND & MULTIPLIPLIER[pp_index]);
                            
                        // see modified baugh-wooley algorithm: [url]https://i.imgur.com/VcgbY4g.png[/url]
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                           !middle_layers[layer][pp_index][LARGER_WIDTH-1];
                            
                        middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1];
                        middle_layers[layer][0][LARGER_WIDTH] <= 1;
                        middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                    end
                    
                    // adding the partial product rows according to row adder tree architecture
                    else begin
                        for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <=
                            middle_layers[layer-1][pp_index<<1] +
                          (middle_layers[layer-1][(pp_index<<1) + 1]) << 1;
                        
                        // bit-level additions using full adders
                        /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1)
                                full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/
                    end
                end
            end
        end
     
    endgenerate
     
    assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0];
     
     
    /*Checking if the final multiplication result is ready or not*/
     
    reg [($clog2($clog2(SMALLER_WIDTH))-1):0] out_valid_counter; // to track the multiply stages
    reg multiply_had_started;
     
    always @(posedge clk)
    begin
        if(reset) 
        begin
            multiply_had_started <= 0;
            out_valid <= 0;
            out_valid_counter <= 0;
        end
     
        else if(out_valid_counter == $clog2(SMALLER_WIDTH)-1) begin
            multiply_had_started <= 0;
            out_valid <= 1;
            out_valid_counter <= 0;
        end
        
        else if(in_valid && !multiply_had_started) begin
            multiply_had_started <= 1;
            out_valid <= 0; // for consecutive multiplication
        end
        
        else begin
            out_valid <= 0;
            if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
        end
    end
     
     
    `ifdef FORMAL
     
    initial assume(reset);
    initial assume(in_valid == 0);
    //initial assert(out_valid == 0);
    //initial assert(out_valid_counter == 0);
     
    wire sign_bit = MULTIPLICAND_reg[LARGER_WIDTH-1] ^ MULTIPLIPLIER_reg[SMALLER_WIDTH-1];
     
    always @(posedge clk)
    begin
        if(reset) assert(out_C == 0);
        
        else if(out_valid) begin
            assert(out_C == (MULTIPLICAND_reg * MULTIPLIPLIER_reg));
            assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
        end
    end
     
    `endif
     
    `ifdef FORMAL
     
    localparam user_A = 3;
    localparam user_B = 6;
     
    always @(posedge clk)
    begin
        cover(in_valid && (in_A == user_A) && (in_B == user_B));
        cover(out_valid);
    end
     
    `endif
     
    endmodule
     
     
    module full_adder(clk, reset, ain, bin, cin, sum, cout);
     
    input clk, reset;
    input ain, bin, cin;
    output reg sum, cout;
     
    // Full Adder Equations
    // Sum = A ⊕ B ⊕ Cin and Cout = (A ⋅ B) + (Cin ⋅ (A ⊕ B))
    // where A ⊕ B is equivalent to A XOR B , A ⋅ B is equivalent to A AND B
     
    always @(posedge clk)
    begin
        if(reset)
        begin
            sum <= 0;
            cout <= 0;
        end
        
        else begin
            sum <= ain^bin^cin;
            cout <= (ain & bin) | (cin & (ain^bin));
            //cout <= (ain * bin) + (cin * (ain - bin)); 
        end
    end
     
    endmodule



    •   AltAdvertisement

        
       

  7. #7
    Super Moderator
    Points: 30,789, Level: 42
    ads-ee's Avatar
    Join Date
    Sep 2013
    Location
    USA
    Posts
    7,084
    Helped
    1692 / 1692
    Points
    30,789
    Level
    42

    Re: mutlplty verilog code does not multiply

    multiply mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
    $dumpvars(0, test_multiply.multiply.middle_layers[i][j]);
    error is in red

    - - - Updated - - -

    Quote Originally Posted by promach View Post
    Note that the middle_layers bit length does not match the bitwidth described in line 91. Why ?
    Line 91: reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];

    An ifdef that must be set to FORMAL
    `ifdef FORMAL
    parameter A_WIDTH = 4;
    parameter B_WIDTH = 4;

    `else

    parameter A_WIDTH = 16;
    parameter B_WIDTH = 16;
    `endif
    You shouldn't even have the ifdef stuff, you should use the Verilog 2001 parameter declaration in the module port and then use the Verilog 2001 named association for the parameter passing. You would then set the FORMAL definition of the widths from outside the code.



  8. #8
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    @ads-ee

    Note that the middle_layers bit length does not match the bitwidth described in line 91
    You still had not answered why yet.

    For the ifdef issue, it is not the reason causing the bitwidth mismatch, right ?

    Regarding the red error you pointed out, you did not seem to answer it correctly. See the reply from iverilog developer : https://github.com/steveicarus/iveri...ment-462079234

    When I try with vivado builtin simulator, I am surprised that I do not face any bitwidth mismatch issue for the 3D array "middle_layers"
    However, I still have logic bug where the multiplication result does not follow the modified baugh-wooley algorithm. Which part of the code in the screenshot below did I code wrongly ?

    Note: The last four signals in the waveform plot are for layer=0




  9. #9
    Super Moderator
    Points: 30,789, Level: 42
    ads-ee's Avatar
    Join Date
    Sep 2013
    Location
    USA
    Posts
    7,084
    Helped
    1692 / 1692
    Points
    30,789
    Level
    42

    Re: mutlplty verilog code does not multiply

    Quote Originally Posted by promach View Post
    @ads-ee

    You still had not answered why yet.

    For the ifdef issue, it is not the reason causing the bitwidth mismatch, right ?
    Yes it is. It is the only thing selecting the parameters A_WIDTH and B_WIDTH in your multiply code. You don't have any defparams in your testbench code to change the parameters to anything else, so the setting of FORMAL will determine the parameter settings. You have A and B inputs of 4-bits and multiplying them will give a result of 8-bits, which is NOT a mismatch in widths. Either that or you are asking some other question incorrectly and I'm answering a different question as your question is poorly written....

    You should use Verilog 2001 port syntax instead, given how it is now 2019, 18 years after that standard was introduced. I also would avoid using ifdefs, most if not all uses of them are really abusing them in some form or another. Also as defines are global in nature, they can result in problems elsewhere when someone uses the same `define somewhere else overriding the expected value somewhere else breaking that ifdef.

    Regarding the red error you pointed out, you did not seem to answer it correctly. See the reply from iverilog developer : https://github.com/steveicarus/iveri...ment-462079234
    well you also posted code with a typo, which you didn't do on that other site. I don't go opening arbitrary links so I only looked at the code you posted which has test_multiply.multiply.middle_layers which does not use the correct instance name of the multiply module, which is mult.

    Ever heard of the term GIGO? It stands for garbage in garbage out...post an incorrect question with errors and you will get a wrong answer. I stopped on the first error I see, didn't even pay attention to the bit slice stuff. Besides that I don't use iverilog and noticed that according to every release note for it, that it does support a lot of language features of even Verilog2001 and nothing from SV, so I refuse to waste time with it.



  10. #10
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    The following verilog source code and/or testbench works nicely across commercial simulators, iverilog as well as formal verification tool (yosys-smtbmc)

    Please keep the complaint about `ifdef FORMAL until later. I need them to use with yosys-smtbmc which does not support bind command yet.

    I am now debugging the generate coding since the multiplication (using modified baugh-wooley algorithm) does not work yet.
    See the screenshot at post #8

    test_multiply.v

    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    
    // Testbench
    module test_multiply;
     
      parameter A_WIDTH=4, B_WIDTH=4;
      
      reg i_clk;
      reg i_reset;
      reg i_ce;
      reg signed[(A_WIDTH-1):0] i_a;
      reg signed[(B_WIDTH-1):0] i_b;
      wire signed[(A_WIDTH+B_WIDTH-1):0] o_p;
      wire o_valid;
      
      // Instantiate design under test
      multiply #(A_WIDTH, B_WIDTH) mul(.clk(i_clk), .reset(i_reset), .in_valid(i_ce), .in_A(i_a), .in_B(i_b), .out_valid(o_valid), .out_C(o_p));
      
      
      initial begin
        // Dump waves
        $dumpfile("test_multiply.vcd");
        $dumpvars(0, test_multiply);
        
        i_clk = 0;
        i_reset = 0;
        i_ce = 0;
        i_a = 0;
        i_b = 0;
     
      end
     
      localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
      localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
     
      genvar i, j; // array index
      
      generate
        for(i = 0; i < NUM_OF_INTERMEDIATE_LAYERS; i = i + 1) begin
            for(j = 0; j < SMALLER_WIDTH; j = j + 1) begin
                initial $dumpvars(0, test_multiply.mul.middle_layers[i][j]);
            end
        end
      endgenerate
     
      always #5 i_clk = !i_clk;
     
      initial begin
        
        @(posedge i_clk);
        @(posedge i_clk);
     
        $display("Reset flop.");
     
        i_reset = 1;
     
        @(posedge i_clk);
        @(posedge i_clk);
        
        i_reset = 0;
     
        @(posedge i_clk);
        @(posedge i_clk);
     
        i_ce = 1;
        i_a = 3;
        i_b = 2;
     
        #50 $finish;
     
      end
      
    endmodule


    multiply.v

    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    
    module multiply #(parameter A_WIDTH=16, B_WIDTH=16)
    (clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B
     
    `ifdef FORMAL
    parameter A_WIDTH = 4;
    parameter B_WIDTH = 4;
    `endif
     
    input clk, reset;
    input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
    input signed [(A_WIDTH-1):0] in_A;
    input signed [(B_WIDTH-1):0] in_B;
    output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
    output reg out_valid; // to signify that out_C is valid, multiplication finished
     
    /* 
       This signed multiplier code architecture is a combination of row adder tree and 
       modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
       with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
     
       see [url]https://i.imgur.com/NaqjC6G.png[/url] or 
       Row Adder Tree Multipliers in [url]http://www.andraka.com/multipli.php[/url] or
       [url]https://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10[/url]
       regarding the mechanisms within all layers
     
       In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
       closer to the input take up real estate (remember the structure of row adder tree).  As the 
       size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
       placement that does not use long routes involving multiple switch nodes for FPGA.  The result
       is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
     
       For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
       sign extension (implemented as verilog example in [url]https://www.dsprelated.com/showarticle/555.php[/url]),
       thus smaller final routed silicon area.
     
       [url]https://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/[/url]
     
       All layers are pipelined, so throughput = one result for each clock cycle 
       but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
    */
     
     
    // The multiplication of two numbers is equivalent to adding as many copies of one 
    // of them, the multiplicand, as the value of the other one, the multiplier.
    // Therefore, multiplicand always have the larger width compared to multipliers
     
    localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
    localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
     
    wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
    wire [(SMALLER_WIDTH-1):0] MULTIPLIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;
     
    `ifdef FORMAL
    // to keep the values of multiplicand and multiplier before the multiplication finishes 
    reg [(LARGER_WIDTH-1):0] MULTIPLICAND_reg;
    reg [(SMALLER_WIDTH-1):0] MULTIPLIPLIER_reg;
     
    always @(posedge clk)
    begin
        if(reset) begin
            MULTIPLICAND_reg <= 0;
            MULTIPLIPLIER_reg <= 0;
        end
     
        else if(in_valid) begin
            MULTIPLICAND_reg <= MULTIPLICAND;
            MULTIPLIPLIER_reg <= MULTIPLIPLIER;
        end
    end
    `endif
     
    localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
     
     
    /*Binary multiplications and additions for partial products rows*/
     
    // first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
    // This resulted in a binary tree with faster vertical addition processes as we have 
    // lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
     
    // intermediate partial product rows additions
    // Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
    // being re-arranged into binary row adder tree
    // such that additions can be done in O(logN) time
     
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][(SMALLER_WIDTH-1):0][(A_WIDTH+B_WIDTH-1):0] middle_layers;
    reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)];
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
    //reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
     
    generate // duplicates the leafs of the binary tree
     
        genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
     
        for(layer=0; layer<NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
     
            integer pp_index; // leaf index within each layer of the tree
            integer bit_index; // index of binary string within each leaf
     
            always @(posedge clk)
            begin
                if(reset) 
                begin
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 0;
                end
     
                else begin
                
                    if(layer == 0)  // all partial products rows are in first layer
                    begin
                    
                        // generation of partial products rows
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <= 
                            (MULTIPLICAND & MULTIPLIPLIER[pp_index]);
                            
                        // see modified baugh-wooley algorithm: [url]https://i.imgur.com/VcgbY4g.png[/url] from
                                            // page 122 of book "Ultra-Low-Voltage Design of Energy-Efficient Digital Circuits"
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                           !middle_layers[layer][pp_index][LARGER_WIDTH-1];
                            
                        middle_layers[layer][SMALLER_WIDTH-1] <= !middle_layers[layer][SMALLER_WIDTH-1];
                        middle_layers[layer][0][LARGER_WIDTH] <= 1;
                        middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                    end
                    
                    // adding the partial product rows according to row adder tree architecture
                    else begin
                        for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <=
                            middle_layers[layer-1][pp_index<<1] +
                          (middle_layers[layer-1][(pp_index<<1) + 1]) << 1;
                        
                        // bit-level additions using full adders
                        /*for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            for(bit_index=0; bit_index<(LARGER_WIDTH+layer); bit_index=bit_index+1)
                                full_adder fa(.clk(clk), .reset(reset), .ain(), .bin(), .cin(), .sum(), .cout());*/
                    end
                end
            end
        end
     
    endgenerate
     
    assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS-1][0];
     
     
    /*Checking if the final multiplication result is ready or not*/
     
    reg [($clog2(NUM_OF_INTERMEDIATE_LAYERS)-1):0] out_valid_counter; // to track the multiply stages
    reg multiply_had_started;
     
    always @(posedge clk)
    begin
        if(reset) 
        begin
            multiply_had_started <= 0;
            out_valid <= 0;
            out_valid_counter <= 0;
        end
     
        else if(out_valid_counter == NUM_OF_INTERMEDIATE_LAYERS-1) begin
            multiply_had_started <= 0;
            out_valid <= 1;
            out_valid_counter <= 0;
        end
        
        else if(in_valid && !multiply_had_started) begin
            multiply_had_started <= 1;
            out_valid <= 0; // for consecutive multiplication
        end
        
        else begin
            out_valid <= 0;
            if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
        end
    end
     
     
    `ifdef FORMAL
     
    initial assume(reset);
    initial assume(in_valid == 0);
     
    wire sign_bit = MULTIPLICAND_reg[LARGER_WIDTH-1] ^ MULTIPLIPLIER_reg[SMALLER_WIDTH-1];
     
    always @(posedge clk)
    begin
        if(reset) assert(out_C == 0);
        
        else if(out_valid) begin
            assert(out_C == (MULTIPLICAND_reg * MULTIPLIPLIER_reg));
            assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
        end
    end
     
    `endif
     
    `ifdef FORMAL
     
    localparam user_A = 3;
    localparam user_B = 2;
     
    always @(posedge clk)
    begin
        cover(in_valid && (in_A == user_A) && (in_B == user_B));
        cover(out_valid);
    end
     
    `endif
     
    endmodule
     
     
    /*module full_adder(clk, reset, ain, bin, cin, sum, cout);
     
    input clk, reset;
    input ain, bin, cin;
    output reg sum, cout;
     
    // Full Adder Equations
    // Sum = A ⊕ B ⊕ Cin and Cout = (A ⋅ B) + (Cin ⋅ (A ⊕ B))
    // where A ⊕ B is equivalent to A XOR B , A ⋅ B is equivalent to A AND B
     
    always @(posedge clk)
    begin
        if(reset)
        begin
            sum <= 0;
            cout <= 0;
        end
        
        else begin
            sum <= ain^bin^cin;
            cout <= (ain & bin) | (cin & (ain^bin));
            //cout <= (ain * bin) + (cin * (ain - bin)); 
        end
    end
     
    endmodule*/



    •   AltAdvertisement

        
       

  11. #11
    Newbie level 1
    Points: 13, Level: 1

    Join Date
    Feb 2019
    Posts
    1
    Helped
    0 / 0
    Points
    13
    Level
    1

    Re: mutlplty verilog code does not multiply

    Quote Originally Posted by ads-ee View Post
    FYI modelsim doesn't add the waveform for the middle_layers array either using add wave -r * (have to add it manually). I believe it defaults to not adding arrays as they may represent large memory arrays which you don't normally want to simulate, it might be a similar problem with the VCD output you are generating from whatever simulator you are running. You'll probably have to explicitly add that signal for VCD output.
    On Icarus, you definitely need to dump each array row explicitly using $dumpvars, though you only have to specify the index/indices parts without having to specify the bitrange. This goes for VCD and for the faster database formats such as FST.

    VCS and CVC automatically dump arrays fully without needing to step through them in testbench code--at least for FSDB. Signal names between Icarus and other simulators differ for arrays because Icarus escapes the hierarchy of the signal name where the actual signal name starts, probably because array rows contain '[' and ']' characters, probably to be compliant with what is in the Verilog spec. In order to be able to use the same gtkwave save file regardless of simulator, I wound up hacking my source for Icarus so the names dumped into FST would appear the same as for when I simulate using VCS.

    -Tony



  12. #12
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    I have already solved the dumpvars issue for multi-dimensional array.

    The problem now is the simulation waveform does not follow the algorithm.

    See the screenshot at post #8 and the code at post #10

    - - - Updated - - -

    if I comment out lines 119 to 125 , then I have all zeroes for the 2D array of vectors, middle_layers. Why ?

    MULTIPLICAND & MULTIPLIPLIER[pp_index] seems to be the culprit ??




  13. #13
    Advanced Member level 2
    Points: 3,174, Level: 13

    Join Date
    Feb 2016
    Posts
    607
    Helped
    1 / 1
    Points
    3,174
    Level
    13

    Re: mutlplty verilog code does not multiply

    Problem solved : See https://gist.github.com/promach/5f2d...ile-multiply-v

    It now gives correct signed multiplication result both in vivado simulation and cover() within formal verification

    Code Verilog - [expand]
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    
    module multiply #(parameter A_WIDTH=16, B_WIDTH=16)
    (clk, reset, in_valid, out_valid, in_A, in_B, out_C); // C=A*B
     
    `ifdef FORMAL
    parameter A_WIDTH = 4;
    parameter B_WIDTH = 4;
    `endif
     
    input clk, reset;
    input in_valid; // to signify that in_A, in_B are valid, multiplication process can start
    input signed [(A_WIDTH-1):0] in_A;
    input signed [(B_WIDTH-1):0] in_B;
    output signed [(A_WIDTH+B_WIDTH-1):0] out_C;
    output reg out_valid; // to signify that out_C is valid, multiplication finished
     
    /* 
       This signed multiplier code architecture is a combination of row adder tree and 
       modified baugh-wooley algorithm, thus requires an area of O(N*M*logN) and time O(logN)
       with M, N being the length(bitwidth) of the multiplicand and multiplier respectively
     
       see [url]https://i.imgur.com/NaqjC6G.png[/url] or 
       Row Adder Tree Multipliers in [url]http://www.andraka.com/multipli.php[/url] or
       [url]https://pdfs.semanticscholar.org/415c/d98dafb5c9cb358c94189927e1f3216b7494.pdf#page=10[/url]
       regarding the mechanisms within all layers
     
       In terms of fmax consideration: In the case of an adder tree, the adders making up the levels
       closer to the input take up real estate (remember the structure of row adder tree).  As the 
       size of the input multiplicand bitwidth grows, it becomes more and more difficult to find a
       placement that does not use long routes involving multiple switch nodes for FPGA.  The result
       is the maximum clocking speed degrades quickly as the size of the bitwidth grows.
     
       For signed multiplication, see also modified baugh-wooley algorithm for trick in skipping 
       sign extension (implemented as verilog example in [url]https://www.dsprelated.com/showarticle/555.php[/url]),
       thus smaller final routed silicon area.
     
       [url]https://stackoverflow.com/questions/54268192/understanding-modified-baugh-wooley-multiplication-algorithm/[/url]
     
       All layers are pipelined, so throughput = one result for each clock cycle 
       but each multiplication result still have latency = NUM_OF_INTERMEDIATE_LAYERS 
    */
     
     
    // The multiplication of two numbers is equivalent to adding as many copies of one 
    // of them, the multiplicand, as the value of the other one, the multiplier.
    // Therefore, multiplicand always have the larger width compared to multipliers
     
    localparam SMALLER_WIDTH = (A_WIDTH <= B_WIDTH) ? A_WIDTH : B_WIDTH;
    localparam LARGER_WIDTH = (A_WIDTH > B_WIDTH) ? A_WIDTH : B_WIDTH;
     
    wire [(LARGER_WIDTH-1):0] MULTIPLICAND = (A_WIDTH > B_WIDTH) ? in_A : in_B ;
    wire [(SMALLER_WIDTH-1):0] MULTIPLIER = (A_WIDTH <= B_WIDTH) ? in_A : in_B ;
     
    `ifdef FORMAL
    // to keep the values of multiplicand and multiplier before the multiplication finishes 
    reg signed [(LARGER_WIDTH-1):0] MULTIPLICAND_reg;
    reg signed [(SMALLER_WIDTH-1):0] MULTIPLIER_reg;
     
    always @(posedge clk)
    begin
        if(reset) begin
            MULTIPLICAND_reg <= 0;
            MULTIPLIPLIER_reg <= 0;
        end
     
        else if(in_valid) begin
            MULTIPLICAND_reg <= MULTIPLICAND;
            MULTIPLIER_reg <= MULTIPLIER;
        end
    end
    `endif
     
    localparam NUM_OF_INTERMEDIATE_LAYERS = $clog2(SMALLER_WIDTH);
     
     
    /*Binary multiplications and additions for partial products rows*/
     
    // first layer has "SMALLER_WIDTH" entries of data of width "LARGER_WIDTH"
    // This resulted in a binary tree with faster vertical addition processes as we have 
    // lesser (NUM_OF_INTERMEDIATE_LAYERS) rows to add
     
    // intermediate partial product rows additions
    // Imagine a rhombus of height of "SMALLER_WIDTH" and width of "LARGER_WIDTH"
    // being re-arranged into binary row adder tree
    // such that additions can be done in O(logN) time
     
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0][(SMALLER_WIDTH-1):0][(A_WIDTH+B_WIDTH-1):0] middle_layers;
    reg [(A_WIDTH+B_WIDTH-1):0] middle_layers[NUM_OF_INTERMEDIATE_LAYERS:0][0:(SMALLER_WIDTH-1)];
    //reg [(NUM_OF_INTERMEDIATE_LAYERS-1):0] middle_layers [0:(SMALLER_WIDTH-1)] [(A_WIDTH+B_WIDTH-1):0];
    //reg middle_layers [(NUM_OF_INTERMEDIATE_LAYERS-1):0][0:(SMALLER_WIDTH-1)][(A_WIDTH+B_WIDTH-1):0];
     
    generate // duplicates the leafs of the binary tree
     
        genvar layer; // layer 0 means the youngest leaf, layer N means the tree trunk
     
        for(layer=0; layer<=NUM_OF_INTERMEDIATE_LAYERS; layer=layer+1) begin: intermediate_layers
     
            integer pp_index; // leaf index within each layer of the tree
     
            always @(posedge clk)
            begin
                if(reset) 
                begin
                    for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                        middle_layers[layer][pp_index] <= 0;
                end
     
                else begin      
        
                    if(layer == 0)  // all partial products rows are in first layer
                    begin               
                        // generation of partial products rows
                        for(pp_index=0; pp_index<SMALLER_WIDTH ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index] <= MULTIPLIER[pp_index] ? MULTIPLICAND:0;    
                            
                        // see modified baugh-wooley algorithm: [url]https://i.imgur.com/VcgbY4g.png[/url] from
                        // page 122 of book: Ultra-Low-Voltage Design of Energy-Efficient Digital Circuits
                        for(pp_index=0; pp_index<(SMALLER_WIDTH-1) ; pp_index=pp_index+1)
                            middle_layers[layer][pp_index][LARGER_WIDTH-1] <= 
                            (MULTIPLICAND[LARGER_WIDTH-1] & MULTIPLIER[pp_index]) ? 0:1;
                            
                        for(pp_index=0; pp_index<(LARGER_WIDTH-1) ; pp_index=pp_index+1)
                            middle_layers[layer][SMALLER_WIDTH-1][pp_index] <= 
                            (MULTIPLICAND[pp_index] & MULTIPLIER[SMALLER_WIDTH-1]) ? 0:1;
     
                        middle_layers[layer][0][LARGER_WIDTH] <= 1;
                        middle_layers[layer][SMALLER_WIDTH-1][LARGER_WIDTH] <= 1;
                    end
                    
                    // adding the partial product rows according to row adder tree architecture
                    else begin
                        for(pp_index=0; pp_index<(SMALLER_WIDTH >> layer) ; pp_index=pp_index+1)
                        begin
                            if(pp_index==0)
                                middle_layers[layer][pp_index] <=
                                middle_layers[layer-1][0] +
                                (middle_layers[layer-1][1] << layer);
     
                            else middle_layers[layer][pp_index] <=
                                middle_layers[layer-1][pp_index<<1] +
                                (middle_layers[layer-1][(pp_index<<1) + 1] << layer);
                        end
                    end
                end
            end
        end
     
    endgenerate
     
    assign out_C = (reset)? 0 : middle_layers[NUM_OF_INTERMEDIATE_LAYERS][0];
     
     
    /*Checking if the final multiplication result is ready or not*/
     
    reg [($clog2(NUM_OF_INTERMEDIATE_LAYERS)-1):0] out_valid_counter; // to track the multiply stages
    reg multiply_had_started;
     
    always @(posedge clk)
    begin
        if(reset) 
        begin
            multiply_had_started <= 0;
            out_valid <= 0;
            out_valid_counter <= 0;
        end
     
        else if(out_valid_counter == NUM_OF_INTERMEDIATE_LAYERS-1) begin
            multiply_had_started <= 0;
            out_valid <= 1;
            out_valid_counter <= 0;
        end
        
        else if(in_valid && !multiply_had_started) begin
            multiply_had_started <= 1;
            out_valid <= 0; // for consecutive multiplication
        end
        
        else begin
            out_valid <= 0;
            if(multiply_had_started) out_valid_counter <= out_valid_counter + 1;
        end
    end
     
     
    `ifdef FORMAL
     
    initial assume(reset);
    initial assume(in_valid == 0);
     
    wire sign_bit = MULTIPLICAND_reg[LARGER_WIDTH-1] ^ MULTIPLIPLIER_reg[SMALLER_WIDTH-1];
     
    always @(posedge clk)
    begin
        if(reset) assert(out_C == 0);
        
        else if(out_valid) begin
            assert(out_C == (MULTIPLICAND_reg * MULTIPLIER_reg));
            assert(out_C[A_WIDTH+B_WIDTH-1] == sign_bit);
        end
    end
     
    `endif
     
    `ifdef FORMAL
     
    localparam user_A = 3;
    localparam user_B = -2;
     
    always @(posedge clk)
    begin
        cover(in_valid && (in_A == user_A) && (in_B == user_B));
        cover(out_valid);
    end
     
    `endif
     
    endmodule




--[[ ]]--