Verilog Coding Tips and Tricks: 2020

Tuesday, December 15, 2020

Synthesizable Clocked Square Root Calculator In Verilog

    Long back I had shared a Verilog module for finding the square root of a number. This function too was synthesisable, but as it was implemented with a conventional 'for' loop, it was purely combinatorial. If you want to find the square root of a relatively larger number, then the resource usage was very high.

    In such cases, it makes sense to use a clocked design. Such a clocked design enables us to reuse one set of resources over and over. The advantage of such a design is that it uses far less resources while the disadvantage being the low speed. 

    For example, in the design I have shared in this post, to find the square root of a N-bit number, you need to wait N/2 clock cycles. 

    The code is written based on Figure (8) from this paper: A New Non-Restoring Square Root Algorithm and Its VLSI Implementations.

    The codes are well commented, so I wont write much about how it works here. Please refer to the block diagram from the paper in case you have some doubts.

Let me share the codes now:

square_root.v:


//Synthesisable Design for Finding Square root of a number.
module square_root
    #(parameter N = 32)
    (   input Clock,  //Clock
        input reset,  //Asynchronous active high reset.      
        input [N-1:0] num_in,   //this is the number for which we want to find square root.
        output reg done,     //This signal goes high when output is ready
        output reg [N/2-1:0] sq_root  //square root of 'num_in'
    );

    reg [N-1:0] a;   //original input.
    reg [N/2+1:0] left,right;     //input to adder/sub.r-remainder.
    reg signed [N/2+1:0] r;
    reg [N/2-1:0] q;    //result.
    integer i;   //index of the loop. 

    always @(posedge Clock or posedge reset) 
    begin
        if (reset == 1begin   //reset the variables.
            done <= 0;
            sq_root <= 0;
            i = 0;
            a = 0;
            left = 0;
            right = 0;
            r = 0;
            q = 0;
        end    
        else begin
            //Before we start the first clock cycle get the 'input' to the variable 'a'.
            if(i == 0begin  
                a = num_in;
                done <= 0;    //reset 'done' signal.
                i = i+1;   //increment the loop index.
            end
            else if(i < N/2begin //keep incrementing the loop index.
                i = i+1;  
            end
            //These statements below are derived from the block diagram.
            right = {q,r[N/2+1],1'b1};
            left = {r[N/2-1:0], a[N-1:N-2]};
            a = {a[N-3:0], 2'b0};  //shifting left by 2 bit.
            if ( r[N/2+1] == 1)    //add or subtract as per this bit.
                r = left + right;
            else
                r = left - right;
            q = {q[N/2-2:0], ~r[N/2+1]};
            if(i == N/2begin    //This means the max value of loop index has reached. 
                done <= 1;    //make 'done' high because output is ready.
                i = 0//reset loop index for beginning the next cycle.
                sq_root <= q;   //assign 'q' to the output port.
                //reset other signals for using in the next cycle.
                left = 0;
                right = 0;
                r = 0;
                q = 0;
            end
        end    
    end

endmodule


Testbench: tb.v


//Testbench for out square root calculator design.
module tb;  //testbench module is always empty. No input or output ports.

reg Clock, reset;
wire done;
parameter N = 16;   //width of the input.
reg [N-1:0] num_in;
reg [N:0] i;
wire [N/2-1:0] sq_root;
integer error,actual_result;  //this indicates the number of errors encountered during simulation.
parameter Clock_period = 10;    //Change clock period here. 

//Apply the inputs to the design and check if the results are correct. 
//The number of inputs for which the results were wrongly calculated are counted by 'error'. 
initial
begin
    Clock = 1;
    error = 0;
    i=1;
    //First we apply reset input for one clock period.
    reset = 1;
    #Clock_period;
    reset = 0;
    //Test the design for all the combination of inputs.
    //Since we have (2^16)-1 inputs, we test all of them one by one. 
    while(i<=2**N-1begin
        apply_input(i);
        i = i+1;    
    end
    #Clock_period;
    reset = 1;   //all inputs are tested. Apply reset
    num_in = 0;     //reset the 'num_in'
    $stop;  //Stop the simulation, as we have finished testing the design.
end

task apply_input;
    input [N:0] i;
begin
    num_in = i[N-1:0];  
    wait(~done);    //wait for the 'done' to finish its previous high state
    wait(done); //wait until 'done' output goes High.
    wait(~Clock);   //we sample the output at the falling edge of the clock.
    actual_result = $rtoi($floor($pow(i,0.5))); //Calculate the actual result.
    //if actual result and calculated result are different increment 'error' by 1.
    if(actual_result != sq_root) 
        error = error + 1
end
endtask

//generate a 50Mhz clock for testing the design.
always #(Clock_period/2) Clock <= ~Clock;

//Instantiate the matrix multiplier
square_root #(.N(N)) find_sq_root 
        (.Clock(Clock), 
        .reset(reset), 
        .num_in(num_in), 
        .done(done),
        .sq_root(sq_root)
        );

endmodule   //End of testbench.


Simulation Waveform from ModelSim:



        To reach the end of the testbench, you need to simulate only for 5.5 msec of simulation time. The simulation will automatically stop once all the input combinations are tested. 


Monday, December 14, 2020

Synthesizable Polynomial Equation Calculator in Verilog

      A polynomial equation is an equation which is formed with variables, coefficients and exponents. They can be written in the following format:

                        y =anxn + an-1xn-1 + ... +a1x + a0 .

Here an, an-1  ... , a1 ,a0  are coefficients,

n, n-1, ... etc are exponents and x is the variable. 

In this post, I want to share, Verilog code for computing the value of 'y' if you know the value of 'x' and coefficients forming the polynomial. 

Directly calculating the polynomial with the above equation is very inefficient. So, first we reformat the equation as per Horner's rule:

                        y =(((anx + an-1)x + an-2)x + an-3)x + ... )x +a0 .

This modified equation can be represented by the following block diagram:





There are 3 sub-modules in the above block diagram.

1) Block number 1:

    This block keep generating successive powers of input x. 

In the first clock cycle it generates x^0 = 1.

In the 2ndclock cycle it generates x^1.

In the 3rd clock cycle it generates x^2.

In the nth clock cycle it generates x^(n-1).

    Every clock cycle, the previous outputs are multiplied with 'x' to get the next power of x.

2)Block number 2:

    The powers of x generated in the first block are multiplied with the corresponding coefficients of the polynomial in Block numbered 2. The coefficients are declared and initialized in the top level entity and can be changed easily.

3)Block number 4:

    This block is basically an accumulative adder which accumulates all the products generated by the multiplier. Once it does 'n' additions, we have the final result available in output 'y'. An output 'done' is set as high to indicate that the output is ready.


Let me share the Verilog codes for these modules along with the testbench.

1) power_module.v


//This module generates successive powers of input x in each clock cycle.
//For example 1,x,x^2,x^3,x^4 etc. 
//The output from the previous cycle is multiplied again by x to get the next power.
//The output from this entity is passed to the multiplier module, where it gets
//multiplied by the corresponding coefficient.
module power_module
    #(parameter N = 32)
    (   input Clock,
        input reset,
        input signed [N-1:0x,
        output signed [N-1:0] x_powered
    );

    reg signed [N-1:0] x_pow;
    reg signed [2*N-1:0] temp_prod;

    always @(posedge Clock or posedge reset ) 
    begin
        if(reset == 1)  //asynchronous active high reset.
            x_pow = 1;
        else begin
            temp_prod = x*x_pow;
            //The MSB half of the result is ignored. We assume that all intermediate powers of x
            //can be represented by a max of N bits.
            x_pow = temp_prod[N-1:0];
        end    
    end

    assign x_powered = x_pow;

endmodule


2) multiplier.v


//The successive powers of x are multiplied by the corresponding coeffcients here. 
//The results are sent to the adder module which acts as an "accumulator".
module multiplier
    #(parameter N = 32)
    (   input Clock,
        input signed [N-1:0x,y,
        output reg signed [N-1:0] prod
    );

    reg signed [2*N-1:0] temp_prod;

    always @(posedge Clock) begin
        if(Clock == 1begin
            temp_prod = x*y;
            //The MSB half of the result is ignored. We assume that all intermediate numbers
            //are represented by a max of N bits.
            prod = temp_prod[N-1:0];
        end
    end

endmodule


3) adder.v


//The output from multiplier module is received by this module and accumulated to
//form the output. If the polynomial equation has a degree of 'n' then 'n' additions has to take place
//to get the final result.
module adder
    #(parameter N = 32)
    (   input Clock,reset,
        input signed [N-1:0x,
        output signed [N-1:0] sum
    );

    reg signed [N-1:0] temp_sum;

    always @(posedge Clock or posedge reset ) 
    begin
        if(reset == 1)  //asynchronous active high reset.
            temp_sum = 0;
        else begin
            temp_sum = temp_sum + x;
        end    
    end  

    assign sum = temp_sum;

endmodule


4) Top Level Entity : poly_eq_calc.v


//This is the top level module for the polynomial equation calculator.
//The power_module, multiplier and adder modules are instantiated inside this top level block.
//When the output signal done is High, the output available at the 'y' output is the result we want.
//Ignore all other values of 'y' when done is Low.
//We have assumed that all the intermediate numbers calculated to reach the final result, can be represented
//by a maximum of N bits. This simplifies the design very much.
module poly_eq_calc
    #(parameter N = 32)
    (   input Clock,
        input reset,
        input signed [N-1:0x,
        output signed [N-1:0] y,
        output reg done
    );

    wire signed [N-1:0] x_powered,product_from_mult;
    reg signed [N-1:0] coeff;
    reg reset_adder;
    integer coeff_index;
    parameter  NUM_COEFFS = 4//Change here to change the degree of the polynomial. 
    reg signed [N-1:0] Coeffs [0:NUM_COEFFS-1];
    
    //Eq : 3*x^3 - 2*x^2 - 4*x + 5;
    //The coefficients belonging to higher powers of x are stored in higher addresses in Coeffs array.
    initial begin
        Coeffs[0] = 5;
        Coeffs[1] = -4;
        Coeffs[2] = -2;
        Coeffs[3] = 3;
    end 

    //Instantiate the 3 sub-modules.    
    power_module #(.N(N)) calc_power_of_x
        (Clock,reset,x,x_powered);

    multiplier #(.N(N)) multiply 
        (Clock,x_powered,coeff,product_from_mult);

    adder #(.N(N)) add  
        (Clock,reset_adder,product_from_mult,y);

    //The Always block controlling the 3 sub-modules and also supplying them with coefficients.   
    always @(posedge Clock or posedge reset ) 
    begin
        if(reset == 1begin    //asynchronous active high reset.
            done = 0;
            coeff_index = 0;
            coeff = Coeffs[0];
            reset_adder = 1;
        end
        else begin
            //the disabling of 'reset of adder' gets noticed by adder entity in the next clock cycle.
            reset_adder = 0
            if(coeff_index < NUM_COEFFS-1begin
                coeff_index = coeff_index+1;
                coeff = Coeffs[coeff_index];  //send the coefficients one by one to the multiplier module.
            end 
            else if(coeff_index == NUM_COEFFS-1
                coeff_index = coeff_index+1;
            else
                done = 1;    //The final result is available in 'y' now.
        end    
    end

endmodule


5) Testbench : tb_poly_eq_calc.v


//Testbench code.
module tb_poly_eq_calc;  //testbench module is always empty. No input or output ports.

    parameter Clock_period = 10;    //Change clock period here. 
    parameter N = 32;    //change the width of input and output here.
    reg Clock;
    reg reset;
    reg [N-1:0x;
    wire [N-1:0] y;
    wire done;
    integer input_num;

    //Apply inputs here. Only 'x' can be changed here.
    //To change coefficients and degree of polynomial edit the top level module directly.
    initial
    begin
        Clock = 0;
        input_num = 0;
        reset = 1;
        #(Clock_period * 2.5);
        reset = 0;

        apply_input(4);     //first input.
        apply_input(7);     //second input.
        apply_input(15);     //third input.
        apply_input(-9);     //fourth input.
        #Clock_period;          //wait for one clock cycle.
        $stop;  //Stop the simulation, as we have finished testing the design.
    end

    //the verilog task for applying one input.
    task apply_input;
        input [N-1:0] x_in;
    begin
        reset = 1;
        #Clock_period;
        reset = 0;
        x = x_in;
        #Clock_period;
        wait(done);     //wait for result to be out.
        check_result(x_in);     //verify the result.
        #Clock_period;
    end
    endtask

    //Verilog task to check if the results are correct and report.
    task check_result;
        input [N-1:0] x_in;
        integer actual_res;
    begin
        wait(Clock);
        if(done == 1begin
            //Change this equation if you change the polynomial eq inside the top level module.
            actual_res = 3*x_in*x_in*x_in - 2*x_in*x_in - 4*x_in + 5;  
            input_num = input_num+1;
            if(actual_res == y) 
                $display("Input number %d Worked Well",input_num);
            else
                $display("Input number %d Has Error",input_num);                 
        end
    end
    endtask

    //generate a 50Mhz clock for testing the design.
    always #(Clock_period/2Clock <= ~Clock;

    //Instantiate the polynomial calculator.
    poly_eq_calc #(.N(N)) poly_calculator 
            (.Clock(Clock), 
            .reset(reset), 
            .x(x), 
            .y(y),
            .done(done));


endmodule   //End of testbench.


The code was simulated successfully using Modelsim 10.4a version. The simulation waveform below shows the signals for the first set of input:







Sunday, December 13, 2020

Generic Verilog Code for Binary to Gray and Gray to Binary converter

     Few years back I had written a 4 bit converter for conversion between Gray and Binary codes. After receiving much positive response I decided to write a generic version of the same.

Let me share the codes...

Binary to Gray Code Converter:



//Binary to Gray code Converter
//The 'parameter' keyword below is how we give the inputs/outputs a generic size.
module bin2gray #(parameter N = 4)
        ( input [N-1:0] bin,    //binary input
        output [N-1:0] G);      //Gray output

assign G[N-1] = bin[N-1];

//generate xor gates.
//the loop index need to be declared as 'genvar' and it can be done
//as you can see inside the 'for' loop.
//The instantiation is labelled as 'xor_gates_b2g'. 
//Always put a label when you generate instantiations.
//The 'generate' keyword need not be explicitly written.
for(genvar i=N-2;i>=0;i=i-1begin : xor_gates_b2g
    xor(G[i],bin[i+1],bin[i]);
end

endmodule


Gray Code to Binary Converter:



//Gray code to Binary Converter
module gray2bin #(parameter N = 4)
        ( input [N-1:0] G,    //Gray input
        output [N-1:0] bin);      //Binary output

assign bin[N-1] = G[N-1];

for(genvar i=N-2;i>=0;i=i-1begin : xor_gates_g2b
    xor(bin[i],G[i],bin[i+1]);
end

endmodule


Testbench:



//Testbench which connects both the converters back to back.
module tb;  //testbench module is always empty.

parameter N = 16;   //Change this to control the number of bits in the input/output.
reg [N-1:0] bin;
wire [N-1:0] G,bin_out;
reg [N:0] i;
integer error;  //this counts the number of errors during simulation.

    //Both the converters are connected back to back to see the binary input going to the
    //first module is the same as the output coming out of the second module.
    bin2gray #(.N(N)) uut1
        (
          .bin(bin),
          .G(G)
        );
 
    gray2bin #(.N(N)) uut2
        (
          .G(G),  
          .bin(bin_out)
        );
          
    initial 
    begin
        error = 0;  //initialize the error as zero.
        for(i=0;i<2**N;i=i+1begin     //loop through all the  available inputs 
            bin = i[N-1:0];
            #5;
            //Count the number of errors.It should be zero at the end of simulation.
            if(bin != bin_out)  
                error = error + 1;
            #5;
        end
        #10;
        $stop;  //All possible inputs are tested. So stop the simulation.
    end          

endmodule


    The codes were tested using Modelsim 10.4a version. Simply change the value of the parameter 'N' in the testbench to test for different sized converters.

A screenshot of the simulation waveform is shown below:





Saturday, December 12, 2020

Synthesizable Matrix Multiplication in Verilog

     Long back I had posted a simple matrix multiplier which works well in simulation but couldn't be synthesized. But many people had requested for a synthesizable version of this code. So here we go.

    The design takes two matrices of 3 by 3 and outputs a matrix of 3 by 3. Each element is stored as 8 bits. This is not a generic multiplier, but if you understand the code well, you can easily extend it for different sized matrices. 

    Each matrix has 9 elements, each of which is 8 bits in size. So I am passing the matrix as a 72 bit 1-Dimensional array in the design. The following table shows how the 2-D elements are mapped into the 1-D array.

Row

Column

Bit’s Position in 1-D array

0

0

7:0

0

1

15:8

0

2

23:16

1

0

31:24

1

1

39:32

1

2

47:40

2

0

55:48

2

1

63:56

2

2

71:64


Let me share the codes now...

matrix_mult.v:


//3 by 3 matrix multiplier. Each element of the matrix is 8 bit wide. 
//Inputs are named A and B and output is named as C. 
//Each matrix has 9 elements each of which is 8 bit wide. So the inputs is 9*8=72 bit long.
module matrix_mult
    (   input Clock,
        input reset, //active high reset
        input Enable,    //This should be High throughout the matrix multiplication process.
        input [71:0] A,
        input [71:0] B,
        output reg [71:0] C,
        output reg done     //A High indicates that multiplication is done and result is availble at C.
    );   

//temperory registers. 
reg signed [7:0] matA [2:0][2:0];
reg signed [7:0] matB [2:0][2:0];
reg signed [7:0] matC [2:0][2:0];
integer i,j,k;  //loop indices
reg first_cycle;    //indicates its the first clock cycle after Enable went High.
reg end_of_mult;    //indicates multiplication has ended.
reg signed [15:0] temp; //a temeporary register to hold the product of two elements.

//Matrix multiplication.
always @(posedge Clock or posedge reset)    
begin
    if(reset == 1begin    //Active high reset
        i = 0;
        j = 0;
        k = 0;
        temp = 0;
        first_cycle = 1;
        end_of_mult = 0;
        done = 0;
        //Initialize all the matrix register elements to zero.
        for(i=0;i<=2;i=i+1begin
            for(j=0;j<=2;j=j+1begin
                matA[i][j] = 8'd0;
                matB[i][j] = 8'd0;
                matC[i][j] = 8'd0;
            end 
        end 
    end
    else begin  //for the positve edge of Clock.
        if(Enable == 1)     //Any action happens only when Enable is High.
            if(first_cycle == 1begin     //the very first cycle after Enable is high.
                //the matrices which are in a 1-D array are converted to 2-D matrices first.
                for(i=0;i<=2;i=i+1begin
                    for(j=0;j<=2;j=j+1begin
                        matA[i][j] = A[(i*3+j)*8+:8];
                        matB[i][j] = B[(i*3+j)*8+:8];
                        matC[i][j] = 8'd0;
                    end 
                end
                //re-initalize registers before the start of multiplication.
                first_cycle = 0;
                end_of_mult = 0;
                temp = 0;
                i = 0;
                j = 0;
                k = 0;
            end
            else if(end_of_mult == 0begin     //multiplication hasnt ended. Keep multiplying.
                //Actual matrix multiplication starts from now on.
                temp = matA[i][k]*matB[k][j];
                matC[i][j] = matC[i][j] + temp[7:0];    //Lower half of the product is accumulatively added to form the result.
                if(k == 2begin
                    k = 0;
                    if(j == 2begin
                        j = 0;
                        if (i == 2begin
                            i = 0;
                            end_of_mult = 1;
                        end
                        else
                            i = i + 1;
                    end
                    else
                        j = j+1;    
                end
                else
                    k = k+1;
            end
            else if(end_of_mult == 1begin     //End of multiplication has reached
                //convert 3 by 3 matrix into a 1-D matrix.
                for(i=0;i<=2;i=i+1begin   //run through the rows
                    for(j=0;j<=2;j=j+1begin    //run through the columns
                        C[(i*3+j)*8+:8] = matC[i][j];
                    end
                end   
                done = 1;   //Set this output High, to say that C has the final result.
            end
    end
end
 
endmodule


tb_matrix_mult.v:


//Testbench for testing the 3 by 3 matrix multiplier.
module tb_matrix_mult;  //testbench module is always empty. No input or output ports.

reg [71:0] A;
reg [71:0] B;
wire [71:0] C;
reg Clock,reset, Enable;
wire done;
reg [7:0] matC [2:0][2:0];
integer i,j;
parameter Clock_period = 10;    //Change clock period here. 

initial
begin
    Clock = 1;
    reset = 1;
    #100;   //Apply reset for 100 ns before applying inputs.
    reset = 0;
    #Clock_period;
    //input matrices are set and Enable input is set High
    A = {8'd9,8'd8,8'd7,8'd6,8'd5,8'd4,8'd3,8'd2,8'd1};
    B = {8'd1,8'd9,8'd8,8'd7,8'd6,8'd5,8'd4,8'd3,8'd2};
    Enable = 1;
    wait(done); //wait until 'done' output goes High.
    //The result C should be (93,150,126,57,96,81,21,42,36)
    #(Clock_period/2);  //wait for half a clock cycle.
    //convert the 1-D matrix into 2-D format to easily verify the results.
    for(i=0;i<=2;i=i+1begin
        for(j=0;j<=2;j=j+1begin
            matC[i][j] = C[(i*3+j)*8+:8];
        end
    end
    #Clock_period;  //wait for one clock cycle.
    Enable = 0//reset Enable.
    #Clock_period;
    $stop;  //Stop the simulation, as we have finished testing the design.
end

//generate a 50Mhz clock for testing the design.
always #(Clock_period/2) Clock <= ~Clock;

//Instantiate the matrix multiplier
matrix_mult matrix_multiplier 
        (.Clock(Clock), 
        .reset(reset), 
        .Enable(Enable), 
        .A(A),
        .B(B), 
        .C(C),
        .done(done));


endmodule   //End of testbench.


Simulation Results:

    The design was simulated successfully using Modelsim SE 10.4a version. Screenshot of the simulation waveform is shown below:



    Please let me know if you are unable to get the code to work or if its not synthesisable. Good luck with your projects.