Home | 記事一覧 | フォーラム

独自CPUを自作する(メモリ操作の速いアーキテクチャ編) : SIMD対応版解説ページ

SIMD命令一覧

v8add
v8sub
v8mul
v8sr
v8sl
v8sra
v8mv
v8ceq
v8cgt
v8cgta

SIMD命令仕様

※以下、[element] は、ソースおよび格納先のレジスタ・メモリを8bitごとに区切った各要素を指し、演算がその要素それぞれに対して行われることを表す。[element] の付かないオペランド(シフト命令の B など)は要素に分割せず、値全体をそのまま使用する。


V8ADD

解説: Vector 8bit Add : 8bit SIMD 加算

アセンブリ: v8add(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)

機能: D[element] = A[element] + B[element];


V8SUB

解説: Vector 8bit Subtract : 8bit SIMD 減算

アセンブリ: v8sub(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)

機能: D[element] = A[element] - B[element];


V8MUL

解説: Vector 8bit Multiply : 8bit SIMD 乗算

アセンブリ: v8mul(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)

機能: D[element] = A[element] * B[element];


V8SR

解説: Vector 8bit Shift Right : 8bit SIMD 論理右シフト

アセンブリ: v8sr(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)

機能: D[element] = A[element] >> B;


V8SL

解説: Vector 8bit Shift Left : 8bit SIMD 論理左シフト

アセンブリ: v8sl(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)

機能: D[element] = A[element] << B;


V8SRA

解説: Vector 8bit Shift Right Arithmetic : 8bit SIMD 算術右シフト

アセンブリ: v8sra(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)

機能: D[element] = A[element] >>> B;


V8MV

解説: Vector 8bit Move : もし(R1[element] != 0)ならばD[element]にA[element]を代入

アセンブリ: v8mv(reg_d, reg_a, inc_d, inc_a, mem_d, mem_a)

機能: if (R1[element] != 0) {D[element] = A[element];}


V8CEQ

解説: Vector 8bit Compare Equal : もし(A[element] == B[element])ならばR1[element] = 0xff、それ以外ならR1[element] = 0

アセンブリ: v8ceq(reg_a, reg_b, inc_a, inc_b, mem_a, mem_b)

機能: if (A[element] == B[element]) {R1[element] = 0xff;} else {R1[element] = 0;}


V8CGT

解説: Vector 8bit Compare Greater Than : もし(A[element] > B[element])ならばR1[element] = 0xff、それ以外ならR1[element] = 0

アセンブリ: v8cgt(reg_a, reg_b, inc_a, inc_b, mem_a, mem_b)

機能: if (A[element] > B[element]) {R1[element] = 0xff;} else {R1[element] = 0;}


V8CGTA

解説: Vector 8bit Compare Greater Than Arithmetic : A[element]、B[element]をsignedとして扱い、もし(A[element] > B[element])ならばR1[element] = 0xff、それ以外ならR1[element] = 0

アセンブリ: v8cgta(reg_a, reg_b, inc_a, inc_b, mem_a, mem_b)

機能: if (A[element](signed) > B[element](signed)) {R1[element] = 0xff;} else {R1[element] = 0;}

ソースコード

SIMDパッチ済み sc1_cpu.v

/*
  Copyright (c) 2015-2016, miya
  All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/



// sc1_cpu: small pipelined CPU core with 8-bit-lane SIMD ("V8*") instructions.
//
// After reset the core copies the program from an external ROM
// (rom_addr / rom_data) into its internal instruction RAM, then sets cpu_en
// and starts executing from instruction address 0.
//
// Parameters:
//   WIDTH_I   - instruction word width
//   WIDTH_D   - data memory / port word width
//   WIDTH_REG - register width (also fixes the SIMD lane count, WIDTH_REG / 8)
//   DEPTH_I   - instruction address width
//   DEPTH_D   - data address width
//   DEPTH_REG - register index width
//
// Ports:
//   clk, reset - clock and synchronous, active-high reset
//   rom_addr   - program ROM address (driven only during the init phase)
//   rom_data   - program ROM data
//   port_in    - value returned by the I_IN instruction
//   port_out   - register written by the I_OUT instruction
//
// Instruction word fields as decoded below:
//   [6:0]   opcode (bit 6 set = "normal" ALU type, clear = "special" type)
//   [7]     operand B comes from data memory
//   [8]     operand A comes from data memory
//   [9]     destination is data memory
//   [10]    B address: accumulate (add register to current address)
//   [11]    A address: accumulate
//   [12]    D address: accumulate
//   [14 +: DEPTH_REG] register B index
//   [20 +: DEPTH_REG] register A index
//   [26 +: DEPTH_REG] register D index
//   [25:10] 16-bit immediate (overlaps the fields above; used by MVI/MVIH/BC/BL)
module sc1_cpu
  #(
    parameter WIDTH_I = 32,
    parameter WIDTH_D = 32,
    parameter WIDTH_REG = 32,
    parameter DEPTH_I = 8,
    parameter DEPTH_D = 8,
    parameter DEPTH_REG = 4
    )
  (
   input                    clk,
   input                    reset,
   output reg [DEPTH_I-1:0] rom_addr,
   input [WIDTH_I-1:0]      rom_data,
   input [WIDTH_D-1:0]      port_in,
   output reg [WIDTH_D-1:0] port_out
   );

  // number of 8-bit lanes processed by the V8* instructions
  localparam VEC8_WAY = (WIDTH_REG / 8);

  // special-purpose register indices
  // (MVI target and BA branch target share register 0)
  localparam SP_REG_MVI = 4'd0;
  localparam SP_REG_BA = 4'd0;
  localparam SP_REG_CP = 4'd1;           // compare result / move condition
  localparam SP_REG_LINK = 4'd2;         // return address written by BL
  localparam SP_REG_LOOP_COUNTER = 4'd3;
  localparam SP_REG_LOOP_END = 4'd4;
  localparam SP_REG_LOOP_SPAN = 4'd5;

  // opcode
  // special type
  localparam I_HALT = 7'h00;
  localparam I_NOP  = 7'h01;
  localparam I_MV   = 7'h02;
  localparam I_MVI  = 7'h03;
  localparam I_MVIH = 7'h04;
  localparam I_CEQ  = 7'h05;
  localparam I_CGT  = 7'h06;
  localparam I_CGTA = 7'h07;
  localparam I_BC   = 7'h08;
  localparam I_BL   = 7'h09;
  localparam I_BA   = 7'h0a;
  localparam I_LOOP = 7'h0b;
  localparam I_OUT  = 7'h0c;
  localparam I_V8MV  = 7'h0d;
  localparam I_V8CEQ  = 7'h0e;
  localparam I_V8CGT  = 7'h0f;
  localparam I_V8CGTA  = 7'h10;
  // normal type
  localparam I_ADD  = 7'h40;
  localparam I_SUB  = 7'h41;
  localparam I_AND  = 7'h42;
  localparam I_OR   = 7'h43;
  localparam I_XOR  = 7'h44;
  localparam I_NOT  = 7'h45;
  localparam I_SR   = 7'h46;
  localparam I_SL   = 7'h47;
  localparam I_SRA  = 7'h48;
  localparam I_MUL  = 7'h49;
  localparam I_IN   = 7'h4a;
  localparam I_V8ADD = 7'h4b;
  localparam I_V8SUB = 7'h4c;
  localparam I_V8MUL = 7'h4d;
  localparam I_V8SR  = 7'h4e;
  localparam I_V8SL  = 7'h4f;
  localparam I_V8SRA = 7'h50;

  localparam TRUE = 1'b1;
  localparam FALSE = 1'b0;
  localparam ONE = 1'd1;
  localparam ZERO = 1'd0;
  // all-ones data word; truncated to 8 bits when assigned to a single lane
  localparam FFFF = {WIDTH_D{1'b1}};

  // instruction RAM interface
  wire [WIDTH_I-1:0]        mem_i_o;
  reg [DEPTH_I-1:0]         mem_i_addr_r;   // fetch address (program counter)
  reg [DEPTH_I-1:0]         mem_i_addr_w;
  reg [WIDTH_I-1:0]         mem_i_i;
  reg                       mem_i_we;

  // data RAM interface (two RAM instances written identically, one read
  // port each for operand A and operand B)
  wire [WIDTH_D-1:0]        mem_d_o_a;
  wire [WIDTH_D-1:0]        mem_d_o_b;
  wire                      mem_d_we_sig;
  reg [WIDTH_D-1:0]         mem_d_i;
  reg [DEPTH_D-1:0]         mem_d_addr_w;
  reg [DEPTH_D-1:0]         mem_d_addr_w_d1;
  reg [DEPTH_D-1:0]         mem_d_addr_w_d2;
  reg [DEPTH_D-1:0]         mem_d_addr_r_a;
  reg [DEPTH_D-1:0]         mem_d_addr_r_b;
  reg                       mem_d_we;

  reg                       cpu_en;         // FALSE while the program is being loaded
  reg [DEPTH_I-1:0]         pc_d1;          // fetch address delayed by 1 cycle
  reg [DEPTH_I-1:0]         pc_d2;          // fetch address delayed by 2 cycles
  reg [10:0]                stage_init;     // init sequencer: 4 steps per program word
  reg [WIDTH_D-1:0]         loop_counter;
  reg [DEPTH_I-1:0]         loop_end;
  reg [DEPTH_I-1:0]         loop_span;

  // stage 1 decode signals (taken from the instruction RAM output)
  wire                      is_mem_d_s1;
  wire                      is_mem_a_s1;
  wire                      is_mem_b_s1;
  wire                      add_d_s1;
  wire                      add_a_s1;
  wire                      add_b_s1;
  wire [DEPTH_REG-1:0]      reg_d_addr_s1;
  wire [DEPTH_REG-1:0]      reg_a_addr_s1;
  wire [DEPTH_REG-1:0]      reg_b_addr_s1;

  // stage 2 decode signals (taken from the 1-cycle delayed instruction word)
  reg [WIDTH_I-1:0]         mem_i_o_d1;
  wire [6:0]                op;
  wire                      is_type_normal;
  wire                      not_increment;
  wire                      is_mem_d;
  wire                      is_mem_a;
  wire                      is_mem_b;
  wire [DEPTH_REG-1:0]      reg_d_addr;
  wire [DEPTH_REG-1:0]      reg_a_addr;
  wire [DEPTH_REG-1:0]      reg_b_addr;
  wire [15:0]               im16;
  wire signed [15:0]        ims16;

  wire [WIDTH_D-1:0]        source_a;
  wire [WIDTH_D-1:0]        source_b;

  // register file
  reg [WIDTH_REG-1:0]       reg_file [0:(1 << DEPTH_REG)-1];

  // decode(stage1)
  // Used only to update the data memory address registers one cycle before
  // the instruction reaches the execute logic.
  assign is_mem_d_s1 = mem_i_o[9];
  assign is_mem_a_s1 = mem_i_o[8];
  assign is_mem_b_s1 = mem_i_o[7];
  assign add_d_s1 = mem_i_o[12];
  assign add_a_s1 = mem_i_o[11];
  assign add_b_s1 = mem_i_o[10];
  assign reg_d_addr_s1 = mem_i_o[DEPTH_REG+26-1:26];
  assign reg_a_addr_s1 = mem_i_o[DEPTH_REG+20-1:20];
  assign reg_b_addr_s1 = mem_i_o[DEPTH_REG+14-1:14];

  // decode(stage2)
  assign op = mem_i_o_d1[6:0];
  assign is_type_normal = mem_i_o_d1[6];
  assign is_mem_d = mem_i_o_d1[9];
  assign is_mem_a = mem_i_o_d1[8];
  assign is_mem_b = mem_i_o_d1[7];
  assign reg_d_addr = mem_i_o_d1[DEPTH_REG+26-1:26];
  assign reg_a_addr = mem_i_o_d1[DEPTH_REG+20-1:20];
  assign reg_b_addr = mem_i_o_d1[DEPTH_REG+14-1:14];
  assign im16 = mem_i_o_d1[25:10];
  assign ims16 = mem_i_o_d1[25:10];

  // manual pc increment
  // These opcodes set the fetch address themselves in the execute case below.
  assign not_increment = ((op == I_HALT) || (op == I_BC) || (op == I_BL) || (op == I_BA)) ? 1'b1 : 1'b0;

  // switch source
  assign source_a = is_mem_a ? mem_d_o_a : reg_file[reg_a_addr];
  assign source_b = is_mem_b ? mem_d_o_b : reg_file[reg_b_addr];

  // switch operation
  // Combinational ALU for the "normal" instruction type.
  //   op_result - 7-bit opcode selecting the operation
  //   returns   - WIDTH_D-bit result computed from source_a / source_b / port_in
  // The V8* cases operate independently on each of the VEC8_WAY 8-bit lanes;
  // the V8 shifts use the whole of source_b as the shift amount for every lane.
  function [WIDTH_D-1:0] result
    (
     input [6:0] op_result
     );
    integer      i;
    begin
      // Default value: without it an undecoded opcode (and any bits above
      // VEC8_WAY*8 in the vector cases) would leave the return value
      // unassigned, i.e. X in simulation.
      result = {WIDTH_D{1'b0}};
      case (op_result)
        I_ADD:   result = source_a + source_b;
        I_SUB:   result = source_a - source_b;
        I_AND:   result = source_a & source_b;
        I_OR:    result = source_a | source_b;
        I_XOR:   result = source_a ^ source_b;
        I_NOT:   result = ~source_a;
        I_SR:    result = source_a >> source_b;
        I_SL:    result = source_a << source_b;
        I_SRA:   result = $signed(source_a) >>> source_b;
        I_MUL:   result = $signed(source_a) * $signed(source_b);
        I_IN:    result = port_in;
        I_V8ADD:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] + source_b[i*8 +: 8];
              end
          end
        I_V8SUB:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] - source_b[i*8 +: 8];
              end
          end
        I_V8MUL:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = $signed(source_a[i*8 +: 8]) * $signed(source_b[i*8 +: 8]);
              end
          end
        I_V8SR:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] >> source_b;
              end
          end
        I_V8SL:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] << source_b;
              end
          end
        I_V8SRA:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = $signed(source_a[i*8 +: 8]) >>> source_b;
              end
          end
        default:;
      endcase
    end
  endfunction

  // mem_d_we condition
  // A data memory write is requested by every normal-type instruction with a
  // memory destination, and by I_MV only when its move condition holds.
  // The I_MV term is gated on the same condition that loads mem_d_i in the
  // execute stage; otherwise a not-taken conditional move would still write
  // the stale contents of mem_d_i to memory.
  // NOTE(review): I_V8MV with a memory destination never asserts the write
  // enable, so it has no effect on memory. A lane-selective write would need
  // per-byte write enables, which rw_port_ram as instantiated here (single
  // `we`) does not provide - confirm the intended behavior.
  assign mem_d_we_sig = is_mem_d & (is_type_normal | ((op == I_MV) & (reg_file[SP_REG_CP] != ZERO)));

  always @(posedge clk)
    begin
      if (reset == TRUE)
        begin
          stage_init <= ZERO;
          cpu_en <= FALSE;
          mem_i_addr_r <= ZERO;
          mem_i_addr_w <= ZERO;
          mem_i_we <= FALSE;
          port_out <= ZERO;
          loop_counter <= ZERO;
          loop_end <= ZERO;
          loop_span <= ZERO;
        end
      else if (cpu_en == FALSE)
        // init
        // Copies 256 words from the ROM into instruction RAM, four steps per
        // word. The limit 11'h400 and the slice [9:2] are hard-coded for an
        // 8-bit instruction address (the default DEPTH_I).
        begin
          if (stage_init < 11'h400)
            begin
              case (stage_init[1:0])
                // load program from ROM
                2'd0:
                  begin
                    // present the word address to the ROM
                    rom_addr <= stage_init[9:2];
                  end
                2'd1:
                  begin
                    // wait one cycle for rom_data
                  end
                2'd2:
                  begin
                    // capture the ROM word and start the RAM write
                    mem_i_addr_w <= stage_init[9:2];
                    mem_i_i <= rom_data;
                    mem_i_we <= TRUE;
                  end
                2'd3:
                  begin
                    mem_i_we <= FALSE;
                  end
                default: ;
              endcase
              stage_init <= stage_init + ONE;
            end
          else
            begin
              cpu_en <= TRUE;
            end
        end
      else
        // cpu enable
        begin
          // increment mem_d address automatically
          // For each operand flagged as a memory operand, the address register
          // is either loaded from the named register or, when the matching
          // add flag is set, advanced by that register's value.
          if (is_mem_d_s1)
            begin
              if (add_d_s1)
                begin
                  mem_d_addr_w <= mem_d_addr_w + reg_file[reg_d_addr_s1][DEPTH_D-1:0];
                end
              else
                begin
                  mem_d_addr_w <= reg_file[reg_d_addr_s1][DEPTH_D-1:0];
                end
            end
          if (is_mem_a_s1)
            begin
              if (add_a_s1)
                begin
                  mem_d_addr_r_a <= mem_d_addr_r_a + reg_file[reg_a_addr_s1][DEPTH_D-1:0];
                end
              else
                begin
                  mem_d_addr_r_a <= reg_file[reg_a_addr_s1][DEPTH_D-1:0];
                end
            end
          if (is_mem_b_s1)
            begin
              if (add_b_s1)
                begin
                  mem_d_addr_r_b <= mem_d_addr_r_b + reg_file[reg_b_addr_s1][DEPTH_D-1:0];
                end
              else
                begin
                  mem_d_addr_r_b <= reg_file[reg_b_addr_s1][DEPTH_D-1:0];
                end
            end

          // delay
          // pipeline registers: instruction word, fetch address history,
          // write enable and write address
          mem_i_o_d1 <= mem_i_o;
          pc_d2 <= pc_d1;
          pc_d1 <= mem_i_addr_r;
          mem_d_we <= mem_d_we_sig;
          mem_d_addr_w_d1 <= mem_d_addr_w;
          mem_d_addr_w_d2 <= mem_d_addr_w_d1;

          // loop counter
          // Count down each time the fetch address reaches loop_end, except
          // while the I_LOOP instruction itself is executing (it reloads the
          // counter in the execute case below).
          if (loop_end == mem_i_addr_r)
            begin
              if ((loop_counter != ZERO) && (op != I_LOOP))
                begin
                  loop_counter <= loop_counter - ONE;
                end
            end

          // increment pc (prefetch address)
          // At loop_end with iterations remaining the fetch address advances
          // by loop_span instead of 1.
          if (!not_increment)
            begin
              if (loop_end == mem_i_addr_r)
                begin
                  if (loop_counter == ZERO)
                    begin
                      mem_i_addr_r <= mem_i_addr_r + ONE;
                    end
                  else
                    begin
                      mem_i_addr_r <= mem_i_addr_r + loop_span;
                    end
                end
              else
                begin
                  mem_i_addr_r <= mem_i_addr_r + ONE;
                end
            end

          // execution
          if (is_type_normal)
            begin
              // for normal instructions
              // result goes to the memory write data register or to the
              // destination register
              if (is_mem_d)
                begin
                  mem_d_i <= result(op);
                end
              else
                begin
                  reg_file[reg_d_addr] <= result(op);
                end
            end
          else
            begin
              // special instructions
              case (op)
                I_HALT:
                  begin
                    // hold the fetch address at the 2-cycle delayed value
                    mem_i_addr_r <= pc_d2;
                  end
                I_NOP:
                  begin
                  end
                I_MV:
                  begin
                    // conditional move: only when the compare register is non-zero
                    if (reg_file[SP_REG_CP] != ZERO)
                      begin
                        if (is_mem_d)
                          begin
                            mem_d_i <= source_a;
                          end
                        else
                          begin
                            reg_file[reg_d_addr] <= source_a;
                          end
                      end
                  end
                I_MVI:
                  begin
                    // load the 16-bit immediate (zero-extended) into register 0
                    reg_file[SP_REG_MVI] <= im16;
                  end
                I_MVIH:
                  begin
                    // place the immediate above the existing low 16 bits of register 0
                    if (WIDTH_REG >= 16)
                      begin
                        reg_file[SP_REG_MVI] <= {im16, reg_file[SP_REG_MVI][15:0]};
                      end
                  end
                I_CEQ:
                  begin
                    if (source_a == source_b)
                      begin
                        reg_file[SP_REG_CP] <= FFFF;
                      end
                    else
                      begin
                        reg_file[SP_REG_CP] <= ZERO;
                      end
                  end
                I_CGT:
                  begin
                    // unsigned compare
                    if (source_a > source_b)
                      begin
                        reg_file[SP_REG_CP] <= FFFF;
                      end
                    else
                      begin
                        reg_file[SP_REG_CP] <= ZERO;
                      end
                  end
                I_CGTA:
                  begin
                    // signed compare
                    if ($signed(source_a) > $signed(source_b))
                      begin
                        reg_file[SP_REG_CP] <= FFFF;
                      end
                    else
                      begin
                        reg_file[SP_REG_CP] <= ZERO;
                      end
                  end
                I_BC:
                  begin
                    // conditional relative branch; the automatic increment is
                    // suppressed for this opcode, so the not-taken path
                    // advances the fetch address here
                    if (reg_file[SP_REG_CP] == ZERO)
                      begin
                        mem_i_addr_r <= mem_i_addr_r + ONE;
                      end
                    else
                      begin
                        mem_i_addr_r <= pc_d2 + ims16;
                      end
                  end
                I_BL:
                  begin
                    // relative branch, saving pc_d2 + 1 in the link register
                    reg_file[SP_REG_LINK] <= pc_d2 + ONE;
                    mem_i_addr_r <= pc_d2 + ims16;
                  end
                I_BA:
                  begin
                    // absolute branch to the address held in register 0
                    mem_i_addr_r <= reg_file[SP_REG_BA];
                  end
                I_LOOP:
                  begin
                    // set up the hardware loop from the special registers;
                    // loop_end is relative to pc_d2
                    loop_counter <= reg_file[SP_REG_LOOP_COUNTER];
                    loop_end <= pc_d2 + reg_file[SP_REG_LOOP_END][DEPTH_I-1:0];
                    loop_span <= reg_file[SP_REG_LOOP_SPAN][DEPTH_I-1:0];
                  end
                I_OUT:
                  begin
                    port_out <= source_a;
                  end
                I_V8MV:
                  begin : v8mv
                    // per-lane conditional move: a lane is copied only when the
                    // matching lane of the compare register is non-zero
                    // NOTE(review): with a memory destination only mem_d_i is
                    // updated here; mem_d_we_sig is not asserted for this
                    // opcode, so nothing is written to data memory.
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if (reg_file[SP_REG_CP][i*8 +: 8] != ZERO)
                          begin
                            if (is_mem_d)
                              begin
                                mem_d_i[i*8 +: 8] <= source_a[i*8 +: 8];
                              end
                            else
                              begin
                                reg_file[reg_d_addr][i*8 +: 8] <= source_a[i*8 +: 8];
                              end
                          end
                      end
                  end
                I_V8CEQ:
                  begin : v8ceq
                    // per-lane equality; each lane of the compare register
                    // becomes 0xff or 0x00
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if (source_a[i*8 +: 8] == source_b[i*8 +: 8])
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= FFFF;
                          end
                        else
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= ZERO;
                          end
                      end
                  end
                I_V8CGT:
                  begin : v8cgt
                    // per-lane unsigned greater-than
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if (source_a[i*8 +: 8] > source_b[i*8 +: 8])
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= FFFF;
                          end
                        else
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= ZERO;
                          end
                      end
                  end
                I_V8CGTA:
                  begin : v8cgta
                    // per-lane signed greater-than
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if ($signed(source_a[i*8 +: 8]) > $signed(source_b[i*8 +: 8]))
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= FFFF;
                          end
                        else
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= ZERO;
                          end
                      end
                  end
                default: ;
              endcase
            end
        end
    end



  // instruction RAM: written during init, read through mem_i_addr_r
  // (rw_port_ram is defined outside this file)
  rw_port_ram
    #(
      .DATA_WIDTH (WIDTH_I),
      .ADDR_WIDTH (DEPTH_I)
      )
  mem_i
    (
     .clk (clk),
     .addr_r (mem_i_addr_r),
     .addr_w (mem_i_addr_w),
     .data_in (mem_i_i),
     .we (mem_i_we),
     .data_out (mem_i_o)
     );

  // data RAM copy read by operand A; receives the same writes as mem_d_b
  rw_port_ram
    #(
      .DATA_WIDTH (WIDTH_D),
      .ADDR_WIDTH (DEPTH_D)
      )
  mem_d_a
    (
     .clk (clk),
     .addr_r (mem_d_addr_r_a),
     .addr_w (mem_d_addr_w_d2),
     .data_in (mem_d_i),
     .we (mem_d_we),
     .data_out (mem_d_o_a)
     );

  // data RAM copy read by operand B; receives the same writes as mem_d_a
  rw_port_ram
    #(
      .DATA_WIDTH (WIDTH_D),
      .ADDR_WIDTH (DEPTH_D)
      )
  mem_d_b
    (
     .clk (clk),
     .addr_r (mem_d_addr_r_b),
     .addr_w (mem_d_addr_w_d2),
     .data_in (mem_d_i),
     .we (mem_d_we),
     .data_out (mem_d_o_b)
     );

endmodule

関連記事

独自CPUを自作する(メモリ操作の速いアーキテクチャ編):通常版解説ページ