SIMD命令一覧
v8add
v8sub
v8mul
v8sr
v8sl
v8sra
v8mv
v8ceq
v8cgt
v8cgta
SIMD命令仕様
※以下、[element] はソース、格納先のレジスタ、メモリを8bit区切りにした要素それぞれに対しての演算を表す。[element] の付かないオペランド(シフト命令のシフト量 B など)は、8bit区切りにせず値全体をそのまま使用する。
V8ADD
解説: Vector 8bit Add : 8bit SIMD 加算
アセンブリ: v8add(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)
機能: D[element] = A[element] + B[element];
V8SUB
解説: Vector 8bit Subtract : 8bit SIMD 減算
アセンブリ: v8sub(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)
機能: D[element] = A[element] - B[element];
V8MUL
解説: Vector 8bit Multiply : 8bit SIMD 乗算
アセンブリ: v8mul(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)
機能: D[element] = A[element] * B[element];
V8SR
解説: Vector 8bit Shift Right : 8bit SIMD 論理右シフト
アセンブリ: v8sr(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)
機能: D[element] = A[element] >> B;
V8SL
解説: Vector 8bit Shift Left : 8bit SIMD 論理左シフト
アセンブリ: v8sl(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)
機能: D[element] = A[element] << B;
V8SRA
解説: Vector 8bit Shift Right Arithmetic : 8bit SIMD 算術右シフト
アセンブリ: v8sra(reg_d, reg_a, reg_b, inc_d, inc_a, inc_b, mem_d, mem_a, mem_b)
機能: D[element] = A[element] >>> B;
V8MV
解説: Vector 8bit Move : もし(R1[element] != 0)ならばD[element]にA[element]を代入
アセンブリ: v8mv(reg_d, reg_a, inc_d, inc_a, mem_d, mem_a)
機能: if (R1[element] != 0) {D[element] = A[element];}
V8CEQ
解説: Vector 8bit Compare Equal : もし(A[element] == B[element])ならばR1[element] = 0xff、それ以外ならR1[element] = 0
アセンブリ: v8ceq(reg_a, reg_b, inc_a, inc_b, mem_a, mem_b)
機能: if (A[element] == B[element]) {R1[element] = 0xff;} else {R1[element] = 0;}
V8CGT
解説: Vector 8bit Compare Greater Than : A[element]、B[element]をunsignedとして扱い、もし(A[element] > B[element])ならばR1[element] = 0xff、それ以外ならR1[element] = 0
アセンブリ: v8cgt(reg_a, reg_b, inc_a, inc_b, mem_a, mem_b)
機能: if (A[element](unsigned) > B[element](unsigned)) {R1[element] = 0xff;} else {R1[element] = 0;}
V8CGTA
解説: Vector 8bit Compare Greater Than Arithmetic : A[element]、B[element]をsignedとして扱い、もし(A[element] > B[element])ならばR1[element] = 0xff、それ以外ならR1[element] = 0
アセンブリ: v8cgta(reg_a, reg_b, inc_a, inc_b, mem_a, mem_b)
機能: if (A[element](signed) > B[element](signed)) {R1[element] = 0xff;} else {R1[element] = 0;}
ソースコード
SIMDパッチ済み sc1_cpu.v
/*
Copyright (c) 2015-2016, miya
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// sc1_cpu: small CPU core with two decode stages and 8-bit-lane SIMD
// instructions (V8xxx).
//
// Parameters:
//   WIDTH_I   - instruction word width (instruction RAM, rom_data).
//   WIDTH_D   - data word width (data RAM, port_in, port_out).
//   WIDTH_REG - register file word width.
//   DEPTH_I   - instruction address width (rom_addr, instruction RAM).
//   DEPTH_D   - data RAM address width.
//   DEPTH_REG - register index width (2**DEPTH_REG registers).
//
// Ports:
//   clk, reset - clock and synchronous active-high reset.
//   rom_addr   - address driven to the external program ROM during boot.
//   rom_data   - word returned by the external program ROM.
//   port_in    - value read by the IN instruction.
//   port_out   - register written by the OUT instruction.
//
// Instruction word fields, as decoded below:
//   [6:0]   opcode (bit 6 set = "normal" type, clear = "special" type)
//   [7]     operand B comes from data RAM
//   [8]     operand A comes from data RAM
//   [9]     destination is data RAM
//   [10]    add register B to the B read address instead of loading it
//   [11]    add register A to the A read address instead of loading it
//   [12]    add register D to the write address instead of loading it
//   [14 +: DEPTH_REG] register B index
//   [20 +: DEPTH_REG] register A index
//   [26 +: DEPTH_REG] register D index
//   [25:10] 16-bit immediate (MVI, MVIH, BC, BL)
module sc1_cpu
  #(
    parameter WIDTH_I = 32,
    parameter WIDTH_D = 32,
    parameter WIDTH_REG = 32,
    parameter DEPTH_I = 8,
    parameter DEPTH_D = 8,
    parameter DEPTH_REG = 4
    )
  (
   input                    clk,
   input                    reset,
   output reg [DEPTH_I-1:0] rom_addr,
   input [WIDTH_I-1:0]      rom_data,
   input [WIDTH_D-1:0]      port_in,
   output reg [WIDTH_D-1:0] port_out
   );

  // number of 8-bit lanes processed by the V8xxx instructions
  localparam VEC8_WAY = (WIDTH_REG / 8);

  // special-purpose register indices
  localparam SP_REG_MVI = 4'd0;
  localparam SP_REG_BA = 4'd0;
  localparam SP_REG_CP = 4'd1;
  localparam SP_REG_LINK = 4'd2;
  localparam SP_REG_LOOP_COUNTER = 4'd3;
  localparam SP_REG_LOOP_END = 4'd4;
  localparam SP_REG_LOOP_SPAN = 4'd5;

  // opcode
  // special type
  localparam I_HALT = 7'h00;
  localparam I_NOP = 7'h01;
  localparam I_MV = 7'h02;
  localparam I_MVI = 7'h03;
  localparam I_MVIH = 7'h04;
  localparam I_CEQ = 7'h05;
  localparam I_CGT = 7'h06;
  localparam I_CGTA = 7'h07;
  localparam I_BC = 7'h08;
  localparam I_BL = 7'h09;
  localparam I_BA = 7'h0a;
  localparam I_LOOP = 7'h0b;
  localparam I_OUT = 7'h0c;
  localparam I_V8MV = 7'h0d;
  localparam I_V8CEQ = 7'h0e;
  localparam I_V8CGT = 7'h0f;
  localparam I_V8CGTA = 7'h10;
  // normal type
  localparam I_ADD = 7'h40;
  localparam I_SUB = 7'h41;
  localparam I_AND = 7'h42;
  localparam I_OR = 7'h43;
  localparam I_XOR = 7'h44;
  localparam I_NOT = 7'h45;
  localparam I_SR = 7'h46;
  localparam I_SL = 7'h47;
  localparam I_SRA = 7'h48;
  localparam I_MUL = 7'h49;
  localparam I_IN = 7'h4a;
  localparam I_V8ADD = 7'h4b;
  localparam I_V8SUB = 7'h4c;
  localparam I_V8MUL = 7'h4d;
  localparam I_V8SR = 7'h4e;
  localparam I_V8SL = 7'h4f;
  localparam I_V8SRA = 7'h50;

  localparam TRUE = 1'b1;
  localparam FALSE = 1'b0;
  localparam ONE = 1'd1;
  localparam ZERO = 1'd0;
  localparam FFFF = {WIDTH_D{1'b1}};

  // instruction RAM interface
  wire [WIDTH_I-1:0]      mem_i_o;
  reg [DEPTH_I-1:0]       mem_i_addr_r;
  reg [DEPTH_I-1:0]       mem_i_addr_w;
  reg [WIDTH_I-1:0]       mem_i_i;
  reg                     mem_i_we;

  // data RAM interface (two RAM instances share the write side)
  wire [WIDTH_D-1:0]      mem_d_o_a;
  wire [WIDTH_D-1:0]      mem_d_o_b;
  wire                    mem_d_we_sig;
  reg [WIDTH_D-1:0]       mem_d_i;
  reg [DEPTH_D-1:0]       mem_d_addr_w;
  reg [DEPTH_D-1:0]       mem_d_addr_w_d1;
  reg [DEPTH_D-1:0]       mem_d_addr_w_d2;
  reg [DEPTH_D-1:0]       mem_d_addr_r_a;
  reg [DEPTH_D-1:0]       mem_d_addr_r_b;
  reg                     mem_d_we;

  reg                     cpu_en;
  reg [DEPTH_I-1:0]       pc_d1;
  reg [DEPTH_I-1:0]       pc_d2;
  // boot copy counter: bits [1:0] are the copy phase, bits [DEPTH_I+1:2]
  // the word address, and the top bit marks completion.
  // (11 bits wide for the default DEPTH_I = 8.)
  reg [DEPTH_I+2:0]       stage_init;
  reg [WIDTH_D-1:0]       loop_counter;
  reg [DEPTH_I-1:0]       loop_end;
  reg [DEPTH_I-1:0]       loop_span;

  wire                    is_mem_d_s1;
  wire                    is_mem_a_s1;
  wire                    is_mem_b_s1;
  wire                    add_d_s1;
  wire                    add_a_s1;
  wire                    add_b_s1;
  wire [DEPTH_REG-1:0]    reg_d_addr_s1;
  wire [DEPTH_REG-1:0]    reg_a_addr_s1;
  wire [DEPTH_REG-1:0]    reg_b_addr_s1;

  reg [WIDTH_I-1:0]       mem_i_o_d1;
  wire [6:0]              op;
  wire                    is_type_normal;
  wire                    not_increment;
  wire                    is_mem_d;
  wire                    is_mem_a;
  wire                    is_mem_b;
  wire [DEPTH_REG-1:0]    reg_d_addr;
  wire [DEPTH_REG-1:0]    reg_a_addr;
  wire [DEPTH_REG-1:0]    reg_b_addr;
  wire [15:0]             im16;
  wire signed [15:0]      ims16;
  wire [WIDTH_D-1:0]      source_a;
  wire [WIDTH_D-1:0]      source_b;

  // register file
  reg [WIDTH_REG-1:0]     reg_file [0:(1 << DEPTH_REG)-1];

  // decode(stage1): fields taken straight from the instruction RAM output,
  // used only to update the data RAM address registers
  assign is_mem_d_s1 = mem_i_o[9];
  assign is_mem_a_s1 = mem_i_o[8];
  assign is_mem_b_s1 = mem_i_o[7];
  assign add_d_s1 = mem_i_o[12];
  assign add_a_s1 = mem_i_o[11];
  assign add_b_s1 = mem_i_o[10];
  assign reg_d_addr_s1 = mem_i_o[DEPTH_REG+26-1:26];
  assign reg_a_addr_s1 = mem_i_o[DEPTH_REG+20-1:20];
  assign reg_b_addr_s1 = mem_i_o[DEPTH_REG+14-1:14];

  // decode(stage2): fields taken from the registered instruction word,
  // used by the execution logic
  assign op = mem_i_o_d1[6:0];
  assign is_type_normal = mem_i_o_d1[6];
  assign is_mem_d = mem_i_o_d1[9];
  assign is_mem_a = mem_i_o_d1[8];
  assign is_mem_b = mem_i_o_d1[7];
  assign reg_d_addr = mem_i_o_d1[DEPTH_REG+26-1:26];
  assign reg_a_addr = mem_i_o_d1[DEPTH_REG+20-1:20];
  assign reg_b_addr = mem_i_o_d1[DEPTH_REG+14-1:14];
  assign im16 = mem_i_o_d1[25:10];
  assign ims16 = mem_i_o_d1[25:10];

  // manual pc increment: these opcodes set mem_i_addr_r themselves
  assign not_increment = ((op == I_HALT) || (op == I_BC) || (op == I_BL) || (op == I_BA)) ? 1'b1 : 1'b0;

  // switch source
  assign source_a = is_mem_a ? mem_d_o_a : reg_file[reg_a_addr];
  assign source_b = is_mem_b ? mem_d_o_b : reg_file[reg_b_addr];

  // switch operation
  // Returns the result of the normal-type opcode op_result applied to
  // source_a / source_b (read from module scope). Opcodes that are not
  // listed return zero.
  function [WIDTH_D-1:0] result
    (
     input [6:0] op_result
     );
    integer      i;
    begin
      // Start from a defined value so that unknown opcodes, and any bits
      // above the last complete 8-bit lane, never carry over the value of
      // a previous call.
      result = {WIDTH_D{1'b0}};
      case (op_result)
        I_ADD: result = source_a + source_b;
        I_SUB: result = source_a - source_b;
        I_AND: result = source_a & source_b;
        I_OR: result = source_a | source_b;
        I_XOR: result = source_a ^ source_b;
        I_NOT: result = ~source_a;
        I_SR: result = source_a >> source_b;
        I_SL: result = source_a << source_b;
        I_SRA: result = $signed(source_a) >>> source_b;
        I_MUL: result = $signed(source_a) * $signed(source_b);
        I_IN: result = port_in;
        I_V8ADD:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] + source_b[i*8 +: 8];
              end
          end
        I_V8SUB:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] - source_b[i*8 +: 8];
              end
          end
        I_V8MUL:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = $signed(source_a[i*8 +: 8]) * $signed(source_b[i*8 +: 8]);
              end
          end
        // The vector shifts use the whole source_b word as the shift
        // amount for every lane (it is not split into lanes).
        I_V8SR:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] >> source_b;
              end
          end
        I_V8SL:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = source_a[i*8 +: 8] << source_b;
              end
          end
        I_V8SRA:
          begin
            for (i = 0; i < VEC8_WAY; i = i + 1)
              begin
                result[i*8 +: 8] = $signed(source_a[i*8 +: 8]) >>> source_b;
              end
          end
        default: ;
      endcase
    end
  endfunction

  // mem_d_we condition
  // Normal-type instructions always produce a value for mem_d_i. MV only
  // updates mem_d_i when the compare register is non-zero, so the write
  // enable is gated by the same condition; otherwise a false conditional
  // move would store the stale mem_d_i contents.
  // NOTE(review): I_V8MV is not included here, so V8MV with a memory
  // destination updates mem_d_i lanes but never issues a RAM write. A
  // lane-masked store would need byte enables or a read of the destination
  // word, neither of which is available in this module.
  assign mem_d_we_sig = is_mem_d & (is_type_normal | ((op == I_MV) & (reg_file[SP_REG_CP] != ZERO)));

  always @(posedge clk)
    begin
      if (reset == TRUE)
        begin
          stage_init <= ZERO;
          cpu_en <= FALSE;
          mem_i_addr_r <= ZERO;
          mem_i_addr_w <= ZERO;
          mem_i_we <= FALSE;
          // keep the data RAM untouched while the program is being reloaded
          mem_d_we <= FALSE;
          port_out <= ZERO;
          loop_counter <= ZERO;
          loop_end <= ZERO;
          loop_span <= ZERO;
        end
      else if (cpu_en == FALSE)
        // init
        begin
          // The top bit of stage_init is set once all 2**DEPTH_I words have
          // been copied (equivalent to "stage_init < 11'h400" for DEPTH_I = 8).
          if (stage_init[DEPTH_I+2] == 1'b0)
            begin
              case (stage_init[1:0])
                // load program from ROM
                2'd0:
                  begin
                    // present the word address to the ROM
                    rom_addr <= stage_init[DEPTH_I+1:2];
                  end
                2'd1:
                  begin
                    // idle phase between driving rom_addr and sampling rom_data
                  end
                2'd2:
                  begin
                    // capture the ROM word and write it to instruction RAM
                    mem_i_addr_w <= stage_init[DEPTH_I+1:2];
                    mem_i_i <= rom_data;
                    mem_i_we <= TRUE;
                  end
                2'd3:
                  begin
                    mem_i_we <= FALSE;
                  end
                default: ;
              endcase
              stage_init <= stage_init + ONE;
            end
          else
            begin
              cpu_en <= TRUE;
            end
        end
      else
        // cpu enable
        begin
          // increment mem_d address automatically
          // (stage1: either load the address from the register or add the
          // register value to the current address)
          if (is_mem_d_s1)
            begin
              if (add_d_s1)
                begin
                  mem_d_addr_w <= mem_d_addr_w + reg_file[reg_d_addr_s1][DEPTH_D-1:0];
                end
              else
                begin
                  mem_d_addr_w <= reg_file[reg_d_addr_s1][DEPTH_D-1:0];
                end
            end
          if (is_mem_a_s1)
            begin
              if (add_a_s1)
                begin
                  mem_d_addr_r_a <= mem_d_addr_r_a + reg_file[reg_a_addr_s1][DEPTH_D-1:0];
                end
              else
                begin
                  mem_d_addr_r_a <= reg_file[reg_a_addr_s1][DEPTH_D-1:0];
                end
            end
          if (is_mem_b_s1)
            begin
              if (add_b_s1)
                begin
                  mem_d_addr_r_b <= mem_d_addr_r_b + reg_file[reg_b_addr_s1][DEPTH_D-1:0];
                end
              else
                begin
                  mem_d_addr_r_b <= reg_file[reg_b_addr_s1][DEPTH_D-1:0];
                end
            end

          // delay
          mem_i_o_d1 <= mem_i_o;
          pc_d2 <= pc_d1;
          pc_d1 <= mem_i_addr_r;
          mem_d_we <= mem_d_we_sig;
          // the write address is delayed by two clocks before it reaches
          // the data RAMs
          mem_d_addr_w_d1 <= mem_d_addr_w;
          mem_d_addr_w_d2 <= mem_d_addr_w_d1;

          // loop counter
          if (loop_end == mem_i_addr_r)
            begin
              // skipped while LOOP itself executes, because LOOP reloads
              // loop_counter further below
              if ((loop_counter != ZERO) && (op != I_LOOP))
                begin
                  loop_counter <= loop_counter - ONE;
                end
            end

          // increment pc (prefetch address)
          if (!not_increment)
            begin
              if (loop_end == mem_i_addr_r)
                begin
                  if (loop_counter == ZERO)
                    begin
                      mem_i_addr_r <= mem_i_addr_r + ONE;
                    end
                  else
                    begin
                      // loop still active: add loop_span instead of one
                      mem_i_addr_r <= mem_i_addr_r + loop_span;
                    end
                end
              else
                begin
                  mem_i_addr_r <= mem_i_addr_r + ONE;
                end
            end

          // execution
          if (is_type_normal)
            begin
              // for normal instructions
              if (is_mem_d)
                begin
                  mem_d_i <= result(op);
                end
              else
                begin
                  reg_file[reg_d_addr] <= result(op);
                end
            end
          else
            begin
              // special instructions
              case (op)
                I_HALT:
                  begin
                    // reload the fetch address from pc_d2 (the fetch
                    // address delayed by two clocks) instead of advancing
                    mem_i_addr_r <= pc_d2;
                  end
                I_NOP:
                  begin
                  end
                I_MV:
                  begin
                    // conditional move: only when the compare register is
                    // non-zero (mem_d_we_sig uses the same condition)
                    if (reg_file[SP_REG_CP] != ZERO)
                      begin
                        if (is_mem_d)
                          begin
                            mem_d_i <= source_a;
                          end
                        else
                          begin
                            reg_file[reg_d_addr] <= source_a;
                          end
                      end
                  end
                I_MVI:
                  begin
                    reg_file[SP_REG_MVI] <= im16;
                  end
                I_MVIH:
                  begin
                    // immediate goes above the existing low 16 bits
                    if (WIDTH_REG >= 16)
                      begin
                        reg_file[SP_REG_MVI] <= {im16, reg_file[SP_REG_MVI][15:0]};
                      end
                  end
                I_CEQ:
                  begin
                    if (source_a == source_b)
                      begin
                        reg_file[SP_REG_CP] <= FFFF;
                      end
                    else
                      begin
                        reg_file[SP_REG_CP] <= ZERO;
                      end
                  end
                I_CGT:
                  begin
                    if (source_a > source_b)
                      begin
                        reg_file[SP_REG_CP] <= FFFF;
                      end
                    else
                      begin
                        reg_file[SP_REG_CP] <= ZERO;
                      end
                  end
                I_CGTA:
                  begin
                    if ($signed(source_a) > $signed(source_b))
                      begin
                        reg_file[SP_REG_CP] <= FFFF;
                      end
                    else
                      begin
                        reg_file[SP_REG_CP] <= ZERO;
                      end
                  end
                I_BC:
                  begin
                    // conditional branch relative to pc_d2
                    if (reg_file[SP_REG_CP] == ZERO)
                      begin
                        mem_i_addr_r <= mem_i_addr_r + ONE;
                      end
                    else
                      begin
                        mem_i_addr_r <= pc_d2 + ims16;
                      end
                  end
                I_BL:
                  begin
                    // branch and store pc_d2 + 1 in the link register
                    reg_file[SP_REG_LINK] <= pc_d2 + ONE;
                    mem_i_addr_r <= pc_d2 + ims16;
                  end
                I_BA:
                  begin
                    // absolute branch to the address held in register 0
                    mem_i_addr_r <= reg_file[SP_REG_BA];
                  end
                I_LOOP:
                  begin
                    loop_counter <= reg_file[SP_REG_LOOP_COUNTER];
                    loop_end <= pc_d2 + reg_file[SP_REG_LOOP_END][DEPTH_I-1:0];
                    loop_span <= reg_file[SP_REG_LOOP_SPAN][DEPTH_I-1:0];
                  end
                I_OUT:
                  begin
                    port_out <= source_a;
                  end
                I_V8MV:
                  begin : v8mv
                    // per-lane conditional move, masked by the matching
                    // lane of the compare register
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if (reg_file[SP_REG_CP][i*8 +: 8] != ZERO)
                          begin
                            if (is_mem_d)
                              begin
                                mem_d_i[i*8 +: 8] <= source_a[i*8 +: 8];
                              end
                            else
                              begin
                                reg_file[reg_d_addr][i*8 +: 8] <= source_a[i*8 +: 8];
                              end
                          end
                      end
                  end
                I_V8CEQ:
                  begin : v8ceq
                    // FFFF is truncated to the 8-bit lane, i.e. 8'hff
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if (source_a[i*8 +: 8] == source_b[i*8 +: 8])
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= FFFF;
                          end
                        else
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= ZERO;
                          end
                      end
                  end
                I_V8CGT:
                  begin : v8cgt
                    // unsigned per-lane compare
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if (source_a[i*8 +: 8] > source_b[i*8 +: 8])
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= FFFF;
                          end
                        else
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= ZERO;
                          end
                      end
                  end
                I_V8CGTA:
                  begin : v8cgta
                    // signed per-lane compare
                    integer i;
                    for (i = 0; i < VEC8_WAY; i = i + 1)
                      begin
                        if ($signed(source_a[i*8 +: 8]) > $signed(source_b[i*8 +: 8]))
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= FFFF;
                          end
                        else
                          begin
                            reg_file[SP_REG_CP][i*8 +: 8] <= ZERO;
                          end
                      end
                  end
                default: ;
              endcase
            end
        end
    end

  // instruction RAM
  rw_port_ram
    #(
      .DATA_WIDTH (WIDTH_I),
      .ADDR_WIDTH (DEPTH_I)
      )
  mem_i
    (
     .clk (clk),
     .addr_r (mem_i_addr_r),
     .addr_w (mem_i_addr_w),
     .data_in (mem_i_i),
     .we (mem_i_we),
     .data_out (mem_i_o)
     );

  // data RAM copy read through the operand A address; it receives the
  // same writes as mem_d_b
  rw_port_ram
    #(
      .DATA_WIDTH (WIDTH_D),
      .ADDR_WIDTH (DEPTH_D)
      )
  mem_d_a
    (
     .clk (clk),
     .addr_r (mem_d_addr_r_a),
     .addr_w (mem_d_addr_w_d2),
     .data_in (mem_d_i),
     .we (mem_d_we),
     .data_out (mem_d_o_a)
     );

  // data RAM copy read through the operand B address
  rw_port_ram
    #(
      .DATA_WIDTH (WIDTH_D),
      .ADDR_WIDTH (DEPTH_D)
      )
  mem_d_b
    (
     .clk (clk),
     .addr_r (mem_d_addr_r_b),
     .addr_w (mem_d_addr_w_d2),
     .data_in (mem_d_i),
     .we (mem_d_we),
     .data_out (mem_d_o_b)
     );

endmodule
関連記事
独自CPUを自作する(メモリ操作の速いアーキテクチャ編):通常版解説ページ