Parallella Fan!
Home記事一覧BBS

マンデルブロ集合をFPGAで並列化(固定小数点版)

【更新履歴】
2015/04/27 公式セットアップスクリプトに準拠する形に修正しました。以前にダウンロードされた方は再度ダウンロードしなおしてください。
2015/04/24 新規公開

FPGAに実装するマンデルブロ集合のプログラムを固定小数点演算を使って更に高速化します。

前回の「マンデルブロ集合の浮動小数点演算をFPGAで並列化する」で書いたプログラムを固定小数点で計算する形に修正したところ、1演算コアあたりの回路規模が大幅に減少したため、ZYNQ-7020のFPGA上に32個のマンデルブロ集合演算コアを実装することができました。これにより演算性能が大幅に向上し、シングルスレッドの浮動小数点版と比較して260倍速を達成しました。

ベンチマーク条件と結果は以下の通りです。

演算方式チューニング設定演算コア数所要時間(秒)スピードアップ比
浮動小数点デフォルト1357.31
浮動小数点浮動小数点IPチューン(Max DSP, レイテンシ2)1149.62.39
固定小数点デフォルト165.25.48
固定小数点Chaining, Operation strength reduction142.48.44
固定小数点Chaining, Operation strength reduction321.37260.5

ベンチマーク用パラメーター設定
max_count: 255
frames: 10
cx: 0.7509765625f
cy: 0.0f
start dx: 0.25f / 512.0f
zoom speed: 246.0f / 256.0f

プロジェクトの準備

「マンデルブロ集合の浮動小数点演算をFPGAで並列化する」と同様にプロジェクトをセットアップします。

ソースコードのダウンロードとHDLコードの生成

端末で、

cd ~/parallella_hw_projects/user_ip/edk/pcores

wget -O synthesijer_template_mandelbrot_fixed.tar.gz https://cellspe.matrix.jp/files/synthesijer_template_mandelbrot_fixed.tar.gz

tar xzf synthesijer_template_mandelbrot_fixed.tar.gz

Synthesijerを実行してJavaプログラムからVerilog HDLコードを生成します。

cd synthesijer_template_v1_00_a/devl/synthesijer

make

ビットストリームの生成・転送

前回と同様に、「system_i-system(system.xmp)」を「Reset Output Products」してからビットストリームを生成、Parallellaに転送します。

ドライバのコンパイル・実行

ドライバのプログラムは synthesijer_template_v1_00_a/devl/driver/synthesijer_template_driver_mandelbrot_fixed.tar.gz に入っています。これも前回と同様の方法でParallellaに転送してmake、実行します。

ソースコード

これらのソースコードはBSD 2-Clauseライセンスで公開します。

Template.java : コントロール、描画クラス

32コア版のソースは冗長なので1コア版を掲載します。基本的な構造は同じです。

/*
  Copyright (c) 2015, miya
  All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


public class Template extends Thread
{
  // CONSTANTS
  private static final int PARALLEL = 1;
  private static final int WIDTH = 512;
  private static final int HEIGHT = 512;
  private static final int INT_SIZE = 4;
  private static final int HEIGHT_PLUS_PARALLEL = 513;

  // AXI IO
  public int waddr;
  public int wdata;
  public int wburst;
  public boolean wreq;
  public boolean wbusy;
  public int raddr;
  public int rdata;
  public boolean rreq;
  public boolean rbusy;

  // user IO
  public int gpio0; // fb_addr
  public int gpio1; // line_bytes
  public int gpio2; // mode
  public int gpio3; // param_addr

  private int fb_addr;
  private int line_bytes;
  private int page0;
  private int page1;
  private int cx;
  private int cy;
  private int dx;
  private int speed;
  private int mode;
  private int param_addr;

  private Mandel mandel0 = new Mandel();

  public void init()
  {
    wreq = false;
    wbusy = false;
    rreq = false;
    rbusy = false;
    fb_addr = gpio0;
    line_bytes = gpio1;
    mode = gpio2;
    param_addr = gpio3;
    int initmode = mode & 0xff;
    if (initmode == 0)
    {
      cx = read_data(param_addr);
      cy = read_data(param_addr + 4);
      dx = read_data(param_addr + 8);
      speed = read_data(param_addr + 12);

      mandel0.cx = cx;

      mandel0.cy = cy;

      mandel0.dx = dx;

      mandel0.mode = mode;

      mandel0.enable = true;

      mandel0.req = false;

      mandel0.ack = false;

      mandel0.start();
    }
    else if (initmode == 1)
    {
      mandel0.req = false;

      mandel0.enable = false;

      try
      {
        mandel0.join();
      }
      catch (InterruptedException e)
      {
      }
    }
  }

  private void shortwait()
  {
  }

  private void write_data(int addr, int data, int burst)
  {
    while(wbusy == true)
    {
      yield();
    }
    waddr = addr;
    wdata = data;
    wburst = burst;
    wreq = true;
    if (wreq == true)
    {
      wreq = false;
    }
  }

  private void write_flush()
  {
    while(wbusy == true)
    {
      yield();
    }
  }

  private int read_data(int addr)
  {
    raddr = addr;
    rreq = true;
    if (rreq == true)
    {
      rreq = false;
    }
    shortwait();
    while(rbusy == true)
    {
      yield();
    }
    return rdata;
  }

  public void run()
  {
    page0 = 0;
    page1 = 1;
    int sy = 0;

    int addr = fb_addr;
    int line_bytes_p = line_bytes * PARALLEL;

    mandel0.dx = dx;

    for (int i = 0; i < HEIGHT_PLUS_PARALLEL; i += PARALLEL)
    {
      mandel0.sy = sy;
      mandel0.page = page0;

      if (i < HEIGHT)
      {
        // job start
        mandel0.req = true;
      }

      if (i >= PARALLEL)
      {
        // draw
        int addr1 = addr;
        int i_s, i_e;
        if (page1 == 0)
        {
          i_s = 0;
        }
        else
        {
          i_s = WIDTH;
        }
        i_e = i_s + WIDTH;
        for (int j = i_s; j < i_e; j++)
        {
          int addr0 = addr1;
          write_data(addr0, mandel0.data[j], 0);

          addr1 += INT_SIZE;
        }
        addr += line_bytes_p;
      }

      if (i < HEIGHT)
      {
        // job join
        while (mandel0.ack == false)
        {
          yield();
        }

        mandel0.req = false;

        while (mandel0.ack == true)
        {
          yield();
        }
      }

      int p = page0;
      page0 = page1;
      page1 = p;
      sy += PARALLEL;
    }
    dx = (dx * speed) >> 8 /*DX_BITS*/;
  }
}

Mandel.java : マンデルブロ集合演算クラス
/*
  Copyright (c) 2015, miya
  All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


public class Mandel extends Thread
{
  private static final int HALF_WIDTH = 256;
  private static final int WIDTH = 512;
  private static final int MAX_C = 32768 /*(4 << FIXED_BITS)*/;

  public int cx;
  public int cy;
  public int sy;
  public int dx;
  public int page;
  public final int[] data = new int[1024 /*BUF_SIZE*/];
  public boolean req;
  public boolean ack;
  public boolean enable;
  public int mode;

  public void run()
  {
    while (true)
    {
      while ((req == false) && (enable == true))
      {
        yield();
      }

      if (enable == false)
      {
        break;
      }

      int x, x1, y, y1, calc_size;
      calc_size = dx << 8 /*HALF_WIDTH_BITS*/;
      x = cx - calc_size;
      y = cy - calc_size + dx * sy;
      int max_count = (mode >> 8) & 0xff;
      y1 = y >> 13 /*FIXED_BITS*/;

      int i_s, i_e;
      if (page == 0)
      {
        i_s = 0;
      }
      else
      {
        i_s = WIDTH;
      }
      i_e = i_s + WIDTH;

      for (int i = i_s; i < i_e; i++)
      {
        int a, b, a2, b2, c;
        a = 0;
        b = 0;
        a2 = 0;
        b2 = 0;
        c = 0;
        int count = max_count;
        x1 = x >> 13 /*FIXED_BITS*/;

        while ((c < MAX_C) && (count > 0))
        {
          b = ((a * b) >> 12 /*FIXED_BITS-1*/) - y1;
          a = a2 - b2 - x1;
          a2 = (a * a) >> 13 /*FIXED_BITS*/;
          b2 = (b * b) >> 13 /*FIXED_BITS*/;
          c = a2 + b2;
          count--;
        }

        int color = 0;
        color = 0xff000000 | ((count << 3) + (count << 10) + (count << 18));
        data[i] = color;
        x += dx;
      }
      ack = true;

      while (req == true)
      {
        yield();
      }

      ack = false;
    }
  }
}

main.c : ドライバプログラム
/*
  Copyright (c) 2015, miya
  All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/fb.h>
#include <stdint.h>

#define DEVMEM "/dev/mem"
#define DEVFB "/dev/fb0"
#define BASE_ADDR 0x60000000
#define MAP_SIZE 0x1000
#define BPP 4
#define LOOP 120
#define FIXED_BITS 13

static inline long long get_tick_count(void)
{
  struct timeval timeprof;
  gettimeofday(&timeprof, NULL);
  return ((long long)timeprof.tv_sec * (long long)1000000 + (long long)timeprof.tv_usec);
}

static inline uint32_t pack_control(uint32_t addr, uint32_t we, uint32_t req)
{
  return ((addr & 0xff) << 24) | ((we & 0x1) << 1) | (req & 0x1);
}

static inline void write_reg(void * mem, uint32_t addr, uint32_t data)
{
  volatile uint32_t * reg = mem;
  reg[1] = data;
  reg[0] = pack_control(addr, 1, 1);
  while (reg[2] == 0)
  {
    usleep(1);
  }
  reg[0] = pack_control(addr, 1, 0);
  while (reg[2] == 1)
  {
    usleep(1);
  }
}

static inline uint32_t read_reg(void * mem, uint32_t addr)
{
  volatile uint32_t * reg = mem;
  reg[0] = pack_control(addr, 0, 1);
  while (reg[2] == 0)
  {
    usleep(1);
  }
  reg[0] = pack_control(addr, 0, 0);
  while (reg[2] == 1)
  {
    usleep(1);
  }
  return reg[3];
}

int main(int argc, char *argv[])
{
  unsigned long fb_addr = 0;
  uint32_t line_length = 0;
  uint32_t width = 0;
  uint32_t height = 0;

  int fdfb = open(DEVFB, O_RDWR);
  if (fdfb > 0)
  {
    struct fb_fix_screeninfo fbf;
    struct fb_var_screeninfo fbv;
    if (ioctl(fdfb, FBIOGET_FSCREENINFO, &fbf) == 0)
    {
      fb_addr = fbf.smem_start;
      line_length = fbf.line_length;
    }
    if (ioctl(fdfb, FBIOGET_VSCREENINFO, &fbv) == 0)
    {
      width = fbv.xres;
      height = fbv.yres;
    }
    close(fdfb);
  }

  int fd = open(DEVMEM, O_RDWR | O_SYNC);
  if (fd == -1)
  {
    perror("cannot open /dev/mem");
    abort();
  }

  uint32_t *mem = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off_t)BASE_ADDR);

  if (mem == MAP_FAILED)
  {
    perror("map failed");
    abort();
  }

  uint32_t *fbmem = mmap(NULL, line_length * height, PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off_t)fb_addr);

  if (fbmem == MAP_FAILED)
  {
    perror("map failed");
    abort();
  }

  uint32_t windowsize = 512;
  if ((windowsize > height) || (windowsize > width))
  {
    perror("screen size");
    abort();
  }
  uint32_t sy = (height - windowsize) / 2;
  uint32_t sx = (width - windowsize) / 2;
  uint32_t saddr = sy * line_length + (sx * BPP);
  int f2b = 1 << (FIXED_BITS << 1);

  // for benchmark (LOOP = 10)
/*
  fbmem[0] = (int)(0.7509765625f * f2b);
  fbmem[1] = (int)(0.0f * f2b);
  fbmem[2] = 4 << FIXED_BITS;
  fbmem[3] = 246;
*/


  fbmem[0] = (int)(1.49f * f2b);
  fbmem[1] = (int)(0.0f * f2b);
  fbmem[2] = 64 << FIXED_BITS;
  fbmem[3] = 246;

  long long start_tick = get_tick_count();

  // Template.java: init
  write_reg(mem, 1, fb_addr + saddr);
  write_reg(mem, 2, line_length);
  write_reg(mem, 3, (128 << 8) + 0);
  write_reg(mem, 4, fb_addr);
  write_reg(mem, 0, 0);

  int i;
  for (i = 0; i < LOOP; i++)
  {
    // Template.java: run()
    write_reg(mem, 0, 1);
  }

  write_reg(mem, 3, (128 << 8) + 1);
  write_reg(mem, 0, 0);

  float calctime = (float)(get_tick_count() - start_tick) / 1000000.0f;
  float fps = (float)LOOP / calctime;
  printf("time: %f sec\n", calctime);
  printf("fps: %f\n", fps);

  munmap(mem, MAP_SIZE);
  close(fd);
  return 0;
}