CUDAの練習1: mandelbrot集合

を正月休みにしていた。CUDAに関しては雑な知識しかなかったので、『CUDA BY EXAMPLE』なる書籍を読んだ。『CUDA C プロフェッショナルプログラミング』なる本が名著であると勧められてはいたが、まずモチベーションを殺さないために短めのものを一読してから重い本で知識をアップデートするやり方に慣れていたので、手元にあった薄い方の本を手にとった次第だ。

CUDAの文法や基礎知識は解説できるほど理解していないので、本や何やで確認してほしい。

mandelbrot

恐らく最も並列化が容易でビジュアル的にもよいであろうマンデルブロ集合が最初の目標としては最適だろうと考えた。初めてのCUDAプログラムなので、__shared__メモリだとか__constant__メモリだとかそういうのは気にせずまったりやることにする。

naiveな実装

std::complexをCUDAで使えるのかわからなかったので、ひどい再発明から入った。実際のところ、__device__関数で何も考えずにstd::complex z, c; z * z + c;みたいなのを書くと、__host__関数を__device__からは呼べないと怒られるし、std::complexのコンストラクタはconstexprなので--expt-relaxed-constexprオプションをつけないとそこでも怒られる。

とりあえず必要なものだけ定義する。__device__が付いていること以外は通常の関数と変わらない。templateな__device__関数の例は見たことがなかったが（CUDA Cの資料を見ているので当然である）、できるだろうと思って書いたらできた。CUDAのC++対応がどの程度進んでいるかまとまっているところはないだろうか。

template<typename realT>
struct Complex
{
    realT r, i;
};

template<typename realT>
__device__ __forceinline__
Complex<realT> add(const Complex<realT> a, const Complex<realT> b)
{
    return Complex<realT>{a.r + b.r, a.i + b.i};
}

template<typename realT>
__device__ __forceinline__
Complex<realT> mul(const Complex<realT> a, const Complex<realT> b)
{
    return Complex<realT>{a.r * b.r - a.i * b.i, a.r * b.i + b.r * a.i};
}

template<typename realT>
__device__ __forceinline__ 
Complex<realT> mandelbrot_step(const Complex<realT> z, const Complex<realT> c)
{
    return add(mul(z, z), c);
}

template<typename realT>
__device__ __forceinline__
realT abs(const Complex<realT> z)
{
    return z.r * z.r + z.i * z.i;
}

まあ、素直にthrust::complexを使えば済む話なので、これを読んでいる人はこんなクソッタレな再発明をするべきではない。

で、各blockの各threadでマンデルブロ集合内であるかを判定する関数を書く。threadIdxとblockIdxで担当する場所を決め、それぞれが判定する。

template<typename realT>
__device__ bool
is_mandelbrot(const Complex<realT> z, const std::size_t threshold)
{
    Complex<realT> x{0., 0.};
    for(std::size_t i=0; i<threshold; ++i)
    {
        x = mandelbrot_step(x, z);
        if(abs(x) > 2.)
            return false;
    }
    return true;
}

template<typename realT>
__global__ void
write_mandelbrot(
        bool* result,
        const realT rmin, const realT rmax, const realT imin, const realT imax,
        const std::size_t th, const std::size_t x_dim, const std::size_t y_dim)
{
    const std::size_t x = threadIdx.x + blockIdx.x * blockDim.x;
    const std::size_t y = threadIdx.y + blockIdx.y * blockDim.y;

    const std::size_t offset = x + gridDim.x * blockDim.x * y;
    if(offset > x_dim * y_dim) return;

    const realT r = rmin + (rmax - rmin) / x_dim * x;
    const realT i = imin + (imax - imin) / y_dim * y;
    const Complex<realT> z{r, i};

    result[offset] = is_mandelbrot(z, th);
    return;
}

これを__host__側から呼ぶ。templateな__global__関数を呼ぶときはwrite_mandelbrot<double><<<blocks, threads>>>(args...)のようにすればできる。

#include <png++/png.hpp>

/*上記の関数達*/

int main()
{
    const double rmin = -2.0;
    const double rmax =  1.0;
    const double imin = -1.5;
    const double imax =  1.5;
    const std::size_t threshold = 400;
    const std::size_t x_dim = 1024;
    const std::size_t y_dim = 1024;

    bool* mandelbrot;
    cudaError_t ermalloc = cudaMalloc((void**)&mandelbrot, x_dim * y_dim);
    assert(ermalloc == 0);

    dim3 blocks(x_dim/32, y_dim/32);
    dim3 threads(32, 32);
    write_mandelbrot<double><<<blocks, threads>>>(
            mandelbrot, rmin, rmax, imin, imax, threshold, x_dim, y_dim);

    bool *result = new bool[x_dim * y_dim];
    cudaError_t ercpy = cudaMemcpy(
            result, mandelbrot, x_dim * y_dim, cudaMemcpyDeviceToHost);
    assert(ercpy == 0);

    cudaFree(mandelbrot);

    png::image<png::rgb_pixel> image(x_dim, y_dim);
    for(std::size_t i=0; i<x_dim; ++i)
    {
        for(std::size_t j=0; j<y_dim; ++j)
        {
            std::size_t offset = i + x_dim * j;
            if(result[offset])
                image[i][j] = png::rgb_pixel(255, 0, 0);
            else
                image[i][j] = png::rgb_pixel(0,0,0);
        }
    }

    image.write("mandelbrot.png");
    return 0;
}

少々雑だが、これで動く。

f:id:tniina:20170106150749p:plain

thrust

さて、C++erとしては生のCUDAコードを書くよりも、thrustの使用を検討したほうがよいという話は聞く。生CUDAを書いてしまってから検討するのは遅きに失している気もするが、thrustを使ってみよう。

実はだいぶ以前に一度CUDAを使ってみようとthrustのクイックスタートガイドを和訳したことがあるので（著作権などがどうなっているか厳しい気がしたので公開していないが）、thrustに関してその程度の知識はある。結局その後今に至るまで実際にコードを書くことはなかったのだが。

というわけで、thrust::transformによって、複素平面上の点を格納したthrust::device_vector<thrust::complex<double>>を、マンデルブロ集合内かどうかの判定結果が格納されたthrust::device_vector<bool>に変換することにしよう。

その判定用の関数オブジェクトを作る。なんのことはない、thrust::complexを受け取ってthreshold回のイテレーションの間に絶対値が2より大きくなるか調べるだけだ。

template<typename realT>
struct judge_mandelbrot
{
    const std::size_t threshold;

    judge_mandelbrot(const std::size_t th): threshold(th){}

    __host__ __device__
    bool operator()(const thrust::complex<realT>& z) const
    {
        thrust::complex<realT> x(0., 0.);
        for(std::size_t i=0; i<threshold; ++i)
        {
            x = x * x + z;
            if(thrust::abs(x) > 2.)
                return false;
        }
        return true;
    }
};

最初にthrust::device_vector<thrust::complex<double>>を複素平面で埋めるやり方がCUDA初心者にはよくわからなかったので、諦めてhost_vectorを作ってコピーすることにした。

// generate complex plane
thrust::host_vector<thrust::complex<double>> host_plane(size);
for(std::size_t i=0; i<r_resolution; ++i)
{
    for(std::size_t j=0; j<i_resolution; ++j)
    {
        const thrust::complex<double> val =
            min + thrust::complex<double>(dr * i, di * j);
        const std::size_t offset = i + r_resolution * j;
        host_plane[offset] = val;
    }
}

thrust::device_vector<thrust::complex<double>> complex_plane = host_plane;

で、これをthrust::transformする。

judge_mandelbrot<double> judge(threshold);
thrust::device_vector<bool> is_mandelbrot(size);
thrust::transform(complex_plane.begin(), complex_plane.end(),
                  is_mandelbrot.begin(), judge);

これで計算は終了だ。こんなに簡単でいいのだろうか。あとは出力するだけである。

ところで、出力するときにいちいちdevice_vectorにアクセスしていると、一度host_vectorにコピーしたときと比較して、信じられないくらい時間がかかることがわかった。要素一つごとにcudaMemcpyしているのだろう。

thrustつきのプログラムをコンパイルする方法に一瞬迷ったが、

$ nvcc -O3 -std=c++11 mandelbrot_thrust.cu -o mandelbrot -lpng12

でよいようだ。最後のはpng++ライブラリのリンク。

ところで、雑でもいいのでパフォーマンスを計測してみようとwrite_mandelbrot関数にかかった時間とthrust::transformにかかった時間をstd::chronoで計測してみたのだが、素人が書いたwrite_mandelbrotで5回の平均が18[us]、対してthrustは平均10[us]であった。thrust強すぎでは? 以前にArrayfireの紹介記事で素直な実装だと生CUDAコードを書いてもArrayfireの方が早いというのがあったが、thrustも（その記事とは比較にならないほど雑なコードだろうが）素直な生CUDAよりも早いとは。

この内容は

GitHub - ToruNiina/cuda_practice

で公開されている。

次回はisingモデルを書いたときのことをまとめる。