cuda画像処理入門 - nvidia · 1. cuda で画像処理 gpu = graphics processing unit...

GTC 2013 チュートリアルエヌビディアジャパン CUDAエンジニア森野慎也

CUDA画像処理入門

1. マシンビジョンにおける GPU の活用

CT や MRI から画像を受信して

三次元画像の構築をするシステム

1. 医用画像処理における GPU の活用

２次元スキャンデータから３次元、４次元イメージの高速生成

CUDA 化により画像処理速度を約20倍に高速化

1. CUDA で画像処理

GPU = Graphics Processing Unit

— 画像を「生成する」ためのプロセッサです。

「与えられた画像」を「処理する」ことも上手です。

— 「複雑な処理」も「プログラミング」できます。

CUDAによる画像処理の入門編です。

2. 画像処理：アフィン変換

画像の線形変換、平行移動

2. アフィン変換

変換式

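$$
\begin{pmatrix} X \\ Y \\ 1 \end{pmatrix}
=
\begin{pmatrix} a & b & t_x \\ c & d & t_y \\ 0 & 0 & 1 \end{pmatrix}
\begin{pmatrix} x \\ y \\ 1 \end{pmatrix}
$$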

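$$
T_{\mathrm{rotate}} =
\begin{pmatrix} \cos\theta & -\sin\theta & 0 \\ \sin\theta & \cos\theta & 0 \\ 0 & 0 & 1 \end{pmatrix}
$$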

$$
T_{\mathrm{magnify}} =
\begin{pmatrix} r_x & 0 & 0 \\ 0 & r_y & 0 \\ 0 & 0 & 1 \end{pmatrix}
$$

$$
T_{\mathrm{translate}} =
\begin{pmatrix} 1 & 0 & t_x \\ 0 & 1 & t_y \\ 0 & 0 & 1 \end{pmatrix}
$$

変換行列の例

2. 画像のメモリ配置

RGBA(8 bit, uchar4)の配列

index = x + y * pitchInPixels

width

pitchInPixels = pitchInBytes / sizeof(uchar4)

（x, y)

2. 2次元メモリ確保・転送

cudaError_t cudaMallocPitch ( void** devPtr, size_t* pitch,

size_t width, size_t height )

— widthバイトのメモリを、height行分、取得する。

— 行は、pitchバイトで整列する。

cudaError_t cudaMemcpy2D ( void* dst, size_t dpitch,

const void* src, size_t spitch, size_t width, size_t height,

cudaMemcpyKind kind )

— dstで示されるメモリ (dpitchバイトで整列)に、

srcで示されるメモリ (spitchバイトで整列) を、

width (バイト) x height (行)、コピーする。

2. アフィン変換: カーネル設計

「スレッド」に、変換後の画面の

「ピクセル」を割り当てる

— ピクセル数分、スレッドが走る。

例 : 262,144 (= 512 x 512) スレッド

スレッドは、処理対象のピクセルを持つ。

— 自分の位置(X, Y)を知ることが必要

2. 2DでのBlock・Threadの割り当て

Threadを「２次元」で質点に対応。

Blockを「２次元」で定義。一定のサイズ。

Grid : 必要数のBlockを「２次元」に並べる。

1 Block

1 Pixel = 1 Thread

(i, j) =

(GlobalID(x),GlobalID(y))

2. 2DでのBlock・Threadの割り当て

GlobalID は、(x, y, z)方向に計算できる

GlobalID(x) = blockDim.x * blockIdx.x + threadIdx.x

GlobalID(y) = blockDim.y * blockIdx.y + threadIdx.y

GlobalID(z) = blockDim.z * blockIdx.z + threadIdx.z

blockDim.x * blockIdx.x threadIdx.x

blockDim.y * blockIdx.y

threadIdx.y

2. アフィン変換: Grid サイズ指定

/*
 * Divides value by radix and rounds the quotient up (ceiling division).
 *
 * value: dividend; the formula assumes it is non-negative.
 * radix: divisor; must be positive.
 * Returns the smallest integer q such that q * radix >= value.
 */
int divRoundUp(int value, int radix) {
    /* Adding (radix - 1) before the truncating division rounds the result up. */
    return (value + radix - 1) / radix;
}

/* Initialize gridDim and blockDim as two-dimensional (x and y directions). */

/* Each block is 128 x 4 threads. */
dim3 blockDim(128, 4);

/* divRoundUp() is a division that rounds up, so the grid covers the whole
   width x height image even when the sizes are not multiples of the block size. */

dim3 gridDim(divRoundUp(width, blockDim.x), divRoundUp(height, blockDim.y));

/* Launch the kernel with the 2D grid and block dimensions computed above. */
affineTransformKernel<<<gridDim, blockDim>>>(dDst, dSrc, …);

2. アフィン変換: カーネルの入出力

/* Kernel signature: dDst is the writable destination image and dSrc is the
   read-only source image, both as uchar4 pixels. The remaining parameters
   are elided here. */
__global__

void affineTransformKernel(uchar4 *dDst, const uchar4 *dSrc,… )

dSrc dDst

2. アフィン変換: カーネルのスケルトン

/*
 * Skeleton of the affine-transform kernel: each thread owns one pixel of the
 * destination image.
 *
 * dDst   : destination image (uchar4 pixels).
 * dSrc   : source image (uchar4 pixels); not read yet in this skeleton.
 * width  : image width in pixels.
 * height : image height in pixels.
 * pitch  : row stride measured in pixels (uchar4 elements), not bytes,
 *          because it is used directly to index a uchar4 pointer.
 *
 * Expects a 2D launch; the thread's global (x, y) id is its pixel position.
 */
__global__
void affineTransformKernel(uchar4 *dDst, const uchar4 *dSrc,
                           int width, int height, int pitch) {
    int gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int gidy = blockDim.y * blockIdx.y + threadIdx.y;

    /* Threads whose global id lies outside the image must not write. */
    if ((gidx < width) && (gidy < height)) {
        /* Placeholder value: replace with the actual pixel computation. */
        uchar4 pixel = make_uchar4(0, 0, 0, 0);

        int myPixelPos = gidx + gidy * pitch;
        dDst[myPixelPos] = pixel;
    }
}

2. アフィン変換: 座標は「逆変換」

変換後のピクセル座標(X, Y)は、既知

(X, Y) から、(x, y)に逆変換。ピクセルをコピー。

2. アフィン変換: 逆変換

行列は、すべての変換で共通（大域的）。

— 事前に、CPU上で計算しておく。

— カーネルでは、与えられた行列を使うのみ。

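$$
\begin{pmatrix} x \\ y \\ 1 \end{pmatrix}
=
\frac{1}{ad - bc}
\begin{pmatrix} d & -b & b\,t_y - d\,t_x \\ -c & a & c\,t_x - a\,t_y \\ 0 & 0 & ad - bc \end{pmatrix}
\begin{pmatrix} X \\ Y \\ 1 \end{pmatrix}
$$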

2. アフィン変換: カーネル呼び出し

/* 2x3 affine matrix: X = a*x + b*y + tx, Y = c*x + d*y + ty. */
struct Matrix { float a, b, c, d; float tx, ty; };

Matrix matrix; // values already set (omitted)
Matrix inverted; // inverse matrix

/* The determinant of the linear part decides whether the transform is invertible. */
float det = matrix.a * matrix.d - matrix.b * matrix.c;

/* Nothing is launched when the matrix is singular (det == 0). */
if (det != 0.f) {
    /* Inverse of the 2x2 linear part. */
    inverted.a =   matrix.d / det;  inverted.b = - matrix.b / det;
    inverted.c = - matrix.c / det;  inverted.d =   matrix.a / det;

    /* Translation of the inverse transform. */
    inverted.tx = (matrix.b * matrix.ty - matrix.tx * matrix.d) / det;
    inverted.ty = (matrix.tx * matrix.c - matrix.a * matrix.ty) / det;

    dim3 blockDim(128, 4);
    dim3 gridDim(divRoundUp(width, blockDim.x), divRoundUp(height, blockDim.y));

    /* The kernel takes the pitch in pixels, so the byte pitch is divided by
       the pixel size. The source image is passed as the device pointer dSrc. */
    affineTransformKernel<<<gridDim, blockDim>>>(inverted, dDst, dSrc, width, height, pitch / sizeof(uchar4));

    /* (rest omitted) */
}

2. アフィン変換: カーネルの実装

/*
 * Affine-transform kernel: each thread produces one destination pixel by
 * mapping its position back into the source image with the inverse matrix
 * and copying the source pixel found there.
 *
 * invMat : inverse of the forward affine matrix (computed on the host).
 * dDst   : destination image (uchar4 pixels).
 * dSrc   : source image (uchar4 pixels).
 * width  : image width in pixels.
 * height : image height in pixels.
 * pitch  : row stride in pixels (uchar4 elements), used for both images.
 *
 * Expects a 2D launch; threads outside width x height do nothing.
 */
__global__ void affineTransformKernel(Matrix invMat, uchar4 *dDst, const uchar4 *dSrc,
                                      int width, int height, int pitch) {
    int gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int gidy = blockDim.y * blockIdx.y + threadIdx.y;

    if ((gidx < width) && (gidy < height)) {
        /* Use the pixel centre as the destination coordinate. */
        float X = gidx + 0.5f;
        float Y = gidy + 0.5f;

        /* Inverse transform: destination (X, Y) -> source (x, y). */
        float x = invMat.a * X + invMat.b * Y + invMat.tx;
        float y = invMat.c * X + invMat.d * Y + invMat.ty;

        uchar4 srcPixel;
        /* x is checked against width and y against height; coordinate 0 is valid. */
        if ((0.f <= x) && (x < width) && (0.f <= y) && (y < height)) {
            int srcPixelPos = int(x) + int(y) * pitch;
            srcPixel = dSrc[srcPixelPos];
        }
        else {
            /* Source position falls outside the image: write a zero pixel. */
            srcPixel = make_uchar4(0, 0, 0, 0);
        }

        dDst[gidx + gidy * pitch] = srcPixel;
    }
}

2. OpenGL Interoperability

CUDAから、 OpenGLオブジェクトをアクセス

Texture PBO/VBO などバッファ

OpenGLオブジェクト

登録

cudaGraphicsGLRegisterImage() cudaGraphicsGLRegisterBuffer()

OpenGLオブジェクト

登録解除

cudaGraphicsUnregisterResource() (Image / Buffer 共通)

リソースマップ cudaGraphicsMapResources()

リソースアンマップ cudaGraphicsUnmapResources()

CUDAオブジェクト

取得

cudaGraphicsSubResourceGetMappedArray()

cudaGraphicsResourceGetMappedPointer()

3. たたみ込み

画像フィルタ

— Gaussian Filter, Sobel Filter, Laplacian Filter…

パターンマッチング

— SAD、SSD、相関マッチング … etc

3. Gaussian Filter

元画像のピクセル x 係数

すべて足し合わせる。

— 係数を、ガウス分布とする

1スレッドで、

１ピクセルを出力

値の形式は、float

元画像

係数

足し合わせる

×

＋

3. カーネルの実装イメージ

__device__ float f(int x, int y); // returns the value of the pixel at (x, y)

/*
 * 3x3 Gaussian filter: each thread outputs one pixel as the weighted sum of
 * the 3x3 neighbourhood around its own position.
 *
 * dDst   : destination image (float pixels).
 * dSrc   : source image (float pixels); this kernel reads pixels only through
 *          f(), which is merely declared here.
 * width  : image width in pixels.
 * height : image height in pixels.
 * pitch  : row stride in float elements, used to index dDst.
 *
 * coef is the 3x3 coefficient table and must be declared before this kernel.
 * NOTE(review): f() is called with x = -1 / width and y = -1 / height at the
 * image border; it must handle those coordinates -- confirm in its definition.
 */
__global__
void gaussianKernel_3x3(float *dDst, const float *dSrc, int width, int height, int pitch) {
    int gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int gidy = blockDim.y * blockIdx.y + threadIdx.y;

    if ((gidx < width) && (gidy < height)) {
        /* One expression for all three rows; a ';' after a row would drop the rest. */
        float pixel =
              coef[0][0] * f(gidx - 1, gidy - 1) + coef[0][1] * f(gidx, gidy - 1) + coef[0][2] * f(gidx + 1, gidy - 1)
            + coef[1][0] * f(gidx - 1, gidy    ) + coef[1][1] * f(gidx, gidy    ) + coef[1][2] * f(gidx + 1, gidy    )
            + coef[2][0] * f(gidx - 1, gidy + 1) + coef[2][1] * f(gidx, gidy + 1) + coef[2][2] * f(gidx + 1, gidy + 1);

        int myPixelPos = gidx + gidy * pitch;
        dDst[myPixelPos] = pixel;
    }
}

3. Texture

GPU上のハードウエア

— Read-only、L1キャッシュが使用可能

— 端の要素の処理

Clamp、Wrap、Mirror、Border

— 線形補間も使用可能

Texture Object

— Fermi以降、CUDA 5.0以降で使用可能

— カーネルに引数として渡せる。

3. Textureオブジェクトの作成

/* Describe how the texture is sampled (texDesc) and what memory backs it (resDesc). */
cudaTextureDesc texDesc;
cudaResourceDesc resDesc;

// Clear both descriptors so that every field not set below is zero.
memset(&texDesc, 0, sizeof(texDesc));
memset(&resDesc, 0, sizeof(resDesc));

// Out-of-range coordinates are clamped to the edge in both x and y.
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
// Point sampling (no interpolation), values returned in the element type,
// coordinates given in texels rather than normalized.
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;

// The texture is backed by pitched 2D device memory holding float elements.
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = dSrc;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
resDesc.res.pitch2D.pitchInBytes = pitchInBytes;
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;

// Create the texture object; no resource view is used (NULL).
// cudaCreateTextureObject() returns a cudaError_t that real code should check.
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

カーネル実装：Texture導入

/* Fetches the value of the pixel at (x, y) from the texture object texSrc. */
__device__ float f(cudaTextureObject_t texSrc, int x, int y) {
    const float texel = tex2D<float>(texSrc, x, y);
    return texel;
}

/*
 * 3x3 Gaussian filter reading the source image through a texture object.
 * Each thread outputs one pixel as the weighted sum of the 3x3 neighbourhood
 * around its own position.
 *
 * dDst   : destination image (float pixels).
 * texSrc : texture object for the source image, forwarded to f().
 * width  : image width in pixels.
 * height : image height in pixels.
 * pitch  : row stride in float elements, used to index dDst.
 *
 * coef is the 3x3 coefficient table and must be declared before this kernel.
 * Border pixels call f() with coordinates one step outside the image; how
 * those are resolved depends on the address mode of texSrc.
 */
__global__
void gaussianKernel_3x3(float *dDst, cudaTextureObject_t texSrc, int width, int height, int pitch) {
    int gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int gidy = blockDim.y * blockIdx.y + threadIdx.y;

    if ((gidx < width) && (gidy < height)) {
        float pixel =
              coef[0][0] * f(texSrc, gidx - 1, gidy - 1) + coef[0][1] * f(texSrc, gidx, gidy - 1) + coef[0][2] * f(texSrc, gidx + 1, gidy - 1)
            + coef[1][0] * f(texSrc, gidx - 1, gidy    ) + coef[1][1] * f(texSrc, gidx, gidy    ) + coef[1][2] * f(texSrc, gidx + 1, gidy    )
            + coef[2][0] * f(texSrc, gidx - 1, gidy + 1) + coef[2][1] * f(texSrc, gidx, gidy + 1) + coef[2][2] * f(texSrc, gidx + 1, gidy + 1);

        dDst[gidx + gidy * pitch] = pixel;
    }
}

3. Constant Memory

定数専用のメモリ

— 複数のスレッドから、同じ値をアクセスするのが、前提。

— サイズは64 KB。キャッシュされる。

値の設定

— 直接初期化。

— Hostから値を設定することも可能

cudaMemcpyToSymbol()

3. カーネル実装：Texture導入

/* 3x3 Gaussian filter coefficients (1-2-1 / 2-4-2 / 1-2-1, divided by 16),
   kept in constant memory. */
__constant__ float coef[3][3] = {
    { 0.0625f, 0.125f, 0.0625f },
    { 0.125f,  0.25f,  0.125f  },
    { 0.0625f, 0.125f, 0.0625f },
};

/* Returns the pixel value read from texture texSrc at position (x, y). */
__device__ float f(cudaTextureObject_t texSrc, int x, int y) {
    float value = tex2D<float>(texSrc, x, y);
    return value;
}

/* 3x3 Gaussian filter kernel taking the destination buffer, a source texture
   object, the image size and the pitch. The body is omitted here
   ("(略)" means "omitted"). */
__global__

void gaussianKernel_3x3(float *dDst, cudaTextureObject_t texSrc, int width, int height, int pitch) {



(略)

}

3. 演算量、メモリアクセス量の算出

画像サイズ : x (pixels) * y(pixels)

— メモリ読みこみ、書き出し量

= 2 * x * y * sizeof(float) [byte]

— 演算量 = 17 * x * y [FP]

— B/F = 8 / 17 ≒ 0.48 [byte/FP]

実際のGPU = 0.04～0.08 [byte/FP]

メモリ読み込み量が多い ⇒ バンド幅律速

3. TIPS: ベクタライズによる高速化

1つのスレッドで、

複数のピクセルを処理する。

(例では、2x2)

元画像からの読み込み値は、

変数(レジスタ)に保存する

Communication-Minimizing 2D Convolution in GPU Registers

Forrest N. Iandola, David Sheffield, Michael Anderson, Phitchaya Mangpo Phothilimthana, Kurt

Keutzer, http://parlab.eecs.berkeley.edu/publication/899

元画像：レジスタに保存

係数

出力

Sobel Filter

輪郭の検出

— 係数(横方向)

＋

-1 -2 -1

0 0 0

1 2 1

-1 0 1

-2 0 2

-1 0 1

$$
v = \sqrt{v_x^2 + v_y^2}
$$

横、縦成分の合成

— 係数(縦方向)

3. ベンチマーク例

Tesla K20 ECC off, 2048 x 2048 pixels.

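| フィルタ | ベクタ化 | 性能 (GFLOPS) | バンド幅 (GB/s) | バンド幅効率 | 性能向上 |
|---|---|---|---|---|---|
| Gaussian Filter (3x3) | - | 256 | 112 | 54 % | - |
| Gaussian Filter (3x3) | 2x2 | 346 | 152 | 73 % | 35 % |
| Sobel Filter | - | 205 | 95.3 | 46 % | - |
| Sobel Filter | 2x2 | 315 | 147 | 71 % | 54 % |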

画像処理のための CUDA入門

「画像処理のためのCUDA入門」

— 日時 : 8/28、9/26 15:00～18:00

— 場所 : NVIDIA Japan 赤坂オフィス

— 定員 : 20名

— 申し込み :

http://www.nvidia.co.jp/object/event-calendar-jp.html

— 入門編。無償です。

cuda画像処理入門 - nvidia · 1. cuda で 画像処理 gpu = graphics processing unit...

Documents

cuda画像処理入門 - nvidia · 1. cuda で画像処理 gpu = graphics processing unit...