opencl compiler support based on open64 for mpus+gpus

Open64 Developers Forum 2010

OpenCL Compiler Support Based on Open64 for MPUs+GPUs

Yu-Te Lin, Chung-Ju Wu, Chia-Han Lu, Shao-Chung Wang

Jenq-Kuen Lee

Department of Computer Science, National Tsing Hua University, HsinChu, Taiwan

2

Outline
• OpenCL overview
• Issues on supporting OpenCL
• On-going implementation on Open64
• Summary & Discussion

3

Open Computing Language
• Programming framework on heterogeneous platforms consisting of CPU, GPU, and other processors.
• Initially developed by Apple Inc. and submitted to Khronos Group in 2008.
• Open and Royalty-Free.
• OpenCL Language
  – C language extension
  – built-in functions
• OpenCL Runtime
  – platform APIs
  – runtime APIs

OpenCL tutorial, IEEE HotChips, Aug.23, 2009

OpenCL Framework overview

MPU GPU

__kernel void dot(__global const float4 *a __global const float4 *b __global float4 *c){ int tid = get_global_id(0); c[tid] = a[tid] * b[tid];}

OpenCL Kernels    Host Program    Application

OpenCL Framework

Platform

int main(int argc, char **argv){ ... clBuildProgram(program, ...); clCreateKernel(program, “dot”...); ...}

Runtime APIs

Platform APIs

OpenCL Compiler

OpenCL Runtime

Front-end

Back-end

Separate programs into host-side and kernel-side code fragments

Compiler
• compile OpenCL C language just-in-time

Runtime
• allow host program to manipulate context

MPU : host, kernel program
GPU : kernel program

OpenCL_for_Halifux.pdf, OpenCL overview, Intel Visual Adrenaline

5

OpenCL Sample Code

int runCL(int *a, int *b, int *c, int size){ // Device initialization clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); ...

// Program and Kernel Creation char *source = load_program_source(“kernel.cl”); cl_program program = clCreateProgramWithSource(context, 1, \ (const char**) &source, NULL, &err); cl_BuildProgram(program, 1, &device, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, “add”, &err); cl_mem buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, \ NWITEMS*sizeof(cl_uint4), NULL, NULL ); ...

// Device teardown clFinish(queue); ...}

int main(){ int a[100],b[100],c[100]; int i, size = 100; runCL(a, b, c, size); return 0;}

__kernel void add( __global float *a, __global float *b, __global float *c){ int gid = get_global_id(0); c[gid] = a[gid] + b[gid];}

int main(){ int a[100],b[100],c[100]; int size = 100; int i;

for(i=0;i<size;i++) c[i] = a[i] + b[i];

return 0;}

kernel.cl    main.c

OpenCL Compiler is embedded in API

Configure platform devices

Prepare memory object

Convert kernel source into character array

Dispatch kernels on devices

OpenCL Runtime

6

OpenCL Execution Scenario

kernel{ code fragment 1 code fragment 2 code fragment 3}

kernel A{ code 1 code 2 code 3}

X O

1 2 3

Task-level parallelism

A:1,2,3 data(0)

A:1,2,3 data(1)

A:1,2,3 data(2)

A:1,2,3 data(3)

A:1,2,3 data(4)

A:1,2,3 data(5)

kernel A{ code 1}

kernel B{ code 2}

kernel C{ code 3}

A:1 B:2 C:3

A:1 B:2 C:3

OpenCL Runtime

Data-level parallelism

7

Supporting OpenCL
• Syntax parsing by compiler
  – qualifier
  – vector
  – built-in function
  – Optimizations on single core
• Runtime implementation
  – handle multi-core issues
  – Co-work with device vendor

__kernel void add( __global float4 *a, __global float4 *b, __global float4 *c){ int gid = get_global_id(0); float4 data = (float4) (1.0, 2.0, 3.0, 4.0); c[gid] = a[gid] + b[gid] + data;}

Runtime APIs    Platform APIs

MPU GPU

8

OpenCL compiler implementation
• Nowadays, LLVM is commonly used as the solution.
  – Apple
  – ATI
  – NVIDIA
  – RapidMind
  – ...
• Clang is the front-end parser.

9

ATI SDK OpenCL support
• We used the ATI SDK to try out OpenCL programs

__kernel void memset( __global uint4 *dst ){ uint4 a = (uint4)(1, 2, 3, 4); uint4 b = (uint4)(97,98,99,100); uint4 c = a * b; dst[get_global_id(0)] = c; }

func 1204 ; __OpenCL_memset_kernel......dcl_literal l7, 0x00000001, 0x00000001, 0x00000001, 0x00000001; int: 1dcl_literal l8, 0x00000004, 0x00000004, 0x00000004, 0x00000004; int: 4...... (more)...... (more)dcl_literal l19, 0x00000040, 0x00000040, 0x00000040, 0x00000040; int: 64dcl_literal l20, 0x00000050, 0x00000050, 0x00000050, 0x00000050; int: 80dcl_num_thread_per_group 64, 1, 1 dcl_raw_uav_id(1)......... (more)endfunc;ARGEND:__OpenCL_memset_kernelfunc 1205 ; memset

mov r176.x___, r1.xxxxmov r177, l7mov r177, r177.xxxxmov r178, l8mov r179, l9iadd r177, r177.xyz0, r178.000xmov r181, l10iadd r177, r177.xy0w, r181.00x0mov r181, l11mov r179, r179.xxxxmov r182, l12iadd r177, r177.x0zw, r181.0x00mov r1049, l13mov r2.x___, r176.xxxxmov r1.x___, r1049.xxxxcall 1061 ; get32BitStorePrivateiadd r176, r179.xyz0, r182.000xmov r179, l14mov r1050, l15mov r2, r177mov r1.x___, r1050.xxxxcall 1063 ; get128BitStorePrivateiadd r176, r176.xy0w, r179.00x0mov r179, l16mov r1051, l17mov r2, r177mov r1.x___, r1051.xxxx

dcl_literal l7, 0x00000061, 0x00000061, 0x00000061, 0x00000061; int: 97dcl_literal l8, 0x00000190, 0x00000190, 0x00000190, 0x00000190; int: 400.........dcl_literal l10, 0x00000129, 0x00000129, 0x00000129, 0x00000129; int: 297dcl_literal l11, 0x000000c4, 0x000000c4, 0x000000c4, 0x000000c4; int: 196.........func 1210 ; memset

mov r176.x___, r1.xxxxcall 1111 ; __amdil_get_global_id_intmov r177, r1mov r178, l7mov r178, r178.xxxxmov r179, l8mov r177, r177.x000 mov r181, l9iadd r178, r178.xyz0, r179.000xmov r179, l10ishl r177.x___, r177.xxxx, r181.xxxxiadd r178, r178.xy0w, r179.00x0mov r179, l11iadd r176.x___, r176.xxxx, r177.xxxxiadd r177, r178.x0zw, r179.0x00mov r2, r177mov r1.x___, r176.xxxxcall 1080 ; get128BitStoreUAVret

endfunc ; memset

LLVM O0 compilation

internal & linker optimization

{ 97, 97, 97, 97 }

{ 97, 97, 97, 400 }

{ 97, 97, 297, 400 }

{ 97, 196, 297, 400 }

call 1063 ; get128BitStorePrivateiadd r176, r176.x0zw, r179.0x00mov r1052, l18mov r2, r176mov r1.x___, r1052.xxxxcall 1063 ; get128BitStorePrivatemov r1053, l19mov r2, r176mov r1.x___, r1053.xxxxcall 1063 ; get128BitStorePrivatemov r1054, l17mov r1.x___, r1054.xxxxcall 1066 ; get128BitLoadPrivatemov r177, r1imul r176, r177, r176mov r1055, l20mov r2, r176mov r1.x___, r1055.xxxxcall 1063 ; get128BitStorePrivatemov r1056, l13mov r1.x___, r1056.xxxxcall 1064 ; get32BitLoadPrivatemov r176.x___, r1.xxxxmov r177, l13mov r1.x___, r177.xxxxcall 1027 ; get_global_idmov r177.x___, r1.xxxxishl r177.x___, r177.xxxx, r178.xxxxiadd r176.x___, r176.xxxx, r177.xxxxmov r1057, l20mov r1.x___, r1057.xxxxcall 1066 ; get128BitLoadPrivatemov r177, r1mov r2, r177mov r1.x___, r176.xxxxcall 1080 ; get128BitStoreUAVret

endfunc ; memset

10

Our experiences on Open64
• PACDSP Compiler
  – Based on Open64 compiler
  – Intermediate Representation: WHIRL
  – CGIR level implementation
• PACDSP impact
  – Distributed Register Files
  – 5-way VLIW

11

Previous Works for Distributed Register Files
• Local register allocation
  – Compiler Supports and Optimizations for PAC VLIW DSP Processors [Lin, LCPC’05]
  – Register Allocation for VLIW DSP Processors with Irregular Register Files [Lin, CPC’06]
  – PALF: Compiler Supports for Irregular Register Files in Clustered VLIW DSP Processors [Lin, CC:PE’07]
  – Effective Code Generation for Distributed and Ping-Pong Register Files: a Case Study on PAC VLIW DSP Cores [Lin, JoVSPS’08]
  – Expression Rematerialization for VLIW DSP Processors with Distributed Register Files [Wu, CPC’09]
• Global register allocation
  – A Local-Conscious Global Register Allocator for VLIW DSP Processors with Distributed Register Files [Lu, CPC’07]
  – LC-GRFA: Global Register File Assignment with Local Consciousness for VLIW DSP Processors with Non-uniform Register Files [Lu, CC:PE’09]
• Local optimization
  – Copy Propagation Optimizations for VLIW DSP Processors with Distributed Register Files [Wu, LCPC’06]
• Loop optimization
  – Enabling Compiler Flow for Embedded VLIW DSP Processors with Distributed Register Files [Chen, LCTES’07]

12

Our on-going implementation

Front-end    Back-end

cc142 .spin wgen42 WHIRL CGIR Opt asm

OpenCL spec.

GCC 4 parser

GCC 4 tree structure

vendor architecture

vendor runtime library

WHIRL node

vendor toolkits

.c .B .s

1. Qualifier
• WHIRL symbol table

2. Vector
• initialization
• operation

3. built-in function
• extract essential module from ATI SDK

GPU asm code

CPU asm code

preserve qualifier information

vector type optimizations

High WHIRL

Mid WHIRL

Low WHIRL

Very High WHIRL

Very Low WHIRL

13

Qualifier Parsing
• Support OpenCL qualifiers
  – __global, __local, __constant, __private, and __kernel
• Add function “c_parser_ocl_qualifiers” to parse qualifiers and build the attribute tree node
• Add functions “handle_global_attribute”, “handle_local_attribute”, etc. to set flags.

static tree c_parser_ocl_qualifiers (c_parser *parser){ …… attr = build_tree_list (attr_name, NULL_TREE); attrs = chainon(attrs,attr); …… return attrs;}

static tree handle_global_attribute (tree *node,…….){ DECL_GLOBAL (*node) = 1; return NULL_TREE;}

14

Qualifier Parsing

15

Vector Parsing
• Built-in Vector Data Types
  – Support OpenCL vector data types such as float4, int4…
  – With the help of the GCC vector extension.
  – The idea comes from using a union to define an OpenCL vector.
  – Create the built-in vector tree node in the GCC front-end.

typedef float __ocl_float4 __attribute((vector_size(16)));typedef union { struct {float x, y, z, w}; __ocl_float4 _ocl_vec;} float4;

d = build_decl (FIELD_DECL, get_identifier (“w”, float_type_node);TREE_CHAIN(d) = decls;……vec = build_vector_type_for_mode(float_type_node, V4SFmode);d = build_decl (FIELD_DECL, get_identifier ("_ocl_vec"), vec);TREE_CHAIN(d) = decls;……record_builtin_type (RID_TYPEDEF, "float4", float4_type_node);

16

Vector Parsing
• Vector Initialization
  – Add c_parser_parend_init() for vector data initialization.
  – Similar to c_parser_braced_init() used for array initialization.
    • a = (float4) (3.0, 5.0, 7.0, 9.0)
• Vector Assignment
  – Add additional fields in c_expr to record vector expressions.
    • a.x = b.y or a.xyzw = b.xxyy
  – Modify c_parser_postfix_expression_after_primary() to reference vector elements and store them in c_expr.
  – Use build_modify_expr() to build vector assignment.
• Vector Operation
  – Type casting for scalars or vectors with different numbers of components.
  – Use GCC’s vector operation if possible.

17

Vector Parsing
• Data Initialization

OpenCL source

WHIRL

INIT Symtab

18

Vector Parsing
• Binary Operation

19

Runtime Compilation Flow on Multi-Core CPU(x86)

clc    prelink.bc

Internal optimizer and linker

builtin-x86.bc

opt.s

ld    as

opencc

Reuse stub code and metadata

kernel.cl

builtin-x86.bc    llvm-extract/llc

stub/metadata

OpenCL_kernel.s

lib.c

ATI SDK

Open64

libatiocl.so

When the Runtime API clBuildProgram is invoked.....

clc : OpenCL-LLVM compiler front-end
.bc : LLVM Intermediate Representation

20

ATI Sample Test

21

Experiment Set 1
• x86 optimization testing
  – 4-core CPU, Linux OS, ATI SDK
  – Small programs for common optimizations

for(i=0;i<100;i++) for(j=0;j<100;j++) for(k=0;k<9000;k++) output[tid] = input[tid] * multiplier;

for(i=0;i<5000;i++) { for(j=0;j<2000;j++) { output[101] = sum * 3; output[102] = sum / 2; output[i%100] = i + j; } }

unsigned int count = 0; unsigned int i;

count += (input[i%100] > 5) ? 1 : 2; int a, b, c, d, e; a = value;

b = a; c = b; d = c; e = d; output[100] = e;

int index,sum; int a = 10, b = 2;

sum = 1 + 2 + 3 + 4; sum = sum + 5 + 6 + 7 + 8 + 9 + 10; sum += a * b + index;

int i, j, k; int sum, a=1, b=2;

sum = a + b;

for(i=0;i<100;i++) for(j=0;j<100;j++) for(k=0;k<9000;k++) a = i + j + k + b;

output[100] = sum;

int i; int a, b, c, d, e, f;

d = a + b + c + 20; e = (a + b + c) * 5; f = a + b + c - 99;

Loop Nested Optimization

Global Code Hoisting

If-conversion

copy propagation

common available expression

dead code elimination

constant folding

22

Experiment Set 2
• ATI OpenCL samples testing
  – 4-core CPU, Linux OS, ATI SDK
  – Real-world OpenCL samples

uint count = 0;float prev_diff = (diagonal[0] - x);count += (prev_diff < 0)? 1 : 0;for(uint i = 1; i < width; i += 1) { float diff = (diagonal[i] - x) - ((offDiagonal[i-1] * offDiagonal[i-1])/prev_diff); count += (diff < 0)? 1 : 0; prev_diff = diff; }return count;

EigenValue

for(uint k=0; k < blockWidth; k++) { uint index1 = (inverse)? i*blockWidth + k : k * blockWidth + i; uint index2 = getIdx(groupIdx, groupIdy, j, k, blockWidth, width); acc += dct8x8[index1] * input[index2]; }inter[j*blockWidth + i] = acc;barrier(CLK_LOCAL_MEM_FENCE);acc = 0.0f;for(uint k=0; k < blockWidth; k++) { uint index1 = i* blockWidth + k; uint index2 = (inverse)? j*blockWidth + k : k* blockWidth + j; acc += inter[index1] * dct8x8[index2]; } output[idx] = acc;

DCT

int tid = get_global_id(0); int i = tid%width;int j = tid/width; float x0 = ((i*scale) - ((scale/2)*width))/width;float y0 = ((j*scale) - ((scale/2)*width))/width; float x = x0;float y = y0; float x2 = x*x;float y2 = y*y; float scaleSquare = scale * scale; uint iter=0;for(iter=0; (x2+y2 <= scaleSquare) && (iter < maxIterations); ++iter) { y = 2 * x * y + y0; x = x2 - y2 + x0; x2 = x*x; y2 = y*y; }

Mandelbrot

sum0.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x;sum0.y += tempA0.x * tempB0.y + tempA0.y * tempB1.y + tempA0.z * tempB2.y + tempA0.w * tempB3.y;sum0.z += tempA0.x * tempB0.z + tempA0.y * tempB1.z + tempA0.z * tempB2.z + tempA0.w * tempB3.z;sum0.w += tempA0.x * tempB0.w + tempA0.y * tempB1.w + tempA0.z * tempB2.w + tempA0.w * tempB3.w;




MatrixMultiplication

Common Optimization Experiment
• Taking a multi-core CPU as the OpenCL device to test optimizations
• Comparison of execution time; the y-axis is the improvement

Copy Propagation

Constant Folding

Dead Code Elimination

Common Subexpression Elimination

If-Conversion

Loop Nested Optimization

Global Code Hoisting

-100% -75% -50% -25% 0% 25% 50% 75% 100% 125% 150%

5.01% 15.82% 0.75% 0.12% 2.72% 14.53%

137.54%

Comparison of Optimizations on CPU(x86)

ATI LLVM Open64

ATI OpenCL Sample Experiment
• The performance comparison, based on ATI SDK LLVM, is shown in the following graph.
• We take four classical sample codes from the ATI SDK.

MatrixMultiplication EigenValue DCT Mandelbrot

-50%

-40%

-30%

-20%

-10%

0%

10%

20%

30%

40%

-20.93%

27.13%

3.02% -0.21%

Comparison of Real Program on Multi-Core CPU(x86)

ATI LLVM Open64

25

Reviewers' Comments
1. Is it sufficient to test it on CPU instead of GPU?
   Difference: back-end code generation & runtime compilation flow
2. How to handle the multi-dimensional thread execution model?
   From the OpenCL perspective, the programmer is in charge of it.
3. How is a kernel function compiled differently from a non-kernel function?
   Host Program: Host Compiler
   Kernel Program: OpenCL Compiler
4. Does your OpenCL compiler handle both MPU and GPU code or just the GPU code? If both, what is the MPU architecture?
   OpenCL compiler only generates ONE target assembly code.
5. Why is ATI GPU targeted rather than the nVidia GPU?
   Simply pick one up to start research work.

26

Summary/Discussion
• Supporting OpenCL
  – OpenCL compiler
  – OpenCL runtime
• OpenCL compiler on Open64
  – front-end & back-end
• Preliminary Experiment
  – Open64 performs good optimizations
  – Refinements are still required
• Future Work
  – Keep going on GPU targ-info

27

THANK YOU !!

opencl compiler support based on open64 for mpus+gpus

Documents

opencl compiler support

open64 compiler

open64 background

language specification

taiwan open64 developers

platform model

academic ongoing work

national tsing hua university