gmac global memory for acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf ·...

32
GMAC Global Memory for Accelerators Wen-mei W. Hwu, Isaac Gelado and Javier Cabezas

Upload: others

Post on 25-Aug-2020

1 views

Category:

Documents


0 download

TRANSCRIPT

Page 1: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Global Memory for Accelerators

Wen-mei W. Hwu, Isaac Gelado and Javier Cabezas

Page 2: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC in a nutshell

• GMAC: Unified Virtual Address Space for OpenCL

– Simplifies the CPU code

– Exploits advanced OpenCL features for free

– Transparent memory consistency management

• Vector addition example – Really simple kernel code

– But, what about the CPU code?

/*
 * Element-wise vector addition kernel: c[i] = a[i] + b[i].
 *
 * The kernel is named vecAdd because that is the name the host code uses to
 * look it up (clCreateKernel(program, "vecAdd", ...) in the plain OpenCL
 * version and eclGetKernel("vecAdd", ...) in the GMAC version).
 *
 * c - output vector
 * a - first input vector
 * b - second input vector
 *
 * Each work-item processes the single element selected by its global ID in
 * dimension 0. There is no bounds check, so the global work size must not
 * exceed the number of elements in the three buffers.
 */
__kernel void vecAdd(__global float *c, __global float *a, __global float *b)
{
    size_t idx = get_global_id(0);

    c[idx] = a[idx] + b[idx];
}

6/15/11 2 AMD Fusion Summit 2011

Page 3: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

CPU OpenCL code (I)

• Set-up OpenCL int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue command_queue; cl_program program; cl_kernel kernel; cl_int error_code; float *a, *b, *c; cl_mem d_a, d_b, d_c; /* Start setting up OpenCL */ error_code = clGetPlatformIDs(1, &platform, NULL); assert(error_code == CL_SUCCESS); error_code = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); assert(error_code == CL_SUCCESS); context = clCreateContext(0, 1, &device, NULL, NULL, &error_code); assert(error_code == CL_SUCCESS); command_queue = clCreateCommandQueue(context, device, 0, &error_code); assert(error_code == CL_SUCCESS); program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, &error_code); assert(error_code == CL_SUCCESS); error_code = clBuildProgram(program, 1, &device, NULL, NULL, NULL); assert(error_code == CL_SUCCESS); kernel = clCreateKernel(program, "vecAdd", &error_code); assert(error_code == CL_SUCCESS);

6/15/11 3 AMD Fusion Summit 2011

Page 4: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

CPU OpenCL code (II)

• Allocate memory and initialize data /* Alloc & init input data */ assert((a = (float *)malloc(vecSize * sizeof(float))) != NULL); d_a = clCreateBuffer(context, CL_MEM_READ_WRITE, vecSize * sizeof(float), NULL, &error_code); assert(error_code == CL_SUCCESS); read_file("vector_A.data", a, vecSize); assert((b = (float *)malloc(vecSize * sizeof(float))) != NULL); d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, vecSize * sizeof(float), NULL, &error_code); assert(error_code == CL_SUCCESS); read_file("vector_B.data", b, vecSize); /* Alloc output data */ assert((c = (float *)malloc(vecSize * sizeof(float))) != NULL); d_c = clCreateBuffer(context, CL_MEM_READ_WRITE, vecSize * sizeof(float), NULL, &error_code); assert(error_code == CL_SUCCESS); /* Copy data to the device */ assert(clEnqueueWriteBuffer(command_queue, d_a, CL_FALSE, 0, vecSize * sizeof(float), a, 0, NULL, NULL) == CL_SUCCESS); assert(clEnqueueWriteBuffer(command_queue, d_b, CL_FALSE, 0, vecSize * sizeof(float), b, 0, NULL, NULL) == CL_SUCCESS);

6/15/11 4 AMD Fusion Summit 2011

Page 5: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

CPU OpenCL code (III)

• Call the kernel and save the output /* Set kernel arguments */ assert(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_c) == CL_SUCCESS); assert(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a) == CL_SUCCESS); assert(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b) == CL_SUCCESS); /* Call the kernel */ size_t global_size = vecSize; assert(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL) == CL_SUCCESS); assert(clFinish(command_queue) == CL_SUCCESS); /* Get the results back (blocking read, so c is complete before it is saved) */ assert(clEnqueueReadBuffer(command_queue, d_c, CL_TRUE, 0, vecSize * sizeof(float), c, 0, NULL, NULL) == CL_SUCCESS); save_file("vector_C.data", c, vecSize); /* Release memory */ clReleaseMemObject(d_c); free(c); clReleaseMemObject(d_b); free(b); clReleaseMemObject(d_a); free(a); clReleaseCommandQueue(command_queue); clReleaseContext(context); return 0; }

6/15/11 5 AMD Fusion Summit 2011

Page 6: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC code sample

6/15/11 6 AMD Fusion Summit 2011

int main(int argc, char *argv[]) { float *a, *b, *c; assert(eclCompileSource(kernel_source) == eclSuccess); /* Alloc & init input data */ assert(eclMalloc((void **)&a, vecSize * sizeof(float)) == eclSuccess); read_file(“vector_A.data”, vecSize); assert(eclMalloc((void **)&b, vecSize * sizeof(float)) == eclSuccess) read_file(“vector_B.data”, vecSize); /* Alloc output data */ assert(eclMalloc((void **)&c, vecSize * sizeof(float)) == eclSuccess) /* Call the kernel */ ecl_kernel kernel; size_t globalSize = vecSize; assert(eclGetKernel("vecAdd", &kernel) == eclSuccess); assert(eclSetKernelArgPtr(kernel, 0, c) == eclSuccess); assert(eclSetKernelArgPtr(kernel, 1, a) == eclSuccess); assert(eclSetKernelArgPtr(kernel, 2, b) == eclSuccess); assert(eclCallNDRange(kernel, 1, NULL, &globalSize, NULL) == eclSuccess); save_file(“vector_C.data”, vecSize); eclFree(a); eclFree(b); eclFree(c); return 0; }

Page 7: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Supported Platforms

• Any OpenCL 1.1 compatible stack, with optimizations for:

– AMD Fusion devices

– AMD Radeon HD

– NVIDIA Tesla

• Windows 7 (64 and 32 bits)

• GNU/Linux (64 and 32 bits)

6/15/11 AMD Fusion Summit 2011 7

Page 8: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Outline

• Introduction

• GMAC Memory Model

– Asymmetric Memory

– Global Memory

• Performance Evaluation

• Conclusions

6/15/11 8 AMD Fusion Summit 2011

Page 9: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Memory Model

• Unified CPU / GPU virtual address space

• Asymmetric address space accessibility

CPU

Memory

GPU

Shared Data Accessed by CPU and GPU via same pointer

CPU Data

6/15/11 9 AMD Fusion Summit 2011

Page 10: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Implementation

• Fusion APU

• AMD Radeon HD

6/15/11 AMD Fusion Summit 2011 10

CPU

Physical Memory

GPU

CPU CPU

Physical Memory

GPU GPU

Physical Memory

Coherence

Page 11: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Consistency Model

• Implicit acquire / release primitives at accelerator call / return boundaries

6/15/11 AMD Fusion Summit 2011 11

CPU GPU

CPU GPU

Page 12: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Coherence

• Avoid unnecessary data copies

• Lazy-update: – Call: transfer modified data

– Return: transfer when needed

6/15/11 AMD Fusion Summit 2011 12

Accelerator System Memory

Accelerator Memory

CPU

Page 13: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Memory API

• Allocate shared memory eclError_t eclMalloc(void **ptr, size_t size)

– Allocated memory address (returned by reference)

– Gets the size of the data to be allocated

– Error code, eclSuccess if no error

• Example usage

#include <gmac/opencl.h>

/*
 * Example: allocating shared memory with eclMalloc().
 *
 * eclMalloc() returns the allocated address through its first argument and
 * an eclError_t status as its return value; any status other than eclSuccess
 * is reported through FATAL() together with the text from eclErrorString().
 * FOO_SIZE and FATAL() are defined outside this listing.
 */
int main(int argc, char *argv[])
{
    float *foo = NULL;
    eclError_t error;

    if ((error = eclMalloc((void **)&foo, FOO_SIZE)) != eclSuccess) {
        FATAL("Error allocating memory %s", eclErrorString(error));
    }

    /* ... use foo ... */
}

6/15/11 13 AMD Fusion Summit 2011

Page 14: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Memory API

• Release shared memory eclError_t eclFree(void *ptr)

– Memory address to be released

– Error code, eclSuccess if no error

• Example usage

#include <gmac/opencl.h>

/*
 * Example: allocating shared memory with eclMalloc() and releasing it with
 * eclFree().
 *
 * A status other than eclSuccess from eclMalloc() is reported through
 * FATAL() together with the text from eclErrorString(). The pointer obtained
 * from eclMalloc() is the one later handed to eclFree().
 * FOO_SIZE and FATAL() are defined outside this listing.
 */
int main(int argc, char *argv[])
{
    float *foo = NULL;
    eclError_t error;

    if ((error = eclMalloc((void **)&foo, FOO_SIZE)) != eclSuccess) {
        FATAL("Error allocating memory %s", eclErrorString(error));
    }

    /* ... use foo ... */

    eclFree(foo);
}

6/15/11 14 AMD Fusion Summit 2011

Page 15: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

• Functions overridden (interposition) by GMAC: – Standard C Library memory functions: memset(), memcpy()

– Standard C Library I/O: fread(), fwrite(), read(), write()

– MPI: MPI_Send(), MPI_Recv()

• Get advanced OpenCL features for free – Asynchronous highly optimized data transfers

– Pre-pinned memory

GMAC Built-in Optimizations

6/15/11 AMD Fusion Summit 2011 15

Calls to fread()

Data Transfers wait for kernel completion

Page 16: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Outline

• Introduction

• GMAC Memory Model

– Asymmetric Memory

– Global Memory

• Performance Evaluation

• Conclusions

6/15/11 16 AMD Fusion Summit 2011

Page 17: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Global Memory

• For multi-GPU systems: data accessible by all accelerators, but owned by the CPU

• Example: medium matrix in FDTD simulations

CPU

Memory

GPU

GPU

6/15/11 17 AMD Fusion Summit 2011

Page 18: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Global Memory

• Read-only data structures

– Zero-copy memory if read only once by the GPU

– Replicated data if read often by the GPU

• GMAC Global memory:

– Pre-pinned zero-copy in AMD Fusion

– Discrete GPU (e.g. AMD Radeon HD):

• Replicated data copies if enough GPU memory

• Pre-pinned zero-copy otherwise

6/15/11 AMD Fusion Summit 2011 18

Page 19: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Global memory API

• Allocate global shared Memory eclError_t eclGlobalMalloc(void **ptr, size_t size)

– Allocated memory address (returned by reference)

– Gets the size of the data to be allocated

– Error code, eclSuccess if no error

• Example usage

#include <gmac/opencl.h>

/*
 * Example: allocating global shared memory with eclGlobalMalloc().
 *
 * eclGlobalMalloc() returns the allocated address through its first argument
 * and an eclError_t status as its return value; any status other than
 * eclSuccess is reported through FATAL() together with the text from
 * eclErrorString().
 * FOO_SIZE and FATAL() are defined outside this listing.
 */
int main(int argc, char *argv[])
{
    float *foo = NULL;
    eclError_t error;

    if ((error = eclGlobalMalloc((void **)&foo, FOO_SIZE)) != eclSuccess) {
        FATAL("Error allocating memory %s", eclErrorString(error));
    }

    /* ... use foo ... */
}

6/15/11 19 AMD Fusion Summit 2011

Page 20: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Outline

• Introduction

• GMAC Memory Model

– Asymmetric Memory

– Global Memory

• Performance Evaluation

• Conclusions

6/15/11 20 AMD Fusion Summit 2011

Page 21: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Performance

• Vector Addition: worst case scenario

6/15/11 AMD Fusion Summit 2011 21

0.85

0.9

0.95

1

1.05

1.1

1.15

0

0.2

0.4

0.6

0.8

1

1.2

1.4

1.6

1.8

2

Speed-up

Execution time (seconds)

Vector Size

SpeedUp OpenCL GMAC

Page 22: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Performance

• Sobel filtering on video stream

• OpenCL:

– 2.5ms per frame

– 192 lines of code

• GMAC:

– 1.5ms per frame

– 91 lines of code

• Both OpenCL and GMAC are faster than a CPU implementation

6/15/11 AMD Fusion Summit 2011 22

Page 23: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Hands-on

• Sobel Filtering Example

• Bullet Particle Collision Demo

– OpenCL

– GMAC

6/15/11 AMD Fusion Summit 2011 23

Page 24: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Outline

• Introduction

• GMAC Memory Model

– Asymmetric Memory

– Global Memory

• Performance Evaluation

• Conclusions

6/15/11 24 AMD Fusion Summit 2011

Page 25: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Conclusions

• Single virtual address space for CPUs and GPUs

• Use OpenCL advanced features

– Automatic overlap of data communication and computation

– Get access to any GPU from any CPU thread

• Get more performance from your application more easily

6/15/11 25 AMD Fusion Summit 2011

Page 26: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Global Memory for Accelerators

http://www.multicorewareinc.com

Page 27: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Backup Slides

Page 28: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Rolling Update Data Transfers

• Overlap CPU execution and data transfers

• Minimal transfer on-demand

• Rolling-update: – Memory-block size granularity

6/15/11 AMD Fusion Summit 2011 28

Accelerator System Memory

Accelerator Memory

CPU

Page 29: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC and Multi-threading

• In the past, one host thread had one CPU

• In GMAC, each host thread has:

– One CPU

– One GPU

• A GMAC thread is running on the GPU or on the CPU, but not on both at the same time

• Create threads using what you already know – pthread_create(...)

6/15/11 29 AMD Fusion Summit 2011

Page 30: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC and Multi-threading

• Virtual memory accessibility:

– Complete address space in CPU mode

– Partial address space in GPU mode

CPU CPU

GPU GPU Memory

6/15/11 30 AMD Fusion Summit 2011

Page 31: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

GMAC Global Memory for Accelerators

http://www.multicorewareinc.com

Page 32: GMAC Global Memory for Acceleratorsdeveloper.amd.com/wordpress/media/2013/06/2908_1_final.pdf · 2013. 10. 24. · GMAC Performance •Sobel filtering on video stream •OpenCL: –

Disclaimer & Attribution The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions

and typographical errors.

The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited

to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product

differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. There is no

obligation to update or otherwise correct or revise this information. However, we reserve the right to revise this information and to

make changes from time to time to the content hereof without obligation to notify any person of such revisions or changes.

NO REPRESENTATIONS OR WARRANTIES ARE MADE WITH RESPECT TO THE CONTENTS HEREOF AND NO

RESPONSIBILITY IS ASSUMED FOR ANY INACCURACIES, ERRORS OR OMISSIONS THAT MAY APPEAR IN THIS

INFORMATION.

ALL IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE ARE EXPRESSLY

DISCLAIMED. IN NO EVENT WILL ANY LIABILITY TO ANY PERSON BE INCURRED FOR ANY DIRECT, INDIRECT, SPECIAL

OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF

EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

AMD, the AMD arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. All other names used in

this presentation are for informational purposes only and may be trademarks of their respective owners.

The contents of this presentation were provided by individual(s) and/or company listed on the title page. The information and

opinions presented in this presentation may not represent AMD’s positions, strategies or opinions. Unless explicitly stated, AMD is

not responsible for the content herein and no endorsements are implied.