parallelization of molecular dynamics danijel novaković, faculty of electronic engineering - niš,...
Post on 17-Jan-2016
216 Views
Preview:
TRANSCRIPT
Parallelization of molecular dynamics
Danijel Novaković, Faculty of Electronic Engineering - Niš, Serbia
dakidaki@gmail.com
10. December, TU Ilmenau
IAESTE LC Ilmenau IAESTE LC Niš
Plan of speech
• The representation of the problem based on physics
• Description of the C code
• Description of the Cuda code
• Optimization and suggestions for future development
• Conclusion
Plan of speech
• The representation of the problem based on physics
• Description of the C code
• Description of the Cuda code
• Optimization and suggestions for future development
• Conclusion
Definition of problem 1Definition of problem 1
Definition of problem 2Definition of problem 2
Definition of problem 3Definition of problem 3
Plan of speech
• The representation of the problem based on physics
• Description of the C code
• Description of the Cuda code
• Optimization and suggestions for future development
• Conclusion
C codeC code
Fortran F95 C prog. language • constants.c;
• trajektorievars.c;
• inputoutput.c;
• trajektoriem.c;
• efef.c;
• main.c
C code 2C code 2• trajektoriem.c– int relation(float spy) – void doit_line()
C code 3C code 3
Why Cuda, not C or Fortran?Why Cuda, not C or Fortran?
• A Cuda program should be faster
• Parallel calculations
• Less time for complicated calculations
• Better performance
What is CudaWhat is Cuda
• Compute Unified Device Architecture
– C programming language on GPUs
– Requires no knowledge of graphics APIs or GPU programming
– Stable, available (for free), documented and supported
– For both Windows and Linux
– Designed and developed by NVIDIA
» Requires an NVIDIA GPU (GeForce 8xxx/Tesla/Quadro)
SIMDSIMD
• P - pipe• Different data in each P• Unique control unit
SIMD+
A BC D
A+B
C+D
Grid of thread blocks
- The computational grid consists of a grid of thread blocks
- Each thread executes the kernel
- The application specifies the grid and block dimensions
- The grid layouts can be 1, 2, or 3-dimensional
- The maximal sizes are determined by GPU memory and kernel complexity
- Each block has a unique block ID
- Each thread has a unique thread ID (within the block)
Special variablesSpecial variables
• Special variables for thread identification in kernel
– dim3 threadIdx
– dim3 blockIdx
– dim3 blockDim
outdata[blockIdx.x*blockDim.x+threadIdx.x] = relation(indata[blockIdx.x*blockDim.x+threadIdx.x], … );
Memory modelMemory model
Cuda codeCuda code
• setup_rand.cu• run.cu• kernel.cu• main.c• inputoutput.c• trajektorievars.c• definicije.h
• CUDA programs have a basic flow
– The host initializes an array with data.
– The array is copied from the host to the memory on the CUDA device.
– The CUDA device operates on the data in the array.
– The array is copied back to the host.
Cuda code Cuda code
setup_rand.cusetup_rand.cu
/*
 * Bookkeeping for one float array that is mirrored between the caller's
 * buffer, a host staging buffer and a buffer on the CUDA device.
 */
struct DevMemData {
    float* orig_data;             // original data
    float* h_data;                // copy of original data, stored on the host
    float* d_data;                // copy of original data, stored on the Cuda device
    int n;                        // number of elements in arrays
    unsigned int memsize;         // size of one array in bytes (n * sizeof(float))
    unsigned int sharedDeviceMem; // set to 0 by allocateDeviceMem; not read in the code shown
    int init;                     // non-zero once the buffers have been allocated
};

// Array of descriptors allocated in CudaInit; CudaRun uses entries [0] and [1].
DevMemData* GPUMemoryData;

// Kernel launch configuration: filled in by allocateDeviceMem and passed to
// the run_kernel launch in CudaRun.
dim3 grid;
dim3 threads;
The host initializes an array with dataThe host initializes an array with data
/*
 * Copies memData->n floats from the caller's array (orig_data) into the
 * host staging buffer (h_data).
 * Prints an error and copies nothing when memData->init is 0.
 */
void memDataCopyOrigHost (DevMemData* memData)
{
    int i;

    if (memData->init == 0) {
        printf("Error: memDataCopyOrigHost: memData not initialised.\n");
        return;
    }
    for (i = 0; i < memData->n; i++)
        memData->h_data[i] = memData->orig_data[i];
}

/*
 * Copies memData->n floats from the host staging buffer (h_data) back into
 * the caller's array (orig_data).
 * Prints an error and copies nothing when memData->init is 0.
 */
void memDataCopyHostOrig (DevMemData* memData)
{
    int i;

    if (memData->init == 0) {
        // The message names this function (it previously named its sibling).
        printf("Error: memDataCopyHostOrig: memData not initialised.\n");
        return;
    }
    for (i = 0; i < memData->n; i++)
        memData->orig_data[i] = memData->h_data[i];
}
/*
 * Uploads the host staging buffer (h_data) to the device buffer (d_data).
 * memData->memsize bytes are transferred with a blocking cudaMemcpy.
 * Prints an error and transfers nothing when memData->init is 0, matching
 * the host-side copy helpers.
 */
void memDataCopyHostDevice(DevMemData* memData)
{
    if (memData->init == 0) {
        printf("Error: memDataCopyHostDevice: memData not initialised.\n");
        return;
    }
    CUDA_SAFE_CALL( cudaMemcpy( memData->d_data, memData->h_data,
                                memData->memsize, cudaMemcpyHostToDevice) );
}
The array is copied from the host to the memory on the CUDA device.
setup_rand.cusetup_rand.cu
/*
 * Downloads the device buffer (d_data) into the host staging buffer (h_data).
 * memData->memsize bytes are transferred with a blocking cudaMemcpy.
 * Prints an error and transfers nothing when memData->init is 0, matching
 * the host-side copy helpers.
 */
void memDataCopyDeviceHost(DevMemData* memData)
{
    if (memData->init == 0) {
        printf("Error: memDataCopyDeviceHost: memData not initialised.\n");
        return;
    }
    CUDA_SAFE_CALL( cudaMemcpy( memData->h_data, memData->d_data,
                                memData->memsize, cudaMemcpyDeviceToHost) );
}
a transfer of the data from the array in the CUDA device
memory back to the array in the host memory
setup_rand.cusetup_rand.cu
/*
 * Fills in one DevMemData descriptor for an array of n floats: records the
 * caller's array, allocates the host staging buffer and the device buffer,
 * and uploads the data (orig -> host -> device).
 * Also sets the global launch configuration (grid, threads) for n elements.
 *
 * original_data - caller-owned array of n floats
 * n             - number of elements; must be positive
 * memData       - descriptor to fill in; memData->init is 1 on success and
 *                 0 if nothing was allocated
 */
void allocateDeviceMem(float* original_data, int n, DevMemData* memData)
{
    const int maxThreadsPerBlock = 256;
    int blockSize;

    memData->init = 0;
    if (n <= 0) {
        printf("Error: allocateDeviceMem: n must be positive.\n");
        return;
    }

    // One thread per element. The previous setting (grid.x = n together with
    // threads.x = n) launched n*n threads for n elements.
    // When n is not a multiple of the block size the last block has spare
    // threads, so the kernel has to ignore indices >= n.
    blockSize = (n < maxThreadsPerBlock) ? n : maxThreadsPerBlock;
    threads.x = blockSize;
    threads.y = 1;
    threads.z = 1;
    grid.x = (n + blockSize - 1) / blockSize;   // ceiling division
    grid.y = 1;
    grid.z = 1;

    memData->orig_data = original_data;
    memData->n = n;
    memData->memsize = memData->n * sizeof(float);
    memData->sharedDeviceMem = 0;

    // allocate array on host
    memData->h_data = (float*) malloc(memData->memsize);
    if (memData->h_data == NULL) {
        printf("Error: allocateDeviceMem: host allocation failed.\n");
        return;
    }

    // allocate array on device
    CUDA_SAFE_CALL( cudaMalloc( (void**) &memData->d_data, memData->memsize));

    memData->init = 1;
    memDataCopyOrigHost(memData);
    memDataCopyHostDevice(memData);
}
setup_rand.cusetup_rand.cu
/*
 * Selects the CUDA device and prepares the two device-mirrored arrays.
 *
 * deviceid - index of the device to use
 * h_data   - first host array of n floats  (becomes GPUMemoryData[0])
 * hh_data  - second host array of n floats (becomes GPUMemoryData[1])
 * n        - number of elements in each array
 *
 * On failure (no such device, or out of host memory) a message is printed
 * and GPUMemoryData is left unallocated.
 */
void CudaInit(int deviceid, float* h_data, float* hh_data, int n)
{
    int gpunum;

    printf("CUDA Init Start\n");

    // returns the number of compute-capable devices
    CUDA_SAFE_CALL(cudaGetDeviceCount(&gpunum));
    printf("GPU Num: %i\n", gpunum);
    if (deviceid < 0 || gpunum < deviceid + 1) {
        printf("Not enough GPUs available!!");
        return;
    }

    // sets device to be used for GPU executions
    CUDA_SAFE_CALL(cudaSetDevice(deviceid));

    // Allocate the descriptors only once a usable device is known, so the
    // early return above does not leak them.
    GPUMemoryData = (struct DevMemData*) malloc(2 * sizeof(struct DevMemData));
    if (GPUMemoryData == NULL) {
        printf("Error: CudaInit: host allocation failed.\n");
        return;
    }

    allocateDeviceMem(h_data, n, &GPUMemoryData[0]);
    allocateDeviceMem(hh_data, n, &GPUMemoryData[1]);
}
setup_rand.cusetup_rand.cu
Function Type Qualifiers
__device__
The __device__ qualifier declares a function that is:
– Executed on the device
– Callable from the device only
Example: __device__ int relation(float spy, …)
__global__
The __global__ qualifier declares a function as being a kernel. Such a function is:
– Executed on the device
– Callable from the host only
Example: __global__ void run_kernel(float* indata, …)
__host__
The __host__ qualifier declares a function that is:
– Executed on the host
– Callable from the host only
Example: __host__ int CudaInit(int deviceid, …)
Kernels functionsKernels functions
• Cuda extends C by allowing the programmer to define C functions, called kernels, that, when called, are executed N times in parallel by N different Cuda threads, as opposed to only once like regular C functions.
• __global__ void kernelName <<< a,b >>> ( … )a - the number of blocks,b - the number of threads in each block
ran.curan.cu
/*
 * Launches run_kernel on the data prepared by CudaInit and copies both
 * arrays back into the caller's buffers.
 *
 * GPUMemoryData[0] is the kernel input, GPUMemoryData[1] the kernel output.
 * b, UdK, kappa, dt are forwarded unchanged to the kernel.
 *
 * If the launch or the execution fails, the CUDA error string is printed and
 * the host arrays are left untouched.
 */
void CudaRun(float b, float UdK, float kappa, float dt)
{
    cudaError_t err;

    // kernel execution directive
    run_kernel<<<grid, threads>>>(GPUMemoryData[0].d_data, GPUMemoryData[1].d_data,
                                  GPUMemoryData[0].n, b, UdK, kappa, dt);

    err = cudaGetLastError();             // errors from the launch itself
    if (err == cudaSuccess)
        err = cudaThreadSynchronize();    // wait for compute-device to finish
    if (err != cudaSuccess) {
        // Report only real failures; the message used to be printed on
        // every run, including successful ones.
        printf("ERROR-CUDA: %s\n", cudaGetErrorString(err));
        return;
    }
    CUT_CHECK_ERROR("Kernel execution failed");

    // device -> host staging buffer -> caller's array, for both arrays
    memDataCopyDeviceHost(&GPUMemoryData[0]);
    memDataCopyHostOrig(&GPUMemoryData[0]);
    memDataCopyDeviceHost(&GPUMemoryData[1]);
    memDataCopyHostOrig(&GPUMemoryData[1]);
}
/*
 * Kernel: each thread evaluates relation() for one input element.
 *
 * Expects a 1-D launch; the element index is
 * blockIdx.x * blockDim.x + threadIdx.x.
 *
 * indata  - device array of n input values (passed to relation() as spy)
 * outdata - device array of n floats; element i receives the int result of
 *           relation() converted to float
 * n       - number of elements in indata / outdata
 * b, UdK, kappa, dt - forwarded unchanged to relation()
 */
__global__ void run_kernel(float* indata, float* outdata, int n,
                           float b, float UdK, float kappa, float dt)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the arrays must not touch memory: the launch
    // may contain more threads than there are elements.
    if (idx < n) {
        outdata[idx] = relation(indata[idx], b, UdK, kappa, dt);
    }
}
kernel.cukernel.cu
/*
 * Integrates one trajectory step by step and reports whether it ended
 * inside the capture radius.
 *
 * spy   - starting y coordinate; the start point is placed at
 *         x = -sqrt(b*b - spy*spy), i.e. at distance b from the origin
 * b     - outer radius: integration stops once the distance from the origin
 *         exceeds b
 * UdK, kappa - forwarded unchanged to derivs() and rk4()
 * dt    - time step added to the running time after every rk4() call
 *
 * Returns 1 if the distance from the origin became <= rP + a, otherwise 0.
 *
 * NOTE: the loop has no iteration limit. It only ends when one of the two
 * distance tests succeeds, so it never ends if r becomes NaN (for example
 * when |spy| > b makes the start x coordinate NaN).
 */
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    float r, rpa, ti = 0.0f;
    float4 x, dxdt;
    int trapped = 0, outside = 0;   // 0 - false; 1 - true

    // x.x / x.y hold the position; x.z / x.w are the components that derivs()
    // returns as the time derivatives of x.x / x.y.
    x.y = spy;
    x.x = -sqrtf(b * b - x.y * x.y);
    x.z = Um;
    x.w = 0.0f;
    rpa = rP + a;

    while (trapped == 0 && outside == 0) {
        dxdt = derivs(ti, x, UdK, kappa);
        x = rk4(x, dxdt, ti, dt, UdK, kappa);   // Runge Kutta method
        ti += dt;
        r = sqrtf(x.x * x.x + x.y * x.y);       // distance from the origin
        if (r <= rpa) trapped = 1;
        if (r > b) outside = 1;
    }
    return trapped;
}
kernel.cukernel.cu
float4,float3,float2,int4,int3,int2float4,float3,float2,int4,int3,int2
• float4 temp;– temp.x– temp.y– temp.z– temp.w
/*
 * Right-hand side of the equations of motion for one state vector.
 *
 * ti         - current time; not used in this function
 * x          - state: the derivatives of x.x and x.y are taken from x.z and x.w
 * UdK, kappa - forwarded unchanged to FStokes()
 *
 * Returns (x.z, x.w, F.x, F.y), where F is the result of FStokes().
 */
__device__ float4 derivs(float ti, float4 x, float UdK, float kappa)
{
    float2 F;
    float4 dxdt;

    F = FStokes(x, UdK, kappa);   // computes the force F from the state x
    dxdt.x = x.z;
    dxdt.y = x.w;
    dxdt.z = F.x;
    dxdt.w = F.y;
    return dxdt;
}
kernel.cukernel.cu
/*
 * Program entry point as shown on the slide: sets up the constants, reads
 * the input array, allocates the output array and initialises CUDA on
 * device 0.
 *
 * Returns 0 on success and 1 if the output array cannot be allocated.
 * (The slide version returned 1 unconditionally, which a shell reads as
 * failure.)
 */
int main()
{
    int deviceid = 0;
    float dt;
    float* inarray, *outarray;

    setconst();
    initrP();
    dt = initTime();   // kept for its call; dt is not used further in this excerpt
    inarray = input(5);

    outarray = (float*)malloc(num * sizeof(float));
    if (outarray == NULL)
        return 1;

    CudaInit(deviceid, inarray, outarray, num);
    return 0;
}
kernel.cukernel.cu
Plan of speech
• The representation of the problem based on physics
• Description of the C code
• Description of the Cuda code
• Optimization and suggestions for future development
• Conclusion
Future developmentFuture development
/*
 * Starting point for the "future development" discussion: integrates one
 * trajectory step by step and reports whether it ended inside the capture
 * radius.
 *
 * spy   - starting y coordinate; the start point is placed at
 *         x = -sqrt(b*b - spy*spy), i.e. at distance b from the origin
 * b     - outer radius: integration stops once the distance from the origin
 *         exceeds b
 * UdK, kappa - forwarded unchanged to derivs() and rk4()
 * dt    - time step added to the running time after every rk4() call
 *
 * Returns 1 if the distance from the origin became <= rP + a, otherwise 0.
 *
 * NOTE: the loop has no iteration limit; it ends only when one of the two
 * distance tests succeeds.
 */
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    float r, rpa, ti = 0.0f;
    float4 x, dxdt;
    int trapped = 0, outside = 0;   // 0 - false; 1 - true

    x.y = spy;
    x.x = -sqrtf(b * b - x.y * x.y);
    x.z = Um;
    x.w = 0.0f;
    rpa = rP + a;

    while (trapped == 0 && outside == 0) {
        dxdt = derivs(ti, x, UdK, kappa);
        x = rk4(x, dxdt, ti, dt, UdK, kappa);   // Runge Kutta method
        ti += dt;
        r = sqrtf(x.x * x.x + x.y * x.y);       // distance from the origin
        if (r <= rpa) trapped = 1;
        if (r > b) outside = 1;
    }
    return trapped;
}
Future developmentFuture development
/*
 * Variant of relation() with a safety limit on the number of steps.
 *
 * The trajectory starts at (-sqrt(b*b - spy*spy), spy) and is advanced with
 * derivs() + rk4() until its distance from the origin is <= rP + a
 * (captured), is > b (escaped), or 100000000 steps have been taken.
 *
 * Returns 1 if the trajectory was captured, otherwise 0.
 */
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    int captured = 0;
    int escaped = 0;
    float t = 0;
    float4 state;

    state.y = spy;
    state.x = (float) (-sqrt(b*b - state.y*state.y));
    state.z = Um;
    state.w = 0.0;
    const float captureRadius = rP + a;

    // step counts the iteration about to run; the body executes for
    // step = 1 .. 100000000 at most.
    for (int step = 1; captured == 0 && escaped == 0 && step < 100000001; ++step) {
        const float4 slope = derivs(t, state, UdK, kappa);
        state = rk4(state, slope, t, dt, UdK, kappa);
        t += dt;
        const float dist = (float) (sqrt(state.x*state.x + state.y*state.y));
        if (dist <= captureRadius) captured = 1;
        if (dist > b) escaped = 1;
    }
    return captured;
}
Future developmentFuture development
/*
 * Variant of relation() with a step limit and a diffusion step.
 *
 * The trajectory starts at (-sqrt(b*b - spy*spy), spy). Every iteration
 * calls derivs() and rk4(), then replaces the state with the result of
 * difusion(). The loop ends when the distance from the origin is <= rP + a
 * (captured), is > b (escaped), or 100000000 steps have been taken.
 *
 * Returns 1 if the trajectory was captured, otherwise 0.
 */
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    int captured = 0;
    int escaped = 0;
    float t = 0;
    float4 state;

    state.y = spy;
    state.x = (float) (-sqrt(b*b - state.y*state.y));
    state.z = Um;
    state.w = 0.0;
    const float captureRadius = rP + a;

    // step counts the iteration about to run; the body executes for
    // step = 1 .. 100000000 at most.
    for (int step = 1; captured == 0 && escaped == 0 && step < 100000001; ++step) {
        const float4 slope = derivs(t, state, UdK, kappa);
        state = rk4(state, slope, t, dt, UdK, kappa);
        // The slide shows the call without arguments - presumably a
        // placeholder for the planned diffusion step; TODO confirm.
        state = difusion ( );
        t += dt;
        const float dist = (float) (sqrt(state.x*state.x + state.y*state.y));
        if (dist <= captureRadius) captured = 1;
        if (dist > b) escaped = 1;
    }
    return captured;
}
Future developmentFuture development
Plan of speech
• The representation of the problem based on physics
• Description of the C code
• Description of the Cuda code
• Optimization and suggestions for future development
• Conclusion
ConclusionConclusion
• Parallel calculations• Better performance
Available resourcesAvailable resources
• NVIDIA CUDA Homepage– http://www.nvidia.com/cuda
• Contains downloads, documentation, examples and links
• CUDA Forums– http://forums.nvidia.com
• Supercomputing 2007 CUDA Tutorials– http://www.gpgpu.org/sc2007/
Thank you for listening !!!
Special thanks to
Dr. Hartmut Grille
Christian Müller
IAESTE
top related