Performance Portability Through Descriptive Parallelism
TRANSCRIPT
2
My definition of performance portability
The same source code will run productively on a variety of different architectures.
3
Two Types of Portability
FUNCTIONAL PORTABILITY
The ability for a single code to run anywhere.
PERFORMANCE PORTABILITY
The ability for a single code to run well anywhere.
6
Descriptive or Prescriptive Parallelism

Prescriptive
Programmer explicitly parallelizes the code; the compiler obeys
Requires little/no analysis by the compiler
Substantially different architectures require different directives
Fairly consistent behavior between implementations

Descriptive
Compiler parallelizes the code with guidance from the programmer
Compiler must make decisions from available information
Compiler uses information from the programmer and heuristics about the architecture to make decisions
Quality of implementation greatly affects results
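
As a minimal illustration (not from the original deck, using a hypothetical saxpy loop): the prescriptive directive states exactly how the work is divided, while the descriptive directive only asserts that the loop is safe to parallelize and leaves the mapping to the compiler.

// Illustrative sketch only: a hypothetical saxpy loop written both ways.

// Prescriptive (OpenMP): the programmer dictates threading and vectorization.
#pragma omp parallel for simd
for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];

// Descriptive (OpenACC): the programmer states the loop is parallel;
// the compiler chooses how to map it (threads, gangs, vectors, ...).
#pragma acc parallel loop
for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];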
7
Prescribing vs. Describing

OpenMP (prescriptive):

while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma omp parallel for reduction(max:error)
  for ( int j = 1; j < n-1; j++ ) {
#pragma omp simd
    for ( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  }
#pragma omp parallel for
  for ( int j = 1; j < n-1; j++ ) {
#pragma omp simd
    for ( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}

OpenACC (descriptive):

while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma acc parallel loop reduction(max:error)
  for ( int j = 1; j < n-1; j++ ) {
#pragma acc loop reduction(max:error)
    for ( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  }
#pragma acc parallel loop
  for ( int j = 1; j < n-1; j++ ) {
#pragma acc loop
    for ( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
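
Because the OpenACC version only describes the parallelism, the same source can be retargeted at compile time. A rough sketch of what that looks like with the PGI compiler used later in the deck (the flags are illustrative and release-dependent, and the source file name is hypothetical):

pgcc -fast -ta=multicore -Minfo=accel laplace2d.c   # run the OpenACC loops across CPU cores
pgcc -fast -ta=tesla     -Minfo=accel laplace2d.c   # offload the same loops to an NVIDIA GPU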
8
Prescribing vs. Describing
OpenMP target, offloading the threaded loops as-is:

while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma omp target
  {
#pragma omp parallel for reduction(max:error)
    for ( int j = 1; j < n-1; j++ ) {
      for ( int i = 1; i < m-1; i++ ) {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
        error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
      }
    }
#pragma omp parallel for
    for ( int j = 1; j < n-1; j++ ) {
      for ( int i = 1; i < m-1; i++ ) {
        A[j][i] = Anew[j][i];
      }
    }
  }
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
9
Prescribing vs. Describing
OpenMP target teams distribute parallel for, with a target data region:

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error)
  for ( int j = 1; j < n-1; j++ ) {
    for ( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  }
#pragma omp target teams distribute parallel for
  for ( int j = 1; j < n-1; j++ ) {
    for ( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
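
One note on the data directive introduced above: with fixed-size arrays, map(A) defaults to copying A to the device and back (tofrom), while map(alloc:Anew) only allocates device storage with no transfer. If A and Anew were dynamically allocated, explicit array sections would be needed; a hedged sketch, assuming contiguous n-by-m arrays (the extents are assumptions, not from the deck):

// Sketch only: explicit array-section mapping for dynamically allocated,
// contiguous n-by-m arrays. map(tofrom:...) copies in and out;
// map(alloc:...) allocates device memory without any host/device transfer.
#pragma omp target data map(tofrom: A[0:n][0:m]) map(alloc: Anew[0:n][0:m])
{
    /* iteration loop as on this slide */
}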
10
Prescribing vs. Describing
Splitting the loops: teams distribute the j loop, parallel for with schedule(static,1) on the i loop:

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma omp target teams distribute
  for ( int j = 1; j < n-1; j++ ) {
#pragma omp parallel for reduction(max:error) schedule(static,1)
    for ( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  }
#pragma omp target teams distribute
  for ( int j = 1; j < n-1; j++ ) {
#pragma omp parallel for schedule(static,1)
    for ( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
11
Prescribing vs. Describing
Collapsing the loops with collapse(2):

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma omp target teams distribute parallel for \
        reduction(max:error) collapse(2)
  for ( int j = 1; j < n-1; j++ ) {
    for ( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  }
#pragma omp target teams distribute parallel for collapse(2)
  for ( int j = 1; j < n-1; j++ ) {
    for ( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
13
Prescribing vs. Describing
Collapsing and interleaving: collapse(2) with schedule(static,1):

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max ) {
  error = 0.0;
#pragma omp target teams distribute parallel for \
        reduction(max:error) collapse(2) schedule(static,1)
  for ( int j = 1; j < n-1; j++ ) {
    for ( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  }
#pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
  for ( int j = 1; j < n-1; j++ )
    for ( int i = 1; i < m-1; i++ )
      A[j][i] = Anew[j][i];
  if (iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
14
Execution Time (Smaller is Better)
[Bar chart of execution time in seconds (smaller is better). Speedup labels, in chart order: Original 1.00X, CPU Threaded 5.12X, GPU Threaded 0.12X, GPU Teams 1.01X, GPU Split 2.47X, GPU Collapse 0.96X, GPU Split Sched 4.00X, GPU Collapse Sched 17.42X, OpenACC (CPU) 4.96X, OpenACC (GPU) 23.03X. One off-scale bar is labeled 893 (seconds) with the callout "Same directives (unoptimized)".]

NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz – CLANG pre-3.8, PGI 16.3
15
Importance of Loop Scheduling
[Bar chart of execution time in seconds. Speedup labels, in chart order: Serial 1.00X, OpenMP (CPU) 5.11X, OpenMP (CPU) interleaved 1.07X, OpenMP (GPU) 0.96X, OpenMP (GPU) interleaved 17.42X.]

NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz – CLANG pre-3.8, PGI 16.3
16
Importance of Loop Scheduling
[Bar chart of execution time in seconds, comparing ICC 15.0 and CLANG. Speedup labels, in chart order: Serial 1.00X, OpenMP (CPU) 16.55X, OpenMP (CPU) interleaved 2.36X, OpenMP (GPU) 1.64X, OpenMP (GPU) interleaved 29.84X.]

NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz – Intel C Compiler 15.0, CLANG pre-3.8, PGI 16.3
17
Why Can’t OpenMP Do This? (1)
#pragma omp target teams distribute parallel for \
        reduction(max:error) collapse(2) schedule(static,1)
for ( int j = 1; j < n-1; j++ ) {
  for ( int i = 1; i < m-1; i++ ) {
    Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
    error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
  }
}
#pragma omp target teams distribute parallel for \
        collapse(2) schedule(static,1)
for ( int j = 1; j < n-1; j++ ) {
  for ( int i = 1; i < m-1; i++ ) {
    A[j][i] = Anew[j][i];
  }
}
The compiler can freely specify the default schedule, but not add the collapse
Wrong loop for coalescing on the GPU
Default schedule is implementation defined
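
A small standalone sketch (not from the deck) makes the scheduling point concrete: it records which thread owns each iteration under the default blocked static schedule versus schedule(static,1). With a chunk size of 1, neighboring iterations belong to neighboring threads, which is the interleaved, coalescing-friendly access pattern GPU hardware wants.

#include <omp.h>
#include <stdio.h>

int main(void) {
    int owner_blocked[16], owner_interleaved[16];

    // Default static schedule: each thread gets one contiguous block of iterations.
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < 16; i++)
        owner_blocked[i] = omp_get_thread_num();

    // schedule(static,1): iterations are dealt out round-robin, one at a time,
    // so adjacent iterations (and adjacent array elements) belong to adjacent threads.
    #pragma omp parallel for schedule(static,1)
    for (int i = 0; i < 16; i++)
        owner_interleaved[i] = omp_get_thread_num();

    for (int i = 0; i < 16; i++)
        printf("i=%2d  blocked owner=%d  interleaved owner=%d\n",
               i, owner_blocked[i], owner_interleaved[i]);
    return 0;
}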
18
Why Can’t OpenMP Do This? (2)
#pragma omp target teams distribute
for ( int j = 1; j < n-1; j++ ) {
#pragma omp parallel for reduction(max:error) schedule(static,1)
  for ( int i = 1; i < m-1; i++ ) {
    Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
    error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
  }
}
#pragma omp target teams distribute
for ( int j = 1; j < n-1; j++ ) {
#pragma omp parallel for schedule(static,1)
  for ( int i = 1; i < m-1; i++ ) {
    A[j][i] = Anew[j][i];
  }
}
Right loop for coalescing on the GPU
Makes no sense to distribute to teams on the CPU
Default schedule is implementation defined
19
Why Can’t OpenMP Do This? (3)
#pragma omp target teams distribute parallel for \
        reduction(max:error) collapse(2) simd
for ( int j = 1; j < n-1; j++ ) {
  for ( int i = 1; i < m-1; i++ ) {
    Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i] );
    error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
  }
}
#pragma omp target teams distribute parallel for collapse(2) simd
for ( int j = 1; j < n-1; j++ ) {
  for ( int i = 1; i < m-1; i++ ) {
    A[j][i] = Anew[j][i];
  }
}
The compiler can always choose one team, making this behave like a plain parallel for
Maybe the compiler can treat SIMD as a coalescing hint?
Is this the new best practice?
Default schedule is implementation defined
20
Performance Portability Results
[Bar chart: speedup vs. a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics). Series shown: CPU: MPI + OpenMP *.]

PGI 15.10 Compiler. miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU). NEMO: each socket CPU: Intel Xeon E5-2698 v3, 16 cores; GPU: NVIDIA K80, both GPUs. CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80, both GPUs. * NEMO run used all-MPI.
21
Performance Portability Results
[Bar chart: speedup vs. a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics). Series shown: CPU: MPI + OpenMP *; CPU: MPI + OpenACC.]

PGI 15.10 Compiler. miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU). NEMO: each socket CPU: Intel Xeon E5-2698 v3, 16 cores; GPU: NVIDIA K80, both GPUs. CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80, both GPUs. * NEMO run used all-MPI.
22
Performance Portability Results
[Bar chart: speedup vs. a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics). Series shown: CPU: MPI + OpenMP *; CPU: MPI + OpenACC; CPU + GPU: MPI + OpenACC, with one bar annotated 30X.]

PGI 15.10 Compiler. miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU). NEMO: each socket CPU: Intel Xeon E5-2698 v3, 16 cores; GPU: NVIDIA K80, both GPUs. CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80, both GPUs. * NEMO run used all-MPI.