Slide 20
Slide 20 text
Communication optimization
• To improve scalability, CUDA-aware MPI and GPUDirect RDMA are used.
• Communication-related routines (halo packing/unpacking and grid interpolation/restriction) are offloaded in addition to computation to minimize CPU-GPU copies.
20
! Original (CPU) halo packing: a running counter ICNT carries a
! dependence from one iteration to the next, so this nest cannot be
! parallelized as written.
! NOTE(review): ICNT is assumed to be initialized before this nest and
! JNUM to be the per-variable element count of the halo region — both
! are defined outside this snippet; confirm against the full source.
DO J = SFTN(6), SFTN(7)
DO I = SFTN(2), SFTN(3)
ICNT = ICNT + 1
! Pack two components of VAL1 and one of VAL2 into a single send
! buffer, each variable offset by JNUM slots.
BUF_SND1(ICNT) = VAL1(I,J,1)
BUF_SND1(ICNT+JNUM) = VAL1(I,J,2)
BUF_SND1(ICNT+2*JNUM) = VAL2(I,J)
END DO
END DO
! Offloaded halo packing: ICNT is now computed in closed form from
! (I,J), removing the loop-carried dependence so the collapsed nest
! can be executed in parallel on the device under OpenACC.
! NOTE(review): the ICNT statement below is split across two lines by
! the slide extraction; in real free-form Fortran it would need a '&'
! continuation — confirm against the original source file.
!$acc kernels
!$acc loop collapse(2) independent
DO J = SFTN(6), SFTN(7)
DO I = SFTN(2), SFTN(3)
ICNT = (I- SFTN(2) + 1) + (J -
SFTN(6)) * (SFTN(3) - SFTN(2) + 1)
! Same packing layout as the original version: VAL1(:,:,1),
! VAL1(:,:,2), and VAL2 each occupy a JNUM-sized region of BUF_SND1.
BUF_SND1(ICNT) = VAL1(I,J,1)
BUF_SND1(ICNT+JNUM) = VAL1(I,J,2)
BUF_SND1(ICNT+2*JNUM) = VAL2(I,J)
END DO
END DO
!$acc end kernels
Original halo packing / Offloaded halo packing