Nonblocking Halo Exchange Example
Our first application of nonblocking point-to-point communications is to the familiar halo exchange problem. We can modify the solutions to the Exercise by converting the Sendrecv invocations into two Irecv and two Isend calls. With nonblocking sends and receives we do not have to manage the "sweeps" up and down, or right to left, ourselves; the MPI library handles that. This avoids the serialization of the blocking exchanges, which was the main purpose for the introduction of nonblocking communications.
As a general rule, the calls to Irecv should be posted first, followed by the calls to Isend. In addition, in many cases such as this, it is preferable to use Waitall so that the MPI library can handle the ordering of the completions.
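To make the conversion concrete, here is a fragment-level sketch in C++. It assumes the same variables (w, nrl, ncl, up, down, tag) as the full listing below, and the blocking form shown is only a plausible version of the Exercise solution, not necessarily the exact one. The blocking exchange
MPI_Status status;
MPI_Sendrecv(&w[1][1], ncl, MPI_DOUBLE, up, tag,
             &w[nrl+1][1], ncl, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &status);
MPI_Sendrecv(&w[nrl][1], ncl, MPI_DOUBLE, down, tag,
             &w[0][1], ncl, MPI_DOUBLE, up, tag, MPI_COMM_WORLD, &status);
is replaced by two receives posted first, two sends, and a single completion call:
MPI_Request requests[4];
MPI_Status statuses[4];
MPI_Irecv(&w[nrl+1][1], ncl, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[0]);
MPI_Irecv(&w[0][1], ncl, MPI_DOUBLE, up, tag, MPI_COMM_WORLD, &requests[1]);
MPI_Isend(&w[1][1], ncl, MPI_DOUBLE, up, tag, MPI_COMM_WORLD, &requests[2]);
MPI_Isend(&w[nrl][1], ncl, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[3]);
MPI_Waitall(4, requests, statuses);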
#include <iostream>
#include <mpi.h>
using namespace std;
int main (int argc, char *argv[]) {
int i, j;
int N;
double topBC=100;
double bottomBC=0.;
double edgeBC=100.;
// Added for MPI
int nr, nrl, nc, ncl;
int rank, nprocs;
MPI_Status status;
int root=0, tag=0;
int up, down;
//Initialize MPI
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
//Usually we would read in nr and nc; they represent the global grid RxC;
//For this case they are both N.
N=500;
nr=N;
nc=N;
//Use ncl for clarity
ncl=nc;
//Weak scaling
//nrl=nr;
//Strong scaling
if (nr%nprocs!=0) {
    MPI_Finalize();
    cout<<"Not an even division of the arrays.\n";
    return 0;
}
else {
    nrl=nr/nprocs;
}
//Find my neighbors
if (rank==0) {
    up=MPI_PROC_NULL;
}
else {
    up=rank-1;
}
if (rank==nprocs-1) {
    down=MPI_PROC_NULL;
}
else {
    down=rank+1;
}
//Two extra rows and columns for boundary conditions
int nrows=nrl+2;
int ncols=ncl+2;
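//Allocate one contiguous block and point each row into it, so that a row of
//ncl interior values can be sent or received with a single count in the MPI calls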
double **w=new double*[nrows];
double *wptr=new double[(nrows)*(ncols)];
for (i=0;i<nrows;++i,wptr+=ncols)
    w[i] = wptr;
if (rank==0) {
    for (int i=0;i<=ncl+1;++i){
        w[0][i]=bottomBC;
    }
}
if (rank==nprocs-1) {
    for (int i=0;i<=ncl+1;++i){
        w[nrl+1][i]=topBC;
    }
}
for (int i=0;i<=nrl+1;++i){
    w[i][0]=edgeBC;
    w[i][ncl+1]=edgeBC;
}
//Initialize interior only
for ( i = 1; i <= nrl; i++ ) {
    for (j = 1; j <= ncl; j++ ) {
        w[i][j] = 50.;
    }
}
const int nrequests=4;
MPI_Request requests[nrequests];
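//Post the receives first, then the sends; all four requests are completed by the Waitall below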
MPI_Irecv(&w[nrl+1][1], ncl, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[0]);
MPI_Irecv(&w[0][1], ncl, MPI_DOUBLE, up, tag, MPI_COMM_WORLD, &requests[1]);
MPI_Isend(&w[1][1], ncl, MPI_DOUBLE, up, tag, MPI_COMM_WORLD, &requests[2]);
MPI_Isend(&w[nrl][1],ncl,MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[3]);
MPI_Status status_arr[4];
MPI_Waitall(nrequests,requests,status_arr);
//Spot-check results
for (i=0;i<nprocs;++i) {
    if (i==rank) {
        cout<<i<<" "<<w[0][ncl/2]<<" "<<w[nrl+1][ncl/2]<<endl;
    }
}
MPI_Finalize();
}
program halo
use mpi_f08
double precision, allocatable, dimension(:,:) :: w
integer :: N=500
double precision :: topBC, bottomBC, edgeBC
integer :: i
integer :: nr, nc, nrl, ncl
integer :: rank, nprocs, tag=0
integer :: errcode, ierr
type(MPI_Status), dimension(:), allocatable :: mpi_status_arr
type(MPI_Request), dimension(:), allocatable :: mpi_requests
integer :: nrequests
integer, parameter :: root=0
integer :: left, right
!Initialize MPI, get the local number of columns
call MPI_INIT(ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierr)
!For this case
!Usually we'd read in nr and nc; they represent the global grid size RxC.
nr=N
nc=N
!Generally we'd just use nr and not duplicate; this is for clarity later
nrl=nr
! weak scaling
!ncl = nc
! strong scaling
if (mod(nc,nprocs).ne.0) then
    call MPI_FINALIZE(ierr)
    stop 'Not an even division of the array'
else
    ncl = nc/nprocs
endif
! Find my neighbors
if ( rank .eq. root ) then
    left = MPI_PROC_NULL
else
    left = rank - 1
endif
if ( rank .eq. nprocs-1 ) then
    right = MPI_PROC_NULL
else
    right = rank + 1
endif
allocate(w(0:nrl+1,0:ncl+1))
! Boundary conditions
topBC=0.d0
bottomBC=100.d0
edgeBC=100.d0
!Set boundary conditions
if (rank==0) then
    w(:,0)=edgeBC
endif
if (rank==nprocs-1) then
    w(:,ncl+1)=edgeBC
endif
w(nrl+1,:)=bottomBC
w(0,:)=topBC
! Initialize the array interior
w(1:nrl,1:ncl)=50.
! Exchange halo values
nrequests=4
allocate(mpi_requests(nrequests),mpi_status_arr(nrequests))
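! Post the receives first, then the sends; all four requests are completed by the Waitall below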
call MPI_Irecv(w(1:nrl,ncl+1),nrl,MPI_DOUBLE_PRECISION,right,tag, &
MPI_COMM_WORLD,mpi_requests(1),ierr)
call MPI_Irecv(w(1:nrl,0) ,nrl,MPI_DOUBLE_PRECISION,left,tag, &
MPI_COMM_WORLD,mpi_requests(2),ierr)
call MPI_Isend(w(1:nrl,1) ,nrl,MPI_DOUBLE_PRECISION,left,tag, &
MPI_COMM_WORLD,mpi_requests(3),ierr)
call MPI_Isend(w(1:nrl,ncl) ,nrl,MPI_DOUBLE_PRECISION,right,tag, &
MPI_COMM_WORLD,mpi_requests(4),ierr)
call MPI_Waitall(nrequests,mpi_requests,mpi_status_arr)
!Spot-check result
do i=0,nprocs-1
    if (i==rank) then
        write (*,*) rank, w(nrl/2,0), w(nrl/2,ncl+1)
    endif
enddo
call MPI_FINALIZE(ierr)
end program
import sys
import numpy as np
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()
N = 500
#Strong scaling
if N%nprocs==0:
    nrl = N//nprocs
else:
    print("Number of ranks should divide the number of rows evenly.")
    sys.exit()
#Weak scaling
#nrl = N
#Both
ncl = N
w = np.zeros((nrl+2, ncl+2), dtype=np.double)
#Set up boundaries
if rank == 0 :
    w[0,:] = 0.       # top
if rank == nprocs-1 :
    w[nrl+1,:] = 100. # bottom
w[:,0] = 100. # left
w[:,ncl+1] = 100. # right
#Arbitrary value for the interior that may speed up convergence somewhat.
#Be sure not to overwrite boundaries.
w[1:nrl+1,1:ncl+1] = 50.
# setting up the up and down rank for each process
if rank == 0 :
    up = MPI.PROC_NULL
else :
    up = rank - 1
if rank == nprocs - 1 :
    down = MPI.PROC_NULL
else :
    down = rank + 1
tag=0
# Post the receives first, then the sends
rcv_down=comm.Irecv([w[nrl+1,1:ncl+1],MPI.DOUBLE],down)
rcv_up=comm.Irecv([w[0,1:ncl+1],MPI.DOUBLE],up)
send_up=comm.Isend([w[1,1:ncl+1],MPI.DOUBLE], up)
send_down=comm.Isend([w[nrl,1:ncl+1],MPI.DOUBLE], down)
requests=[rcv_down,rcv_up,send_up,send_down]
MPI.Request.Waitall(requests)
# Spot-check result
for n in range(nprocs):
    if n==rank:
        print(n,w[0,ncl//2],w[nrl+1,ncl//2])
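In all three versions the individual statuses returned by Waitall are never examined. When that is the case, the status array can be omitted entirely; for example, the completion call in the C++ version could be written as the following small, optional variation (not part of the listing above):
MPI_Waitall(nrequests, requests, MPI_STATUSES_IGNORE);
The Fortran binding provides the same facility through MPI_STATUSES_IGNORE, and the Python version already ignores the statuses by leaving the optional statuses argument of MPI.Request.Waitall at its default.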