MPI Project Set 1
Project 1
A very inefficient way to compute $\pi$ is to use a Monte Carlo method. We randomly throw “darts” at a square with sides of length 2 that encloses a circle of radius 1. The circle has area $\pi \times 1^2 = \pi$ and the square has area $2^2 = 4$, so the ratio of the areas is $\pi/4$. If we compute the ratio of “hits” that land within the circle to the total number of throws, then our estimate of $\pi$ is $4 \times \mathrm{ratio}$.
Write a program that computes pi by this method. Start with a serial code and test it for a relatively small number of “dart throws.” You will find it very inaccurate for a small number of throws but should be able to obtain an estimate at least close to 3. Once you are satisfied that your program works in serial, parallelize it. Distribute the throws over N processes and use a reduction to get the totals for your overall estimate of pi. Test your parallel code with 8 processes and 1e9 throws. Hint: C++ programmers should use long long for the number of throws and any corresponding loop counter. Fortran programmers should use an 8-byte integer. (See the C++ and Fortran random-walk examples.) Python automatically handles large integers.
Note that the formula is the same if one considers only the upper right quadrant of the figure.
Fortran programmers: you may wish to extract the random module from mpirandomwalk.f90 into a separate file random.f90, since it is used frequently. Remember that modules must be compiled before any program units that USE them.
Try to write the serial code before peeking at the examples.
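Once your serial version works, the only new MPI call the parallel version needs is a sum reduction of the per-rank hit counts. A minimal sketch of that pattern, assuming mpi4py (the compiled-language solutions below use the corresponding MPI_Reduce call), follows; the throw count is illustrative.
from mpi4py import MPI
import numpy as np

def throw_darts(n):
    # count random points in the unit square that land inside the quarter-circle
    x = np.random.random(n)
    y = np.random.random(n)
    return int(np.sum(x*x + y*y < 1.0))

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

n_throws = 10000000   # illustrative total; assume nprocs divides it evenly
my_hits = np.array([throw_darts(n_throws//nprocs)], dtype='int64')
all_hits = np.zeros(1, dtype='int64')
comm.Reduce(my_hits, all_hits, op=MPI.SUM, root=0)
if rank == 0:
    print("Pi is approximately", 4.0*all_hits[0]/n_throws)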
Serial Codes
C++
/* Computes pi by a Monte-Carlo method
*
*/
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <random>
#include <cmath>
#include <cstdlib>
#include <ctime>
using namespace std;
#define urand ((double)rand()/(double)RAND_MAX)
long long throwDarts(long long nThrows) {
long long hits=0;
for (long long i=0; i<nThrows; ++i) {
double x = urand;
double y = urand;
if ( sqrt(pow(x,2)+pow(y,2)) < 1.0 ) {
hits++;
}
}
return hits;
}
int main(int argc, char **argv) {
long long nThrows;
if (argc != 2) {
cout<<"Usage: nThrows\n";
return 0;
}
else {
string steps=argv[1];
stringstream ssteps;
ssteps<<steps;
ssteps>>nThrows;
}
//It's better to use C++ random-number generators, but that requires some
//additional coding, and possibly templating, so let's keep it simple and
//fast for this example.
srand(time(0));
long long nHits=throwDarts(nThrows);
double pi=4.0*(double)nHits/(double)nThrows;
cout<<"Pi is "<<setprecision(6)<<pi<<endl;
return 0;
}
Fortran
!*******************************************************************************
! Program to compute pi using a Monte-Carlo integration
!
! Author: K. Holcomb
! Date : 20180424
!
!*******************************************************************************
module random
implicit none
! Author: Katherine Holcomb
! Shortened version of a longer module
! Module incorporated into this file for convenience; typically it would be in its
! own file.
! Comment out one or the other when the module is incorporated into a code.
! Single precision
!integer, parameter :: rk = kind(1.0)
! Double precision
integer, parameter :: rk = kind(1.0d0)
contains
subroutine set_random_seed(seed)
! Sets all elements of the seed array
integer, optional, intent(in) :: seed
integer :: isize
integer, dimension(:), allocatable :: iseed
integer, dimension(8) :: idate
integer :: icount, i
call random_seed(size=isize)
if ( .not. present(seed) ) then
call system_clock(icount)
allocate(iseed(isize),source=icount+370*[(i,i=0,isize-1)])
else
allocate(iseed(isize))
iseed=seed
endif
call random_seed(put=iseed)
end subroutine set_random_seed
function urand(lb,ub,seed)
! Returns a uniformly-distributed random number in the range lb to ub.
real(rk) :: urand
real(rk), optional, intent(in) :: lb,ub
real(rk), optional, intent(in) :: seed
integer :: iseed
real(rk) :: rnd
real(rk) :: lower,upper
if ( present(seed) ) then
iseed=int(seed)
call set_random_seed(iseed)
endif
if ( present(lb) ) then
lower=lb
else
lower=0.0_rk
endif
if ( present(ub) ) then
upper = ub
else
upper = 1.0_rk
endif
call random_number(rnd)
urand = lower+(upper-lower)*rnd
return
end function urand
end module
program piMC
use random
implicit none
integer, parameter :: ik = selected_int_kind(15)
character(len=12) :: throws
integer :: nargs
integer(ik) :: n_throws, n_hits
double precision :: pi
integer(ik) :: n
interface
function throw_darts(n_throws)
integer, parameter :: ik = selected_int_kind(15)
integer(ik) :: throw_darts
integer(ik), intent(in) :: n_throws
end function throw_darts
end interface
nargs=command_argument_count()
if (nargs .lt. 1) then
n_throws=1000000000 ! default
else
call get_command_argument(1,throws)
read(throws,'(i12)') n_throws
endif
call set_random_seed()
n_hits=throw_darts(n_throws)
pi=4.0d0*dble(n_hits)/dble(n_throws)
write(*,'(f12.8)') pi
end program piMC
function throw_darts(n_throws)
use random
integer, parameter :: ik = selected_int_kind(15)
integer(ik) :: throw_darts
integer(ik), intent(in) :: n_throws
integer(ik) :: i,hits
double precision :: x, y
hits=0
do i=1,n_throws
x=urand(0.d0,1.d0)
y=urand(0.d0,1.d0)
if (sqrt(x**2+y**2)<1.0) then
hits=hits+1
endif
enddo
throw_darts=hits
end function throw_darts
Python
"""
This program estimates the value of PI by running a Monte Carlo simulation.
NOTE: This is not how one would normally want to calculate PI, but serves
to illustrate the principle.
"""
import sys
import os
import math
import numpy as np
def throw(numPoints):
"""Throw a series of imaginary darts at an imaginary dartboard of unit
radius and count how many land inside the circle."""
x=np.random.random(numPoints)
y=np.random.random(numPoints)
inside=(x**2+y**2)<=1.0
    numInside=np.sum(inside)
return numInside
def main():
if (len(sys.argv)>1):
        try:
            numPoints=int(float(sys.argv[1]))
        except ValueError:
            print("Argument must be an integer.")
            sys.exit()
else:
print("USAGE:python MonteCarlo.py numPoints")
exit()
numInside=throw(numPoints)
ppi=4.*numInside/float(numPoints)
print(ppi)
if __name__=="__main__":
main()
Example Solutions
C++
/* Computes pi by a Monte-Carlo method
*
*/
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <random>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <mpi.h>
using namespace std;
#define urand ((double)rand()/(double)RAND_MAX)
long long throwDarts(long long nThrows) {
long long hits=0;
for (long long i=0; i<nThrows; ++i) {
double x = urand;
double y = urand;
if ( sqrt(pow(x,2)+pow(y,2)) < 1.0 ) {
hits++;
}
}
return hits;
}
int main(int argc, char **argv) {
int nprocs, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
long long nThrows;
if (argc != 2) {
if (rank==0) cout<<"Usage: nThrows\n";
MPI_Finalize();
return 0;
}
else {
string steps=argv[1];
stringstream ssteps;
ssteps<<steps;
ssteps>>nThrows;
}
//It's better to use C++ random-number generators, but that requires some
//additional coding, and possibly templating, so let's keep it simple and
//fast for this example.
srand(time(0)+rank);  // offset by rank so each process gets a different random stream
// Strong scaling
long long myThrows=nThrows/nprocs;
if ( nThrows%nprocs!=0 ) {
long long nExtra=nThrows%nprocs; // dopey load balancing
for (long long n=1;n<=nExtra;n++) {
if (n-1==rank) myThrows++;
}
}
// For weak scaling
// myThrows=nThrows
//
long long myHits=throwDarts(myThrows);
long long allHits;
MPI_Reduce(&myHits,&allHits,1,MPI_LONG_LONG_INT,MPI_SUM,0,MPI_COMM_WORLD);
double pi=4.*(double)allHits/(double)nThrows;
//weak scaling
// double pi=4.*(double)allHits/((double)nprocs*(double)nThrows);
if (rank==0) {
cout<<"Pi is "<<setprecision(6)<<pi<<endl;
}
MPI_Finalize();
return 0;
}
Fortran
!*******************************************************************************
! Program to compute pi using a Monte-Carlo integration
!
! Author: K. Holcomb
! Date : 20180424
!
!*******************************************************************************
module random
implicit none
! Author: Katherine Holcomb
! Shortened version of a longer module
! Module incorporated into this file for convenience; typically it would be in its
! own file.
! Comment out one or the other when the module is incorporated into a code.
! Single precision
!integer, parameter :: rk = kind(1.0)
! Double precision
integer, parameter :: rk = kind(1.0d0)
contains
subroutine set_random_seed(seed)
! Sets all elements of the seed array
integer, optional, intent(in) :: seed
integer :: isize
integer, dimension(:), allocatable :: iseed
integer, dimension(8) :: idate
integer :: icount, i
call random_seed(size=isize)
if ( .not. present(seed) ) then
call system_clock(icount)
allocate(iseed(isize),source=icount+370*[(i,i=0,isize-1)])
else
allocate(iseed(isize))
iseed=seed
endif
call random_seed(put=iseed)
end subroutine set_random_seed
function urand(lb,ub,seed)
! Returns a uniformly-distributed random number in the range lb to ub.
real(rk) :: urand
real(rk), optional, intent(in) :: lb,ub
real(rk), optional, intent(in) :: seed
integer :: iseed
real(rk) :: rnd
real(rk) :: lower,upper
if ( present(seed) ) then
iseed=int(seed)
call set_random_seed(iseed)
endif
if ( present(lb) ) then
lower=lb
else
lower=0.0_rk
endif
if ( present(ub) ) then
upper = ub
else
upper = 1.0_rk
endif
call random_number(rnd)
urand = lower+(upper-lower)*rnd
return
end function urand
end module
program piMC
use mpi
use random
implicit none
integer, parameter :: ik = selected_int_kind(15)
character(len=12) :: throws
integer :: nargs
integer(ik) :: my_nthrows, my_hits, n_extra, n_throws, n_hits
double precision :: pi
integer :: rank, nprocs, ierr, icount
integer(ik) :: n
interface
function throw_darts(n_throws)
integer, parameter :: ik = selected_int_kind(15)
integer(ik) :: throw_darts
integer(ik), intent(in) :: n_throws
end function throw_darts
end interface
nargs=command_argument_count()
if (nargs .lt. 1) then
n_throws=1000000000 ! default, if not wanted then stop
else
call get_command_argument(1,throws)
read(throws,'(i12)') n_throws
endif
call MPI_Init(ierr)
call MPI_Comm_size(MPI_COMM_WORLD,nprocs,ierr)
call MPI_Comm_rank(MPI_COMM_WORLD,rank,ierr)
! Offset the seed by rank so each process generates a different random stream
call system_clock(icount)
call set_random_seed(icount+rank)
! For strong scaling
my_nthrows=n_throws/nprocs
n_extra=mod(n_throws,nprocs) ! dopey load balancing
if ( n_extra /=0 ) then
do n=1,n_extra
if (n-1==rank) my_nthrows=my_nthrows+1
enddo
endif
! For weak scaling
! my_nthrows=n_throws
! change final denominator to npes*n_throws
my_hits=throw_darts(my_nthrows)
call MPI_Reduce(my_hits,n_hits,1,MPI_INTEGER8,MPI_SUM,0,MPI_COMM_WORLD,ierr)
pi=4.0d0*dble(n_hits)/dble(n_throws)
if ( rank==0 ) then
write(*,'(f12.8)') pi
endif
!weak scaling
! pi=4.0d0*dble(n_hits)/dble(nprocs*n_throws)
call MPI_Finalize(ierr)
end program piMC
function throw_darts(n_throws)
use random
integer, parameter :: ik = selected_int_kind(15)
integer(ik) :: throw_darts
integer(ik), intent(in) :: n_throws
integer(ik) :: i,hits
double precision :: x, y
hits=0
do i=1,n_throws
x=urand(0.d0,1.d0)
y=urand(0.d0,1.d0)
if (sqrt(x**2+y**2)<1.0) then
hits=hits+1
endif
enddo
throw_darts=hits
end function throw_darts
Python
"""
This program estimates the value of PI by running a Monte Carlo simulation.
NOTE: This is not how one would normally want to calculate PI, but serves
to illustrate the principle.
"""
import sys
import os
import math
import numpy as np
from mpi4py import MPI
def throw(numPoints):
"""Throw a series of imaginary darts at an imaginary dartboard of unit
radius and count how many land inside the circle."""
x=np.random.random(numPoints)
y=np.random.random(numPoints)
inside=(x**2+y**2)<=1.0
    numInside=np.sum(inside)
return numInside
def main():
if (len(sys.argv)>1):
        try:
            numPoints=int(float(sys.argv[1]))
        except ValueError:
            print("Argument must be an integer.")
            sys.exit()
else:
print("USAGE:python MonteCarlo.py numPoints")
exit()
myrank = MPI.COMM_WORLD.Get_rank()
nprocs = MPI.COMM_WORLD.Get_size()
#Distribute points
#Simple-minded method to distribute equally if the division isn't even.
chunks=numPoints%nprocs
myNumPoints=[numPoints//nprocs+1]*chunks+[numPoints//nprocs]*(nprocs-chunks)
tic=MPI.Wtime()
myInside=np.ones(1)*throw(myNumPoints[myrank])
allInside=np.zeros(1)
MPI.COMM_WORLD.Reduce(myInside, allInside, op=MPI.SUM, root=0)
ppi=4.*allInside/float(numPoints)
toc=MPI.Wtime()
if myrank==0:
print(ppi[0])
print("Time on "+str(nprocs)+" cores:"+str(round(toc-tic,4)))
if __name__=="__main__":
main()
Project 2
The trapezoid rule is a simple method for numerical quadrature (integration). Given a function $f(x)$, the integral from $x=a$ to $x=b$ is $$ I = \int^b_a f(x)dx $$ We can approximate the area under the curve as a trapezoid with area $$ I_{trap} = \frac{1}{2}(b-a)\left(f(b)+f(a)\right) $$ However, a single trapezoid isn’t a very good approximation. We can improve our result by dividing the area under the curve into many trapezoids: $$ I_{trap} = \sum^n_{i=1} \frac{f(x_{i+1})+f(x_i)}{2} \Delta x_i $$ where $$ \Delta x_i = x_{i+1}-x_i $$ The larger the number of trapezoids, the more accurate our approximation to the integral will be. Typically $\Delta x_i$ is a constant, usually denoted $h$, but that is not a requirement.
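With a constant step $h = (b-a)/n$, the sum collapses to the form the codes below implement: $$ I_{trap} = h\left[\frac{f(a)+f(b)}{2} + \sum^{n-1}_{j=1} f(a+jh)\right] $$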
It should be clear that the trapezoid rule is easy to parallelize: the interval of the independent variable is split among the ranks, each rank carries out the summation over its part, and the complete integral is the sum of all ranks' results.
Starting from the serial code in your language of choice, parallelize the trapezoid rule. Only rank 0 should read the input data; it should then broadcast the values to the other processes. Test with four processes. Note that the integral of the sine from 0 to $\pi$ has an exact value of 2, making it a good test case.
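A minimal sketch of the decomposition, assuming mpi4py and illustrative values for the bounds and step count (the full solutions below add the argument handling and the broadcast from rank 0), is:
import math
from mpi4py import MPI

def trap(a, b, h, n, f):
    # composite trapezoid rule on [a,b] with n steps of width h
    integral = (f(a) + f(b))/2.0
    for i in range(1, n):
        integral += f(a + i*h)
    return integral*h

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

a, b, n = 0.0, math.pi, 1000000   # illustrative values; assumes nprocs divides n
h = (b - a)/n
local_n = n//nprocs               # steps assigned to this rank
local_a = a + rank*local_n*h      # left end of this rank's subinterval
local_b = local_a + local_n*h     # right end of this rank's subinterval
my_part = trap(local_a, local_b, h, local_n, math.sin)
total = comm.reduce(my_part, op=MPI.SUM, root=0)
if rank == 0:
    print(total)                  # the partial sums add up to the full integral; about 2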
C++ and Fortran programmers: if you are not familiar with passing subprogram names as dummy variables, refer to our courses for Fortran or C++.
Serial Codes
C++
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <cmath>
//C++11
#include <functional>
using namespace std;
float f(float x) {
return sin(x);
}
//Function pointer
//float trap(float a, float b, float h, int n, float (*function)(float)) {
//Templated function object
float trap(float a, float b, float h, int n, function<float(float)> f) {
float integral = (f(a) + f(b))/2.0;
float x=a;
for (int i=0; i<n-1; ++i) {
x+=h;
integral += f(x);
}
integral *= h;
return integral;
}
int main(int argc, char **argv) {
// Calculate a definite integral using trapezoid rule
float a, b;
int n;
float h, integral;
float x;
if (argc != 4) {
cout<<"Usage: lower bound, upper bound, number of steps\n";
return 0;
}
else {
stringstream lb;
lb<<argv[1];
lb>>a;
stringstream ub;
ub<<argv[2];
ub>>b;
stringstream steps;
steps<<argv[3];
steps>>n;
}
h=(b-a)/n;
integral = trap(a,b,h,n,&f);
cout<<" Result "<<integral<<endl;
}
Fortran
program trapezoid
implicit none
! Calculate a definite integral using trapezoid rule
real :: a, b
integer :: n
real :: h, integral
real :: x
integer :: i, nargs
character(len=16) lb, ub, ns
interface
real function trap(f,a,b,h,n)
implicit none
real, intent(in) :: a, b, h
integer, intent(in) :: n
interface
real function f(x)
implicit none
real, intent(in) :: x
end function
end interface
end function
real function f(x)
implicit none
real, intent(in) :: x
end function
end interface
nargs=command_argument_count()
if ( nargs .ne. 3 ) then
stop "Usage: arguments are lower bound, upper bound, number of steps"
else
call get_command_argument(1,lb)
read(lb,'(f16.9)') a
call get_command_argument(2,ub)
read(ub,'(f16.9)') b
call get_command_argument(3,ns)
read(ns,'(i10)') n
endif
h=(b-a)/n
integral=trap(f,a,b,h,n)
print *, integral
end program
real function trap(f, a, b, h, n)
implicit none
real, intent(in) :: a, b, h
integer, intent(in) :: n
interface
real function f(x)
implicit none
real, intent(in) :: x
end function
end interface
real :: x
integer:: i
real :: integral
integral = (f(a) + f(b))/2.0
x=a
do i=1, n-1
x = x+h
integral = integral + f(x)
enddo
trap = integral*h
return
end function
real function f(x)
implicit none
real, intent(in):: x
f=sin(x)
end function
Python
import sys
import numpy as np
# Calculate a definite integral using trapezoid rule
# Does not use scipy so we can parallelize it
def f(x):
return np.sin(x)
def trap(a,b,h,n,f):
integral=(f(a) + f(b))/2.0
x=a
    for i in range(n-1):
x+=h
integral+=f(x)
integral*=h
return integral
if len(sys.argv)==4:
a=float(sys.argv[1])
b=float(sys.argv[2])
n=int(float(sys.argv[3]))
else:
print("Usage: lower bound, upper bound, number of steps")
sys.exit()
h=(b-a)/n
integral=trap(a,b,h,n,f)
print("Result ",integral)
Example Solutions
C++
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <cmath>
//For templated function objects
#include <functional>
#include <mpi.h>
using namespace std;
float f(float x) {
return sin(x);
}
//Function pointer
//float trap(float a, float b, float h, int n, float (*function)(float)) {
//Templated function object
float trap(float a, float b, float h, int n, function<float(float)> f) {
float integral = (f(a) + f(b))/2.0;
float x=a;
for (int i=0; i<n-1; ++i) {
x+=h;
integral += f(x);
}
integral *= h;
return integral;
}
int main(int argc, char **argv) {
// Calculate a definite integral using trapezoid rule
float a, b;
int n;
float h, integral;
float x;
int nprocs, rank;
float total;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
bool shutdown=false;
if (rank==0) {
if (argc != 4) {
cout<<"Usage: lower bound, upper bound, number of steps\n";
shutdown=true;
}
else {
stringstream lb;
lb<<argv[1];
lb>>a;
stringstream ub;
ub<<argv[2];
ub>>b;
stringstream steps;
steps<<argv[3];
steps>>n;
}
//Not going to bother to redistribute this one
if (!shutdown) {
if (n%nprocs != 0) {
cout<<"Number of processes must evenly divide number of steps\n";
shutdown=true;
}
}
}
MPI_Bcast(&shutdown,1,MPI_CXX_BOOL,0,MPI_COMM_WORLD);
if (shutdown) {
MPI_Finalize();
exit(1);
}
MPI_Bcast(&a,1,MPI_FLOAT,0,MPI_COMM_WORLD);
MPI_Bcast(&b,1,MPI_FLOAT,0,MPI_COMM_WORLD);
MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);
h=(b-a)/n;
int local_n=n/nprocs;
float local_a=a+rank*local_n*h;
float local_b=local_a+local_n*h;
integral = trap(local_a,local_b,h,local_n,&f);
MPI_Reduce(&integral,&total,1,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD);
if (rank==0) {
cout<<" Result "<<total<<endl;
}
}
Fortran
program trapezoid
use mpi
implicit none
! Calculate a definite integral using trapezoid rule
real :: a, b
integer :: n
real :: h, integral
real :: x
integer :: i, nargs
character(len=16) lb, ub, ns
real :: local_a, local_b, total
integer:: rank, nprocs, ierr, errcode
integer:: local_n
logical:: shutdown=.false.
interface
real function trap(f,a,b,h,n)
implicit none
real, intent(in) :: a, b, h
integer, intent(in) :: n
interface
real function f(x)
implicit none
real, intent(in) :: x
end function
end interface
end function
real function f(x)
implicit none
real, intent(in) :: x
end function
end interface
call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
if (rank==0) then
nargs=command_argument_count()
if ( nargs .ne. 3 ) then
write(*,*) "Usage: arguments are lower bound, upper bound, number of steps"
shutdown=.true.
else
call get_command_argument(1,lb)
read(lb,'(f16.9)') a
call get_command_argument(2,ub)
read(ub,'(f16.9)') b
call get_command_argument(3,ns)
read(ns,'(i10)') n
shutdown=.false.
endif
if (.not. shutdown) then
!Don't bother to distribute number of steps, just stop if not even
if (mod(n,nprocs) /=0) then
write(*,*) "Number of processes must evenly divide number of steps"
shutdown=.true.
endif
endif
endif
call MPI_Bcast(shutdown,1,MPI_LOGICAL,0,MPI_COMM_WORLD,ierr)
if (shutdown) then
call MPI_Finalize(ierr)
stop
endif
call MPI_Bcast(a,1,MPI_REAL,0,MPI_COMM_WORLD,ierr)
call MPI_Bcast(b,1,MPI_REAL,0,MPI_COMM_WORLD,ierr)
call MPI_Bcast(n,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
h=(b-a)/n
local_n = n/nprocs
local_a = a + rank*local_n*h
local_b = local_a + local_n*h
integral = trap(f, local_a, local_b, h, local_n)
call MPI_Reduce(integral,total,1,MPI_REAL,MPI_SUM,0,MPI_COMM_WORLD,ierr)
if (rank .eq. 0) then
print *, total
endif
call MPI_Finalize(ierr)
end program
real function trap(f, a, b, h, n)
implicit none
real, intent(in) :: a, b, h
integer, intent(in) :: n
interface
real function f(x)
implicit none
real, intent(in) :: x
end function
end interface
real :: x
integer:: i
real :: integral
integral = (f(a) + f(b))/2.0
x=a
do i=1, n-1
x = x+h
integral = integral + f(x)
enddo
trap = integral*h
return
end function
real function f(x)
implicit none
real, intent(in):: x
f=sin(x)
end function
Python
import sys
import numpy as np
from mpi4py import MPI
# Calculate a definite integral using trapezoid rule
# Does not use scipy so we can parallelize it
def f(x):
return np.sin(x)
def trap(a,b,h,n,f):
integral=(f(a) + f(b))/2.0
x=a
    for i in range(n-1):
x+=h
integral+=f(x)
integral*=h
return integral
def main():
comm=MPI.COMM_WORLD
nprocs=comm.Get_size()
rank=comm.Get_rank()
    a=np.empty(1,dtype='float64')
    b=np.empty(1,dtype='float64')
    n=np.empty(1,dtype='int32')
shutdown=None
if rank==0:
if len(sys.argv)==4:
a[0]=float(sys.argv[1])
b[0]=float(sys.argv[2])
n[0]=int(float(sys.argv[3]))
shutdown=False
else:
print("Usage: lower bound, upper bound, number of steps")
shutdown=True
#Not going to bother with redistributing on this one
if (not shutdown):
if n[0]%nprocs!=0:
print("The number of processes must evenly divide the number of steps ")
shutdown=True
#Note lower case and return value
shutdown=comm.bcast(shutdown,root=0)
if shutdown:
MPI.Finalize()
exit()
    comm.Bcast([a,MPI.DOUBLE],root=0)
    comm.Bcast([b,MPI.DOUBLE],root=0)
    comm.Bcast([n,MPI.INT],root=0)
    h=(b[0]-a[0])/n[0]
    local_n=n[0]//nprocs
    local_a=a[0]+rank*local_n*h
    local_b=local_a+local_n*h
integral=np.empty(1,dtype='float')
integral[0]=trap(local_a,local_b,h,local_n,f)
total=np.empty(1)
comm.Reduce(integral,total,op=MPI.SUM)
if rank==0:
print("Result ",total)
MPI.Finalize()
if __name__=="__main__":
main()
Project 3
We wish to find the maximum of a 3-d surface described by a relatively complicated function. We can use a “brute force” method, which evaluates the function at a large number of $(x, y)$ points randomly distributed over the ranges of the independent variables. This approach to optimization is easily parallelized and can often be faster than older methods, even when those are parallelized.
The surface is defined by the following rules:
$$ \mu_1 = \sqrt{2}\ ,\ \mu_2 = \sqrt{\pi}\ ,\ \sigma_1 = 3.1\ ,\ \sigma_2 = 1.4 $$ $$ z_1 = 0.1 \sin(x) \sin(xy) $$ $$ a = \frac{(x - \mu_1)^2}{2\sigma_1^2}\ ,\ b = \frac{(y - \mu_2)^2}{2\sigma_2^2} $$ $$ z_2 = \frac{e^{-(a+b)}}{\sigma_1 \sigma_2 \sqrt{2\pi}} $$ $$ z = z_1 + z_2 $$
We consider the ranges $$ -10\pi \le x \le 10\pi\ ,\ -10\pi \le y \le 10\pi $$
Generate a list of N random values for each of x and y over the above range. For testing you can use N=100000 for compiled languages and N=10000 for Python. Be sure to measure the time. Use appropriate MPI gathers to obtain not just the maximum value of $z$ from each process, but the corresponding $x$ and $y$ values. Remember that gather collects items in strict rank order. Use one or more built-ins for your language (maxloc for Fortran, np.argmax for Python) or loops (C/C++) to find the $x$, $y$, and $z$ for the maximum.
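A minimal mpi4py sketch of the gather-and-select step follows; it vectorizes the sampling with NumPy rather than looping, and the sample count and variable names are illustrative.
from mpi4py import MPI
import numpy as np

def surface(x, y):
    # the surface defined above, written with NumPy so it accepts whole arrays
    mu1, mu2 = np.sqrt(2.0), np.sqrt(np.pi)
    sig1, sig2 = 3.1, 1.4
    z1 = 0.1*np.sin(x)*np.sin(x*y)
    a = (x - mu1)**2/(2*sig1**2)
    b = (y - mu2)**2/(2*sig2**2)
    return z1 + np.exp(-(a + b))/(sig1*sig2*np.sqrt(2.0*np.pi))

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

rng = np.random.default_rng()            # fresh entropy, so each rank gets its own stream
N = 10000                                # samples per rank (illustrative)
x = rng.uniform(-10*np.pi, 10*np.pi, N)
y = rng.uniform(-10*np.pi, 10*np.pi, N)
z = surface(x, y)
k = np.argmax(z)                         # this rank's best sample
my_best = np.array([x[k], y[k], z[k]])   # pack x, y, and z together
all_best = np.empty((nprocs, 3)) if rank == 0 else None
comm.Gather(my_best, all_best, root=0)   # rows arrive in strict rank order
if rank == 0:
    winner = all_best[np.argmax(all_best[:, 2])]
    print(f"max z = {winner[2]:.4f} at x,y = {winner[0]:.4f}, {winner[1]:.4f}")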
Serial Codes
C++
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <random>
#include <cmath>
#include <cstdlib>
#include <ctime>
using namespace std;
#define urand ((double)rand()/(double)RAND_MAX)
float surface(float x, float y) {
float pi=4.0*atan(1.0);
float mu1, mu2, sig1, sig2;
float a, b;
float z1, z2;
mu1=sqrt(2.0);
mu2=sqrt(pi);
sig1=3.1;
sig2=1.4;
z1=0.1*sin(x)*sin(x*y);
a=pow((x-mu1),2)/(2*pow(sig1,2));
b=pow((y-mu2),2)/(2*pow(sig2,2));
z2=exp(-(a+b))/(sig1*sig2*sqrt(2.0*pi));
return z1+z2;
}
int main(int argc, char **argv) {
float xval, yval, zval;
int nsamps;
if ( argc != 2 ) {
cout <<"Usage: number of samples to test\n";
return 1;
}
else {
nsamps=stoi(argv[1]);
}
// Define the parameters to test
float pi=4.0*atan(1.0);
float xlo=-10.*pi;
float xhi=10.*pi;
float ylo=-10.*pi;
float yhi=10.*pi;
srand(time(0));
float xmax=0.;
float ymax=0.;
float zmax=0.;
for (int i=0;i<nsamps;++i) {
for (int j=0;j<nsamps;++j) {
xval=xlo+(xhi-xlo)*urand;
yval=ylo+(yhi-ylo)*urand;
zval=surface(xval,yval);
if (zval>zmax) {
zmax=zval;
xmax=xval;
ymax=yval;
}
}
}
cout<<"Result is "<<zmax<<" at "<<xmax<<","<<ymax<<endl;
}
Fortran
module random
implicit none
! Author: Katherine Holcomb
! Shortened version of a longer module
! Module incorporated into this file for convenience; typically it would be in its
! own file.
! Comment out one or the other when the module is incorporated into a code.
! Single precision
integer, parameter :: rk = kind(1.0)
! Double precision
!integer, parameter :: rk = kind(1.0d0)
contains
subroutine set_random_seed(seed)
! Sets all elements of the seed array
integer, optional, intent(in) :: seed
integer :: isize
integer, dimension(:), allocatable :: iseed
integer, dimension(8) :: idate
integer :: icount, i
call random_seed(size=isize)
if ( .not. present(seed) ) then
call system_clock(icount)
allocate(iseed(isize),source=icount+370*[(i,i=0,isize-1)])
else
allocate(iseed(isize))
iseed=seed
endif
call random_seed(put=iseed)
end subroutine set_random_seed
function urand(lb,ub,seed)
! Returns a uniformly-distributed random number in the range lb to ub.
real(rk) :: urand
real(rk), optional, intent(in) :: lb,ub
real(rk), optional, intent(in) :: seed
integer :: iseed
real(rk) :: rnd
real(rk) :: lower,upper
if ( present(seed) ) then
iseed=int(seed)
call set_random_seed(iseed)
endif
if ( present(lb) ) then
lower=lb
else
lower=0.0_rk
endif
if ( present(ub) ) then
upper = ub
else
upper = 1.0_rk
endif
call random_number(rnd)
urand = lower+(upper-lower)*rnd
return
end function urand
end module
program find_max
use random
implicit none
real :: pi=4.0*atan(1.0)
real :: xlo, xhi, ylo, yhi
real :: zval, zmax
real xval, yval
real xmax, ymax
integer :: i, j, nargs
integer :: nsamps
integer :: max_xind, max_yind
character(len=12) :: n
interface
real function surface(x,y)
implicit none
real, intent(in) :: x, y
end function
end interface
nargs=command_argument_count()
if ( nargs .ne. 1 ) then
stop "Usage: number of samples to test"
else
call get_command_argument(1,n)
read(n,'(i12)') nsamps
endif
call set_random_seed()
! Define the parameters to test
xlo=-10.*pi; xhi=10.*pi
ylo=-10.*pi; yhi=10.*pi
!We would usually have a lot of x,y points so cannot create z as a function
!of x and y, so have to use a loop rather than intrinsics
max_xind=1
max_yind=1
xmax=0.
ymax=0.
zmax=0.
do i=1,nsamps
do j=1,nsamps
xval=urand(xlo,xhi)
yval=urand(ylo,yhi)
zval=surface(xval,yval)
if (zval>zmax) then
zmax=zval
xmax=xval
ymax=yval
endif
enddo
enddo
write(*,'(a,f0.6,a,f0.6,a,f0.6)') "Result is ",zmax," at x,y=",xmax,", ",ymax
end program
real function surface(x,y)
implicit none
real, intent(in) :: x, y
real :: pi=4.0*atan(1.0)
real :: mu1, mu2, sig1, sig2
real :: a, b
real :: z1, z2
mu1=sqrt(2.0)
mu2=sqrt(pi)
sig1=3.1
sig2=1.4
z1=0.1*sin(x)*sin(x*y)
a=(x-mu1)**2/(2*sig1**2)
b=(y-mu2)**2/(2*sig2**2)
z2=exp(-(a+b))/(sig1*sig2*sqrt(2.0*pi))
surface=z1+z2
end function
Python
import sys
import numpy as np
def surface(x,y):
"""This is the main processing function. We use a ufunc."""
mu1=np.sqrt(2.0)
mu2=np.sqrt(np.pi)
sig1=3.1
sig2=1.4
z1=0.1*np.sin(x)*np.sin(x*y)
a=(x-mu1)**2/(2*sig1**2)
b=(y-mu2)**2/(2*sig2**2)
z2=np.exp(-(a+b))/(sig1*sig2*np.sqrt(2.0*np.pi))
z=z1+z2
return z
if __name__ == '__main__':
if len(sys.argv)==2:
nsamps=int(float(sys.argv[1]))
else:
print("Usage: number of samples")
exit()
# Define the parameters to test
xlo=-10.*np.pi; xhi=10.*np.pi
ylo=-10.*np.pi; yhi=10.*np.pi
#We would usually have a lot of x,y points so cannot create z as a function
#of x and y, so have to use a loop
zmax=0.
for i in range(nsamps):
for j in range(nsamps):
xval=np.random.uniform(xlo,xhi)
yval=np.random.uniform(ylo,yhi)
zval=surface(xval,yval)
if zval>zmax:
zmax=zval
xmax=xval
ymax=yval
max_xind=i
max_yind=j
print(f"Result is {zmax:.4f} at x,y= {xmax:.4f},{ymax:.4f} ")
Example Solutions
C++
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <random>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include "mpi.h"
using namespace std;
#define urand ((double)rand()/(double)RAND_MAX)
float surface(float x, float y) {
float pi=4.0*atan(1.0);
float mu1, mu2, sig1, sig2;
float a, b;
float z1, z2;
mu1=sqrt(2.0);
mu2=sqrt(pi);
sig1=3.1;
sig2=1.4;
z1=0.1*sin(x)*sin(x*y);
a=pow((x-mu1),2)/(2*pow(sig1,2));
b=pow((y-mu2),2)/(2*pow(sig2,2));
z2=exp(-(a+b))/(sig1*sig2*sqrt(2.0*pi));
return z1+z2;
}
int main(int argc, char **argv) {
float xval, yval, zval;
int nprocs, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int nsamps;
bool shutdown=false;
if (rank==0) {
if ( argc != 2 ) {
cout <<"Usage: number of samples to test\n";
shutdown=true;
}
else {
nsamps=stoi(argv[1]);
}
}
MPI_Bcast(&shutdown,1,MPI_CXX_BOOL,0,MPI_COMM_WORLD);
if (shutdown) {
MPI_Finalize();
exit(1);
}
MPI_Bcast(&nsamps,1,MPI_INT,0,MPI_COMM_WORLD);
// Define the parameters to test
float pi=4.0*atan(1.0);
float xlo=-10.*pi;
float xhi=10.*pi;
float ylo=-10.*pi;
float yhi=10.*pi;
srand(time(0)+rank);
float xmax=0.;
float ymax=0.;
float zmax=0.;
for (int i=0;i<nsamps;++i) {
for (int j=0;j<nsamps;++j) {
xval=xlo+(xhi-xlo)*urand;
yval=ylo+(yhi-ylo)*urand;
zval=surface(xval,yval);
if (zval>zmax) {
zmax=zval;
xmax=xval;
ymax=yval;
}
}
}
float *xresult=new float[nprocs];
float *yresult=new float[nprocs];
float *zresult=new float[nprocs];
MPI_Gather(&xmax,1,MPI_FLOAT,xresult,1,MPI_FLOAT,0,MPI_COMM_WORLD);
MPI_Gather(&ymax,1,MPI_FLOAT,yresult,1,MPI_FLOAT,0,MPI_COMM_WORLD);
MPI_Gather(&zmax,1,MPI_FLOAT,zresult,1,MPI_FLOAT,0,MPI_COMM_WORLD);
if (rank==0) {
float g_zmax=zresult[0];
float g_xmax=xresult[0];
float g_ymax=yresult[0];
for (int i=1;i<nprocs;++i) {
if (zresult[i] > g_zmax) {
g_zmax=zresult[i];
g_xmax=xresult[i];
g_ymax=yresult[i];
}
}
cout<<"Result is "<<g_zmax<<" at x,y="<<g_xmax<<", "<<g_ymax<<endl;
}
MPI_Finalize();
}
Fortran
module random
implicit none
! Author: Katherine Holcomb
! Shortened version of a longer module
! Module incorporated into this file for convenience; typically it would be in its
! own file.
! Comment out one or the other when the module is incorporated into a code.
! Single precision
integer, parameter :: rk = kind(1.0)
! Double precision
!integer, parameter :: rk = kind(1.0d0)
contains
subroutine set_random_seed(seed)
! Sets all elements of the seed array
integer, optional, intent(in) :: seed
integer :: isize
integer, dimension(:), allocatable :: iseed
integer, dimension(8) :: idate
integer :: icount, i
call random_seed(size=isize)
if ( .not. present(seed) ) then
call system_clock(icount)
allocate(iseed(isize),source=icount+370*[(i,i=0,isize-1)])
else
allocate(iseed(isize))
iseed=seed
endif
call random_seed(put=iseed)
end subroutine set_random_seed
function urand(lb,ub,seed)
! Returns a uniformly-distributed random number in the range lb to ub.
real(rk) :: urand
real(rk), optional, intent(in) :: lb,ub
real(rk), optional, intent(in) :: seed
integer :: iseed
real(rk) :: rnd
real(rk) :: lower,upper
if ( present(seed) ) then
iseed=int(seed)
call set_random_seed(iseed)
endif
if ( present(lb) ) then
lower=lb
else
lower=0.0_rk
endif
if ( present(ub) ) then
upper = ub
else
upper = 1.0_rk
endif
call random_number(rnd)
urand = lower+(upper-lower)*rnd
return
end function urand
end module
program find_max
use random
use mpi
implicit none
real :: pi=4.0*atan(1.0)
real :: xlo, xhi, ylo, yhi
real :: my_xlo, my_xhi, my_ylo, my_yhi
real :: zval, zmax
real :: xval, xincr, xmax
real :: yval, yincr, ymax
real, dimension(:), allocatable :: xvals, yvals
real, dimension(:,:), allocatable :: zvals
integer :: i, j, nargs
integer :: nsamps
character(len=12) :: n
integer :: rank, nprocs, ierr
integer, dimension(:), allocatable:: seed
integer :: iseed, icount
integer, dimension(:), allocatable:: maxinds
real, dimension(:), allocatable:: xresult,yresult,zresult
real :: g_zmax
logical :: shutdown=.false.
interface
real function surface(x,y)
implicit none
real, intent(in) :: x, y
end function
end interface
call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD,rank,ierr)
call MPI_Comm_size(MPI_COMM_WORLD,nprocs,ierr)
if (rank==0) then
nargs=command_argument_count()
if ( nargs .ne. 1 ) then
shutdown=.true.
write(*,*) "Usage: number of samples to test"
else
call get_command_argument(1,n)
read(n,'(i12)') nsamps
endif
endif
call MPI_Bcast(shutdown,1,MPI_LOGICAL,0,MPI_COMM_WORLD,ierr)
if (shutdown) then
call MPI_Finalize(ierr)
stop
endif
call MPI_Bcast(nsamps,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
! Define the parameters to test
xlo=-10.*pi; xhi=10.*pi
ylo=-10.*pi; yhi=10.*pi
!For correct statistical properties, we set random seeds for each rank
call system_clock(icount)
iseed=icount+rank
call set_random_seed(iseed)
xmax=0.
ymax=0.
zmax=0.
do i=1,nsamps
do j=1,nsamps
xval=urand(xlo,xhi)
yval=urand(ylo,yhi)
zval=surface(xval,yval)
if (zval>zmax) then
zmax=zval
xmax=xval
ymax=yval
endif
enddo
enddo
! If not allgather, only root needs to allocate
if (rank==0) then
allocate(maxinds(nprocs))
allocate(xresult(nprocs),yresult(nprocs),zresult(nprocs))
endif
call MPI_Gather(xmax,1,MPI_REAL,xresult,1,MPI_REAL,0,MPI_COMM_WORLD,ierr)
call MPI_Gather(ymax,1,MPI_REAL,yresult,1,MPI_REAL,0,MPI_COMM_WORLD,ierr)
call MPI_Gather(zmax,1,MPI_REAL,zresult,1,MPI_REAL,0,MPI_COMM_WORLD,ierr)
if (rank==0) then
g_zmax=maxval(zresult)
maxinds=maxloc(zresult)
write(*,'(a,f0.6,a,f0.6,a,f0.6)') &
"Result is ",g_zmax," at x,y=",xresult(maxinds(1)),", ",yresult(maxinds(1))
endif
call MPI_Finalize(ierr)
end program
real function surface(x,y)
implicit none
real, intent(in) :: x, y
real :: pi=4.0*atan(1.0)
real :: mu1, mu2, sig1, sig2
real :: a, b
real :: z1, z2
mu1=sqrt(2.0)
mu2=sqrt(pi)
sig1=3.1
sig2=1.4
z1=0.1*sin(x)*sin(x*y)
a=(x-mu1)**2/(2*sig1**2)
b=(y-mu2)**2/(2*sig2**2)
z2=exp(-(a+b))/(sig1*sig2*sqrt(2.0*pi))
surface=z1+z2
end function
Python
from mpi4py import MPI
import numpy as np
import sys
def surface(x,y):
"""This is the main processing function. We use a ufunc for MPI."""
mu1=np.sqrt(2.0)
mu2=np.sqrt(np.pi)
sig1=3.1
sig2=1.4
z1=0.1*np.sin(x)*np.sin(x*y)
a=(x-mu1)**2/(2*sig1**2)
b=(y-mu2)**2/(2*sig2**2)
z2=np.exp(-(a+b))/(sig1*sig2*np.sqrt(2.0*np.pi))
z=z1+z2
return z
if __name__ == '__main__':
comm=MPI.COMM_WORLD
rank=comm.Get_rank()
nprocs=comm.Get_size()
shutdown=None
nsamps=0
if rank==0:
if len(sys.argv)==2:
nsamps=int(float(sys.argv[1]))
shutdown=False
else:
print("Usage: number of samples")
shutdown=True
shutdown=comm.bcast(shutdown,root=0)
if shutdown:
MPI.Finalize()
exit()
nsamps=comm.bcast(nsamps,root=0)
#Fairly crude effort to get different seeds on different processes
rng = np.random.default_rng()
my_nsamps=nsamps//nprocs
if nsamps%nprocs !=0:
#somewhat dopey load-balancing
remainder=nsamps%nprocs
for n in range(remainder):
if rank==n:
my_nsamps+=1
# Define the parameters to test
xlo=-10.*np.pi; xhi=10.*np.pi
ylo=-10.*np.pi; yhi=10.*np.pi
tic=MPI.Wtime()
#We would usually have a lot of x,y points so cannot create z as a function
#of x and y, so have to use a loop
zmax=0.
    for i in range(my_nsamps):
for j in range(nsamps):
xval=rng.uniform(xlo,xhi)
yval=rng.uniform(ylo,yhi)
zval=surface(xval,yval)
if zval>zmax:
zmax=zval
xmax=xval
ymax=yval
max_xind=i
max_yind=j
my_xresult=np.array([xmax])
my_yresult=np.array([ymax])
my_zresult=np.array([zmax])
xresult=np.empty(nprocs)
yresult=np.empty(nprocs)
zresult=np.empty(nprocs)
    comm.Gather([my_xresult,MPI.DOUBLE],[xresult,MPI.DOUBLE])
    comm.Gather([my_yresult,MPI.DOUBLE],[yresult,MPI.DOUBLE])
    comm.Gather([my_zresult,MPI.DOUBLE],[zresult,MPI.DOUBLE])
toc=MPI.Wtime()
if rank==0:
g_zmax=zresult.max()
maxind=np.argmax(zresult)
g_xmax=xresult[maxind]
g_ymax=yresult[maxind]
print(f"Result is {g_zmax:.4f} at x,y= {g_xmax:.4f},{g_ymax:.4f} ")
MPI.Finalize()