C: Assignments Day 4

Today we have two problems for you to tackle. They both parallelize the pi.c code you developed for day 1. Both programs will need to be compiled at one of the TACC supercomputers.

The figure below shows a method to compute pi by numerical integration. We would like you to implement that computation in a C program.

../_images/pi.png

Computation of pi numerically

The solution pi.c can be found on github. The contents of that file are presented here:

 1#include <stdio.h>
 2#include <time.h>
 3#include <math.h>
 4
 5static long int numSteps = 1000000000;
 6
 7// Approximates pi as the midpoint-rule integral of 4/(1+x^2) over [0,1]
 8// and prints it together with its difference from M_PI.
 9int main() {
10
11  // width of one integration strip
12  const double width = 1./numSteps;
13
14  // accumulate the integrand at successive strip midpoints
15  double total = 0;
16  double mid   = width*0.50;
17  int step = 0;
18  while (step < numSteps) {
19    total += 4./(1.+mid*mid);
20    mid += width;  // each midpoint is derived from the previous one
21    step++;
22  }
23
24  // scale the accumulated heights by the strip width
25  double pi = total*width;
26
27  printf("PI = %16.14f Difference from math.h definition %16.14f \n",pi, pi-M_PI);
28  return 0;
29}

Note

  1. When compiling at TACC if you wish to use gcc as I have done, issue the following command when you login.

module load gcc
  2. When building and testing that the application works, use idev, as I have been showing in the videos.

  3. When launching the job to test the performance you will need to use sbatch and place your job in the queue. To do this you need to create a script that will be launched when the job runs. I have placed two scripts in each of the file folders. The script informs the system how many nodes and cores per node to use, what the expected run time is, and how to run the job. Once the executable exists, the job is launched using the following command issued from a login node:

sbatch submit.sh

Full documentation on submitting scripts for OpenMP and MPI can be found online at TACC

Warning

Our solution of pi.c as written has a loop dependency (each value of x is computed from the previous one), which you may need to revise for the second problem.

Problem 1: Parallelize using MPI

You are to modify the pi.c application so that it runs using MPI. I have included a few files in code/parallel/ExercisesDay4/ex1 to help you. They include pi.c above, gather1.c and a submit.sh script. gather1.c was presented in the video, and is shown below:

 1#include <mpi.h>
 2#include <stdio.h>
 3#include <stdlib.h>
 4#define LUMP 5
 5
 6// Every rank fills LUMP ints with procID*10+i; rank 0 gathers the
 7// values from all ranks into one array and prints them.
 8int main(int argc, char **argv) {
 9
10  int numP, procID;
11
12  // the usual mpi initialization
13  MPI_Init(&argc, &argv);
14  MPI_Comm_size(MPI_COMM_WORLD, &numP);
15  MPI_Comm_rank(MPI_COMM_WORLD, &procID);
16
17  int *globalData=NULL;
18  int localData[LUMP];
19
20  // process 0 is only 1 that needs global data
21  if (procID == 0) {
22    // calloc hands back the buffer already zero-filled
23    globalData = calloc((size_t)LUMP * numP, sizeof *globalData);
24    if (globalData == NULL) {
25      fprintf(stderr, "unable to allocate gather buffer for %d processes\n", numP);
26      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
27      return EXIT_FAILURE;
28    }
29  }
30
31  for (int i=0; i<LUMP; i++)
32    localData[i] = procID*10+i;
33
34  MPI_Gather(localData, LUMP, MPI_INT, globalData, LUMP, MPI_INT, 0, MPI_COMM_WORLD);
35
36  if (procID == 0) {
37    for (int i=0; i<numP*LUMP; i++)
38      printf("%d ", globalData[i]);
39    printf("\n");
40  }
41
42  // globalData is NULL on every rank but 0, and free(NULL) does nothing
43  free(globalData);
44
45  MPI_Finalize();
46  return 0;
47}

The submit script is as shown below.

 1#!/bin/bash
 2#--------------------------------------------------------------------
 3# SLURM batch script for running the MPI pi executable
 4#
 5# The directives below request 1 node (-N 1) and 4 tasks in total (-n 4)
 6# with an expected run time of 2 minutes (-t 00:02:00).
 7#---------------------------------------------------------------------
 8#SBATCH -J myjob
 9#SBATCH -o myjob.%j.out 
10#SBATCH -e myjob.%j.err 
11#SBATCH -p development
12#SBATCH -N 1
13#SBATCH -n 4
14#SBATCH -t 00:02:00
15#SBATCH -A DesignSafe-SimCenter
16# start ./pi through ibrun
17ibrun ./pi
18
19

One possible solution, which includes multiple approaches, is as shown in the following:

 1#include <mpi.h>
 2#include <stdio.h>
 3#include <stdlib.h>
 4#include <time.h>
 5
 6static int long numSteps = 1000000000;
 7
 8// Midpoint-rule estimate of pi with MPI: each rank sums every numP-th
 9// strip, rank 0 gathers the partial sums, totals them and prints the
10// result together with the elapsed wall-clock time.
11int main(int argc, char **argv) {
12
13  int numP, pid;
14
15  //
16  // the usual mpi initialization
17  //
18
19  MPI_Init(&argc, &argv);
20  MPI_Comm_size(MPI_COMM_WORLD, &numP);
21  MPI_Comm_rank(MPI_COMM_WORLD, &pid);
22
23  //
24  // start timer (MPI_Wtime is wall-clock time; clock() only counts CPU time)
25  //
26
27  double startTime = MPI_Wtime();
28
29  //
30  // init some variable
31  //
32
33  double pi = 0;
34  double dx = 1.0/(double) numSteps;
35
36  //
37  // compute processors contribution to pi
38  //
39
40  // long index so the counter has the same range as numSteps
41  for (long i=pid; i<numSteps; i+=numP) {
42    double x = (i+0.5)*dx;
43    pi += 4./(1.+x*x);
44  }
45  pi *= dx;
46
47  //
48  // gather contributions on P0 & sum
49  //
50
51  double *globalSum = NULL;  // receive buffer, only needed on P0
52  if (pid == 0) {
53    globalSum = malloc((size_t)numP * sizeof *globalSum);
54    if (globalSum == NULL) {
55      fprintf(stderr, "unable to allocate gather buffer for %d processes\n", numP);
56      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
57      return EXIT_FAILURE;
58    }
59  }
60
61  MPI_Gather(&pi, 1, MPI_DOUBLE, globalSum, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
62
63  if (pid == 0) {
64    for (int i=1; i<numP; i++) // start at 1: pi already holds P0's contribution
65      pi += globalSum[i];
66  }
67
68  free(globalSum);  // NULL on the other ranks, where free() does nothing
69
70  //
71  // end timer
72  //
73
74  double duration = MPI_Wtime() - startTime;
75
76  if (pid == 0)
77    printf("PI = %16.8f, duration: %f s\n",pi, duration);
78
79  //
80  // usual termination for MPI
81  //
82
83  MPI_Finalize();
84  return 0;
85}

Problem 2: Parallelize using OpenMP

You are to modify the pi.c application so that it runs using OpenMP. I have included a few files in code/parallel/ExercisesDay4/ex1 to help you. They include pi.c above and a submitPI.sh script. submitPI.sh is as shown:

One possible solution, which includes multiple approaches, is as shown in the following:

  1
  2//
  3// file to compute pi numerically using a number of different approaches
  4//   - demonstrates various OpenMP approaches
  5
  6#include <omp.h>
  7#include <stdio.h>
  8#include <time.h>
  9
 10static int long numSteps = 1000000000;
 11
 12int main() {
 13
 14  // perform calculation
 15  double pi   = 0;
 16  double dx = 1./numSteps;
 17  double x = 0.5*x;
 18
 19  //
 20  // compute Serially
 21  //
 22
 23  double start = omp_get_wtime();  
 24  {
 25    pi = 0;
 26    double sum = 0;
 27    double x  = dx*0.50;
 28    for (int i=0; i<numSteps; i++) {
 29     pi += 4./(1.+x*x);
 30      x += dx;
 31    }
 32    pi*=dx;
 33  }
 34
 35  printf("Serial: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);
 36
 37  //  
 38  // Compute in Parallel with false sharing issue
 39  //
 40
 41  start = omp_get_wtime();  
 42  int nThreads;
 43  double pSum[32];
 44  for (int i=0; i<32; i++) 
 45    pSum[i] = 0;
 46
 47#pragma omp parallel 
 48  {
 49    int tid = omp_get_thread_num();
 50    int numT = omp_get_num_threads();
 51    if (tid == 0)
 52      nThreads = numT;
 53    
 54    for (int i=tid; i<numSteps; i+=numT) {
 55      double x = (i+0.5)*dx;  
 56      pSum[tid] += 4./(1.+x*x);  // line with false sharing issue
 57    }
 58  }  
 59
 60  pi = 0;
 61  for (int i=0; i<nThreads; i++) {
 62    pi += pSum[i];
 63  }
 64  pi *= dx;
 65
 66  printf("\nParallel Results: %d Threads\n",nThreads);
 67  printf("Basic False Sharing: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);  
 68
 69
 70  //
 71  // Basic with padded array to remove false sharing
 72  //  
 73  
 74  start = omp_get_wtime();
 75
 76  double padSum[32][64];
 77  for (int i=0; i<nThreads; i++) 
 78    padSum[i][0] = 0;
 79
 80#pragma omp parallel 
 81  {
 82    int tid = omp_get_thread_num();
 83    int numT = omp_get_num_threads();
 84    if (tid == 0)
 85      nThreads = numT;
 86    
 87    for (int i=tid; i<numSteps; i+=numT) {
 88      double x = (i+0.5)*dx;  
 89      padSum[tid][0] += 4./(1.+x*x);  // padSum .. now no longer assesing
 90                                      //   array values next to each other
 91    }
 92  }  
 93
 94  pi = 0;
 95  for (int i=0; i<nThreads; i++) {
 96    pi += padSum[i][0];
 97  }
 98  pi *= dx;
 99
100  printf("Fix Previous with array padding: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);  
101
102
103  //
104  // Demonstration #omp parallel for reduction
105  //     
106  
107  start = omp_get_wtime();
108
109#pragma omp parallel for reduction(+:pi) private(x)
110  for (int i=0; i<numSteps; i++) {
111    double x = (i+0.5)*dx;  
112    pi += 4./(1.+x*x);
113  }
114  
115  pi *= dx;
116  
117  printf("Reduction: PI = %16.8f in %.4g sec\n", pi,omp_get_wtime()-start);
118
119  //
120  // Replace Reduction with Synchronization section: critical
121  //
122  
123  start = omp_get_wtime();  
124#pragma omp parallel 
125  {
126    double sum = 0;
127    double x = 0;
128#pragma omp for
129    for (int i=0; i<numSteps; i++) {
130      x = (i+0.5)*dx;
131      sum += 4./(1.+x*x);
132    }
133#pragma omp critical
134    {
135      pi += sum;
136      // OTHER STUFF IF YOU WANT .. NOT TOO MUCH
137    }
138  }
139  
140  pi *= dx;
141  
142  printf("Synchronization: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);
143
144  return 0;
145}