C: Assignments Day 4¶
Today we have two problems for you to tackle. Both parallelize the pi.c code you developed for day 1, and both programs will need to be compiled on one of the TACC supercomputers.
The figure below shows a method to compute pi by numerical integration. We would like you to implement that computation in a C program.
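In other words, the computation approximates the integral below with the midpoint rule, using N = numSteps intervals; this is the sum the loop in pi.c accumulates:

\pi = \int_0^1 \frac{4}{1+x^2}\,dx \approx \sum_{i=0}^{N-1} \frac{4}{1+x_i^2}\,\Delta x,
\qquad x_i = \left(i + 0.5\right)\Delta x, \quad \Delta x = \frac{1}{N}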
The solution pi.c can be found on GitHub. The contents of that file are presented here:
#include <stdio.h>
#include <time.h>
#include <math.h>

static long int numSteps = 1000000000;

int main() {

  // perform calculation
  double pi = 0;
  double dx = 1./numSteps;
  double x = dx*0.50;

  for (int i=0; i<numSteps; i++) {
    pi += 4./(1.+x*x);
    x += dx;
  }

  pi *= dx;

  printf("PI = %16.14f Difference from math.h definition %16.14f \n",pi, pi-M_PI);
  return 0;
}
Note
When compiling at TACC, if you wish to use gcc as I have done, issue the following command when you log in.
module load gcc
When building the application and testing that it works, use idev, as I have been showing in the videos.
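For example, an interactive build-and-test session might look like the following; the queue, task count, and time limit shown here are only suggestions, so adjust them to your allocation:

idev -p development -N 1 -n 4 -t 00:30:00
gcc -O2 -o pi pi.c -lm
./pi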
When launching the job to test the performance, you will need to use sbatch to place your job in the queue. To do this you need to create a script that will be launched when the job runs. I have placed two scripts in each of the file folders. The script informs the system how many nodes and cores per node to use, what the expected run time is, and how to run the job. Once the executable exists, the job is launched using the following command issued from a login node:
sbatch submit.sh
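Once submitted, you can monitor the job with squeue; when it completes, the output appears in the file named by the -o directive in the script (myjob.<jobid>.out):

squeue -u $USER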
Full documentation on submitting scripts for OpenMP and MPI can be found online at TACC.
Warning
Our solution pi.c as written has a loop dependency (each iteration updates x from the previous one), which you may need to revise for the second problem.
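One way to remove that dependency, as the solutions below do, is to compute x directly from the loop index so that every iteration is independent:

for (int i = 0; i < numSteps; i++) {
  double x = (i + 0.5) * dx;   // x depends only on i, not on the previous iteration
  pi += 4. / (1. + x * x);
}
pi *= dx;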
Problem 1: Parallelize using MPI¶
You are to modify the pi.c application to use MPI and run it. I have included a few files in code/parallel/ExercisesDay4/ex1 to help you. They include pi.c above, gather1.c, and a submit.sh script. gather1.c was presented in the video and is shown below:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define LUMP 5

int main(int argc, char **argv) {

  int numP, procID;

  // the usual mpi initialization
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numP);
  MPI_Comm_rank(MPI_COMM_WORLD, &procID);

  int *globalData=NULL;
  int localData[LUMP];

  // process 0 is only 1 that needs global data
  if (procID == 0) {
    globalData = malloc(LUMP * numP * sizeof(int));
    for (int i=0; i<LUMP*numP; i++)
      globalData[i] = 0;
  }

  for (int i=0; i<LUMP; i++)
    localData[i] = procID*10+i;

  MPI_Gather(localData, LUMP, MPI_INT, globalData, LUMP, MPI_INT, 0, MPI_COMM_WORLD);

  if (procID == 0) {
    for (int i=0; i<numP*LUMP; i++)
      printf("%d ", globalData[i]);
    printf("\n");
  }

  if (procID == 0)
    free(globalData);

  MPI_Finalize();
}
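For example, when gather1.c is run with 4 processes, each rank contributes five values (procID*10 + 0 through procID*10 + 4), so process 0 prints:

0 1 2 3 4 10 11 12 13 14 20 21 22 23 24 30 31 32 33 34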
The submit script is as shown below.
#!/bin/bash
#--------------------------------------------------------------------
# Generic SLURM script – MPI Hello World
#
# This script requests 1 node and 4 MPI tasks on that node
# (out of the cores available per node).
#---------------------------------------------------------------------
#SBATCH -J myjob
#SBATCH -o myjob.%j.out
#SBATCH -e myjob.%j.err
#SBATCH -p development
#SBATCH -N 1
#SBATCH -n 4
#SBATCH -t 00:02:00
#SBATCH -A DesignSafe-SimCenter

ibrun ./pi
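Remember to build the MPI executable before submitting the job. The usual way is through the MPI compiler wrapper; the exact wrapper and optimization flags depend on the modules you have loaded, but a typical command is:

mpicc -O2 -o pi pi.c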
One possible solution is shown in the following:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static long int numSteps = 1000000000;

int main(int argc, char **argv) {

  int numP, pid;

  //
  // the usual mpi initialization
  //

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numP);
  MPI_Comm_rank(MPI_COMM_WORLD, &pid);

  //
  // start timer
  //

  clock_t start_t = clock();

  //
  // init some variables
  //

  double pi = 0;
  double dx = 1.0/(double) numSteps;

  //
  // compute this processor's contribution to pi
  //

  for (int i=pid; i<numSteps; i+=numP) {
    double x = (i+0.5)*dx;
    pi += 4./(1.+x*x);
  }
  pi *= dx;

  //
  // gather contributions on P0 & sum
  //

  double *globalSum = NULL;
  if (pid == 0) {
    globalSum = (double *)malloc(numP * sizeof(double));
  }

  MPI_Gather(&pi, 1, MPI_DOUBLE, globalSum, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  if (pid == 0) {
    for (int i=1; i<numP; i++) // start at 1: pi already holds P0's contribution
      pi += globalSum[i];
  }

  if (pid == 0)
    free(globalSum);

  //
  // end timer
  //

  clock_t end_t = clock();
  double time = (double)(end_t - start_t) / CLOCKS_PER_SEC;

  if (pid == 0)
    printf("PI = %16.8f, duration: %f s\n",pi, time);

  //
  // usual termination for MPI
  //

  MPI_Finalize();
  return 0;
}
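As an aside, the gather-and-sum step above can also be written as a single collective reduction. A minimal sketch, using the same variable names as the solution, which would replace the MPI_Gather call, the summing loop, and the globalSum allocation:

// sum the per-rank contributions (each already multiplied by dx) on process 0
double globalPi = 0;
MPI_Reduce(&pi, &globalPi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (pid == 0)
  printf("PI = %16.8f\n", globalPi);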
Problem 2: Parallelize using OpenMP¶
You are to modify the pi.c application to use OpenMP and run it. I have included a few files in code/parallel/ExercisesDay4/ex1 to help you. They include pi.c above and a submitPI.sh script.
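The submitPI.sh listing is not reproduced here; a minimal sketch of what an OpenMP SLURM script typically looks like is shown below. The thread count, queue, and time limit are assumptions; check the script in the folder for the actual settings.

#!/bin/bash
#SBATCH -J myjob
#SBATCH -o myjob.%j.out
#SBATCH -e myjob.%j.err
#SBATCH -p development
#SBATCH -N 1                  # one node
#SBATCH -n 1                  # one task; the OpenMP threads run inside it
#SBATCH -t 00:02:00
#SBATCH -A DesignSafe-SimCenter

export OMP_NUM_THREADS=8      # number of OpenMP threads to use
./pi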
One possible solution, which demonstrates several approaches, is shown in the following:
//
// file to compute pi numerically using a number of different approaches
//   - demonstrates various OpenMP approaches
//

#include <omp.h>
#include <stdio.h>
#include <time.h>

static long int numSteps = 1000000000;

int main() {

  // perform calculation
  double pi = 0;
  double dx = 1./numSteps;
  double x = dx*0.50;

  //
  // compute Serially
  //

  double start = omp_get_wtime();
  {
    pi = 0;
    double x = dx*0.50;
    for (int i=0; i<numSteps; i++) {
      pi += 4./(1.+x*x);
      x += dx;
    }
    pi *= dx;
  }

  printf("Serial: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);

  //
  // Compute in Parallel with false sharing issue
  //

  start = omp_get_wtime();
  int nThreads;
  double pSum[32];              // assumes no more than 32 threads
  for (int i=0; i<32; i++)
    pSum[i] = 0;

#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int numT = omp_get_num_threads();
    if (tid == 0)
      nThreads = numT;

    for (int i=tid; i<numSteps; i+=numT) {
      double x = (i+0.5)*dx;
      pSum[tid] += 4./(1.+x*x);  // line with false sharing issue
    }
  }

  pi = 0;
  for (int i=0; i<nThreads; i++) {
    pi += pSum[i];
  }
  pi *= dx;

  printf("\nParallel Results: %d Threads\n",nThreads);
  printf("Basic False Sharing: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);

  //
  // Basic with padded array to remove false sharing
  //

  start = omp_get_wtime();

  double padSum[32][64];        // each thread's sum sits on its own cache line
  for (int i=0; i<nThreads; i++)
    padSum[i][0] = 0;

#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int numT = omp_get_num_threads();
    if (tid == 0)
      nThreads = numT;

    for (int i=tid; i<numSteps; i+=numT) {
      double x = (i+0.5)*dx;
      padSum[tid][0] += 4./(1.+x*x);  // padSum: threads no longer access
                                      // array values next to each other
    }
  }

  pi = 0;
  for (int i=0; i<nThreads; i++) {
    pi += padSum[i][0];
  }
  pi *= dx;

  printf("Fix Previous with array padding: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);

  //
  // Demonstration #omp parallel for reduction
  //

  start = omp_get_wtime();

  pi = 0;                       // reset the accumulator before the reduction
#pragma omp parallel for reduction(+:pi) private(x)
  for (int i=0; i<numSteps; i++) {
    double x = (i+0.5)*dx;
    pi += 4./(1.+x*x);
  }

  pi *= dx;

  printf("Reduction: PI = %16.8f in %.4g sec\n", pi,omp_get_wtime()-start);

  //
  // Replace Reduction with Synchronization section: critical
  //

  start = omp_get_wtime();
  pi = 0;                       // reset the accumulator before the parallel region
#pragma omp parallel
  {
    double sum = 0;
    double x = 0;
#pragma omp for
    for (int i=0; i<numSteps; i++) {
      x = (i+0.5)*dx;
      sum += 4./(1.+x*x);
    }
#pragma omp critical
    {
      pi += sum;
      // OTHER STUFF IF YOU WANT .. NOT TOO MUCH
    }
  }

  pi *= dx;

  printf("Synchronization: PI = %16.8f in %.4g sec\n",pi, omp_get_wtime()-start);

  return 0;
}
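To build the OpenMP version with gcc, compile with the -fopenmp flag and set the thread count through the OMP_NUM_THREADS environment variable before running (the optimization level and thread count below are just example choices):

module load gcc
gcc -fopenmp -O2 -o pi pi.c
export OMP_NUM_THREADS=8
./pi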