Commit

Cleanup and adjustments L7 & L8
JuhaTonttila committed Aug 30, 2024
1 parent 837e9d2 commit ebdf8da
Showing 13 changed files with 34 additions and 46 deletions.
19 changes: 7 additions & 12 deletions B3/L7/docs/mpi_lec1.md
@@ -62,9 +62,6 @@ lang: en
</div>


- Weak vs strong scaling: Increased parallelism associated with increasing problem size vs faster execution


# Programming models {.section}

# Programming models
@@ -146,19 +143,16 @@ CALL doStuff(a)
# The MPI standard

- Different vendors implementing the standard
- MPICH, openMPI, Cray ...
- MPICH, openMPI, ...
- Portability -- works on a variety of platforms

- Since 1994, latest version of the standard 4.1 (2023)


# The API

- Communcation routines
- Communication routines
- Execution control - synchronization
- Advanced features
- User defined datatypes
- Communication topologies; custom *communicators*
- == A lot of stuff, but meaningful solutions can be built with a handful of tools


@@ -173,7 +167,7 @@ CALL doStuff(a)

![](img/batio.jpeg){.center width=40%}
<p style="text-align:center">We're going to give You the keys to the Lamborghini</p>
<p style="text-align:center">...but you're going to have to drive it!</p>
<p style="text-align:center">...but you're also going to have to drive it!</p>


# MPI programming {.section}
@@ -279,7 +273,7 @@ MPI_Comm_rank(`comm`{.input}, `rank`{.output}, `err`{.output})
- MPI communicator

- An object connecting a group of processes
- **MPI_COMM_WORLD** (includes all tasks in the invoked program)
- **MPI_COMM_WORLD** (includes all processes in the invoked program)
- Carry information about the number of processes and process ranks
- Possible to define custom communicators for other purposes
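
For reference, a minimal sketch (not part of this commit) showing how `MPI_Comm_size` and `MPI_Comm_rank` are typically used with **MPI_COMM_WORLD**, assuming the Fortran `mpi` module:

```fortran
! Minimal sketch: query the size of MPI_COMM_WORLD and this process' rank
PROGRAM rank_query
  USE mpi
  IMPLICIT NONE
  INTEGER :: rank, ntasks, err

  CALL MPI_INIT(err)
  CALL MPI_COMM_SIZE(MPI_COMM_WORLD, ntasks, err)   ! number of processes in the communicator
  CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, err)     ! this process' id, 0 ... ntasks-1
  WRITE(*,*) 'Hello from rank', rank, 'of', ntasks
  CALL MPI_FINALIZE(err)
END PROGRAM rank_query
```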

@@ -290,7 +284,6 @@ MPI_Comm_rank(`comm`{.input}, `rank`{.output}, `err`{.output})

- Point-to-point communication
- Collective communication
- (One-sided communication)

<br>

@@ -357,7 +350,7 @@ MPI_Recv(`buf`{.input},`count`{.input},`datatype`{.input},`source`{.input},`tag`
ELSE IF (rank == dest) THEN
CALL MPI_RECV(otherdata,msize,MPI_INTEGER,src, &
tag_rcv,MPI_COMM_WORLD,st,err)
tag_rcv,MPI_COMM_WORLD,MPI_STATUS_IGNORE,err)
...
CALL MPI_FINALIZE(err)
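
A self-contained sketch of the same point-to-point pattern as in the snippet above (illustrative only; the ranks, tag, and message size here are arbitrary assumptions, not the exercise code):

```fortran
! Sketch: rank 0 sends an integer array to rank 1 with blocking send/receive
! (run with at least two processes, e.g. srun -n 2)
PROGRAM send_recv_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER, PARAMETER :: msize = 100
  INTEGER :: rank, err
  INTEGER :: mydata(msize), otherdata(msize)

  CALL MPI_INIT(err)
  CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, err)
  mydata = rank

  IF (rank == 0) THEN
     CALL MPI_SEND(mydata, msize, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, err)
  ELSE IF (rank == 1) THEN
     CALL MPI_RECV(otherdata, msize, MPI_INTEGER, 0, 0, &
                   MPI_COMM_WORLD, MPI_STATUS_IGNORE, err)
  END IF

  CALL MPI_FINALIZE(err)
END PROGRAM send_recv_sketch
```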
@@ -404,6 +397,8 @@ MPI_Recv(`buf`{.input},`count`{.input},`datatype`{.input},`source`{.input},`tag`
- `git clone https://github.com/csc-training/esiwace-summerschool-2024` <br>
(or update existing `git pull origin main`)

- `cd B3/L7/exercises/ex_1`

- Part II lecture at ~10:30

- Take breaks :)
6 changes: 4 additions & 2 deletions B3/L7/docs/mpi_lec2.md
@@ -63,7 +63,7 @@ MPI_IRecv(`buf`{.output}, `count`{.input}, `datatype`{.input}, `source`{.input},
# Finalizing non-blocking communication {.split-definition}

MPI_Wait(`request`{.input}, `status`{.output}, `err`{.output})
: type(MPI_Request) `request`{.input}
: integer `request`{.input}
: Handle of the non-blocking communication
: integer `status(MPI_STATUS_SIZE)`{.output}
: Status of the completed communication, same as in **`MPI_Recv`**
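
A minimal sketch of finalizing a non-blocking receive with **`MPI_Wait`**, assuming the Fortran `mpi` module where the request handle is a plain integer (as in the change above); the ranks and message size are arbitrary:

```fortran
! Sketch: post a non-blocking receive, do other work, then wait for completion
PROGRAM nonblocking_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER, PARAMETER :: n = 100
  INTEGER :: rank, err, request
  INTEGER :: buf(n)

  CALL MPI_INIT(err)
  CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, err)

  IF (rank == 1) THEN
     CALL MPI_IRECV(buf, n, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, request, err)
     ! ... computation that does not touch buf can overlap here ...
     CALL MPI_WAIT(request, MPI_STATUS_IGNORE, err)   ! buf is valid only after the wait
  ELSE IF (rank == 0) THEN
     buf = 42
     CALL MPI_SEND(buf, n, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, err)
  END IF

  CALL MPI_FINALIZE(err)
END PROGRAM nonblocking_sketch
```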
@@ -105,7 +105,7 @@ MPI_Test(`request`{.input}, `flag`{.output}, `status`{.output}, `err`{.output})
- Various types
- Data distribution
- Collective computation -- reduction operations
- Synchronization
- Synchronization; `MPI_Barrier`
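
As an illustration of the collective-computation and synchronization bullets above, a minimal reduction sketch (not from the slides; summing onto rank 0 with `MPI_SUM` is just one possible operation):

```fortran
! Sketch: sum one partial result per process onto rank 0
PROGRAM reduce_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER :: rank, err
  REAL :: partial, total

  CALL MPI_INIT(err)
  CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, err)

  partial = REAL(rank)   ! stand-in for a locally computed partial result
  CALL MPI_REDUCE(partial, total, 1, MPI_REAL, MPI_SUM, 0, MPI_COMM_WORLD, err)
  IF (rank == 0) WRITE(*,*) 'Sum over all ranks:', total

  CALL MPI_BARRIER(MPI_COMM_WORLD, err)   ! explicit synchronization point
  CALL MPI_FINALIZE(err)
END PROGRAM reduce_sketch
```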



@@ -497,6 +497,8 @@ MPI_Alltoall(`sendbuf`{.input}, `sendcount`{.input}, `sendtype`{.input}, `recvbu
- `git clone https://github.com/csc-training/esiwace-summerschool-2024` <br>
(or update existing `git pull origin main`)

- `cd B3/L7/exercises/ex_2`

- Lunch ~12:00

- Take breaks :)
Binary file removed B3/L7/exercise/ex_1/01_hello-world/.README.md.swp
Binary file not shown.
Binary file removed B3/L7/exercise/ex_1/04_parallel-pi/.README.md.swp
Binary file not shown.
4 changes: 2 additions & 2 deletions B3/L7/exercise/scripts/allocate_job.sh
@@ -1,4 +1,4 @@
#!/bin/sh

salloc --ntasks=4 --account=<project id> --partition=<partition> --qos=esiwace --time=00:10:00
#salloc --ntasks=1 --cpus-per-task=4 --account=<project id> --partition=<partition> --qos=esiwace --time=00:10:00
salloc --ntasks=4 --account=bb1153 --partition=shared --qos=esiwace --time=00:10:00
#salloc --ntasks=1 --cpus-per-task=4 --account=bb1153 --partition=shared --qos=esiwace --time=00:10:00
7 changes: 4 additions & 3 deletions B3/L7/exercise/scripts/batch_job.sh
@@ -1,12 +1,13 @@
#!/bin/sh

#SBATCH --job-name=hello
#SBATCH --partition=<partition>
#SBATCH --account=<project>
#SBATCH --partition=shared
#SBATCH --account=bb1153
#SBATCH --qos=esiwace
#SBATCH --time=00:10:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
##SBATCH --cpus-per-task
##SBATCH --cpus-per-task=4

srun ./application

8 changes: 5 additions & 3 deletions B3/L8/docs/openmp_lec1.md
@@ -25,12 +25,12 @@ lang: en
- Identify the most common pitfalls and ways to rectify them


# Introduction to openMP {.section}
# Introduction to OpenMP {.section}

# Programming models

- Distributed vs shared memory computation
- Tasks vs Threads
- Processes vs Threads

![](img/threads_vs_tasks.svg){.center width=90%}

@@ -510,7 +510,7 @@ DO i = 1,N
END DO
!$omp end do
!$omp critical
global = global + local
global = global + loc
!$omp end critical
!$omp end parallel
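
For comparison with the critical-section pattern above, a minimal sketch using a `reduction` clause instead (illustrative only; the loop body is made up):

```fortran
! Sketch: accumulate into a shared total with an OpenMP reduction clause
PROGRAM omp_reduction_sketch
  IMPLICIT NONE
  INTEGER, PARAMETER :: n = 1000000
  INTEGER :: i
  REAL :: total

  total = 0.
  !$omp parallel do reduction(+:total)
  DO i = 1, n
     total = total + 1./REAL(n)   ! each thread accumulates a private copy, combined at the end
  END DO
  !$omp end parallel do

  WRITE(*,*) 'total =', total     ! should be close to 1.0
END PROGRAM omp_reduction_sketch
```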
@@ -541,6 +541,8 @@ global = global + local
- `git clone https://github.com/csc-training/esiwace-summerschool-2024` <br>
(or update existing `git pull origin main`)

- `cd B3/L8/exercises/ex_1`

- Part II lecture ~10:30

- Take breaks :)
21 changes: 3 additions & 18 deletions B3/L8/docs/openmp_lec2.md
@@ -236,7 +236,7 @@ MPI_Init_thread(`required`{.input}, `provided`{.output}, `err`{.output})
**`MPI_THREAD_SINGLE`** < **`MPI_THREAD_FUNNELED`** < **`MPI_THREAD_SERIALIZED`** < **`MPI_THREAD_MULTIPLE`**


# MPI thread support level
# MPI thread support levels

| | |
|----------------------|---------------------------------------------|
@@ -246,23 +246,6 @@ MPI_Init_thread(`required`{.input}, `provided`{.output}, `err`{.output})
|MPI_THREAD_MULTIPLE | No restrictions |
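
A minimal sketch of requesting one of these levels with **`MPI_Init_thread`** (not from the slides; choosing `MPI_THREAD_FUNNELED` here is just an example), assuming the Fortran `mpi` module:

```fortran
! Sketch: request a thread support level and check what the library provides
PROGRAM init_thread_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER :: required, provided, err

  required = MPI_THREAD_FUNNELED       ! only the master thread will make MPI calls
  CALL MPI_INIT_THREAD(required, provided, err)

  IF (provided < required) THEN
     WRITE(*,*) 'Requested thread support not available'
     CALL MPI_ABORT(MPI_COMM_WORLD, 1, err)
  END IF

  ! ... hybrid MPI+OpenMP work ...
  CALL MPI_FINALIZE(err)
END PROGRAM init_thread_sketch
```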


# MPI thread support levels

- Modern MPI libraries support all threading levels
- OpenMPI: Build time configuration, check with
```bash
ompi_info | grep 'Thread support'
```
- Intel MPI: When compiling with `-qopenmp` a thread safe version of the
MPI library is automatically used
- Cray MPI: Set **`MPICH_MAX_THREAD_SAFETY`** environment variable to
`single`, `funneled`, `serialized`, or `multiple` to select the
threading level
- Note that using **`MPI_THREAD_MULTIPLE`** requires the MPI library to
internally lock some data structures to avoid race conditions
- may result in additional overhead in MPI calls


# Hybrid programming styles: fine/coarse grained

- Fine-grained
@@ -326,6 +309,8 @@ END PROGRAM hello
- `git clone https://github.com/csc-training/esiwace-summerschool-2024` <br>
(or update existing `git pull origin main`)

- `cd B3/L8/exercises/ex_2`

- Lunch ~12:00

- Take breaks :)
Binary file not shown.
2 changes: 2 additions & 0 deletions B3/L8/exercise/ex_2/05_hybrid-area-circle/README.md
@@ -7,4 +7,6 @@ threads for parallel looping of the grid points within each process. Finally com
Try with different numbers of processes and threads and see how this affects the execution time. You can also try to change the number
of grid points.

Hint: for the OpenMP part, `omp parallel do collapse(2)` can be used to parallelize two nested loops.
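
A generic sketch of the `collapse(2)` hint on a dummy grid (hypothetical loop bounds; not the exercise solution):

```fortran
! Sketch: collapse(2) merges both loop indices into one parallel iteration space
PROGRAM collapse_sketch
  IMPLICIT NONE
  INTEGER, PARAMETER :: nx = 1000, ny = 1000
  INTEGER :: i, j
  REAL :: total

  total = 0.
  !$omp parallel do collapse(2) reduction(+:total)
  DO j = 1, ny
     DO i = 1, nx
        total = total + 1.   ! per-grid-point contribution would go here
     END DO
  END DO
  !$omp end parallel do

  WRITE(*,*) 'grid points visited:', total
END PROGRAM collapse_sketch
```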


2 changes: 1 addition & 1 deletion B3/L8/exercise/ex_2/05_hybrid-area-circle/circle.F90
@@ -42,7 +42,7 @@ program circle
! TODO: define column limits per mpi process

! TODO: Use openmp to parallelize the estimation of the subdomain area per mpi process
! and finally calculate the total area of the circle
! and finally calculate the total area of the circle (sum over processes)
area_sub = 0.
do j = 1,nyg
do i = nxi,nxf
4 changes: 2 additions & 2 deletions B3/L8/exercise/scripts/allocate_job.sh
@@ -1,4 +1,4 @@
#!/bin/sh

salloc --ntasks=4 --account=<project id> --partition=<partition> --qos=esiwace --time=00:10:00
#salloc --ntasks=1 --cpus-per-task=4 --account=<project id> --partition=<partition> --qos=esiwace --time=00:10:00
salloc --ntasks=4 --account=bb1153 --partition=shared --qos=esiwace --time=00:10:00
#salloc --ntasks=1 --cpus-per-task=4 --account=bb1153 --partition=shared --qos=esiwace --time=00:10:00
7 changes: 4 additions & 3 deletions B3/L8/exercise/scripts/batch_job.sh
@@ -1,12 +1,13 @@
#!/bin/sh

#SBATCH --job-name=hello
#SBATCH --partition=<partition>
#SBATCH --account=<project>
#SBATCH --partition=shared
#SBATCH --account=bb1153
#SBATCH --qos=esiwace
#SBATCH --time=00:10:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
##SBATCH --cpus-per-task
##SBATCH --cpus-per-task=4

srun ./application
