\documentclass[10pt,journal,compsoc]{IEEEtran}
\usepackage{cite}
\usepackage{lineno,hyperref}
\usepackage{booktabs}
\modulolinenumbers[5]
\usepackage{color}
\usepackage{xcolor}
\usepackage{comment}
\usepackage{verbatim}
\usepackage{float}
\usepackage{tikz}
\usetikzlibrary[shapes,shapes.arrows,arrows,shapes.misc,fit,positioning]
\usepackage[export]{adjustbox}
\usepackage{caption}
\usepackage{hyphenat}
\usepackage{subcaption}
\usepackage{listings}
\lstset
{
language=C++,
basicstyle= \ttfamily\scriptsize,
commentstyle=\color{green},
keywordstyle=\color{orange},
numberstyle=\tiny\color{gray},
stringstyle=\color{purple},
morekeywords={parallel_for,RangePolicy},
morekeywords={[2]{Kokkos}},
keywordstyle={[2]{\color{blue}}},
morekeywords={[3]{VecAXPY,VecNorm,SNESCreate,DMDACreate1d,DMCreateGlobalVector,VecDuplicate,DMCreateMatrix,SNESSetFunction,SNESSetJacobian,SNESSolve,DMDAGetCorners,DMGetLocalVector,DMGlobalToLocal,DMGlobalToLocalBegin,DMGlobalToLocalEnd,DMDAVecGetKokkosOffsetView,DMDAVecGetArrayRead,DMDAVecGetArrayWrite,MatGetKokkosOffsetView,MatSetValues,MatSetValuesKokkos,DMDAVecGetKokkosOffsetView,PetscSFBcast,PetscSFBcastBegin,VecDotLocal,MPI_Allreduce,MatGetKokkos}},
keywordstyle={[3]{\color{magenta}}},
morekeywords={[4]{MPI_Datatype,PetscSF}},
keywordstyle={[4]{\color{orange}}},
numbers=left,
stepnumber=5,
showstringspaces=false,
tabsize=1,
breaklines=true,
breakatwhitespace=false,
firstnumber=1,
}
\newif\ifargonnereport
\argonnereportfalse
\newif\iffinal
\finaltrue
\iffinal
\newcommand\todd[1]{}
\newcommand\todo[1]{}
\else
\definecolor{puce}{rgb}{0.8,0.53,0.6}
\newcommand\todd[1]{\textcolor{puce}{Todd: #1}}
\newcommand\todo[1]{\textcolor{red}{Todo: #1}}
\fi
\newenvironment{titemize} % Same as itemize, but with minimal vspace
% {\begin{itemize}}
% {\end{itemize}}
{\begin{list}{\labelitemi}{
\setlength{\topsep}{0pt}
\setlength{\parskip}{0pt}
\setlength{\itemsep}{0pt}
\setlength{\parsep}{0pt}
\setlength{\leftmargin}{23pt}
\setlength{\labelwidth}{23pt}
}}
{\end{list}}
\begin{document}
\ifargonnereport
\input{cover}
\fi
\title{A place-holder title: PETScSF: Abstracting Communication for Scalable Communication}
% names below are just copied from previous papers, they are not neccessarily involved in this paper.
%% Authors: Add yourself and indicate your affiliation(s) via footnotes.
\author{
\IEEEauthorblockN{
Mark F. Adams\IEEEauthorrefmark{1}, % LBNL
Satish Balay\IEEEauthorrefmark{2}, % ANL
Jed Brown\IEEEauthorrefmark{3}, % CU
Alp Dener\IEEEauthorrefmark{2}, % ANL
Matthew Knepley\IEEEauthorrefmark{4}, % Buffalo
Scott E. Kruger\IEEEauthorrefmark{5}, % Tech-X
Richard Tran Mills\IEEEauthorrefmark{2}, % ANL
Todd Munson\IEEEauthorrefmark{2}, % ANL
Karl Rupp\IEEEauthorrefmark{6}, % TU Wien
Barry F. Smith\IEEEauthorrefmark{7}, % ANL associate
Stefano Zampini\IEEEauthorrefmark{8}, % Kaust
Hong Zhang\IEEEauthorrefmark{2} and % ANL
Junchao Zhang\IEEEauthorrefmark{2} % ANL
}
\IEEEauthorblockA{
\IEEEauthorrefmark{1}Lawrence Berkeley National Laboratory, Berkeley, CA 94720 USA
}
\IEEEauthorblockA{
\IEEEauthorrefmark{2}Argonne National Laboratory, 9700 S. Cass Avenue, Lemont, IL 60439 USA
}
\IEEEauthorblockA{
\IEEEauthorrefmark{3}University of Colorado Boulder, Boulder, CO 80302 USA
}
\IEEEauthorblockA{
\IEEEauthorrefmark{4}University at Buffalo, Buffalo, NY 14260 USA
}
\IEEEauthorblockA{
\IEEEauthorrefmark{5}Tech-X Corporation, 5260 Arapahoe Ave., Suite A, Boulder, CO 80303 USA
}
\IEEEauthorblockA{
\IEEEauthorrefmark{6}Institute for Microelectronics, TU Wien, Gusshausstrasse 27-29/E360, A-1040 Wien, Austria
}
\IEEEauthorblockA{
\IEEEauthorrefmark{7}Argonne Associate of Global Empire, LLC, Argonne National Laboratory, 9700 S. Cass Avenue, Lemont, IL 60439 USA
}
\IEEEauthorblockA{
\IEEEauthorrefmark{8}King Abdullah University of Science and Technology, Extreme Computing Research Center, Thuwal, Saudi Arabia
}
}
\maketitle
\begin{abstract}
% Target: IEEE Transactions on Parallel and Distributed Systems, due February 17, 2021, 12 pages, double column, special exascale issue (not just ECP).
The Portable Extensible Toolkit for Scientific computation (PETSc)
library delivers scalable solvers for
nonlinear time-dependent differential and algebraic equations and for numerical optimization.
The PETSc design for performance portability addresses fundamental GPU accelerator challenges and stresses
flexibility and extensibility by separating the
programming model
used by the application from that used by the library,
and it enables application
developers to use their preferred programming model, such as
Kokkos, RAJA, SYCL, HIP, CUDA, or OpenCL,
on upcoming exascale systems.
%GAIL - all of seems dangerous
A blueprint for using GPUs from PETSc-based codes is provided, and
case studies emphasize the
flexibility and high performance achieved
on current GPU-based systems.
\end{abstract}
\begin{IEEEkeywords}
Numerical software, high-performance computing, GPU acceleration, many-core, performance portability, exascale.
\end{IEEEkeywords}
\linenumbers
%########################################################
\section{Introduction}
High-performance computing (HPC) node architectures are increasingly reliant on
high degrees of fine-grained parallelism,
% as supercomputers approach exascale performance
driven primarily by power management considerations.
Some new designs place this parallelism in CPUs having many
cores and hardware threads and wide SIMD registers:
a prominent example is the Fugaku machine installed at RIKEN in Japan.
%% , which occupies the top spot in the June 2020 TOP500 global ranking of supercomputers.
Most supercomputing centers, however, are relying on GPU-based systems to provide the
needed parallelism, and the next generation of open-science supercomputers to be fielded in
the United States will rely on GPUs from AMD, NVIDIA, or Intel.
%% a new discrete GPU from Intel.
These devices are extremely powerful but have posed fundamental challenges for developers and
users of scientific computing libraries.
This paper shows how to use GPUs from applications written using the
Portable, Extensible Toolkit for Scientific computation (PETSc,
\cite{petsc-user-ref}), describes
the major challenges software libraries face, and shows how PETSc overcomes them.
The PETSc design completely {\bf separates the programming
model used by the application} and {\bf the model used by PETSc}
for its back-end computational kernels; see
Figure~\ref{fig:petsc_accel_support}.
% where ``programming model'' refers to both the model and its supporting runtime.
This separation will allow PETSc users from C/C++, Fortran, or Python to employ their preferred
GPU programming model, such as Kokkos, RAJA, SYCL, HIP, CUDA,
or OpenCL \cite{KOKKOS,RAJA,SYCL,CUDA,HIP,OPENCL}, on
upcoming exascale systems.
% LOIS WOULD SAY THIS IS IMPORTANT TO KEEP IN READERS DONT
% KNOW WHAT IS IN PETSC In all cases, they will be able to use PETSc's powerful,
%scalable, algebraic solvers, advanced time-stepping and adjoint
%capabilities, and numerical optimization methods running on the GPUs.
%
In all cases, users will be able to rely on PETSc's large assortment of composable, hierarchical, and nested
solvers \cite{bkmms2012}, as well as advanced time-stepping and adjoint
capabilities and numerical optimization methods running on the GPU.
%implementations of parallel algorithms
%and data structures,
%and routines for manipulating matrices, vectors, and meshes.
\begingroup
\captionsetup[figure]{skip=0pt,belowskip=0pt}
\begin{figure}[H]
\begin{center}
\includegraphics[width=1.0\linewidth]{PETSc-Diagrams_v3.pdf}
\caption{PETSc application developers will be able to use a variety of programming models for GPUs independently of PETSc's internal programming model.}
\label{fig:petsc_accel_support}
\end{center}
\end{figure}
\endgroup
\vskip-10pt
An application for solving time-dependent partial differential
equations, for example, may compute the Jacobian using Kokkos
and then call PETSc's time-stepping
routines and algebraic solvers that use CUDA, cuBLAS,
and cuSPARSE; see Figure~\ref{fig:petsc_backends}.
%, see Figure~\ref{fig:petsc_backends-kokkos}.
Applications will be able to mix and match programming models,
allowing, for example, some application code in Kokkos and some in CUDA. The
flexible PETSc back-end support is accomplished by {\bf sharing data} between the application and PETSc programming models but not
sharing the programming models' internal data structures.
Because the data is shared, there are no copies between the programming models and no loss of efficiency.
\begin{comment}
\begingroup
\captionsetup[figure]{skip=0pt,belowskip=0pt}
\vspace{-5pt}
\begin{figure}[H]
\begin{center}
\includegraphics[trim = 3.2in 2.0in 3.1in .4in, clip, width=.80\linewidth]{PETSc-backends-kokkos-cuda.png}
\caption{Application using PETSc/Kokkos for CUDA system}
\label{fig:petsc_backends-kokkos}
\end{center}
\end{figure}
\endgroup
\vspace{-20pt}
\end{comment}
%As depicted in Figure~\ref{fig:petsc_accel_support},
GPU vendors and the HPC
community have developed various programming models
for using GPUs. The oldest % widespread
programming model is CUDA, developed by NVIDIA for their GPUs. AMD adopted
an essentially identical model for their hardware, calling their implementation
HIP. Several generations of the OpenCL programming
model, supported in some form by NVIDIA, AMD, and Intel,
have been designed for portability across multiple GPU vendors' hardware.
Moreover, there are the
\emph{C++ data\hyp{}parallel programming models} that make use of C++
lambdas and provide constructs such as {\tt parallel\_for} for programmers to
easily express data parallelism that can be mapped by the compilers into GPU-specific implementations. To some degree, they are C++
variants of the CUDA model. The oldest is Kokkos \cite{KOKKOS}, which has support for
CUDA, HIP, DPC++, and shared-memory OpenMP, but not for OpenCL. RAJA \cite{RAJA} was introduced
more recently. SYCL \cite{SYCL} is a newer model that typically uses OpenCL as
a back-end. A notable SYCL implementation is DPC++ \cite{DPC++}, developed
by Intel for its new discrete GPUs.
Figure~\ref{fig:petsc_backends} shows, in the upper right, the case where the application
uses OpenMP offload and runs on a HIP system, as well as
the entire zoo of potential programming models and back-end numerical
libraries that PETSc will employ.
\begin{comment}
\begin{figure}[H]
\begin{center}
\includegraphics[trim = 2.6in 2.5in 2.4in .5in, clip,width=.90\linewidth]{PETSc-backends-openmp-hip.png}
\caption{Application using PETSc/OpenMP offload (planned) for HIP system}
\label{fig:petsc_backends-openmp}
\end{center}
\end{figure}
\end{comment}
\begin{comment}
\begin{figure}[htbp]
\begin{center}
\includegraphics[trim = .8in .2in 0in .3in, clip,width=.99\linewidth]{PETSc-backends.png}
\caption{All combinations of front and back-ends}
\label{fig:petsc_backends}
\end{center}
\end{figure}
\end{comment}
\begin{figure}[htbp]
\begin{center}
\includegraphics[clip,width=.99\linewidth]{PETSc-backends-all.pdf}
\caption{PETSc usage with Kokkos-cuLIB-CUDA, OpenMP-ROCm-HIP, and all combinations. Here cuLIB indicates cuBLAS and cuSPARSE.}
\label{fig:petsc_backends}
\end{center}
\end{figure}
This paper is organized as follows.
We first %summarize the features of PETSc's front-end that enable portable
%performance and
provide a blueprint for porting PETSc
applications to use GPUs. We next survey the challenges in developing efficient and portable mathematical
libraries for GPU systems.
We then introduce the PETSc GPU programming
model and discuss recent PETSc back-end developments designed to meet
these challenges.
We present a series of case studies using PETSc with GPUs to demonstrate both the
flexible design and the high performance achieved
on current GPU-based systems. In the final section we summarize our conclusions.
Throughout this paper, we use the term ``programming model'' to refer to
both the model and its supporting runtime.
% \vspace{-10pt}
%########################################################
%\section{Progress on PETSc's user facing front-end for GPUs}
%\label{sec:petsc_front_end}
\section{Porting PETSc applications to the GPUs}
\label{sec:petsc_port}
%One of the primary resources for PETSc users to understand how PETSc is used is the substantial set of
%examples in diverse situations.
% Richard's rewording of above sentence follows:
PETSc has a substantial set of examples to illustrate how the library can be applied in many
different situations.
Currently, PETSc has over 10,000 tests that run in 49 different
configurations; we are in the process of developing tutorials and specific examples
of using PETSc for each of the programming models shown in Figure~\ref{fig:petsc_accel_support}.
Users can request support through \url{https://gitlab.com/petsc/petsc/-/issues} on the web or via email to \url{petsc-maint@mcs.anl.gov} and \url{petsc-users@mcs.anl.gov}.
%\vspace{-10pt}
%--------------------------------
The common usage pattern for PETSc application codes, regardless of whether they
use time integrators, nonlinear solvers, or linear solvers, has always
been the following:
\begin{titemize}
\item compute application-specific data structures,
\item provide a {\tt Function} computation callback,
\item provide a {\tt Jacobian} computation callback, and
\item call the PETSc solver, possibly in a loop.
\end{titemize}
%A simple variant involves multiple functions, matrices, and solvers.
This approach does not change with the use of GPUs. In particular, the creation of
solver, matrix, and vector objects and their manipulation do not
change. The following are points to consider when porting an application to GPUs:
\begin{titemize}
\item Some data structures reside in GPU memory, either
\begin{titemize}
\item constructed on the CPU and copied to the GPU or
\item constructed directly on the GPU.
\end{titemize}
\item {\tt Function} will call GPU kernels.
\item {\tt Jacobian} will call GPU kernels.
\end{titemize}
The recommended approach to port PETSc CPU applications to the GPU is to {\bf incrementally} move the computation to the GPU.
When developing code, the developer should add the new code to the existing code and use small stub routines (see Listing \ref{lst:stub})
that call the new and the old functions and compare the results as the solver runs, in order to detect any
discrepancies.
We further recommend postponing additional refactoring until after the code has been fully ported to the GPU,
with {\bf testing, verification,} and {\bf performance profiling} performed at each
step of the transition.
\begin{titemize}
\item Step 1: write code to copy the needed portions of already computed application data structures to the GPU.
\item Step 2: write code for {\tt Function} that runs partially or entirely on the GPU.
\item Step 3: write code for {\tt Jacobian} that runs partially or entirely on the GPU.
\item Step 4: evaluate the time that remains from building the initial application data structures on the CPU.
\begin{titemize}
\item If the time is inconsequential relative to the entire simulation run, the port is complete;
\item otherwise port the computation of the data structure to the GPU.
\end{titemize}
\end{titemize}
For convenience, in the rest of this section we focus on an application
using Kokkos; a similar process would be followed for SYCL, RAJA, OpenMP offload,
CUDA, and HIP. We recommend using the sequential
non-GPU build of Kokkos for all development; this allows debugging with
traditional debuggers.
Listing \ref{lst:main} displays an excerpt of a typical PETSc main application program
for solving a nonlinear set of equations on a structured grid using Newton's method.
It creates a solver object {\tt SNES}, a data management object {\tt DM}, a vector of degrees of freedom {\tt Vec}, and a {\tt Mat} to hold the Jacobian.
Then, Function and Jacobian evaluation callbacks are passed to the {\tt SNES} object to solve the nonlinear equations.
%\begingroup
%\captionsetup[figure]{skip=0pt,belowskip=0pt}
%\begin{figure}[h]
\begin{lstlisting}[caption={Main application code for CPU or GPU},label={lst:main},frame=single,captionpos=b]
SNESCreate(PETSC_COMM_WORLD,&snes);
DMDACreate1d(PETSC_COMM_WORLD,...,&ctx.da);
DMCreateGlobalVector(ctx.da,&x);
VecDuplicate(x,&r);
DMCreateMatrix(ctx.da,&J);
if (useKokkos) {
SNESSetFunction(snes,r,KokkosFunction,&ctx);
SNESSetJacobian(snes,J,J,KokkosJacobian,&ctx);
} else {
SNESSetFunction(snes,r,Function,&ctx);
SNESSetJacobian(snes,J,J,Jacobian,&ctx);
}
SNESSolve(snes,NULL,x);
\end{lstlisting}
%\caption{Main application code for CPU or GPU}
%\label{lst:main}
%\end{figure}
%\endgroup
%\begingroup
%\captionsetup[figure]{skip=0pt,belowskip=0pt}
%\begin{figure}[h]
%\begin{tabular}{l}
\begin{lstlisting}[caption={Traditional PETSc Function (top) and Kokkos version (bottom). {\tt xl}, {\tt x}, {\tt r}, {\tt f} are PETSc vectors. {\tt X}, {\tt R}, {\tt F} at the top are {\tt double*} or {\tt const double*} like pointers but at the bottom are Kokkos unmanaged {\tt OffsetView}s.},label={lst:function},frame=single,captionpos=b]
DMGetLocalVector(da,&xl);
DMGlobalToLocal(da,x,INSERT_VALUES,xl);
DMDAVecGetArrayRead(da,xl,&X); // only read X[]
DMDAVecGetArrayWrite(da,r,&R); // only write R[]
DMDAVecGetArrayRead(da,f,&F); // only read F[]
DMDAGetCorners(da,&xs,NULL,NULL,&xm,...);
for (i=xs; i<xs+xm; ++i)
R[i] = d*(X[i-1]-2*X[i]+X[i+1])+X[i]*X[i]-F[i];
--------------------------------------------------------
DMGetLocalVector(da,&xl);
DMGlobalToLocal(da,x,INSERT_VALUES,xl);
DMDAVecGetKokkosOffsetView(da,xl,&X); // no copy
DMDAVecGetKokkosOffsetView(da,r,&R,overwrite);
DMDAVecGetKokkosOffsetView(da,f,&F);
xs = R.begin(0); xm = R.end(0);
Kokkos::parallel_for(
Kokkos::RangePolicy<>(xs,xm),KOKKOS_LAMBDA
(int i) {
R(i) = d*(X(i-1)-2*X(i)+X(i+1))+X(i)*X(i)-F(i);});
\end{lstlisting}
%\end{tabular}
%\caption{}
%\label{lst:function}
%\end{figure}
%\endgroup
% \vspace{-10pt} REALLY UGLY AND UNREADABLE
Listing \ref{lst:function} shows a traditional PETSc implementation (top) and a Kokkos implementation (bottom) of {\tt Function}.
{\tt DMDAVecGetArrayRead} returns an array with the correct dimensions for the portion of the vector that lies on each MPI rank; the {\tt Read}/{\tt Write} suffixes declare that the caller will only read or only write the returned data.
The accessor routines are similar in the Kokkos version except that they do not
require the {\tt Read/Write} labels (the same information is conveyed by the appropriate
{\tt const} qualifiers in the overloaded functions, not shown).
When returning OffsetViews, we wrap but do not copy PETSc vectors' data.
Moreover, in the Kokkos version
we use the {\tt parallel\_for}
construct and determine the loop bounds from the OffsetView.
For simplicity of presentation we have
assumed periodic boundary conditions in one dimension; the same code pattern exists
in two and three dimensions with general boundary conditions.
When porting code with nontrivial boundary conditions, one should first port the main loop, test and verify that the
code is still running correctly, and then incrementally port each boundary condition separately, testing for each.
The Jacobian computation is presented in Listing \ref{lst:jacobian}.
The
callbacks are also similar except for the matrix access request in the
Kokkos version.
In Listing \ref{lst:stub}, we conclude with the crucial stub function. This function
has the same calling sequence as the two functions
that are to be compared and can be passed directly to the solver routine
to verify that both
functions produce the same results while solving the equations.
%\begingroup
%\captionsetup[figure]{skip=0pt,belowskip=0pt}
%\begin{figure}[H]
%\begin{tabular}{c}
\begin{lstlisting}[caption={Traditional PETSc Jacobian (top) and Kokkos version (bottom)},label={lst:jacobian},frame=single,captionpos=b]
DMDAVecGetArrayRead(da,x,&X);
DMDAGetCorners(da,&xs,NULL,NULL,&xm,...);
for (i=xs; i<xs+xm; i++) {
PetscInt j[] = {i-1,i,i+1}; PetscScalar A[] = {d,-2*d+2*X[i],d};
MatSetValues(J,1,&i,3,j,A,INSERT_VALUES);
}
--------------------------------------------------------
DMDAVecGetKokkosOffsetView(da,x,&X);
MatGetKokkos(J,&mat); // handle for device view
xs = X.begin(0); xm = X.end(0);
Kokkos::parallel_for(
Kokkos::RangePolicy<>(xs,xm),KOKKOS_LAMBDA
(int i){
PetscInt j[] = {i-1,i,i+1}; PetscScalar A[] = {d,-2*d+2*X(i),d};
MatSetValuesKokkos(mat,1,&i,3,j,A,INSERT_VALUES);});
\end{lstlisting}
%\end{tabular}
%\caption{Traditional PETSc Jacobian and Kokkos version}
%\label{lst:jacobian}
%\end{figure}
%\endgroup
%\begingroup
%\captionsetup[figure]{skip=0pt,belowskip=0pt}
%\begin{figure}[ht]
\begin{lstlisting}[caption={Stub routine that runs both implementations},label={lst:stub},frame=single,captionpos=b]
StubFunction(SNES snes,Vec x,Vec r,void *ctx){
Vec rk; PetscReal norm,tol = 100*PETSC_MACHINE_EPSILON; // illustrative tolerance
VecDuplicate(r,&rk); // scratch vector for the new implementation
Function(snes,x,r,ctx);
KokkosFunction(snes,x,rk,ctx);
VecAXPY(rk,-1.0,r); // rk = rk - r
VecNorm(rk,NORM_2,&norm);
if (norm > tol) {/* report the discrepancy */}
VecDestroy(&rk);
}
\end{lstlisting}
%\caption{Stub routine that runs both implementations}
%\label{fig:stub}
%\end{figure}
%\endgroup
% \vspace{-10pt}
%########################################################
\section{Challenges in GPU programming for libraries}
\label{sec:challenges}
Three {\em fundamental} challenges arise in providing libraries that obtain high {\bf throughput} performance on parallel GPU-accelerated
systems because of the hardware and low-level software aspects
of the GPUs; no programming model obviates these challenges or allows programmers to ignore
them; see Figure~\ref{fig:challenges}. These fundamental challenges are labeled with an {\tt F}. Ancillary challenges arise in the process of
meeting the fundamental challenges; these are labeled with an {\tt A}.
%---------------------------------------
\paragraph{Challenge F1: Portability for application codes}
The first challenge for GPUs' general use in scientific computing is
the portability of application code across different hardware and software stacks. Different vendors support different programming models and provide
different mathematical libraries (with different APIs) and even different
synchronization models. AMD is
revising its HIP and ROCm library support; how closely
it will mimic the CUDA (cuBLAS, cuSPARSE) standards is unclear. On the other hand, NVIDIA
continues to develop the cuSPARSE API, deprecating and refactoring many common usage
patterns for sparse matrix computations.
Kokkos is designed to guarantee application codes' portability, but it does
not provide distributed-memory MPI support and requires the user to work in
its particular C++ programming style. Likewise, OpenCL has portable performance as
an overarching goal, but implementation quality has been uneven, and performance has not met the standard needed for many application codes. OpenMP offload poses unique difficulties for libraries. Since it is compiler based and the data structures and routines are opaque, it may require specific code for each OpenMP implementation.
%---------------------------------------
\paragraph{Challenge F2: Algorithms for high-throughput systems}
In addition to application porting, high-throughput systems such as GPUs require developing and implementing new solver algorithms---in particular, algorithms that exploit high levels of
data-parallel concurrency with low levels of memory bandwidth delivered in a data-parallel fashion. This approach
generally involves algorithms with higher arithmetic intensity than most traditional simulations require.
This challenge is beyond the scope of this paper; a robust research program developing advanced algorithms for GPU-based systems is being undertaken by the PETSc team and their collaborators.
%---------------------------------------
\paragraph{Challenge F3: Utilizing all GPU and CPU compute power}
The current focus of the PETSc library work for GPUs is high computational throughput for large simulations,
keeping all the GPU compute units as busy as possible all of the time.
In order to achieve this, each core must have an uninterrupted stream of instructions and a high-bandwidth stream of data within the constraints of the hardware
and low-level software stack.
The difficulty arises from the complex control flows and data flows, as indicated in Figure \ref{fig:challenges}. With distributed-memory GPU computing, two levels
of instruction flow exist:
\begin{titemize}
\item high-level instructions flow from the CPU memory to the GPU stream queue and the GPU memory controller, and
\item kernel code flows from the GPU memory to the GPU compute cores.
\end{titemize}
The two levels of data flow are
\begin{titemize}
\item between GPU memories through a combination of networks, switches, and RDMA transfers and
\item from the GPU memory to the GPU compute cores.
\end{titemize}
In this work, we are concerned only with the high-level instruction and data flow and assume that the low-level is suitably handled within the computational kernels and the hardware.
With a high-throughput computation on the GPU and the CPU cores, the same basic rule applies. Seamless high-level
data control must exist between the CPU memory and the GPU memory, and the high-level instruction flow for the CPUs and GPUs must be coordinated and synchronized to ensure that neither is
interrupted.\footnote{The OLCF Summit IBM/NVIDIA system includes an additional layer, whereby data can be transferred directly from the CPU caches to the GPU memory, but this will be
ignored.} Since the exascale nodes have more CPU cores than GPUs, a mismatch exists between the two for both high-level instruction management and data flows; this
is called the \emph{oversubscription} problem.
For both the pure GPU throughput problem and the combined CPU and GPU case, one must understand in a little more detail how the CPU controls the GPU operation. %Each GPU has one or more stream queues (for this discussion, it is acceptable to limit it to one queue).
The CPU sends two types of high-level instructions to the stream queue on the GPU:
\begin{titemize}
\item Kernel launch instructions that control the next set of low-level instructions the GPU will run on its compute cores
\item Data transfer instructions that control the movement of data
\end{titemize}
High-level instructions are issued in the order they arrive in the queue.
Should the queue become empty, the GPU computations and memory transfers will cease. One of the most challenging problems currently faced by large-scale distributed-memory GPU-based computing is preventing this queue from
becoming empty. It is crucial that more entries be added to the queue without requiring the queue to be emptied first. The process of adding more entries while the queue
is not empty is called \emph{pipelining}.
\begin{figure}[htbp]
\begin{center}
\includegraphics[trim = 1.5in 2.5in .3in .6in, clip,width=.99\linewidth]{new-challenges.pdf}
\caption{Some of the challenges for which we have developed responses in the PETSc performance-portable library, related to their locations on a GPU node.}
\label{fig:challenges}
\end{center}
\end{figure}
%---------------------------------------
\paragraph{Challenge A1: Managing the kernel queue}
Two other techniques besides pipelining can keep the GPU compute cores busy.
Kernel fusion combines multiple kernels into a single kernel to increase the time spent in kernel computations while reducing the number of kernel launches, with the additional benefit of possibly increasing the arithmetic intensity of the computations.
However, this optimization often results in code that is harder to read and more difficult to maintain.
For thirty years, the general model for library development has been to write short single-concern functions that can be easily combined to promote code reuse. PETSc uses this style, in which
each concern is implemented as its own class method.
CUDA graphs allow the user to describe a series of operations (including CUDA kernels) and their dependencies using a graph.
These graphs can be instantiated once and then executed many times without involving the CPU to amortize the high instantiation cost.
This approach is similar to pipelining except that it offers more runtime flexibility in determining the next operation within the GPU based on the state of the computation, whereas with pipelining the
order of operations is set in advance by the CPU.
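To make the mechanics concrete, the sketch below illustrates stream-capture-based construction and replay of a CUDA graph; the kernel {\tt axpy}, the arrays {\tt x} and {\tt y}, and the sizes {\tt n} and {\tt nits} are illustrative only and are not part of PETSc.
\begin{lstlisting}[caption={A sketch of CUDA graph capture and replay; the kernel, arrays, and sizes are illustrative},label={lst:cudagraph},frame=single,captionpos=b]
__global__ void axpy(int n,double a,const double *x,double *y){
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n) y[i] += a*x[i];
}
// x,y: device arrays; n,nits: illustrative sizes
cudaStream_t s; cudaGraph_t g; cudaGraphExec_t ge;
cudaStreamCreate(&s);
cudaStreamBeginCapture(s,cudaStreamCaptureModeGlobal);
axpy<<<(n+255)/256,256,0,s>>>(n,2.0,x,y); // recorded, not yet run
axpy<<<(n+255)/256,256,0,s>>>(n,3.0,y,x);
cudaStreamEndCapture(s,&g);
cudaGraphInstantiate(&ge,g,NULL,NULL,0);
for (int it=0; it<nits; ++it)
cudaGraphLaunch(ge,s); // replay without per-kernel CPU involvement
cudaStreamSynchronize(s);
\end{lstlisting}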
%---------------------------------------
\paragraph{Challenge A2: Network communication}
GPU-aware MPI was introduced to ease the development of CPU-GPU code by allowing MPI calls to
accept GPU memory address pointers and removing explicit GPU-to-CPU copies before communications. In turn, MPI implementations could choose the most efficient mechanisms for communication.
For example, a CUDA-aware MPI can use NVIDIA GPUDirect point-to-point to copy
data between two GPUs within a compute node or use NVIDIA GPUDirect RDMA to
access the remote GPU memory across compute nodes without requiring CPU
involvement.
Unfortunately, at the time of writing, even GPU-aware MPI implementations cannot pipeline MPI calls and kernel launches, because the MPI API does not utilize the stream queue. Figure \ref{fig:challenges} shows that the GPU-aware MPI instructions pass directly to the memory controls (the black dashed lines)
and do not enter the stream queue. Thus, the stream queue must be emptied before every MPI call. Such {\bf expensive GPU synchronization for every MPI call
involving GPU memory} embodies the mismatch between MPI and GPU. This problem is analyzed in more detail in \cite{SNIR}.
In contrast, NVIDIA provides both NCCL \cite{NCCL}, a library of multi-GPU collective communication primitives, and
NVSHMEM \cite{NVSHMEM}, an implementation of the OpenSHMEM \cite{OpenSHMEM} communications model for clusters
of NVIDIA GPUs. Unlike MPI, these libraries can initiate communication
from the CPU on streams to synchronize between computation and
communication taking place on the GPU. NVSHMEM also supports
GPU-initiated communication.
%---------------------------------------
\paragraph{Challenge A3: Over- and undersubscription}
Exascale systems will have many more CPU cores than GPUs,
and applications may use both the CPU and the GPU for their computations.
Libraries and applications must manage the mismatch between the
number of cores and GPUs. Two main alternatives can help
manage this challenge.
\emph{Oversubscription} occurs when multiple CPU
processes use the same GPU simultaneously.
On high-end NVIDIA systems, the
Multi-Process Service (MPS), a runtime system enabling multiprocess
CUDA applications to transparently use the Hyper-Q hardware-managed work queues on the GPU,
allows funneling the work from multiple CUDA contexts into a single context.
In some cases, this can improve resource utilization, but it can also reduce GPU efficiency.
On Summit, for instance,
we have observed reductions between 5\% and 20\% when sharing GPUs between ranks.
Many types of overhead can contribute to this reduced efficiency,
but we group them all as {\it GPU oversubscription overhead}.
For example, in a standard strong scaling regime, increasing the number of processes sharing the same GPU
while holding the computational load per physical GPU constant may result in more kernel launches associated with smaller
chunks of data, ultimately degrading performance.
% Commenting Mark's example out, but keeping this here because it is a useful example to
% know about:
%For example, using our current (v3.14.1) cuSPARSE back-end on a 2D Poisson problem
%(src/snes/tests/ex13.c) discretized with Q3 finite elements with almost 900K
%equations on a Summit node using all six GPUs, 150 iterations of PETSc's
%algebraic multigrid solver takes about 1.5 seconds using 4 MPI processes per GPU, and about 1.15 seconds using 1 MPI process per GPU.
It is unclear what this overhead will be on future systems.
A variant of this approach maintains two communicators, one for all the MPI processes and one with a single process for each physical GPU.
While the GPUs can be used
at all times, control of the GPUs may be dropped down to the smaller communicator during
intense phases of GPU utilization. The individual ranks' data in the GPU memory can be shared with the GPU's single-rank
process via interprocess communication (e.g., {\tt cudaIpc}) calls.
Alternatively, all the extra CPU cores associated with the GPU controlling rank can share a small communicator
over which the needed communication is performed,
either by using MPI shared-memory windows or via scatter/gather operations.
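As an illustration of constructing such communicators, the sketch below uses only standard MPI calls; the number of GPUs per node and the variable names are illustrative.
\begin{lstlisting}[caption={A sketch of forming a sub-communicator with one controlling rank per GPU; the GPU count per node is illustrative},label={lst:gpucomm},frame=single,captionpos=b]
MPI_Comm nodecomm,gpucomm;
int noderank,ngpus = 6; // e.g., six GPUs per node
MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&nodecomm);
MPI_Comm_rank(nodecomm,&noderank);
// ranks with noderank < ngpus each control one GPU; the rest get MPI_COMM_NULL
MPI_Comm_split(MPI_COMM_WORLD,noderank < ngpus ? 0 : MPI_UNDEFINED,0,&gpucomm);
\end{lstlisting}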
A different approach is to always use one MPI rank per GPU and use, for example, OpenMP
shared-memory programming to employ additional cores controlled by the single
MPI rank that also controls the GPU.
When
individual library or application components use pure MPI and others use
hybrid MPI-OpenMP, a decrease in active ranks is also necessary.
%The most efficient simultaneous use of multiple GPUs performing operations with communication can be started on NVIDIA systems through NVSHMEM's parallel kernel launch capability.
\emph{Undersubscription} occurs when a large computation has short phases that are best computed on a smaller set of the computational cores, in other words, fewer GPUs and CPUs. A complete library system must manage the reduction in computational cores used
and the efficient movement of data required to utilize the reduced resources, as well as the transition back to the full system.
%---------------------------------------
\paragraph{Challenge A4: CPU-GPU communication time}
\label{subsec:ChallengeF2}
CPU-GPU communications can limit the application's overall performance. The best approach to deal with the communication costs is
to reduce the amount of communication needed by moving computation to the GPU. Communication can also overlap with computation on the GPU by performing the computations in different streams.
On NVIDIA and AMD GPUs, the communication time can be reduced by
using {\em pinned memory}. Pinned memory is CPU memory whose pages are
locked to allow the RDMA system to employ the full bandwidth of the CPU-GPU interconnect; we observed bandwidth gains up to a factor of 4.
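The sketch below illustrates the use of pinned memory with the CUDA runtime; the buffer names and the length {\tt n} are illustrative.
\begin{lstlisting}[caption={A sketch of allocating pinned host memory for CPU-GPU transfers; names and sizes are illustrative},label={lst:pinned},frame=single,captionpos=b]
double *h_buf,*d_buf;
size_t bytes = n*sizeof(double); // n: illustrative length
cudaStream_t s; cudaStreamCreate(&s);
cudaMallocHost((void**)&h_buf,bytes); // pinned (page-locked) host memory
cudaMalloc((void**)&d_buf,bytes);
// asynchronous copy; can overlap with kernels running on other streams
cudaMemcpyAsync(d_buf,h_buf,bytes,cudaMemcpyHostToDevice,s);
cudaStreamSynchronize(s);
cudaFreeHost(h_buf); cudaFree(d_buf);
\end{lstlisting}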
%---------------------------------------
\paragraph{Challenge A5: Multiple memory types}
\label{subsec:ChallengeD1}
When running on mixed CPU and NVIDIA GPU systems, there are at least four types of memory: regular memory on the CPU,
%obtained with \texttt{malloc},
pinned memory,
%or \texttt{hipHostMalloc},
unified shared memory, and GPU memory. % or {\tt hipMalloc}.
The first two are usable only on the CPU, the
third on both the CPU and GPU, and the fourth only on the GPU;
however, pointers for all of them are allocated, managed, and freed by the CPU
process. With AMD's HIP API and Intel's discrete GPUs, the complexity increases further.
When multiple types of memory are used, especially by different libraries, there must be a system to ensure that the appropriate
version of a function is called for the given memory type. CUDA provides routines to determine
whether the memory is GPU or CPU, but these calls
are expensive (about 0.3 $\mu$s on OLCF Summit) and should be avoided when possible.
OpenMPI and MPICH implementations that are {\em GPU-aware} call these routines repeatedly and thus pay this latency cost for
each MPI call involving memory pointers.
%There are no routines to distinguish between regular memory, pinned memory, and unified shared memory.
With unified memory, the operating
system automatically manages transferring pages of unified memory between
the CPU and GPU. However, this approach means that
callers of libraries using unified memory must ensure they use the correct memory type in application code; otherwise, expensive data migration will be performed in the background.
%;
%which may not even be well %documented for that library.
%---------------------------------------
\paragraph{Challenge A6: Use of multiple streams from libraries}
Multiple streams are currently used by NVIDIA and AMD systems to synchronize different computation and communication phases directly on the GPU by providing multiple stream queues.
This synchronization
is much faster than synchronizations done on the CPU.
Care must be taken not to launch too many overlapping streams, however, since they can slow the rate of the
computation. Thus, it is best that a single entity be aware of all the streams; this requires extra effort when different libraries are all using streams.
Moreover, if different libraries produce computations that require synchronizations between the library calls, those libraries must share common streams.
%---------------------------------------
\paragraph{Challenge A7: Multiprecision on the GPU}
GPU systems often come with more single-precision than
double-precision floating-point units. Even bandwidth-limited computations with single precision can be faster because they
require half of the double-precision computations' memory bandwidth.
Sparse matrix operations do not usually see dramatic improvements, however, because the
integer indices remain the same size.
With the compressed sparse row matrix format, the amount of data that needs to be
transferred from the matrix is $ 64 nz + 32 nz$ for double-precision
computations with 32-bit column indices and $ 32 nz + 32 nz$ for single-precision computations, a $ 2/3 $ ratio. Further reductions in the factor
require compression of the column indices or
% the exploitation of different
using a different storage format; see, for example, \cite{filippone2017GPGPUSpMV}.
Recent GPUs also contain tensor computing units that use 16-bit floating-point operations. These are specialized and do not provide general-purpose 16-bit
floating-point computations, but they have significant computing power; they can be used effectively for dense \cite{haidar2018harnessing} or even sparse matrix computations \cite{zachariadis2020accelerating}. See
also a recent survey on multiprecision computations for GPUs \cite{blanchard2020mixed}.
% \vspace{-10pt}
%%########################################################
\section{Progress on PETSc's back-end for GPUs}
PETSc employs an object-oriented approach with the delegation pattern; every
object is an instance of a class whose data structure and functionality is
provided by specifying a delegated implementation type at runtime. For example,
a matrix in compressed sparse row representation is created as an instance
of class {\tt Mat} with type {\tt MATAIJ}, whereas a sliced ELLPACK storage
matrix has type {\tt MATSELL}.
We refer
to the API of the classes that the user code interacts with as the {\it front-end}, while we call the delegated
classes the {\it back-end}.
Using GPUs to execute the linear algebra operations defined over {\tt Vec} and
{\tt Mat} is accomplished by choosing the appropriate delegated class. For
instance, the computations will use the vendor-provided kernels
from NVIDIA if {\tt VECCUDA} and {\tt MATCUSPARSE} are
specified in user code or through command line
options.
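As a concrete illustration, the sketch below selects the CUDA back-end classes in user code; the type constants follow recent PETSc releases, and the problem size {\tt n} is illustrative. The same choice can instead be made at runtime with command-line options such as {\tt -vec\_type cuda} and {\tt -mat\_type aijcusparse}.
\begin{lstlisting}[caption={A sketch of selecting CUDA back-end classes; the size {\tt n} is illustrative},label={lst:backend},frame=single,captionpos=b]
Vec x; Mat A;
VecCreate(PETSC_COMM_WORLD,&x);
VecSetSizes(x,PETSC_DECIDE,n);
VecSetType(x,VECCUDA);        // or VecSetFromOptions() with -vec_type cuda
MatCreate(PETSC_COMM_WORLD,&A);
MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,n,n);
MatSetType(A,MATAIJCUSPARSE); // or MatSetFromOptions() with -mat_type aijcusparse
\end{lstlisting}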
Because the higher-level classes such as the timesteppers {\tt TS} ultimately employ {\tt Vec}
and {\tt Mat} operations for the bulk of their computations, this provides a
means to offload most of the computation for PETSc solvers---even the most
complicated and sophisticated---onto GPUs.
The GPU-specific delegated classes follow the {\bf lazy-mirror} model
\cite{petsc-msk2013}. These implementations internally manage (potentially) two
copies of the data---one on the CPU and one on the GPU---and track which copy is current. When the computation for that data is always on the GPU, there is no
copy back to the CPU. When a GPU implementation of a requested operation
is available, the GPU copy of the data is updated (if needed), and the
GPU-optimized back-end class method is executed. Otherwise, the CPU copy of the data is updated (if
needed), and the CPU implementation is used.
This mechanism offers two advantages \cite{Karl2020preparing}. First, the full set of
operations for the {\tt Vec} and {\tt Mat} classes is always available, and
developers can incrementally add more back-end class methods as execution
bottlenecks become apparent. Second, some operations are difficult or impossible
to implement efficiently on GPUs, and using the CPU implementation offers
acceptable or even optimal performance.
When unified shared memory is available,
the PETSc back-end classes could allocate only a single unified buffer for
both the CPU and GPU and allow the operating system to manage the CPU-GPU data movement. However, this approach is likely to be slower than having PETSc manage
the memory transfers itself.
%In what follows, we repeat each challenge introduced in Section
%\ref{sec:challenges} and discuss PETSc response strategies.
%------------------------------------------
\paragraph{Response F1: Portability for application codes}
%This is discussed in the introduction and the Section \ref{petsc programming model} and allows mixing and matching the users programming model and the PETSc/vendors back-end programming software.
Each PETSc back-end comprises two parts: the calls to numerical libraries that
provide the per GPU algorithmic implementation and the glue code that connects
these calls to the front-end code. The numerical libraries are cuBLAS and
cuSPARSE for the CUDA back-end, ROCm for HIP,
ViennaCL \cite{VIENNACL} for OpenCL (and potentially also for CUDA and
HIP), MKL for SYCL, and Kokkos Kernels for
Kokkos. Different PETSc back-end subclasses
can be used in the same application.
Our highest priority at present is to prepare for the upcoming exascale systems.
We already have a working Kokkos and Kokkos Kernels back-end. We
are completing the HIP back-end to exploit the AMD GPUs
planned for the Frontier system.
The basic configuration for HIP is in place, as well as the initial vector
operations. Development is in progress and being tested on the Tulip Frontier prototype
system. Work has begun on our next planned back-end, based on SYCL and MKL, for use with the Intel GPUs planned for the Aurora system.
As our computation kernels' development continues, we are also
abstracting the fundamental types, their initialization, and the various libraries' interfaces to reduce code duplication.
The PETSc team will work with the Intel and AMD compiler groups to determine the information needed from the OpenMP offload compiler to share its data with the PETSc back-end.
%---------------------------------------
%\paragraph{Response F2: Algorithms for high throughput systems}
%A robust research program developing advanced algorithms for GPU-based systems is being undertaken by the PETSc team and their collaborators.
%------------------------------------------
\paragraph{Response F3: Utilizing all GPU (and CPU) compute power}
Achieving high utilization requires a rethinking of outer-level algorithms to take
advantage of operations with higher arithmetic intensity.
Compact, dense quasi-Newton representations present an avenue for increasing arithmetic intensity and increasing GPU utilization for outer-level algorithms.
Compared with conventional limited-memory matrix-free implementations, these formulations require fewer kernel launches, avoid dot products, and compute the approximate Jacobian action via a sequence of matrix-vector products constructed from accumulated quasi-Newton updates.
Initial support
for Krylov methods with multiple right-hand sides has
been added to PETSc \cite{KSPHPDDM}. While having higher
arithmetic intensity, these methods generate larger Krylov subspaces and typically converge
in fewer iterations, and they may thus provide algorithmic speedup for calculating eigenvalues and for multiobjective adjoint computations.
These block methods significantly decrease the number of sequentially launched kernels, providing outer loop fusion
that reduces CPU\hyp{}GPU latencies and calls; see challenge F2. Even with the multiple right-hand side optimizations, NVIDIA's cuSPARSE on a V100
with extremely sparse matrix-vector multiplication achieves only 4\% of peak, thus indicating the desirability of implementing algorithms that
fundamentally change the performance for such simulations.
Outer loop fusion will also be applied to other PETSc operations, such as the multiple dot-product
operations needed in the Krylov methods (replacing a collection of BLAS 1 operations with a single BLAS 2 operation) to reduce
the number of kernel launches and get more data reuse on the GPU and higher utilization.
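An existing example of such fusion in the PETSc API is {\tt VecMDot}, which computes several dot products against a common vector in a single call; a sketch (vector setup omitted) follows.
\begin{lstlisting}[caption={Fusing several dot products into one {\tt VecMDot} call; vector setup is omitted},label={lst:vecmdot},frame=single,captionpos=b]
Vec x,y[4]; PetscScalar dots[4];
// ... create and fill x and y[0..3] ...
// one fused call instead of four separate VecDot() reductions
VecMDot(x,4,y,dots);
\end{lstlisting}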
A cross-utilization of the CPU and GPUs can occur even within what is
conceptually a single computation. Consider the computation of matrix elements
for a discretization scheme and their insertion into a matrix data
structure. Within PETSc, value insertion is performed in a loop with the
general routine {\tt MatSetValues}, which efficiently adjusts the row sizes
on the fly as values are inserted and takes care of communicating off-processor
values, if needed.
However, the many repeated calls to {\tt MatSetValues} run on the CPU and may
cause a bottleneck when the assembling process runs on a GPU. To address this issue, we
have developed a variant of the function callable from kernels that
will guarantee fast GPU resident assembly. In a somewhat different scenario,
common among complicated applications utilizing PETSc only for
its robust solver functionality, the matrix values can be already available from
the application itself; for such cases, we have recently developed a new API
that accepts a general matrix description in a coordinate list (COO) format
and directly assembles on the GPU utilizing efficient kernels from the Thrust
library. %\cite{THRUST}.
For numerical results, see Section~\ref{subsec:openfoam}.
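A sketch of this coordinate-list path is shown below; it assumes the COO interface as exposed in recent PETSc releases ({\tt MatSetPreallocationCOO} and {\tt MatSetValuesCOO}), and the arrays are illustrative.
\begin{lstlisting}[caption={A sketch of COO matrix assembly; function names follow recent PETSc releases and arrays are illustrative},label={lst:coo},frame=single,captionpos=b]
// coo_i[k],coo_j[k]: row and column of the k-th nonzero contribution
MatSetPreallocationCOO(A,ncoo,coo_i,coo_j); // analyze the pattern once
// v[] may reside in GPU memory when A uses a GPU back-end type
MatSetValuesCOO(A,v,INSERT_VALUES);         // assemble the values; repeatable
\end{lstlisting}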
PETSc will use the simultaneous kernel launch capability of NVSHMEM
to begin its parallel solvers, thus making them more tightly integrated and remaining in step during
the solve process. This is a relatively straightforward generalization of the current solver
launches, and it will be opaque to the users.
The PETSc back-end model is designed to support applications and libraries
using a combination of CPUs and GPUs for computations. Each back-end class
implementation provides two implementations, for host and device, and contains
the code to manage data transfers. This strategy allows the users of PETSc to prescribe the next group of computations to take place by merely
setting a flag on the object.
With this approach, running, for example, the coarser levels of multigrid on the
CPU and the finer levels on the GPU is easy.
Unnecessary GPU-to-CPU communication is avoided with this approach, and any
needed communication is handled internally by the object.
The
associated message passing required by the solver is managed automatically by
{\tt PetscSF}; see Response A2.
\paragraph{Response A1: Managing the kernel queue}
Since MPI does not use streams,
inner products and ghost point updates require a CPU synchronization at these steps; one cannot, for example, pipeline an entire iteration of a solver together.
We will provide support for NVSHMEM to allow the communication to utilize the stream queue; see Response A2. This is an important addition to PETSc because it eliminates the empty stream queue problem.
%---------------------------------------
\paragraph{Response A2: Network communication }
\label{subsec:challengeMPIGPUSF}
\texttt{PetscSF} is the PETSc communication module that abstracts communication graphs.
{\tt PetscSF} is incrementally replacing all direct use of MPI in PETSc. This will allow PETSc to utilize the features of NVSHMEM that are required
to eliminate the problems of Challenge A2. A star is a simple tree consisting of one root vertex connected to zero or more leaves. A star forest is a disjoint union of stars.
%GAIL - where did this about star come from?
A \texttt{PetscSF} is created collectively by specifying, for each leaf on the current rank, the rank and offset of the corresponding root, shown in \autoref{fig:starpart}. {\tt PetscSF} then analyzes the graph and derives a communication pattern using persistent nonblocking MPI send and receive (default), other collectives, one-sided, or neighborhood collectives. {\tt PetscSF} provides APIs such as \texttt{PetscSF\{Bcast,Reduce\}Begin/End}. The former
broadcasts root values to leaves, and the latter reduces leaf values into roots with an
\texttt{MPI\_Op}. The \texttt{Begin/End} split-phase design allows users to
insert computations in between, in order to overlap with communication.
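A sketch of building and using a {\tt PetscSF} follows; the graph data ({\tt nroots}, {\tt nleaves}, {\tt iremote}) and the buffers are illustrative, and the trailing {\tt MPI\_Op} argument of \texttt{PetscSFBcastBegin/End} appears only in recent PETSc releases.
\begin{lstlisting}[caption={A sketch of creating and using a {\tt PetscSF}; graph data and buffers are illustrative},label={lst:sf},frame=single,captionpos=b]
PetscSF sf; PetscSFNode *iremote; // iremote[k]: (owner rank, root offset) of leaf k
PetscSFCreate(PETSC_COMM_WORLD,&sf);
// nroots local roots; nleaves leaves stored contiguously (ilocal = NULL)
PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
PetscSFBcastBegin(sf,MPIU_SCALAR,rootdata,leafdata,MPI_REPLACE);
// ... local computation overlapping with communication ...
PetscSFBcastEnd(sf,MPIU_SCALAR,rootdata,leafdata,MPI_REPLACE);
\end{lstlisting}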
\begin{figure}
\centering
\resizebox{.45\textwidth}{!}{
\begin{tikzpicture}[scale=0.6]
\node[blue!50!black] (rtitle) at (0,1) {\bf Roots};
\begin{scope}[every node/.style={rectangle,draw=blue!50!black,fill=blue!20,thick,minimum size=5mm,rounded corners=2.5mm}]
\node (r11) at (0,-1) {11};
\node (r12) at (0,-2) {12};
\node (r13) at (0,-3) {13};
\node (r21) at (0,-4.5) {21};
\node (r22) at (0,-5.5) {22};
\node (r23) at (0,-6.5) {23};
\node (r24) at (0,-7.5) {24};
\node (r31) at (0,-9) {31};
\node (r32) at (0,-10) {32};
\end{scope}
\begin{scope}[every node/.style={circle,draw=blue!50!black,thick}]
\node[rectangle,fill=none,fit=(r11)(r13)] (root0) {};
\node[rectangle,fill=none,fit=(r21)(r24)] (root1) {};
\node[rectangle,fill=none,fit=(r31)(r32)] (root2) {};
\end{scope}
\node[thick,left] at (root0.west) {rank 0};
\node[thick,left] at (root1.west) {rank 1};
\node[thick,left] at (root2.west) {rank 2};
\node[green!50!black] (rtitle) at (6,1) {\bf Leaves};
\begin{scope}[every node/.style={rectangle,draw=green!50!black,fill=green!20,thick}]
\node (c1100) at (6,0) {1100};
\node (c1200) at (6,-1) {1200};
\node (c1300) at (6,-2) {1300};
\node (c1400) at (6,-3) {1400};
\node[fill=none,fit=(c1100) (c1400)] (leaf0) {};
\node (c2100) at (6,-4.5) {2100};
\node (c2200) at (6,-5.5) {2200};
\node (c2300) at (6,-6.5) {2300};
\node (c2400) at (6,-7.5) {2400};
\node[fill=none,fit=(c2100) (c2400)] (leaf1) {};
\node (c3100) at (6,-9) {3100};
\node (c3200) at (6,-10) {3200};
\node (c3300) at (6,-11) {3300};
\node[fill=none,fit=(c3100) (c3300)] (leaf2) {};
\end{scope}
\begin{scope}[thick,draw=black!50,>=stealth]
\draw (r11) -- (c2200);
\draw (r11) -- (c3100);
\draw (r13) -- (c1400);
\draw (r21) -- (c1200);
\draw (r21) -- (c1300);
\draw (r21) -- (c3200);
\draw (r23) -- (c1100);
\draw (r24) -- (c3300);
\draw (r31) -- (c2100);
\draw (r32) -- (c2400);
\end{scope}
\node[black!70] (rtitle) at (9,1) {\tt local};
\node[black!70] (rtitle) at (11,1) {\tt remote};
\begin{scope}[every node/.style={rectangle,draw=black!50,fill=black!20,thick}]
\node (ll1100) at (9,0) {0};
\node (ll1200) at (9,-1) {1};
\node (ll1300) at (9,-2) {2};
\node (ll1400) at (9,-3) {3};
\node[fill=none,fit=(ll1100) (ll1400)] (leaf0) {};
\node (lr1100) at (11,0) {(1,2)};
\node (lr1200) at (11,-1) {(1,0)};
\node (lr1300) at (11,-2) {(1,0)};
\node (lr1400) at (11,-3) {(0,2)};
\node[fill=none,fit=(lr1100) (lr1400)] (leaf0) {};
\node (ll2100) at (9,-4.5) {0};
\node (ll2200) at (9,-5.5) {1};
\node (ll2400) at (9,-6.5) {3};
\node[fill=none,fit=(ll2100) (ll2400)] (leaf1) {};
\node (lr2100) at (11,-4.5) {(2,0)};
\node (lr2200) at (11,-5.5) {(0,0)};
\node (lr2400) at (11,-6.5) {(2,1)};
\node[fill=none,fit=(lr2100) (lr2400)] (leaf1) {};
\node (ll3100) at (9,-9) {0};
\node (ll3200) at (9,-10) {1};
\node (ll3300) at (9,-11) {2};
\node[fill=none,fit=(ll3100) (ll3300)] (leaf2) {};
\node (lr3100) at (11,-9) {(0,0)};
\node (lr3200) at (11,-10) {(1,0)};
\node (lr3300) at (11,-11) {(1,3)};