From 1d58917a3a9329f9b3b951731e71dadd5074849a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 22 Sep 2023 07:31:24 +0000 Subject: [PATCH 1/7] arch: Add mapping for Intel PonteVecchio --- devito/arch/archinfo.py | 26 +++++++++++++++++++------- devito/passes/iet/languages/openmp.py | 3 ++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 88a397206f..0a5ac0588d 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -18,15 +18,17 @@ 'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path', 'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice', - # Intel + # Intel CPUs 'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210', 'SKX', 'KLX', 'CLX', 'CLK', 'SPR', - # ARM + # ARM CPUs 'AMD', 'ARM', 'M1', 'GRAVITON', - # Other loosely supported CPU architectures + # Other legacy CPUs 'POWER8', 'POWER9', - # GPUs - 'AMDGPUX', 'NVIDIAX', 'INTELGPUX'] + # Generic GPUs + 'AMDGPUX', 'NVIDIAX', 'INTELGPUX', + # Intel GPUs + 'PVC'] @memoized_func @@ -638,13 +640,20 @@ def _detect_isa(self): class Device(Platform): - def __init__(self, name, cores_logical=1, cores_physical=1, isa='cpp'): + def __init__(self, name, cores_logical=1, cores_physical=1, isa='cpp', + max_threads_per_block=1024, max_threads_dimx=1024, + max_threads_dimy=1024, max_threads_dimz=64): super().__init__(name) self.cores_logical = cores_logical self.cores_physical = cores_physical self.isa = isa + self.max_threads_per_block = max_threads_per_block + self.max_threads_dimx = max_threads_dimx + self.max_threads_dimy = max_threads_dimy + self.max_threads_dimz = max_threads_dimz + @classmethod def _mro(cls): # Retain only the Device Platforms @@ -760,6 +769,8 @@ def march(cls): AMDGPUX = AmdDevice('amdgpuX') INTELGPUX = IntelDevice('intelgpuX') +PVC = IntelDevice('pvc', max_threads_per_block=4096) + platform_registry = { 'cpu64-dummy': CPU64_DUMMY, @@ -783,7 +794,8 @@ def march(cls): 'power9': POWER9, 'nvidiaX': NVIDIAX, # Generic NVidia GPU 'amdgpuX': AMDGPUX, # Generic AMD GPU - 'intelgpuX': INTELGPUX # Generic Intel GPU + 'intelgpuX': INTELGPUX, # Generic Intel GPU + 'pvc': PVC # Intel Ponte Vecchio GPU } """ Registry dict for deriving Platform classes according to the environment variable diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index 0a6876e608..c5c0fab824 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -3,7 +3,7 @@ import cgen as c from sympy import And, Ne, Not -from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX +from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX, PVC from devito.arch.compiler import GNUCompiler from devito.ir import (Call, Conditional, DeviceCall, List, Prodder, ParallelBlock, PointerCast, While, FindSymbols) @@ -117,6 +117,7 @@ class OmpBB(LangBB): AMDGPUX: None, NVIDIAX: None, INTELGPUX: None, + PVC: None, # Runtime library 'init': None, 'thread-num': lambda retobj=None: From 2e4594aefa52a4246ffb29d4f8a940a529348021 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 22 Sep 2023 07:32:09 +0000 Subject: [PATCH 2/7] compiler: Make Parizer more easily overridable --- devito/passes/iet/languages/openmp.py | 8 ------ devito/passes/iet/parpragma.py | 37 ++++++++++++++++++--------- devito/tools/data_structures.py | 3 ++- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index c5c0fab824..60b0eb0c41 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -78,14 +78,6 @@ def _make_clauses(cls, **kwargs): return clauses - @classmethod - def _process_kwargs(cls, **kwargs): - kwargs = super()._process_kwargs(**kwargs) - - kwargs.pop('gpu_fit', None) - - return kwargs - class ThreadedProdder(Conditional, Prodder): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 29ba9a986b..b32f1e0cc5 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -15,8 +15,7 @@ from devito.passes.iet.langbase import (LangBB, LangTransformer, DeviceAwareMixin, make_sections_from_imask) from devito.symbolics import INT, ccode -from devito.tools import as_tuple, flatten, is_integer, prod -from devito.tools.data_structures import UnboundTuple +from devito.tools import UnboundTuple, as_tuple, flatten, is_integer, prod from devito.types import Symbol __all__ = ['PragmaSimdTransformer', 'PragmaShmTransformer', @@ -47,8 +46,21 @@ def _support_array_reduction(cls, compiler): def simd_reg_size(self): return self.platform.simd_reg_size - @iet_pass - def make_simd(self, iet): + def _make_simd_pragma(self, iet): + indexeds = FindSymbols('indexeds').visit(iet) + aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction} + if aligned: + simd = self.lang['simd-for-aligned'] + simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_size)) + else: + simd = as_tuple(self.lang['simd-for']) + + return simd + + def _make_simd(self, iet): + """ + Carry out the bulk of `make_simd`. + """ mapper = {} for tree in retrieve_iteration_tree(iet): candidates = [i for i in tree if i.is_ParallelRelaxed] @@ -103,13 +115,7 @@ def make_simd(self, iet): continue # Add SIMD pragma - indexeds = FindSymbols('indexeds').visit(candidate) - aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction} - if aligned: - simd = self.lang['simd-for-aligned'] - simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_size)) - else: - simd = as_tuple(self.lang['simd-for']) + simd = self._make_simd_pragma(candidate) pragmas = candidate.pragmas + simd # Add VECTORIZED property @@ -121,6 +127,10 @@ def make_simd(self, iet): return iet, {} + @iet_pass + def make_simd(self, iet): + return self._make_simd(iet) + class PragmaIteration(ParallelIteration): @@ -128,7 +138,9 @@ def __init__(self, *args, parallel=None, schedule=None, chunk_size=None, nthreads=None, ncollapsed=None, reduction=None, tile=None, gpu_fit=None, **kwargs): - construct = self._make_construct(parallel=parallel) + construct = self._make_construct( + parallel=parallel, ncollapsed=ncollapsed, tile=tile + ) clauses = self._make_clauses( ncollapsed=ncollapsed, chunk_size=chunk_size, nthreads=nthreads, reduction=reduction, schedule=schedule, tile=tile, gpu_fit=gpu_fit, @@ -646,6 +658,7 @@ def _make_partree(self, candidates, nthreads=None, index=None): if self._is_offloadable(root): body = self.DeviceIteration(gpu_fit=self.gpu_fit, ncollapsed=len(collapsable) + 1, + tile=self.par_tile, **root.args) partree = ParallelTree([], body, nthreads=nthreads) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 803f399e3e..78890e59e0 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -10,7 +10,8 @@ from devito.tools.algorithms import toposort __all__ = ['Bunch', 'EnrichedTuple', 'ReducerMap', 'DefaultOrderedDict', - 'OrderedSet', 'Ordering', 'DAG', 'frozendict', 'UnboundedMultiTuple'] + 'OrderedSet', 'Ordering', 'DAG', 'frozendict', + 'UnboundTuple', 'UnboundedMultiTuple'] class Bunch(object): From 638968defa0cd3ac2b9ebeac256deffe037a1e74 Mon Sep 17 00:00:00 2001 From: mloubout Date: Fri, 22 Sep 2023 13:07:35 -0400 Subject: [PATCH 3/7] compiler: fix tile input --- devito/passes/iet/parpragma.py | 2 +- devito/tools/data_structures.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index b32f1e0cc5..c580229075 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -658,7 +658,7 @@ def _make_partree(self, candidates, nthreads=None, index=None): if self._is_offloadable(root): body = self.DeviceIteration(gpu_fit=self.gpu_fit, ncollapsed=len(collapsable) + 1, - tile=self.par_tile, + tile=self.par_tile.next(), **root.args) partree = ParallelTree([], body, nthreads=nthreads) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 78890e59e0..0aa06a15dd 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -728,6 +728,8 @@ def __init__(self, items): self.current = 0 def next(self): + if self.last == 0: + return None item = self.items[self.current] self.current = min(self.last - 1, self.current+1) return item From 906f75ec96d88ff4c577f56a1eae31127e042d06 Mon Sep 17 00:00:00 2001 From: mloubout Date: Mon, 25 Sep 2023 10:17:32 -0400 Subject: [PATCH 4/7] misc: rework multituple fir easier use --- devito/core/gpu.py | 2 +- devito/core/operator.py | 6 +- devito/passes/iet/languages/openacc.py | 14 +-- devito/passes/iet/parpragma.py | 4 +- devito/tools/data_structures.py | 136 +++++++++++++++---------- tests/test_gpu_openacc.py | 10 +- 6 files changed, 103 insertions(+), 69 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 8a75b2857a..46f8914f6b 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -65,7 +65,7 @@ def _normalize_kwargs(cls, **kwargs): o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE) # GPU parallelism - o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4)) + o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4, 4)) o['par-collapse-ncores'] = 1 # Always collapse (meaningful if `par-tile=False`) o['par-collapse-work'] = 1 # Always collapse (meaningful if `par-tile=False`) o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE) diff --git a/devito/core/operator.py b/devito/core/operator.py index 887c79978a..19f61f4627 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -329,7 +329,9 @@ class OptOption(object): class ParTileArg(tuple): - def __new__(cls, items, rule=None, tag=None): + def __new__(cls, items, shm=0, tag=None): + if items is None: + items = tuple() obj = super().__new__(cls, items) obj.rule = rule obj.tag = tag @@ -340,7 +342,7 @@ class ParTile(tuple, OptOption): def __new__(cls, items, default=None): if not items: - return None + return tuple() elif isinstance(items, bool): if not default: raise ValueError("Expected `default` value, got None") diff --git a/devito/passes/iet/languages/openacc.py b/devito/passes/iet/languages/openacc.py index 939a68f304..875d550417 100644 --- a/devito/passes/iet/languages/openacc.py +++ b/devito/passes/iet/languages/openacc.py @@ -13,7 +13,7 @@ from devito.passes.iet.languages.C import CBB from devito.passes.iet.languages.openmp import OmpRegion, OmpIteration from devito.symbolics import FieldFromPointer, Macro, cast_mapper -from devito.tools import filter_ordered +from devito.tools import filter_ordered, UnboundTuple from devito.types import DeviceMap, Symbol __all__ = ['DeviceAccizer', 'DeviceAccDataManager', 'AccOrchestrator'] @@ -30,7 +30,8 @@ def _make_clauses(cls, ncollapsed=0, reduction=None, tile=None, **kwargs): clauses = [] if tile: - clauses.append('tile(%s)' % ','.join(str(i) for i in tile)) + stile = [str(tile[i]) for i in range(ncollapsed)] + clauses.append('tile(%s)' % ','.join(stile)) elif ncollapsed > 1: clauses.append('collapse(%d)' % ncollapsed) @@ -159,18 +160,13 @@ def _make_partree(self, candidates, nthreads=None): assert candidates root, collapsable = self._select_candidates(candidates) - ncollapsable = len(collapsable) + ncollapsable = len(collapsable) + 1 if self._is_offloadable(root) and \ all(i.is_Affine for i in [root] + collapsable) and \ self.par_tile: tile = self.par_tile.next() - assert isinstance(tile, tuple) - nremainder = (ncollapsable + 1) - len(tile) - if nremainder >= 0: - tile += (tile[-1],)*nremainder - else: - tile = tile[:ncollapsable + 1] + assert isinstance(tile, UnboundTuple) body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile, ncollapsed=ncollapsable, **root.args) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index c580229075..57c7af5cb5 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -622,7 +622,7 @@ def __init__(self, sregistry, options, platform, compiler): super().__init__(sregistry, options, platform, compiler) self.gpu_fit = options['gpu-fit'] - self.par_tile = UnboundTuple(options['par-tile']) + self.par_tile = UnboundTuple(*options['par-tile']) self.par_disabled = options['par-disabled'] def _score_candidate(self, n0, root, collapsable=()): @@ -657,7 +657,7 @@ def _make_partree(self, candidates, nthreads=None, index=None): if self._is_offloadable(root): body = self.DeviceIteration(gpu_fit=self.gpu_fit, - ncollapsed=len(collapsable) + 1, + ncollapsed=len(collapsable)+1, tile=self.par_tile.next(), **root.args) partree = ParallelTree([], body, nthreads=nthreads) diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 0aa06a15dd..198bf2fb0b 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -639,7 +639,80 @@ def __hash__(self): return self._hash -class UnboundedMultiTuple(object): +class UnboundTuple(object): + """ + An UnboundedTuple is a tuple that can be + infinitely iterated over. + + Examples + -------- + >>> ub = UnboundTuple((1, 2),(3, 4)) + >>> ub + UnboundTuple(UnboundTuple(1, 2), UnboundTuple(3, 4)) + >>> ub.next() + UnboundTuple(1, 2) + >>> ub.next() + UnboundTuple(3, 4) + >>> ub.next() + UnboundTuple(3, 4) + """ + + def __init__(self, *items): + nitems = [] + for i in as_tuple(items): + if isinstance(i, Iterable): + nitems.append(UnboundTuple(*i)) + elif i is not None: + nitems.append(i) + + self.items = tuple(nitems) + self.last = len(self.items) + self.current = 0 + + @property + def default(self): + return self.items[0] + + @property + def prod(self): + return np.prod(self.items) + + def next(self): + if self.last == 0: + return None + item = self.items[self.current] + if self.current == self.last-1 or self.current == -1: + self.current = -1 + else: + self.current += 1 + return item + + def __len__(self): + return self.last + + def __repr__(self): + sitems = [s.__repr__() for s in self.items] + return "%s(%s)" % (self.__class__.__name__, ", ".join(sitems)) + + def __getitem__(self, idx): + if isinstance(idx, slice): + start = idx.start or 0 + stop = idx.stop or self.last + if stop < 0: + stop = self.last + stop + step = idx.step or 1 + return UnboundTuple(*[self[i] for i in range(start, stop, step)]) + try: + if idx >= self.last-1: + return self.items[self.last-1] + else: + return self.items[idx] + except TypeError: + # Slice, ... + return UnboundTuple(self[i] for i in idx) + + +class UnboundedMultiTuple(UnboundTuple): """ An UnboundedMultiTuple is an ordered collection of tuples that can be @@ -649,10 +722,10 @@ class UnboundedMultiTuple(object): -------- >>> ub = UnboundedMultiTuple([1, 2], [3, 4]) >>> ub - UnboundedMultiTuple((1, 2), (3, 4)) + UnboundedMultiTuple(UnboundTuple(1, 2), UnboundTuple(3, 4)) >>> ub.iter() >>> ub - UnboundedMultiTuple(*(1, 2), (3, 4)) + UnboundedMultiTuple(UnboundTuple(1, 2), UnboundTuple(3, 4)) >>> ub.next() 1 >>> ub.next() @@ -661,7 +734,7 @@ class UnboundedMultiTuple(object): >>> ub.iter() # No effect, tip has reached the last tuple >>> ub.iter() # No effect, tip has reached the last tuple >>> ub - UnboundedMultiTuple((1, 2), *(3, 4)) + UnboundedMultiTuple(UnboundTuple(1, 2), UnboundTuple(3, 4)) >>> ub.next() 3 >>> ub.next() @@ -672,27 +745,8 @@ class UnboundedMultiTuple(object): """ def __init__(self, *items): - # Normalize input - nitems = [] - for i in as_tuple(items): - if isinstance(i, Iterable): - if isinstance(i, tuple): - # Honours tuple subclasses - nitems.append(i) - else: - nitems.append(tuple(i)) - else: - raise ValueError("Expected sequence, got %s" % type(i)) - - self.items = tuple(nitems) - self.tip = -1 - self.curiter = None - - def __repr__(self): - items = [str(i) for i in self.items] - if self.curiter is not None: - items[self.tip] = "*%s" % items[self.tip] - return "%s(%s)" % (self.__class__.__name__, ", ".join(items)) + super().__init__(*items) + self.current = -1 @property def curitem(self): @@ -706,33 +760,11 @@ def index(self, item): return self.items.index(item) def iter(self): - if not self.items: - raise ValueError("No tuples available") - self.tip = min(self.tip + 1, max(len(self.items) - 1, 0)) - self.curiter = iter(self.items[self.tip]) + self.current = min(self.current + 1, self.last - 1) + self.items[self.current].current = 0 + return def next(self): - if self.curiter is None: + if self.items[self.current].current == -1: raise StopIteration - return next(self.curiter) - - -class UnboundTuple(object): - """ - A simple data structure that returns the last element forever once reached - """ - - def __init__(self, items): - self.items = as_tuple(items) - self.last = len(self.items) - self.current = 0 - - def next(self): - if self.last == 0: - return None - item = self.items[self.current] - self.current = min(self.last - 1, self.current+1) - return item - - def __len__(self): - return self.last + return self.items[self.current].next() diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 3085ad85c9..0d38063c33 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,6 +102,7 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) + stile = (32, 4, 4, 4) if par_tile != (32, 4, 4, 8) else (32, 4, 4, 8) assert len(trees) == 4 assert trees[0][1].pragmas[0].value ==\ @@ -109,11 +110,13 @@ def test_tile_insteadof_collapse(self, par_tile): assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled + strtile = ','.join([str(i) for i in stile]) assert trees[3][1].pragmas[0].value ==\ - 'acc parallel loop collapse(4) present(src,src_coords,u)' + 'acc parallel loop tile(%s) present(src,src_coords,u)' % strtile @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), - ((32, 4, 4), (8, 8, 8))]) + ((32, 4, 4), (8, 8, 8)), + ((32, 4, 4), (8, 8), None)]) def test_multiple_tile_sizes(self, par_tile): grid = Grid(shape=(3, 3, 3)) t = grid.stepping_dim @@ -136,8 +139,9 @@ def test_multiple_tile_sizes(self, par_tile): 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(8,8) present(u)' + sclause = 'collapse(4)' if par_tile[-1] is None else 'tile(8,8,8,8)' assert trees[3][1].pragmas[0].value ==\ - 'acc parallel loop collapse(4) present(src,src_coords,u)' + 'acc parallel loop %s present(src,src_coords,u)' % sclause def test_multi_tile_blocking_structure(self): grid = Grid(shape=(8, 8, 8)) From b7e3872fef2ee145613bb4d25336dc9e7614bd5b Mon Sep 17 00:00:00 2001 From: mloubout Date: Fri, 13 Oct 2023 10:27:20 -0400 Subject: [PATCH 5/7] arch: fix oneapi flags --- devito/arch/compiler.py | 14 +++++++------- docker/Dockerfile.cpu | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py index 4b7542eb70..edb7bbe832 100644 --- a/devito/arch/compiler.py +++ b/devito/arch/compiler.py @@ -13,7 +13,7 @@ from codepy.toolchain import GCCToolchain, call_capture_output as _call_capture_output from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, POWER8, POWER9, GRAVITON, - INTELGPUX, get_nvidia_cc, check_cuda_runtime, + INTELGPUX, PVC, get_nvidia_cc, check_cuda_runtime, get_m1_llvm_path) from devito.exceptions import CompilationError from devito.logger import debug, warning, error @@ -730,7 +730,7 @@ def __init__(self, *args, **kwargs): self.cflags.append("-qopt-zmm-usage=high") if language == 'openmp': - self.ldflags.append('-qopenmp') + self.cflags.append('-fiopenmp') # Make sure the MPI compiler uses `icc` underneath -- whatever the MPI distro is if kwargs.get('mpi'): @@ -792,8 +792,8 @@ def __init__(self, *args, **kwargs): # Earlier versions to OneAPI 2023.2.0 (clang17 underneath), have an OpenMP bug if self.version < Version('17.0.0') and language == 'openmp': - self.ldflags.remove('-qopenmp') - self.ldflags.append('-fopenmp') + self.cflags.remove('-fiopenmp') + self.cflags.append('-fopenmp') if language == 'sycl': self.cflags.append('-fsycl') @@ -804,9 +804,9 @@ def __init__(self, *args, **kwargs): if platform is NVIDIAX: self.cflags.append('-fopenmp-targets=nvptx64-cuda') - if platform is INTELGPUX: - self.cflags.append('-fopenmp-targets=spir64') - self.cflags.append('-fopenmp-target-simd') + if platform in [INTELGPUX, PVC]: + self.cflags.append('-fopenmp-targets=spir64') + self.cflags.append('-fopenmp-target-simd') if platform is INTELGPUX: self.cflags.remove('-g') # -g disables some optimizations in IGC diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 0230da05ce..5f9762a335 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -67,7 +67,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/ # Drivers mandatory for intel gpus RUN curl -fsSL https://repositories.intel.com/graphics/intel-graphics.key | apt-key add - -RUN echo "deb [trusted=yes arch=amd64] https://repositories.intel.com/graphics/ubuntu focal main" > /etc/apt/sources.list.d/intel-graphics.list +RUN echo "deb [trusted=yes arch=amd64] https://repositories.intel.com/graphics/ubuntu all main" > /etc/apt/sources.list.d/intel-graphics.list # Intel advisor and drivers RUN apt-get update -y && \ From 956570b17cda5b5503f7305652af7af5a3023a5b Mon Sep 17 00:00:00 2001 From: mloubout Date: Fri, 13 Oct 2023 10:42:02 -0400 Subject: [PATCH 6/7] deps: add hpc oneapi image --- devito/arch/compiler.py | 12 ++++---- docker/Dockerfile.amd | 2 +- docker/Dockerfile.cpu | 60 ++++++++++++++++++++++++++++++---------- docker/Dockerfile.devito | 7 +++-- docker/Dockerfile.nvidia | 9 ++++-- 5 files changed, 64 insertions(+), 26 deletions(-) diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py index edb7bbe832..8d9babbb5d 100644 --- a/devito/arch/compiler.py +++ b/devito/arch/compiler.py @@ -730,7 +730,7 @@ def __init__(self, *args, **kwargs): self.cflags.append("-qopt-zmm-usage=high") if language == 'openmp': - self.cflags.append('-fiopenmp') + self.ldflags.append('-qopenmp') # Make sure the MPI compiler uses `icc` underneath -- whatever the MPI distro is if kwargs.get('mpi'): @@ -792,8 +792,8 @@ def __init__(self, *args, **kwargs): # Earlier versions to OneAPI 2023.2.0 (clang17 underneath), have an OpenMP bug if self.version < Version('17.0.0') and language == 'openmp': - self.cflags.remove('-fiopenmp') - self.cflags.append('-fopenmp') + self.ldflags.remove('-qopenmp') + self.ldflags.append('-fopenmp') if language == 'sycl': self.cflags.append('-fsycl') @@ -805,10 +805,10 @@ def __init__(self, *args, **kwargs): if platform is NVIDIAX: self.cflags.append('-fopenmp-targets=nvptx64-cuda') if platform in [INTELGPUX, PVC]: - self.cflags.append('-fopenmp-targets=spir64') - self.cflags.append('-fopenmp-target-simd') + self.ldflags.append('-fiopenmp') + self.ldflags.append('-fopenmp-targets=spir64') + self.ldflags.append('-fopenmp-target-simd') - if platform is INTELGPUX: self.cflags.remove('-g') # -g disables some optimizations in IGC self.cflags.append('-gline-tables-only') self.cflags.append('-fdebug-info-for-profiling') diff --git a/docker/Dockerfile.amd b/docker/Dockerfile.amd index 629054fd30..bdcc1c5a26 100644 --- a/docker/Dockerfile.amd +++ b/docker/Dockerfile.amd @@ -125,7 +125,7 @@ ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 RUN apt-get update && \ apt-get install -y dh-autoreconf python3-venv python3-dev python3-pip -RUN apt-get clean && apt-get autoclean && apt-get autoremove && \ +RUN apt-get clean && apt-get autoclean && apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* EXPOSE 8888 diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 5f9762a335..238ab4ff5e 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -3,18 +3,20 @@ # This Dockerfile contains the Devito codes and can be built using different base images. ############################################################## -ARG pyversion=python:3.9 ARG arch=gcc ARG OMPI_BRANCH="v4.1.4" # Base image -FROM ${pyversion}-slim-bullseye as base +FROM ubuntu:22.04 as base ENV DEBIAN_FRONTEND noninteractive -# Install for basic base not containing it +# Install python RUN apt-get update && \ - apt-get install -y vim wget git flex libnuma-dev tmux \ + apt-get install -y dh-autoreconf python3-venv python3-dev python3-pip + +# Install for basic base not containing it +RUN apt-get install -y vim wget git flex libnuma-dev tmux \ numactl hwloc curl \ autoconf libtool build-essential procps @@ -24,7 +26,7 @@ RUN curl https://raw.githubusercontent.com/Azrael3000/tmpi/master/tmpi -o /usr/l # Install OpenGL library, necessary for the installation of GemPy RUN apt-get install -y libgl1-mesa-glx -RUN apt-get clean && apt-get autoclean && apt-get autoremove && \ +RUN apt-get clean && apt-get autoclean && apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* EXPOSE 8888 @@ -65,16 +67,20 @@ FROM base as oneapi RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list -# Drivers mandatory for intel gpus -RUN curl -fsSL https://repositories.intel.com/graphics/intel-graphics.key | apt-key add - -RUN echo "deb [trusted=yes arch=amd64] https://repositories.intel.com/graphics/ubuntu all main" > /etc/apt/sources.list.d/intel-graphics.list - # Intel advisor and drivers RUN apt-get update -y && \ # advisor - apt-get install -y intel-oneapi-advisor && \ - # drivers - apt-get install -y intel-opencl-icd intel-level-zero-gpu level-zero level-zero-dev + apt-get install -y intel-oneapi-advisor + +# Drivers mandatory for intel gpu +# https://dgpu-docs.intel.com/installation-guides/ubuntu/ubuntu-focal.html#ubuntu-20-04-focal +RUN wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor > /usr/share/keyrings/intel-graphics.gpg +RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu focal main" > /etc/apt/sources.list.d/intel.list + +RUN apt-get update -y && apt-get dist-upgrade -y && \ + apt-get install -y intel-opencl-icd intel-level-zero-gpu level-zero level-zero-dev \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev ############################################################## # ICC image @@ -83,7 +89,7 @@ RUN apt-get update -y && \ FROM oneapi as icc RUN apt-get update -y && apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic intel-oneapi-mpi-devel && \ - apt-get clean && apt-get autoclean && apt-get autoremove && \ + apt-get clean && apt-get autoclean && apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* # Devito config @@ -99,7 +105,7 @@ ENV MPI4PY_FLAGS='. /opt/intel/oneapi/setvars.sh && CFLAGS="-cc=icc"' FROM oneapi as icx RUN apt-get update -y && apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mpi-devel && \ - apt-get clean && apt-get autoclean && apt-get autoremove && \ + apt-get clean && apt-get autoclean && apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* # Devito config @@ -108,3 +114,29 @@ ENV DEVITO_LANGUAGE="openmp" # MPICC compiler for mpi4py ENV MPICC=mpiicc ENV MPI4PY_FLAGS='. /opt/intel/oneapi/setvars.sh && CFLAGS="-cc=icx"' + +############################################################## +# ICX hpc image +############################################################## +FROM oneapi as icx-hpc + +# Install both icc and icx to avoid missing dependencies +RUN apt-get update -y && \ + apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mpi-devel && \ + apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic + +# Missig components +# https://www.intel.com/content/www/us/en/developer/tools/oneapi/hpc-toolkit-download.html?operatingsystem=linux&distributions=aptpackagemanager +RUN curl -f "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/ebf5d9aa-17a7-46a4-b5df-ace004227c0e/l_dpcpp-cpp-compiler_p_2023.2.1.8.sh" -O && \ + chmod +x l_dpcpp-cpp-compiler_p_2023.2.1.8.sh && ./l_dpcpp-cpp-compiler_p_2023.2.1.8.sh -a -s --eula accept && \ + rm l_dpcpp-cpp-compiler_p_2023.2.1.8.sh + +RUN apt-get clean && apt-get autoclean && apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* + +# Devito config +ENV DEVITO_ARCH="icx" +ENV DEVITO_LANGUAGE="openmp" +# MPICC compiler for mpi4py +ENV MPICC=mpiicc +ENV MPI4PY_FLAGS='. /opt/intel/oneapi/setvars.sh && CFLAGS="-cc=icx"' \ No newline at end of file diff --git a/docker/Dockerfile.devito b/docker/Dockerfile.devito index 9c5578844e..703964e45e 100644 --- a/docker/Dockerfile.devito +++ b/docker/Dockerfile.devito @@ -15,6 +15,9 @@ ARG GROUP_ID=1000 # Copy Devito ADD . /app/devito +# Update if outdated +RUN apt-get update && apt-get upgrade -y + # Remove git files RUN rm -rf /app/devito/.git @@ -29,7 +32,7 @@ RUN python3 -m venv /venv && \ # Usefull utilities # Nvtop -RUN apt update && apt install -y git cmake libncurses5-dev libncursesw5-dev libdrm-dev libsystemd-dev cmake && \ +RUN apt-get install -y git cmake libncurses5-dev libncursesw5-dev libdrm-dev libsystemd-dev cmake && \ git clone https://github.com/Syllo/nvtop.git /app/nvtop && \ mkdir -p /app/nvtop/build && cd /app/nvtop/build && \ cmake .. -DNVIDIA_SUPPORT=ON -DAMDGPU_SUPPORT=ON -DINTEL_SUPPORT=ON && \ @@ -38,7 +41,7 @@ RUN apt update && apt install -y git cmake libncurses5-dev libncursesw5-dev libd ln -fs /app/nvtop/build/src/nvtop /venv/bin/nvtop # Safety cleanup -RUN apt-get clean && apt-get autoclean && apt-get autoremove && \ +RUN apt-get clean && apt-get autoclean && apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* FROM $base as user diff --git a/docker/Dockerfile.nvidia b/docker/Dockerfile.nvidia index 8b0086a8ca..c2d6d074a5 100644 --- a/docker/Dockerfile.nvidia +++ b/docker/Dockerfile.nvidia @@ -2,16 +2,19 @@ # This Dockerfile contains the NVidia HPC SDK (nvc, cuda, OpenMPI) for Devito ############################################################## ARG ver -ARG pyversion=python:3.9 ARG arch="nvc" ######################################################################## # Build base image with apt setup and common env ######################################################################## -FROM ${pyversion}-slim-bullseye as sdk-base +FROM ubuntu:22.04 as sdk-base ENV DEBIAN_FRONTEND noninteractive +# Install python +RUN apt-get update && \ + apt-get install -y dh-autoreconf python3-venv python3-dev python3-pip + RUN apt-get update -y && \ apt-get install -y -q gpg apt-utils curl wget vim libnuma-dev tmux numactl @@ -131,7 +134,7 @@ RUN python3 -m venv /venv && \ /venv/bin/jupyter serverextension enable dask_labextension && \ rm -rf ~/.cache/pip -RUN apt-get clean && apt-get autoclean && apt-get autoremove && \ +RUN apt-get clean && apt-get autoclean && apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* EXPOSE 8888 From 0a3d8083d833d71a158b8f581e00a7c29c13cd98 Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 31 Oct 2023 09:09:45 -0400 Subject: [PATCH 7/7] misc: cleanup multituple --- devito/core/gpu.py | 2 +- devito/core/operator.py | 10 +++---- devito/tools/data_structures.py | 52 +++++++++++++++++++-------------- tests/test_gpu_openacc.py | 1 - 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 46f8914f6b..7aa24e492f 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -65,7 +65,7 @@ def _normalize_kwargs(cls, **kwargs): o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE) # GPU parallelism - o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4, 4)) + o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4)) o['par-collapse-ncores'] = 1 # Always collapse (meaningful if `par-tile=False`) o['par-collapse-work'] = 1 # Always collapse (meaningful if `par-tile=False`) o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE) diff --git a/devito/core/operator.py b/devito/core/operator.py index 19f61f4627..3daa991316 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -6,7 +6,7 @@ from devito.mpi.routines import mpi_registry from devito.parameters import configuration from devito.operator import Operator -from devito.tools import as_tuple, is_integer, timed_pass +from devito.tools import as_tuple, is_integer, timed_pass, UnboundTuple from devito.types import NThreads __all__ = ['CoreOperator', 'CustomOperator', @@ -327,12 +327,12 @@ class OptOption(object): pass -class ParTileArg(tuple): +class ParTileArg(UnboundTuple): - def __new__(cls, items, shm=0, tag=None): + def __new__(cls, items, rule=None, tag=None): if items is None: items = tuple() - obj = super().__new__(cls, items) + obj = super().__new__(cls, *items) obj.rule = rule obj.tag = tag return obj @@ -355,7 +355,7 @@ def __new__(cls, items, default=None): x = items[0] if is_integer(x): - # E.g., (32, 4, 8) + # E.g., 32 items = (ParTileArg(items),) elif x is None: diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 198bf2fb0b..d8f92e0cf2 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -216,7 +216,7 @@ def __reduce__(self): args = tuple() else: args = self.default_factory, - return type(self), args, None, None, self.items() + return type(self), args, None, None, self() def copy(self): return self.__copy__() @@ -639,7 +639,7 @@ def __hash__(self): return self._hash -class UnboundTuple(object): +class UnboundTuple(tuple): """ An UnboundedTuple is a tuple that can be infinitely iterated over. @@ -657,30 +657,37 @@ class UnboundTuple(object): UnboundTuple(3, 4) """ - def __init__(self, *items): + def __new__(cls, *items, **kwargs): nitems = [] for i in as_tuple(items): - if isinstance(i, Iterable): + if isinstance(i, UnboundTuple): + nitems.append(i) + elif isinstance(i, Iterable): nitems.append(UnboundTuple(*i)) elif i is not None: nitems.append(i) - self.items = tuple(nitems) - self.last = len(self.items) - self.current = 0 + obj = super().__new__(cls, tuple(nitems)) + obj.last = len(nitems) + obj.current = 0 + + return obj @property def default(self): - return self.items[0] + return self[0] @property def prod(self): - return np.prod(self.items) + return np.prod(self) + + def iter(self): + self.current = 0 def next(self): if self.last == 0: return None - item = self.items[self.current] + item = self[self.current] if self.current == self.last-1 or self.current == -1: self.current = -1 else: @@ -691,7 +698,7 @@ def __len__(self): return self.last def __repr__(self): - sitems = [s.__repr__() for s in self.items] + sitems = [s.__repr__() for s in self] return "%s(%s)" % (self.__class__.__name__, ", ".join(sitems)) def __getitem__(self, idx): @@ -704,9 +711,9 @@ def __getitem__(self, idx): return UnboundTuple(*[self[i] for i in range(start, stop, step)]) try: if idx >= self.last-1: - return self.items[self.last-1] + return super().__getitem__(self.last-1) else: - return self.items[idx] + return super().__getitem__(idx) except TypeError: # Slice, ... return UnboundTuple(self[i] for i in idx) @@ -744,27 +751,28 @@ class UnboundedMultiTuple(UnboundTuple): 3 """ - def __init__(self, *items): - super().__init__(*items) - self.current = -1 + def __new__(cls, *items, **kwargs): + obj = super().__new__(cls, *items, **kwargs) + obj.current = -1 + return obj @property def curitem(self): - return self.items[self.tip] + return self[self.current] @property def nextitem(self): - return self.items[min(self.tip + 1, max(len(self.items) - 1, 0))] + return self[min(self.current + 1, max(self.last - 1, 0))] def index(self, item): - return self.items.index(item) + return self.index(item) def iter(self): self.current = min(self.current + 1, self.last - 1) - self.items[self.current].current = 0 + self[self.current].current = 0 return def next(self): - if self.items[self.current].current == -1: + if self[self.current].current == -1: raise StopIteration - return self.items[self.current].next() + return self[self.current].next() diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 0d38063c33..5bb9424b86 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -109,7 +109,6 @@ def test_tile_insteadof_collapse(self, par_tile): 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' - # Only the AFFINE Iterations are tiled strtile = ','.join([str(i) for i in stile]) assert trees[3][1].pragmas[0].value ==\ 'acc parallel loop tile(%s) present(src,src_coords,u)' % strtile