From 4a225330f8f43386d9865a8a9e280ab6a1e43e3c Mon Sep 17 00:00:00 2001 From: Li-Wen Chang <120213201+liwenchangbdbz@users.noreply.github.com> Date: Mon, 25 Sep 2023 23:19:18 -0700 Subject: [PATCH] [Release] Official Release ByteIR 1.4.0 (#71) * [Sync] internal a6ef5f00...22d6dee6 * [AIT] Disabled hw info log, added error handling * [CAT] Adjusted layout support * [compiler/doc] Added codegen doc * [frontend/torch] Added demo code, added more fx pattern for llm, fixed einsum, updated to 23b72244b1e1eaa0511cece8535b32810c1d0d7a * [GPU] Added reduction codegen for PTX * [Mhlo] Fixed transpose movedown, Added canonicalizer for gather with iota * [Release] released 1.4.0 package * [Runtime] Supported non-splat value in FillOp, added dropout support for flashV2 * [Util] Fixed bugs --- compiler/doc/codegen.md | 245 +++++ compiler/include/byteir/Analysis/UseRange.h | 15 + .../byteir/Conversion/FuncToByre/FuncToByre.h | 6 + compiler/include/byteir/Conversion/Passes.td | 26 + .../byteir/Conversion/ToLinalg/ToLinalg.h | 3 +- .../include/byteir/Conversion/ToPTX/ToPTX.h | 3 +- .../include/byteir/Dialect/CMakeLists.txt | 2 + .../include/byteir/Dialect/Cat/IR/CatOps.td | 11 + .../include/byteir/Dialect/GPU/CMakeLists.txt | 3 + compiler/include/byteir/Dialect/GPU/Passes.h | 37 + compiler/include/byteir/Dialect/GPU/Passes.td | 36 + .../Dialect/GPU/Transforms/Transforms.h | 32 + .../include/byteir/Dialect/Linalg/Passes.td | 10 + .../TransformOps/LinalgExtTransformOps.td | 39 + .../Linalg/Transforms/LinalgCollapseLoops.h | 7 +- .../byteir/Dialect/Tensor/CMakeLists.txt | 3 + .../include/byteir/Dialect/Tensor/Passes.h | 31 + .../include/byteir/Dialect/Tensor/Passes.td | 35 + .../Transforms/TensorPadSpecialization.h | 30 + .../byteir/Dialect/Transform/Passes.td | 36 + .../Transform/Transforms/TransformInsertion.h | 10 + .../Dialect/mhlo/Transforms/CanonicalizeExt.h | 3 + .../mhlo/Transforms/ConvertOpToCustomCall.h | 4 +- .../mhlo/Transforms/GenericFusionCommon.h | 4 + 
.../byteir/Dialect/mhlo/Transforms/HloFuser.h | 6 + .../byteir/Dialect/mhlo/Util/CustomCallUtil.h | 8 + .../byteir/Pipelines/GPU/MappingForall.h | 53 + .../byteir/Pipelines/GPU/ReductionCodegen.h | 153 +++ .../byteir/Pipelines/InitAllPipelines.h | 4 + .../byteir/Transforms/MemoryPlanning.h | 7 +- compiler/include/byteir/Transforms/Passes.td | 2 +- compiler/lib/Analysis/UseRange.cpp | 7 +- compiler/lib/CAPI/CMakeLists.txt | 1 + compiler/lib/CAPI/Passes.cpp | 4 + .../lib/Conversion/FuncToByre/FuncToByre.cpp | 78 ++ .../lib/Conversion/HloToCat/FuseHloToCat.cpp | 45 +- .../HloToCat/FuseHloToCatPattern.td | 30 +- .../Conversion/MemrefToByre/MemrefToByre.cpp | 25 +- compiler/lib/Conversion/ToByre/ToByre.cpp | 15 +- .../ToLinalg/MemrefCopyToLinalg.cpp | 140 +-- .../lib/Conversion/ToPTX/CollectGPUKernel.cpp | 33 +- compiler/lib/Dialect/CMakeLists.txt | 1 + compiler/lib/Dialect/Cat/IR/CatDialect.cpp | 6 + compiler/lib/Dialect/GPU/CMakeLists.txt | 1 + .../lib/Dialect/GPU/Transforms/CMakeLists.txt | 19 + .../Transforms/ShmAllocaToWorkgroupArg.cpp | 86 ++ .../TransformOps/LinalgExtTransformOps.cpp | 132 +++ .../Linalg/Transforms/FuseElementwise.cpp | 25 +- .../Linalg/Transforms/LinalgCollapseLoops.cpp | 45 +- .../Dialect/Tensor/Transforms/CMakeLists.txt | 3 + .../Tensor/Transforms/CanonicalizeExt.cpp | 42 + .../Dialect/Tensor/Transforms/PassDetail.h | 40 + .../Transforms/TensorPadSpecialization.cpp | 242 +++++ .../Transforms/TransformInsertion.cpp | 101 ++ .../mhlo/Transforms/CanonicalizeExt.cpp | 55 + .../lib/Dialect/mhlo/Transforms/CatFusion.cpp | 16 +- .../mhlo/Transforms/ConvertOpToCustomCall.cpp | 91 +- .../Dialect/mhlo/Transforms/GenericFusion.cpp | 104 +- .../mhlo/Transforms/HloAggressiveFusion.cpp | 5 +- .../Dialect/mhlo/Transforms/HloMoveDown.cpp | 54 +- compiler/lib/Pipelines/BufferizeOpt.cpp | 2 + compiler/lib/Pipelines/ByreOpt.cpp | 5 +- compiler/lib/Pipelines/GPU/CMakeLists.txt | 4 + compiler/lib/Pipelines/GPU/GPUOpt.cpp | 39 +- 
compiler/lib/Pipelines/GPU/MappingForall.cpp | 148 +++ compiler/lib/Pipelines/GPU/NVVMCodegen.cpp | 4 + .../lib/Pipelines/GPU/ReductionCodegen.cpp | 942 ++++++++++++++++++ compiler/lib/Pipelines/HloOpt.cpp | 3 + compiler/lib/Pipelines/LinalgMemrefOpt.cpp | 4 +- compiler/lib/Pipelines/LinalgTensorOpt.cpp | 138 ++- compiler/lib/Transforms/Bufferize.cpp | 238 ++++- compiler/lib/Transforms/MemoryPlanning.cpp | 47 +- compiler/lib/Utils/Utils.cpp | 11 +- compiler/numerical/hlo/canonicalize_ext.mlir | 37 + compiler/numerical/hlo/hlo_move_down.mlir | 22 + compiler/python/ByteIRModules.cpp | 2 + compiler/python/byteir/compile.py | 20 +- .../byteir/dialects/cat/ir_processor.py | 30 +- compiler/python/byteir/tools/compiler.py | 4 +- .../test/Conversion/HloToCat/fused_ops.mlir | 66 +- .../ToByre/convertMemRefToByre.mlir | 22 +- .../transform-op-fold-unit-extent-dims.mlir | 6 +- .../transforms/ConvertOpToCustomCall.mlir | 19 + .../Dialect/Mhlo/transforms/hloMoveDown.mlir | 26 +- .../test/Dialect/Tensor/canonicalizeExt.mlir | 11 + compiler/test/Transforms/canonicalizeExt.mlir | 37 + compiler/test/Transforms/memoryPlanning.mlir | 12 +- compiler/tools/byteir-opt/CMakeLists.txt | 1 + compiler/tools/byteir-opt/byteir-opt.cpp | 4 + external/patches/AITemplate/logging.patch | 17 + .../src/Conversion/OFRewriteToCustomCall.cpp | 1 + .../test/of_rewrite_to_custom_call.mlir | 2 +- .../torch-frontend/examples/demo/README.md | 16 + .../torch-frontend/examples/demo/backend.py | 195 ++++ .../examples/demo/byteir_fusible_pattern.py | 194 ++++ .../examples/demo/compile_utils.py | 92 ++ .../torch-frontend/examples/demo/config.py | 35 + .../examples/demo/fx_match_utils.py | 40 + .../torch-frontend/examples/demo/main.py | 220 ++++ .../examples/demo/partitioners.py | 940 +++++++++++++++++ .../third_party/patches/einsum.patch | 633 +++++++----- .../Conversion/ConvertTorchToCustomCall.cpp | 4 + .../python/test/test_attn_rewrite.py | 18 + .../python/test/test_fx_utils.py | 18 + 
.../python/torch_frontend/__init__.py | 3 +- .../python/torch_frontend/fx_utils.py | 86 ++ .../include/brt/core/framework/op_accessor.h | 3 + .../cuda/providers/default/ait/ait.cc | 3 +- .../cuda/providers/default/codegen/ptx.cc | 60 +- .../default/flash_attn/flash_attn_bwd.cc | 9 +- .../default/flash_attn/flash_attn_fwd.cc | 19 +- .../default/flash_attn/kernels/flash_api.cu | 11 +- .../default/flash_attn/kernels/flash_api.h | 4 +- .../flash_attn/kernels/flash_bwd_kernel.h | 28 +- .../flash_attn/kernels/flash_fwd_kernel.h | 15 +- .../providers/default/tensor_generate/fill.cc | 45 +- runtime/lib/core/framework/op_accessor.cc | 37 + .../providers/default/kernel/fill_test.cc | 10 +- .../default/kernel/flash_attn_fwd_test.cc | 14 +- .../test/include/brt/test/common/cuda/util.h | 13 + runtime/test/test_files/fill_cuda.mlir | 4 +- runtime/test/test_files/flash_attn_fwd.mlir | 4 +- tests/numerical_test/execute.py | 5 +- tests/numerical_test/main.py | 14 +- .../mlir_tests/ops/bmm_rrr_permute_f16.mlir | 6 + .../mlir_tests/ops/concat2.mlir | 6 + .../torch_dynamo_e2e_testing/backend.py | 18 +- 127 files changed, 6472 insertions(+), 584 deletions(-) create mode 100644 compiler/doc/codegen.md create mode 100644 compiler/include/byteir/Dialect/GPU/CMakeLists.txt create mode 100644 compiler/include/byteir/Dialect/GPU/Passes.h create mode 100644 compiler/include/byteir/Dialect/GPU/Passes.td create mode 100644 compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h create mode 100644 compiler/include/byteir/Dialect/Tensor/CMakeLists.txt create mode 100644 compiler/include/byteir/Dialect/Tensor/Passes.h create mode 100644 compiler/include/byteir/Dialect/Tensor/Passes.td create mode 100644 compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h create mode 100644 compiler/include/byteir/Pipelines/GPU/MappingForall.h create mode 100644 compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h create mode 100644 compiler/lib/Dialect/GPU/CMakeLists.txt create mode 
100644 compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt create mode 100644 compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp create mode 100644 compiler/lib/Dialect/Tensor/Transforms/PassDetail.h create mode 100644 compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp create mode 100644 compiler/lib/Pipelines/GPU/MappingForall.cpp create mode 100644 compiler/lib/Pipelines/GPU/ReductionCodegen.cpp create mode 100644 external/patches/AITemplate/logging.patch create mode 100644 frontends/torch-frontend/examples/demo/README.md create mode 100644 frontends/torch-frontend/examples/demo/backend.py create mode 100644 frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py create mode 100644 frontends/torch-frontend/examples/demo/compile_utils.py create mode 100644 frontends/torch-frontend/examples/demo/config.py create mode 100644 frontends/torch-frontend/examples/demo/fx_match_utils.py create mode 100644 frontends/torch-frontend/examples/demo/main.py create mode 100644 frontends/torch-frontend/examples/demo/partitioners.py create mode 100644 frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py create mode 100644 tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir create mode 100644 tests/numerical_test/mlir_tests/ops/concat2.mlir diff --git a/compiler/doc/codegen.md b/compiler/doc/codegen.md new file mode 100644 index 000000000..9299a6900 --- /dev/null +++ b/compiler/doc/codegen.md @@ -0,0 +1,245 @@ +# Codegen pipeline + +## hlo-opt + +This pass pipeline is mainly used for clustering fusion groups on the mhlo dialect. Each fusion group is expected to be fused into a single kernel in the later codegen pipeline and to be outlined as an independent kernel function. + +- `ReductionFusionPass` reduction fusion in producer direction + +- `ElementFusionPass` elementwise/broadcast/collapse_shape/expand_shape/etc. 
producer-consumer bi-directional fusion + +- `FusionOutliningPass` fusion group outlining + +## linalg-tensor-opt + +### reduction codegen transformations + +``` + func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.convert %arg0 : (tensor<8192x50257xf16>) -> tensor<8192x50257xf32> + %2 = mhlo.reduce(%1 init: %0) across dimensions = [0] : (tensor<8192x50257xf32>, tensor) -> tensor<50257xf32> + reducer(%arg1: tensor, %arg2: tensor) { + %3 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %3 : tensor + } + return %2 : tensor<50257xf32> + } +``` + +This pass pipeline first convert outlined mhlo fusion group into linalg dialect and try to fuse linalg op with its producer/consumer. + +- `createLinalgElementwiseFusionExtPass` linalg fusion pass with our extension, see [linalg pass](linalg.md) for more details + +``` +func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<50257xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<50257xf32>) -> tensor<50257xf32> + %2 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<8192x50257xf16>) outs(%1 : tensor<50257xf32>) { + ^bb0(%in: f16, %out: f32): + %3 = arith.extf %in : f16 to f32 + %4 = arith.addf %out, %3 : f32 + linalg.yield %4 : f32 + } -> tensor<50257xf32> + return %2 : tensor<50257xf32> +} +``` + +[optional] Split grid-level reduction on `reduction` dimensions + +``` +func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<50257xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<50257xf32>) -> tensor<50257xf32> + %expanded = tensor.expand_shape 
%arg0 [[0, 1], [2]] : tensor<8192x50257xf16> into tensor<32x256x50257xf16> + %2 = tensor.empty() : tensor<32x50257xf32> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<32x50257xf32>) -> tensor<32x50257xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%expanded : tensor<32x256x50257xf16>) outs(%3 : tensor<32x50257xf32>) attrs = {__grid_reduction__} { + ^bb0(%in: f16, %out: f32): + %6 = arith.extf %in : f16 to f32 + %7 = arith.addf %out, %6 : f32 + linalg.yield %7 : f32 + } -> tensor<32x50257xf32> + %5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["reduction", "parallel"]} ins(%4 : tensor<32x50257xf32>) outs(%1 : tensor<50257xf32>) attrs = {__grid_reduction__} { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<50257xf32> + return %5 : tensor<50257xf32> +} +``` + +- Tiling reduction on `parallel` dimension and mapping tiled reductions to thread blocks + +``` +func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<50257xf32> + %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] : tensor<8192x50257xf16> into tensor<32x256x50257xf16> + %1 = tensor.empty() : tensor<32x50257xf32> + %2 = scf.forall (%arg1, %arg2) in (32, 1571) shared_outs(%arg3 = %1) -> (tensor<32x50257xf32>) { + %4 = affine.min #map(%arg2) + %5 = affine.apply #map1(%arg2) + %extracted_slice = tensor.extract_slice %expanded[%arg1, 0, %5] [1, 256, %4] [1, 1, 1] : tensor<32x256x50257xf16> to tensor<256x?xf16> + %extracted_slice_0 = tensor.extract_slice %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<32x50257xf32> to tensor + %6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor) -> tensor + %7 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice : 
tensor<256x?xf16>) outs(%6 : tensor) { + ^bb0(%in: f16, %out: f32): + %8 = arith.extf %in : f16 to f32 + %9 = arith.addf %out, %8 : f32 + linalg.yield %9 : f32 + } -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %7 into %arg3[%arg1, %5] [1, %4] [1, 1] : tensor into tensor<32x50257xf32> + } + } {mapping = [#gpu.block, #gpu.block]} + %3 = scf.forall (%arg1) in (1571) shared_outs(%arg2 = %0) -> (tensor<50257xf32>) { + // ... + } {mapping = [#gpu.block]} + return %3 : tensor<50257xf32> +} +``` + +- Block-level reduction codegen + +``` +%2 = scf.forall (%arg1, %arg2) in (32, 1571) shared_outs(%arg3 = %1) -> (tensor<32x50257xf32>) { + %4 = affine.min #map(%arg2) + %5 = affine.apply #map1(%arg2) + %extracted_slice = tensor.extract_slice %expanded[%arg1, 0, %5] [1, 256, %4] [1, 1, 1] : tensor<32x256x50257xf16> to tensor<256x?xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16x32xf32> + %8 = scf.forall (%arg4, %arg5) in (16, 32) shared_outs(%arg6 = %7) -> (tensor<16x32xf32>) { + %17 = affine.min #map2(%arg4) + %18 = affine.min #map3(%arg4) + %19 = affine.apply #map4(%18, %17) + %20 = affine.min #map5(%arg5, %arg2) + %21 = affine.min #map6(%arg5, %arg2) + %22 = affine.apply #map4(%21, %20) + %23 = affine.apply #map7(%21, %20) + %extracted_slice_6 = tensor.extract_slice %extracted_slice[%17, %20] [%19, %22] [1, 1] : tensor<256x?xf16> to tensor + %padded = tensor.pad %extracted_slice_6 low[0, 0] high[0, %23] { + ^bb0(%arg7: index, %arg8: index): + tensor.yield %cst : f16 + } : tensor to tensor<16x1xf16> + %extracted_slice_7 = tensor.extract_slice %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor<16x32xf32> to tensor + %collapsed = tensor.collapse_shape %padded [[0, 1]] : tensor<16x1xf16> into tensor<16xf16> + %24 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_7 : tensor) -> tensor + %25 = linalg.generic {indexing_maps = [#map8, 
#map9], iterator_types = ["reduction"]} ins(%collapsed : tensor<16xf16>) outs(%24 : tensor) { + ^bb0(%in: f16, %out: f32): + %26 = arith.extf %in : f16 to f32 + %27 = arith.addf %out, %26 : f32 + linalg.yield %27 : f32 + } -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %25 into %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor into tensor<16x32xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %expanded_1 = tensor.expand_shape %8 [[0, 1], [2]] : tensor<16x32xf32> into tensor<8x2x32xf32> + %9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8x32xf32> + %10 = scf.forall (%arg4, %arg5) in (8, 32) shared_outs(%arg6 = %9) -> (tensor<8x32xf32>) { + // ... + } {mapping = [#gpu.thread, #gpu.thread]} + %expanded_2 = tensor.expand_shape %10 [[0, 1], [2]] : tensor<8x32xf32> into tensor<4x2x32xf32> + %11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4x32xf32> + %12 = scf.forall (%arg4, %arg5) in (4, 32) shared_outs(%arg6 = %11) -> (tensor<4x32xf32>) { + // ... + } {mapping = [#gpu.thread, #gpu.thread]} + %expanded_3 = tensor.expand_shape %12 [[0, 1], [2]] : tensor<4x32xf32> into tensor<2x2x32xf32> + %13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2x32xf32> + %14 = scf.forall (%arg4, %arg5) in (2, 32) shared_outs(%arg6 = %13) -> (tensor<2x32xf32>) { + // ... + } {mapping = [#gpu.thread, #gpu.thread]} + %15 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %6) -> (tensor<32xf32>) { + // ... + } {mapping = [#gpu.thread]} + %extracted_slice_4 = tensor.extract_slice %15[0] [%4] [1] : tensor<32xf32> to tensor + %extracted_slice_5 = tensor.extract_slice %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<32x50257xf32> to tensor + %16 = scf.forall (%arg4) in (512) shared_outs(%arg5 = %extracted_slice_5) -> (tensor) { + // ... 
+ } {mapping = [#gpu.linear]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg3[%arg1, %5] [1, %4] [1, 1] : tensor into tensor<32x50257xf32> + } +} {mapping = [#gpu.block, #gpu.block]} +``` + +- Detensorize scalar linalg ops to arith ops and specialize `tensor.pad` + +``` +%2 = scf.forall (%arg1, %arg2) in (32, 1571) shared_outs(%arg3 = %1) -> (tensor<32x50257xf32>) { + %4 = affine.min #map(%arg2) + %5 = affine.apply #map1(%arg2) + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16x32xf32> + %8 = scf.forall (%arg4, %arg5) in (16, 32) shared_outs(%arg6 = %7) -> (tensor<16x32xf32>) { + %17 = affine.min #map2(%arg5, %arg2) + %18 = affine.min #map3(%arg5, %arg2) + %19 = affine.apply #map4(%18, %17) + %20 = arith.cmpi ugt, %19, %c0 : index + %21 = scf.if %20 -> (f16) { + %84 = affine.apply #map5(%arg4) + %85 = affine.apply #map6(%arg2)[%17] + %extracted = tensor.extract %expanded[%arg1, %84, %85] : tensor<32x256x50257xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + // ... + %78 = arith.extf %77 : f16 to f32 + %79 = arith.addf %75, %78 : f32 + %80 = arith.cmpi ugt, %19, %c0 : index + %81 = scf.if %80 -> (f16) { + %84 = affine.apply #map21(%arg4) + %85 = affine.apply #map6(%arg2)[%17] + %extracted = tensor.extract %expanded[%arg1, %84, %85] : tensor<32x256x50257xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %82 = arith.extf %81 : f16 to f32 + %83 = arith.addf %79, %82 : f32 + %extracted_slice_5 = tensor.extract_slice %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor<16x32xf32> to tensor + %inserted = tensor.insert %83 into %extracted_slice_5[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor into tensor<16x32xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + + // ... 
+ %extracted_slice = tensor.extract_slice %15[0] [%4] [1] : tensor<32xf32> to tensor + %extracted_slice_4 = tensor.extract_slice %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<32x50257xf32> to tensor + %16 = scf.forall (%arg4) in (512) shared_outs(%arg5 = %extracted_slice_4) -> (tensor) { + %17 = affine.min #map22(%arg4)[%4] + %18 = affine.max #map23(%17) + %19 = affine.apply #map24(%arg4)[%4] + %extracted_slice_5 = tensor.extract_slice %extracted_slice[%19] [%18] [1] : tensor to tensor + %extracted_slice_6 = tensor.extract_slice %arg5[%19] [%18] [1] : tensor to tensor + %20 = linalg.copy {__byteir_gpu_tile_block_reduction_10} ins(%extracted_slice_5 : tensor) outs(%extracted_slice_6 : tensor) -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg5[%19] [%18] [1] : tensor into tensor + } + } {mapping = [#gpu.linear]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg3[%arg1, %5] [1, %4] [1, 1] : tensor into tensor<32x50257xf32> + } +} {mapping = [#gpu.block, #gpu.block]} +``` + +- `structured.split_reduction` split reduction op along `reduction` dimension for increasing parallelism + +- `structured.tile_to_forall_op` tile reduction op along `parallel` dimensions to `forall` op and mapping to block/linear/thread + +- `structured.fuse_into_containing_op` fuse init and pad operands into `scf.forall` + +- `structured.annotate` attach any attribute to target ops, used to annotate reduction op and attach memory space to `alloc_tensor` + +- `structured.tile` tile reduction op along `reduction` dimension to sequential for loop + +- `structured.detensorize` used to inline the computation region of linalg ops whose operands have scalar tensor type + +- `LinalgCollapseLoopsPass` collapse consecutive `parallel` and `reduction` loops; this pass could work on both tensor and memref + +- `TensorPadSpecializationPass` specialize `tensor.extract` of pad op to conditional read diff --git a/compiler/include/byteir/Analysis/UseRange.h 
b/compiler/include/byteir/Analysis/UseRange.h index 07b5588ae..704afcf16 100644 --- a/compiler/include/byteir/Analysis/UseRange.h +++ b/compiler/include/byteir/Analysis/UseRange.h @@ -104,9 +104,24 @@ class UserangeAnalysis { using UsePosition = std::pair; using UsePositionList = std::vector; + using AllocsIterator = mlir::bufferization::BufferPlacementAllocs:: + AllocEntryList::const_iterator; + using AllocsIteratorRange = llvm::iterator_range; + UserangeAnalysis(Liveness *liveness) : liveness(liveness) {} UserangeAnalysis(mlir::Operation *op, Liveness *liveness, const mlir::bufferization::BufferPlacementAllocs &allocs, + const mlir::BufferViewFlowAnalysis &aliases) + : UserangeAnalysis(op, liveness, make_range(allocs.begin(), allocs.end()), + aliases) {} + UserangeAnalysis( + mlir::Operation *op, Liveness *liveness, + const mlir::bufferization::BufferPlacementAllocs::AllocEntryList &allocs, + const mlir::BufferViewFlowAnalysis &aliases) + : UserangeAnalysis(op, liveness, make_range(allocs.begin(), allocs.end()), + aliases) {} + UserangeAnalysis(mlir::Operation *op, Liveness *liveness, + AllocsIteratorRange &&allocs, const mlir::BufferViewFlowAnalysis &aliases); virtual ~UserangeAnalysis() {} diff --git a/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h b/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h index 230ea0e1a..92e29fbac 100644 --- a/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h +++ b/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h @@ -27,9 +27,15 @@ class ModuleOp; void populateFuncToByreTensorPattern(RewritePatternSet &patterns, bool appendArgTypes); +void populateGPULaunchFuncToByrePattern(RewritePatternSet &patterns, + bool useBarePtrCallConv); + std::unique_ptr> createConvertFuncToByreTensorPass(bool appendArgTypes = false); +std::unique_ptr +createConvertGPULaunchFuncToByrePass(bool useBarePtrCallConv = false); + } // namespace mlir #endif // BYTEIR_CONVERSION_FUNCTOBYRE_FUNCTOBYRE_H diff --git 
a/compiler/include/byteir/Conversion/Passes.td b/compiler/include/byteir/Conversion/Passes.td index 79c6acbb8..e5ee1c148 100644 --- a/compiler/include/byteir/Conversion/Passes.td +++ b/compiler/include/byteir/Conversion/Passes.td @@ -253,6 +253,9 @@ def CollectGPUKernel : Pass<"collect-gpu-kernel", "ModuleOp"> { Option<"moduleName", "module-name", "std::string", /*default=*/"\"unified\"", "Optional name for GPUModule to put all gpu kernels">, + Option<"removeHost", "remove-host", "bool", + /*default=*/"true", + "Whether to remove host part">, ]; } @@ -349,6 +352,26 @@ def ConvertFuncToByreTensor : Pass<"func-to-byre-tensor", "ModuleOp"> { } +//===----------------------------------------------------------------------===// +// GPULaunchFuncToByre +//===----------------------------------------------------------------------===// + +def ConvertGPULaunchFuncToByre : Pass<"gpu-launch-func-to-byre"> { + let summary = "Convert gpu.launch_func op to byre compute op."; + let constructor = "mlir::createConvertGPULaunchFuncToByrePass()"; + let dependentDialects = [ + "mlir::byre::ByreDialect", + "mlir::gpu::GPUDialect" + ]; + + let options = [ + Option<"useBarePtrCallConv", "use-bare-ptr-memref-call-conv", "bool", + /*default=*/"false", + "Replace memref arguments in GPU functions with bare pointers." 
+ "All memrefs must have static shape">, + ]; +} + //===----------------------------------------------------------------------===// // MemrefToByre //===----------------------------------------------------------------------===// @@ -398,6 +421,9 @@ def MemrefCopyToLinalgPass : Option<"attachAttr", "attach-attr", "std::string", /*default=*/"", "An optional unit attribute attaching on target functions: ">, + Option<"outlining", "outlining", "bool", + /*default=*/"true", + "Whether to outline the copy op to a new function">, ]; } diff --git a/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h b/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h index eaed2dbcc..8e64ce9d3 100644 --- a/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h +++ b/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h @@ -55,7 +55,8 @@ std::unique_ptr> createLinalgExtToLinalgPass(); std::unique_ptr> createMemrefCopyToLinalgPass(std::string anchorTag = "", - std::string attachAttr = ""); + std::string attachAttr = "", + bool outlining = true); } // namespace mlir diff --git a/compiler/include/byteir/Conversion/ToPTX/ToPTX.h b/compiler/include/byteir/Conversion/ToPTX/ToPTX.h index de932d857..c5185df6c 100644 --- a/compiler/include/byteir/Conversion/ToPTX/ToPTX.h +++ b/compiler/include/byteir/Conversion/ToPTX/ToPTX.h @@ -33,7 +33,8 @@ createGenPTXConfigPass(bool useBarePtrCallConv = false); // TODO move to general GPU std::unique_ptr> -createCollectGPUKernelPass(const std::string &name = "unified"); +createCollectGPUKernelPass(const std::string &name = "unified", + bool removeHost = true); } // namespace mlir diff --git a/compiler/include/byteir/Dialect/CMakeLists.txt b/compiler/include/byteir/Dialect/CMakeLists.txt index c7113baae..3e8627a7e 100644 --- a/compiler/include/byteir/Dialect/CMakeLists.txt +++ b/compiler/include/byteir/Dialect/CMakeLists.txt @@ -3,11 +3,13 @@ add_subdirectory(Affine) add_subdirectory(Byre) add_subdirectory(Cat) add_subdirectory(Ccl) +add_subdirectory(GPU) 
add_subdirectory(Lace) add_subdirectory(Linalg) add_subdirectory(MemRef) add_subdirectory(mhlo) add_subdirectory(SCF) add_subdirectory(Shape) +add_subdirectory(Tensor) add_subdirectory(Transform) add_subdirectory(Vector) diff --git a/compiler/include/byteir/Dialect/Cat/IR/CatOps.td b/compiler/include/byteir/Dialect/Cat/IR/CatOps.td index be953e7a6..c8ef07b19 100644 --- a/compiler/include/byteir/Dialect/Cat/IR/CatOps.td +++ b/compiler/include/byteir/Dialect/Cat/IR/CatOps.td @@ -281,6 +281,17 @@ def Cat_GemmRCRPermuteOp : Cat_Op<"gemm_rcr_permute", [Cat_CatOpInterface, Pure] let hasVerifier = 1; } +def Cat_GemmRRRPermuteOp : Cat_Op<"gemm_rrr_permute", [Cat_CatOpInterface, Pure]> { + let summary = "gemm_rrr + permute0213 operator, output layout is [m / t1, t1, t2, n / t2]"; + let arguments = (ins AnyTensor : $lhs, + AnyTensor : $rhs, + I64Attr : $t1, + I64Attr : $t2); + let results = (outs AnyTensor : $output); + + let hasVerifier = 1; +} + def Cat_LayerNormOp : Cat_Op<"layernorm", [Cat_CatOpInterface, Pure]> { let summary = "layernorm operator"; let arguments = (ins AnyTensor : $input, diff --git a/compiler/include/byteir/Dialect/GPU/CMakeLists.txt b/compiler/include/byteir/Dialect/GPU/CMakeLists.txt new file mode 100644 index 000000000..53b17ff2e --- /dev/null +++ b/compiler/include/byteir/Dialect/GPU/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRGPU) +add_public_tablegen_target(ByteIRGPUPassIncGen) diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h new file mode 100644 index 000000000..6a86b80b6 --- /dev/null +++ b/compiler/include/byteir/Dialect/GPU/Passes.h @@ -0,0 +1,37 @@ +//===- Passes.h ----------------------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_GPU_PASSES_H +#define BYTEIR_DIALECT_GPU_PASSES_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace gpu { +class GPUFuncOp; +} // namespace gpu + +#define GEN_PASS_DECL +#include "byteir/Dialect/GPU/Passes.h.inc" + +/// Generate the code for registering transforms passes. +#define GEN_PASS_REGISTRATION +#include "byteir/Dialect/GPU/Passes.h.inc" + +} // namespace mlir + +#endif // BYTEIR_DIALECT_GPU_PASSES_H diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td new file mode 100644 index 000000000..862df14a2 --- /dev/null +++ b/compiler/include/byteir/Dialect/GPU/Passes.td @@ -0,0 +1,36 @@ +//===- Passes.td - Transforms pass definition file -------*--- tablegen -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + + +#ifndef BYTEIR_DIALECT_GPU_PASSES +#define BYTEIR_DIALECT_GPU_PASSES + +include "mlir/Pass/PassBase.td" + +//===----------------------------------------------------------------------===// +// ShmAllocaToWorkgroupArg +//===----------------------------------------------------------------------===// + +def ShmAllocaToWorkgroupArg : Pass<"shm-alloca-to-workgroup-arg", "gpu::GPUModuleOp"> { + let summary = "Hoist shared memory alloca in gpu kernel to workgroup argument"; + let dependentDialects = [ + "gpu::GPUDialect", + "memref::MemRefDialect" + ]; +} + +#endif // BYTEIR_DIALECT_GPU_PASSES diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h b/compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h new file mode 100644 index 000000000..042d045dd --- /dev/null +++ b/compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h @@ -0,0 +1,32 @@ +//===- Transforms.h -------------------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H +#define BYTEIR_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" + +namespace mlir { +namespace gpu { + +// hoist shared memory alloca in gpu kernel to workgroup arg +void hoistShmAllocaToWorkgroup(gpu::GPUFuncOp func); + +} // namespace gpu +} // namespace mlir + +#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H \ No newline at end of file diff --git a/compiler/include/byteir/Dialect/Linalg/Passes.td b/compiler/include/byteir/Dialect/Linalg/Passes.td index 81c411b53..5393ef410 100644 --- a/compiler/include/byteir/Dialect/Linalg/Passes.td +++ b/compiler/include/byteir/Dialect/Linalg/Passes.td @@ -166,6 +166,16 @@ def LinalgCollapseLoops : Pass<"linalg-collapse-loops", "func::FuncOp"> { "tensor::TensorDialect", "memref::MemRefDialect" ]; + + let options = [ + Option<"iteratorType", "iterator-type", "mlir::utils::IteratorType", + /*default=*/"mlir::utils::IteratorType::parallel", "iterator type", + [{llvm::cl::values( + clEnumValN(mlir::utils::IteratorType::parallel, "parallel", + "parallel iterator type"), + clEnumValN(mlir::utils::IteratorType::reduction, "reduction", + "reduction iterator type"))}]>, + ]; } //===----------------------------------------------------------------------===// diff --git a/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td b/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td index ac7d80126..dc918c661 100644 --- a/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td +++ b/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td @@ -72,6 +72,18 @@ def CollapseDimsOp : Op]> { + let description = [{ + Detensorize linalg ops. 
+ }]; + + let arguments = (ins PDL_Operation:$target); + + let assemblyFormat = "$target attr-dict"; +} + def FoldUnitExtentDimsOp : Op]> { @@ -245,4 +257,31 @@ def FuseOperandsOp : Op { + let description = [{ + insert_slice_to_copy_ext extension. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$transformed); + + let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) "; + + let builders = [ + OpBuilder<(ins "Value":$target)>, + ]; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::Operation *target, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + + #endif // BYTEIR_DIALECT_LINALG_TRANSFORMOPS_LINALG_EXT_TRANSFORMOPS \ No newline at end of file diff --git a/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h b/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h index 556a43c41..b5ef16af4 100644 --- a/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h +++ b/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h @@ -18,6 +18,7 @@ #ifndef BYTEIR_DIALECT_LINALG_TRANSFORMS_LINALGCOLLAPSELOOPS_H #define BYTEIR_DIALECT_LINALG_TRANSFORMS_LINALGCOLLAPSELOOPS_H +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Pass/Pass.h" #include @@ -26,7 +27,11 @@ namespace func { class FuncOp; } // namespace func -std::unique_ptr> createLinalgCollapseLoops(); +#define GEN_PASS_DECL_LINALGCOLLAPSELOOPS +#include "byteir/Dialect/Linalg/Passes.h.inc" + +std::unique_ptr> createLinalgCollapseLoops( + utils::IteratorType iteratorType = utils::IteratorType::parallel); } // namespace mlir diff --git a/compiler/include/byteir/Dialect/Tensor/CMakeLists.txt b/compiler/include/byteir/Dialect/Tensor/CMakeLists.txt new file mode 100644 index 000000000..a4f8266f9 
--- /dev/null +++ b/compiler/include/byteir/Dialect/Tensor/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTensor) +add_public_tablegen_target(ByteIRTensorPassIncGen) diff --git a/compiler/include/byteir/Dialect/Tensor/Passes.h b/compiler/include/byteir/Dialect/Tensor/Passes.h new file mode 100644 index 000000000..be0c09f87 --- /dev/null +++ b/compiler/include/byteir/Dialect/Tensor/Passes.h @@ -0,0 +1,31 @@ +//===- Passes.h ---------------------------------------------------- C++ --===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_TENSOR_PASSES_H +#define BYTEIR_DIALECT_TENSOR_PASSES_H + +#include "byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h" + +namespace mlir { + +/// Generate the code for registering transforms passes. 
+#define GEN_PASS_REGISTRATION +#include "byteir/Dialect/Tensor/Passes.h.inc" + +} // namespace mlir + +#endif // BYTEIR_DIALECT_TENSOR_PASSES_H diff --git a/compiler/include/byteir/Dialect/Tensor/Passes.td b/compiler/include/byteir/Dialect/Tensor/Passes.td new file mode 100644 index 000000000..cdfa73c4d --- /dev/null +++ b/compiler/include/byteir/Dialect/Tensor/Passes.td @@ -0,0 +1,35 @@ +//===- Passes.td - Transforms pass definition file -------*--- tablegen -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_TENSOR_PASSES +#define BYTEIR_DIALECT_TENSOR_PASSES + +include "mlir/Pass/PassBase.td" + +//===----------------------------------------------------------------------===// +// TensorPadSpecialization +//===----------------------------------------------------------------------===// + +def TensorPadSpecialization : Pass<"tensor-pad-specialization", ""> { + let summary = "Specialize tensor.pad op"; + let constructor = "mlir::createTensorPadSpecializationPass()"; + let dependentDialects = [ + "scf::SCFDialect", + ]; +} + +#endif // BYTEIR_DIALECT_TENSOR_PASSES diff --git a/compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h b/compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h new file mode 100644 index 000000000..72f38cd03 --- /dev/null +++ b/compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h @@ -0,0 +1,30 @@ +//===- TensorPadSpecialization.h ---------------------------------- C++ --===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_TENSOR_TRANSFORMS_TENSORPADSPECIALIZATION_H +#define BYTEIR_DIALECT_TENSOR_TRANSFORMS_TENSORPADSPECIALIZATION_H + +#include "mlir/Pass/Pass.h" +#include + +namespace mlir { + +std::unique_ptr createTensorPadSpecializationPass(); + +} // namespace mlir + +#endif // BYTEIR_DIALECT_TENSOR_TRANSFORMS_TENSORPADSPECIALIZATION_H diff --git a/compiler/include/byteir/Dialect/Transform/Passes.td b/compiler/include/byteir/Dialect/Transform/Passes.td index 6e82c4b3f..49a471a71 100644 --- a/compiler/include/byteir/Dialect/Transform/Passes.td +++ b/compiler/include/byteir/Dialect/Transform/Passes.td @@ -34,6 +34,24 @@ def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleO ]; } +//===----------------------------------------------------------------------===// +// DetensorizeTransformInsertion +//===----------------------------------------------------------------------===// + +def DetensorizeTransformInsertion : Pass<"insert-detensorize-transform", "ModuleOp"> { + let summary = "Insert detensorize transform IR to functions."; + let constructor = "mlir::createDetensorizeTransformInsertionPass()"; + let options = [ + Option<"funcAnchorAttr", "func-anchor", "std::string", + /*default=*/"", + "An optional Unit attribute anchoring on target functions.">, + Option<"matchPrefix", "match-prefix", "std::string", + /*default=*/"\"__byteir_detensorize\"", + "An optional match prefix attribute on target ops.">, + ]; +} + + //===----------------------------------------------------------------------===// // FuseExtTransformInsertion //===----------------------------------------------------------------------===// @@ -60,4 +78,22 @@ def FuseExtTransformInsertion : Pass<"insert-fuse-ext-transform", "ModuleOp"> { ]; } +//===----------------------------------------------------------------------===// +// RewriteInDPSTransformInsertion
+//===----------------------------------------------------------------------===// + +def RewriteInDPSTransformInsertion : Pass<"insert-rewrite-in-dps-transform", "ModuleOp"> { + let summary = "Insert rewrite in destination-passing-style transform IR to functions."; + let constructor = "mlir::createRewriteInDPSTransformInsertionPass()"; + let options = [ + Option<"funcAnchorAttr", "func-anchor", "std::string", + /*default=*/"", + "An optional Unit attribute anchoring on target functions.">, + Option<"matchPrefix", "match-prefix", "std::string", + /*default=*/"\"__byteir_rewrite_in_dps\"", + "An optional match prefix attribute on target ops.">, + ]; +} + + #endif // BYTEIR_DIALECT_TRANSFORM_PASSES diff --git a/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h b/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h index 931b6d0bc..5741663ac 100644 --- a/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h +++ b/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h @@ -37,12 +37,22 @@ struct TransformInsertionConfig { std::unique_ptr> createGenericTransformInsertionPass(const TransformInsertionConfig &config); +std::unique_ptr> +createDetensorizeTransformInsertionPass( + const std::string &funcAnchor = "", + const std::string &matchPrefix = "__byteir_detensorize"); + std::unique_ptr> createFuseExtTransformInsertionPass( const std::string &funcAnchor = "", const std::string &matchPrefix = "unknown", const std::string &tileSizeAttrName = "", const std::string &tileInterchangeAttrName = "", const bool keepIntermediates = false); + +std::unique_ptr> +createRewriteInDPSTransformInsertionPass( + const std::string &funcAnchor = "", + const std::string &matchPrefix = "__byteir_rewrite_in_dps"); } // namespace mlir #endif // BYTEIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINSERTION_H \ No newline at end of file diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h
b/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h index fa0a1b94c..78e3aa1bd 100644 --- a/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h +++ b/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h @@ -45,6 +45,7 @@ class ReshapeOp; class MulOp; class SliceOp; class ReverseOp; +class GatherOp; // Most of these will push back to upstream // So this file only includes patterns, not a pass. @@ -143,6 +144,8 @@ LogicalResult simplifyTransposeReshapeTranspose(mhlo::TransposeOp op, LogicalResult foldReverseWithConstant(mhlo::ReverseOp op, PatternRewriter &rewriter); +LogicalResult foldGatherWithInput(mhlo::GatherOp op, PatternRewriter &rewriter); + // populate canonicalizeExt patterns void populateCanonicalizeExtPatterns(RewritePatternSet &patterns, MLIRContext *context, diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h b/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h index e84497390..b36ebc426 100644 --- a/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h +++ b/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h @@ -1,4 +1,4 @@ -//===- ConvertRngToCustomCall.h -------------------------------*--- C++ -*-===// +//===- ConvertOpToCustomCall.h --------------------------------*--- C++ -*-===// // // Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. 
// Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,6 +27,8 @@ class ModuleOp; void populateRngPatternToCustomCall(RewritePatternSet &patterns); +void populateFlashFwdRewritePattern(RewritePatternSet &patterns); + std::unique_ptr> createConvertOpToCustomCallPass(llvm::StringRef anchor = ""); diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h b/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h index be1a10a93..df10ecfdc 100644 --- a/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h +++ b/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h @@ -41,6 +41,7 @@ struct GenericFuserConfig { std::function fuse_trigger; std::function fuse_with; std::function valid_single_op; + std::function valid_fusion_pattern; }; //===----------------------------------------------------------------------===// @@ -115,6 +116,9 @@ class GenericFusionPass : public GenericFusionBase { for (auto it = plan.rbegin(); it != plan.rend(); ++it) { auto &pattern = *it; + if (!fuse_config.valid_fusion_pattern(pattern)) + continue; + if (pattern.size() > 1) { applyMhloFusionPattern(pattern, fuse_config.fuse_attr); } else if (this->clusterSingleOp.getValue()) { diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h b/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h index f49315bbe..6a0e10cb7 100644 --- a/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h +++ b/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h @@ -46,6 +46,10 @@ constexpr StringRef getByteIRMatmulEpilogueFusionAttrName() { return "__byteir_matmul_epilogue_fusion__"; } +constexpr StringRef getByteIRReductionFusionAttrName() { + return "__byteir_reduction_fusion__"; +} + constexpr StringRef getByteIRTrivialFusionAttrName() { return "__byteir_trivial_fusion__"; } @@ -102,6 +106,8 @@ std::unique_ptr> createTrivialFusionPass(); std::unique_ptr> createHloAggressiveFusionPass(); 
+std::unique_ptr> createReductionFusionPass(); + } // namespace mlir #endif // BYTEIR_DIALECT_MHLO_TRANSFORMS_HLOFUSER_H diff --git a/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h b/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h index af68cf38a..83a53e329 100644 --- a/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h +++ b/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h @@ -85,6 +85,14 @@ constexpr llvm::StringRef getRngUniformName() { return CUSTOM_CALL_NAME_PREFIX "rng_uniform"; } +constexpr llvm::StringRef getFlashAttnFwdName() { + return CUSTOM_CALL_NAME_PREFIX "flash_attn_fwd"; +} + +constexpr llvm::StringRef getFlashAttnBwdName() { + return CUSTOM_CALL_NAME_PREFIX "flash_attn_bwd"; +} + constexpr llvm::StringRef getDynamicPartitionName() { return TF_NAME_PREFIX "DynamicPartition"; } diff --git a/compiler/include/byteir/Pipelines/GPU/MappingForall.h b/compiler/include/byteir/Pipelines/GPU/MappingForall.h new file mode 100644 index 000000000..202cc1e40 --- /dev/null +++ b/compiler/include/byteir/Pipelines/GPU/MappingForall.h @@ -0,0 +1,53 @@ +//===- MappingForall.h ---------------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H +#define BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +struct GPUMappingForallOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__byteir_gpu_split_grid_reduction")}; + // TODO: option for grid/block dims hint. NOTE(review): the annotate-prefix default above equals the split-grid-reduction prefix; looks copied, confirm it is intended. +}; + +void createGPUMappingForallTransform(OpPassManager &pm, + const GPUMappingForallOptions &options); + +inline void registerGPUMappingForallPipelines() { + PassPipelineRegistration( + "insert-gpu-mapping-forall-transform", + "Insert transformation IR to mapping forall to corresponding blocks and " + "threads", + createGPUMappingForallTransform); +} + +} // namespace mlir + +#endif // BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H diff --git a/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h b/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h new file mode 100644 index 000000000..7aea80d51 --- /dev/null +++ b/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h @@ -0,0 +1,153 @@ +//===- ReductionCodegen.h -----------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_PIPELINES_GPU_REDUCTION_CODEGEN_H +#define BYTEIR_PIPELINES_GPU_REDUCTION_CODEGEN_H + +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Pass/PassRegistry.h" + +namespace mlir { +struct GPUSplitGridReductionOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__byteir_gpu_split_grid_reduction")}; + Option splitFactor{*this, "split-factor", + llvm::cl::desc("split factor"), + llvm::cl::init(32)}; +}; + +struct GPUTileGridReductionOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__byteir_gpu_tile_grid_reduction")}; + Option warpSize{*this, "warp-size", llvm::cl::desc("warp size"), + llvm::cl::init(32)}; + Option blockSize{*this, "block-size", llvm::cl::desc("block size"), + llvm::cl::init(256)}; + Option usingForall{*this, "using-forall", + llvm::cl::desc("using forall"), + llvm::cl::init(true)}; +}; + +struct GPUSplitBlockReductionOptions + : public 
PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__byteir_gpu_split_block_reduction")}; + Option splitFactor{*this, "split-factor", + llvm::cl::desc("split factor"), + llvm::cl::init(32)}; + Option warpSize{*this, "warp-size", llvm::cl::desc("warp size"), + llvm::cl::init(32)}; +}; + +struct GPUTileBlockReductionOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__byteir_gpu_tile_block_reduction")}; + Option warpSize{*this, "warp-size", llvm::cl::desc("warp size"), + llvm::cl::init(32)}; + Option blockSize{*this, "block-size", llvm::cl::desc("block size"), + llvm::cl::init(256)}; + Option usingForall{*this, "using-forall", + llvm::cl::desc("using forall"), + llvm::cl::init(true)}; +}; + +struct GPUTileThreadReductionOptions + : public PassPipelineOptions { + Option funcAnchor{ + *this, "func-anchor", + llvm::cl::desc( + "An optional Unit attribute anchoring on target functions."), + llvm::cl::init("")}; + Option annotatePrefix{ + *this, "annotate-prefix", + llvm::cl::desc("An optional annotate prefix attribute on target ops."), + llvm::cl::init("__byteir_gpu_tile_thread_reduction")}; +}; + +void createGPUSplitGridReductionTransform( + OpPassManager &pm, const GPUSplitGridReductionOptions &options); +void createGPUTileGridReductionTransform( + OpPassManager &pm, const GPUTileGridReductionOptions &options); +void createGPUSplitBlockReductionTransform( + OpPassManager &pm, const GPUSplitBlockReductionOptions &options); +void 
createGPUTileBlockReductionTransform( + OpPassManager &pm, const GPUTileBlockReductionOptions &options); +void createGPUTileThreadReductionTransform( + OpPassManager &pm, const GPUTileThreadReductionOptions &options); + +inline void registerGPUReductionCodegenPipelines() { + PassPipelineRegistration( + "insert-gpu-split-grid-reduction-transform", + "Insert transformation IR to split linalg reduction op", + createGPUSplitGridReductionTransform); + + PassPipelineRegistration( + "insert-gpu-tile-grid-reduction-transform", + "Insert transformation IR to tile linalg reduction op", + createGPUTileGridReductionTransform); + + PassPipelineRegistration( + "insert-gpu-split-block-reduction-transform", + "Insert transformation IR to split linalg reduction op", + createGPUSplitBlockReductionTransform); + + PassPipelineRegistration( + "insert-gpu-tile-block-reduction-transform", + "Insert transformation IR to tile linalg reduction op", + createGPUTileBlockReductionTransform); + + PassPipelineRegistration( + "insert-gpu-tile-thread-reduction-transform", + "Insert transformation IR to tile linalg reduction op", + createGPUTileThreadReductionTransform); +} + +} // namespace mlir + +#endif // BYTEIR_PIPELINES_GPU_REDUCTION_CODEGEN_H diff --git a/compiler/include/byteir/Pipelines/InitAllPipelines.h b/compiler/include/byteir/Pipelines/InitAllPipelines.h index 0cdf1e7ce..2a653f898 100644 --- a/compiler/include/byteir/Pipelines/InitAllPipelines.h +++ b/compiler/include/byteir/Pipelines/InitAllPipelines.h @@ -35,7 +35,9 @@ #include "byteir/Pipelines/GPU/ElementwiseCodegen.h" #include "byteir/Pipelines/GPU/GPUOpt.h" #include "byteir/Pipelines/GPU/LinalgMemrefGPU.h" +#include "byteir/Pipelines/GPU/MappingForall.h" #include "byteir/Pipelines/GPU/NVVMCodegen.h" +#include "byteir/Pipelines/GPU/ReductionCodegen.h" #include "byteir/Pipelines/Host/Codegen.h" #include "byteir/Pipelines/Host/HostOpt.h" @@ -65,6 +67,8 @@ inline void registerAllByteIRGPUPipelines() { 
registerLinalgMemrefGPUPipeline(); registerMatmulEpilogueGPUPipeline(); registerGPUElementwiseCodegenPipelines(); + registerGPUReductionCodegenPipelines(); + registerGPUMappingForallPipelines(); } inline void registerAllByteIRHostPipelines() { diff --git a/compiler/include/byteir/Transforms/MemoryPlanning.h b/compiler/include/byteir/Transforms/MemoryPlanning.h index 74a83b88c..c1ad2a181 100644 --- a/compiler/include/byteir/Transforms/MemoryPlanning.h +++ b/compiler/include/byteir/Transforms/MemoryPlanning.h @@ -23,18 +23,19 @@ #include namespace mlir { +class FunctionOpInterface; class Value; namespace func { class FuncOp; } // namespace func -std::unique_ptr> createMemoryPlanningPass(); +std::unique_ptr> createMemoryPlanningPass(); /// couldReuseBuffer is a user provided callback which receives a Value as /// parameter and returns whether the allocation corresponding to the Value can /// be reused -std::unique_ptr> -createMemoryPlanningPass(size_t alignment, +std::unique_ptr> +createMemoryPlanningPass(size_t alignment, bool alloca, size_t memSpace, std::function couldReuseAllocation); } // namespace mlir diff --git a/compiler/include/byteir/Transforms/Passes.td b/compiler/include/byteir/Transforms/Passes.td index 4c6a9cb1b..8ac2f3e7c 100644 --- a/compiler/include/byteir/Transforms/Passes.td +++ b/compiler/include/byteir/Transforms/Passes.td @@ -237,7 +237,7 @@ def LoopTag : Pass<"loop-tag", "func::FuncOp"> { //===----------------------------------------------------------------------===// // Memory planning //===----------------------------------------------------------------------===// -def MemoryPlanning: Pass<"memory-planning", "mlir::func::FuncOp"> { +def MemoryPlanning: InterfacePass<"memory-planning", "mlir::FunctionOpInterface"> { let summary = "Pass to perform static memory planning"; let constructor = "mlir::createMemoryPlanningPass()"; let dependentDialects = [ diff --git a/compiler/lib/Analysis/UseRange.cpp b/compiler/lib/Analysis/UseRange.cpp index 
f8a6e1076..88708c154 100644 --- a/compiler/lib/Analysis/UseRange.cpp +++ b/compiler/lib/Analysis/UseRange.cpp @@ -383,10 +383,9 @@ void UseInterval::mergeAndEraseContiguousIntervals( iter = interval.erase(std::next(iter), next); } -UserangeAnalysis::UserangeAnalysis( - Operation *op, byteir::Liveness *liveness, - const bufferization::BufferPlacementAllocs &allocs, - const BufferViewFlowAnalysis &aliases) +UserangeAnalysis::UserangeAnalysis(Operation *op, byteir::Liveness *liveness, + AllocsIteratorRange &&allocs, + const BufferViewFlowAnalysis &aliases) : liveness(liveness) { // Walk over all operations and map them to an ID. op->walk([&](Operation *operation) { diff --git a/compiler/lib/CAPI/CMakeLists.txt b/compiler/lib/CAPI/CMakeLists.txt index d8bc56463..1ae8a97f2 100644 --- a/compiler/lib/CAPI/CMakeLists.txt +++ b/compiler/lib/CAPI/CMakeLists.txt @@ -33,6 +33,7 @@ add_mlir_public_c_api_library(ByteIRCAPI # dialect specific passes ByteIRAffinePasses ByteIRByrePasses + ByteIRGPUPasses ByteIRLinalgPasses ByteIRMemRefPasses ByteIRMhloPasses diff --git a/compiler/lib/CAPI/Passes.cpp b/compiler/lib/CAPI/Passes.cpp index e686dab22..6707875c9 100644 --- a/compiler/lib/CAPI/Passes.cpp +++ b/compiler/lib/CAPI/Passes.cpp @@ -21,10 +21,12 @@ #include "byteir/Dialect/Ace/Passes.h" #include "byteir/Dialect/Affine/Passes.h" #include "byteir/Dialect/Byre/Passes.h" +#include "byteir/Dialect/GPU/Passes.h" #include "byteir/Dialect/Linalg/Passes.h" #include "byteir/Dialect/MemRef/Passes.h" #include "byteir/Dialect/SCF/Passes.h" #include "byteir/Dialect/Shape/Passes.h" +#include "byteir/Dialect/Tensor/Passes.h" #include "byteir/Dialect/Transform/Passes.h" #include "byteir/Dialect/mhlo/Passes.h" #include "byteir/Pipelines/InitAllPipelines.h" @@ -45,11 +47,13 @@ void byteirRegisterAllPasses() { registerByteIRAcePasses(); registerByteIRAffinePasses(); registerByteIRByrePasses(); + registerByteIRGPUPasses(); registerByteIRLinalgPasses(); registerByteIRMemRefPasses(); 
registerByteIRMhloPassesExt(); registerByteIRSCFPasses(); registerByteIRShapePasses(); + registerByteIRTensorPasses(); registerByteIRTransformPasses(); // pipelines diff --git a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp index 45e87ca16..bd1dce952 100644 --- a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp +++ b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp @@ -19,7 +19,9 @@ #include "byteir/Dialect/Byre/ByreDialect.h" #include "byteir/Dialect/Byre/Common.h" #include "byteir/Utils/Utils.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/DialectConversion.h" @@ -86,6 +88,53 @@ class ConvertCallOpToByreTensorPattern : public OpRewritePattern { bool appendArgTypes; }; +class ConvertGPULaunchFuncToByrePattern + : public OpRewritePattern { + +public: + ConvertGPULaunchFuncToByrePattern(MLIRContext *ctx, bool useBarePtrCallConv) + : OpRewritePattern(ctx), + useBarePtrCallConv(useBarePtrCallConv) {} + + LogicalResult matchAndRewrite(gpu::LaunchFuncOp launchOp, + PatternRewriter &rewriter) const override { + auto computeOp = rewriter.create( + launchOp->getLoc(), TypeRange(), "PTXOp", launchOp.getKernelOperands(), + /*memEffects*/ ArrayAttr()); + + computeOp->setAttr( + rewriter.getStringAttr("kernel_name"), + rewriter.getStringAttr(launchOp.getKernelName().getValue())); + + auto grid = launchOp.getGridSizeOperandValues(); + int64_t gx = cast(grid.x.getDefiningOp()).value(); + int64_t gy = cast(grid.y.getDefiningOp()).value(); + int64_t gz = cast(grid.z.getDefiningOp()).value(); + computeOp->setAttr("GridSize.x", rewriter.getI32IntegerAttr(gx)); + computeOp->setAttr("GridSize.y", rewriter.getI32IntegerAttr(gy)); + computeOp->setAttr("GridSize.z", rewriter.getI32IntegerAttr(gz)); + + auto block = 
launchOp.getBlockSizeOperandValues(); + int64_t bx = cast(block.x.getDefiningOp()).value(); + int64_t by = cast(block.y.getDefiningOp()).value(); + int64_t bz = cast(block.z.getDefiningOp()).value(); + computeOp->setAttr("BlockSize.x", rewriter.getI32IntegerAttr(bx)); + computeOp->setAttr("BlockSize.y", rewriter.getI32IntegerAttr(by)); + computeOp->setAttr("BlockSize.z", rewriter.getI32IntegerAttr(bz)); + + if (useBarePtrCallConv) { + computeOp->setAttr(byre::getKernelCallConventionAttrName(), + rewriter.getStringAttr("bare_ptr")); + } + rewriter.eraseOp(launchOp); + + return success(); + } + +private: + bool useBarePtrCallConv; +}; + struct ConvertFuncToByreTensorPass : public ConvertFuncToByreTensorBase { public: @@ -104,6 +153,24 @@ struct ConvertFuncToByreTensorPass } } }; + +struct ConvertGPULaunchFuncToByrePass + : public ConvertGPULaunchFuncToByreBase { +public: + ConvertGPULaunchFuncToByrePass(bool useBarePtrCallConv) + : ConvertGPULaunchFuncToByreBase() { + this->useBarePtrCallConv = useBarePtrCallConv; + } + void runOnOperation() override { + MLIRContext &ctx = getContext(); + RewritePatternSet patterns(&ctx); + populateGPULaunchFuncToByrePattern(patterns, useBarePtrCallConv); + if (failed(applyPatternsAndFoldGreedily(getOperation(), + std::move(patterns)))) { + signalPassFailure(); + } + } +}; }; // namespace void mlir::populateFuncToByreTensorPattern(RewritePatternSet &patterns, @@ -112,7 +179,18 @@ void mlir::populateFuncToByreTensorPattern(RewritePatternSet &patterns, appendArgTypes); } +void mlir::populateGPULaunchFuncToByrePattern(RewritePatternSet &patterns, + bool useBarePtrCallConv) { + patterns.add(patterns.getContext(), + useBarePtrCallConv); +} + std::unique_ptr> mlir::createConvertFuncToByreTensorPass(bool appendArgTypes) { return std::make_unique(appendArgTypes); } + +std::unique_ptr +mlir::createConvertGPULaunchFuncToByrePass(bool useBarePtrCallConv) { + return std::make_unique(useBarePtrCallConv); +} \ No newline at end of file diff --git 
a/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp b/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp index e78709194..452075ce4 100644 --- a/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp +++ b/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp @@ -283,6 +283,48 @@ struct ConvertBmmReshapeTransposeToBmmReshape } }; +// bmm_rrr(x, broadcast_in_dim(y)) => reshape(gemm_rrr(reshape(x), y)) +struct ConvertBmmRRRBroadcastToReshapeGemmRRRReshape + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(cat::BMMRRROp op, + PatternRewriter &rewriter) const override { + auto bCastOp = op.getRhs().getDefiningOp(); + if (!bCastOp) { + return failure(); + } + auto lhsType = op.getLhs().getType().cast(); + auto rhsType = op.getRhs().getType().cast(); + if (!lhsType.hasStaticShape() || !rhsType.hasStaticShape()) { + return failure(); + } + SmallVector broadcastDimensions; + getValuesFromDenseIntElementsAttr(bCastOp.getBroadcastDimensions(), + broadcastDimensions); + if (broadcastDimensions.size() != 2) { + return failure(); + } + if (broadcastDimensions[0] != 1 || broadcastDimensions[1] != 2) { + return failure(); + } + + RankedTensorType firstReshapeType = RankedTensorType::get( + {lhsType.getDimSize(0) * lhsType.getDimSize(1), lhsType.getDimSize(2)}, + lhsType.getElementType()); + RankedTensorType gemmType = RankedTensorType::get( + {firstReshapeType.getDimSize(0), rhsType.getDimSize(2)}, + lhsType.getElementType()); + auto firstReshape = rewriter.create( + op.getLoc(), firstReshapeType, op.getLhs()); + auto gemm = rewriter.create( + op.getLoc(), gemmType, firstReshape, bCastOp.getOperand()); + auto secondReshape = + rewriter.create(op.getLoc(), op.getType(), gemm); + rewriter.replaceOp(op, secondReshape); + return success(); + } +}; + struct FuseMhloToCatPass : public FuseMhloToCatBase { public: FuseMhloToCatPass() = default; @@ -317,7 +359,8 @@ void populateFuseMhloToCatPattern(RewritePatternSet &patterns) { 
ConvertBmmReshapeTransposeToBmmReshape, ConvertBmmReshapeTransposeToBmmReshape, ConvertBmmReshapeTransposeToBmmReshape, - ConvertBmmReshapeTransposeToBmmReshape + ConvertBmmReshapeTransposeToBmmReshape, + ConvertBmmRRRBroadcastToReshapeGemmRRRReshape >(patterns.getContext()); // clang-format on } diff --git a/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td b/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td index c984993e2..ba101c1af 100644 --- a/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td +++ b/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td @@ -30,6 +30,7 @@ def OneRank : Constraint().getRank() == 1"> def TwoRank : Constraint().getRank() == 2">, "two rank">; def ThreeRank : Constraint().getRank() == 3">, "three rank">; def FourRank : Constraint().getRank() == 4">, "four rank">; +def Permute10Check : Constraint()[0] == 1 && $0.getValues()[1] == 0">, "transpose <[1, 0]>">; def Permute021Check : Constraint()[0] == 0 && $0.getValues()[1] == 2 && $0.getValues()[2] == 1">, "bmm 3d permute check (for transpose before bmm)">; def Permute0213Check : Constraint()[0] == 0 && $0.getValues()[1] == 2 && $0.getValues()[2] == 1 && $0.getValues()[3] == 3">, "bmm 4d permute check (for transpose after bmm)">; def TransposeCheck : Constraint()[0] == 1 && $0.getValues()[1] == 0">, "matrix transpose check">; @@ -116,6 +117,26 @@ def MhloCatGemmRCRTransToCatGemmRCRPermutePattern (Cat_GemmRCRPermuteOp $lhs, $rhs, (getDim1Attr $reshape_out), (getDim2Attr $reshape_out)), [(TwoRank $lhs), (TwoRank $rhs), (FourRank $reshape_out), (GemmPermuteShapeCheck $reshape_out, $gemm_out), (Permute0213Check $permute)]>; +def MhloCatGemmRRRTransToCatGemmRRRPermutePattern + : Pat<(MHLO_TransposeOp + (MHLO_ReshapeOp : $reshape_out + (Cat_GemmRRROp : $gemm_out + $lhs, $rhs) + ), + $permute), + (Cat_GemmRRRPermuteOp $lhs, $rhs, (getDim1Attr $reshape_out), (getDim2Attr $reshape_out)), + [(TwoRank $lhs), (TwoRank $rhs), (FourRank $reshape_out), (GemmPermuteShapeCheck 
$reshape_out, $gemm_out), (Permute0213Check $permute)]>; + +def CatGemmRRRPermuteTransToCatGemmRCRPermutePattern + : Pat<(Cat_GemmRRRPermuteOp + $lhs, + (MHLO_TransposeOp $rhs, $permute), + $t1, + $t2 + ), + (Cat_GemmRCRPermuteOp $lhs, $rhs, $t1, $t2), + [(TwoSize $permute), (Permute10Check $permute)]>; + def LayoutFrom3DDotGeneralDimNums : NativeCodeCall<"GetLayoutFrom3DDotGeneralDimNums($0, &$_builder)">; def CheckRRRLayoutFrom3DDotGeneralDimNums @@ -140,6 +161,12 @@ def CheckCCRLayoutFrom3DDotGeneralDimNums CPred<"$0.getLhsContractingDimensions().size() == 1 && $0.getRhsContractingDimensions().size() == 1 && $0.getLhsContractingDimensions()[0] == 1 && $0.getRhsContractingDimensions()[0] == 2">, "is bmm ccr dimension">; +def CheckBMMPermuteShapeSplitOnBatch + : Constraint< + CPred<"$0.getType().cast().getShape()[0] * $0.getType().cast().getShape()[1] == $1.getType().cast().getShape()[0] && $0.getType().cast().getShape()[2] == $1.getType().cast().getShape()[1]">, + "bmm rrr Shape Split On Batch">; + + def MhloDotGeneralReshapeTransposeToBMMRRRPermutePattern : Pat<(MHLO_TransposeOp (MHLO_ReshapeOp : $reshape_out @@ -153,7 +180,8 @@ def MhloDotGeneralReshapeTransposeToBMMRRRPermutePattern (ThreeRank $rhs), (FourSize $permute), (Permute0213Check $permute), - (CheckRRRLayoutFrom3DDotGeneralDimNums $dimension_numbers) + (CheckRRRLayoutFrom3DDotGeneralDimNums $dimension_numbers), + (CheckBMMPermuteShapeSplitOnBatch $reshape_out, $lhs) ]>; def MhloDotGeneralReshapeTransposeToBMMRCRPermutePattern diff --git a/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp b/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp index 49676e2b1..f759d2d0b 100644 --- a/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp +++ b/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp @@ -69,6 +69,27 @@ class ConvertViewOpToByrePattern : public OpConversionPattern { } }; +class ConvertSubViewOpToByrePattern + : public OpConversionPattern { +public: + 
ConvertSubViewOpToByrePattern(MLIRContext *ctx) + : OpConversionPattern(ctx) {} + + LogicalResult + matchAndRewrite(memref::SubViewOp op, memref::SubViewOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getType().getLayout().isIdentity()) + return failure(); + + if (!op.getSource().getType().getLayout().isIdentity()) + return failure(); + + rewriter.replaceOpWithNewOp(op, op.getResult().getType(), + adaptor.getSource(), 0); + return success(); + } +}; + class ConvertMemrefCopyOpToByrePattern : public OpConversionPattern { public: @@ -174,8 +195,8 @@ void mlir::populateMemrefToByrePattern(RewritePatternSet &patterns) { patterns.add, - ConvertReshapeLikeOpToByrePattern>( - patterns.getContext()); + ConvertReshapeLikeOpToByrePattern, + ConvertSubViewOpToByrePattern>(patterns.getContext()); } std::unique_ptr> diff --git a/compiler/lib/Conversion/ToByre/ToByre.cpp b/compiler/lib/Conversion/ToByre/ToByre.cpp index 1c83a25b3..31e43449c 100644 --- a/compiler/lib/Conversion/ToByre/ToByre.cpp +++ b/compiler/lib/Conversion/ToByre/ToByre.cpp @@ -1074,10 +1074,11 @@ static bool isRewritablePrivateFunc(func::FuncOp func) { } // identify EntryPoint funciton -static void identifyEntryPointFuncAndCalls( - ModuleOp m, llvm::SmallVector &entries, - llvm::SmallVector &calls, - llvm::SmallVector &removeFuncs) { +static void +identifyEntryPointFuncAndCalls(ModuleOp m, + llvm::SmallVector &entries, + llvm::SmallVector &calls, + llvm::SetVector &removeFuncs) { // get first entry func llvm::SmallPtrSet callSet; @@ -1094,7 +1095,7 @@ static void identifyEntryPointFuncAndCalls( if (isRewritablePrivateFunc(calleeFuncOp) && !callSet.contains(callOp)) { calls.push_back(callOp); callSet.insert(callOp); - removeFuncs.push_back(calleeFuncOp); + removeFuncs.insert(calleeFuncOp); } } } @@ -1273,7 +1274,7 @@ void ConvertFuncAndCallToByrePass::runOnOperation() { MLIRContext &ctx = getContext(); llvm::SmallVector entryCollector; llvm::SmallVector callCollector; - 
llvm::SmallVector removeFuncCollector; + llvm::SetVector removeFuncCollector; identifyEntryPointFuncAndCalls(m, entryCollector, callCollector, removeFuncCollector); @@ -1330,7 +1331,7 @@ void ConvertFuncAndCallToByrePass::runOnOperation() { return signalPassFailure(); } - for (auto func : removeFuncCollector) { + for (auto func : removeFuncCollector.takeVector()) { func->erase(); } } diff --git a/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp b/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp index 8ceae81d9..2fe676adb 100644 --- a/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp +++ b/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp @@ -39,8 +39,9 @@ namespace { struct MemrefCopyOpToLinalg : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; MemrefCopyOpToLinalg(MLIRContext *ctx, std::string anchorTag, - std::string attachAttr) - : OpRewritePattern(ctx), anchorTag(anchorTag), attachAttr(attachAttr) {} + std::string attachAttr, bool outlining) + : OpRewritePattern(ctx), anchorTag(anchorTag), attachAttr(attachAttr), + outlining(outlining) {} LogicalResult matchAndRewrite(memref::CopyOp copyOp, PatternRewriter &rewriter) const override { @@ -56,84 +57,101 @@ struct MemrefCopyOpToLinalg : public OpRewritePattern { auto dstType = llvm::dyn_cast(dst.getType()); if (!srcType || !dstType) return failure(); - if (srcType.getLayout().isIdentity() && dstType.getLayout().isIdentity()) - return failure(); - SmallVector ops; - auto getViewSource = [&](Value value) { - while (auto viewOp = value.getDefiningOp()) { - ops.push_back(viewOp); - value = viewOp.getViewSource(); + if (outlining) { + if (srcType.getLayout().isIdentity() && dstType.getLayout().isIdentity()) + return failure(); + + SmallVector ops; + auto getViewSource = [&](Value value) { + while (auto viewOp = value.getDefiningOp()) { + ops.push_back(viewOp); + value = viewOp.getViewSource(); + } + return value; + }; + Value callSrc = getViewSource(src); + Value callDst = 
getViewSource(dst); + + auto symbolTableOp = SymbolTable::getNearestSymbolTable(copyOp); + SymbolTable symbolTable(symbolTableOp); + auto funcType = + rewriter.getFunctionType({callSrc.getType(), callDst.getType()}, {}); + + OpBuilder::InsertionGuard guard(rewriter); + // Insert before module terminator. + rewriter.setInsertionPoint(parentOp); + func::FuncOp funcOp = rewriter.create( + copyOp->getLoc(), "memref_copy_kernel", funcType); + symbolTable.insert(funcOp); + funcOp.setPrivate(); + + Block *entryBlock = funcOp.addEntryBlock(); + rewriter.setInsertionPointToStart(entryBlock); + IRMapping mapping; + mapping.map(ValueRange{callSrc, callDst}, entryBlock->getArguments()); + for (auto &&op : llvm::reverse(ops)) { + auto newOp = rewriter.clone(*op, mapping); + mapping.map(op, newOp); + } + AffineMap id = AffineMap::getMultiDimIdentityMap(dstType.getRank(), + rewriter.getContext()); + SmallVector iteratorTypes( + dstType.getRank(), utils::IteratorType::parallel); + rewriter.create( + copyOp->getLoc(), mapping.lookup(copyOp.getSource()), + mapping.lookup(copyOp.getTarget()), llvm::ArrayRef({id, id}), + iteratorTypes, + [](OpBuilder &b, Location loc, ValueRange args) { + b.create(loc, args.front()); + }, + copyOp->getAttrs()); + rewriter.create(copyOp->getLoc()); + if (!attachAttr.empty()) { + funcOp->setAttr(attachAttr, rewriter.getUnitAttr()); } - return value; - }; - Value callSrc = getViewSource(src); - Value callDst = getViewSource(dst); - - auto symbolTableOp = SymbolTable::getNearestSymbolTable(copyOp); - SymbolTable symbolTable(symbolTableOp); - auto funcType = - rewriter.getFunctionType({callSrc.getType(), callDst.getType()}, {}); - - OpBuilder::InsertionGuard guard(rewriter); - // Insert before module terminator. 
- rewriter.setInsertionPoint(parentOp); - func::FuncOp funcOp = rewriter.create( - copyOp->getLoc(), "memref_copy_kernel", funcType); - symbolTable.insert(funcOp); - funcOp.setPrivate(); - - Block *entryBlock = funcOp.addEntryBlock(); - rewriter.setInsertionPointToStart(entryBlock); - IRMapping mapping; - mapping.map(ValueRange{callSrc, callDst}, entryBlock->getArguments()); - for (auto &&op : llvm::reverse(ops)) { - auto newOp = rewriter.clone(*op, mapping); - mapping.map(op, newOp); - } - AffineMap id = AffineMap::getMultiDimIdentityMap(dstType.getRank(), - rewriter.getContext()); - SmallVector iteratorTypes( - dstType.getRank(), utils::IteratorType::parallel); - rewriter.create( - copyOp->getLoc(), mapping.lookup(copyOp.getSource()), - mapping.lookup(copyOp.getTarget()), llvm::ArrayRef({id, id}), - iteratorTypes, - [](OpBuilder &b, Location loc, ValueRange args) { - b.create(loc, args.front()); - }, - copyOp->getAttrs()); - rewriter.create(copyOp->getLoc()); - if (!attachAttr.empty()) { - funcOp->setAttr(attachAttr, rewriter.getUnitAttr()); - } - rewriter.setInsertionPoint(copyOp); - auto callOp = rewriter.replaceOpWithNewOp( - copyOp, funcOp, ValueRange{callSrc, callDst}); - callOp->setAttr(byre::getByreCallOpReadonlyOperandNumAttrName(), - rewriter.getIndexAttr(1)); + rewriter.setInsertionPoint(copyOp); + auto callOp = rewriter.replaceOpWithNewOp( + copyOp, funcOp, ValueRange{callSrc, callDst}); + callOp->setAttr(byre::getByreCallOpReadonlyOperandNumAttrName(), + rewriter.getIndexAttr(1)); + } else { + AffineMap id = AffineMap::getMultiDimIdentityMap(dstType.getRank(), + rewriter.getContext()); + SmallVector iteratorTypes( + dstType.getRank(), utils::IteratorType::parallel); + rewriter.replaceOpWithNewOp( + copyOp, src, dst, llvm::ArrayRef({id, id}), iteratorTypes, + [](OpBuilder &b, Location loc, ValueRange args) { + b.create(loc, args.front()); + }, + copyOp->getAttrs()); + } return success(); } private: std::string anchorTag; std::string attachAttr; + bool 
outlining; }; struct MemrefCopyToLinalgPass : public MemrefCopyToLinalgPassBase { - MemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr) + MemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr, + bool outlining) : MemrefCopyToLinalgPassBase() { this->anchorTag = anchorTag; this->attachAttr = attachAttr; + this->outlining = outlining; } void runOnOperation() override { MLIRContext *context = &getContext(); RewritePatternSet patterns(&getContext()); patterns.insert(context, this->anchorTag, - this->attachAttr); + this->attachAttr, this->outlining); if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) { return signalPassFailure(); @@ -144,8 +162,10 @@ struct MemrefCopyToLinalgPass } // namespace std::unique_ptr> -createMemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr) { - return std::make_unique(anchorTag, attachAttr); +createMemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr, + bool outlining) { + return std::make_unique(anchorTag, attachAttr, + outlining); } } // namespace mlir diff --git a/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp b/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp index d2556024f..7ccd11036 100644 --- a/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp +++ b/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp @@ -37,8 +37,10 @@ namespace { struct CollectGPUKernelPass : public CollectGPUKernelBase { - CollectGPUKernelPass(const std::string &name) : CollectGPUKernelBase() { + CollectGPUKernelPass(const std::string &name, bool removeHost) + : CollectGPUKernelBase() { this->moduleName = name; + this->removeHost = removeHost; } void runOnOperation() override { @@ -49,20 +51,20 @@ struct CollectGPUKernelPass bool found = false; GPUModuleOp dst; - for (auto &op : m.getBody()->without_terminator()) { - if (auto gm = dyn_cast(op)) { - if (gm.getName() == moduleName) { - found = true; - dst = gm; - } else { - gmCollector.push_back(gm); - } + for (auto 
gm : m.getOps()) { + if (gm.getName() == moduleName) { + found = true; + dst = gm; + } else { + gmCollector.push_back(gm); } } // Note FuncOps not in m.getBody()->without_terminator() - for (auto func : m.getOps()) { - removeOps.push_back(func); + if (removeHost) { + for (auto func : m.getOps()) { + removeOps.push_back(func); + } } if (gmCollector.size() == 0) { @@ -78,12 +80,13 @@ struct CollectGPUKernelPass } SymbolTable dstTable(dst); - for (auto gm : gmCollector) { for (auto &op : gm.getBody()->without_terminator()) { auto newOp = op.clone(); - dstTable.insert(newOp); + auto newName = dstTable.insert(newOp); + (void)SymbolTable::replaceAllSymbolUses(&op, newName, m); } + (void)SymbolTable::replaceAllSymbolUses(gm, dst.getNameAttr(), m); gm.erase(); } @@ -96,6 +99,6 @@ struct CollectGPUKernelPass } // namespace std::unique_ptr> -mlir::createCollectGPUKernelPass(const std::string &name) { - return std::make_unique(name); +mlir::createCollectGPUKernelPass(const std::string &name, bool removeHost) { + return std::make_unique(name, removeHost); } diff --git a/compiler/lib/Dialect/CMakeLists.txt b/compiler/lib/Dialect/CMakeLists.txt index fe905afe9..3e8627a7e 100644 --- a/compiler/lib/Dialect/CMakeLists.txt +++ b/compiler/lib/Dialect/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(Affine) add_subdirectory(Byre) add_subdirectory(Cat) add_subdirectory(Ccl) +add_subdirectory(GPU) add_subdirectory(Lace) add_subdirectory(Linalg) add_subdirectory(MemRef) diff --git a/compiler/lib/Dialect/Cat/IR/CatDialect.cpp b/compiler/lib/Dialect/Cat/IR/CatDialect.cpp index 24570d4cc..966a22e22 100644 --- a/compiler/lib/Dialect/Cat/IR/CatDialect.cpp +++ b/compiler/lib/Dialect/Cat/IR/CatDialect.cpp @@ -158,3 +158,9 @@ LogicalResult GemmRCRPermuteOp::verify() { this->getOutput(), this->getT1(), this->getT2(), "rcr"); } + +LogicalResult GemmRRRPermuteOp::verify() { + return VerifyGemmPermute0213Layout(this->getLhs(), this->getRhs(), + this->getOutput(), this->getT1(), + this->getT2(), 
"rrr"); +} diff --git a/compiler/lib/Dialect/GPU/CMakeLists.txt b/compiler/lib/Dialect/GPU/CMakeLists.txt new file mode 100644 index 000000000..5c919f7df --- /dev/null +++ b/compiler/lib/Dialect/GPU/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Transforms) \ No newline at end of file diff --git a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt new file mode 100644 index 000000000..733282ba9 --- /dev/null +++ b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt @@ -0,0 +1,19 @@ +add_mlir_dialect_library(ByteIRGPUPasses + ShmAllocaToWorkgroupArg.cpp + + ADDITIONAL_HEADER_DIRS + ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU + ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU/Transforms + + DEPENDS + ByteIRGPUPassIncGen + ByteIRUtils + MLIRGPUDialect + + LINK_LIBS PUBLIC + ByteIRUtils + MLIRIR + MLIRGPUDialect + MLIRMemRefDialect + MLIRSupport +) diff --git a/compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp b/compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp new file mode 100644 index 000000000..808ed797f --- /dev/null +++ b/compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp @@ -0,0 +1,86 @@ +//===- ShmAllocaToWorkgroupArg.cpp --------------------------------- C++ +//-*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// + +#include "byteir/Dialect/GPU/Passes.h" +#include "byteir/Dialect/GPU/Transforms/Transforms.h" +#include "byteir/Transforms/MemoryPlanning.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/PassManager.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include + +#define DEBUG_TYPE "shm-alloca-to-workgroup-arg" + +namespace mlir { +#define GEN_PASS_DEF_SHMALLOCATOWORKGROUPARG +#include "byteir/Dialect/GPU/Passes.h.inc" +} // namespace mlir + +using namespace llvm; +using namespace mlir; + +namespace { +struct ShmAllocaToWorkgroupArgPass + : public impl::ShmAllocaToWorkgroupArgBase { + void runOnOperation() override { + gpu::GPUModuleOp m = getOperation(); + WalkResult walkResult = m->walk([&](gpu::GPUFuncOp func) { + if (!func.isKernel()) + return WalkResult::advance(); + + // OpPassManager pm(func.getOperationName()); + // pm.addPass(createMemoryPlanningPass(/* alignment */ 1, /* alloca */ + // true, + // /* memory space */ 0, + // /* callback */ nullptr)); + // if (mlir::failed(runPipeline(pm, func))) { + // return WalkResult::interrupt(); + // } + + gpu::hoistShmAllocaToWorkgroup(func); + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) { + m->emitError() << "ShmAllocaToWorkgroupArgPass failed"; + signalPassFailure(); + } + } +}; +} // namespace + +void mlir::gpu::hoistShmAllocaToWorkgroup(gpu::GPUFuncOp func) { + func->walk([&](memref::AllocaOp alloca) { + auto memref = alloca.getType(); + if (auto memorySpace = llvm::dyn_cast_or_null( + memref.getMemorySpace())) { + if (memorySpace.getValue() == + gpu::GPUDialect::getWorkgroupAddressSpace()) { + Value workgroup = func.addWorkgroupAttribution(memref, alloca.getLoc()); + 
alloca.getMemref().replaceAllUsesWith(workgroup); + alloca->erase(); + } + } + }); +} diff --git a/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp b/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp index 64e456af3..bcb8df731 100644 --- a/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp +++ b/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp @@ -56,6 +56,7 @@ #include "mlir/IR/OpImplementation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Transforms/InliningUtils.h" #include "mlir/Transforms/RegionUtils.h" @@ -63,6 +64,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Debug.h" + #include using namespace mlir; @@ -150,6 +152,76 @@ transform::CollapseDimsOp::apply(transform::TransformRewriter &rewriter, return DiagnosedSilenceableFailure::success(); } +//===----------------------------------------------------------------------===// +// DetensorizeOp +//===----------------------------------------------------------------------===// +namespace { +LogicalResult detensorizeLinalgOp(OpBuilder &b, linalg::LinalgOp linalgOp) { + if (!linalgOp.hasTensorSemantics()) + return failure(); + + if (linalgOp.getNumLoops()) + return failure(); + + Location loc = linalgOp->getLoc(); + SmallVector scalars; + scalars.reserve(linalgOp->getNumOperands()); + for (auto &&operand : linalgOp->getOpOperands()) { + if (!linalgOp.payloadUsesValueFromOperand(&operand)) { + scalars.push_back(nullptr); + continue; + } + if (linalgOp.isScalar(&operand)) { + scalars.push_back(operand.get()); + continue; + } + auto tensorType = llvm::dyn_cast(operand.get().getType()); + if (!tensorType || !tensorType.hasRank() || tensorType.getRank() != 0) + return failure(); + + scalars.push_back( + b.create(loc, operand.get(), ValueRange())); + } + + Block *body = 
linalgOp.getBlock(); + IRMapping map; + map.map(body->getArguments(), scalars); + for (auto &&op : body->without_terminator()) { + b.clone(op, map); + } + + for (auto &&opOperand : linalgOp.getDpsInitOperands()) { + OpOperand *yieldOperand = linalgOp.getMatchingYieldValue(opOperand); + Value element = map.lookupOrDefault(yieldOperand->get()); + Value tensor = b.create( + loc, RankedTensorType::get({}, element.getType()), ValueRange(element)); + Value result = linalgOp.getTiedOpResult(opOperand); + result.replaceAllUsesWith(tensor); + } + linalgOp->erase(); + return success(); +} +} // namespace + +DiagnosedSilenceableFailure +transform::DetensorizeOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + for (Operation *target : state.getPayloadOps(getTarget())) { + auto linalgOp = dyn_cast_or_null(target); + if (!linalgOp) + return emitDefaultDefiniteFailure(target) + << " detensorize transformation should be applied on linalg op"; + + OpBuilder builder(getContext()); + builder.setInsertionPoint(target); + if (failed(detensorizeLinalgOp(builder, linalgOp))) + return emitDefaultDefiniteFailure(linalgOp) + << " failed to detensorize op"; + } + return DiagnosedSilenceableFailure::success(); +} + //===----------------------------------------------------------------------===// // replace unit extent dims //===----------------------------------------------------------------------===// @@ -1498,6 +1570,66 @@ LogicalResult transform::FuseOperandsOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// InsertSliceToCopyExtOp +//===----------------------------------------------------------------------===// +template +DiagnosedSilenceableFailure +insertSliceToCopyImpl(RewriterBase &rewriter, OpTy target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { + static_assert(llvm::is_one_of() && + "wrong op type"); 
+ + if (auto copySource = + target.getSource().template getDefiningOp()) { + results.push_back(copySource); + return DiagnosedSilenceableFailure::success(); + } + + // If we are inside an InParallel region, temporarily set the insertion point + // outside: only tensor.parallel_insert_slice ops are allowed in there. + if constexpr (std::is_same_v) { + rewriter.setInsertionPoint( + target->template getParentOfType()); + } + + Value extracted = rewriter.create( + target.getLoc(), target.getSourceType(), target.getDest(), + target.getMixedOffsets(), target.getMixedSizes(), + target.getMixedStrides()); + Value copied = rewriter + .create(target.getLoc(), + target.getSource(), extracted) + .getResult(0); + // Reset the insertion point. + rewriter.setInsertionPoint(target); + rewriter.replaceOpWithNewOp( + target, copied, target.getDest(), target.getMixedOffsets(), + target.getMixedSizes(), target.getMixedStrides()); + + results.push_back(copied.getDefiningOp()); + return DiagnosedSilenceableFailure::success(); +} + +DiagnosedSilenceableFailure transform::InsertSliceToCopyExtOp::applyToOne( + transform::TransformRewriter &rewriter, Operation *targetOp, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { + rewriter.setInsertionPoint(targetOp); + if (auto target = dyn_cast(targetOp)) + return insertSliceToCopyImpl(rewriter, target, results, state); + if (auto target = dyn_cast(targetOp)) + return insertSliceToCopyImpl(rewriter, target, results, state); + + DiagnosedSilenceableFailure diag = + emitSilenceableError() + << "only InsertSliceOp and ParallelInsertSliceOp ops are supported"; + diag.attachNote(targetOp->getLoc()) << "target op"; + return diag; +} + //===----------------------------------------------------------------------===// // Transform op registration //===----------------------------------------------------------------------===// diff --git a/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp 
b/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp index 1b874e9d2..fc0af1829 100644 --- a/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp +++ b/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp @@ -544,9 +544,7 @@ static bool isFusableWithReshapeByDimExpansion(GenericOp genericOp, else return isProjectedPermutationAndAllowConst(map); }) && - genericOp.getMatchingIndexingMap(fusableOpOperand).getNumResults() > - 0 && - llvm::all_of(genericOp.getIteratorTypesArray(), isParallelIterator); + genericOp.getMatchingIndexingMap(fusableOpOperand).getNumResults() > 0; } class ExpansionInfo { @@ -568,6 +566,9 @@ class ExpansionInfo { ArrayRef getExpandedShapeOfDim(unsigned i) const { return expandedShapeMap[i]; } + ArrayRef getIteratorTypes() const { + return iteratorTypes; + } ArrayRef getOriginalShape() const { return originalLoopExtent; } private: @@ -579,6 +580,8 @@ class ExpansionInfo { SmallVector> expandedShapeMap; /// Extent of the loop in the original operation. 
SmallVector originalLoopExtent; + /// Parallel types of the expanded loops + SmallVector iteratorTypes; unsigned expandedOpNumDims; }; @@ -591,6 +594,7 @@ LogicalResult ExpansionInfo::compute(LinalgOp linalgOp, if (reassociationMaps.empty()) return failure(); AffineMap fusedIndexMap = linalgOp.getMatchingIndexingMap(fusableOpOperand); + auto origIteratorTypes = linalgOp.getIteratorTypesArray(); SmallVector originalLoopRange = linalgOp.getStaticLoopRanges(); originalLoopExtent.assign(originalLoopRange.begin(), originalLoopRange.end()); @@ -621,8 +625,11 @@ LogicalResult ExpansionInfo::compute(LinalgOp linalgOp, auto seq = llvm::seq(sum, sum + numFoldedDim.value()); reassociation.emplace_back(seq.begin(), seq.end()); sum += numFoldedDim.value(); + iteratorTypes.append(numFoldedDim.value(), + origIteratorTypes[numFoldedDim.index()]); } expandedOpNumDims = sum; + return success(); } @@ -871,15 +878,11 @@ fuseWithReshapeByExpansion(GenericOp genericOp, Operation *reshapeOp, } } - // The iterator types of the expanded op are all parallel. 
- SmallVector iteratorTypes( - expansionInfo.getExpandedOpNumDims(), utils::IteratorType::parallel); - TypeRange resultTypes = ValueRange(outputs).getTypes(); - auto fusedOp = - rewriter.create(genericOp.getLoc(), resultTypes, - /*inputs=*/expandedOpOperands, outputs, - expandedOpIndexingMaps, iteratorTypes); + auto fusedOp = rewriter.create(genericOp.getLoc(), resultTypes, + /*inputs=*/expandedOpOperands, + outputs, expandedOpIndexingMaps, + expansionInfo.getIteratorTypes()); Region &fusedRegion = fusedOp->getRegion(0); Region &originalRegion = genericOp->getRegion(0); rewriter.cloneRegionBefore(originalRegion, fusedRegion, fusedRegion.begin()); diff --git a/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp b/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp index defd678af..081599f13 100644 --- a/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp +++ b/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp @@ -64,11 +64,12 @@ namespace { /// dimensions. It only applies these to "parallel" loops without mixing them /// with "reduction" types. 
static SmallVector -getCollapsibleLoops(linalg::GenericOp genericOp) { +getCollapsibleLoops(linalg::GenericOp genericOp, + utils::IteratorType iteratorType) { SmallVector contiguousLoops; SmallVector pDims; - genericOp.getParallelDims(pDims); + findPositionsOfType(genericOp.getIteratorTypesArray(), iteratorType, pDims); if (pDims.size() < 2) return contiguousLoops; @@ -76,15 +77,18 @@ getCollapsibleLoops(linalg::GenericOp genericOp) { auto hasAllMapsSameSequence = [&](AffineExpr preExpr, AffineExpr nextExpr) { for (AffineMap map : genericOp.getIndexingMapsArray()) { - bool foundSeq = false; - for (auto [index, resultExpr] : llvm::enumerate(map.getResults())) { - if (resultExpr == nextExpr) { - foundSeq = (index > 0 && preExpr == map.getResult(index - 1)); - break; - } + auto prePos = map.getResultPosition(preExpr); + auto nextPos = map.getResultPosition(nextExpr); + if (!prePos.has_value()) { + if (nextPos.has_value()) + return false; + } else { + if (!nextPos.has_value()) + return false; + + if (prePos.value() + 1 != nextPos.value()) + return false; } - if (!foundSeq) - return false; } return true; }; @@ -519,13 +523,17 @@ FailureOr> collapseGenericOpIterationDimsEx( class CollapseLoopsOnGenericOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; + CollapseLoopsOnGenericOp(MLIRContext *context, + utils::IteratorType iteratorType) + : OpRewritePattern(context), iteratorType(iteratorType) {} + LogicalResult matchAndRewrite(linalg::GenericOp op, PatternRewriter &rewriter) const override { // Collect collapsible loops // TODO: All rules come from iree project, add our own if (!isEligibleForCollapse(op)) return failure(); - auto loops = getCollapsibleLoops(op); + auto loops = getCollapsibleLoops(op, iteratorType); if (loops.empty()) return failure(); @@ -542,22 +550,31 @@ class CollapseLoopsOnGenericOp : public OpRewritePattern { rewriter.replaceOp(op, *replacements); return success(); } + +private: + utils::IteratorType iteratorType; }; 
struct LinalgCollapseLoopsPass : public impl::LinalgCollapseLoopsBase { + LinalgCollapseLoopsPass(utils::IteratorType iteratorType) + : LinalgCollapseLoopsBase() { + this->iteratorType = iteratorType; + } + void runOnOperation() override { auto op = getOperation(); auto context = op->getContext(); RewritePatternSet patterns(context); - patterns.add(context); + patterns.add(context, iteratorType); if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) signalPassFailure(); } }; } // namespace -std::unique_ptr> mlir::createLinalgCollapseLoops() { - return std::make_unique(); +std::unique_ptr> +mlir::createLinalgCollapseLoops(utils::IteratorType iteratorType) { + return std::make_unique(iteratorType); } diff --git a/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt index 524713f46..47cd8bab0 100644 --- a/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_dialect_library(ByteIRTensorPasses CanonicalizeExt.cpp + TensorPadSpecialization.cpp ADDITIONAL_HEADER_DIRS ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/mhlo @@ -8,9 +9,11 @@ add_mlir_dialect_library(ByteIRTensorPasses DEPENDS ByteIRUtils + ByteIRTensorPassIncGen LINK_LIBS PUBLIC MLIRIR MLIRSupport + MLIRSCFDialect ByteIRUtils ) \ No newline at end of file diff --git a/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp b/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp index 2feff7437..c35039404 100644 --- a/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp +++ b/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp @@ -27,6 +27,7 @@ #include "byteir/Utils/AttrUtils.h" #include "byteir/Utils/Utils.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" @@ -121,6 +122,46 @@ struct 
RankReducedExtractSliceCollapseShape return success(); } }; + +/// Fold zero rank from_elements + insert_slice into insert +/// +/// Example: +/// +/// %0 = tensor.from_elements %scalar : tensor +/// %1 = tensor.insert_slice %0 into %1[%c256] : tensor into +/// tensor<1024xf32> +/// +/// will be folded into +/// +/// %0 = tensor.insert %scalar into %1[%c256] : tensor<1024xf32> +struct FoldZeroRankFromElementsInsertSlice + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::InsertSliceOp insertSliceOp, + PatternRewriter &rewriter) const override { + auto fromElementsOp = + insertSliceOp.getSource().getDefiningOp(); + if (!fromElementsOp) + return failure(); + + RankedTensorType tensorType = insertSliceOp.getSourceType(); + if (tensorType.getRank() != 0) + return failure(); + + auto elements = fromElementsOp.getElements(); + if (elements.size() != 1) + return failure(); + + SmallVector indices = getValueOrCreateConstantIndexOp( + rewriter, insertSliceOp->getLoc(), + getMixedValues(insertSliceOp.getStaticOffsets(), + insertSliceOp.getOffsets(), rewriter)); + rewriter.replaceOpWithNewOp( + insertSliceOp, elements[0], insertSliceOp.getDest(), indices); + return success(); + } +}; } // namespace void mlir::tensor::populateCanonicalizeExtPatterns(RewritePatternSet &patterns, @@ -132,6 +173,7 @@ void mlir::tensor::populateCanonicalizeExtPatterns(RewritePatternSet &patterns, } patterns.add(ctx); + patterns.add(ctx); } void mlir::tensor::getCanonicalizationExtPatterns(RewritePatternSet &patterns, diff --git a/compiler/lib/Dialect/Tensor/Transforms/PassDetail.h b/compiler/lib/Dialect/Tensor/Transforms/PassDetail.h new file mode 100644 index 000000000..4214a74dc --- /dev/null +++ b/compiler/lib/Dialect/Tensor/Transforms/PassDetail.h @@ -0,0 +1,40 @@ +//===- PassDetail.h -------------------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_TENSOR_TRANSFORMS_PASSDETAIL_H +#define BYTEIR_DIALECT_TENSOR_TRANSFORMS_PASSDETAIL_H + +#include "mlir/IR/DialectRegistry.h" +#include "mlir/Pass/Pass.h" + +// forward dialects for conversions +namespace mlir { + +namespace scf { +class SCFDialect; +} // namespace scf + +namespace tensor { +class TensorDialect; +} // namespace tensor + +#define GEN_PASS_CLASSES +#include "byteir/Dialect/Tensor/Passes.h.inc" + +} // namespace mlir + +#endif // BYTEIR_DIALECT_TENSOR_TRANSFORMS_PASSDETAIL_H diff --git a/compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp b/compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp new file mode 100644 index 000000000..43f750233 --- /dev/null +++ b/compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp @@ -0,0 +1,242 @@ +//===- TensorPadSpecialization.cpp ---------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#include "byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h" +#include "byteir/Utils/AttrUtils.h" +#include "byteir/Utils/Utils.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Transforms/TransformUtils.h" +#include "mlir/Dialect/Tensor/Transforms/Transforms.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/ViewLikeInterface.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/Debug.h" + +#include "./PassDetail.h" + +#define DEBUG_TYPE "tensor-pad-specialization" + +using namespace mlir; + +namespace { +static LogicalResult +resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter, + tensor::CollapseShapeOp collapseShapeOp, + ValueRange indices, + SmallVectorImpl &sourceIndices) { + int64_t cnt = 0; + SmallVector tmp(indices.size()); + SmallVector dynamicIndices; + for (ArrayRef groups : collapseShapeOp.getReassociationIndices()) { + assert(!groups.empty() && "association indices groups cannot be empty"); + 
dynamicIndices.push_back(indices[cnt++]); + int64_t groupSize = groups.size(); + + // Calculate suffix product for all collapse op source dimension sizes. + SmallVector sizes(groupSize); + for (int64_t i = 0; i < groupSize; ++i) + sizes[i] = collapseShapeOp.getSrcType().getDimSize(groups[i]); + SmallVector suffixProduct = computeSuffixProduct(sizes); + + // Derive the index values along all dimensions of the source corresponding + // to the index wrt to collapsed shape op output. + auto d0 = rewriter.getAffineDimExpr(0); + SmallVector delinearizingExprs = delinearize(d0, suffixProduct); + + // Construct the AffineApplyOp for each delinearizingExpr. + for (int64_t i = 0; i < groupSize; i++) { + OpFoldResult ofr = affine::makeComposedFoldedAffineApply( + rewriter, loc, + AffineMap::get(/*numDims=*/1, /*numSymbols=*/0, + delinearizingExprs[i]), + dynamicIndices); + sourceIndices.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); + } + dynamicIndices.clear(); + } + if (collapseShapeOp.getReassociationIndices().empty()) { + auto zeroAffineMap = rewriter.getConstantAffineMap(0); + int64_t srcRank = + cast(collapseShapeOp.getSrc().getType()).getRank(); + for (int64_t i = 0; i < srcRank; i++) { + OpFoldResult ofr = affine::makeComposedFoldedAffineApply( + rewriter, loc, zeroAffineMap, dynamicIndices); + sourceIndices.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, ofr)); + } + } + return success(); +} + +struct FoldExtractOfCollapseShape : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExtractOp extractOp, + PatternRewriter &rewriter) const { + auto collapseShapeOp = + extractOp.getTensor().getDefiningOp(); + if (!collapseShapeOp) + return failure(); + + SmallVector indices(extractOp.getIndices().begin(), + extractOp.getIndices().end()); + SmallVector sourceIndices; + if (failed(resolveSourceIndicesCollapseShape(extractOp->getLoc(), rewriter, + collapseShapeOp, indices, + 
sourceIndices))) + return failure(); + rewriter.replaceOpWithNewOp( + extractOp, extractOp.getType(), collapseShapeOp.getSrc(), + sourceIndices); + return success(); + } +}; + +struct FoldExtractOfExtractSlice : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExtractOp extractOp, + PatternRewriter &rewriter) const override { + auto extractSliceOp = + extractOp.getTensor().getDefiningOp(); + if (!extractSliceOp) + return failure(); + + SmallVector indices(extractOp.getIndices().begin(), + extractOp.getIndices().end()); + SmallVector sourceIndices; + affine::resolveIndicesIntoOpWithOffsetsAndStrides( + rewriter, extractOp->getLoc(), extractSliceOp.getMixedOffsets(), + extractSliceOp.getMixedStrides(), extractSliceOp.getDroppedDims(), + indices, sourceIndices); + rewriter.replaceOpWithNewOp( + extractOp, extractOp.getType(), extractSliceOp.getSource(), + sourceIndices); + return success(); + } +}; + +struct FoldExtractOfPad : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExtractOp extractOp, + PatternRewriter &rewriter) const override { + auto padOp = extractOp.getTensor().getDefiningOp(); + if (!padOp) + return failure(); + + // Only constant padding value supported. + Value padValue = padOp.getConstantPaddingValue(); + if (!padValue) + return failure(); + + // Helper variables and functions for various arithmetic operations. These + // are used extensively for computing new offset/length and padding values. + Location loc = padOp->getLoc(); + AffineExpr dim0, dim1; + bindDims(rewriter.getContext(), dim0, dim1); + // Add two integers. + auto addMap = AffineMap::get(2, 0, {dim0 + dim1}); + auto add = [&](OpFoldResult v1, OpFoldResult v2) { + return affine::makeComposedFoldedAffineApply(rewriter, loc, addMap, + {v1, v2}); + }; + // Subtract two integers. 
+ auto subMap = AffineMap::get(2, 0, {dim0 - dim1}); + auto sub = [&](OpFoldResult v1, OpFoldResult v2) { + return affine::makeComposedFoldedAffineApply(rewriter, loc, subMap, + {v1, v2}); + }; + + auto cmp = [&](OpFoldResult v1, OpFoldResult v2, + arith::CmpIPredicate pred) { + return rewriter.create( + loc, pred, getValueOrCreateConstantIndexOp(rewriter, loc, v1), + getValueOrCreateConstantIndexOp(rewriter, loc, v2)); + }; + + auto offsets = getAsOpFoldResult(extractOp.getIndices()); + SmallVector newOffsets; + Value inBound; + + int64_t rank = padOp.getSourceType().getRank(); + for (unsigned dim = 0; dim < rank; ++dim) { + auto low = padOp.getMixedLowPad()[dim]; + bool hasLowPad = !isConstantIntValue(low, 0); + auto offset = offsets[dim]; + auto srcSize = + tensor::getMixedSize(rewriter, loc, padOp.getSource(), dim); + + OpFoldResult newOffset = hasLowPad ? sub(offset, low) : offset; + newOffsets.push_back(newOffset); + auto lbcheck = cmp(low, offset, arith::CmpIPredicate::ule); + auto ubcheck = cmp(offset, hasLowPad ? 
add(low, srcSize) : srcSize, + arith::CmpIPredicate::ult); + auto check = rewriter.create(loc, lbcheck, ubcheck); + if (inBound) { + inBound = rewriter.create(loc, inBound, check); + } else { + inBound = check; + } + } + + rewriter.replaceOpWithNewOp( + extractOp, inBound, + [&](OpBuilder &b, Location loc) { + b.create( + loc, b.create( + loc, padOp.getSource(), + getValueOrCreateConstantIndexOp(b, loc, newOffsets)) + .getResult()); + }, + [&](OpBuilder &b, Location loc) { + b.create(loc, padValue); + }); + return success(); + } +}; + +struct TensorPadSpecializationPass + : public TensorPadSpecializationBase { + void runOnOperation() override { + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed(applyPatternsAndFoldGreedily(getOperation(), + std::move(patterns)))) { + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr mlir::createTensorPadSpecializationPass() { + return std::make_unique(); +} diff --git a/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp index cf5ff1e69..be6485a85 100644 --- a/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp +++ b/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp @@ -25,6 +25,7 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h" #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/IR/Builders.h" @@ -89,6 +90,61 @@ void insertTransformIR(ModuleOp m, const TransformInsertionConfig &config) { } } +struct DetensorizeTransformInsertionPass + : public DetensorizeTransformInsertionBase< + DetensorizeTransformInsertionPass> { + explicit DetensorizeTransformInsertionPass(const std::string &funcAnchor, + const std::string 
&matchPrefix) + : DetensorizeTransformInsertionBase() { + this->funcAnchorAttr = funcAnchor; + this->matchPrefix = matchPrefix; + } + + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + linalg::registerTransformDialectExtension(registry); + } + + static bool isScalarTensorOp(linalg::LinalgOp linalgOp) { + if (!linalgOp.hasTensorSemantics()) + return false; + + if (linalgOp.getNumLoops() != 0) + return false; + + auto isScalarOrScalarTensorOperand = [&](OpOperand &operand) { + if (linalgOp.isScalar(&operand)) + return true; + + auto tensorType = + llvm::dyn_cast(operand.get().getType()); + if (!tensorType) + return false; + + return tensorType.getRank() == 0; + }; + return llvm::all_of(linalgOp->getOpOperands(), + isScalarOrScalarTensorOperand); + } + + void runOnOperation() override { + auto opFilter = [](Operation *op) { + if (auto linalgOp = llvm::dyn_cast_or_null(op)) { + return isScalarTensorOp(linalgOp); + } + return false; + }; + + auto transformBuilder = [](ImplicitLocOpBuilder &b, Operation *, + Value pdlValue) { + b.create(pdlValue); + }; + + insertTransformIR(getOperation(), {funcAnchorAttr, matchPrefix, opFilter, + transformBuilder}); + } +}; + struct FuseExtTransformInsertionPass : public FuseExtTransformInsertionBase { explicit FuseExtTransformInsertionPass( @@ -166,8 +222,46 @@ struct GenericTransformInsertionPass protected: TransformInsertionConfig config; }; + +struct RewriteInDPSTransformInsertionPass + : public RewriteInDPSTransformInsertionBase< + RewriteInDPSTransformInsertionPass> { + explicit RewriteInDPSTransformInsertionPass(const std::string &funcAnchor, + const std::string &matchPrefix) + : RewriteInDPSTransformInsertionBase() { + this->funcAnchorAttr = funcAnchor; + this->matchPrefix = matchPrefix; + } + + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + linalg::registerTransformDialectExtension(registry); + } + + void runOnOperation() override { + auto 
opFilter = [](Operation *op) { + return llvm::isa(op); + }; + + auto transformBuilder = [](ImplicitLocOpBuilder &b, Operation *, + Value pdlValue) { + b.create( + pdlValue.getType(), pdlValue); + }; + + insertTransformIR(getOperation(), {funcAnchorAttr, matchPrefix, opFilter, + transformBuilder}); + } +}; } // namespace +std::unique_ptr> +mlir::createDetensorizeTransformInsertionPass(const std::string &funcAnchor, + const std::string &matchPrefix) { + return std::make_unique(funcAnchor, + matchPrefix); +} + std::unique_ptr> mlir::createFuseExtTransformInsertionPass( const std::string &funcAnchor, const std::string &matchPrefix, @@ -182,4 +276,11 @@ std::unique_ptr> mlir::createGenericTransformInsertionPass( const TransformInsertionConfig &config) { return std::make_unique(config); +} + +std::unique_ptr> +mlir::createRewriteInDPSTransformInsertionPass(const std::string &funcAnchor, + const std::string &matchPrefix) { + return std::make_unique(funcAnchor, + matchPrefix); } \ No newline at end of file diff --git a/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp b/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp index 277174dfa..f5ae6ffea 100644 --- a/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp +++ b/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp @@ -1698,6 +1698,60 @@ LogicalResult mlir::mhlo::foldReverseWithConstant(mhlo::ReverseOp op, return success(); } +// this pattern matches a GatherOp with iota start_indices, +// the output of GatherOp may be equal to the input. 
+LogicalResult mlir::mhlo::foldGatherWithInput(mhlo::GatherOp gatherOp, + PatternRewriter &rewriter) { + auto operand = gatherOp.getOperand(); + auto operandTy = operand.getType().cast(); + if (!operandTy.hasRank()) { + return failure(); + } + + auto resultTy = gatherOp.getType().cast(); + if (resultTy != operandTy) { + return failure(); + } + + auto startIndices = gatherOp.getStartIndices(); + auto startIndicesTy = startIndices.getType().cast(); + auto iotaOp = startIndices.getDefiningOp(); + if (!iotaOp || !startIndicesTy.hasRank()) { + return failure(); + } + + int64_t indexVectorDim = startIndicesTy.getRank(); + + auto dimensionNumbers = gatherOp.getDimensionNumbers(); + if (dimensionNumbers.getIndexVectorDim() != indexVectorDim || + indexVectorDim != 1) { + return failure(); + } + + if (dimensionNumbers.getStartIndexMap().size() != 1) { + return failure(); + } + + int64_t startIndexMap = dimensionNumbers.getStartIndexMap()[0]; + auto collapsedSilceDims = dimensionNumbers.getCollapsedSliceDims(); + bool mapTocollapsedDim = false; + + for (auto dims : collapsedSilceDims) { + if (dims == startIndexMap) { + mapTocollapsedDim = true; + break; + } + } + // if the start index and offset index are disjoint, + // and the start index is generated by IotaOp, + // the output of gatherOp is equal to input. 
+ if (mapTocollapsedDim) { + rewriter.replaceOp(gatherOp, operand); + return success(); + } + return failure(); +} + void mlir::mhlo::populateCanonicalizeExtPatterns(RewritePatternSet &patterns, MLIRContext *ctx, bool blindFold) { @@ -1725,6 +1779,7 @@ void mlir::mhlo::populateCanonicalizeExtPatterns(RewritePatternSet &patterns, patterns.add(mlir::mhlo::simplifyCumsumToIota); patterns.add(mlir::mhlo::simplifyTransposeReshapeTranspose); patterns.add(mlir::mhlo::foldReverseWithConstant); + patterns.add(mlir::mhlo::foldGatherWithInput); if (blindFold) { patterns.add(mlir::mhlo::foldLargeConcatenate); } diff --git a/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp b/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp index 6be432b7d..6eac9b56c 100644 --- a/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp +++ b/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp @@ -67,6 +67,8 @@ bool isFusibleWith(Operation *target, Operation * /*start*/) { return true; } bool isValidSingleOp(Operation *op) { return true; } +bool isValidFusionPattern(const MhloFusionPattern &) { return true; } + bool isFusibleCandidateAggressive(Operation *op) { if (isa(op)) return true; @@ -99,14 +101,16 @@ bool isValidSingleOpAggressive(Operation *op) { } static GenericFuserConfig config{ - getByteIRCatFusionAttrName(), cat_fusion::isFusibleCandidate, - cat_fusion::isFusibleStart, cat_fusion::isFusibleTrigger, - cat_fusion::isFusibleWith, cat_fusion::isValidSingleOp}; + getByteIRCatFusionAttrName(), cat_fusion::isFusibleCandidate, + cat_fusion::isFusibleStart, cat_fusion::isFusibleTrigger, + cat_fusion::isFusibleWith, cat_fusion::isValidSingleOp, + cat_fusion::isValidFusionPattern}; static GenericFuserConfig aggressiveConfig{ - getByteIRCatFusionAttrName(), cat_fusion::isFusibleCandidateAggressive, - cat_fusion::isFusibleStart, cat_fusion::isFusibleTrigger, - cat_fusion::isFusibleWith, cat_fusion::isValidSingleOpAggressive}; + getByteIRCatFusionAttrName(), cat_fusion::isFusibleCandidateAggressive, 
+ cat_fusion::isFusibleStart, cat_fusion::isFusibleTrigger, + cat_fusion::isFusibleWith, cat_fusion::isValidSingleOpAggressive, + cat_fusion::isValidFusionPattern}; } // namespace cat_fusion diff --git a/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp b/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp index 0f7bd48f5..bca092517 100644 --- a/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp +++ b/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp @@ -1,4 +1,4 @@ -//===- ConvertRngToCustomCall.cpp -----------------------------*--- C++ -*-===// +//===- ConvertOpToCustomCall.cpp ------------------------------*--- C++ -*-===// // // Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. // Licensed under the Apache License, Version 2.0 (the "License"); @@ -74,6 +74,20 @@ func::CallOp getOrCreateCallGetSeedOp(func::FuncOp func, return callGetSeedOp; } +llvm::SmallVector getDefaultAttrs(PatternRewriter &rewriter) { + llvm::SmallVector attrs; + attrs.emplace_back(rewriter.getStringAttr("has_side_effect"), + rewriter.getBoolAttr(false)); + attrs.emplace_back(rewriter.getStringAttr("backend_config"), + rewriter.getStringAttr("")); + attrs.emplace_back(rewriter.getStringAttr("api_version"), + rewriter.getI32IntegerAttr(static_cast( + mhlo::CustomCallApiVersion::API_VERSION_ORIGINAL))); + attrs.emplace_back(rewriter.getStringAttr("called_computations"), + rewriter.getArrayAttr({})); + return attrs; +} + struct ConvertRngUniformToCustomCall : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -120,6 +134,76 @@ struct ConvertRngUniformToCustomCall : public OpRewritePattern { return success(); } }; + +struct ConvertFlashFwdToCustomCall + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mhlo::CustomCallOp op, + PatternRewriter &rewriter) const override { + auto opName = op.getCallTargetName(); + if (opName != getFlashAttnFwdName()) + 
return rewriter.notifyMatchFailure(op, "op name not match"); + + auto resultNum = op.getNumResults(); + if (resultNum != 4) + return rewriter.notifyMatchFailure(op, "op result num not match"); + auto q = op.getOperand(0); + auto k = op.getOperand(1); + auto v = op.getOperand(2); + Type outType = op.getResult(0).getType(); + Type softmaxLseType = op.getResult(1).getType(); + Type softmaxType = op.getResult(2).getType(); + + TensorType seedOrOffsetType = + RankedTensorType::get({}, rewriter.getI64Type()); + + ModuleOp module = op->getParentRegion()->getParentOfType(); + auto functionType = FunctionType::get(module.getContext(), {}, + ArrayRef{seedOrOffsetType}); + func::FuncOp getSeedFunc = getOrCreatePrivateFunctionDeclare( + module, "GetSeedFunc", "GetSeed", functionType); + func::FuncOp nextOffsetFunc = getOrCreatePrivateFunctionDeclare( + module, "NextOffsetFunc", "NextOffset", functionType); + + // avoid to call @getSeed every time + auto getSeedOp = getOrCreateCallGetSeedOp( + op->getParentRegion()->getParentOfType(), getSeedFunc, + rewriter); + auto getOffsetOp = rewriter.create( + op->getLoc(), nextOffsetFunc, ArrayRef{}); + + TensorType seedOrOffsetReshapedType = + RankedTensorType::get({1}, rewriter.getI64Type()); + TensorType rngStateType = RankedTensorType::get({2}, rewriter.getI64Type()); + auto reshapeSeedOp = rewriter.create( + op.getLoc(), seedOrOffsetReshapedType, getSeedOp.getResult(0)); + auto reshapeOffsetOp = rewriter.create( + op.getLoc(), seedOrOffsetReshapedType, getOffsetOp.getResult(0)); + + auto concatOp = rewriter.create( + op.getLoc(), rngStateType, + ValueRange{reshapeSeedOp.getResult(), reshapeOffsetOp.getResult()}, 0); + SmallVector bufferArgs{q, k, v, concatOp.getResult()}; + auto dictAttr = + op->template getAttrOfType(getCustomCallAttrName()); + auto attrs = getDefaultAttrs(rewriter); + attrs.emplace_back(rewriter.getStringAttr("call_target_name"), + rewriter.getStringAttr(getFlashAttnFwdName())); + 
attrs.emplace_back(rewriter.getStringAttr(getCustomCallAttrName()), + dictAttr); + auto customCallOp = rewriter.create( + op->getLoc(), ArrayRef{outType, softmaxLseType, softmaxType}, + bufferArgs, ArrayRef{attrs}); + Value outPad = customCallOp.getResult(0); + Value softmaxLse = customCallOp.getResult(1); + Value softmaxReturn = customCallOp.getResult(2); + ValueRange results{outPad, softmaxLse, softmaxReturn, concatOp.getResult()}; + rewriter.replaceOp(op, results); + return success(); + } +}; + struct ConvertOpToCustomCallPass : public ConvertOpToCustomCallBase { @@ -140,6 +224,7 @@ struct ConvertOpToCustomCallPass RewritePatternSet patterns(context); populateRngPatternToCustomCall(patterns); + populateFlashFwdRewritePattern(patterns); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); if (failed(applyPatternsAndFoldGreedily(funcOp, frozenPatterns))) { @@ -155,6 +240,10 @@ void mlir::populateRngPatternToCustomCall(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); } +void mlir::populateFlashFwdRewritePattern(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + std::unique_ptr> mlir::createConvertOpToCustomCallPass(llvm::StringRef anchor) { return std::make_unique(anchor); diff --git a/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp b/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp index 0caf95fbf..894c5866c 100644 --- a/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp +++ b/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp @@ -96,10 +96,13 @@ bool isValidSingleOp(Operation *op) { isCustomMhloRngOp(op); } +bool isValidFusionPattern(const MhloFusionPattern &) { return true; } + static GenericFuserConfig config{ getByteIRElementwiseFusionAttrName(), elementwise::isFusibleCandidate, elementwise::isFusibleStart, elementwise::isFusibleTrigger, - elementwise::isFusibleWith, elementwise::isValidSingleOp}; + elementwise::isFusibleWith, elementwise::isValidSingleOp, + 
elementwise::isValidFusionPattern}; } // namespace elementwise @@ -126,15 +129,85 @@ bool isFusibleWith(Operation * /*target*/, Operation * /*start*/) { bool isValidSingleOp(Operation *op) { return false; } +bool isValidFusionPattern(const MhloFusionPattern &) { return true; } + static GenericFuserConfig config{getByteIRMatmulEpilogueFusionAttrName(), matmul_epilogue::isFusibleCandidate, matmul_epilogue::isFusibleStart, matmul_epilogue::isFusibleTrigger, matmul_epilogue::isFusibleWith, - matmul_epilogue::isValidSingleOp}; + matmul_epilogue::isValidSingleOp, + matmul_epilogue::isValidFusionPattern}; } // namespace matmul_epilogue +namespace reduction { +// TODO: maybe we should support non-splat constant on device in future +bool isFusibleCandidate(Operation *op) { + return isMhlo(op) && (op->hasTrait<::mlir::OpTrait::Elementwise>() || + op->hasTrait() || + isSplatMhloConstantLike(op) || + isa(op)); +} + +// every candidate can start +bool isFusibleStart(Operation *op) { return true; } + +bool isFusibleTrigger(Operation *op) { + if (op->hasTrait<::mlir::OpTrait::Elementwise>() || + op->hasTrait() || + isa(op)) { + return true; + } + + // if broadcast, check whether its operand is only used in broadcast + if (isa(op)) { + auto src = op->getOperand(0); + // is foldable we just allow + if (isDeepMhloFoldable(src.getDefiningOp())) { + return true; + } + // otherwise, check it is only used in broadcast + // return useCount(src) == 1; + // LWC FIXME: change back to above after broadcast fusion resolve. 
+ return false; + } + + if (isa(op)) + return true; + + return false; +} + +bool isFusibleWith(Operation *target, Operation * /*start*/) { + return (target->hasTrait<::mlir::OpTrait::Elementwise>() || + target->hasTrait() || + isSplatMhloConstantLike(target) || + isa( + target)) && + target->hasOneUse(); +} + +bool isValidSingleOp(Operation *op) { return isa(op); } + +bool isValidFusionPattern(const MhloFusionPattern &pattern) { + SmallVector outputs = getOutputsOfCluster(pattern); + if (outputs.size() == 1) { + if (outputs[0].getDefiningOp()) + return true; + } + return false; +} + +static GenericFuserConfig config{ + getByteIRReductionFusionAttrName(), reduction::isFusibleCandidate, + reduction::isFusibleStart, reduction::isFusibleTrigger, + reduction::isFusibleWith, reduction::isValidSingleOp, + reduction::isValidFusionPattern}; + +} // namespace reduction + // a derived fusion pass for elementwise struct ElementwiseFusionPass : public GenericFusionPass { @@ -188,6 +261,29 @@ struct MatmulEpilogueFusionPass ::llvm::StringRef getName() const override { return "MatmulEpilogueFusion"; } }; +// a derived fusion pass for reduction fusion +struct ReductionFusionPass : public GenericFusionPass { + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReductionFusionPass) + + ReductionFusionPass() : GenericFusionPass(reduction::config, false) {} + + /// Returns the command-line argument attached to this pass. + static constexpr ::llvm::StringLiteral getArgumentName() { + return ::llvm::StringLiteral("fuse-reduction"); + } + ::llvm::StringRef getArgument() const override { return "fuse-reduction"; } + + ::llvm::StringRef getDescription() const override { + return "Fuse reduction with its producer"; + } + + /// Returns the derived pass name. 
+ static constexpr ::llvm::StringLiteral getPassName() { + return ::llvm::StringLiteral("ReductionFusion"); + } + ::llvm::StringRef getName() const override { return "ReductionFusion"; } +}; } // namespace std::unique_ptr> @@ -199,3 +295,7 @@ std::unique_ptr> mlir::createMatmulEpilogueFusionPass() { return std::make_unique(); } + +std::unique_ptr> mlir::createReductionFusionPass() { + return std::make_unique(); +} diff --git a/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp b/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp index 678d6bc9c..6db573e5f 100644 --- a/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp +++ b/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp @@ -47,12 +47,15 @@ bool isFusibleWith(Operation *, Operation *) { return true; } bool isValidSingleOp(Operation *) { return true; } +bool isValidFusionPattern(const MhloFusionPattern &) { return true; } + static GenericFuserConfig config{getByteIRHloAggressiveFusionAttrName(), aggressive_fusion::isFusibleCandidate, aggressive_fusion::isFusibleStart, aggressive_fusion::isFusibleTrigger, aggressive_fusion::isFusibleWith, - aggressive_fusion::isValidSingleOp}; + aggressive_fusion::isValidSingleOp, + aggressive_fusion::isValidFusionPattern}; } // namespace aggressive_fusion diff --git a/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp b/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp index 55b74ef26..7bcbd1371 100644 --- a/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp +++ b/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp @@ -59,12 +59,21 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern { LogicalResult matchAndRewrite(mhlo::TransposeOp op, PatternRewriter &rewriter) const override { auto value = op.getResult(); - auto operandType = op.getOperand().getType(); // T1 as Transpose: T1 -> T2 - // early termination if not allMultiUser nor multiUser but has multi users if (!allMultiUser && !multiUser && userCount(value) != 1) { 
return failure(); } + auto permutationAttr = op.getPermutation(); + + auto isTransposeWithSamePermutation = + [&permutationAttr](Value val) -> bool { + auto op = val.getDefiningOp(); + if (!op) { + return false; + } else { + return op.getPermutation() == permutationAttr; + } + }; llvm::SetVector users; for (auto user : value.getUsers()) { @@ -94,13 +103,19 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern { // isElementwiseOneResult(user) == true bool failed = false; for (auto operand : user->getOperands()) { - if (operand != value && !isSplatMhloConstantValue(operand)) { - if (allMultiUser) - return failure(); - failed = true; - break; + if (operand == value) { + continue; + } else if (isDenseMhloConstantValue(operand)) { + continue; + } else if (isTransposeWithSamePermutation(operand)) { + continue; } + if (allMultiUser) + return failure(); + failed = true; + break; } + if (failed) continue; users.insert(user); @@ -119,8 +134,10 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern { if (!bvm.contains(value)) { bvm.map(value, op.getOperand()); } + } else if (isTransposeWithSamePermutation(operand)) { + bvm.map(operand, operand.getDefiningOp().getOperand()); } else { - // isSplatMhloConstantValue(operand) == true + // isDenseMhloConstantValue(operand) == true // since it has been checked when collecting users if (!constInputs.contains(operand)) { constInputs.insert(operand); @@ -130,14 +147,19 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern { // create all const and put into bvm for (auto input : constInputs) { - ElementsAttr oldConstAttr = - input.getDefiningOp().getValue(); - auto newConstAttr = reshapeSplatElementsAttr(oldConstAttr, operandType); - auto newConstOp = - rewriter.create(op->getLoc(), *newConstAttr); - bvm.map(input, newConstOp.getOutput()); + SmallVector newPermutation(permutationAttr.size()); + std::for_each(permutationAttr.value_begin(), + permutationAttr.value_end(), + [i = 0, &newPermutation](auto e) 
mutable { + newPermutation[e.getSExtValue()] = (uint64_t)i++; + }); + auto newPermutationAttr = DenseIntElementsAttr::get( + permutationAttr.getType(), newPermutation); + auto ConstOp = input.getDefiningOp(); + auto newTransposeOp = rewriter.create( + ConstOp.getLoc(), ConstOp.getOutput(), newPermutationAttr); + bvm.map(input, newTransposeOp.getResult()); } - auto maybeResultTypes = mixTypes(/*cloneFromElementTypes*/ user->getResultTypes(), /*cloneFromShapes*/ op->getOperandTypes()); @@ -145,6 +167,8 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern { // maybeResultTypes should always have value assert(maybeResultTypes.has_value()); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfter(user); // clone an elementwise op as producer auto newProducer = cloneAndReplaceResultTypes(rewriter, user, bvm, *maybeResultTypes); diff --git a/compiler/lib/Pipelines/BufferizeOpt.cpp b/compiler/lib/Pipelines/BufferizeOpt.cpp index dcbb11485..4a5f2c5bf 100644 --- a/compiler/lib/Pipelines/BufferizeOpt.cpp +++ b/compiler/lib/Pipelines/BufferizeOpt.cpp @@ -23,6 +23,7 @@ #include "byteir/Transforms/Passes.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Transforms/Passes.h" #include "transforms/passes.h" @@ -37,6 +38,7 @@ void mlir::createByteIRBufferizeOptPipeline( pm.addPass(byteir::createOneShotBufferizePass()); addCleanUpExtPassPipeline(pm); + pm.addNestedPass(memref::createFoldMemRefAliasOpsPass()); // clean-up possible redundant copy from bufferization // perform twice, since cse is not greedy-based pm.addNestedPass(createRemoveCopyPass()); diff --git a/compiler/lib/Pipelines/ByreOpt.cpp b/compiler/lib/Pipelines/ByreOpt.cpp index 0e78f8742..ad4bba814 100644 --- a/compiler/lib/Pipelines/ByreOpt.cpp +++ b/compiler/lib/Pipelines/ByreOpt.cpp @@ -48,7 +48,10 @@ void createByreOptPipelineImpl(OpPassManager &pm, const std::string 
&entryFunc, OpPassManager anchoredPM(func::FuncOp::getOperationName()); if (!disableMemoryPlanning) { // underlying memory of constant op cannot be reused - anchoredPM.addPass(createMemoryPlanningPass(128, nullptr)); + anchoredPM.addPass(createMemoryPlanningPass(/* alignment */ 128, + /* alloca */ false, + /* memory space */ 0, + /* callback */ nullptr)); anchoredPM.addPass(createCanonicalizerPass()); } anchoredPM.addPass(createConvertMemrefToByrePass()); diff --git a/compiler/lib/Pipelines/GPU/CMakeLists.txt b/compiler/lib/Pipelines/GPU/CMakeLists.txt index 4ab1ad0a7..8eea9ad17 100644 --- a/compiler/lib/Pipelines/GPU/CMakeLists.txt +++ b/compiler/lib/Pipelines/GPU/CMakeLists.txt @@ -2,7 +2,9 @@ add_mlir_library(ByteIRGPUPipelines ElementwiseCodegen.cpp GPUOpt.cpp LinalgMemrefGPU.cpp + MappingForall.cpp NVVMCodegen.cpp + ReductionCodegen.cpp ADDITIONAL_HEADER_DIRS ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/GPU @@ -15,6 +17,7 @@ add_mlir_library(ByteIRGPUPipelines MLIRBufferTransforms LINK_LIBS PUBLIC + ByteIRGPUPasses ByteIRLinalgPasses ByteIRPipelineCommon ByteIRUtils @@ -22,4 +25,5 @@ add_mlir_library(ByteIRGPUPipelines ByteIRToPTX MLIRIR MLIRTransforms + MLIRLinalgExtTransformOps ) \ No newline at end of file diff --git a/compiler/lib/Pipelines/GPU/GPUOpt.cpp b/compiler/lib/Pipelines/GPU/GPUOpt.cpp index e5cf725b8..842901b04 100644 --- a/compiler/lib/Pipelines/GPU/GPUOpt.cpp +++ b/compiler/lib/Pipelines/GPU/GPUOpt.cpp @@ -20,10 +20,14 @@ #include "byteir/Conversion/ToGPU/ToGPU.h" #include "byteir/Conversion/ToPTX/ToPTX.h" #include "byteir/Dialect/Affine/Passes.h" +#include "byteir/Dialect/GPU/Passes.h" #include "byteir/Dialect/SCF/Passes.h" +#include "byteir/Dialect/Transform/Transforms/TransformDialectInterpreter.h" #include "byteir/Dialect/mhlo/Passes.h" #include "byteir/Pipelines/Common/Utils.h" +#include "byteir/Pipelines/GPU/MappingForall.h" #include "byteir/Transforms/Passes.h" +#include "byteir/Transforms/RemoveFuncBody.h" #include 
"mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" @@ -35,8 +39,9 @@ using namespace mlir; using namespace mlir::bufferization; namespace { -void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv, - const std::string &target) { +void createElementwiseGPUOptPipelineImpl(OpPassManager &pm, + const bool &useBarePtrCallConv, + const std::string &target) { // apply PromotoBufferStack to func's with // getByteIRElementwiseFusionAttrName { @@ -73,6 +78,36 @@ void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv, pm.addNestedPass(createGenPTXConfigPass(useBarePtrCallConv)); } +void createReductionGPUOptPipelineImpl(OpPassManager &pm) { + GPUMappingForallOptions options; + options.funcAnchor = getByteIRReductionFusionAttrName().str(); + createGPUMappingForallTransform(pm, options); + pm.addPass(createTransformDialectInterpreter(true)); + pm.addPass(createCSEPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createGpuLauchSinkIndexComputationsPass()); + + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + + anchoredPM.addPass(createPromoteBuffersToStackPass( + /*isSmallAlloc =*/[](Value value) { + return value.getParentRegion()->getParentOfType(); + })); + + pm.addNestedPass(createAnchoredPipelinePass( + getByteIRReductionFusionAttrName(), anchoredPM)); + } + pm.addPass(createGpuKernelOutliningPass()); +} + +void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv, + const std::string &target) { + createElementwiseGPUOptPipelineImpl(pm, useBarePtrCallConv, target); + createReductionGPUOptPipelineImpl(pm); + pm.addPass(createCollectGPUKernelPass("unified", false)); +} + } // namespace void mlir::createGPUOptPipeline(OpPassManager &pm, diff --git a/compiler/lib/Pipelines/GPU/MappingForall.cpp b/compiler/lib/Pipelines/GPU/MappingForall.cpp new file mode 100644 index 
000000000..633214bee --- /dev/null +++ b/compiler/lib/Pipelines/GPU/MappingForall.cpp @@ -0,0 +1,148 @@ +//===- MappingForall.cpp --------------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#include "byteir/Pipelines/GPU/MappingForall.h" + +#include "byteir/Conversion/ToGPU/ToGPU.h" +#include "byteir/Conversion/ToLLVM/ToLLVM.h" +#include "byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.h" +#include "byteir/Dialect/Transform/IR/TransformExtOps.h" +#include "byteir/Dialect/Transform/Transforms/TransformInsertion.h" +#include "byteir/Pipelines/Common/Utils.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/SmallSet.h" + +#include + +using namespace mlir; + +namespace { + +static constexpr int64_t kMaximumBlockDim = 1024; + +struct MappingForallConfig { + SmallVector blockDims; +}; + +// TODO: move to common helper +bool isMappedToGPUBlocks(scf::ForallOp forallOp) { + if (auto mapping = 
forallOp.getMappingAttr()) { + if (llvm::any_of(mapping.getValue(), [](Attribute attr) { + return isa(attr); + })) { + return true; + } + } + + return false; +} + +bool isMappedToGPUThreads(scf::ForallOp forallOp) { + if (auto mapping = forallOp.getMappingAttr()) { + if (llvm::any_of(mapping.getValue(), [](Attribute attr) { + return isa(attr); + })) { + return true; + } + } + + return false; +} + +void updateBlockDims(scf::ForallOp forallOp, SmallVector &blockDims) { + for (auto &&[lb, ub, step, mappingAttr] : llvm::zip( + forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(), + forallOp.getMixedStep(), forallOp.getMappingAttr().getValue())) { + if (auto threadMapping = + llvm::dyn_cast_or_null(mappingAttr)) { + auto numIterations = constantTripCount(lb, ub, step); + auto threadIdx = threadMapping.getMappingId(); + if (numIterations.has_value()) { + blockDims[threadIdx] = + std::max(blockDims[threadIdx], numIterations.value()); + } + } + } +} + +std::optional +getMappingForallConfig(scf::ForallOp forallOp) { + if (!isMappedToGPUBlocks(forallOp)) + return std::nullopt; + + SmallVector blockDims{1, 1, 1}; + auto &&block = forallOp.getRegion().front(); + for (auto &&nestedForall : block.getOps()) { + if (isMappedToGPUThreads(nestedForall)) { + updateBlockDims(nestedForall, blockDims); + } + } + + if (blockDims[0] * blockDims[1] * blockDims[2] > kMaximumBlockDim) { + return std::nullopt; + } + return MappingForallConfig{blockDims}; +} + +void createGPUMappingForallTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (auto forallOp = llvm::dyn_cast_or_null(op)) { + return getMappingForallConfig(forallOp).has_value(); + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + auto mappingConfig = + 
getMappingForallConfig(llvm::cast(op)).value(); + auto pdlType = pdl::OperationType::get(b.getContext()); + auto launchOp = b.create( + /* result type */ pdlType, + /* target */ pdlV, + /* grid_dims */ llvm::ArrayRef{}, + /* generate_gpu_launch */ true); + + b.create( + /* result type*/ pdlType, + /* target */ launchOp.getResult(), + /* block_dims */ mappingConfig.blockDims, + /* warp_dims */ llvm::ArrayRef{}, + /* sync_after_distribute*/ true); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} +} // namespace + +void mlir::createGPUMappingForallTransform( + OpPassManager &pm, const GPUMappingForallOptions &options) { + invokeOpPassPipelineBuilder(createGPUMappingForallTransformImpl, pm, + options.funcAnchor, options.annotatePrefix); +} diff --git a/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp b/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp index aecfc5b84..d23567a58 100644 --- a/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp +++ b/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp @@ -19,6 +19,7 @@ #include "byteir/Conversion/GPUToNVVM/GPUToNVVM.h" #include "byteir/Conversion/ToPTX/ToPTX.h" +#include "byteir/Dialect/GPU/Passes.h" #include "byteir/Dialect/MemRef/Transforms/ExtractAddressComputation.h" #include "byteir/Dialect/MemRef/Transforms/SimplifyLinearizedIndex.h" #include "byteir/Dialect/mhlo/Passes.h" @@ -39,6 +40,9 @@ void createNVVMCodegenPipelineImpl(OpPassManager &pm, // TODO add target for supporting different SMs // TODO use target to decide passes pm.addPass(createCollectGPUKernelPass()); + pm.addNestedPass(createShmAllocaToWorkgroupArg()); + pm.addPass(createCSEPass()); + pm.addPass(createCanonicalizerPass()); pm.addPass(createConvertSCFToCFPass()); pm.addPass(createExtractAddressComputationPass()); pm.addPass(memref::createExpandStridedMetadataPass()); diff --git a/compiler/lib/Pipelines/GPU/ReductionCodegen.cpp b/compiler/lib/Pipelines/GPU/ReductionCodegen.cpp new file mode 100644 index 000000000..664fce40e --- /dev/null +++ 
b/compiler/lib/Pipelines/GPU/ReductionCodegen.cpp @@ -0,0 +1,942 @@ +//===- ReductionCodegen.cpp ---------------------------------*--- C++ -*-===// +// +// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#include "byteir/Pipelines/GPU/ReductionCodegen.h" + +#include "byteir/Conversion/ToGPU/ToGPU.h" +#include "byteir/Conversion/ToLLVM/ToLLVM.h" +#include "byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.h" +#include "byteir/Dialect/Transform/IR/TransformExtOps.h" +#include "byteir/Dialect/Transform/Transforms/TransformInsertion.h" +#include "byteir/Pipelines/Common/Utils.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/SmallSet.h" + +#include + +using namespace mlir; + +namespace { +//----------------------------------------------------------------------------// +// common helpers 
+//----------------------------------------------------------------------------// +// TODO: move to common header + +constexpr bool isPowerOf2(int64_t n) { return (!(n & (n - 1))); } + +constexpr int64_t nextPowerOf2(int64_t n) { + return (n <= 1) ? 1 : (isPowerOf2(n) ? n : (2 * nextPowerOf2((n + 1) / 2))); +} + +bool isMappedToGPUBlocks(scf::ForOp forOp) { + if (auto loopToSIMTAttr = + forOp->getAttrOfType(getLoopToSIMTAttrName())) { + auto mappingTo = loopToSIMTAttr.getValue(); + if (mappingTo == getBlockIdXName() || mappingTo == getBlockIdYName() || + mappingTo == getBlockIdZName()) { + return true; + } + } + return false; +} + +bool isMappedToGPUBlocks(scf::ForallOp forallOp) { + if (auto mapping = forallOp.getMappingAttr()) { + if (llvm::any_of(mapping.getValue(), [](Attribute attr) { + return isa(attr); + })) { + return true; + } + } + + return false; +} + +bool isMappedToGPUBlocks(Operation *op) { + if (auto forOp = llvm::dyn_cast_or_null(op)) { + return isMappedToGPUBlocks(forOp); + } + if (auto forallOp = llvm::dyn_cast_or_null(op)) { + return isMappedToGPUBlocks(forallOp); + } + return false; +} + +bool isMappedToGPUThreads(scf::ForOp forOp) { + if (auto loopToSIMTAttr = + forOp->getAttrOfType(getLoopToSIMTAttrName())) { + auto mappingTo = loopToSIMTAttr.getValue(); + if (mappingTo == getThreadIdXName() || mappingTo == getThreadIdYName() || + mappingTo == getThreadIdZName()) { + return true; + } + } + return false; +} + +bool isMappedToGPUThreads(scf::ForallOp forallOp) { + if (auto mapping = forallOp.getMappingAttr()) { + if (llvm::any_of(mapping.getValue(), [](Attribute attr) { + return isa(attr); + })) { + return true; + } + } + + return false; +} + +bool isMappedToGPUThreads(Operation *op) { + if (auto forOp = llvm::dyn_cast_or_null(op)) { + return isMappedToGPUThreads(forOp); + } + if (auto forallOp = llvm::dyn_cast_or_null(op)) { + return isMappedToGPUThreads(forallOp); + } + return false; +} + +uint64_t getNumTiledLoops(ArrayRef tileSizes) { + 
return llvm::count_if(tileSizes, + [](int64_t tileSize) { return tileSize > 0; }); +} + +std::optional getReductionDim(linalg::GenericOp genericOp) { + SmallVector reductionDims; + genericOp.getReductionDims(reductionDims); + if (reductionDims.size() == 1) { + return reductionDims[0]; + } + return std::nullopt; +} + +std::optional getOperandReductionDim(OpOperand &operand) { + auto genericOp = llvm::dyn_cast(operand.getOwner()); + if (!genericOp) + return std::nullopt; + + auto dim = getReductionDim(genericOp); + if (!dim.has_value()) + return std::nullopt; + + auto affineMap = genericOp.getIndexingMapsArray()[operand.getOperandNumber()]; + if (!affineMap || !affineMap.isProjectedPermutation()) + return std::nullopt; + + for (auto &&en : llvm::enumerate(affineMap.getResults())) { + if (auto dimExpr = en.value().dyn_cast()) { + if (dimExpr.getPosition() == *dim) { + return en.index(); + } + } + } + + return std::nullopt; +} + +SmallVector getDynamicDims(linalg::GenericOp genericOp) { + auto staticLoopRanges = genericOp.getStaticLoopRanges(); + SmallVector ret; + for (int64_t i = 0; i < staticLoopRanges.size(); ++i) { + if (ShapedType::isDynamic(staticLoopRanges[i])) { + ret.push_back(i); + } + } + return ret; +} + +//----------------------------------------------------------------------------// +// configuration structs +//----------------------------------------------------------------------------// + +static constexpr StringLiteral kGridReduction = "__grid_reduction__"; +static constexpr StringLiteral kBlockReduction = "__block_reduction__"; +static constexpr StringLiteral kWarpReduction = "__warp_reduction__"; +static constexpr StringLiteral kThreadReduction = "__thread_reduction__"; + +struct ProducerSelector { + uint64_t operandNumber; + llvm::StringRef opName; + std::vector producerSelectors; + + ProducerSelector(uint64_t operandNumber, llvm::StringRef opName) + : operandNumber(operandNumber), opName(opName) {} + + static bool detectFillOperand(OpOperand 
*opOperand, + std::vector &selectors) { + if (opOperand->get().getDefiningOp()) { + selectors.emplace_back(opOperand->getOperandNumber(), + linalg::FillOp::getOperationName()); + return true; + } + return false; + } + + static bool detectPadOperand(OpOperand *opOperand, + std::vector &selectors) { + Operation *definingOp = opOperand->get().getDefiningOp(); + if (!definingOp) + return false; + + if (llvm::isa(definingOp)) { + ProducerSelector selector(opOperand->getOperandNumber(), + definingOp->getName().getStringRef()); + if (detectPadOperand(&definingOp->getOpOperand(0), + selector.producerSelectors)) { + selectors.emplace_back(std::move(selector)); + return true; + } + } else if (llvm::isa(definingOp)) { + selectors.emplace_back(opOperand->getOperandNumber(), + tensor::PadOp::getOperationName()); + return true; + } + return false; + } +}; + +struct GridSplitConfig { + int64_t splitFactor; + int64_t dimension; + + void apply(ImplicitLocOpBuilder &b, Value pdlV); +}; + +struct GridTileConfig { + SmallVector tileSizes; + SmallVector mapping; + std::vector fuseCandidates; + + void apply(ImplicitLocOpBuilder &b, Value pdlV, bool usingForall); +}; + +struct BlockSplitConfig { + SmallVector splitFactors; + SmallVector dimensions; + SmallVector padDims; + SmallVector padValues; + + void apply(ImplicitLocOpBuilder &b, Value pdlV); +}; + +struct BlockTileConfig { + SmallVector tileSizes; + SmallVector mapping; + std::vector fuseCandidates; + + void apply(ImplicitLocOpBuilder &b, Value pdlV, bool usingForall); +}; + +struct ThreadTileConfig { + SmallVector parallelTileSizes; + SmallVector reductionTileSizes; + SmallVector unrollFactors; + std::vector initOperands; + + void apply(ImplicitLocOpBuilder &b, Value pdlV); +}; + +void processProducerSelectors( + ImplicitLocOpBuilder &b, + const std::vector &producerSelectors, Value fuseInto, + SmallVector &selected, Type producerType = nullptr) { + for (auto selector : producerSelectors) { + auto producer = b.create( + /* 
producer type */ producerType + ? producerType + : transform::OperationType::get(b.getContext(), selector.opName), + /* target */ fuseInto, + /* operand number */ selector.operandNumber); + selected.push_back(producer.getProducer()); + processProducerSelectors(b, selector.producerSelectors, selected.back(), + selected); + } +} + +void tileToForallAndFuseImpl( + ImplicitLocOpBuilder &b, Value toTile, + const SmallVector &tileSizes, + const SmallVector &mapping, + const std::vector &fuseCandidates) { + SmallVector toBeFused; + processProducerSelectors(b, fuseCandidates, toTile, toBeFused); + + auto tileOp = b.create( + /* target */ toTile, + /* staticTileSizes */ tileSizes, + /* ctor tag */ transform::TileSizesSpec(), + /* mapping */ b.getArrayAttr(mapping)); + for (auto &&producerOp : toBeFused) { + b.create( + /* producerOp */ producerOp, + /* containingOp */ tileOp.getForallOp()); + } +} + +void tileToSCFForAndFuseImpl(ImplicitLocOpBuilder &b, Value toTile, + const SmallVector &tileSizes, + const SmallVector &mapping) { + auto pdlType = pdl::OperationType::get(b.getContext()); + auto fuseOp = b.create( + /* transformed */ pdlType, + /* loops */ + SmallVector(getNumTiledLoops(tileSizes), pdlType), + /* target */ toTile, + /* tile_sizes */ b.getI64ArrayAttr(tileSizes), + /* tile_interchange */ ArrayAttr()); + for (auto &&[loop, mapTo] : llvm::zip(fuseOp.getLoops(), mapping)) { + Value paramV = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ mapTo); + b.create( + /* target */ loop, + /* name */ getLoopToSIMTAttrName(), + /* param */ paramV); + } +} + +void GridSplitConfig::apply(ImplicitLocOpBuilder &b, Value pdlV) { + if (splitFactor) { + auto splitted = b.create( + /* target */ pdlV, + /* splitFactor */ splitFactor, + /* insertSplitDimension */ dimension, + /* innerParallel */ false, + /* useScalingAlgorithm */ false, + /* useAlloc */ false); + b.create( + /* target */ splitted.getSplitLinalgOp(), + /* name */ kGridReduction, + /* 
param */ Value()); + b.create( + /* target */ splitted.getCombiningLinalgOp(), + /* name */ kGridReduction, + /* param */ Value()); + } else { + b.create( + /* target */ pdlV, + /* name */ kGridReduction, + /* param */ Value()); + } +} + +void GridTileConfig::apply(ImplicitLocOpBuilder &b, Value pdlV, + bool usingForall) { + if (usingForall) { + auto mappingAttrs = llvm::to_vector( + llvm::map_range(mapping, [&](gpu::Blocks dim) -> Attribute { + return gpu::GPUBlockMappingAttr::get(b.getContext(), dim); + })); + tileToForallAndFuseImpl(b, pdlV, tileSizes, mappingAttrs, fuseCandidates); + } else { + static constexpr std::array mappings{ + getBlockIdXName(), getBlockIdYName(), getBlockIdZName()}; + auto mappingAttrs = llvm::to_vector( + llvm::map_range(mapping, [&](gpu::Blocks dim) -> Attribute { + return b.getStringAttr(mappings[static_cast(dim)]); + })); + tileToSCFForAndFuseImpl(b, pdlV, tileSizes, mappingAttrs); + } +} + +void BlockSplitConfig::apply(ImplicitLocOpBuilder &b, Value pdlV) { + if (!padDims.empty()) { + auto padOp = b.create( + TypeRange{pdlV.getType(), pdlV.getType()}, pdlV, + /*padding_values=*/b.getArrayAttr(padValues), + /*padding_dimensions=*/ + b.getI64ArrayAttr(padDims), + /*padToMultipleOf=*/ArrayAttr{}, + /*pack_paddings=*/ArrayAttr{}, + /*transpose_paddings=*/ArrayAttr{}, + /*copyBack=*/false); + pdlV = padOp.getPadded(); + } + if (!splitFactors.empty()) { + Value toSplit = pdlV; + for (auto &&[splitFactor, redDim] : llvm::zip(splitFactors, dimensions)) { + auto splitted = b.create( + /* target */ toSplit, + /* splitFactor */ splitFactor, + /* insertSplitDimension */ redDim, + /* innerParallel */ false, + /* useScalingAlgorithm */ false, + /* useAlloc */ false); + b.create( + /* target */ splitted.getInitOrAllocOp(), + /* name */ kBlockReduction, + /* param */ Value()); + b.create( + /* target */ splitted.getCombiningLinalgOp(), + /* name */ kBlockReduction, + /* param */ Value()); + toSplit = splitted.getCombiningLinalgOp(); + } + pdlV = 
toSplit; + } else { + b.create( + /* target */ pdlV, + /* name */ kBlockReduction, + /* param */ Value()); + } + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false); + b.create(func, [](OpBuilder &b, Location loc) { + b.create(loc); + }); + auto forall = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ false, + /* op_name */ b.getStringAttr(scf::ForallOp::getOperationName()), + /* deduplicate */ false); + if (!padDims.empty()) { + auto parallelInsertSliceType = transform::OperationType::get( + b.getContext(), tensor::ParallelInsertSliceOp::getOperationName()); + auto parallelInsertSlice = b.create( + parallelInsertSliceType, forall, + tensor::ParallelInsertSliceOp::getOperationName()); + b.create(pdlV.getType(), + parallelInsertSlice); + } + auto emptyTensorType = transform::OperationType::get( + b.getContext(), tensor::EmptyOp::getOperationName()); + auto emptyTensor = b.create( + emptyTensorType, forall, tensor::EmptyOp::getOperationName()); + auto allocTensorType = transform::OperationType::get( + b.getContext(), bufferization::AllocTensorOp::getOperationName()); + auto allocTensor = b.create( + allocTensorType, emptyTensor); + auto memorySpaceAttrName = + bufferization::AllocTensorOp::getMemorySpaceAttrName(OperationName( + bufferization::AllocTensorOp::getOperationName(), b.getContext())); + auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get( + b.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace()); + Value paramV = b.create( + /* type */ pdl::AttributeType::get(b.getContext()), + /* value */ workgroupMemoryAddressSpace); + b.create( + /* target */ allocTensor, + /* name */ memorySpaceAttrName, + /* param */ paramV); +} + +void BlockTileConfig::apply(ImplicitLocOpBuilder &b, Value pdlV, + bool usingForall) { + if (usingForall) { + auto mappingAttrs = llvm::to_vector( + llvm::map_range(mapping, [&](gpu::Threads 
dim) -> Attribute { + return gpu::GPUThreadMappingAttr::get(b.getContext(), dim); + })); + tileToForallAndFuseImpl(b, pdlV, tileSizes, mappingAttrs, fuseCandidates); + } else { + static constexpr std::array mappings{ + getThreadIdXName(), getThreadIdYName(), getThreadIdZName()}; + auto mappingAttrs = llvm::to_vector( + llvm::map_range(mapping, [&](gpu::Threads dim) -> Attribute { + return b.getStringAttr(mappings[static_cast(dim)]); + })); + tileToSCFForAndFuseImpl(b, pdlV, tileSizes, mappingAttrs); + } +} + +void ThreadTileConfig::apply(ImplicitLocOpBuilder &b, Value pdlV) { + auto pdlType = pdl::OperationType::get(b.getContext()); + auto numTiledParallelLoops = getNumTiledLoops(parallelTileSizes); + SmallVector loops; + if (numTiledParallelLoops > 0) { + auto fuseOp = b.create( + /* transformed */ pdlType, + /* loops */ + SmallVector(getNumTiledLoops(parallelTileSizes), pdlType), + /* target */ pdlV, + /* tile_sizes */ b.getI64ArrayAttr(parallelTileSizes), + /* tile_interchange */ ArrayAttr()); + loops = fuseOp.getLoops(); + pdlV = fuseOp.getTransformed(); + } + + auto tileOp = b.create( + /* target */ pdlV, + /* tillSizes */ reductionTileSizes); + loops.push_back(tileOp.getLoops()[0]); + for (auto &&[loop, factor] : llvm::reverse(llvm::zip(loops, unrollFactors))) { + b.create(loop, factor); + } +} + +//----------------------------------------------------------------------------// +// codegen strategies +//----------------------------------------------------------------------------// + +bool isReductionOp(linalg::GenericOp genericOp) { + if (genericOp.getNumReductionLoops() != 1) + return false; + + if (!llvm::all_of(genericOp.getIndexingMapsArray(), [](AffineMap affineMap) { + return affineMap.isProjectedPermutation(/* allowZeroInResults */ false); + })) + return false; + + return true; +} + +bool isGridReductionOp(linalg::GenericOp genericOp) { + if (!isReductionOp(genericOp)) + return false; + + // early return for manual tag + if 
(genericOp->hasAttr(kGridReduction)) + return true; + + // top level generic op in function + if (genericOp->getParentOfType()) + return true; + + return false; +} + +bool isBlockReductionOp(linalg::GenericOp genericOp) { + if (!isReductionOp(genericOp)) + return false; + + // early return for manual tag + if (genericOp->hasAttr(kBlockReduction)) + return true; + + // nested in op which is mapped to GPU blocks + if (isMappedToGPUBlocks(genericOp->getParentOp())) + return true; + + return false; +} + +bool isThreadReductionOp(linalg::GenericOp genericOp) { + if (!isReductionOp(genericOp)) + return false; + + // early return for manual tag + if (genericOp->hasAttr(kThreadReduction)) + return true; + + // nested in op which is mapped to GPU threads + if (isMappedToGPUThreads(genericOp->getParentOp())) + return true; + + return false; +} + +std::optional getGridSplitConfig(linalg::GenericOp genericOp, + int64_t splitFactor) { + if (!isGridReductionOp(genericOp)) + return std::nullopt; + + auto redDim = *getReductionDim(genericOp); + auto staticLoopRanges = genericOp.getStaticLoopRanges(); + if (ShapedType::isDynamic(staticLoopRanges[redDim]) || + staticLoopRanges[redDim] % splitFactor != 0 || + staticLoopRanges[redDim] <= 1024) + return std::nullopt; + + return GridSplitConfig{splitFactor, redDim ? 
redDim - 1 : redDim}; +} + +std::optional getGridTileConfig(linalg::GenericOp genericOp, + int64_t warpSize, + int64_t blockSize) { + if (!isGridReductionOp(genericOp)) + return std::nullopt; + + int64_t numLoops = genericOp.getNumLoops(); + SmallVector tileSizes(numLoops, 1); + auto loopSizes = + cast(genericOp.getOperation()).computeStaticLoopSizes(); + + for (auto &&affineMap : genericOp.getIndexingMapsArray()) { + if (affineMap.isPermutation()) { + auto dim = affineMap.getDimPosition(numLoops - 1); + if (loopSizes[dim] > warpSize) { // TODO: padding + tileSizes[dim] *= warpSize; + break; + } + } + } + + auto redDim = getReductionDim(genericOp).value(); + tileSizes[redDim] = 0; + + std::vector fuseCandidates; + for (OpOperand *opOperand : genericOp.getDpsInitOperands()) { + ProducerSelector::detectFillOperand(opOperand, fuseCandidates); + } + + auto numTiledLoops = getNumTiledLoops(tileSizes); + if (numTiledLoops >= 1 && numTiledLoops <= 3) { + SmallVector mapping(numLoops, -1); + int64_t dimMapping = static_cast(gpu::Blocks::DimX); + for (auto &&affineMap : genericOp.getIndexingMapsArray()) { + if (affineMap.isPermutation()) { + for (int64_t i = numLoops - 1; i >= 0; i--) { + auto dim = affineMap.getDimPosition(i); + if (tileSizes[dim] > 0) { + mapping[dim] = dimMapping++; + } + } + break; + } + } + mapping.erase(std::remove(mapping.begin(), mapping.end(), -1), + mapping.end()); + if (mapping.size() != numTiledLoops) + return std::nullopt; + + return GridTileConfig{ + tileSizes, + llvm::to_vector(llvm::map_range( + mapping, [](int64_t i) { return static_cast(i); })), + fuseCandidates}; + } + return std::nullopt; +} + +std::optional getBlockSplitConfig(linalg::GenericOp genericOp, + int64_t splitFactor, + int64_t warpSize) { + if (!isBlockReductionOp(genericOp)) + return std::nullopt; + + SmallVector padDims = getDynamicDims(genericOp); + SmallVector padValues; + + SmallVector splitFactors; + SmallVector dimensions; + auto redDim = *getReductionDim(genericOp); + 
auto staticLoopRanges = genericOp.getStaticLoopRanges(); + if (ShapedType::isDynamic(staticLoopRanges[redDim])) + return std::nullopt; + + if (auto redPos = getOperandReductionDim(*genericOp.getDpsInputOperand(0))) { + if (redPos.value() == genericOp.getNumLoops() - 1) { + auto newSplitFactor = splitFactor * 2; + while (staticLoopRanges[redDim] % newSplitFactor == 0 && + newSplitFactor <= splitFactor * warpSize) { + newSplitFactor *= 2; + } + splitFactor = newSplitFactor / 2; + } + } + + if (staticLoopRanges[redDim] < splitFactor) { + splitFactor = staticLoopRanges[redDim]; + } else { + if (staticLoopRanges[redDim] % splitFactor != 0) + return std::nullopt; + + splitFactors.push_back(splitFactor); + dimensions.push_back(redDim ? redDim - 1 : redDim); + } + + mlir::Builder b(genericOp.getContext()); + for (auto &&operand : genericOp->getOperands()) { + if (auto shapedType = llvm::dyn_cast(operand.getType())) { + padValues.push_back(b.getZeroAttr(shapedType.getElementType())); + } else { + return std::nullopt; + } + } + + for (; splitFactor > 2; splitFactor >>= 1) { + splitFactors.push_back(splitFactor / 2); + dimensions.push_back(redDim ? 
redDim - 1 : redDim); + } + + return BlockSplitConfig{splitFactors, dimensions, padDims, padValues}; +} + +std::optional getBlockTileConfig(linalg::GenericOp genericOp, + int64_t warpSize, + int64_t blockSize) { + if (!isBlockReductionOp(genericOp)) + return std::nullopt; + + int64_t numLoops = genericOp.getNumLoops(); + SmallVector tileSizes(numLoops, 0); + auto loopSizes = + cast(genericOp.getOperation()).computeStaticLoopSizes(); + + int64_t remainBlockSize = blockSize; + auto redDim = getReductionDim(genericOp).value(); + for (int64_t idx = 0; idx < numLoops && remainBlockSize > 1; ++idx) { + if (idx == redDim) + continue; + int64_t curLoopSize2 = nextPowerOf2(loopSizes[idx]); + int64_t curBlockSize = std::min(curLoopSize2, remainBlockSize); + tileSizes[idx] = curLoopSize2 / curBlockSize; + remainBlockSize /= curBlockSize; + } + + if (remainBlockSize == blockSize) { + tileSizes[redDim] = loopSizes[redDim]; + } + + std::vector fuseCandidates; + for (OpOperand *opOperand : genericOp.getDpsInputOperands()) { + ProducerSelector::detectPadOperand(opOperand, fuseCandidates); + } + for (OpOperand *opOperand : genericOp.getDpsInitOperands()) { + ProducerSelector::detectFillOperand(opOperand, fuseCandidates); + } + + auto numTiledLoops = getNumTiledLoops(tileSizes); + if (numTiledLoops >= 1 && numTiledLoops <= 3) { + SmallVector mapping(numLoops, -1); + int64_t dimMapping = static_cast(gpu::Threads::DimX); + for (auto &&affineMap : genericOp.getIndexingMapsArray()) { + if (affineMap.isPermutation()) { + for (int64_t i = numLoops - 1; i >= 0; i--) { + auto dim = affineMap.getDimPosition(i); + if (tileSizes[dim] > 0) { + mapping[dim] = dimMapping++; + } + } + break; + } + } + mapping.erase(std::remove(mapping.begin(), mapping.end(), -1), + mapping.end()); + if (mapping.size() != numTiledLoops) + return std::nullopt; + + return BlockTileConfig{ + tileSizes, + llvm::to_vector(llvm::map_range( + mapping, [](int64_t i) { return static_cast(i); })), + fuseCandidates}; + } + 
return std::nullopt; +} + +std::optional +getThreadTileConfig(linalg::GenericOp genericOp) { + if (!isThreadReductionOp(genericOp)) + return std::nullopt; + + int64_t numLoops = genericOp.getNumLoops(); + SmallVector parallelTileSizes(numLoops, 1); + SmallVector reductionTileSizes(numLoops, 0); + auto reductionDim = *getReductionDim(genericOp); + + parallelTileSizes[reductionDim] = 0; + reductionTileSizes[reductionDim] = 1; + + SmallVector unrollFactors = + cast(genericOp.getOperation()).computeStaticLoopSizes(); + + std::vector initOperands; + for (OpOperand *opOperand : genericOp.getDpsInitOperands()) { + ProducerSelector::detectFillOperand(opOperand, initOperands); + } + + return ThreadTileConfig{parallelTileSizes, reductionTileSizes, unrollFactors, + initOperands}; +} + +//----------------------------------------------------------------------------// +// transform insertion impl +//----------------------------------------------------------------------------// + +void createGPUSplitGridReductionTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix, + int64_t splitFactor) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (auto genericOp = llvm::dyn_cast_or_null(op)) { + return getGridSplitConfig(genericOp, splitFactor).has_value(); + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + auto splitConfig = + getGridSplitConfig(llvm::cast(op), splitFactor) + .value(); + splitConfig.apply(b, pdlV); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +void createGPUTileGridReductionTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + int64_t warpSize, int64_t blockSize, bool usingForall) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) 
{ + if (auto genericOp = llvm::dyn_cast_or_null(op)) { + return getGridTileConfig(genericOp, warpSize, blockSize).has_value(); + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + auto tileConfig = getGridTileConfig(llvm::cast(op), + warpSize, blockSize) + .value(); + tileConfig.apply(b, pdlV, usingForall); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +void createGPUSplitBlockReductionTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix, + int64_t splitFactor, + int64_t warpSize) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (auto genericOp = llvm::dyn_cast_or_null(op)) { + return getBlockSplitConfig(genericOp, splitFactor, warpSize).has_value(); + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + auto splitConfig = getBlockSplitConfig(llvm::cast(op), + splitFactor, warpSize) + .value(); + splitConfig.apply(b, pdlV); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +void createGPUTileBlockReductionTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + int64_t warpSize, int64_t blockSize, bool usingForall) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (auto genericOp = llvm::dyn_cast_or_null(op)) { + return getBlockTileConfig(genericOp, warpSize, blockSize).has_value(); + } else if (auto copyOp = llvm::dyn_cast_or_null(op)) { + return copyOp.getNumLoops() == 1; + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + if (auto genericOp = llvm::dyn_cast_or_null(op)) { + auto tileConfig = getBlockTileConfig(llvm::cast(op), + warpSize, blockSize) + .value(); + 
tileConfig.apply(b, pdlV, usingForall); + } else if (auto copyOp = llvm::dyn_cast_or_null(op)) { + auto tileOp = b.create( + /* target */ pdlV, + /* staticTileSizes */ SmallVector(1, blockSize), + /* ctor tag */ transform::NumThreadsSpec(), + /* mapping */ + b.getArrayAttr(gpu::GPULinearIdMappingAttr::get( + b.getContext(), gpu::LinearId::DimX))); + } + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} + +void createGPUTileThreadReductionTransformImpl(OpPassManager &pm, + const std::string &anchor, + const std::string &prefix) { + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + config.opFilter = [=](Operation *op) { + if (auto genericOp = llvm::dyn_cast_or_null(op)) { + return getThreadTileConfig(genericOp).has_value(); + } + return false; + }; + + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, + Value pdlV) { + auto tileConfig = + getThreadTileConfig(llvm::cast(op)).value(); + tileConfig.apply(b, pdlV); + }; + + pm.addPass(createGenericTransformInsertionPass(config)); +} +} // namespace + +void mlir::createGPUSplitGridReductionTransform( + OpPassManager &pm, const GPUSplitGridReductionOptions &options) { + invokeOpPassPipelineBuilder(createGPUSplitGridReductionTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, + options.splitFactor); +} + +void mlir::createGPUTileGridReductionTransform( + OpPassManager &pm, const GPUTileGridReductionOptions &options) { + invokeOpPassPipelineBuilder(createGPUTileGridReductionTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, + options.warpSize, options.blockSize, + options.usingForall); +} + +void mlir::createGPUSplitBlockReductionTransform( + OpPassManager &pm, const GPUSplitBlockReductionOptions &options) { + invokeOpPassPipelineBuilder(createGPUSplitBlockReductionTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, + options.splitFactor, options.warpSize); +} + +void 
mlir::createGPUTileBlockReductionTransform( + OpPassManager &pm, const GPUTileBlockReductionOptions &options) { + invokeOpPassPipelineBuilder(createGPUTileBlockReductionTransformImpl, pm, + options.funcAnchor, options.annotatePrefix, + options.warpSize, options.blockSize, + options.usingForall); +} + +void mlir::createGPUTileThreadReductionTransform( + OpPassManager &pm, const GPUTileThreadReductionOptions &options) { + invokeOpPassPipelineBuilder(createGPUTileThreadReductionTransformImpl, pm, + options.funcAnchor, options.annotatePrefix); +} diff --git a/compiler/lib/Pipelines/HloOpt.cpp b/compiler/lib/Pipelines/HloOpt.cpp index b80e815de..7aa45267e 100644 --- a/compiler/lib/Pipelines/HloOpt.cpp +++ b/compiler/lib/Pipelines/HloOpt.cpp @@ -21,6 +21,7 @@ #include "byteir/Pipelines/Common/Utils.h" #include "byteir/Transforms/CanonicalizeExt.h" #include "mhlo/transforms/passes.h" +#include "mlir/Dialect/Func/Transforms/Passes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Transforms/Passes.h" @@ -50,6 +51,7 @@ void addGenericHloFusionPatterns(OpPassManager &pm, const std::string &entry, pm.addPass(createCSEPass()); pm.addNestedPass(createFlattenTuplePass()); + pm.addNestedPass(createReductionFusionPass()); // Element fusion (always last?) 
// Note: if outlineSingleElemwiseOp is set, element fusion must be the last // pass, since it will cluster every elemenwise op which is not fused yet into @@ -106,6 +108,7 @@ void createHloOptPipelineImpl(OpPassManager &pm, const std::string &entryFunc, pm.addPass(createCSEPass()); pm.addPass(createCanonicalizeExtPass()); pm.addPass(createSymbolDCEPass()); + pm.addPass(func::createDuplicateFunctionEliminationPass()); } } // namespace diff --git a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp index f6a4d55b6..6b11d2503 100644 --- a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp +++ b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp @@ -33,7 +33,9 @@ void addGenericLinalgMemrefOptPasses(OpPassManager &pm) { pm.addPass(createMemrefCopyToLinalgPass( getAttrPlaceholderName( byre::ByreDialect::getEntryPointFunctionAttrName()), - getByteIRElementwiseFusionAttrName().str())); + getByteIRElementwiseFusionAttrName().str(), true)); + pm.addPass(createMemrefCopyToLinalgPass( + getByteIRReductionFusionAttrName().str(), "", false)); } void createLinalgMemrefOptPipelineImpl(OpPassManager &pm, diff --git a/compiler/lib/Pipelines/LinalgTensorOpt.cpp b/compiler/lib/Pipelines/LinalgTensorOpt.cpp index c220a382a..88c1ab2b9 100644 --- a/compiler/lib/Pipelines/LinalgTensorOpt.cpp +++ b/compiler/lib/Pipelines/LinalgTensorOpt.cpp @@ -17,58 +17,39 @@ #include "byteir/Pipelines/LinalgTensorOpt.h" #include "byteir/Pipelines/GPU/ElementwiseCodegen.h" +#include "byteir/Pipelines/GPU/ReductionCodegen.h" #include "byteir/Pipelines/Host/Codegen.h" #include "byteir/Conversion/ToLinalg/ToLinalg.h" #include "byteir/Dialect/Linalg/Passes.h" +#include "byteir/Dialect/Tensor/Passes.h" #include "byteir/Dialect/Transform/Transforms/TransformDialectInterpreter.h" +#include "byteir/Dialect/Transform/Transforms/TransformInsertion.h" #include "byteir/Dialect/mhlo/Passes.h" #include "byteir/Dialect/mhlo/Transforms/HloFuser.h" #include "byteir/Pipelines/Common/Utils.h" +#include 
"byteir/Transforms/AnchoredPipeline.h" #include "byteir/Transforms/CanonicalizeExt.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Transforms/Passes.h" using namespace mlir; namespace { -void collectBroadcastOperands( - mlir::Operation *op, - mlir::DenseMap &collection) { - - auto tensorSlice = dyn_cast(op); - if (!tensorSlice) { - return; - } - - for (Value res : op->getResults()) { - bool isBroadcast = false; - for (auto &&use : res.getUses()) { - if (auto genericOp = dyn_cast(use.getOwner())) { - auto affineMap = - genericOp.getIndexingMapsArray()[use.getOperandNumber()]; - if (!affineMap.isPermutation() && - affineMap.isProjectedPermutation(/*allowZeroInResults*/ true)) { - isBroadcast = true; - } - } - } - if (isBroadcast) { - collection.insert(std::make_pair(res, std::make_pair(Attribute(), true))); - } - } -} - -void addGenericLinalgElementwisePasses(OpPassManager &pm) { +void addGenericLinalgPasses(OpPassManager &pm) { pm.addNestedPass( createHloFusionToLinalgPass(getByteIRElementwiseFusionAttrName())); + pm.addNestedPass( + createHloFusionToLinalgPass(getByteIRReductionFusionAttrName())); pm.addNestedPass(createUnrealizedCastToLinalgPass()); pm.addPass(createLinalgElementwiseFusionExtPass( /*enableSharedInput*/ true, /*enableDiffShapes*/ false)); pm.addPass(createCSEPass()); - { + { // elementwise codegen + auto elementwiseAnchor = getByteIRElementwiseFusionAttrName().str(); GPUTileElementwiseOptions options; - options.funcAnchor = getByteIRElementwiseFusionAttrName().str(); + options.funcAnchor = elementwiseAnchor; // set to 1 for fully fusion & unroll, and all tiled loops will be coalesced // and mapping to LinearIdx.x in later pipeline // FIXME: set to real blockSize and mapping tiled loops to the corresponding @@ -77,13 +58,98 @@ void addGenericLinalgElementwisePasses(OpPassManager &pm) { options.warpSize = 32; createGPUTileElementwiseTransform(pm, options); 
pm.addPass(createTransformDialectInterpreter(true)); + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass()); + anchoredPM.addPass(createLinalgElementwiseFusionExtPass( + /*enableSharedInput*/ true, /*enableDiffShapes*/ false)); + anchoredPM.addPass(createCSEPass()); + anchoredPM.addPass(createCanonicalizerPass()); + pm.addNestedPass( + createAnchoredPipelinePass(elementwiseAnchor, anchoredPM)); + } + } + { // reduction codegen + auto reductionAnchor = getByteIRReductionFusionAttrName().str(); + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass( + createLinalgCollapseLoops(utils::IteratorType::reduction)); + anchoredPM.addPass( + createLinalgCollapseLoops(utils::IteratorType::parallel)); + pm.addNestedPass( + createAnchoredPipelinePass(reductionAnchor, anchoredPM)); + } + + GPUSplitGridReductionOptions splitGridRedOptions; + splitGridRedOptions.funcAnchor = reductionAnchor; + createGPUSplitGridReductionTransform(pm, splitGridRedOptions); + pm.addPass(createTransformDialectInterpreter(true)); pm.addPass(createCanonicalizerPass()); + + GPUTileGridReductionOptions tileGridRedOptions; + tileGridRedOptions.funcAnchor = reductionAnchor; + tileGridRedOptions.blockSize = 512; + createGPUTileGridReductionTransform(pm, tileGridRedOptions); + pm.addPass(createTransformDialectInterpreter(true)); + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass()); + anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createCSEPass()); + pm.addNestedPass( + createAnchoredPipelinePass(reductionAnchor, anchoredPM)); + } + + GPUSplitBlockReductionOptions splitBlockRedOptions; + splitBlockRedOptions.funcAnchor = reductionAnchor; + splitBlockRedOptions.splitFactor = 16; + createGPUSplitBlockReductionTransform(pm, splitBlockRedOptions); + 
pm.addPass(createTransformDialectInterpreter(true)); + pm.addPass(createCanonicalizerPass()); + + GPUTileBlockReductionOptions tileBlockRedOptions; + tileBlockRedOptions.funcAnchor = reductionAnchor; + tileBlockRedOptions.blockSize = 512; + createGPUTileBlockReductionTransform(pm, tileBlockRedOptions); + pm.addPass(createTransformDialectInterpreter(true)); + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass()); + anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createCSEPass()); + pm.addNestedPass( + createAnchoredPipelinePass(reductionAnchor, anchoredPM)); + } + + GPUTileThreadReductionOptions tileThreadRedOptions; + tileThreadRedOptions.funcAnchor = reductionAnchor; + createGPUTileThreadReductionTransform(pm, tileThreadRedOptions); + pm.addPass(createTransformDialectInterpreter(true)); + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass()); + anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createCSEPass()); + pm.addNestedPass( + createAnchoredPipelinePass(reductionAnchor, anchoredPM)); + } + + pm.addPass(createDetensorizeTransformInsertionPass(reductionAnchor)); + pm.addPass(createTransformDialectInterpreter(true)); + pm.addPass(createCanonicalizeExtPass()); + pm.addPass(createRewriteInDPSTransformInsertionPass(reductionAnchor)); + pm.addPass(createTransformDialectInterpreter(true)); + pm.addPass(createCanonicalizerPass()); + { + OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createTensorPadSpecializationPass()); + anchoredPM.addPass(bufferization::createEmptyTensorEliminationPass()); + pm.addNestedPass( + createAnchoredPipelinePass(reductionAnchor, anchoredPM)); + } } - pm.addPass(createLinalgFoldUnitExtentDimsPass()); - pm.addPass(createLinalgElementwiseFusionExtPass( - /*enableSharedInput*/ true, /*enableDiffShapes*/ false)); - 
pm.addPass(createCSEPass()); - pm.addPass(createCanonicalizerPass()); } void addCPULinalgOptPasses(OpPassManager &pm) { @@ -110,7 +176,7 @@ void createLinalgTensorOptPipelineImpl(OpPassManager &pm, if (target == "CPU") { addCPULinalgOptPasses(pm); } else { - addGenericLinalgElementwisePasses(pm); + addGenericLinalgPasses(pm); } } } // namespace diff --git a/compiler/lib/Transforms/Bufferize.cpp b/compiler/lib/Transforms/Bufferize.cpp index c93e964ef..0d0d14c4a 100644 --- a/compiler/lib/Transforms/Bufferize.cpp +++ b/compiler/lib/Transforms/Bufferize.cpp @@ -17,7 +17,6 @@ #include "byteir/Transforms/Bufferize.h" -#include "./PassDetail.h" #include "byteir/Dialect/Ace/AceDialect.h" #include "byteir/Dialect/Byre/ByreDialect.h" #include "byteir/Dialect/Byre/Transforms/BufferizableOpInterfaceImpl.h" @@ -40,6 +39,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/Func/Transforms/Passes.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" @@ -62,6 +62,8 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +#include "./PassDetail.h" + using namespace mlir; using namespace mlir::bufferization; @@ -93,6 +95,28 @@ struct OneShotBufferizePass vector::registerBufferizableOpInterfaceExternalModels(registry); } + static bool isGPUSharedMem(MemRefType type) { + if (auto memorySpace = llvm::dyn_cast_or_null( + type.getMemorySpace())) { + if (memorySpace.getValue() == + gpu::GPUDialect::getWorkgroupAddressSpace()) { + return true; + } + } + return false; + } + + template + static auto createAlloc(OpBuilder &b, Location loc, MemRefType type, + ValueRange dynShape, size_t bufferAlignment) { + if (bufferAlignment != 0) + return b + .create(loc, type, dynShape, + b.getI64IntegerAttr(bufferAlignment)) + .getResult(); + return b.create(loc, type, 
dynShape).getResult(); + } + void runOnOperation() override { bufferization::OneShotBufferizationOptions opts; opts.allowReturnAllocs = true; @@ -101,6 +125,29 @@ struct OneShotBufferizePass bufferization::LayoutMapOption::IdentityLayoutMap); opts.createDeallocs = false; opts.bufferAlignment = 0; + opts.allocationFn = [](OpBuilder &b, Location loc, MemRefType type, + ValueRange dynShape, + unsigned int bufferAlignment) -> FailureOr { + if (isGPUSharedMem(type)) { + return createAlloc(b, loc, type, dynShape, + bufferAlignment); + } + return createAlloc(b, loc, type, dynShape, + bufferAlignment); + }; + opts.deallocationFn = [](OpBuilder &b, Location loc, + Value allocatedBuffer) -> LogicalResult { + if (auto bufferType = + llvm::dyn_cast_or_null(allocatedBuffer.getType())) { + if (isGPUSharedMem(bufferType)) { + return success(); + } + } + + // Default buffer deallocation via DeallocOp. + b.create(loc, allocatedBuffer); + return success(); + }; // deny some corner cases opts.opFilter.denyOperation([&](Operation *op) { @@ -272,6 +319,180 @@ LogicalResult bufferize(Operation *op, RewriterBase &rewriter, return success(); } } // namespace CallOpBufferizableOpInterfacePatch + +// ------------------------------------------------------------------------ // +// Patch of TensorInsertOp +// ------------------------------------------------------------------------ // +namespace TensorInsertPatch { +bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const AnalysisState &state) { + assert(isa(op) && + "expected that op implements DestinationStyleOpInterface"); + + if (opOperand.getOperandNumber() == 1 && + opOperand.get().getType().cast().getRank() == 0) { + return false; + } + + return true; +} + +} // namespace TensorInsertPatch + +template static bool overwriteEntireTensor(OpTy insertSliceOp) { + RankedTensorType destType = insertSliceOp.getDestType(); + // Dest is not read if it is entirely overwritten. 
E.g.: + // tensor.insert_slice %a into %t[0][10][1] : ... into tensor<10xf32> + bool allOffsetsZero = + llvm::all_of(insertSliceOp.getMixedOffsets(), + [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }); + bool sizesMatchDestSizes = llvm::all_of( + llvm::enumerate(insertSliceOp.getMixedSizes()), [&](const auto &it) { + return getConstantIntValue(it.value()) == + destType.getDimSize(it.index()); + }); + bool allStridesOne = + llvm::all_of(insertSliceOp.getMixedStrides(), + [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); }); + return !(allOffsetsZero && sizesMatchDestSizes && allStridesOne); +} + +/// Return true if the (ExtractSliceOp, InsertSliceOp) pair match (i.e. +/// equivalent operand / result and same offset/sizes/strides specification). +template +static bool areEquivalentSlices(const AnalysisState &state, + tensor::ExtractSliceOp extractSliceOp, + OpTy insertSliceOp) { + if (!extractSliceOp || !insertSliceOp) + return false; + if (extractSliceOp != insertSliceOp && + !state.areEquivalentBufferizedValues(extractSliceOp.getSource(), + insertSliceOp.getDest())) + return false; + if (!sameOffsetsSizesAndStrides(extractSliceOp, insertSliceOp, + isEqualConstantIntOrValue)) + return false; + return true; +} + +/// Return true if `value` is originating from an ExtractSliceOp that matches +/// the given InsertSliceOp. +template +static bool matchesInsertDestination(const AnalysisState &state, Value value, + OpTy insertSliceOp) { + // Look for matching slices. 
+ auto matchesSlice = [&](Value val) { + if (auto extractSliceOp = val.getDefiningOp()) + if (areEquivalentSlices(state, extractSliceOp, insertSliceOp)) + return true; + return false; + }; + return static_cast(llvm::all_of( + state.findValueInReverseUseDefChain(value, matchesSlice), matchesSlice)); +} + +template +static bool isNotConflictingInsertSliceLikeOp(Operation *op, OpOperand *uRead, + OpOperand *uConflictingWrite, + const AnalysisState &state) { + Operation *readingOp = uRead->getOwner(); + Operation *conflictingWritingOp = uConflictingWrite->getOwner(); + + // Special rules for matching ExtractSliceOp/InsertSliceOp pairs. If + // uRead is an InsertSliceOp... + if (auto insertSliceOp = dyn_cast(readingOp)) { + // As an example, consider the following IR. + // + // %0 = tensor.extract_slice %t[%a, %b][%c, %d][1, 1] {inplace = [true] } + // %1 = linalg.fill %cst, %0 {inplace= [true] } + // %2 = tensor.insert_slice %1 into %t[%a, %b][%c, %d][1, 1] + // {inplace= [true] } + + // TODO: Use insertSliceOp.getDestOpOperand etc. when available. + if (uRead == &insertSliceOp->getOpOperand(1) /*dest*/ && + matchesInsertDestination(state, uConflictingWrite->get(), + insertSliceOp)) + // Case 1: The main insight is that InsertSliceOp reads only part of + // the destination tensor. The overwritten area is not read. If + // uConflictingWrite writes into exactly the memory location that is + // being read by uRead, this is not a conflict. + // + // In the above example: + // uRead = OpOperand 1 (%t) of tensor.insert_slice + // uConflictingWrite = OpOperand 1 (%0) of linalg.fill + // + // The read of %t does not conflict with the write of the FillOp + // (same aliases!) because the area that the FillOp operates on is + // exactly the one that is *not* read via %t. 
+ return true; + + if (uRead == &insertSliceOp->getOpOperand(0) /*source*/ && + uConflictingWrite == &insertSliceOp->getOpOperand(1) /*dest*/ && + (overwriteEntireTensor(insertSliceOp) || + matchesInsertDestination(state, uRead->get(), insertSliceOp))) + // Case 2: The read of the source tensor and the write to the dest + // tensor via an InsertSliceOp is not a conflict if the read is + // reading exactly that part of an equivalent tensor that the + // InsertSliceOp is writing. + // + // In the above example: + // uRead = OpOperand 0 (%1) of tensor.insert_slice + // uConflictingWrite = OpOperand 1 (%t) of tensor.insert_slice + return true; + } + + // If uConflictingWrite is an InsertSliceOp... + if (auto insertSliceOp = dyn_cast(conflictingWritingOp)) + // As an example, consider the following IR. + // + // %0 = tensor.extract_slice %t[%a, %b][%c, %d][1, 1] {inplace = [true] } + // %1 = linalg.fill %cst, %0 {inplace= [true] } + // %2 = tensor.insert_slice %1 into %t[%a, %b][%c, %d][1, 1] + // {inplace= [true] } + // %3 = vector.transfer_read %1, %cst + // + // In the above example: + // uRead = OpOperand 0 (%1) of vector.transfer_read + // uConflictingWrite = OpOperand 1 (%t) of tensor.insert_slice + // definition = %1 + // + // This is not a conflict because the InsertSliceOp overwrites the + // memory segment of %1 with the exact same data. (Effectively, there + // is no memory write here.) 
+ if (uConflictingWrite == &insertSliceOp->getOpOperand(1) /*dest*/ && + state.areEquivalentBufferizedValues(uRead->get(), + insertSliceOp.getSource()) && + matchesInsertDestination(state, insertSliceOp.getSource(), + insertSliceOp)) + return true; + + return false; +} + +// ------------------------------------------------------------------------ // +// Patch of TensorParallelInsertSlice +// ------------------------------------------------------------------------ // +namespace TensorParallelInsertSlicePatch { +bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, + const AnalysisState &state) { + auto insertSliceOp = cast(op); + + // The source is always read. + if (&opOperand == &op->getOpOperand(0) /*src*/) + return true; + + // For the destination, it depends... + assert(&opOperand == &insertSliceOp->getOpOperand(1) && "expected dest"); + + return overwriteEntireTensor(insertSliceOp); +} +bool isNotConflicting(Operation *op, OpOperand *uRead, + OpOperand *uConflictingWrite, + const AnalysisState &state) { + return isNotConflictingInsertSliceLikeOp( + op, uRead, uConflictingWrite, state); +} +} // namespace TensorParallelInsertSlicePatch } // namespace // TODO: removed this once upstrem fixed it @@ -279,6 +500,21 @@ RegisterOpInterfaceOverride( /*Op=*/func::CallOp, /*Interface=*/BufferizableOpInterface, /*InterfaceMethod=*/bufferize, /*Impl=*/&CallOpBufferizableOpInterfacePatch::bufferize); +RegisterOpInterfaceOverride( + /*Op=*/tensor::InsertOp, /*Interface=*/BufferizableOpInterface, + /*InterfaceMethod=*/bufferizesToMemoryRead, + /*Impl=*/ + &TensorInsertPatch::bufferizesToMemoryRead); +RegisterOpInterfaceOverride( + /*Op=*/tensor::ParallelInsertSliceOp, /*Interface=*/BufferizableOpInterface, + /*InterfaceMethod=*/bufferizesToMemoryRead, + /*Impl=*/ + &TensorParallelInsertSlicePatch::bufferizesToMemoryRead); +RegisterOpInterfaceOverride( + /*Op=*/tensor::ParallelInsertSliceOp, /*Interface=*/BufferizableOpInterface, + 
/*InterfaceMethod=*/isNotConflicting, + /*Impl=*/ + &TensorParallelInsertSlicePatch::isNotConflicting); std::unique_ptr> byteir::createOneShotBufferizePass() { diff --git a/compiler/lib/Transforms/MemoryPlanning.cpp b/compiler/lib/Transforms/MemoryPlanning.cpp index b00ae8e8b..70e9cce2c 100644 --- a/compiler/lib/Transforms/MemoryPlanning.cpp +++ b/compiler/lib/Transforms/MemoryPlanning.cpp @@ -220,10 +220,11 @@ template class SortedPackingStrategy { : windowSize(windowSize), alignment(alignment), compare(compare) {} /// Optimize the buffer allocations. - void optimze(const mlir::bufferization::BufferPlacementAllocs &allocs, - const UserangeAnalysis &userangeAnalysis, - std::vector &packedBuffers, - std::function isValidAllocation) { + void optimze( + const mlir::bufferization::BufferPlacementAllocs::AllocEntryList &allocs, + const UserangeAnalysis &userangeAnalysis, + std::vector &packedBuffers, + std::function isValidAllocation) { AllocInfoList allocInfos; allocInfos.reserve(std::distance(allocs.begin(), allocs.end())); @@ -344,7 +345,7 @@ template class SortedPackingStrategy { /// maximal userange. size_t computeAllocationInfos( AllocInfoList &allocInfos, const UserangeAnalysis &userangeAnalysis, - const mlir::bufferization::BufferPlacementAllocs &allocs, + const mlir::bufferization::BufferPlacementAllocs::AllocEntryList &allocs, std::function isValidAllocation) { // Create allocInformations and store them in allocInfos. size_t maxUserangeId = 0; @@ -405,13 +406,15 @@ template class SortedPackingStrategy { /// argument. 
template class BufferPacking : bufferization::BufferPlacementTransformationBase { + static constexpr bool is_alloca = std::is_same_v; + public: template BufferPacking(Operation *op, StrategyT strategy, std::function couldReuseAllocation) : BufferPlacementTransformationBase(op), liveness(op), - userangeAnalysis(op, &liveness, initAllocs(op), aliases), - dominators(op) { + allocs(initAllocs(op)), + userangeAnalysis(op, &liveness, allocs, aliases), dominators(op) { std::vector packedBuffers; strategy.optimze(allocs, userangeAnalysis, packedBuffers, couldReuseAllocation); @@ -434,6 +437,7 @@ class BufferPacking : bufferization::BufferPlacementTransformationBase { private: byteir::Liveness liveness; + bufferization::BufferPlacementAllocs::AllocEntryList allocs; UserangeAnalysis userangeAnalysis; /// The current dominance info. DominanceInfo dominators; @@ -451,13 +455,18 @@ class BufferPacking : bufferization::BufferPlacementTransformationBase { dominators); } - const bufferization::BufferPlacementAllocs &initAllocs(Operation *op) { + bufferization::BufferPlacementAllocs::AllocEntryList + initAllocs(Operation *op) { if constexpr (std::is_same_v) { + bufferization::BufferPlacementAllocs::AllocEntryList ret; op->walk([&](memref::AllocaOp alloca) { - allocs.registerAlloc({alloca.getResult(), nullptr}); + ret.emplace_back(alloca.getResult(), nullptr); }); + return ret; + } else { + auto &&baseAllocs = BufferPlacementTransformationBase::allocs; + return {baseAllocs.begin(), baseAllocs.end()}; } - return allocs; } void createBufferAndViews(const PackedBuffer &packedBuffer) { @@ -505,7 +514,7 @@ class BufferPacking : bufferization::BufferPlacementTransformationBase { }; template -inline void doBufferPacking(mlir::func::FuncOp func, size_t alignment, +inline void doBufferPacking(FunctionOpInterface func, size_t alignment, std::function couldReuseAllocation) { SortedPackingStrategy strategy( 0, // windowSize @@ -517,10 +526,12 @@ inline void doBufferPacking(mlir::func::FuncOp 
func, size_t alignment, struct MemoryPlanningPass : public MemoryPlanningBase { MemoryPlanningPass() = default; - MemoryPlanningPass(size_t alignment, + MemoryPlanningPass(size_t alignment, bool alloca, size_t memSpace, std::function couldReuseAllocation) : MemoryPlanningBase() { this->alignment = alignment; + this->alloca = alloca; + this->memSpace = memSpace; this->couldReuseAllocation = couldReuseAllocation; } @@ -559,11 +570,15 @@ struct MemoryPlanningPass : public MemoryPlanningBase { }; } // namespace -std::unique_ptr> mlir::createMemoryPlanningPass() { +std::unique_ptr> +mlir::createMemoryPlanningPass() { return std::make_unique(); } -std::unique_ptr> mlir::createMemoryPlanningPass( - size_t alignment, std::function couldReuseAllocation) { - return std::make_unique(alignment, couldReuseAllocation); +std::unique_ptr> +mlir::createMemoryPlanningPass( + size_t alignment, bool alloca, size_t memSpace, + std::function couldReuseAllocation) { + return std::make_unique(alignment, alloca, memSpace, + couldReuseAllocation); } diff --git a/compiler/lib/Utils/Utils.cpp b/compiler/lib/Utils/Utils.cpp index aae017171..0dc02bb91 100644 --- a/compiler/lib/Utils/Utils.cpp +++ b/compiler/lib/Utils/Utils.cpp @@ -452,12 +452,13 @@ Value mlir::getSlice(OpBuilder &b, Location loc, Value source, OpFoldResult mlir::canonicalizeOpFoldResult(OpFoldResult ofr, bool enableFold) { if (auto val = ofr.dyn_cast()) { - SmallVector foldResults; if (enableFold) { - OpBuilder builder(val.getContext()); - Operation *op = val.getDefiningOp(); - if (op && !failed(builder.tryFold(val.getDefiningOp(), foldResults))) { - val = foldResults[0]; + if (auto opResult = llvm::dyn_cast(val)) { + OpBuilder builder(opResult.getOwner()); + SmallVector foldResults; + if (!failed(builder.tryFold(opResult.getOwner(), foldResults))) { + val = foldResults[opResult.getResultNumber()]; + } } } return getAsOpFoldResult(val); diff --git a/compiler/numerical/hlo/canonicalize_ext.mlir 
b/compiler/numerical/hlo/canonicalize_ext.mlir index 924cfa162..dd961caee 100644 --- a/compiler/numerical/hlo/canonicalize_ext.mlir +++ b/compiler/numerical/hlo/canonicalize_ext.mlir @@ -273,3 +273,40 @@ func.func @fold_large_constant_reverse_float_1(%arg0: tensor<1x3x3x128x64xf32>) // CHECK-NEXT: mhlo.constant // CHECK-NEXT: mhlo.constant // CHECK-NOT: mhlo.reverse + +func.func @replace_gather_with_input_0() -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) { + %0 = mhlo.constant dense<1.000000e+00> : tensor<64x128xf16> + %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64> + %2 = "mhlo.gather"(%0, %1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16> + %3 = mhlo.reshape %2 : (tensor<64x128xf16>) -> tensor<1x64x128xf16> + %4 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16> + return %3, %4 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16> +} +// CHECK-LABEL: @replace_gather_with_input_0 +// CHECK-NEXT: mhlo.constant +// CHECK-NEXT: mhlo.constant +// CHECK-NEXT: return + +func.func @replace_gather_with_input_1(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) { + %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64> + %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16> + %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16> + %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16> + return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16> +} +// CHECK-LABEL: @replace_gather_with_input_1 +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: mhlo.broadcast_in_dim +// 
CHECK-NEXT: return + +func.func @replace_gather_with_input_2(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) { + %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<128xi64> + %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[64, 1]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<128xi64>) -> tensor<64x128xf16> + %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16> + %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16> + return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16> +} +// CHECK-LABEL: @replace_gather_with_input_2 +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: mhlo.broadcast_in_dim +// CHECK-NEXT: return diff --git a/compiler/numerical/hlo/hlo_move_down.mlir b/compiler/numerical/hlo/hlo_move_down.mlir index 0676ae6f5..0afdd1ce3 100644 --- a/compiler/numerical/hlo/hlo_move_down.mlir +++ b/compiler/numerical/hlo/hlo_move_down.mlir @@ -179,3 +179,25 @@ func.func @broadcast_reshape_dot_with_concat_and_add(%arg0 : tensor<1x64xf16>, % // CHECK-NEXT: mhlo.broadcast_in_dim // CHECK-NEXT: return +func.func @transpose_move_down_binary_case0(%arg0 : tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> { + %0 = mhlo.constant 
dense<"0x9A4830BD21F00B3C7DFEC9BDF7DAD33C1F0C483DDC57B5BC81D312BEB409853CF9F2F0BCF858843C183DAB3C11C6F5BCAA1CDDBD1F2D073D784D0B3DB0A0FCBDE0458F3C2621DBBDA7D3E3BCA99608BC7E39723C3FD8A63D0671DE3D2C29043B45059DBD057A51BD3157293C086EDC3DF0789FBCFE4A0F3D3F29423DD5D756BAF0C60F3D2FFEA6BD695477BC6A98ADBDD45BEC3D1532593DE3AB2D3D4D23193B0C52A4BDBFC474BC43F4223DB2A1F83A55FA433D5F06093D52146CBD9B1A37BD6A7617BDCA91363E305EDDBCB3DDC0BDDA82B5BC64D8163CDE519C3DE7768CBCBF778D3DC8AFD4BA2269503DB9832E3CEDE6A7BB8AF5F73C658639BB7EDC51BBF6F9BFBB603F063C335269BE0B91EC3B1F248DBC1C51B2BDAB9964BC6E43813CD4872E3B7AD3333DEC88273C428481BC4FCB713D56630B3EC37B0F3C374041BDE5CCCCBDC0685E3DD3D18B3D27ADC23DB086E23B88825BBE3B5425BCA6FE383B5CF0CC3D80FC803D24CDEF3C21DCD0BB6762843C9CDD013EF2BAC23A6552D8BC4460A33D3E314F3D2D69E9BC642193BCB8DB213D8BE53D3C4A653B3DFBF5313D836617BC70FE49BDD6ECA43C8856AB3B96C85C3D6D51BEBDF40248BDAFCE25BCF78108BEB9DD08BE4263233B789B543C6A86C6BD0EBAA13B28A3A03C2B50B53CFD66C6B30D788B3D883134BD6069723D642641BD1D5F0D3E80D58F3D84AB833D559023BD7F7FB7BC2BD7B13CB70B1B3C294693BBB5B2AE3C742B3F3BE099983E6308583CB849A2BC06851B3D60AB86BD505014BDC1BF883B2B436BBD3B6B073D5A65BFBC92C4043D35C882BCAF3A12BE92CBC5BDA2340F3D6508ABBD89D910BC04AD8B3C9E2F693D9D2459BC8D099BBBA4DAE3BC4EB30BBD188C22BDA4F843BD377528BEA6EC253D2759713C7FA1EFBAAFC720BD51B7C93C34E6673D8602E83C871F763DD766CDBC05978A3C0F98FB3C960F8F3D3E49253DAB9A643D5D1E6F3DB0ABDA3A10A15C3D66BDA1BA02CE81BB39B456BEAA35E7BD723D36BD121434BE10C3513DDD10AABD89A172BB47CA2EBDFFFA44BC0E72F93C65CEDA3B257E4C3D0890ADBD63A63DBDE51D17BE90DD52BEF7BEECBD02EB5DBD63BCAD3DDE533CBD7DFD033B591B24BD633FDC3CB39ED93787DD163DEEF841BCF7CF69BC2A975ABD580CA4BDB0F93E3C30D49F3D7EC3B83C3C4BBE3B6BD322BD81EC963DA056933BE7698DBC7E3960BD8F591D3D73CCA139A7843CBC248A623CD871A93DBA5E17BD67D5D33DB35B45BC8FA0823CADB2043D79E8D43CD59FF1BC0ABB783DA14F1E3C9C016E3DFEBE30BC253C98BD85E64B3BEA4A973C9D389DBC674A37BBB05BD03C4C7BC03D5CB6A3BC8E63623D74A3DABC775E90BC41FB833C206E4A3D3104AC3C0D1070B
D053005BBB861A6BDEF768EBC4920303D0E00173DBCB70ABD1BB81FBD536AA73A069144BB267A5B3D5EB7583DB24EDD3BDA2FB3BC89FA41BD3CC37D3BF17204BCDEC6B33C92FD85BBABCB16BD437331BD9857533B8614B1BCCE4FBC3BA44AEFBB11C96B3B7DB8E2BC8262FB3BED9DF93B32AEC53AB35589BC72641BBB51FEF0BCD5C2AFBB94187BBC72C61F3D4853A93D65DC023CE04083BDC9F0E93C5CACCFBB8F6C85BD3E7C1C3CBAEFA4BDF71A91BC34EE2A3CF235883D76189ABC0979F13C6657F53C32E895BC011F0C3DBFDF8F3DE147C13C987B26BCAD41713B6EF78B3CC00A573D9E876D3BC4EBD5BC42F03BBCD3917E3D25C384BC1C189C3CA9252F3ED57D833C378B4F3B96200FBCE6D503BC71A0E23C2DE6353DAF9AF1BB5425163C5DA7BE3B20E26BBD31FD1B3E3AE1663DAD26F03C5DE4243D5CD30FBCCFBA8CB8C1A07FBCE877553CDC176EBA59FF3CBD7F67063E977B173D78F4E9BB4B8B1DBAB311A83C653B4EBCD674113D7DCA35BD584041BD8E86033D9A5E1FBDCD620B3D7F8EC5BA1C789DBC2C3C653D308045BDF9F5B53C75A668BD1DE708BDBBF41ABDD4E2F93CE15040BCC5EC85BC97AF64BC4177543C0831033D5428E6BB4AD9CCBC7518923C76F9BB3D674C513CE163CD3C2F08E1BC01AF8ABD5F2E97BB7E8FBB3CB77C0FBC2C7E023EC0D9F53CA8554BBDE38372BC46D7913CEBE7FF3C4BED28B30A8D473D33332C3CA4E6213DFE9CFFBB94399E3C7907BEBCA5B96FBD3A3B9BBC231262BD587717BC9FC1B73B1E378D3C5B2C31BC97A8263CCA6528BCF2DD01BD477AD33B4692B7BCB9F7EC3DCD95963CE6F9893C7EE3323D50A0AD3DE9F8523D88B102BA8A3B11BD3F0B8BBCCD61A4BC9A75893DCF6C4B3BF605CDBCDDBBC03B284214BD3FD13A382C6763BD8400863C39263FBD1DC8D13B6F302EBDC34911BDED527ABC819A9E3B84DF903CBB8DB03C5C7F853BF8779FBC7C2B10BB13259A3D39DD21BC1E1B86BBF76A983DA42F51BC2D5707BCB6CACF3B7DD20CBD06CED23B2585913D35C07C3CF8331BBBF42942BCC6FB04BDB8BD813C0145D5BC76CF9E3DA2DA7DBCF53DE5BB8489523C01D008BDF9FC6FBC55CF643DD0AE213CF04A0CBD152720BD65FE77BD5A0A5E3CAA7F53BCC807E5BB712FC53D62A544BD8C27EC3950A19A3BC56E993D6B5F3435D99F223E3C3422BD96BD0F3BA18E86BC0A452ABCC4421EBBC3BD7D3DFC89F0BBAA28163EE973243D7BE058BBCF4B84BC1293CCBD3FD121BD391EC3BCA8C6203C80CF833CDB40143DEEC0AABCEBB8F4BB8565883D115E37BC85D860BDA36A8ABDDF61D83C4EC03D3E42DA34BD5744A0BCF2A037BD1D700E3DD7D7D739BAAA333A9D0CD33C66699CBB6038313D7661B7BC509D743D5D82CB3CDFD7973
C1FB71B3DB98C08BC7B23C2BB60E4113CA2C112BDE0970BBC82F1193CFF8AA03A2792E43C6048063DFF23863D76C2513D5D2708BA9D562B3C616615BCB71C123C50DF0BBD30966A3CE832D4BC3E5224BD1EC6803DDAD805BCCE359ABC1E98ADBA9866B23CB924863C461CF33DFE415DBC16FB88BB6F70DCBCC3EA033DF5F0953C9E7E263C0A25193C7CAFF0BC9958763C60807B3930D82E3D5A955ABB77B4BC3D4A8424BE82026B3D238FA03C0987D03CB6A9E53C1E43E03CFF90A63C5E45083D873E3B3C4E2E343D6E9D3D3BB82A65BBCE79BCBB681BBC3DF67139BD5E1D9F3C5A32053DD17097BC782160BDADA846BDFC4A793CBB42F1BCC6C1A9BD8A074D3D8CC3B23DF0DF4A3C4E98433A277BA23BE94F6EBD0BFFCA3C99B29DBCF16E3EBEB34809BD0AECF03CCA16803D3A3EBABC511CA8BCECDA1D3CCAC0073C8D5EC73CBFFDC33C82410CBCF013263E4481D1BC75E6893CE6FA133A658BA53C78F59EBD735381BB7EFDA6BD0CC57A39222F9DBC9825233D8046BEBCEA5ECDBC23E4C23C7FDDE53C3DECAD3C588CD23CB764CABCBAF92DBC173BB3BC1EBED33BFAD54EBC01EB40BC50EA43BD77EC06BE6FB2AFBCBEED1A3BB251063E725600BD3F2348BC663036BB96361E3C8D7F5FBCCAB8BB3BACFF5B3C37CA3CBD7CCC54BD99AE98BBC9F4B93C96D874BE15C9F63C7879C73CC4FAC83C8F5385BDCD4703BE7C0C113CAE69383DD1B414BDAD93D83C568DD83CBA29513DA7430DB4489B8DBC964383BC7C9908BB96D8593A2276A4BDBD8C243DC3009FBD189B8FBE486E7CBC6A2F763D5CDAB4B9F2D07E3C734688BC974877BB2424B3BD8FC579BB884DCD3CE7AD8F3C9CFCD53C0F73E5BC7E8D9BBCF1460BBE136457BCA5AC703E50D2B1BD377EE33B0572D3BBC7C313BDE946F3BC258D20BD31E00EBCA0FCE63B8F97A33CECFCA7BBE5A28EBD044BAF3D9EE2233C2353A33C576344BC39F3A53C3472843BE63FC23CFEE38BBDE4EA92BC5ADE133D826DF13D4C9332BD3AC4E63CB2CE4E3C082DD03BDFB1883C5F3F203ECADF003D7249ED3D26D782BC30652EBBBB4E083EE6AD583D4E26FCB989FEBABCB273173D53297DBD56585EBDA272683C02B4CEBC32B6BF3C64C1423DDF5F82BB17C4053D8BE4C63C23890BBDCA326F3D454692BCF7206DBCD6592A3DA44E23BE3BFD34BB388FA7BA39D7633C7F9E64BB599228BBAE34E23CCAB3D7B522BE36BC231B063D28593C3C130C693D4B0323BD41DBCA3BB52CCFBC431D0D3C7DF62B3C86AD03BB2AE6DC3C7F4F5ABD2D03C8BC6D4EE23BB9D49CBDF8002ABDD79D23BC89A74D3DDAA573BD2985BBBCE5D5AF3C9D1C41BD2BE1A5BDEDC13EBDA2F92FBEFC68143C6268653CBEF63C3D00D8063C0D1D6EBCED8688BEC422F1BA252AF53
C78751C3DD021FBBBD8AC233DBAA1CD3CCB73BBBC09EB5CBD47FB19BE657090BD1FBBBBBC731BA03C2E35D93D85BD2F39E7B9C4BCFF36363D292F08BD6E7049BD787907BE1B661EBD8AD4B239E1D7EFBA0A9D9A3BF985AF3D42269F3B6D33443D3D6F05BD8F14BF3C12F31ABB12D0ACBC6C6D603C8502D83CF6321CBC103891BC9D417E3C05C9E53BCA3A673C11A89D3C696301BDE7BAC43CE4947FBC632B9B3D7428B23D3F3FAEBC2F3D963AB81B9DBC04DA49BD4FFB273C5AFC973B9011293D1098363C3C4E77BDB55155BB2B67923C130D79BDFF2D70BDC42F623C31DD04BE0D2F883D5274753C742A8F3B6A4706BE2CDA78BC4CF7A93CE544F13C8E35413E3941493DC4AE7FBDBEF425BE9A5AB0BCE46E0F3EFD6B04BEBB74C1BD702F44BD5B72AB3CE719883DB8C7043D9E8A8D3D0947943C179923BC7943123C2027473D718D3F3D8A760B3D392832BDA0DC053D27DDA0BC78E0F7BC5D5A1A3DE2AA37BC874747BDBC91B23CA1305F3CFDA26D3AF85B003B8579843DFC9888BC026820BD0086703D9CF3133D889D153DDF0BB03D358DA9BC97B9873C75B2873C9203063C89E3103D4F3F7C3C00B8CF3B5335F43D2EC22EBD858D893A3332C7BCC11D8A3CAD31BB3B3C9840BC826492BD3B3C17BEC2F5BE3D50607DBE19FBF83CBCFAC5BC2689FF3CEEFE3A3D95C7E53CE08C373EA5F0FCBC28D9793C5996333DD423B13DA37C0B3D238A80BC3FCFE7BC7E4C083E1F54E0BDA3ADDD3CB397E43C70E2ECBD444E88B965843C3C8C1AC73CE49A51B4C7F0F53D66CA37BC4C711F3C901142BC983C953D01315BBECB283C3D29CE5A3D2B5A36BD11DF66BDF1398C3C53B224BD8E5B82BC6E44A4BCE9A104BE3B3C833DE551E7BC1A81613C0361A0BCEF5024BB1B5E7A3A21E6ADBD5E953B3D45461BBD891002BD650252BCFF3C24BEA305043CCD79FE3C71914EBC04D98B3C4D0DFF3CD04C463D39F335BD8C29023DE6E86639779BB03B711D61BD5B9652BDC7CBA1BDC532633C3C0E9CBAE434B53D6672063EE178CD3B6E16723DB5CA023D559A973D9CFE0EBB0FC4213D322E68BD9A76973DEE640DBDB08653BC23F4163CE7969A3CD4A19F3D35BFDF3C78AAE2BAF756213D25B76D3BA7B145BD8DB603BEB7AFCE3C908A9DBC345CA53C6198AC3C3395A4BBAE34163DAAE4153D0111C73C5D6D0FBEC68E643D93C3C7BDB52E8B3D6B636CBD2AF7BDBCF119053DD9563CBD7FFDCB3BC87407BEDFE2FABB600D0138F869083DD55213BCFBD14FBDAFC226BDD0D19FBD3734803C8FE3BF3CF371933A55E39A392570B3BE0AB622BEA5C23FBCBBA5223CE6CBC43C2D1AA23C08F1D93CE6C86DBD684E4A3C46AE5D3DB5FAAB3DCA84E83C471CF5BCFB49C8BA150447BD7A84A53BF04F47BCF9E5643
D1375743B1DF8853DDB07BDBCC625593C2245CE3A8E5AEF3CCE5BF6BC273C6C3C17213B3CC575A03D2A8A8FBD3D055DBD9FEC553D2712ACBDAEB1443C6EE8083DB63ECBBB7D000DBC3F5C1C3D7EF910BD920B06BC824BBB3C7C0BE83C0866033E175C11BCFF7ABDBC55BC0FBCA5C39EBAD6FEC43CEF24CF3C11EDF8BC460F31BDC4340EBD6C1E87BA6E42CA3BDA50D2BBC750003D3CAD22BD1A34E4BA14B60DBDA6EA883B59D2243CABBC23BC93B7ECBB89E6BABC926B8D3DE995063D14591FBC644C06BAFB30CC38D98102BCE4DC1E3C28078F3D5414BB3DCD1BE5BCA0ABD7BCFC5258BB1B79433A1198B8BD9CCAD7BC7AA774BD90ED803C27AA013DD6ED343C502AB4BC01288EBD70B9CEBC5776163C25C81C3D7738C33C7F389C3C8F160DBC56C007BD9E98CCBC571DAE3DAA0B24BDEF7870BCCD1F93BC98C46D3C0FCCCB3AB46F373DC511E63DD45B43BCAB777A3AB7AACABCE3C2F6BA51E3033D01CCB3BBE7E08BBB5DEC983C637E46392F97FCBB7E9029BC579A813D4B84B43C9ABE983C9DF5C1BCA3182A399B2D15BDC0EB37B95F7D5EBDFF87D83CD1DF9E3DE183E83C548B8A3C5B7F153CA849983D76246B3CE4D5F63ABDAED9BC8F2EDD3B8E25573C81A692BBDEE4AC3CD77B6B3D4712213CBC4B4A3DD73E9CBCB799433C549CA73C01341BBD1A32AA39506606BC9E7959BCC13DE3BCD7C7B3BC56AC793D932D0F3EAE6432BCD7F0B8BD3DB1CCBCC6639F3D2106A13D030D18BA554829BDBAB73ABD3BA31D3B2DC465BC668201BC2A0924BD48E21D3D78C9073D0718EF3C9A848C3CE606CE3B45C4B1B3555B053C37F19DBC491B25BCCEAB47BD5D3C553D4639C83B6B6F1CBD1AD7A7BCB44B88BCD55DD3BC624CFC3A486DA2BC7DC84F3D960A0ABDE018B7BC6951CE37BB3ADD3C83C178BDFB13E73DE917A53BE2FB00BC609C8F3C0CA5CC3CA814CCBB516B833CBF7CF3BB9022CABDDCF6063DEA02873D2182653C140635BC5739413C6829B23A8F5306BD317C32BD9E00863B944BF83BEC5B22BCF2A3A93C2767833CB765413B2E6185BCEA56BC3CC99A813C8C600BBDE8FA9D3BBF83B73C9FDD1C3EEC719DBC6EB0DE3AF384B4BDB9E9B53C7A890FBC9017183C59851ABC4FB0B1BB42EC8B3C6935973CAD0024BBBA91043CA2DBCBB95D628B39ECA107BC3E869C3DED0F88BB12684C3CC97109BDAEA1073CB031093D9A43EABB46B8033C7E5B7CBDBC3721BC72ECDFBBA3EE4DBCDAF7AFBCC46B24BDB0450B3D21E7913C12C7133BE7C6643C0CF3923CC1461EB550EE013ED9F8B4BC7F8DF83BBA9F48BD0D95F4BB65DE59BC54A77B3D83B49BBBD3A5A93D15EB893B7DC24E3A2F16BE3B589B083D2C13533C2249A8BB79CDAB3DC8D8BD3CC708C83C10325F3CAA4BF5BCEAC7AE3
C42E744BDA6D8C0BAD796953CE872A83B3C85AA3DEA2C033DB3B11CBCAFCF4C3C90A420BD48A7CF3B52A8603A0D0B673CE3EA79BC6BFD0FBDA1A6E9BAB2A7883D63CECFBDA068583D02F6953C2314DDBCB915393D1ADE45BD6F25DABC3B02C53B47F34ABD179DFE3BF759003D9BA70E3D5E05233DC67DE53C7E68003DB788ABBCF77164BBF7EC843CDDE643BC6A2D5E3D116DF3BDC86D31BEC3052EBCFB8D35BC23EF263C72E4D7BB536B2C3CF74A2D3CA31E21BC48C590BD13720BBEA71DEE3B94972EBD08D0F73B09BE313C4D5B9CBDF0AD123E6AABD13D5F014CBA5BCC143DB19712BC36B89A3CA8A2603D11DD37BAE7D5083D9D809DBDA28A82BE781241BDE6B6323D9C2D083CF70C49BDC1F5263D8C2123BD8BD8A5BB2096F23BA8223A3D64FB39BC319D023CFED9A73C1CC717BEFAAB2B3CB0E2053C2C677ABDD33B2FBC5D532B3D75B66BBBF846B43D338DAA3B8AA3D23CCAC14E3B03B8F53B72478CBC91E08CBCD5847BBCEB3EB53C8E136F3CB66E15BD070A89BC415541BCC737263DCF70FA3BA5B54CBC076478BB5666693C0E5BC6BC70A335BD70BE173B34E40F3995C4023C334CE2BAF9AF0E3D9DD9823D49CE6C3C6AA94FBDD554F63BA1B022BE6454533D07BA4C39D33A90BC5DF454BC581477BE8915A5BC98031EBC382FD23D24B7713DCA679A3CA25E1E3DCE360EBD87E6283AA1B7E2BCC40C6D3CFB29193E1691BF3D3727AB3C8543EBBC0DDEBCBD020202BDDDCB563C68BDCC3C46B3CFBCC117B3BD81E2B3BB432C0EBC9DE6BB3D4125963D9BF5D03CBA62733CE1B778BD0EEA01BDFE31BDBB91463BBC7D91FC3BC7CB6D3CFCB7D4BC1DFBA5BC36DFF0330E102BBCB795953D1F72A93C27270E3E1B38B6BC94D5993D4BF5983C21BA753DC79AA13D8853BF3DDE9A66BBAF95223C0EC24D3CB151633C2DB38EBCAA162FBE000828BC503557BC9F3DDFBD389351BD7D40E5BCF763753D48B2923C257AD3BD534646BD7AF3F7BC30FB54BC48E1A03B538C60BD3D97DABD7D4D623DD4DD8C3BF91143BD4C1099BDAA3D203BB60B89BD084A60BCE2DF8B3CE276023D07113F3CF1E184BD3D45053B71F219BE892EEDBDCA4FA03C0740D2BD1ACD273C0DA11E3D4C5AD33CB8DE0F3CB314183D30A38C3DC2A762BBF5084E3BCF0E393C731EA439D96BD83D903CCCBD38E5CDB9F5999F3DB784563D7EC401BD87B1ECBC2C0704BB625B923B92BE22BD79F484BB8A1638BCBC0389BD4326EC3CADBD98BD522993BDAF49133D6F40AF3DF1DB92BD91734FBD863D063C60D604BE7AE882BCBE447BBA0724843D78BA03BD53974937D59091BB3E3ED43A94843B3D03A0FF3CA61D8B3B7144A5BBB99D123D288FB53C84DB0ABCC2AC883D33F849BD01CA90BDD35922BC04F5BABCD83CB13
80A56C1BB1D2E663B90A398BCE2EE85BD619E0ABD40EFC9BDDE46FEBC6F2CE43D443003BCAE8EB93BD4A3AFBC851E9C3C8263AD3CCCEC393DC4C5DA3CA6C1A8BC65BDD0BA30F4C5BC0324373D3A8FFFBBB41AE33CB53699BB60AC8B3CC51F1FBC77F053BE15CD77BCD8CF4D3CFB4BD6BB0B93C2BC28339ABA017395BD86B2E6BC994BD8BC82F43F3CCE9ACFBC54B4A43A10EADCBC06AA643D8996D5BC59C679BCE572FABD5FFC02BC39AB133BCE2CFE3BC0BB29BC577BB5BEC2BE84BCB567CF3B9A78C13CC902F63AB8FD94BCC50650BD8512363C9A6895BC11AEDC3BE6CB503B5A90793B9A5C6EBCBA2F54BC59292B3CAD8294BA8D7DCE3B578EC23C2300B03C5AC785BCFA5083BB4502B13B88634FBD6A047BBC1D7753BC0BDA37BC49C62D3CE02D7E3CC61F65BBA67FCA3B104EB83CDBFD51BBEADEB23CAE10A73CFB86B03BB8F80C3DA81813BB550B20BD2E1A8D3CBE6265BD3D8230BD85FE58BCE64F98BC8CEA1B3D9C2FC0BB93AF9CBBE977EBBC0B59313C3A55C33D209585BBDAAA683DF517B8BC3809E8BC6FDB70BC7F713EBC1E33763CB28D4F3BDB46CB3B0EEEFE3CF0AC503EDC854CBC12214EBCEEB44C3CC8DF14BD0B6C48B995700FBD33ECBDBCABA5F43CF902F4BB9F01A2BC97AF413B3323383C32A5BEBBB060CBBC853C42BCE0F3FDB970B9163BE185013C2C5A94BAB0A7AEBC4612D0BBE64A0CBD861502BCF22A253B68BEC0BCFF9C16BCA2BD0BBD1A8702BD9BAE2FBDC1E89A3C7CC5A3BCA7573EBC0043163C19A1A3BD7FECB2BD6188243D67C9FEBCC88B66BC7213C43C558CFBBDEFDC45BC502F1E3D3013063DA1A17FBC30CCA83C1AEEC63C02EB5B3E9126DBBC1149853D0E93433C0D0E9EBC0966D7BAEF81453350FD20BD185B783C5E7551BE1299E33BD8C8803BCA09ECBCE5BB4D3DAF6E14BDAE06CC3CC322E73CC0225D3A773D5EBC828C32BC91BA2D3C5A13543C3DF282BB0EC50C3D6081953C1BD7D43D8B69B7BAB1479F3D3D1EFA3C40DC83BE93E58CBD8872313C712B293DC531503D71E5AABBD033CA3AFA5A973BB2F8A73BBB79383C71661CBD5406C4BB6597E03C1577E2BC11C9F9BA967FC93A247C913C2A31843B096961BC9A6838BDF1AF113C7379EF3BF90FC13C98FB11BD0CB81E3C766894BC52C9AA3C6556C53D9CB6E63B5B0A843C5BB58B3B4996AC3C32319DBD6D9B62BB09A7233DA9AE0B3CF9D09E3AA5E2FA3939B6063D8CAFABBC2E6BE33B5F6816BF2D764ABA96024DBCDC55D9BCD184C03C64BE8CBCA89380BD41BFBDBC696AE33C6BB1DF3B7380EF3BBA01073DD1005CBB22719B3CA92A09BDC3F602BEAC10DC38D074233D5805CBBDF9E3C6B47EE32FBEAB5896BBDB9677BADB2E5A3C1FC8913CA5E3C1BC74B9C53B770BEA3A7A74E03
C060171BCEEB4DF3CE397833C0A0A97BED960B9BDC4445EBAEF5B46BCC22F573D9F0FD7BDFA4521BD659E093CBE48ECBB4A368DBC7B0602BD3A7CCE3C381F8C3B3323563C4F44A1BC289983BCBB2D28BD44DD113D7F423A3D2C9FB4BA06E53FBB9C2761BA9226CD3C331EADBDCD2A593BA102893C78313EBB5367163C5706D83B23F384BDE9BBF2BC6394F8B8F9A4CF3DAB3C243CB850A93CEDB759B997DF92BC38C285BC7F674C3DE29A003C3C7D603DF2B7EDBA86FA42BC172D3B3C078F033D6C67313C775249BD456AD63CC4998C3CB42E583CC90184BC5CBE22BD1D5003BDE4FC44BC58076EBC3F55E43CFB3CE03AEAF230BCB90627BD57C3A93B89544F3D58CBF53B82F8E83A82F1C2BA4577F63C02CD37BC77BCD83A7314793CCF31423DE392A53B18872FBBA670A6BC5C9E293C0D85A0BC63A1053D64B810BD52274F3DC42C0C3DBABB4A3DBA0BF9BBEB32DF3C11EB89BC602C88BDB816033D96DDA93C93431C3DBFB14EBBE82C97BC882C6E3DEE4BA0BBB8F6F3BCBE18093DA4C126BDBBF4703C8E72553C3F263F3C1C63123E0F43BD3C775AA8BCF9FB92BC148BAB3C79C88E3BFD780B3D73AFE8BC159E97BC9640F5BB1ECE9EBCD0467F3D27925D3DDFB2FFBC9B88CC3C18EC1EBD8CA38BB973BBADBC122E9DBC68B1ACBCEC4D5D3B90F4963D145E3F3C6F7098BC9824443C956E6E3C347C57BCE256D2BBA9B8A7BD77E4753CFF15BF3B6EA138BDF1BF7A3D7205703C19B179BD01D2093DF5260EBDFCF9213C7651D9BCB16BF9BB02215CBC94E81A3D2B5430BCBEA1F53B511007BD4246EDBD072D35BE8748663D371E18BD89758BBCB958A93DA4A7BDBDCF9B3E3D531D413CB18D6FBC716F58BC906C2C3D1576F63C82942E3E25FF283B3ADF453BB9F823BD701FDCBBA048153BD9E1BEB32FABD63CA791E5BCEDCE80BD9DEB5A3C72F1C5BB0947AB3C010F55BC50C625BDB3CD2FBC42061B3CFDF8E43B683A0CBD2F44BBBCF136A7BAE7CE95BB8F85B93CAE5480BC4A99913D99BD1F3DC76E69BCA0DD5BBDAFF363BCF653A93D11A786BCE669153D500AA8BBB65DF3BB93A0803A04A58C3C54C6803C9EE68DBD7CD225BDB81BD9377AC896BC14FA0DB80DE2BB3CB4E383BCD99897BB5DE95CBC7C0EAB3C760535BC173AD93BA3DC1C3D8DD6A0BC117E1E3B1E65C7BC3FDF64BCDA5B5F3D48D91BBB8029FC3C0527383C3EB7C03CD1CD9C3C4604863C1055D1BDB5A3263B12A6783DB6E4EC3BE01109BA20BCDC3B40D7873B4B80BB3C49FD253BF58B313DC31095BC87E0CDB9A130893D10AF63BCA7D9D93C4431DF3B8A11E9BCE635463BDC1E21BD0C4A2F3D624B49BC10F120BD832C193B9CECD53C313D0EBEDC1C64BA03A0A73CC0DE903DA062EBB34D031BBD76A75CBD8880FEB
C56CD253DF637553B8D9A92BC051D4F3DC19F0B39185D793DB3CE553DF03094BC0C629EBCCEA031BE3F6AC6BD796942BCE27A0ABCD0AE3F3C7C0CABBCE1B0AEBB9200A7BB7EC2103DA457E2BC47E23B39435A0CBCEA06063DAFB9A23D3AC0F03B14EA82BC2E05CDBDE5348C3B69F21F3D178775BA31FCD23CD3BBC3398781B13D3EDD8ABC0E781D3D8125943C67391DBD14E3083D096497BB3B32CDBDDF883FBDE98233BD22F61EBE472D07BDE462D2BB8DC643BD1D45E23C0F45543DBB97C03BAACE6E3C24F53BBC701F6F3AECA645BE5A0C8C3C437AE83CFB9686BCE215B5BCE885C6BBC7FBBA3BADB7AE3BA98FBCBC6C9480BC719FF53C66EB133D51C2923B9A5691BCF8CD84BC116C6E3B52B2AC3D5C1F0F3BD0418BBC91A3B9BB90250FBE55FC38BA868CAB3D57D601BDB2B52ABE5F06A0BBDA5463BD440C983AE62297BDE14976BD57828DBCDEB5C7BC5992603B90E9BEBC0603F2BDD4FC173DE35EC33B12E1BABACF102CBDD68E07BC781FCA3C4B14C0BADF4726BDACA2CDBB9EAE073D4EB5AA3DEA4907B9CBFB593D5AAA5FBCD8089C3D5AADE2BCA03EE2BD03DDD6BD024EA13CDA95B3BA8B7789BC72A81DBEEA04B4BA6C64C4BA758985BD0A4543BBA8FA02BDD58A83BC5080C23BB953C23C8898A03C7E8B883CEBFCB43D02BE4E3D1FF7C53C5890983A01D0A53C92488BBBD85C3D3D629B36BD7FC2A23C43F20DBCC5FEC1BC7945823C27BA34BD90CCD8BBFF36C0BD55EF973C0F07DC3C466AD03CBE6EDEBBD1D7C53C2FBC3DBD281BD83CE894063DA93D92BD3135353DB39056BA443BC83C058005BDC4069DBCCFF869BBF916C63B530EBBBB7657DA3CB854213C78C5143D4AE63E3DF8E3C53D2BE458BB86B16C3C0821BE3D093A17BB8C1E133DCBD3CA3CA2D2113DB4719A3DAB6B19BC9AE9823D9F3E153A154B973CC552273B0BCA29BDF9DC81B22B2F383D6E9ACA3B3832913CE01D4D3CABB6633D1316753DBEA37DBDA0578CBE625B4FBBA47027BC30C819BC32A4B4BC6B31B9BCB2E297BCFBE5D83D0D1482BD3452983970187E3C5294CF3CFA96293CE4D3ADBCE6D495BD4DC4943C34016E3D8D2B353CE33250BB6A4B2BBD693A683DFBBE12BDAD5DAB3C1AC5A2BCD819D6BBEEE3453D8639093E687687BC857DA53D2502B03D29E657BD658817BB0F2F723DF9190FBA526E87BB4D263ABD36C62C3D3C3FF83B1F56FC3DF68478BD634F573DC10DB03C793D423B982B2EBDDADE3DBE12F9AA3C613A2EBEEEA0093C78386EBCC86D103E36AFC5BCF56B4439EAEF95BC147711BD386D553D71526D3D23DEC03B41BB303DF98A33BDE6292A3D4BDC783C5A6C77BCFB167FBB9A5A9BBC775C6B3BFD39D7BC3055453CFB62AABD2454083CCE7B2E3C0A0F0E3D3A73AFBC27EB28B
BC6A6F93C7E4F253C07F1F7B61CC46A3B7ABDB9BB575F51BDB184BEBC1944DD3CD2D28D3CA6620DBD29B611BAF84D8A3C7DFF043D1E86B13C45C2BEBDF198CE3CD2EB873CEBBAD23D05AF22BB182F1A3AA7548A3C66601A3DB1D350BDE6A6E039859BC1BBB0C17F3DC71F493DEAB5483DC6548D3C156ABFBC25C3D93BA206B53C05BA88BCA35D113D2FEF3CBA709AB4BD5914163C54DB333BA21EBB3C0777433DADB139BD761641BE0317D93C4DE2B5BB651E77BCD95FB13C561A933CBF093BBC5B6AC73C414D49BD1782DEBB79E1673B3DE5073ED17F9EBDFF8C39BB6AC1163C59954ABB11E8DBBDEEA8CA3C03247EBC2E2CC4BD795F77BD661FA03B101DB63CEBB24ABCBA34983AB64DDDBCA287703D2CAD19BB6A8FE9BB4A296BBC03F407BCA4F8563C20921B3C08E1BE3CA874993CC3F4743D268984BD146D77BAE9C76B3C6BCB943C8DAE7F3D9ED089BDD900D1BC63D79ABD2F19533A0727893C82DD9E3C9C574DBB06F8B33C33A697BC22894E3D1041D1BC568D09BD8D29DCB977A9643D045F4BBCC9A5DD3BC912B33DF14B103D5C933BBD903456BDF2CFEC3D45DB1E3CCE8E1E3D372F683CB078613DF8F0E73C9E8BBDBC02A9283DB437913D9F8831BBD5C4E1BC643CAFBD3F6F25BCD269E4BC76FCB13DB933CE3C962EA4BB29FFE7BCD3C2933CD9D1C5BC6AA790BCF9D7083DFAD4853DC6F67DBDFCCB703AA1AF0C3AC6B9B03C8CCC7FBDEA03203D049F0EBE0EE6D93BBBCA50BBF531293C4EA405BE11C9A53CA6A89A3CEF4CAA3DAD0D013D4DC999BB3797ECBC063BF3B9708B8D3D9BAECA3DF1FCE0BB39AB603D5F9DC33A85E6E9BC5AA82FBC3BCE7E3BDF7E66BD4F6F91BDE38D16BD4360703C59B9833CE906C43C523217BA7EEBD63C626E4D3D1BDCA7BCD04BEBBCB5B1C9BC556FFFBD98E71CBD64D70D3DFAD52E3D9E4983BDBBD14CBE7D9EBCBC11D9613D89B9A73BE2C4F53BE52D083DF2C5923DC5E1D0B3DC740ABDA7A03FBCC10D303DA8A9753D69135E3A974736BD91DC413D5C5028BE6B1AEEBBF285603DC62B49BC4D67E03B890E21BDDFF5D3BBB21F96BD509166BD1773C2BB7892903BD41508BDD32938BD272C7EBC79904B39963E803B20A70B3D6A367CBD5F01EB3C7E4C96BC88FA66BDFAED5CBD788459BDE4496A3C6CFB50BCF44AC8BCBB98C13DF65C7D3B142F953D8BFC0A3DD50E3E3C24D0983CD0ADEFBC25DC333C2552393CBCB9D6BCADEB6DBC066923BC96D1A73D478E8FBD603B053D10BB183DDE5A60BC4814CBBCCA69CE3DAF537D3D504A083E15DFFEBB55E848BC4389203EAFB8DF3CACA54ABBDE3F82BD7DEBFB3C1F6EF4BDEE0989BD8A46653C1E4A6ABDEF92623C415E2A3D037BA5BBCCE07EBC824BBBBA311105BD5BF8253D9F3AA43CAB0347B
DBFD038BED5F888BE1D3333BD9097A0BD88DE9ABBBA2B48BBF03ED4BC77BE8DBC075DC436DE16883B3A871FBCE927013D4E510ABE39CF8FBD122C9EBB8362FD3C9AE9553CF582163BC1DF6EBC8428AE3DB080F83C7E5CC13C88C861BC94CC3CBDBE4ACABCEB3472BCE3C1F0B905A122BDAAA68BBDBB1D07BDA0C098BD92DED4BD88D6D33C27089ABDDEC9F8BA4DB7803C166C0C3DC491653C1A01153D5F22A2BE9A76F6BA5731CE3CFEC5B43CDA1412BBA85C053DBDEA18BD2303F3BDDC06893C327F3EBE6760CABD5692BDBC9D3109BC56B14BBD7908703CC528BCBC9078C63C316B1DBD781A10BDB78582BD662D4FBD2A3655BDDF052ABD5F4F06BC4EA07B3D15219CBD7406253BBDA2A7BCEE4548BC24B5A0BD03C662BD2D4E213C75298E3CEA1B16BC008919BC9FCC513CA3D90FBB9E3070BC4AB272BC0D83DE3B7F8A653C6EF07A3C31B1DFBCD6CD3FBB39F42EBDA6611BB984567C3B8ECA12BCFE758DBCCFF6A8BC7FF97EBCC141D4BC402F723A9C79923C0ACACEBCF853283CA90D843DCB5C9BBA06DDC03B75C2A4BB744907BD172CFB3C52547ABD7738DB3C84923E3D17EEC239A38FEABB125D2FBD60D0B43B25DBC13D849E933DAC931E3C66C5FDBD09C78F3C307357BEF82DA0BC474EE3BCE8C31A3B2A9F0ABD4315E5BCA640AE3CDC08BC3CE914943D72C31B3CE5417DBDD0EE833BEBA7AF3C85D559BC2A11CD3C2142C6BDB9B5E0BC8E8E8E3C15ED2ABC4FDD3C3BE363F4B6129151BD2763213D2F3482BC0DF2433C90D3043B39198C3C4F8A9BBC2A24D7BC44CC0ABB5C7E8FBDBC26733CE1A535BC386CCDBB038937BC407A1FBFA597C83C9E5BFA3A7CFA96BE0DBD10BC671D68BD0229133DA5494E3D296414BD62E677BD778303BDAA0371BC4BA4C2BB0F42EB3CE6B2503DB69FC93C742523BE81E2B23C39E0A9BCD9E77F3C5800DF3D0AD10F3D07D9EFBBAF59ABBBA3A4D43B8574D03DD839983BD8E5823DCFA82FBDBFCE663DE2E59B3ED216F43BB758C4BCD543D3B1F0D2533DF3AA1B3C8DDA13BD604D423C15C526BCC8E0E0BDC522033C3FB2F33CF486D03A9E9D473C1904CDBB6E8A233E18FCDC3B7786813C15F661BB63FE85BB33BB34BD5EF2E2BC826F4B3DCEAA263CEEB65CBC45B55E3CE83805BDBF7EC53CD2F3CB3C232180BE3A5A193C923E263C0045B13B0FCA3E3C9E7AE8BDCCF3B5BE4437FE3C705420BC339870BEE306A13B3A99073C4D5B88BAD98404BC95BFB8BC0CC92A3D53B23F3E059DCD3CA72A093D8CCA91BD481E13BD38B11D3B4F0B26BC1E9FE63C2B03023C1898C53CC7A899BBF851EFBD4F32863C03A911BDF3F68BBC3D841B3DF137333DA448A13A8CFAFFBCC53CEB3C613E6DBB614C263C761CD3BDAD01143CC474D4BC3E41BC3B16C02BB
C7B1B923B402D013EECEBA53B697EC53B383E4A3DD1922ABCC9B388BC8F85BD3CFDA593BD258C77BB06640FBEE8E2BABBB2B18ABC7B4C7F3DFCAF4A34FE8127BC958339BEA547513C7C308E3C83BD86BB67B9D73CF30AFEBB9DE0BC3B0BABB2BCDCE9A03B9E87853DAFC0A7BC6621913DFCA5A1BC6FB6283CA0EC29BD344B493C0E8B823CEA08563CD4B82A3AEB01C1BC5735E5BCEFA4D6BCCB79DCBCF53B2E3D14C32EBB97B4CF3CF7ED57BC0E22933D8FB2C1BCC3EC983C1457603A99D7B63CB6D2AABC4FA139BDCE07293EAB790DBCB58F563C67A104BD447EFFBC020B2ABC1DA5ABBC2BF6E63C7C54A23BBC36C9BD474A803B9EDB9BBBAE17DABCF77938BDBE6B29BC381C59BC83A69EB96C12393D367F90BBEA7C213D1314DDBC99FB2EBC1547133CB480053D08551EBAB485AC3D5DB3703CB02A5CBC4F7D4C3DDDE2833C9EF9E2BDCA60453C54DCA5BC086C36BCFFAB673C17F0023D2F6C6D3BD2EFE8BD535531BD6BF6EABC9BBE6338EAC5B5BD6EF1493B8225CB3A683BA4BDE4BA433E3465953DFA1EC9BD8E97C7BBD6A34D3C435505BF8D05BEBBD7C6A73BC56A7BBD578614BE5587513D237C7BBB59A218BECB72C73CCBCD10BDE679123C34FB043D540D193B64FFA83CEE4CAFBCDB96A63C85AA15BEBF77AA3DF6B103BD30700DBBB12D023EA08CDD3B667D0C3C445EAABB7C49E73C3971843D9C56E53C34F9913D1B9903BDD88DEFBCB8B1DB3CE3CBF4BCA1E0CA3B8D45243DABC5E0BB214968BD9537A9BDF8D4D33C3C2F80BCC6035CBA229013BC2967D23C9655A9BB1CF5523CE381923C71654D3CE3DE173CAF708E3DCD33493CFE77A2BB68B0763D25A1923C0EBCF03C32D58BBC03952F3C91BD6A3DE6139DBC68D081BC7E6BD8BCFC4E2FBDF2CB10BDB97683BC6808243D2FD513BD86AA47BD7CA4F1BD6BEE083C40AA613C392C79BDD614A638A14B133DB9E573BC8684D0BBD3F1BA3CB4C4253C7D423FBDC5693F3DADA472BDB7CE73BC9319BC3DF1B0043DA3EE2FBDD502653D9B0CB4BDF1100D3D9A6547BD349E7EBBB78F98B3EF5D1A3EC8AFDD3CEF594FBCE195A03CC1E21FBE7B7525BE73D8563DD066913C19290EBDF21FBF3C6C951CBC453BA73D650514BD74B36A3C8956193C02DD6B3D6ACA8C3C43A6153C233926BD17C0FBBBF39FE53B445457BE91CB8CBA6227FC3C732C3A3D5B62B53DCED5623E7ED9513DE4C1873D7C6F60BDFE193E3D6272683D440F803DAB3F00BB588C8DBD24F39E3CBCF3FB3C6CF30D3DA33EFBBCFFCEF5BD3DE070BCCC04B63C1B1CE2BCA733C53C32E433BD3DB66F3DBFABEE3B89849DBE239C08BD495535BD417FD53D12A020BD665C1D3C8618503D024D4A3DC34045BCB9A9E63D1BF498BB6A4F18BA1D33A83DB4292EBEC8C5DCB
B647F0F3ED2CFCF39709C94BC1C6C013D7AB6C83D3EFA9CBCE4136BBC1C8B343CD716403D39C90E3E4990FEBD69E1453DE835373D7ABA0B3DB13BA53D5D3763BCE8F9833DF276BE3B111A08BDAA22253D50C08637930FEA3C47F3EA3CF6B6D9B9EC00BA3D2CE677BD52153A3CB5E648BD4E0D123CF454023D9D18AE3D4E86A4BC719B36BB158C953C346C8CBC79F305BDD737013DEC64E5BC40B66E3C5DA77CBE82A5A53D3B7C043E98693ABCB6C69F3C87B82B3E58A2AC3C04C3C9BA9B50A4BCDEE2463CEA511A3DBF8A3B3CE95E8C3D2F2609BAF44B723CE206B83BA2158CBC3017373DDE9707BD0D2CB33DDCEC38BD097C0D3D023A16BD4A5E163D983EECBC7C254FBDB0AD4A3D7B0286BC68FEABBD82A8D3BBE2D2C3BB203E2C3C0B0703BEF6995FBCA5D4BD3CDF079D3CBA0735BD9C285EBC1944E93C71B600BC9CC3DD3BE68903BDB012FF3CEECC33BD39A3CE3BC3BBFD3CE18B99BBFF4296BCD6B7833C80A3F23BF1C3333C512F023A0A1696BC41207EBCA43478BCEEA0253D6560A6BCCC6AAA3AD64D4C3CE3B60F3CF9DF713D3E9DB4BC58304DB9E9B488BBD9B6833C5875523C302FAEBB061C513D5BD81ABD8D5B94BB9C51233DC9FBED3AF02127BE3F5D9A3AB316883DEEBC92BDFDF0FD3D694D203C13E72E3D9475BBBCAB19B9BC73E94CBEACF8FEBE5DF02ABCDD1909BDF1F28D3D1675093DC288ECBCBA08B73C0CC51FBD6F8CB83BA558163DE5444C3D5485863CFE87ABBD3465873B846BB3BC377695BD16A8463D7BFCE13CACBF86BCA44E27BDBF2FC93D194874BDEFA74C3BE87A1C3A9B0F3938DF908EBDF3027CBD0BA8F03C643EFD3C0FE585BC37C244BC8BF7F73B73CD76BC10F9DEBC78E0033B61C351BCC1548A3D1773173D99042BBDE099A43DE9B12FBC3FF622BD12522F3C6DED953D25EA04BCA248E4BB9A9A0F3D0A9EC3BB71574A3D0B43233C6FE3E03C3FCB68BC2DB283BC835B0C3E77E7833E625CA4BDBE6DA4BCEDAC203C44E4513C8729A0BC9CD92B3DF0C300BCA631BE3D29070F3CBCB2A5BBF6FD833D264B34BEE5B01ABDC704593EAB8284BC09B04FBDA9FD0ABC9F1DC5B3587821BC336AD4BC125866BE48AC00BD1023553C7319233D3AFB653C0F8FDD3CC59D373D92E0943C151C25BCCD30B4BC2F1372BCD2A471BC545431BC291B8ABC85B0AEBCBF4813BE912ADD3D03491BBB2426ADBD169AA0BBFDBB93BE78DA5FBDE70F183DFAAA00BEEA56B9BB21151FBDA67890BC0DDCC2BA7ABAB0BEE71401BCEE11BEBCD1ACE3BA4E8A783C931AA1BBB51874BBDD57EF3AE393CABB3640793CDC491D3DEA2AEB3CDA50BF3C2C130BBC1FFE72BDF8B0853D5996D33C8A5830BCDFB7833D55031CBFEF77AF3C14014E3D12DDD33C5430863C5D65383EE2C069B
CA31B203E23735FBBDE2B19BB4653B03B4ADFBA3B425C853C3E75C53C2A00583E1AD8D3BC0548BFBC3D4A513B627577BA1E07A4BCF47F54BE4532F5BCFBBBBB3CD388A53C3BA3D0BC8269A43C26C655BD235AEB3AFCAE16BDB6A9D23D1D5427BB47EEA33DBCEF98BE67D1A73629F018BDEAFC8ABDA74982BCEFBED1BBCF745BBBD44DC53C431FB3BBF1B4A83B41935DBE03EC30BCFDA3823CBB1A1E3DE0CC953E49AD013EDF20993CCD48703C503EA9BD8A17C63D7868FABBD9A29A3C8B8606BDA6F587BC0EC92E3C9C9C303C535E3CBB2316B2BCEFAB613D5EEA703D0A60CBBC6B1FB1BC281C5F3D99F9353A5395BEBB84C8203DA34801BEF3C108BDAA3B473D6125B5BC51F997BAFA3E993C1477C03C1B0CE63DA977153C7F1B49BCEABBBE3DE1595C3C7D1DB53CF2ACB23D1437CBBC5356B53C86DA093C0EB1E33A31FF4E3C8385A5BBD4AD5D3E61952FBC4439F13C1C9C69BC9DA2BABDB98CC1BC79C40CBADCC16FBC365368BCC995003D34916DBDECC7E5BDF9AD83BCC2FD2FBD922BC93C9A087DBDDF80313BB7D0C83C34C47439BEFEC63D1DFBDB3DD330983A218D1BBD8EBFF93B3822373EB56B59BC81983EBD9D8966BD62591C3DFE0F1ABD23BD1D3D9FB8703DEEB4B639BE9FDBBC3E2EA93D0BF03FBD0A6B63BDE2458CBBA2162BBE37CFB03D6B4F2EBC04E9BFBDA0016CBD0382BDBC6130D4BC8075C43D9B5FE03CA25A5C3D4AF0233A2FCCE4BC6648CFBCC4043C3D36BF52BD14CE213DCC3D1C3D057D523AA8B454BD46C98D3C6A6DBFBB50BDBCBCBF143A3DA08C2F3D017D93BCD75BAEBC957643BDDF5B06BDD3A859BC9AC9133D1FEB54BC0F041ABD9D441CBA97310CBDD915EBBD3518833C978289BC9CF106BD5292D8BC9D882A3D54963F3DB43D03BD320FA5BB7CE128BC0F1C37BDB8A46EBDBC6B97BCBDE3BB3CAC9C32BDCBC830BDEEB5B13C65C20C3DA759B6BC103194BD4B92023D5F72CE3CD5B80DBEFCAAC1BCADDF853D3068763DFC5C353CEF82F93C72E3ABBCEDC9F7BC0C6BA03C9998F7BD5352183C92A7ACBC82EAAFBE0E63A8BC096CEC3C12BB073DB19F16BE23E1E6BD770D6CB93ADA2EBD5E16663CC11F54BB37C9B63BB3BD103D029D7FB3C7EE943DB49ABABD32BDE8BC686412BE29DB4DBDB15D573CA21E4A3D39209C3BDE57D33BD91A0D3C8A1FEA3A358AF73BF923CDBCE54F803B6727103E368B12BDEE60573C9B7CA7BCDFDA143DC345253CA546EC3BDCD66B3E18F95FB9143669BD2B00843D9D15683BCAB4303DBE6592BDBB8A863C7C3586BD085ADA3CC2931B3B90D0B2BC3B174DBE555A483C4AD0F33DB51F1E3D04AF10B9E7689E3B336B6F3C8243113C599594BCE87DE7BDCE2D91BD4CD091BC6279043DFA30093E80DA69BD6015483CB41482B
927F6A1BAEDFE6FBC3AAEE43CB94BF63D35253D3C824416BC2478043EEE8E74BD0EE8993A6BDC40BEC6A7153DADD655BE952200BD66A870BCF3BF13BDE9594F3CEC12ADBDA25FD0BC2B201F3D2E1AFBBCB822CC3D59C432BC32FA843D60C539BA45DDFF3DDEACD4BD7518B53CDBF9E5BC2158233DB4CE313AFFC2B23D82608B3B8C7C1237D8A71A3C8C0D303C63C9B73D41FE1D3D58A19DBDA60DE63B3D7AA83D50B2E43C436FF7BB19A90EBD9CD43D3D58EC30BDA8E0BC3C307787BCEC6068BD7DCA033DEA7B813C6535DDBDA831CABC58E9AABCAEFB48BD1F73733C887EF3BA530C7EBB316F1FBD8ED71FBACC471F3D889F32BDF4C55FBC7DA27B3CB2810CBD146A223B355C313D6F28193CF4E225BD546304BD6D0C9DBC54D8583D0EEE893DEB59B13C5373213CC48C873D0ECC4FBD8C558CBB5C31183D393A903B691D3B3D3FAD93BBE2C7EEBCB2E7DFBD3731CABC7B51BFBCC644DB3BD80D11BC6EAB103E2E1957BCE3B0AC3CECF067BC5C423A3D47BFD1BC60C96DBAC98884BCF571B23B14B05EBC7D84FF3BC28933BD59951FBDEC1910B9A209B73A5F38C4BD4515D13C044DBBBB7B6FF53C9C2DE53C9E07A33A84B48CBA78340CBC334EB7BCBD4476BDE6DD6C3DAC5FACBC754A2DBD212F96BDC86F07BC160E0FBCFE3C0DBDE3DB91BCB8267C3CF9E0C73BB0AA10BD064777BCA38DB83B23B315BE546D1A3DB2AB58BB6FCB9EBD5F49EF3D6913413C90201EBD5A5BA5BDE940173C48D78EBD144491BD9C4BC6BCAC9FCBBC24F10FBCFABA87BDA4C358BCE0E7223DB00B563C8CFD66BDB310A53CF4321FBDB371A7BD3D30123D7557ED3C11BC9A3D200637BCEB15ED3C1630A7BC3C1395BB6FD7513D06749A3C3AA03EBCF609ECBA3AE039BC419D8ABCA8A8783CCD94F73C60902A3CF581CE3BACC81ABEA30BAD3C85A27C39471A90BCA334E5BDC3A12EB9EA27393D42820C3DAFB1213C80FD6BBD8AD51C3B5E410CBD765F1C3D00C676BD96FDB4BB6041D03C773561BD1884043D5AF2BBBDB74A79BE7BF3823D6DA4C6BC981FBB3B8818AEBC2EF463BC8620483D467191BD8A7E9F3C4550B2BCE55E933D8900AB3D002C8B3DFB99F4BB4947AD3AE273153D596C113B699A12BC0CDD7FBC30633BBB03B5AFBC78AB85BD30C3CEB32E0BD23C03F5FD3C61EA58BC16B8BC3C1E28EE3C6ABA0C3D976D2FBD4A31B83C1453843CB0999FBDF5C32B3C06D6DBBAADD7D83D2B57073CE9F03D3E8C4629BD2ABEE43C9CE68F3B637EB43C85911D3D36DC103CE771063EF2E168BC2165BBBDB9C5063DFC0385BCA7CB3CBDC3985E3D70B65D3D7C508B3D6F2AE63C5897383BA6912FBDB0043CBE78749C3DDEE6F73D83DD813CFBFB58BD65B51ABD096C163D792233BC727C9ABCF23DAC3BF166B7B
D5C75F3BC769C40BE4BBDE43DFE6562BC1738D8BBB0852EBC59C885BD448780BE0FAA94BC9EB747BE7F2FC23A99DC393CEB6CD23D32F6E03DCCBF65390D4F7B3D537DF5BDD2E7D8BDF1093CBC5427123C68FA093D68BB173DDABC263C83958FBCDC1A9FBCC6E49BBBFAD2763CC85E55BC89188F3C31BB50BC98B7243ED3BFDE3D6692A23CF070DABCE1C0353C2BDF573B0A165DBEF01196BAA06B92378AC43139896EEBBC4EE376BDAA80643D4318A0BC3B38653CD01A27BC96B5A03CA5D540BC1C34B0BD5564D6BB9619A63DD5DD1C3AD524EBBB4791103D7C5D9D3C5AF0BF3C80D13EBD3A210E3DC388713DE8BDBBBC2C7DB6BD244B403D1A12DFBD6F3DB03D85EA30BCDAA579BC90AA4E3BFEA737BBB9E5A13C538F533E30E09D39B7F856BC643B8C3BD740313B43570BBC88DF0C3CEEEB673D6E0C9A3D214F433DB5682EBE343C073D6AB5963A40303E3C3A016D3C1AD2E63B30361CBD1E982B3B592524BAB159AC3D899621BDEF3EF2BB6926E03A7E06AEB929D7373CBDC028BCBC7B9CBAAAA871BCCC6A97BC1BD6983C08619CBDF642043C3B60583AEA7694BC9DEDBABC82B9643B60FF95BA8C67573A034026BB21EAE43B119BCEBC57C9263BCCEA013DAAFBF6BABCF0A53B817ADDBA8A8A2ABB411E913BFBDA7FBCFED2B13C0723A63CC037C33B005616B90544183C01B66E3C2031D8BB49306C3BDFF937BBF0C70B3C3196BC3C9AED1E3EA0C94CBB8F428ABB5EB0FCBB999A2C3C4CC1BC3AD382AE3BB43B843CC9A4E63A3CC5633CBB90C33C45D0683C5F45ECBBCD7F6FBDEB4AD5BB2B5AC83C5F59033CFE5E81BBC14DDA3D3E5A5A399C22A73CD8E094BB8225BABCB2A1683A4F7EE53B9F991BBCE30AF5BB38758BBC492B0B3ACDD2B83D5B28343B1FEA48BC0CF0A83CA2EC3DBCC813B0B975AA9EBC54CBD7B850CAC0BC81D43BBC54B7273DA39CA03C4048AE3AFC1919BB201ED03C6B5CD6BC2683943BF5169ABC365938BCAAA2353CB705E7BB6FD7183DC132813C49B88ABC07F6D8BB03130D3C3500A63CF5D0BABCC12457BCEB2B04BC22B8803C2BA0CDBACD7FDDBA000740BB64624BBDB93E00BD670F32BB6C4F7B3B5212D63A6B21E33CC26E61BDF4729E3C6A2B12BC1A9619BC5D05023B9CD79E3AD511AFBB048E063EE68C2F3C54EDA8BC958D6E3CAF8A033C1F71AE3C96B17F330119C03CDB9C1E3CB011C1BC12ED8B3B58A08D3C419B9DBCDFC4003CB7F2AA3C35F251BC6E6312BC2529D43B08613D3B90E7C53A4F36F2BB204A183C500AB33BD3D5B9BC5CA2573DBC9B8A3BDF7F84BBAFAC833C8432A93B2CE1CEBC8743DB3BFE14323BEC7D8C3BB3D127BC7E1197384953FF3B4F033C3C984793BCC27296BCF5738CBCAF8648BC3941C1BCB3FCC4BC09F5A4B832FB113
CF30596BB6BE91BBD5D4D9FBCE9867039DE7A9C3C9C5EE2391E48713B45E799BABDE505BCB2DBE33C30C237BB270E133D47A2893BC6CD87BC716C17BC24E20ABC7A1746BD5A00153C1BD3013CD358D1B91201B0BA80351F3B296709BC89D1CC3BD99ECDBB399EEABDF485C5BA2781C2BBD25393BA79497C3B9C24753C79A1553CD9A864BCA945853A97C46CBC1519E6BBF4F3BDBA66C0B43B0F7B77BBCD62883C6A0AC4BDAEEB6F39878409BC55A34CBCC33BF9B59D69D5BDE0798C3C9FBACF3B6B2778BB1D36EC3B76D9953A59CA9A3C919827BBECB4E43D97BB35BC2FA5B83B06613CBC51B3EABDC02265BDD80A00BA1AF82EBC414B0ABCE36C93BC3679D53C329204BC9461473C583160BB3E6496BB8B6268BC2E2E0E3CBF89853E703642BC4812933A8E2DEBBCD50FDF3C51DE24BC2A358A3A8192113C71FD8D3B00C8643D2B9E06BCBD602E3B6B2BAEBACC128537313A883C462649BC51515DBBDC32F9BBC69814BCA0E4D43C93539C3C9E5B433CCB0A8DBC09ADE63B2FC8433CC4AD5B3C0DB02B3CB40924BBB068D8BB1FD897BEF0F4313C01C88F3C120DA2BCC93ABFBB61DCB43C6FE4F33B4C5E693C4F7888BCD681BEBCD64FC63BDA7CF43CA9C2D93B59C2763CFF82B9BCFE17CB3C51077D3D59BCD63B6B4BF9BB630C74BCF40816BE4B5D93BA3DA3393DB130DFBCDB8879BEFC4D11BB20BC96BD32CB903D10AA23BDD23102BE657232BD39493F3BAA740A3D534C553BF1C504BED49DDDBC1C5D313CF240C1BB5D87333DEA39A4BCD515243DEA9E283CE02FABBD73039EBA300C613D9302003DEA1D023CE19D063BDF8B7BB91C7DEE3D4E890B3C976F05BD8DBA3FBD3DD1DDBC7DAC443C7E29B4BCE33829BEFFAE11BCE8CFBBBBC67BCFBDBDD905BDBF6D83BDA517C3BBEA9D203C1299DE3C36E8A53C714994BA8A77F93D377F373D2CBFF63C5E8F463AC5B9B4BBFD153BBD6989563C63925CBC66D2B63CE6FAACBC401511BD8379813C08B9ABBD636CEBBB5FA07DBD5F38C23C58B1F83C83D0BC3C245EA3BC4FE024BD7973503DCAA7D4BB140707BC2C9A5ABD103F4B3D6297B9BC731D843C3040EDBCCD040ABD2A74663B216C87BCDAA7A5BC7EB7C63B50907DBC0D1ED83C692E1DBDAFA7B83DB91728BC6B5DE8BBF81E2A3E40090FBC6DF1FE3CBC0A8EBB6CD08F3D46A0823D4993E2BC4379DF3CFA9811BDEEBBD93CFAA7903B224B04BD7F514BB20038163C8CC17B3C60A30A3DCA00823A4DE93F3D269EBA3DC86FE9BD841164BEB14179BA6F56A83CB18907BC7B26C1BCA3069EBC8DFE90BBB4E72F3E20564ABDC71393B96F620A3C79F60C3C57F286BDB7250EBDCE16F9BD6B672FBBB1F71A3ECBAB04BD01E7013DF73A3ABC5F02733DE5BD4EBC3E00C9BBE927D93BDD78CDB
BA39C313D0ACF0A3E193863BD3D6F683D49466BBB34399DBCB61244BC915E883D73AF8DBC8423033C282798BD0F19973DEB9D193D01380A3E8921A7BD41DAAF3CB5E3A13BE273B93C6BA69CBC63CC06BE7470383DC4B004BEF8018A3CA8750DBB2539FF3DA77008BD0EAAFBBAB98920BBFE7E88BD71AB913DC996313DF9AE433C91662F3DF6A784BBD0F83E3D8B13C2BA4E61A1BC8DBDCEBCA0E8F6BC897B67BB438962BC8F3F213CD2B8B9BDBFA2203C6E13ECBB7E2E353C3558B83B9D0808BBF5CD623C9BE99E3C526A0E376A5E033C737C22BCC11644BD1ED44E3C97AA7C3C317F10BC86CC5DBD0A7BD93B641494BC80CACF3C6CEDF4BC47961EBE985C46BB3D8E7FBB4F0CB73DFDE74FBDC166A73AABE3613C4239803C260C78BD81724B3D99E0ECBCD7975D3D7DE9293BACC5743DE08FFE3BDD47AE3A63F5B33C381F753C554C2CBB1F1D2E3C9CE9E5BA9A2C99BDBFC9AB3C0C43D33B2518273D60ADB83C126781BC47EB00BE2A71A0BC22AAD03BA65336BCB78503BDF51E963B955AD9BCA53144BD449953BD3207F9BB404ECB3BFF62FA3D46E0ADBC93EB81BB961423BDCFE648BA25BF06BD00746DBC3C8A1F3CE5C2D13DBC68DE3DA7EB9E3DFD6EEE3CE26ACD3B961BC4BC1C1A84BC3598E83C543B0CBDD2AEF93CD39C3EBC59BF84BC09E750BD66206ABEAD620ABC8A2D193E73A620BC17BC8F3D89EC893A85F21A3E994FF6BC105FD63B66F8713D81A2743DE295C0BCF0BF9DBC005CDCBBF0BF0CBC444B083D574EDF3CF4BD94BDC6A8A23DBCD9A0BBC2CC2D3DF98101BB93ACBC3DA17D2CBCA952A2BC13C926BDABD14FBD673B74BDA974043CC0D0AA3CCE85343CBC3AAEBD08FB9D3C82530CBE97793A3D826F77BDC46D7C3CC31AA03C9E95FABCA5BDAE3CD3ACAB3D21B28C3C082DB3BCE5E1223BD284883B32505F3C857A963C517FF0BA399C853ED96737BCCCBBF1BC0F7161BD94A92A3C9E3E353CB5B6803A3761643DEAF840BCE93848BCDA91B43CEEA77DBE664B2BBC102CE43C04BE36BB75BD71BCFB6F183DDB3C46BEF49576BB43C69CBEBC9F14BC9BBFBE3C5CF8FB3BCD661ABD36A3F33B0C2818BD566016BD3F5B9BBDCC7A02BC4B3D253D0EC9DF3D30BD293D4E8AD93C97E44FBB298F933C87AD873C23A5BDBB9895093C8839653B17312F3D49BCBC3CB86190BAA65157BD1F44E63CD173FEBB01C602BD77D9CCBDCEFEB93C0979CBBCC5680A3C7EFD86BD1D1E823C11BD1DBCFB908ABC819CF3B24A40473C570AECBCAF1A95BCF84494B9D447A4BD2562B33D12FF7B3DAC103EBDE428BEBBD1333E3D505DCE3B1B8BD4BC1E00F93C90F2A2BC1E7BC03D2F0C4DBED587863DB7FFAD3C5E079EBDAAFEEB3C636D403C99457CBDDF8E8ABC08D4623D714BB03D32D1B43
BE7A2A53C63E92E3D6E11073DEB07A8BE20CF7A3C1375EEBBA3D8C6BDF670623B0DDC4B3D411EDFBCBD3217BC4FB7AEBCDCB9F83A4E04DABD35B09EBCFAEB3BBC4C41D1BD3CDA0BBD537E02BD4434B8BC8A02A1BBA44AAE3CC1CE53BD8F96BBBC7C48173D9AFBE4BCE0C6A23CC31C19BC42FA8EBCF6F2043C2D7BBE3D1F09653B14CC0BBBDA3B2EBE754E0E3E6524D83CCC2C293ECD64323C4E70BC3CF4B0693C5216B73D8D172B3D0BD0DF3DD209063CF67B83BC1001A6BE9A1DB03D76BC403E1201D2BD2651CFBBDFB7493CFC511B3EF03B9E3BD872D5B75548A13CAD82283D3C5B0037E02F3E3CC3974E3C8FEAE1BC0443843D4B76D1BC651BD43C765F8EBD8203013C1F55DE3CA09134BBCF62403E211FFF3D3B28F93C9C55DC3DCF94C6BC46FBDB3D8F29DF3C5DDF3CBBC85686BE168A493CBE22093E6282A2BC2C80F7BBF93C0A3D9D97C5BC9199F2BA2B72793D83790ABDEC26EC3CAF0927BD4AF5C93BCDB71A3B6F610A3D58EB4ABC53F3D73C4043BB3C8D83AABC7F186FBD6C9408BE034A04BDF2DB2F3DBDDCAF3C58B206BC11655EBC9AC3903CB5AF293D849BB0BD252C413C01FCFB3C7AB17BBC720BD03B9213A73D24E645BD9706A4BBC055FEBC27F0A5BCF90C393D88F1AD3CD55747BDEAE826BAABDD41BC7089B63C61E584BC270C033D1966B03AFCEC023DFFAA25BD26A3BFBDD61A9F3B007D283EBE626E3D3370543C75DF103DF44C65BC74F9EE3D3B073539B504FA3D215FC6BC0A0C943CB00E083D1DF390BDC9246F3CB5CFB9BD485058BDFF61E73B6B5BF73C165D2D3C6235223BEE197ABAD0FF3DBDAF31253A633ED23B1760713D93CA5DBC88077BBB16C022BD645824BE4F15CBBD63DA903C38FA65BD9BA32CBC85423ABD9ED6D03C2B5DE93D20BF603ABD05E1BDC19E4DBE8043F13CD12CF63B4D361ABC05C8E43C53DD9F3CA928B6BCEEB047BD3E1A8E3CBD6D18B9761B613DBFB40BBD25FA473C98A1813CAB47083D0EAABBBD078FC83C14ABA0BABD3AF3397EC5A4BBDBAE923DE8E7713D5D74803D81B07DBD853660BDD9AE303CB8A31CBEECA191BE451DD53AF64886BD09C7F7BBA0DA55BE298181BB914384BB3F43BC3D277184BD3D28D03C3616EBBB568D85BDCC1754BDF1FA83BCA169083DDBF6083EC5DEE53D47A391BC4994A6BC08A38DBDFF1A05BD4327863C3969EF3C546B88BC1C88A33CB7C6253C2E964A3DEAD39D3DFC63593CFFCF753DA325D7BB42E8E9BD18684ABDADD1AD3A198E0CBDED2197BD8906B23B5F2934BD1BE6483B68415F336816923C2732B53B385F2E3DF2F2ABBCCB51DCBA4FF6CD3DD5771C3DE488693D882C093D695F863D792224BC43698EBCF22D2C3D317FB1B8AD9C8EBD561D58BEF2D1A6BC5245AF3BCE37ACBD0792EEB
CC19F103C084A943D223D0E3C5DC8EBBDBE90173DA4C11EBD1BF43CBC8DD5403DA9543CBD0CA40ABE25D8313C049A3D3D3390173C9C3F04BBFA1E51BBD7894CBE76ECFEBACAD50DBC0414D5BC1DC368BDA0F1EABCD418163CF6548ABD18EB1DBE05E56E3BC1EFB3BD14EC3B3C1EC3933B7BFB05BD3382713CD4FEE63C54F01A3D24A601BB18A7A3BDFE8542BCAE29C7BB7843EC3DF140BBBD9CAB1B3BF783933B2197B3BCE3D5A93DCCDD9E3CE1181C3CC05E4F3C31EDC6BDF32DDB3C44A87D3D6E501D3D47FED03CBD80C5BDFA8ED2BD3479363D6124193D36E02ABEBA2323BD2361A63CF71A483DED6541BC5C4AF4BA287B933D6B5EFBB978DDD735D35D0CBCB8BDCFBB72DADD3CD85D973D687C553D1539C93C4C07B6BD8018EF3C9174493C6490043CD720653D31096EBDBA4791BA497F223D6EED4F3C8036A7BB83C810BB83362ABB51C38FBD92C9943B0904203D59F64E3C4974113D5B2A293D8AD7BCBCAC2B4DBBA9B5D4BB4006BA3AC118F33BF9D8C1BC9A062E3CD9FB91BAD0EA5FBC4ABECE3C6024813C624AEE3B1C68523DC892C3BCCA5D3CBEDFDF49BE717BA1BC8AE6BC3BFB92A6BC99A31F3C0E8225BD4F7763BDD1B4DEBC3CD843BCA3631C3C40E944BD2578B43D5E27773C5D5169BDFACC603C2D1BC3BC1530CEBB0901F83C603346BCA80830BD5707373CA09428BC713DD53A2E3F103D9A6A22BDE934A4BC7F5B3A3CCCF4ED3CE9E0ACBCB0BDC43C74B283BCA88B02BE63E00ABD06ADA63D28413C3DDA6158BDB70CEB3AF52395BC7F74F4BC7CB89CBD146417BAB47EC1BC9D20EBBC82C4A6BC2AD6B63CDF94473B4AF996BDC412AC3DBB17D6BC9255013E58522CBDC88CA63C3A60973C37DE73BC6DE067BE23364C3DAEB64EBD3318883DF5F0143D276BC9BC502EEDBDE41BE7BD2D379B3CB1CC09BCF9296CBD1F4369BDF77472BB4BDFE43D62D406BD4716F9BB230F093DA7AF6EBDA24FD4BC77899F3C83EBDABB4C4D92BDF9A57EBD80EABF3BAE07CCBDFFB9AFBD51236D3DABC8F7BC4AD545BCDB799ABB3FCD32BA53B8DC3A1EB4B93DBECF0B3DF63C34BC0E84953DDC37653D2397473A2FE4253C0862AF3C6E02B23C674ADCBD012C57BC1BAFCA3A53492EBDC09BAC3C790D0A3D4ABCF13D7AEA10BDB5123CBD95477ABC9C9237BBBF09003DA2D539BD3505D8BD01C32E3E68C3043CF8D467BD5F7488BBBCCE06BDF1F84BBD0279353D007141BE5EAE27BD699F1D3DBBFFE63CCEBA5A3D30FB543E5F6EF53B01974ABD359F8DBCF5896FBDEC0EACBDA66F4D3D2181433E10D8823D852314BE54B113BC2ABCAE3CCCF416B465C6123E1D3B35BCADD11BBC180E3DBC109A593B76745BBA4A27A2BD891779BDA7D0C0BC7FC885BD14D60D3C9826C63AD6DE84BCDBCF9DB
BAF7E09BE852BA43CCD2FD5BC6F58C13BF8848D3D244108BD2BA0073DA38A81BDAB25543D0F86193E7D8FD2BDED0A77BE5D4D09BE2B6FB33DC8C29D3CB8446ABC2CE30DBEAD5A863DA0AD0C3EF5215CBD75157EBEB7A1463CF0C3AE3C76319E3B2D4C08BE7EF16ABDB1457F3C2012B2BDA0DC393D0D25993CB32100BE839F0D3E49D3423DA42DF53D0C30BF3C78DCBB3CBB8954BC1B64EBBD994D6EBDCD492CBE1809253CF850FBBA504BBF3D765DAB3CD7F096BBA98C5DBD67EFA6BDB2A6843DD35457BDCEF3813C3F0B24BDD23BD83CDB31CF3D334033BDDAE7BB3C2420C1BDBC6578BCE9F1D5BC5CDEE73CD9DE953CC79FBB3C4E9158BD2DD21EBE236618BBFF2A13BE9790EC3AA87FBFBDD665E73C3134063844DFFEBC79F184BD8D2025BD748D2D3C2F5E65BD9ABBD43C6E8FC33CF5B0433B33920FBC87963ABC8CC8A7BD6BF1B1BDD024E0BCAE9F573D20A24ABA81F80E3DBB3BF8BCE34F383EF97C8C3D460E803B018964BCFE2615BDB9C2443D7BEBAABC0AC7283DD446033C44FABD3D6714543C2A320B3E36CF28BDE01974BCF4EAF73B6D80D23C8B62F7BAE541BE3DB3DC923C02D2C93D5682FABC559A14BD7279393CBDF412BD2575A13CF9F83C3DB70EEF3C9D21C2BBBD674FBD9A9CF4BA4774673BE579CB3CC278DF3D1E67113C4561BF3C6005C0BD40E9B9BB2F8615BDCA259A3C33DC7A3CD582DD3C637A3A3C0BE8A1BD8FD5A83DD346B3BB9565F73CC6F5133D2389D73C0D01AB3C6CDD893D8D89A4BCEC93B03BA86BDFBBDEC59F3D97D8EEBC6BB1113C74BF163C0A30433605098B39FCB4073D623EC8BC796E013E3195493CB83729BD25E32C3D8187753DA544123B5FF72DBDEF84543DBA62403E8B8981BC30A593BD52FBEDBC47DBA0BBC7128C3CC9B33CBE96C040BDA19784BCAC1D2E3D9540283DC31A69BC382EB3BC87A342BC3D67EABB71C4783CDA20063DD4AC403C11CBB3BD3C13CE3C594F8CBDADE313BA15DE0F3D669DBB3CA30DDC3C85E6B73CFC6CBD3C873C25BD9FBB88BD8DDAB23DA5319A3C69B32A3E14D5B03CB654013C0F53713BD282D0BB7802163B37A1FEB9C58EA4397CAA19BE2D0FF7BCF4914D3CC30BF23CA37F563CF84186BCFB3B713C841E4DBD381A4D3CAB4CB9BD3696373DF619DF3ADDFDF23C200591BC86551C3EFD17253D49825F3BE2020F3DA84D233CD2CB0FBA5EA252BCF7DB3C3C915920BDE84F7E3D4347D8BCB9278E3B1DA4843BECEB203CF6A929BD0FED83BD1CF0CDBE049A7D3DABEB0DBE1D22173DF85BD63D65090EBE9898F3BCB1286B3D1A9D95BB04B7E73D6090143DBDD9A13D7757953D5433063ED69623BE69D0E2BC62FD2ABDD1BBA7B30B95953D739F1FBC11E20B3D584769B9DC8D88BB5047FFBD4FAC063D285AC43
C1D349ABBD0C1C1BBB9E835BCC41B36BDB2DF05BA2983123DBC2BE53D018F113BC84E47BE83313E3D78D0A03D48FA0B3D3CC3AC3C952C6D3D16AEC93CA3F430BDF50BDC3DA4E7E2BEB4DF47BD4A505ABD105E28BC0550A2BCA00530BE548D903DC995D9BD72FCA63D886AA9BED302B03C81FB8EBCFDCC6E3C69A42D3C8CDB8CBDBE20043D6B5147BE9FCD17BC1ABD30BA52ED48BED7BCAD3D473030BC0931193DEB7C1C3D0C86FB3ADCC1013D8E9E513E3414A6BD28A1003D3350E0BB6F3804BCFD8E353D6EE0683DECE25BBB2E99D1BC2AE4A1BC2A3D4CBD24E1FF3CF3CFABBD85F268BC4F93F6BBF06C373C44BE203C36980C3C3D3CBBBE7433A9BC322CC13C234BE93C7999E2BCBCF610BD402CCABDD58A91BD0BBD003CE8592EBE1E9E8BBB2FEEF4BC23C117BD43331937CD32F4BC9A202ABE62E10CBC96EA59BC809E8CBCF7CB313C3D6F45BC96B691BBC099183CD7A4403CD656CBBC07522A3DDFB331BDDADE67BD0A3F963CBB962FBDEFF7163EB36A12BC2EE09CBB276E8FBBBB27E7BAA03D22BD9F4F0DBDA4B63BBC58590DBD3617BFBBAC41D33BB29DD7BC9023C83DFEAE65BD75C6A7BD0B4BD23947426C3C9707B63CBD72B5BC9F95D83C4278A4BC7B1BAE3D240BB53C9DCEDCBC7FF97DBCBFC6E43A4779443DEA0751BC5258833D9AD18EBC9372B63C8E4A173C8CEE1EBDB8C220BD8536693C3CA7273C540ACEBC3DAFD2BAFD663BBC258FC23B5DD9EC3B1D3752BB887750BC196CE93BF663243D34B8C93C83FD29BC6C8E213C838B53BC73FB4E3B486796BCA574853B6BD6C33B1D56D33BE8F7C8BCE957BABA42EEAB3C7B5F0E3D38D1C13C1B6E03BB7EFAEF3B81D103BCA427ED3C0D0F49BCFFD1C53C7E5B8ABBF484FABC66F23F3C300D233C4651BDBC525F45BCD4E13ABCF748C83C62F75E3C8B4FF33CFA6912BC53F6F0BBD4BE5DBCE95CD7BC87B01B3DC878E13CF138883C3743B8B9107B21BD3ECE453DD4B15D3DE7115EBB1E5D2EBC8087583B848A1F3C5815FFBB152294BBA427833D716607BA8288A73B78B359BBB4772ABC3C4B833C29FE95BA2F96EDBBA8B2453CB60AADBB452B3C3BAE18EFBC9D697D3CA652913B20469C3CE19C76BBA7F902BA70253CBCD97080BB7C2983BC3E4ADDBCE10A9A3DF7F4B83CEF07B13B0BF503BD390EDDBC21FA0ABDE1D8F83CF0C66DBD72FA203C0918BD3C9BBC3DBC2587843C4352E6BB1880D93A069B473C2D5CE6BCA2DD883C67F8973CCD78C5BC3AF7353CA71EF43CB8787D3C7C0722BA6987343A281E6FBD240996BDCA3A05BD21550BBD43FABB3CA488D13CCFE8AE35D2073D3C9522B8BB939B3B3D90CB74BA08FA643CBE7C27BB1084DCBC0AFB093B85CB4BBD1F6C1BBCF2E1613C3BB2113D665732B34792C43B1D73083
CFBAF80BD6EB931BCFD6A413DA490B9BAB57A89BBEDBD20BCDDB14739D3D6CE396815DF3B24B1783BD590D2BC239E6CBB6E4A6FBA294BDBBC424D833949F557BC8E85043D40300EBCCD5467B947DF943B4097A43D5E9289BC4749BF3BAEA9003CE45C74BD50A1AF3BF748343DBE26723C150D7EBC7676513CF84AD4BC0AE230BC439ED2BC6C28C23B68E510BA703D283C6E9E87BC317003BDC8B413BD2C6B92BA5D8E0D3BC98FC8BC75AD773B824C7EBCEB2E043C4476723D8BEBE1BB2EAEBC3DACC69E3B7C88263C82647EBCA18A423DBA6893BDDB48053C11B6853D59DD2FBC426109BBEAD49C3BB19E9EBC09ABC23C66A098BC002A853D44A7F23B842BAEBBB425B2BCA2BA8FBC4C02303D13D2253CA70E3EBD0681DDBC9CBAFCBC7E42033C14295CBCEB8F9A3C7EF7C5BCC45EA33CD85D5DBD61439FBB0014E23CC0CFE53D2B5ECBB693A08E3D86120ABC72F125BCFAC9003CEB3D253BA834CEBAF103173D3E07D539E65AAF3DBA6A87BC26AEB4BB7FB0DFBC2301CABC6AE310BD71019F3BA43D36BDB4DB253C0566383C7DD8303C9767E2BBB982E23CB127B8BC0944AEBBAC08BCBCF5BEC13CC76FC83E32F7F1B8F061C53B096EF8BC804FFABBF3CD0B3C3ACDD43937CE003DA031FC3BC0D397BC5463CFBCB9528A3D1F7BB8BB9FD8503AB7FC413CBD5077BC3FE0D53990D2AEBC6C4AB5BC77E2E3BD94600C3C0DCED53BE46EC4BBA6D8823C86A1713DEB05E63CD0C39F3C4026253A0D3BECBB10ADA83DDED05B3CF1E1093D891103BD0BD8BDBB68183BBD14F5E5B9F7329EBB28494CBC02EF89388178483DDBCAC43BF099C3BCE49C95BD2BDD8EBBBCABACBD9E40853D57CC13BCA1C926BD96A0AC3AFC5D26BD2848F2B8DD659C3DBC5312BDEC9B2ABC2D0800BC1F579ABC573F2ABD9D3E18BE497257BC146416BB5F3FC0B9D5BCE1BC7A876BBCEB9A2ABE20F7B338A4AED6BC66892DBBCBE910BE03948D3D7AE867BC0346C1BDC931D53C8001FD3C2255E83C5BBB093E76F9F53B9D64DCBCF92C7DBD792229BDB0290E3B60AA8DBD167C13BEF17C973DA6512A3D21477DBB558F8DBD86F42A3D589BA3BD0B5230BDB10BA03DB8DFB13CE75CE9BC3D76073C83A41B3C864BB0BC8ED44ABB132F273D5E654E3DD6BFF3BC3F004AB953F014BCDFE160BC4D8000BCBBDD77BC02E774BDF939323CC718C1BD9CD6F83BE58FCA3CA7D04FBC94AB38BE716294BCF51702BD32FB053DB025423D13CAC43C75C259BDA7BCAC3BDE29523D2114C1BC8E01FB3CD3673B3D1CB70E3C087E1CBEE25335BC920BCABD4F26503D735EB4BC842DE33CB31969BC1199F93C66F8B03D00F8F63D224A973C25D0E33C538462BD765E023DE6741E3D0F21193DB1A9CDBD002D403D1C837FBB788F0ABD88C5433
B4386A2BB42C73ABC231289BD19B2C232D3278E3DA479533C712B853CD31AC6BD58D96B3DCDB7CDBC9C649A3DE266B8BD6767663D6E171DBD1681273C7332FCBC6144C23C49BB03BC998421BDEBE0B1BDD2DF3E3CB2D09F3AB2286E3D59991F3D685673BCB7201C3E7E0D753B06FCFF3C70C9AF3D39901EBC87C047BDBE8F4C3D1FE8B63C20A61F3C23734B3DEE35ACBBEC3596BDBC2442BD8660843DDE4CBB3D0534403E013DABBDB0EFC13B79ECD7BCBD044CBBB317E0BB8E8277BD42FBD0BD187C25BC6050163D9F69F53C7AAD643CA00307BCA43074BBAD5815BE47A89EBE45AC993CBF0D2EBECB58F33B091AFF3AAEB00E3EBC79073DE298D339649F2CBD56538FBD97514E3D24F5C83CB17BE2BCC4512F3DDA2F0F3CD89E903BB4AA173C131BE6BC7445DF3C36BAAC3BB8BFB8BD226EF7BCE116033D0300D6BC7BB8B03D07A9AF3CFE9BFCBB6D7C703B151319BB519EA8BC8CC734BCB54E793781C99E3CE86A1CBC4B40A2BD465662BDDE88143DA2C9ED3C78180ABC41F4183B08C4C3BC6569C7BDE7320C3E4C4B5ABCC7ABB13C2C41D03C19CC333DF5D8353D061FE93CD1599BBD6AC8403D24828FBCFF665CBCAF64D43ADB260D3D08B392BC85832D3D13DBB6BC68ED80BC502E11BDEFBC2DBC827F74BC6F3AF13DCA416E39ECB3E7BC5067123C8C380DBD8D1461BCBDF553BC58147BBDB47F57BDF719BC3D4678383C1AFC3E3D890C44BD9D1118BD42DE843CD8AA163B7FFEDFBCAB96D0BBA2241D3CF82A1D3EB1E02BBD412E053C8D0FE93D1DB842BDA5AA623DEC8F9ABDB19EE2BCA391383DC91E083E68E5A6BDAE33833E50C327BD9FF1EABC4AC8273EFD1538BA0062B1BDDA26AEBDB9C2093EB5DE12BD2D27803D04E8D23D5F4AF83D173B81BE7002D8BDB03821BE09A777BBBD2C8E3CB488733DD0D7F93D365CA8BD559FBEBD00998ABDAA8D16BE1F4476BD7CCCB6BD69F2593E2EDF223D2B5B103E41C514BE241C583E6ACE98BDEEEB15BC3C230A3E0F529D3D3BA403BE0A47D53D27378DBE4C53C3BCBF58713D629A233C6C1B443ED7AF05BEF6169A3DEAB88A3BE6F1C43DAAD275BD091836BE78A617BD3500EDBDA99F13BC8266103E73CF8CBDE2434BBE969787BDE698FFBDFBFB6F3C5A6D813DAA18013EB4409A3E679CEBBD844ECA3DD1312F3ECC22A8BDC368EC3CB828ABBB8EE228BE015D2ABE20700F3EFDA955BEDD3155BE1B11A9BD70C5A7BD957599BDD46C41BDF5D930BE4A62E2BD27D7923D9BB60F3E34D3A3BD3414C03DF5484DBE8C6A55BD88EFEFBD070F83BDAA9DF3BD6E49DFBDB2A8213ED473453D6EB3BBBCF793F4BD73A9163EF6BE81BB9D43FB3C9AE599BD441F33BE956F27BED7144F3D1EEAAE3DE67DE4BDE16A1FBE9D2C38BEB128733B22FF233
E601D00BCEF9B1E3E00DF533E7E901EBE902DA2BDC1D5053E255F723D2E29A1BD3B6C8ABDA19945347FE582BE3B4D4E3C8F0008BE9819193C7A68123D354858BE50DB433E456C543E502BDD3DCEFD243DC12FA6BC297D533D79BA89BC0FE1193C960F2B3E2B5745BD6D840A3E019EE6BD0226D6BDBC1FC13C8C5DDEBC4346703D8D40F0BD1206DDBD41B414BE731F8A3EBFFBB13E0EED26BE0691D0BD42AA263E1019473E38103CBECF8887BE5518FA3D2B41053EB6D2F13CC5E02C3C2F747C3D3C0D4A3E9C0E093E4EFF77BDE8AFFEBD967B31BE92CC743C9AFA0C3E429E34BE55D1D2BD9AB987BEDFC5D23D1C3236BE0E6D05BE3BB36ABE2380DBBDB4598C3ED9CE93BD251C90BC300EC83DED0427BEAFE1393C618E9D3DC7FD093E807517BE90A93F3E356423BEE783353D1884A3BD321D8FBEC8F9373CB62D89BD6BBE12BE135A6CBCD8BE943E5CC0813D3EC08C3D025B093E989C7B3E191CA93D5DC834BE5BC4323E70D984BC01AD3B3E6C547DBDDE8423B85B5EE5BD12201D3E42C4303DE342743C982D053DD1073C3C25F190BD06C7503CBD82AEBD5869323EFF67213E77AD543ED618283E5085B4BD5EEBCEBC0A9697BEF4DBB93D3B4A13BDFFBF5CBE159F953D4EE133BE3874AC3D1042C8BD661EA53DBFBE923DE9DECBBC9E8F21BE0FDF32BC64278DBECBE6083D22936F3E02CF48BCD82CAABD31BBCA3C97EC1EBE32610FBE133F06BE4FCE1C3DA000793BAD1A38BE559B013EB90DD83AC28188BE5D91A5BC68D28B3EAFA6E23DFC718A3DFF6106BD56936EBDD0D637BE9D888BBD543D60BD2503943B6170F9BBE80800BC29B89CBCE2CE27BCFE07933B2F7BFFBC977A43BCD80DBBBC76A8FDBAA1EA0BBC3549A2BBA21A89BCEB2D68BA81BA67BC46D385BC1CA4563A4AEEEABBB90E0FBDAFDADCBB9BAD2E3B7AE55DBC02A830BD975DA0B83304DB3C8306EA3BA073D2BC02412D3DCBA7123D6307863CE2DF11BD9B47FF3CD1A57A3BB1A4213B8BD40CBD9DC30CBC86C5A73AFCBAE93C4A1AF6BDD117D6BB248F8CBCDCFFCFBB3E196A3D4CD142BCCD87A73C326E903C414098BAC93384BD2D43A8BD133E05BC1B0C1BBD088058BC8C153F3C5C37083DA8FE8F3B0857F5B9C52DA2BC2AAB9ABA1359A83BF24D11BC52010FBB89E02F3C0FEE863B7482E4BA157F34BB6A55E9BC9BFE3DBB8C3888BC22F799BCD8917EBC7517553C090D4ABBA86188B94AE50ABC527CA1BC1A780BBD138E043CCD1D973C1452143C02DEA1BCAB76213D13EB933C4F57DF3CDE853E3B9F6D0B3DECBC0BBDD6799B3BE97A193CA454A23BC402E43B6D2AF33B58E2473C8B54CF3A71C7F2B92AD9643DB1A333BD1072743C9BEC4F3CD9EAFCBCD1C39D3AD53E05BCE634213ED5478C3E77CE5CBD18FA953
CFB9F483B2B9A3D3D5B75343B91AD073DA1C5CCBCFCDE88BC3E759B3BD2305CBCAE3B0DBDBA8B5CBDFD0B693C4DEC16BC4562B438EC14863B241E2A3CCF4EF5B288F8003D24628E3A875B0EBCADF292BC8A068E3CF1F1803BA244963CB784DBBCC415DEBBEB1084BCD55BA23AA7005F3AAC5CAABCCE4256BC21A827BDC57E673C20FD363DE0B747BD80A401BD42C99C3B95F9AB3CB446C4BD76314EBDFF9BD33BDB99F43B1F82A5BB41DDAE3B957CBC3CC78AD43CB274DFBA8B2939BD299368BAA87C763C581C89392CC5C3BCBD245A3B000F3D3B0EFC4A3CEDCC84BCEC53553CAC115B3C722888BCC9D6FF3C1DC0153C35781F3C632D6E3C568CF5BC6D06DA3C6048D8BB31C988BD1A644A3C148AE9BB21AA5BBBD5A54A3957BBCE3D1FCC75BB707BE73BD174C13CBCB74DBB25E3C4BC93D063BB04DDF93CF902BC3CA92BD83DA617F5BB34FCE1BB180B34BC40BCA23A37DB5CBBB22554BDA174DC3B070C6BBC14A6D4BABF9FA6BBDEE97FBC906801BD08104BBD63211FBB195CCB3DB4D42CBB9CBBFABB856378BD02E46D35CC10013EBF0392BC83CA64BB69842B3DA4F7C63BF24B0E3B0E84183C2C851139486446BD1D199A3AFDD70F3D2AB5843BF527303E8333B33D8F2385BA4DD9923C88178FBCB6AE99BB11A29A3C046F203B7FDEB93B4C5C82BC574610BD8730A1BC9F995F3CA8789D3E0484AFBB5A7A733C6FDC053D33211FBC2DD1163DC13BEE3AA141823CCB4FB13A2EC158BD2825E5BC758FA53CE6CC81BC255E6ABCC21C453D414D41BC7C62F93DE6FF88BBAFBB1FBCBE7D833DB43B87BBE7B5BB3C7B311F3E15AA5F3CBE992F3D119B523C1ED6453C43F8583C234F05BCC9D245BE5FFF903C8189133C348E97BDDB1793BD16FC843BF723CEBBD061B13C491721BC052E543C0FB0533D6008143CFE50D2BCDE59833C7F92283CEC4213BDAACA2ABDDB8E463C70E314BAA8D5953DA9BAC5BD2D8C82BAE0C2C0BD3020FEBCC393E0BD8E3ADD3CA04E1DBDE8EA843D343CD7BDA6215EBDACF35EBD16400C3D2C2B0F3CC928DCBD2DA601BE7EFF72BD29C469BCD21FE0BAD3CDED3DC0EBC13C1DADDF3B459114BC1F30E6BD83F8263D2E1EA63CFF80B83D1539033D65E3953DAEDA8E3C58F3743DC342353CA909A73C2AA018BECF31CC3B10128E3B0E81A03B709980BC1457AE3C2158963D2C4B41BD2A5FAFBC7703043CC273A8BC0652AD3CCE7DBABC857F9CBCF360983C8AEF673CA5398E3CAB3BE83B77CA2EB988CCFBBB1D298A3BB5AA9B3DA3C2EB3B325D873C553803BDDF5110BD833047BC9286B93D7F4527BC72E8213C8B3913BD30E4B0BCFC53EB3B57866DBCBF5CA6BBEE9C5C3DD7917A3CFA395C3D56B035BBD686413D12728DBC1EA6263CF5DA4BBE990EC93A740F853
C82F8AC3C8132EBBBC6F03C3CF2374C3CA72B01BD14A1123CAA66453EAFDA8E39CF84353BCF99103E1FCB3DBD1F310F3D25AFF13CBCBBE1BB48D2143D5DA695BCFCF622BD1D4547BD0B59403B1EB821BD58454DBDA14E813214A8B8BBCC14F6BCAA48033D3F6B9F3D516436BC54931C3D85241A3C5E8E823DF4F50EBD9D76A73C29DA443A11B6153C48B007BC188D0E3DDA483DBE38C21DBC42F638BCFBDF30BD1438753D96F5F13C5DAB3FBCEB78B73DE8FD92BC9B8E8ABDA99C28BDDC50E33C345F393D31CEE2BD2F66603BE59582BDFBD4B23B82F6643CF9DE2CBB42861B3E7364D3BC6931CABD16DEE2BD39E838BD27E2523CD53C8E3D26C72BBDE1920E3C73BD93BDF348A33CE6E26C3C8B33C83A92532FBEEC3190BC11C433BBDE67A2BCB914AF3D81CE93BDEC163DBC558119BEE8B8623C55A7A23A11A6083EE44527BD1107BF3A6F88D5BD0FBA09BE31E01C3EE920803BEC6DD03B00B015BBD89C4C3DD0754ABC4494273C90CE8ABD525BE7BC7610A0BDA0E3A63C4A8E74BC030D1BBEDF3645BE2904FB3C8CDC9FBAF5BE19BD5890D63CB2EFE9BAB11F4F3D9BA878BB1DCC9837D620A03CE1F09ABC1BFA1BBD20B697BD21A84E3D10090CBC20EE91BCAA71883C5384B1BC6E75D23BF4F42F3DA5FB1EBE5F4EF0BBAB9A9EBC0FCC703D7D436B3D5F3EAC3B948F6DBC73947A3DED9747BC51B697BB64B334BD54C03A3DDAFF61BCCE8B203D9C7BC4BA020C02BDA1C5AABC678CEDBC7182563C0197FE3C9AE672B9D82E4BBD185E16BC18B6C73BB2E4B13B2FB2073DFB1D9C3D5767633EFCE301BE69673F3D2ED0F93C6F32FE3A2E52463CBFFCA93BC7D8E2BA34941BBD1EE900BCA674553DA50D023C5758A5BC799B8BBCB0846F3D5499BEBCB8A7AEBD5C43E2BB1C12E13BB3452BBC8F2F9D3BC7319FBBA9E228BC80A81A3C90D728BC1D2E713B93ECEF3C991977BC16FAA6BBA25FE23BBA59D93B4A688DB95A6FD83CBBC106BCE2205ABCAA17C83CD3AD7D3B9808EAB6E913083C4CE495BB248ADDBB2A22C9BC2693663C418080BB4D1481BCAC40D7BC01DDA4BB05063C3B9AB159BC0D8E783BEC3DB13C2DF37FBB135F6D3D39F5303C1D48D23BE1FDF8BB56973CBEE028183D983086BBF8E49F3CBC89503CB4974BBE6DD8F93E543951BCDF522FBDCA54883D2F172BBBDE229DBCB875CF3B442FDFBC7BEB96BDBA3F29BC40B0EE3C1546F9BBB7A965BCF5909C3CCEE7273C6801063D63780ABD4CFB80BC14FC093DD37BB2BED979D5BC5284C4BC95677EBC3D17E7BAF57A8139788902BD46FF7EBD4EAF913CCDD3573C57092D3C1CBE87BC19C20FBCDA9C053D3DBC66BCF98296BC42FFF63A4355E1BD5DAFC33CF7EBB03B470FDA3D50B7DCBBD4A1CA3CA39C833C82544CBCA0DE4D3
CA8F7A23C6C8BE53DA7C995BCD108C53C448FF63C577B56BCCD63A63C99ABB5BC36ED1ABE39EA07BFB891CABC76D4153CE0997E3C68A9F83CE4AF6ABDF508683C2F759ABB5E41A23BCC459ABC14EB92BC2A87193C8B1E5B3D6E952F3DE62CB43DF71403BD41C05A3A79E45CBC441907B47DE241BC09F4A6BB4EC0D3BE22A685BC0734BE3B9DAF303CDB87203DC81F1BBD553B8EBCD32183BC5028B0B9AB95AA3BF5BCBE3BCE28143C0F93C3BCF591FE396CA1753D00C854BD0F87F6BC3595FEBA861C0D3ED23E58BD101EC53D896124BDE75017BCDEDF193D5A991F3CCC2746BCF12CBF3C0CE7313C3857EEBCFA9E873BB87A263D977C113C2D91AC3C0306243D8C49D9BBCE0C583C7F1747BC3829853CC08CCBBCBF33693C2FCBB9BCE6A81BBCFCF31A3D4815433D3BC8EA3B556B023DA068BC3CB1F3DC3ED6AD6B3C7811083BDCF623BB7422913CA4EF1EBEAACAF2BB3CDFCD3B7FD3B1BCFEC1B839A82D8C3C74C2D9BB3621963C424E46BAC4E440BE91BEAE3B6CAF173DD1D57BBBCFDDE53B4F621E3B116A2ABD6079B73C9ADF783C275C5E3C76F67D3C11FAF2BCC89C2DBD2226863A6A31933B20FDABBD9533B1BB12151F3C83911C3E5B9BC2B12CFFAEBBB8E6F0BB7C98AA3BF930823C76F8D33C08EA6BBB7FBEA0BC889299B8CCF36A3D28A2DA3CFAEC6A3C2E40293DA4E090BE7408AFBD6EBB07BB8ABF9DBC06C197BD5CC83C3C588E543AD92AE03C1551A73C0B41F2BB79B5D3BB42DC433B81B9FBBCB0A6203C07313B3C88BE543C5CAC55BC5B5D02BC271B3E3D4F231ABB829E0F3C66AA92BC71D62CBDAAFCBC3C0D29183D3FEF19BD374F453CC459233DD994AD3B9B1F08BE762CB13B456A1FBB91ACF2BE17B4413C916D74BA842292BD93EA763C59B5B23C0A48A7BB1A506BBB6E16DDBC8EEE36BCF5B9923D16C4FEBBF8735F3D420409BD5F8DDE3B1CA3C0BC179B19BC1A006FBCDCE08BBC06E42ABED21BF4BB83F2CE3D5D0FC6BC60D90ABDE9311EBC9E3A9BBCA9EAE53AF57A6BBB0E9734BDE754C1BC45BCBFBD7982DABA8DD09F3CD6E4163A8C22A9BCC1DECCBD043FDBBD9844183CE06B7BBDA8325E3D183F13BD76C40BBE9A692C3C50A3A238F6792EBCA4487B3D51F09DBC59A37DBBD364983D6727B23B06C64CBBD8352E3D6B6CDD3D464874BD98A1073C7AB6B6BDCB0EDBBC79FBF3BCF4BF4E3C602F123E16A0A03C6B7F08BD395D683BB726533D52301DBC600525BD0A7ED6BC5BF7D53C7194BD3C41F5AD3D10B32F3DD92BBD3B1003E53DB02765BCCCD6F93B83104FBD110F8F3A860D3ABD3C07BEBC497083BCF805573A49654EBCE252383D7C868FB819E299BEE4D9033D37A9063DA8CE833D1B58A7BA0EF1743A067A2A3C1DB1AFBBBD9CFC3CC791AF3CC99D57B
CB630A93C7776CA3D297141BC6DF7DF3CECE7C83B565AF9BC63EB413CA68F31BDA620343B1A7021BDF94CE0BDEC18BF3D385463BCE49384BD225A2B3A3BF0DC3C1F418B3C2D49D3BDC4F79FBD1AA48E3B400D263C4503F03DC90653BD1230ED3CE004BA3C2556E63DE011D43BA4B4453B83F24CBC4A2C173CA7C98CBCB6BAB5BC8ABBCC3C9DE044B286D207BEB19AF5BCDF76813DBF5F523D9110B43B947E9CBE48AC0A3BAC68963D8470893D064FC33C95C0A8BA055B0A3CB7D359BC722A803BE7A712BECC00153DBDAEF0BC850EA03CF81F95BC04394EBC74BB37BCDD7796BC99F7B63BD1B6CBBD25092B3D7A3662BD4F1DA03C8E19D1BC79CAF1BAB402F03A9008453C4622213B1A5399BB9DB50EBCC7D1B6BC43671ABDCFF69CBCD5F70C3D60FCF0BB5712AFBC8444BC3CA7267FBB57764D3DFBCCBA3C97561CBCD4DF5EBD561C193C21DCD0BB31D8AD3CC410CA3C201F15BDE32D073EB62AFCBCAE37C03B53CE3FBD643AE6BBB1E0893D04D4323DB33D453B92C2123CA2F358BD77D715BEC9847DBC46833F3CCBB78DBBEEE13ABD5898E93CBCDBFD3C96FEB4BC2B13393D8892DD3C1133883D4A283C3D3196053BDA12F03C133A7CBB7C2FB33B178B51BDBC181CBDCD4B513B7FE408BEF76D8BBD6FEC8DB4B249903CCD7DAABC6BC1C03BA666733D5785503CE08F603BB80436BDD821693B76C081BC89BFD83D469D0BBEE7096A3D129732BDB5959ABC4E8F773C92B3113D54CEE33BF0CD88BC129A07BDE362A4BDEC3658BD38A94ABD4D5C25BC3DD5783C19A3963CA059C6B96FE006BD4F3B053E48FFA53CF3C5243CCAAF963CB82C6EBA53F174BC2307243DB960D43C591A413D54ECFFBBAF837E3D0D700EBCE0AE2C3C1AFBA2BD27E8563C1CE112BC8A8263BCCC4C04BA5F5CFC3C88A79BBCD22E89BBDECA39BDB340553DAA75613DAAF3573A67E053BDB1746D3CE32870BDBA36AA3CD80FA63D220B8B3D69E621BE66593C3DC82B66BD982EF23C87F32B3CB05356BD0A09F2BD01C8853CB6A8353C0DF4FDBD8EC14A3DCDD29DBDD9E8E3BD09E099BC4B129F3DA0099D3D013E2C3E4627423BBB28C33B4A2353BDE9591ABDD674FA3CEE05EE3CD66137BC932D81BC72C61E3EA8A1873C6979C2BD68FC3DBD24A50CBE8D0DA63D2481C0BD8B90863D2ABB353B77AC3C3E367C71BD260EF33C803A17BED1D9BFBC3FE0A33D5B3BF0BCDE86B6BDC0D388BD85718D3CE252E0BBC4AECFBDD87E3FBD55EA6A3D0FB1473CDBFD75BA3C9EBC3DCC83D53C7B19493C009F8EBACA34033ECB022F3D1138DF3CFD4562BDE81C14BDCD5C8DBDF5DD40BEF1BEC13D92A7ABBCB5CA9EBD6C3DACBB974923BC231D083B9F4A8E3DA4DC603DB79A24BD4795C73DDA1B013EFA1F86BCEE673B3CC3C5293
C11B2223E853FB43DA295B03DB4DD84BC31C97EBEE941843A2E0BFFBC3566003ED4A8893D5231673C1AB9B5BC17A81B3DF2893A3D867F23BDD03938BC47EB403EC97BD23D102523BDE9138AB97FA792BD4123193C196C5E3D4CD5523DAA1F80BC1224A83C0C20743DD5EF593D2B50043EF445A5BD2DD3D0BDD53DDEBC372E42BEC35D83BDA1A10B3D5675163D4FAB26BE292419BDA2E60E3C7519AB3C56CF0BB47544A83DBB9FB8BD5F42213D540CEBBDA8EBB5BDB4001E3E832E02BDD59AB0BCE3AE4FBC5AA680BD96C0653C793046BD5B51493D7C8CB03BB52F12BE9F54223D81E479BDEC09333DB0D32BBD506CDCBB8377F13CA00E9ABD9B6A7C3D81D17C3D1374183D8F5C84BD36E218BE0973C2BCFE349B3B9AD0FBBDFE2F15BD685FC73CA51FEA3D5148EB3C1F3675BDB58368BE19D226BC02F668BD8DACD5BD51CA2ABE99B0C0BB818EFB3C149A9C3B6FED5ABDDC648FBDC370DB3C9AF0A0BD0478EA3D16593EBDC074FA3C5E90C33C35E9893DFC612FBC1A5C16BEEBDF863DD27A243CCF44B73D4EC581BC286396BBC84772BE33E08BBD19523A3E8C0217BEFEC84F3D6C0141BDF9A7493DAAAB663D8559B3BDD2CC243D92EC993DE3C3843C407518BE3450BFBDFA2F9EBC9DCA15BEFE2D3BBD5CBE06BDD09B923D0A7F89BD850CAB3B22CAE6BC9F8F253D41A2D8371AB0923C303130BD08182FBD28376BBD3EEC3BBD557A703C690FC43D9D247B3C4C34EB3C57306D3C31BCAB3D19401BBD7C486FBDFC98CA3C2C3E1E3D3C9DAF3D2E9DB8BD4CFC593D4908873DFF4F60BDD561E73C129E5DBDBF97373D4AC282BD6CDC5EBDCE5B38BD81A2753DC1431C3D4B45293D6ECE91BD85DA10BD235FA33B23CF153D90DC29BD04F4863CC4FC043DB0F2273E5E3474BD58DA4F3D3862A13D9D309DBD5AAAF6BB38E61C3DB82BB73C90F063BD2862C9BD2F2D4DBDE69232BC48C4F83CF4932D3D28EDBEBCC62A543D20672D3CA010ADBBE7BA6DBD916AB83B1F34EB3B3524ACBB8BE7993B1D5DF8BC8CD2DD3CFAE73B3C76A968BB9018E23B6D69E03CA938BFBD911213BC5D01583D213E993C59F7F33DA314D23CC017533BADAD90BA7F9BA1BC7F08D7BCC59798357644923C09D958BCCA9891BC9B119BBDADDE2ABD2664073D8A6FDFBB54E1FB3C9D949FBC2517713C3CDE263C13FC093CA6434FBD5D4B543CF32507BD12DB2F3CA965FB3D7AC8C6BBD560313CDA6DDE3D908AD33D9BE050BC1823CBBC60BD30BD34A4A13C3747BEBCE09BC1BC4449DEBBECB9313C58E6773D08F01BBDF49F6C3D697BC93C4AD69E3B790F8F3DDA5F0CBB281EE13C6FF36F3DB281ABBB413E2F3D2DDAB03C3D41BB3C8666063D1AAA9E3C4106F53B6737E33CFB5D65BD2F3DAE3B96F784BAF126DC3C5D9F903
CBEF04E3DEBEDACBC40E6DBBCA92719BB3B4CB53CF54A863C464E8E3D7BBC0B3C2F3221BC56B0BB3BCCE4023DAC47D4BC30A034BC07DF27BED809FCBCB891A03C117F723DB65B9BBDA05895BD3CD2E13AAC1113BD1032383ED4E6D5BC9B180BBE21459B3BD699BD3C03AFEFBA280E05BC90773BBBE797CB3C2D0F9E3C4302823B297CBFBB297C30BDACBD4CBC5CB5913D4FE44EBB559BA63BDA17C4BD21A57C3B6BACD83CD1933FBDBC39233B5CF79B3B46FDAA3A2C30FAB308AA063C2B1C21BC347404BD2466713C5BF1AEBC5887133DD42640BEE148073E89337BBCC482A7BD6C9E9F3C79D514BDEF8D8A3D74DE00BC2AD305BEF2FE9B3C1E7E173DAE06BF3AEE4FA23D79D2053D0FBA5FBC74EDBD3DB54000BDBCF7453D988B50BC43A0F53C751AB7BBE36E0BBD2DE0AF3BBEDA743C315E783B9D6B10BB49FC303DD7401A3E54207B3DE073B4BD6CAA35BBE2C5A2BC9A9E0FBDDC5E113E1C75A3BB88A2743C30281DBD49320CBEFE33103C7392B7BE687717BDB55C8E3CD0B7B9BB199177BBEC96D83C3A58073EE0B72EBC8C8B043D140B063CF8315A3DDED2E23D2CB6F03D78EE98BAFC8CD73C5A52ABBC25E9FD3D1461F3BC753B5DBCC7AE03BD688E96BB46E6C9BBDDCCB53CB3566C3C513512BD161625BC2AC911BD8695DC3B29CD203CBDA599BD68983DBDA498BABB1C19A4BC4D761C3C4AC9BA388A2E8BBE0F3B293D371BF8B676CEB7B87B6D293C350E89BDA9B6F2BD3EEB26BC50A2853BA9C808BDA11F183A032080BB47CC8EBD9FA5183E3F9F463DE63B373C757B6DBC3E807FBD38D6BCBCFD55903BC7F5613C4ACB933C2636C4BCF352DDBA68B3D6BDF47004BEAE9B0FBD74B329BDA8E488BC81D185BB73352B3D7E9F22BC6499F33A2AD516BE18EE0DBB43919B3C195B573D24D8B03C353B7BBCF70FFA3CE176823CCFCF013D7A809DBCB740C8BD7185943BDDDEDCBB360FAF3BB7450F3B36371E38F422093DF59BBA3C76BFBCBC75D65ABE197243BC1356873C542F02BB8807983BE80EEA3D17C3D53BCCC370BC1E7B54BC650CE83B0775F03BBD429F3B8D97453C84FD0BBBD7A6F93B8E4ACA3CC88C46BDA2B285BC8DE70A3D1D6293BC2AA853BC2E5A113DDE8300BDAFA55C3B712316BDE25DB23C98E7F439A4F0893C2B87B03AC869083E2A66413CB116F73B597BBC3C43FB82BC142A39BD82FB63BC32BB203D617DA8BA9E0BD83C23C212BC0704EEBB72D3EABB095D6C3B4EDD8E3D344C89BC548C183C917280BC7D6DEBBC353E2CBC3CC10DBC3057473D50C2C63CE9E76A3A573E183C6DBED9BCF6A6F9BCD18A94BC8DE529BDA27E7B3B505EB33B73DAB5BB81A4363E4D34F5BB8B51EEBC2B4DDF3AA01996BC95CE35BC6D606B3C113E0CBC34C0DABC4CA00F3C0CBA44B
C29A98BBE67D9C43BDA12B13C02858A38BCA226BBC93B8DBD8E48453C1F1E32BD863937BD4E61DABC7FA9963C91E2F73C373E43BDF47C7B3B2B99383D71104BBAA9AC143D92E006BC009A753CA6D5B1BE333B533A0933C33BA519BABC35D8CEBC2E7CA43C6E75233DCF15BB3C690522BCE197C0BD007B70BD15E67D3CE312153B93D2883C1DA285BCEF013C3AFED4ADBB3EC2FF3CAE5F6DB8BC5148BB649A27BE42F03CBC2CA9983D56ABF1BB5ED017BDB7BF103DB073143C283DEEBB61E8C23BE770623BD8BB30BBE489FCBCF11B1EB39E7D0F3C4483E53CB18EAE3BF0FE28BDB03766BCB4BA183D544175BEC9C391BE5A4F8B3B48DCCCBB924A80BBF73DC3BACB1C3CBD86C100BC990CA8BDFA72053DE593683B07D8B0BBBBCB7A3D5BECBA3C2ACAA4BB6EF77BBC1BCC5B3BC525EA3D7C2C9EBCCA7095BACE550BBDDCAA12BDFA2CA6BCF33A933C224137BC2C4D413BCDBBD4BC33EBBEBD136A1BBD4D69C33D3F667E3C43F324BAA4C00B3CBEDBA93DCC85B13B7F0047BCCC982BBD7934AA3B05BD8E3B5CADF83D1593E53CD167AEBC3766B43C8479B4BC467EAD3B812B1F3E354D86B9FEE53B3DAB06EC3B401D5A3BDD589E3DCD9D353C288BDAB9EB589DBD60A4293D9891043EC8522A3D2A70873C2D45A53B3FAA34BCF382F73C5E87C8BB31B7873BB441EA3B5DC57FBC5DA9113DFD5D803CB86C9FB884BB3DBD123E1A3DEFC2623B457DCDBB6054913CD443D8BA5166163C6D0E0BBC8F8796357E9FF2BC1538C33A18B450BCFFF81DBC5E0E54BC289A52BCBE71103CF8917EBAB56F3CBCCB7BFF3C3D21533D3D56E2BDD32B453A93D319BBA2B54EBD241747BBBC7B9FBCE336CB3BF13C513D680741BB8FD2383B7B764DBDFFBFD4BDA5F326BC866C8ABDA7FBB4BCD1396A3D3A3D0CBAFF2CE3BA44129BBA5D54E0BC1037C3B9C998083C4C073B3CD052C7BB140085BC04E3AD3D865D0B3BB9A207BA255AC53D5B07BDBD85F5B4BA8A292CBD7FF8473A104B04BCEDE9F2BC467707BC1462C13BA12768BCA98798BDB008913B5D66BDBBD70F99BB0885643980F880BD9521973CE5D9D8BAF256B13BF815DB3CB98D203DD03501BD98EAF2BCD4D388BAC3F01A3DF2B0D73B1302363D2829A2BC8634F33C873DDDBA5ED7F4BD9CCFEEBCF7D45EBCC3A99E3C8F9D39BB32B38FBD6A1160BA53E693BDF1EF85BCD0E50EBEC6E8E7BD3E4A253D47DBFD3C054588BDB2CA5E3D7540B8BC4D974E3D2FC46EBCE5DDA93C5D893FBD8E7E793C18A01E3D558620BC4F13A4BDFB8CF7BC3E7A5ABC45F183BDDA1B383D93A5993CAFC81CBC775D0A3A5BE270BBFCEB67BD631F87BB0183E33C6AE1CD3BB1D0E3BCAF711DBDA27AC3BD77C1BE3B110939BC1843FFBDE3A4ACBC6C70023CE2ECD2BD9DEE52B
D648007BD5542F93B4313613CC8F044BD919EEE3C78730BBC5C18E03D75BA263DDD20A93CA6338EB9621167BA3B5B8ABD145AE3BB7B7C19BE2CE1143DA1C0143C6386BFBD24A6B3BC503E88BC517BF63C4C7676BD5D105FB9503AA03CC118453CC8DC9EBC0B9835BDEBCD863D4795CD3B7FAEEEBC8D4D79BD3B40293D6BF0FC3BEFAA34BC457C233EC9CBB3BDBEB1AEBD8BCB1D3CA304E7BB7F42F3BB1F63773C4149A3BAB4C425BEA7096CBD420E22BC146650BC35696B3DAAD89B3DA6B6423D8002D13B1B0E09BD3B7C8A3DBD0E583B8DD053BDE49D2CBD5E98853BC982A6BC17DB0CBDF12579B3A75F00BDADD24E3DE266503C4283A93C76EF13BCCC967B3D38633ABD836033BE0E8364BBD13AA8BC47C0C6BBB1AE1DBD7D7288BDD04BC73C781D8A3E26109FBCAAF257BC374A173D848C84BBFA271DBDC3CD46BB5D01B8BC87129F3C12D2E53DF80694BCC8A6A3BB120C703D7098733D85591E3D076A133D366B4CBC090483BC14F64CBB1D0D13BE8548FC3C5153043E4A79D5BD7BD125BDEF9DDC3C8AF8A43C8D6682BC483CFBBC75A532BC4078AFBC045B153D7DB2673CC6CE943D42720C3C0A6D08BCCA13213CCDAB033D7DEB46BEE893273C2E8025BE913FA33C0D2B79BCF121023EC859753DC34DC2BA4571AF3CAC8DC2BD5D3204BE4C749D3C6A31563DCF75733CD0004E3D5AE1AB3D39E20F3DDCCA5FBDCD097ABC647F5CBC8D055D3C42BDB1BAAE147FBC27C70A3E3E01AF3D8A3769B907A0F03C5D6C6A3C4C9B0D371E9DBEBD8F5CA03C1243B136D2598CBC19D0DFBB9BB433BD32DDFD3DB066F13BFEFC55BC2ED6A63B0F47263CEA2D84BCEF9B943BD3345EBB21FCDF3C2F58C2BC8E5FB8BC18BD4A3D0D4EF4BCA3EA003D81B40C3D29F5533BC3840A3D7ACD99BCFA2C0BBD66ABA03C303924BD4408803D9F7A45BB9D48CE3C9816453D344C72BBE43F8B3C96DA0C3EBCC4C7BA11C489BDEF74F83C64203DBACA04333CD779313DD84BEF39681E983DB88F053E424F6ABD994CA1BCFE05E0BACF9D2E3DB93859BC298009BD6F5937BD4F6ABA3CBD0B06BDD178133EFE8C95BC6098CEBC1D9B743703346FBB587F52BDC8B53A3BD9B4C93CCFBD013D2F7A80BD1D26AB3C65284CBC8C65043CE695B6BB0B2D00BC2AC52ABD6B82D0BB810E753C8AFFB0BCDE93E13C02240E3C48C5153DD333483A53DD833CCE29723D7AB3643D955996392489EE3C4C3D2EBCC5E309BD9C68D93C2612C93C8E284CBC190599BCC7EDCC3DD7E46EBC9D53F63C855D4ABC55580CBD23E8B4BDF03F49BD7E1B2A3C5A66B0BAE432533DAA25173B8B9F703CC2FFE0BDE04127BE59DE67BD76EE4B3C83533FBD0CFC8A39D678D8398880AABC1F18883DDCC93CBB4717B1BCF2ED7BBE70B3053DFC34F33
CD887D43C7A734F3C2172FD3CBA73DE3C5E2EA3BC839BE03BCCC6A23C29A4843C2744AD3B6206E93CC14186392CD0DB3BBA41D0BCAEBF2ABA062ADC3A1CEBF33A41E08BBBB2BF123D670D2C3CED8F543CA5DF2DBDA66DC9BCAA4439BD3D1BBEBBFBED6CBEBA8C9FBCFA4F42BEAAF132BB01D318BC14EC513CE766BC3BB299723B445670BD15D920BB8ACF913C833D1BBD734F9CBD95F31E3D4549533C837994BA734D9E3D4E88253CD0CDD83A1AB30E3B21FA673CCCAB7DBC17B3643B32CFA43A63369CBCA51A123C0531A83B96F42E3DDD89EEBB4592663D8BA5D8BBE2E5FE3BECED4CBD1940C83C9BB4F53C7CBAE7BCD9688ABC877A883B644BBBBCED04CB32A77B593DB975B7BDA4EB093CB7D129BEA64A0EBDCBB1FD3CC558343C49B5673EDA30ADBC51F008BB2288B3BAFC3434BC6DA4B23BB31A78BCC4BED4BDE27F7C3C409B173C9249C23C3E09C7BAA991BD3DA7E0ABBCD69EC33D3AEC093C9353A3BE073C8CBC1AF612BC1BDD5DBC742F3DBDDA42A5BB907300BC4FD10A3DC02002BB6420883D0D22B23D349C843DFF8C85BEC69D973D0E2AF5BB7EBF36BD0AABCF3C9FC09CBC6876313CE5FE0CBCFAF22ABBB61C0DBC1A0C68BD86A484BC769E1A3C90FE6DBC81A01CBCC9E6AABD8E2D263DD24A15BDA70E24BC4A67E53B25D9FABB02CC043E0D640ABE5D4B1B3B823A1FBE91B20DBD9977D3BC871C1D3CFDB76D3CFEDF5BBB4E34B5BBA152E0BC1EF070BD9704DCBC302483BC1EC161BE4F7E77BC30B7C2BC77ABA43D2573B43D0680573D2A4F1B3C25C2AC3CBAE07C3C9B1BB839172A823DE0FE853C43ED8235AE81CD3B8506273C3F6C74BCFED31A3D623CB63B09393E3D064799BD96BBBD3C76BBAE3B93EB02BCD8F773BEAC864CBE5B2DF53A4448EB3C297169BC26A3DFBB88E0A13C517538BC18E1343C6845953C4F6AD93C6A74F2BC8675903D0856003CAC66C0BCAA23F9BA83F361BC7DDB36BC31259BBC4854EEBB63D8713DADCC5DBB53BA27BC4429F03A97519ABCF552E8BC7512853D04199B3C0BE980BD32A606BE9857923D8027123D9A0F69BCC95E8C3C2F9422BC596999BC317A39BC92F2933C9160DC3C2E5758BD1EA7563C5BD137BCA496D7BAA9DD043C573B03BEBD4D6DBB293C8D3DE03E053DBE83B4BC4770A3BC54D11ABC2DC262BAB6718F3BFFE529BD69BFCBBD1ADC3DBEE283733D44047EBCEB53B43B6B0DC23CF08557BDC65D993C0E24EB3DCA79873D44DBDF3BFE78023B495529BD8DB155BDE3D4143B9F185C3D847C20BDA5FFEABBA596263D327F0C3C020AE13C642619BE1DEFAFBC0FF31ABC774D3D3DDA04C4BD7678AC3B474EE63A8673CFBDF9590B3DA081B43B912A963D824AF93D0F8316BC17BA8EBD49A7263C55BC85BCEAC84A3D15F63CB
D435881BB0B85C9BCB4310D3D625EB83D9F51D73C7917323DC3539D3C621A25BD24E2033C3827473D7F118E3D9A9CDCBC7FD8B53CD4736E3DF7F883BC535497BDBBC65C3CA00463BDA2013BBD5CBEEDBB0F0607BB755383B935B1E03CF30C413D6582C03CBC78103D545F7B3C1E4E013DA74033BD5C49533C49FE103D8ECA7E3C1982843C6A61863C0FEB333C118DA8BCBA05813B1A66D23D7BCB13BCEA27E83C0DF737BD388005BD2C2529BDEFF5A43982D476BC1B5E2FBEA3BB22BDB89D5DBE739B8A3CEB81283C1129393C9D25033D54918C3C80455C3C3F0CA3BD8DF0C33CABC1553BEE19013D761186BD1ACE88BB85778CBCAD43D93D3407BABD69002B39C40A5F3D6AB7DEBDCA6E72BC2FFB943C24814D3D9B221FB4C7F3483DC644B4BD647B953D82E3DABC66EE0DBD53C822BEC800353DB6B9983D6F4E123DAC7885BDFF5E453AAD1090BC384A183DF0432BBC4835C7BD6B9EC73DF6F04B3BCD98043D7F47A2BB18ABA9BC6BD2703A2F0F10BD93F2CB3CBD5497BD8892F43C0982573A316505BECBB943BC9DD7B73D6232C6BD7A7DC63C4B9223BAB74E463DD25CA13D2CCA283BA28105BDE0EEF9BC9820CC3C3EB1A4BDD7B5A5BD24F59E3C750ECF3BCA4A7D3D81DDAF3D491D65BD34359F3D5F27D1BDF886D73C974A9FBC7494A73CA60018BDCF53083EF166DCBA57387E3D9FE7B1BB190440BCB31ECC3D66AF503DD54D81BBE42E33BC84A114BD31070CBE3944D7BC5875473DB6F83ABD691A003D4A397EBC06953ABE249F2D3D831D033C9EED433DFDEF47BD6FF942BDE349413D6D7D2DBE5553DEBD93F4CFBCA4DA77BC22D00F3DBBD21B3C7D32E1BD648DB53CA7DFEC37CEC9183D901496BCAED006BD71F0A83D74F2C0BD4941903CD22AD63C13184E3C6022A73C2D8D2CBDB0A543BD7A38293C4A322A3C091F163D707E2E3D40A5C53CBFF893BC30B0233D01188D3C901E9D3DB3361B3C7079DDBDD91376BD724E0EBE51B513BDEB7A953B2A24873DED33773C9EDC123D7DCD43BDC04454BDC599253BBDC0803CCF7CB3BC23E8AD3CDD39A13C6ADC503D86C8EF3D31347BB93330B8BDBEE0D8BD5B902A3D2990273D5ED19FBB8B415ABD59AF4EBCB20830BD076AC9BCAB2FBE3CC222ECBDB315903A71137A3C9771E83BE6AAA3BA017384BDFF31E03BC0CC2EBBB393053CF316543CD894A23B0F2C2A3B8954A9BB55478FBBAA0DDABBC914903C293CEB3BDDB4BABC22B5353C4685A8BC56508FBBDAA8413C5FE595BC63E4B33C2CDAFEBCF438AABCC4CE49BAC6B69CBBE78D943B9ABA15BDFFD278BC47CA71BCF5A2463CA5C6C33D988126BC51FD15BCD8622F3D5B125B3C43FBE1BC0BD5023D5C63C83B3694F8BC80414CBB32B04CBD6C31403C968A57BCB8747BBC77123DB
DACE8C33C91C7D2BCE01FC73C4B5B003C2E2F073CB6BE10BC27ABEEBC55430BBB827A66B99011BBBB391C8F3C8BA379BCB320D13AFF1C623E37957138B1BA0CBDBB04FEBC18F1A83B83AE6EBB42704ABC934F4CBDFFC3EEBC03D22CBCAFEACD3B81D4AFBE194B243CC193773CA17F3CBA2152143C742F0CBD469F573DA30467BD27B21FBDAA33CBB85D09C2BBF64706BDE010CDBCE8AA18BBBAD3D53B528B7E3CFA80EC3CCCD49ABA3EF0CEBB6371C8BE0BA7033CD8688F3B2982A03BBF2F2BBD9BE4D93A16661D3D623A0DBCFB19B6BD5334A4BD8916C2BC4BC42E3C6422DBBC24670BBB15016BBC0C3204BC1AD1913C3596CA3CE15B5D3C8A87AF3BD486783DD5EC0BBCDA2C6E3D85B7843C9600323DAA980E3D21E9913BBB6126BB1252853C3476593B4838173CC4154A3D6A06DDB34743293CBA5EFBBADFC3553DC3F07ABD3B9D3CBB2038943D8A5757BE7FAC7EBE0FE9ABBCB8C336BCA520AFBB168EC8BC0665E0BC52E023BC0D51AE3DF02800BC4639A0BBA1E486BCB8338E3D528D843B144FB239B7B1D3BC69BB4EBC0EAB733D27165ABBD2F6503CFFC0B4BC0BA3913C820D12BCD3DB843CAE95B4BA07A3713CCDD910BC190FE3BB40920B3DE4E0943DCBADE83B4046453CAFAF4BBBDB275A3DD07A7ABCAE6ADABAF83B83BDCDD422BD1B52303D7774A33DFD89EBBC3A6DCCBC5CBE393BA5BB893AE69844BCF4FD66BDE9A16EBB4FD2203D3FCA8EBB05D911BB7F20823DCDB42F3C2A82EFBAFDC89FBD1FF0F73C7BC98EBDC84D5D3CB166A53942CED0BCCED5C4BC9AD3023D1986163CAADCF4BA27F2063B3BA109BD1254ED3CD1E114BC5C9EACBB4BEF7ABDD3F50DBCA5F971BC068782BC0985533C407245BB36DB7F3806997ABB7A161037A86E43BBD6F1A3BB1856003DC11F7E3C8904E0BCB747DEBAB8671C3C2149193B0B8B4C3A7325353B111FD73DB1A49EBD07EB31BBC6A976BC631C4E3B73B471BA9B70EB3C3BEA1D3973FF593C026798BCB7C33DBC025D11BD3C53203C7C17A7BCA4F83E3D90A06ABC5443763A735DA3BB7D22163C2A6E43BBB262D43D97B931B9F8B923BDCD1F053D226963398E5CBCBCF61A843D1A77D43C6A970DBC567156BC3B3EA8BD5FB89ABCD5D5D4BC7650A4BCAA5AE0BC3BAE2FBD60AE94BBABC2A73C013BE9BC48DB203EA9D6F6BB8B9A8EBCA0BC163D4791A7B86D8D753D3261E03A1580DA3B065B4DBD10EF7CBD1228FB3B6A78713C22AC493CECEFA7BC0E94B8BC66A9D0BCF099FB3CB07B2C3BC39F8A3D4F60F73BB85FF93C349B2E3DCEFBC0BB31A2153CFFC4233D35B33E3DAFCF9CB8956A4A3D63FEC13CFB7D4EBC8AA04A3C8865693D3F3A9FB9B5E3143D9B1434BD16BCC5BB47B9BB3B7D8CC43B9627D43CB8D77FBC20B2D83BC677933
9FFBCABBB9F717CBD3DF7DE3BFB33C13C780B84BD79AA28BEC89B28BD89A2ABB9D54F94BD422E873A4AEC85BDD8A313BCC2C2C43C6FD883BCD50EBEBCBE1FDCBD74AE3D3C2DC5243C964815BB6D51583D62C12E3D5FF2323D67E83CBD76F505BC8437D5BCED744D3BC5D59BBBB7456B3D8331E53C2FEB933CA116253DC2C108BDEC7C7E3CA190A038C530143CCE585CBC85DB8DBCA012403CD936A23B37D61ABD9BF5B3BCB076BB3D7098BABD5A359F3CAA02D3BDBE58653A2014603D285217BC8F8DF8BA79A6C4BDD7AF293D6BE9613AED46643C02AC29BD351F2ABD781764BB2E89E53C16CBE8BDB089B93DC88A0F3D5A90473CB7F3B4BC7E8E933C7CEE9F3BD4E0D0BA5EB312BBFE8632BD3D0C7EBC295A903CAFED7DBC33DD743BB1749E3DD6EE973C616FB23D7DB689BD3145093C38FA6D3C7ED92DBD483152BC50A5B03CFDE811BDBAC05AB3A91213BDC1A58B3CFE27A5BC60253DBD337A103D0C5A0EBC8DCD1EBE6E3D403ED44264BC3FCB2E3DEDB67ABBDAA6BEBC82566CBC766932BCEEAC23BDC1FB95BBDD89C13CCF36A13C98B3163D676A203C21AE9BBC08A5A4BDECEF98BB368B38BE39773A3DF725B43CD63A513C3D47B6BD03AAE9BCB58E0B3CC5C32ABCECD7253C5C70123C79358BBD0BF25E3DF2BAEFBD3F7D7B3C55C6BF3BAFBCC9BC0E679F3DAAFCCFBC3B618ABC17E6B1BC8F1A163EC18CCFBC8A4E063DA100BA3C3716073D98431D3C9B1FA63C88788EBB664EE93D34FB42BC19393C3D0E66CEBC77F09ABB035BAE3D0E2D03BEAF05D8B935CF203CE2E1B0BC66AA2BBC040EA1BCD48AA13C2C2B2CBCC0EA373C824DD73B8B15423CB5B87FBC45D5B63C220251BDF460B23C4464E1BB827E9EBDAD19993EF02B7B3C07FB6F3CB3BCAE3ABD84B13BC8BBAFBA4F92073ECBC330BC42798CB6498BFE3AA5EAF13AD8F6193DAEBE4D3DB6B713BD8C730A3B0BCF273C27EBDB3C5EA9F23CB5064D3D321606BF7F227FBE4BF97BBBD47F293C1DAD98BDFEDE0B3D7435B63C4A1D65BC5E6E40BD73DCBABCC84C053D05871ABCAD655C3D5DF6FC3CD84AAEBCA6D8AF3B7AF6283C7792EEBBBC3615BC0DB0B33B8D773EBDE026943A4D08833B0427233D129ED53CB425533CC6A7453DF2A125BD129D8BBD311F44BC7EC4C93C4E57C33C4457C0BCF52AD4BBF7E5423B12404B3CDA7B953C322D043D975027BCBFD3F1BD132D7D3CD12C1FBD5D8C823C4F53B4BC538E4CBD6E4E45BEE2952CBC5CF9F2B8EB809B3B0F596ABC16AB39BFE4FE01BD25E1113D3FA3003C8AADD0BA66D59C3B5DD109BDC66EBD3C32DADEBC44C2E13B1A9DFBBC17B3F53B1583BDBCD58C50BA6642863C309F8FB9489AD53AFB50CD3B0F1BAFBBF0F62EBBBF959D3C9403AA3B7BF1B9BCF6D8F4BC3CE6E6B
C9CCE02BC66D126BDB7F09B3B2AA8BCBBFD37883CA5D504BDDF8D243BB731D13C3C83AABCEF0A233E9A521E3D986BF8BCEC5028BD8D1F5ABA312507BEE4AFBEBE7D96D8BC4D4957BCA671533D9045A13C159DD7BB1C82ECBC1A4BBE3BB27CE53DF26E163B985AA53C4525B8BC185A3DBBB377C2BCAB418BBCD37242BD07EBB8BC008D3FBD68D6EE3CEA6C6D3E188F7B3D3032323D97C0DE3CE6AA15BD49516FBA36D4E7BD301FB1BD286551BC02CC8CBB881782BBB08383BC52C1323C32D361BC1F7073BC0C91183C83B1FCBC2B5B583D323018BAFAB4A2BC2D8DB3BD7C58A6BB84A32E3CC1AC2B3D578EA9BCC673A73A6A057E3C0F74263C731D35BDCCA2B5BD9E73023CE7596FBC63D6B8BCC33C0CBD7AB56DBC6009223E94E671BC48B1EE3A7562B93CE1FF243C8346BABCBFECBCBC9FE41A3BAA7EA3BB6488FEBBD12707BD0634C33C3F82093E215A2CBCF94EAD3D5709093E5514A9BC24B4673B2BB35233EA6C25BDACDC9E3C6DE0E8BE4C69E0BCBA22103C57524E3DB1F8183DC7409BBBF1A3ED3C0CE9543D2DC24BBB2B8B17B84FEE0A3C6BB835BCDCAB1A3DA2C76DBC08CCC03C7CC7F0BD0138CD3CDA47F13A71E9603C56ED02BDA7FF1EBF2816FEBC1044823BE68FD5BD46B31E3CDE3C873A74D7B1BB2A286F3C3B52DDBDCBA7FD3CCD51FF3B10E8813CCEDC803D5F8BB4BCB23072BC8BFDE23C0E56493CCC63BA3B6AF441BC7B879BBC30C5F9BB1C06CA3CD01C983D2933293BFF94463CD279B9BC3A067F3CA9B414BE7A974F3C2BC492BC853C543C215E483D4477A63CFF077339C335503DF0949ABC84CED1B9571FF23C56A8183DFAC79DBCF49EA03C2A70AFBE3C244F3B0476D3BCA6FD693B0231073D22C901BD7C5A29BD9E8AFB3CA57A7F3C3F1811BB667B823CF0FC2EBBA3447A3B445798BC956FDEBC4B21763D2FEBA1BA61CC5B3DE7F770BE08BF0537B61B2BBEA1F31DBD9558503B5A28DD3C518AB53CD5DB26BC51D202BB220CDCBB1DF4C5BDF8A5D13CFCBE933D8BF7F23B256ABDBCA3A52BBDA47E55BCA35384BD1991183D2086E4BD11EFEFBC1DBD69BB288FBFBA48C409BD46E749BD9C30953C6DC26DBCEA5EDCBC3BD99A3B54A6933C12F2303DA7F0E63AF92C013D35AEE8B9B7E374BBCF64913C5C6A3EBDAEFC0CBED9F6F63C2A81AE3C2EE6093D6D0BD63C753D46BC225B3EBCA04CBDBB205B823CB4B4093EEE12853A33A6F3BA8443023DDB81C6BCD7205A3C8EA7DCBC55C0053D4CB33F3D6CA36E3C6CCCEDBCD4A8D9BC068093BA52888ABBA75A94BC73BBECBC507C87BD7DF41F3C12AD11B97FFEA8BC834B88BB5D9E4639FFB789BB3C5A4F3D3F9E013DB401383CF8A1883D7D09373CC903BC3C4BF8663C79A868BD64529E3A78DD193DD641193D20A492B
CC8F7503CCBAB073D1DAA033B9B6C06BC0430153CFFECAE3C867B123C0D8B01BE9D775EBD1E99653D869EF4BAE8F84ABE6189AABB7D361BBD6D53B83B602F403D6D30583C6A83D8BB81998FBAF673323C6B2887BDADBF47BD4F933F3C279F09BDB35D4F3DCFF0B1BCBC26643D31A2913C752C90BCDD889DBDF4C5733BF88E57BD4166E13BFF83163CBDA03D3CE2A8983942731F3B2A7B253C4C0413BD8B7F333B726BF7BD2A7B593B8860EABD0927CB3B00B13DBB8DE88C392E54813CD35F0DBD7C20EFBCE2F42D3D4AE34C3DAF6ABD3C50AE83BC11872C3D3C05813DF8BDB63DB8AFFD3C542A393D54E2443B751B98BA9163993DE648B23CD810DF3AB082A13BCAA4023DDEA8B13C10B282BC4E67BD3DFEAB003CE54099B9EF41433DFBBF22BD9AE683BADDEC81BC7BDA053EA9C4493E3828A8BD6BAB283DA4D2773C1850B1BC7666D4BB405F0E3D74409EBB3BED86BBF55A9F3B6F951EBDF1CDED3C0CB73CBE77F24E3D5C48883D5D8EDD39D4921B3B86A23A3A339496B3978DA33C5937883CD1F495BD15B539BC3F27423CDFED663D7364113D1BD1A3BDBC08653C3A40FBBBB11250BA4D693DBCFC9A6E3B350253BC4C9EBCBDBD1102BA2583DC3B30889CBDB97A1FBD7AF6153CD02D383DD17008BE49161CBD0B491FBDC73F17BC3A5087390018973CCAC0F23B104B183DF58EAE3CA5D699BD582B513C5FA1F53CE2665FBD3720733DA824573D683DC5BC168C48B9B5AC60BD8D4EFD3C752455BCD6C956BC4CDD1A3DB2E012BD284D48BD7F4A073D1222A53CD56B653C9F63D9BC16AC273DDA6B2E3D9B23BB3CFC79513C677035BCCA33123EDCFE1ABAF5C27CBD1E6D693C061C3EBB871C573B97DA18BC69EE4F3D5A77663C2230773E34A0403BD837D13C6FB550BCF69039BB82BE823BC8CAD5BD45BE333D5FA944BC4D4ABFBC69856A3BEB3F66BDCB899FBD59778FBD838D153DBA81153EE532663AC314E73A5DD7E1BD4C508E368C0F033E75BCB3BB10D15EBC0AE8BF3CB035BA3C9487D93C950A923C9DE752BABB34A3BDC613943D2623593CCC16A9BC9E5D823E21CD6C3D724FB43C8CB1373C5ED09DBDC09F95BCC69AE23B66E6853DE49CD53C3EBA373C55C5A03BDA84BCBCE47F483B536EAB3DCDC982BCC18BD23C22B0003D3AB221BDC5915D3D6FA3B33AC2F2763B60A8C6BB1EB7B7BD8A1E553DF8BD0B3C19936CBC0BC3053CCD4A213DC3EAA13C5BD20E3EFB77B5BC6018883C784260BD9AC509BD5DF7713CEE9E853D3A889B3C966B5A3D440638BD0054863C53620B3D69280BBCB9DD14BD5EAC20BE76C5493B5EF8D23C631B463B46D2193C9ACB6BBE44617CBC011D62BA3C3A9C3C2730C03C6B350EBBA1A580BCB0E6683D314A83BBC8A04BBB0FE1003D84D2043DD48EA63
909A5D13C696FB1BCDE3F77BA7C4EAF3CABB8063D4E49003DAF09EEBA01CD1FBC4609A5BC120B233C381576BC8208BD3B4079483DAFF614BD3507623BEA66673DAAD22E3DAA08B13D6BE4BABAFC9458BAADA68B3AEB3967BD6C6D4B3DF4BE9CBC41A99CBC626B733C7EA9683C81B034BD0449F13CC5A636BD90ADAE3DA3FBD5BCA5F5DDBC3564253B6AA92FBCFEE9BF3D2C87BA3B6391F4BC9FC783BD118781BCEAEBB5BACC406EBD6B15E4BCD9AEA53C455FA5BB6FF0E83C6BC6D83D5FF1ECBD28E69ABD8C3153BCB9E233BD35949A39A0041C3C9BC468BD7956253D3FE4333C2E26C2BB98B3373C9DD38E3C9A23A33B3821B33CC36F04BDB002D03ADCB969BD033F6D3D97C751BB8404633D400267BB9806BF3943E1A1BD36B4073B54A1C7BC7BACFABBF90801BC19AB8BBA0BE8813C5605A23CDFFC693AC367D2BB91979B3CA60DB0BD52045DBEEF38063D12F3973CC293853CCD2C7CBCFC754DBE54959E3B9DD40E3D58FC183DAEB38CBC3AF2BE3CD7ED8E3DF19E903EF090BE3C6D96EC3CBA28D1BC44CA5A3C95EEEABA2F6E67B36E0D36BD723654BCF8FB33BDDBC9FE3B11190B3DE6CBF13CE6D5873D389928BDD472B73CB679E63C3557933BD1979CBD21B5173A6FF62C3C49E604BD78592FBC0307543DE8BC11BD4D5E4BBD367E64397820173E430369BDA27C62BE85F99DBD797C9CBDA77B433DF3DE4D3DD26105BD532C9E3CEF9DF13CCB3C94BBD0F075BD74FF663C370407BD5F9BB73DFA164E3D5BCFBBBCDA86B7BC5FFDA83BD713763D88F579BC2D040EBB8E4E713DE3650DBD4A0C0FBDA5D833BC483BC83B05E639BC59ABD9BB1CBC7A3EA829103DF458C53C19113ABC944E033D4562B9BD2109B5B95BB9F1BA2F66753C4D2A813A357E183A844C9DBAC22CA33CFE5E013D308800BF709969BC3863F83895CD2A3CB61A013DC88D723CF30CCEBD2F06B13C140A1F3D804C933C6B2B183DE8C5C3BC48B472BD727D543B476A2D3C495A00BEE7E47BBA2CA3613D42013DBE1949A0B6A0FE53BEBD30E2BCCB3FF7BC4B03ADBB51A1053DA2291DBC80C6CD3C9470863B859AC93D64C3343D05751E3D4BD9DC3C06399FBE2D9CEFBDDC7D4BBADDA17E3C5EB6193DD0EA03BE192B9EBA979B4B3DBDA52D3C34662C3C814C57BCF3A6223D3A71B83CAFA75C3CDC262ABD570F9DBB0E6AC1BD8099593C381B4E3D93E6FCBA1270FA3A47CA4CBCD6FB283AFBEBD6BC75B443BCA547BE3A5DB9AB3C38B3933CCA4A4B3DD54B4FBDDD7581BD6946F93B4F31DDBC9F9A30BAA4185C3C5B9E21BDB26F68BC1AE03D3C084624BD337E8ABC3D98013CBBD9A7BAB3CB63BC3FEFAC3CC9A0D03BFCF7563BF6B7AEBC9D76CF3CEDC420BD393F683B03064ABCB253A3BCE61258BC5EA4FEBBAE3B5AB
CF54806BCB3DFF939CE73E0BB329907BDA529FF399291013DC755DF3A89B4B83C95F7CBBAFD04AD3C0544A1BBB2F7A13C8D7E293BCD9B993CDC54B53BBBC284BCA243243C8D04AA3CBC74D8BC1D0DA13C30AC14BCD7605E3C31C42EBCD392EC3D8E86E9BB3D5142BC2920D2BB2C1A81BCDF4CA93C73B3D4BA14A0A03C2286C63AC44DC6BCD2EDA53D5B2BFDBC3FFE05BC857693BC94DF75BC7A8B86BAD08D49BBC962B2BCAB75DC3D6272143B2FE44C3C801870BCF2553ABC3876823B63F984BCD02D903A2ED332BC51AB2EBB5C5589BBD51C013E14FDA73C0F4763BC220DCA3C1473A7BCA74006BA93BC44BCB767A93BB69517BDE67787BB1558303C5572783C3034053CC4110BBD837A1DBCF76E8BBCD55FF0BCC3A8C4BC0919E2BC4B175B3C1FCB17BDFBDB413D5C64473CA3C53BBC8646DB3BDEF6C8BC62F3953AD44D09BD2F78ABBC9F0BC2BA6AA9083D4FEAFFBB8208A93BEE5101BC4A9595BD859B22BD35209F3CDF9FF7BC606D20BDFCF5F23C6F9F8BBDFC5F11BD9B299D3B07B77EBC04F141BB0B3719BCCEB5F9BBAA7D303E7BC529BBC56303BD7AF650BAA6EF833B53D4D33C0B958932C64A063D697B27BB35D44FBD9B7EB33BBBD6F4BC31B706BD4F91443B37428C3B94C46CBCDED4A0BC8FC8073C40A8CEBB67BEA5BBA09669BC96CEA93C8A0F963C70C91DBD39CC843D6AE0B43C6B1885BCD56F863C1657BEBBF8B3C43C91C89ABCEB28973C410E2ABCBF4DE7BB47379BBC165C9B3C600B28BCF6032DBD2753F8BCA3487BBCF91BFABB16C7E8BC0E0DD9BC35E7A1BB067D5D3BFF94F1BCA3C32BBDC7C198BC01DF21BBFCD17B3C5A9F143B1276533CA0C28B3CB8FB4DBC0F50FB3B66EDBEBA9B643B3D1D6805BC4341003D91CB9ABA9728593C07AE95BDD5D5253C1CAE333DCEA96ABCC3B1CDBA4DFE1BBC5F3F303CE1A0C6BC5B86F1B935A7A9BC0F3089BB561C4B3B5A9A633DE08B62BCC361433D4E5D4B3CA82613BD66FF9CBCBA722ABD5A723D3D9EE1A83A05877FBCBED9693C36A2003B4A4890BD99D6BEBA10361B3CEC99E43C0913E4B52BA9F6BC69089E3B75B63EBC9820E53C61C8333C7C3BD2BB7AF3EE3B3A5B78BBB8D7CC3D45E71ABBFA91BDBB679CB8BCBA42C9BD6AFE4CBD9D2AE0BBD6A43EBC1699C7BCA5483E3C1D5EE53B169C63392CE3B03CB7B4E7BC6ACA52BC262B08BDA6BF92BC144A473E8AA7323CDADBBEBB9F8E41BD8C4002BC0CC54ABD6334C0395751FA3C1EFF583A73D3FF3C88C562BCF7F6453C91BC023DCAC2A8BC4B1F9EBC9791B4BC418680BC82B966BBB603253A160964BCE716EBBB9CFA733C01D260BC7142983C35AB7ABC102988BB2A34303C7EABB93DB6EF02BDC26C163E05AD70BD939D95BBE194B03C7F8BC13C841C1C3B77BBC73
D77DBEBBCDBAB0DBDC64B0C3CA18D4E3B6816EDBDACE333BD28C4D53C1131FDBBFBBA913DB0BAEE3CEA8B893D211A85BD3D6205BCE7099ABDF4F248BBE2F3E73DAB93E839A87EE93C482CA9BC940485BCDB39E6BD160A60BE8D573EBDA19A89BDAEC8CA3D7B57353DCA1AB83C7578B8BDFA616A3DAC59AB3CA91BC6BB32FEED3D76A7593D2A6F7EBDFBD2893CEC8F54BD3BF79FBD3586623D2ABCC23C42D7933D5474B0BD179F8F3D2771D2BCEAFCA53D776235BEDFCB49BD73DAC43BAA55F1BCB47ABFBC04E5093E847203BD2DD1D2BD147A38BDF3D8323DD0D7593C4B76533C1E730E3DF01EDD3DF1C5B3BC825E623DE647883C8121B1BDEB192E3CA3AC05BBE19779BDBF0CB1BD2451CE3D0C050DBE87531EBEA84147BD578D363D74C11DBE806F5EBDB38E61BD95F9ACBDCEFD103DA8DD92BCDB1396BC3CBEAA3CD19385BD8CE11DBE655A23BD3A25E5BDCFD48BBDB32C55BDC941733D8DFD2B3DEBF047BDCE2508BD69C15D3D513DFC3BD0EE98BC980EBFBA6E6680BD0E68ABBDDCF2653DD202163EAC07DDBC0E056FBD9020D3BDF683543BC41FB83D8C35E4BBCEA7F93C4EDC8C3DE65C23BDCE40023DDE470B3C3C7E1A3CD8B4C3BC83F15CBDD9FE5D3431EE43BEEB8ACF3CEF5F82BD99BC453D821A1D3D10730CBE2A54E73D1AA2853DE15BA03CE5E962BDE0FB95BC8CD1DD3C3DC69DBC6226383CCC9845BE1BF548BE1F936C3DB6E990BD778739BE14C8573D08B3873D468438BD34CA9CBD39E91D3D79B994BD420F8C3D8460F13D875FA0BCE0BC0EBD03B8EA3CE312B33DEE945DBD4E069CBD667360BD40ED283D4954DE3DCF45063DDC4F903CCDDECA3D779FC53C28F005BD6ACD1FBDFB09BBBDCCD02DBD7263043D3071B7BDFB5C8C3B6E3C2EBD9ABA123DC82C92BD36C6AF3D8FB035BE05DB25BC6173B2BD3D1734BD000203BDC4C0D83DD0C5C7BC51BB003C9B94953D05B9153E1FB14CBCC444BE3D2B6CE6BDFC46063C6A9026BDF65031BE521F6E3DAE240FBD6AEE0EBB80EC203CD50AA03DD01C623DC05F0CBC194ABB3DFF89033EC3E4293D84CF763D5F407A3DEEE5A9BB650D1A3E8F0E7EBC83948CB77AF7003A59E2EE3C869F0C3D8513283D2E3A103D8B48003DDF03A7BD16E6D4BB799B42BC7B1AD73D23D81A3E6025E33DCAC6183DE7C135BDE87096BD7B499EBD2169843D431F0A3C524E2BBE9D57033D658EB2BDDC3FF33C2E60B6BD083D863D3B32583DAC671D3DD096E4BDA7BC22BC4763D2BD4472003D10B4113EBCD9E5BB39E6AABDE92F2A3D348534BD921DA0BD9286A9BDAB44FDBCDF09663DB4176A3D9317B23DDC2DF13C1922C9BDA94645BB9D51073E6598A83D019C213CC83A38BDE7F0D3BC8675EBBC20EFB7BD86CCF4BC6C54B1BCF97619BCE9A652B
D6B1CFB3C8FF58EBC5D3B8C3B1A99CCBC6E37A1BC1D1FBCBCE615223CB7B4AEBC964892BCCE745EBAD4BEB4BA296FC0BC2F9FD7BC5CC8213CC2BC8ABB208604BDA384033C45F02139300E0D3CC579BBBA7D9D64B93BAB1A3D3649163CF85705BA713FD33BB5A5E53CC04BDA3B150AA8BCD5321A3C200EC43B1519E1BCC0FF8EBD5700ACBCD2F5033DE5A111BD475AF9BD4A023CBC74B3CEBC214576BCBE16023D5533DB3C5235CF3C98579B3A19146B3B6C1D96BD10C958BD4A9E3FBD7D36C7BC9714933CF923A83C4491FF3CDAF84FBA8C1CA9BB670D8E3C59AE303B78629F3BC78DD7BC336CF7BC9BC8F23CAB8507BDC61E2BBCCDB581BC67846BBD00A5E6BCAAED08BC702E1E3D5868C0BCD7A5C23CE093C6BCD0A6E2B9D53766BC94CA443B6BF861BDAEF91ABC35395D3BE2E9913C21789A3C0632673C29A949BCB1F24A3DC893A73A9A8D49BCB02CCFBC68C7EDBA963F05BC08F85E3CC2C5483C1C7E1F3D6AA91D3D3BCE6EBC4B7308BDEDFB1C3D1BFA39BD533F3FBC034C193D961AC7BCCDEF553B7F2B6CBC47C1C93D99AD803E2D9643BD2E9622BC36CCA5BD6A61713D91DFAD3A284D18BB77C14DBCBD80F5BC25FEAE3AE72E2BBDA2ABA6BB8DF782BD0C4397BC577586BCB4BF7FBCAA62DEBAEC66ED3CE4813CB3B750DDBA52F05B3CA9AB3D3B86CAADBB49558ABD2DF16D3C7527A5BB7DA05FBD647ACB3B9E541ABC7563EA3B1348CC3B59D2C5BC0C88DEBC1878723C7158023D1E09283DFF4AFABC3C04A13BFAAE1EBCD3624D3DF335A4BDF807283D26C1F9BCB2A70F3DF5D6E9BC5AE4A2BC45EBDA3C3357283CE4CFD3BC7F5AB2BC18819D3CA98F203D41FA9E3C7C42EB3C2B5EA5BB7E7AA2B92A8FDA3C03C154BD71E88A3C5727BDBBCB2290BCACCAE63C8249F53C324508BB9003C63C985ADBBC6C89853CFA461C3B8506B8BB3FB6D63C2B36573D7E93CA3CD81CA23C8B037C3DC198F63BB80B763B2AB6BA3B872B7FBBABBCD8BCD9826D3C231081BBD07B303CFBA9263E8153673B3A1D64BB6CD93F3D891400BD0F964A3D6B0341BDE79219BDB32408BDAD73F0BCA9987F3D52A9063BBE1E23BDD7E53D3AC0FA5FBCBE7AF73D98F04ABA8985823BE592E13B1DAF0B360D410F3ECB7C47BC46F8123BB9F4913DA996D63CD1DC24BB60B0123C727780BB5610ABBC40CEEB3BD906003C17B9C4BCE6B3213E826EBA3D3CDEAA3B987A863DB3F326BDAFD7063D8043933CCD130E3C3C74F83BF198D5BC3C25CCBCC41C91BC871F27BD5C3E803EF2183BBBB23EDABAF003643C49609EBBD2B6A53B10510B3B6FC81D3DFD0B29BBD74893BD3392C2BC990A3C3C79B7323D55FF29BD6DA8EABBEAA4A4BC3412043E328F96BA895342BB4F54813CC2F9B4BC5D99033D1139D73D8F84C33
CBF2D84BBAE24223B5E0B083BCEDBD03CAF8CE8BBC083533E9EDE183C85895DBB2E849BBDE43E0DBE340831BB87F4883C123F14BCB04C04BB941DBD3C582C3E3DA0A21BBDEDE3C8BB4923F93CD2F2BD3C3419D7BD1C8DA53CD48A003D89A4603BEFC2093EA821F83D54DA3AB9B0A75EBDC13E3F3C7A21F43DA8D972BDC894123CAB88ACBC1987003DC855D5BD7C585ABCABDDDB3CAC1C2DBCE3668EBCFA09AA3D6225833D0002E53CC2F22C3AE4360FBE79CBCDBA66B0AC3C36E4DFBD082CECBD527FC83BBF9D88BC91E1B2BDE6AF4E3CC484993C515D043C5E1D163DD1A10FBCB11B8D3DC85E48BEC84B153D4C77373C1376A939028D6EBDA176833BF8D6BC3DBA8198BCACA8F5BB2B0F0ABDE1A3933CB245ED3BBEDCD1BC50F810BC27A88B3B0FB72C3DBE064FBD9683563C8F847F3AA90D85BC1524BFBD185C0A3DD882573C0234F3B9E4D73BBD2E10923D8C71B43B7881A03CB138FB3B2FAB7F3D75CAE93C9F281ABD742351BC611B943B93FE503B6522F33D195F07BD2C76843CF25A34BCA2A3063902B942BC4B4260BBF076C73DE486183DE34F4F3D06A4C4BB51634ABDB98BAC3C5C73DF3B6AE2643C2DF30C3D44A5FEBD3C729BBCD8A5B5BCDF7B14BE3358B93CE4BD643D05931B3DFD63FDBDD32335BE01E364BBB885EB3C914B13BD205B073C0A21463B083B393BC799C5B3EE5C64BCC20A3EBC03EBB1BC9F5DBEBC6281043D3D30843DC84A383D2864633D11F908BDCFD7D33DFDEAB0BBF329133C07597F3CA4BBD43ACF201C3E3BAD6ABDCAFF97BC6EDECDBCBF8E17BDBC1AD03AB8A9453C04234F3DEC10DB3CFECBEEBDDF0063BDFFED003BDADFC23C34901BBED259A7BC6ABDD8BC12FE493DB26683BCA23416BB2A4983BEA553733C81A2253D12AE50BC55C844BC614C9B3C6CFD01BDECC1ABBC17A38DBA2440A8BD4CE267BD425E873CD9F4383CB607033E6BC284BCDBCF6C3C88A0B9BB46A01D3D3FB9B03D1EE6EE3C28A0BB3C314FFA3C3D6DF7BB6359F83D7649A9BD5A39C739015DD4BD53EC78BCA52A2CBE97971FBD1E640C3B137A07BD593520BC0A2E383D3685D63CDAF644BDBCF90B3C8F8B69BD95EBAF3C5980183DFDBB1FBEE6425E3E327FF8BDCF5CB4BA3DD1A6BD882F253CFE8183BA7455D83DC75CD33BDB36CC367BF8883BBC06D0BCBB6A883D39F907BEF05964BD07FDC7BB6D90973DE5700D3D675DE2BB458A1ABC955201BE87232ABE883C833CE04C04BD92B301BD2E96FA3CA98B543CC511A53CF11A36BDAFA598BD282CB9BC4FF7C43CC70E503DD2CC923D6E0B74BCB41FA4BC4ADC913C61F9313CB468B43C08C0CABB738D80BD8E779A39AC369E3C351C823B334CAF3BA9528A3B553A823C67AE19BD5CBC2B3DD226A23C54E54DBC23356E3CC77E88B
CD7F4C8BD804FB93B6176B5B93922753DAEEE533C2D55B3393C4CB2BDEE7439BCFC3818BDF6E0B3BD8FCA7C3C55D46DBC6A4B5F3E198D223CA94EC9BC112BC5BA69F7A3BCEF23843AEA1A383CBF2EF5BCC7A924BB8A5AE6BC28ACDF3B7D3E68BA68FF193C2FD0D73C92D34C3CEBCD3ABD85EBF2BCE5998FBCEEE0763DC8858F3C5AB0BE396EF3803C95B984BBFE95D03BDD0629BC4C8B483C97705ABB90A0DFBA5019113C01A7A53B3E5692BB7566AFBC0B4431BC3AE029BC85A79B3B06CDF4BD911E5F3CD623A83DDCC08BBDCE0948BE0F9B593B18EA0A3C8797A73BEABBBF3A88A2B4BD2BE48B3E0D8CC23C8F773EBD98559C3DB8C9723CEA8CFFBC725E943B7C68C5BC9DE1F6BD1F49C7BC4BE3323D10968ABCDEC46DBD0A5656BC297C6A3B34DF08BC3847213DDE57F2BB9DCF77BC698615BFA902B63D68F0953CFC90FBBC34BA6ABB3FF1EDB99569E2BD063080BC30942F3DEC85FB3BAB47003B88CD5C3B22CCF93C7188423BD7F8C8BC5ABB80BDE05E4A3C4C684EBE3AEBFDBB1608D23AF44E0DBE64FD533C490882BC21DA8E3C77915C3D9F5832BD9A3483BBD10DD4BD445AEDBB53317ABDA790463D2447BABBC4CD813B29A7C13C871E8F3C96E28FBEAF45ACBD243BF6BC7B4FF73C04568DBC843309B97197303AD55B413C1EF6513C86B93F3CB905063CD88B013D2E5BCFBED40769BC1FC0063ECD71AB3D711EB83C6F3E78BC80D41EB37493E3BC3B22FFBCAD100BBF1E51F53ACED5763B2D155E3D6C1DDF3CEB27E13CD932BE3B38D5E73B23975ABC66A7E93D696A01BDA1DA2D3B539BAB3C36C7F3BC60A679BDAA860BBE9D17F03DEFBCC5BCC8938CBD7531403DD43D2F3D8D6661BDEE15443DE844ADBD0F1504BD82D128BD4102B4BCF747623B6B1F51BD11F6D73CF0066DBD478ABC3CDE66BCBC7CC441BDDB1A413BC41A023C67B30BBD7F9F90BC1EEC7FBB0BA9283CB4C1D03CD378A43CC07494BB95060ABD2740F53C01058FBB270D993DB312593DA5FA853C215E7A3DDA51933C5C57C43CF0BB953CD70D21BCA6FDCA3DAD88E0B8E949ED39E512023DE292343C62460B3C8FEE103CF8C0B73DAD6291BC1825453CA445B13C7E2C0C3D74EA02BB15F5363D9B711ABD911D273B37163EBD749D10BC197E6F3DAFB977BB1FD142BD72E1DFBC96258FBDA311ACBBA164B53D8026543C346653330B0B3C3D62F413BD623C3F3B6B215CBC1DCD34BB0F629A3C406BB5BA0BED463C6F69DBBD52621EBDBB8F103D14918A3BF1EF4CBDF84D633D7796C33C163816BDC1E28C3D768E873E66B6DEBC9B5E413C5379E13A615B5FBD6E6BB23CA6AFBBBCF938FD3C6F9E653BA3362D3DDCCA613D2A661ABDC4CA2FBC4CC3E03C9B8D4BB5D1AD293CC4B3373CDC6704BE7A07C3BD2C9C793
D6169A8BC6A0E153B19E25D3D600A103C44D9ECBC16EFE13CA3DAE3BB942D16BF63D3B1BC868538BC2D6960BBF59E9F3C633E19BCB7DC333C36D75B3A8226033D3EBBD03848471C3E6937CE3B8B034D3CE3CD67BD1BF248BD1537193C29F799BCA9BE89BC4A8204BC7743663DB792F7BC4C082B3D5E3AE9BC849C74BC7B515BBA718685BCA6F2973DB9C408BC86ED033D1F1CA4BC7E43BE3DD7AE753A8BD37D3DF4F3593C4B4F0F3E79C92FBDD6F2843D017D9C3C421E153D23AAACBDC0DF253D22855B3B7579B1BBFB6EC0BC9BF3983DB57176BDBA2F9F3A0FD7FBBA37809BBDE663EBBCBB01CF3CC51EFBBC7CCF2CBDA61870BD86BA3DBDD39FAB3DBCDC533B1B66BCBD96A1D03B22F4C43D0A399F3BEBEC9C3C65C816BDDF5484BD73EF123D68AEA1BBE3C739BE780E163DB9A0723C1D8B513CE67226BBBD70573C768A0D3C7D77CBBC204C573DA2F2633CCFE9033DB9E7FB3DCC79BCBCFD171F3AB58307BB88E98C3C8FD2CCBDC880F93B6A7F53BCAE6B91BC0BED82BCEB5A0A3D5E0A863B5DE8E8BD3F7DEE3CFCBE0FBD367B343BCD2E153DE903F4BCBE0295BA44C26EBC075B16BE74179C3B98E6D03C2F886EBD154613BE388E35BCD624CB3C532D4F3D173E0B3D2DA8A93BC622D83BEF67833DA9323C3C09F9593CA5FC4FBB70204DBDBF7AFFBD4A729D3BCD7DF93CDE9C94BEB5B38D3C919E803D824B9A3C0A0F90BD4EE203BEAD205C3CABFB1DBD606EEBBA3FE9213C8B0CABBA01FA0C3D1FE3D1B31CF324BCBAE2B4BB37A850BC252AC9BCD0B00FBE2CD9563DB57D1DBD60FC23BE0DE3C63CDACAA93DC11186BAF16ACF3C0041D1392F17803B5AF9353DD42F813DEFA11ABBA7103A3CE736893B1EC5063D910776BCCA85CABDBB1BE93CA4C70E3EFBDAFBBC6668773B1FCC833C3590FEBC9BABDE3B56DEB5BC5AC8AABC63A7AC3C18BF563D92672CBE3250A3BDBA2CEE3DA0729EBB11D2223C2AD437BDE370D33CAAEA203C7F607A3C4CD98BBD82F1FAB9D5ECB23B0F30B73D0826913D60BBB2BBFAEBB6BCFFDD843C652A093D4527AD3D64DB18BC164ABF3DF9D4273CFA5CC4BB20AE283EF0E3083DE21123B912ED8CBD210C523DBEA6EABDE321A73B129EAA3C132C21BD3ECB1ABCA65E6C3CE271BE3C6728713D986F65BBE93BD6BCA11CDB3C13CD1C3B969BC03C40510C3EFC092EBE0FE81FBC52C65A3D70EC003D4C6B66B87B7F703DCEACB03CF2D370B6D588DBBB22BE233D2798363D0DF4303DC9F080BD76F11A3D0F22BFBD3E90A13B44934EBA87A905BD5A048CBCFA56D2BDE33F02BD2748A33CE1E25EBD1FE19CBC7CE165BC772B093DE284013A2A853CBC790598BA812BC4BC799D6DBC5CFC8ABD876FFABDA62782BCA4E78A3C09501C3D907943BB9F4704BDC0C524B
EE1F20EBAD911283D483DB63CF00E91BC99D023BB839F063DF6162BBA23F777BD52A3E4BD55F780BCA811483B7C56903CE2B6153E1B38963B9F94233C8E40143D384B263C0B5337BD2AD324BE2D9022BD39D85D3B39ABE83CAF9D7EB9E59397390C4DBC3A3AA8B73B92F2523C8C896ABDDB9DDC3BBC817F3C54D011BCE87984BB1F3D903D8EFB263DBA761CBBB46EF0BA71DE32BD765DBD3A83FFA3BD593B5DBD9EF01D3CD3E9F0BC28097C3CD914C3BD9A6738BA879480BD5E2EDFBC066D163C37E33C3D706D8B3CAC22CFBC8EED24BE7C46F93DC11D1D3D5418003934467D3CD2B9D1BC568931BE859FD8BDCC2DFA3ACE3F5ABB8DF392BD4D53073D92729BBB63FCC2BD788B45BDC17FD73B81A7A83CBD18D53DF1EEDA3CF6AA32BC2060323C543687BCB4C2313B8EE601BD7A6C4DBE028666BCB6117C3D3787F43B4165A6BD830A7DBCEF70103CE0FEA1BCD39F1B3CFD4DBA3CC21E14BC51A9B63CC69D5BBD0A79A73C3936C6BCEBAC743D7896563C5F123BBA083A433ACC80FCBB8CB098BC9F74123DE3F58ABA4910DEBC24C933BC2CCB34BE3948A7BC921DF93C505294BC6C9D20BD5D7183BC1AAE09BD12C7D33C68BADE3C360B0D3C1DDED1BC57D5A1BBF36E94BC407624BDF17E763C2D8AE93C6628B33C43CB28BEE60A6ABD4A8B9CBD90D8CE3CDDB14DBC26766EB6312CC1BB5D39EABA2E0C0FBC7913543E080B6F3A4E5A133D3BC580BD51E4A2BC3C5D3E3D274D1A3DBCC5A6BD5497663D995129BC8DD7473B1E3F83BDDE62C83B61FF7FBC1BAE71BD1DE979B326378F3C1A32ADBC7AA60E3C2AFE8CBDF3FD3EBDFB5E81BC3BD3353D492462BD8406BF3C7E7354BDCE37A3BB84D550BC01645C3C5A3CBE3CCA0CCDBD0EFA8EBC1797873CFC70D8BBDF2E683CA4F0703C37B0393C97226A3D63BD2EBC27F7573D1AB94D3A469A6B3C3A5B8F3C908F9CBBE6E83A3D7E6008BCFB764A3C646E00BB02FEB0BB0DF0713D71B97BBB00E1193DBFBB983C9B096ABD5C2E2F3CE4E4B2BC1C432BBC4942C2BB502F96BD821695BD34D50638538CA43DE951A6BDC98F90BCD758EE3A07A38DBC681EB1BCBE9C62BEB90601BDBBAC3BBE1713053B4B4BFFBB6847103E410D80BC14208937F64783BD52FB3DBED0BF043E0ADE5A3C866EC6BAD152E33C7F8F413D11FD5A3CD92996BD2A8B50BD6FDF8B3C5976E63C22D2D7BCE67382BDF6A1E4BD3D79DEBDCF486D3D0F2FD83C2A41A13CA6BBD13C0BBB223A57D9473D9352DF3B0AFFC337AC7FA53C82E62E37D1E09FBD13547ABC9446453D2DE4443C5544103CF7BE7D3BBBEBB7BC39389CBC1376093EF5BF9CBC94A75C3CF279433BED94513D9189FC3CA307243C859FB53C572E953D7652723C6502A7BC2EE3493CC204E2BCB500CCBD57E0723
DA34771BC7CE473BCF77115BCD19B9FBC8372B43B486A8B3D64D2DEB912F8A8BB056D55BB84C7E2BC2A8EE1BC7498A93C77C05E3DE3ED153E54469D3C579707BC9EF4803C5791C3BCD2EE14BC5774113C72F282BA88E8CBBCB71BD8BC6CABC63C9B01843DCD3152BCAA83D6BB10FCF1BEA0549BBA869CBE3D5C46153DFB319D3C2EC01C3D68FD09BC6B7E6A3D4E5384BDB569973CC700BFBC74DE293CC70927BC327FFE3CA4204BBCF019ADBCC76C72BC1914273DCCFF9E3C91ADD5BC4B61C63CBB482C3CD92686BD5B1277B90323133D9166EABCA40B813CC06F7ABBE647DABAF3F980B93FCD68BC7A1A13BB352CA4BC1186CABCB54D1CBD2670993CF043003D47658CBB579284BD75FDD13A218509BEA0D0483D9A60E03C89EE6E3AFD8F373DE98E99BA3222E0BC3993A33D4710A43D1B0F11BBD0C0353DEF884EBC2A70373E430BB13BB46607BD10BB933C6864E2BB0A5AA43CD746283DA3C0C8BCDA82573D524021BC793BC53CD7DD46BDA33F6A3DAD1CDDBC88ABE3B8BEB138BD0410E03C3B7D7C3C2AF80D3C2D26573A2203F639BB1E27BDA890023D4D8E363D5407C73CC7B364BC895B1C3D75DA733CD3B080BC54B6EC3CDE761FBDCB0E33BB18FF893D06670E3DF254F6BC740B803D42B390BAE562FA3CB95F1F3E09ED003D8F1805BD64DE0E3B301734BFC82B87BD2B6949BD86D834BD72AC12BD76737CBC090C573CB88B6CBC58328B3C7212AE3C38ED143DBC60C1BAE649F43B040E03BE86A4A13C0CB6053D7EDB123D0A2A203C1F7BDD3D179518BCF534CBBDF045A33D2A01DC3DF0604EBE6157A5BC73F369BC8F0BF5B36B23BEBD8C9F033B9FF83B3C485B173DC6EBA33A953FC9BDDB229E3CEC4CDE3C98A1C03C26692D3C87CF88BAA26BDDBD8AB5523C3310A83BBAFB14BC333E58BC258804BCE4DFE2BD004DAF3D39D75DBC7949B5BC8549853DAB83B73B46C39B3CA1BC0DBDC7FF68BDADFEBE3CAE1A093DDAF0DBBC909C81BBC31208BE77D2643E3674A1BC0706393B6FAB90BDCAA77D3CE10B99BAE1F37ABC1761BEBC6DAB0FBD7561143DC65DFDBD8E81423C86A0573CAAA2293E35E406BDD80D5E3CA2EA85BDC3B1973C9545F33C198072BCAFD5253CE6BD5E3DB4E88B3CD96A543C53B0D43A2ABA193D3BD8553D9ABFE2BA07DB65BC24B2163D019FAFBC714D36BCDB7BD5BB702C98BC6C3F05BCE4DC743C7CE04BBCA3698F3CB151123DDA0DB0BC08CE903CA5B9CEBDB165B63A573DA8BC6535593C1E70B93DB83304BC9CA3CB3C45BA8BBA8D3FE03A0CC589BB2D480DB7F68B0FBDCB77503B8DC3E7BC311046BCAA6AA4BCD251B33C8D1A5B3C629E963BD43DC5BC013A8BBB4BFF303DFA6B67BC5EBECC3BA832F9BCC1E18F3CFA25B6BC7E39AC3D2CBD9B3C3A31883
B0C364BBD2A93CF3BA2D301BDA72C43BDA4BED2BC9223B53C11ACAB3A8C80AABBF73D11BD03FFA6BEE2A351BC78319C3C7BBD6B3B37F7B63C6D26C6BA2320D4BDB44FE5BE3EA55BBCA56FDB3CB4F79C3B1F4E7EBD89AC78BBB3C698BB64A0953C9433123CD29492BDE84CC13B4CF6B73A0951BF3BC436EEBC4BFAE23CFB277B3C497AFDBB2D90C23D3A14BB3CFF1E13BD50DC0B3D8B75CDBC9939203C804913BD7B2F6D3C5C60373DDDAFD33A70CFFEBAD85CBDBB71AF5A3B94353BBC7FFF53BC2F09BB3CDDFA773C6A33163DAB0931BDCF1A553B56E42F3DC71E12BDF4D67BBCF17533BA48161D3DF1714DBB29D481BB92B2943C37C90ABDA155D5BB87766EBCE7AB023C8C8E633CE11E87BC318B933CAD53C8BBCB894A3D9AB06A3D6524273DB5C00ABB0119A2BD6F3A74BC0FCE4DBD19F51E3D73820A3C5ABBAA3C6490B83BB0B3C03C11326D3DA4C9A23C2AFF50BCE246923D9A1363BD63390ABDE76150BCB67806BDCF8E3CBC10DAD03BD3FA963B48F026BDAFE73E3CEBEC5B3CF0E2843C3870EBBB58E76CBBB4BCB4BBC2E8B43DF3FB94BD4071F83EC6BD9CBDF1D434BCF961873B17FF4CBAEA3EA03B65BCCEBCD844E53C4DB493BA0F01993CBD34323C44371F3CAB9908BC3972173C14C5B13C0ACDCF39F7339ABD4D9D1BBDBA1738BC6DD609BD557E9D3982CDF13C329E9DBD39E9883DF258C03B204E13BD74B209BD01DCE73C79AD86BD37296FBABC3C39BD310917BA9C79D83BBD5480BDC7E229BD21AAA03DB327923C1C755CBD6EF8EE3BE9A260BD1A8A14BDB6BBA83CD4BE073D8105C43C1F42D8BBCE12DF3CA5AB0CBDE906F63BDA854F3D682129BD641A59BCA395953BA3FFDC32D0B2243C8208F5BA32EB51BDE7193D3DA343303D7618AC3B287FA93CC30FA4BD2B90693B376E94BBAEB949BBF4B169BDAA17A43CBA870EBC1C19E9BC94CDD73C5C6E7CBDB0E53F3DA38624BC55B8ACBCDEDB693D50CF01BC53A6C03CD94A58BDFAAF82BB2DB6943CB87F203D12B2D9BCD3E5B8BB790DCB3C33E7783CEDEFACBD52290E3DA82FC4BC32925D3D33C941BC0A8BFDBCF040183B6704B4BBC9228C3D325909BC3F09E6BCB3CA2BBBD69F123CBB30093DEC039BBCF680F13C7BB2943BC244BD3D10E344BBFC795B3D3F6104BB5D2E733C65D76A3B67978DBC87549E39AF114A3C7E8E633B467964B9B5C5353BB5F6A93C3207B13CC1CED13C2D77293ED25837BC4088DC3BA37DB53C9743353C7941FF3B6481153D7E69043C42333B3D54A83BBD837CA83D542BD7BBA46930BDDB733BBCCAE6A6BCBE36ADBD7853DEBA19BA0B3C0618D53A719CF7B634F83FBBFE9A8BBC53C394BC7F87DD3C5236533CAF86FB3AF93EB73C2AF01F3BFDA25C3C2050593D7DB2183DF6731C3C007C043
D5FBF69BD48120BBD99F32ABDE5087E3D8D97843C72685F3CBF87A83CB03762BBA26E9A3CFBB4903B36EEC03CB6683C3D86CDEBBA124EBBBCFA7905B7DAB178BDA9DC223D07E0C33C95E6D6BA0597A03CEE3D093DC6A7973DD51AA6BC2EA6CE3C655CD3BB0169273C61575F3DA3EE1C3B1FBB3B3C74FC86BDE04392BCEE21BABD9E9B213CD9190ABCB9484BBD4B4C303C2BD6553BC96F87BC05BF00BC47D28ABBCB12FFBB4AA2A4BD9B0687BCDF8890BC52C599BC37471DBD66EBC0BCAAFC7A3C3BF121BB6FEC363C33BE3D3D5E48813C3AA465BE14950BBD0351F93C286A333BD07C023E5984D1BC806BA43BE216523D6653DC3C4F710CBDDBC313BAB60D11BCD71324BCB022BF3D6C59E93D78F341BDD184BABCC2679E3C0F7276BA2CAB763BC2C6203C6793683CBDE741BCC7DD27BDE2CE1ABDB62522BD9CA8AD3B1403D33D1752343D440CC5BBC205FE3DFD1B963DF70D46BD752695BCCDC90E3E97613E3D80DE033E26452F3B236DB8BCD9BA5B3C020C823B02BEFE3C7461343DB5B49A3CC1296ABC37625BBCCF64E7BBC95C46BD885E7B3DC5162A3DD685953DB371ADBC2D2ACB375346B53CDBB0563B0E2B76BBAEF9533D634886BD07C0B2BBE4160DBA915785BB150C1EBD1122143DA8E437BD2A92CFBB8296A13CB0066E3C128455BCAE939F3C32238ABC53E7953DCCBA41BD73DD173DE4831CBD8A016ABBE0DCAEBD1ABCA2BB8B26383C50F2083DE5DD9D3B17FFE6BCFA52F03CB790AFBCBC0893BD6A98F0BD913754BE73284E3D6C384E3D96DA6E38FC6D64BC0A6BDABC06B66C3C109C953DE234F13B477B84BBB1B24DBEA644B2BC8BE5323DC1D31C3C27AF0CBD9210D7BD153A0EBCBF3D06BD7968623C1FF9483C82A8D6B95D07D73C4EF397320F99573D60DBE3BCC5BCA2BC3163D33CE0BE963C152B133D455F2EBDC0601C3B9A05923B3FF353BD99D509BB77E8F53B558CC03B6E2687BC1E4E54BEDB739B3CD43A003DEBFCB5BC72DD903DE9BCED3C7404D539A435513E7F7149BC074D5A3D7AFE68BCA707543C28ED1D3C4AEE97BD7E0787BC136F4EBC8E3A983CA68AD83B8EB767BD5B07133E9129923DF96DF3BCFDEC013D3C14E5BC54B20D3DA32F9A3D3C98E9BB967E463C507D62BDEAEDE3BDC68C2CBD2BC8B9BD202E3FBDB0D532BB19FAC33C5BDAB7BC2A67C6B911D2D43D3CC4BB3AB317EE3DA87640BC15F9043C627D083E995EDB3D4CED9BBAF5B536BD4BF1183DC35A893D1F058A3C51FDAEBC156169BDB1E38F3C47794DBD682682BC9C13733C5BC1B9BC381D293C3616AE3C2565D73C8277B2BD8398EEBDB448E4BDBA6EF53B70AE173C93E7BA3C0E4822BB462A47BEE7D02B3C0E9D2837D177533BEB7C333C9B6024BD0C7FBABDC396F2BC26575E3B2DDD693
C6321373CDFCCA93A827E7CBD055E1A3E4D7A063D28D99D3CEF7E3C3C17EF8ABD981BAABB835C113D2E3407BD3948363DD19A2B3CE07491BD3D5804BE7E4A41BE4BE2343C47F581BDD752ADBC27151A3D239ACEBC150EBDBB8FBF783D44F839BE05CD65BA45829B3C19E3273C55ED133CBC8477BDFBF98839BC556ABD71629C3D49E384BD095DC9BD506D663D2A925CBD4755A7BD20CEBD3C429185BCE3875F3D61CD9EBC8C4B3EBCC99A23BED8D0BBBC75DFAEBCB48F843BF02BB6BBC17F833CA1F95CBB5DF69D3CCCF502BCF7A920BCF66E573D2878403BC63A9A3BD269243C663DB23B197522BC201B123D2DD7043D14F439BD0D414A3CFDF9B5BC4C97DA3DB71558BB8512003E7CFA963DC08E5A3C2E18173A975A9D3C904772BC5CE5823AD202663BBFAB1C3D9E9F27BBDE3173BC189C03BD409F91BBE6ADA53C5FD787BDF84EDA3C88B7DFBCD958AB3BAF2566BD44CDD6BB98782F3E8F677B3EA5C4DCBCB923E0BC04AD9ABD1400653C679E66BB3430603D15FC1FBCEC610B3D4FF965BD952458BB0279B43DD4B2C33C2B9E3B3D2203793AD6CA3C3C823D243CB18C793C141F2C3D0C3881BA8917C9BBAC11A13DFB44BD3DDC6E94BB57296B3E525C56BC33F1D5BCADC23B3BBA86E3BA11B6933C5BA372BB68A8433AD69B18BFF51AF83C92A5FABC6BDF8FBAE3C9FB3C3C8636BD26C2813C1771C0BC496DAE3C103BD23D6F1399BCAB0A093D457B2ABE563AC6BC6067403CA7790E3D9E818DBB0E6913BDA3DBEABC6DDC293CF3A798B94F009B3CCD0F3ABD14CF583C3812373C2ECF44BC3D1531BDD6FE2BBD2F2EBABC9A737EBA326532BC063CCBBC96294D3DC1D89C3CDB0A5E3D66F8683D7F25E7BC92D7063D604DB5BB9CCB18BE51A0043C9658713D341E4239D614213EA366C93C0A222FBD0FE0FF3B81F1D0B3ECEEB33CB9B1ADBCE07A103D7DB95A3C47E087BC93C4E83C133A86BB5DF0953CF807F13B62A6313D856E833BAE2BD5BD24C3043C5F76B43CD108033CC25B6FBD41C0F4BEBD53953DDD0F953DBF569B3B8E84693CD11CAFBCD29BCD3CB03EAA3CDA9C003E7B37003D627CE93C0FF3613D7AE9C53B02550CBD397244BEAD91613CF42626BF8732BBBC3B9CB53DE10FBEBD4999B23C0CB901BAF2B7C1BCA4ECCBBD3B83433D5E86823D2DEFF43CA143E03AF141D6BDD4020C3C1CC605BC4F77083C3949DD3C24B7263DE6E6D43A27308DBD1AFDC0BC4E1044BDD45E9A3C1CAECF3BE7A2C63D0B8255BDC9D2FDBACBC8EDBAF996ACBB37A116BDA6E3DF3C00E58DBCEA196C3CDCF01A3C2D403C3D79A6463CD0E172BD2D836CBD769D343D8D570CBD2E8F2FBE600CB3BC5888BE3C169BB03B0FE1BDBDBB0A39BC868ECEBDE318F5BBFDC6713D264A783C77DED23606CF20B
DBAF56F3C8202A0BBF8CE39BC0EEB28BDE7B7E83B6D9F263B59E4FE3C5F0CC63BAA8593BC3202B83C586D633D69BDBEBD47BE1BBEC31D03BDD965B4BE92A13C3C035191BDC55EC13B6EFF923B1394E1BC986F0A3CCBBF423D73C4423DCCFCB9BC3262B5BC04C3673C104023BB9211ABBD55AD20BE4C07AB3C1606F5BA48BE0BBD5DD4DA3C3B2980BE5E8FBE3CF06BCA3B2D1B67BD9F48FEB7417D90BCD718B1BC6B26433CED8A7E3CF5BC823A86FE3E3A2553963CB0A8313DD753103D5FA7BA3ACCEAFD3A8E0026BC8D440E3C08F63DBD5291CDBB86734BBC60E3F1BC952989BCD075DFBD197684BE47CCBD3CA173C43CD96F92BBE54AD43C2AED533C8199F8BCC127D3BAEA40AFBD249D97BE1B679DBCCC2CD83B0010B83DCBF0813C127499BE16897CBB5654A83D4D061CBA49DC93BD3DADAA3C19E0F9BCD4CF3A3DFBCFFC3D87FBACBB9F39EBBD1F4E30BDFEE44CBD1CF0DC3C0ACB3E3C1B5304BD5809AD3D9FCB193C9012053C6AC0173BB1D4B23DCABDAB3C135FA63B3C3FDDBC28B3A6BDFD6DB23D2B0C31BDB659E03C692DCAB9E9365F3C68F7073D761906BED249E43CED50183E5496493D5EC938BCE715A4BC6BE2B63B55BFB23D1010EFBC2D1223BEC536FB3B09E5D8BC00A3DBBC735D4B3DBF307F3D5E93C3BE1191C13C998FFDBBC06141BD7EE00BBDE35F0A3D0827F6B83E8B8F3C285369BDED4179BD585A253C1D13583E34B772BD716263BD070606BEA1F5213D5ED52F3DB563B53D4BF5DB3C77AC27BE349E1DBD6018F83B2A9CECBB05A1C73D9A08DF3C383E55BD05B230BCC69B0B3D8950343D61B82E3CEAB0C03DF05D633C4BC8F53C893A72BBD39A0E3DB07409BD8CD68CBC9DD70F3CF1F942BCAA9A453DB9D403BD0930CC38C328D4BCB953283C1417AF3CF14F5BBC96B8D3BCF358FFBB09D75FBC5A71133CEE6322BDB96831BC9B0B37BD8B6928BC7FD4F43355734A3DC3693A3DD9A910BD5CF2AF3C1A0E523D04F7BE3D3FA3CA3C2EF95CBDC67A243D69482F3C24ABE339E335233D7200343BDDF67C3C9A939D3DCB2F153E47D8593CE156DA3C73860FBE0E7BCF3C426A04BC0BA3D8BC3D19DABC072A503D422D03BE3202CC3A2BD5813DD41221BEB4121F3C6342E33D8842D13C30EE313CF2F87A3BCBE7313D7678113D3A5995BD1D0BBA3C773A81BD47FF3E3DDD4568BDF9D3BEBC07CF7D3C447CE0BDF8C98EBC55C5843D69136DBDFA8DD83B53D4823B86EF833C00AB753B23AE3E3D89CE7FBC4C83D73B3223CCBC4B97533B8DCAD83B2EF7D33D9F2FABBCC677513A3B5371BD27EAA4BDB800213DAFA741BEBE4E803BAA1C9EBD44BB073D4F8B07BEF347833D6F8110BE1018AA3C1048D3BD9F445B3E2E36A3BD9415CE3DBB4F81BD1A76103CEFEFACBC72D2EFB
CEAB38BBC436B91BBF8C2283CA11C72BCE31501370CDE503C845559BADE86563CDA1ED1BD8BDA093D5F2CC63C1BD6823DC79FD63B10FA343DACD7853CF06F133E9DF2CE3DB44BBE3C8639B1BDAD485DBDF6327FBE6A24423DB33B53BB40A7583EB58E17BD83A613BDC0004ABB986D50BC16591A3D28BEABBB73A7433D9226F13C68373EBAD105413C50D7063EE01C7EBB1DF12DBBF3DD363CF19C693C97D304BD3CDB253B433D71BC6B98883C4921E73DDE0E12BD2D43813CD3DD2B3CB230C4BD59F33B3D594B8C3B3365DFBC37C46CBD9DD0B3BBFBF0BF3C3EC04F3CCCA7D63B6BAB06BDCEADD5BD7458F73CAC9207BB40AC443D79EC2C3D6CC6E7BD7D66A7BD57508D3BE36611BE7EBBDE3CB065EC3C4E512BBE8BD1CDBDB03BA13DBF72243DB18AC5BC8567163C0F4689BD0ADE4EBEE30672BD036C5A3E8537253DEAB9113EC56C803B71D5CBBDB43DB2BDEFD733BEB8E7143ECF8AB43D17BEBC3C883D0F3EE185E83CE1E5E33CD4AE8DBEB00BF4BDA308C1BDFCBB2A3E954F44BD71778D3DFB2D2C3C952DCDBDD6CE13BEE68F353DD97FC3BDF51D523E11DF843C82758CBD8665B2BD3F6312BE7BB236BDF295FBBD13ECBDBECB45FFBD105AEB3DD0D6823E60B4E3B91398323DB225083DB5A208BE1A6E0D3D3576F83D50567C3D8F068D3DCB52B5BDC432ACBD34A524BEDBAD68BE866D453E3AA99ABDFB39B8BD28CE513DB1894EBD43C6573B78962A3EFE079F3D3338EDBD96E8673EBEFF673E489BFE3CB3F8893D0E0AF83D23E3693D4F15643E68E9943D9367B2BDE427D1BD8A85E13C41302ABEC22B8F3EDAF68D3B36E3633DDAF4763D81BBC53DAF909D3D5E3D43BE8872BEBDACB7133EB9E6933D4C53F4BD22AE213D6CEFB6BD1D1D853D18CCA43D4C1D643EFFEA29BE003341BE09B1203E7708DF3D287D5D3E371E66BD50185DBE859BA7BC7ABF92BD3B6EE4BD7D8F223EBA4CE63CD2963EBED1FEC3BD836B853C2912A63D82265DB46CC0213E238B8DBDF79FC83D212866BD088E27BEE2CE293EDCB490BE2AB607BE7A8CA9BD912FC4BDC14AC53C4743AABDE08387BC37770EBD68A5C2BC8A182A3ECDBE45BE6D76643EACF6AE3DAB7475BD062D78BC5616AFBD99DB0B3EC0537A3E15EF7E3D26F179BDF76763BE7648E83DCDD24DBCB4AF88BDBA2BCABD2828C63C8BDC113EE19798BD38C03FBE613518BE774B62BD14A067BC172464BEF2E45FBE87A6743D6CCD5F3D8712393ECDC6423E429036BECB84563EB5FC2A3D2ECE8A3E237DC1BD2CF4383E3EF5233DBEFA873EF287A13D236C34BEB518D53CB09EC63BD995B2BC08D2E23D16D6BBBB530BC4BDF9FAC6BDEABD653EB21D14BEF1611C3E42FB81BD7501553D11DD6E3EEB71C8BD5C46733D445D103EE9F3173CBAD275B
EB185E9BD2D708CBD1E720A3C4098E4BD5212C4BDBFAA2D3DC78712BE6829D53B17A341BEB837E63D4B0119384CDE523D648795BD6CFBACBDA08982BDF8CCBCBD0217D1BC1C22803C5CEA8E3CE3DE4F3D12B01CBD431C1FBE0A9320BEDD1E25BEEE03E93D2B59A03DEF93003E1C8689BE2239CC3B31764F3E91D0B6BD0B888E3CA431ACBD6041713D4F51B0BD2131ADBD31AD82BCA1C15D3E0109133CFFECD43DD87910BE0B4926BE2B5A143C5960C13CECFEF03CBE4B263EF1A2953DC9C55B3E3A7EB0BDAF3528BE9952683EC49001BEDA84A53B53E6053E5962DDBCA25DCFBDF645BDBDB9A767BD99577E3DD835263D021A1A3EC2F0313C36704A3DD7966D3DE36E39BC3A6EE93C558B1E3C536AFCBBCFAC9FBDEE90043C17F6ECBBFD4BE43C71AB1E3BB376D23B51A078BDDE6AD5BC80D4AE3CF891EFBCD3EFA5BB010A8D3C116615BD39328A3D6B99643C0FE8FD3D4325423CC89FB5BD2FD7CA36384D173D6786043C150F8A3C533FD93D52CD0B3EDED3D5BD7293063B86ED393C0B1E023DFAD3CD3D1E80573C85BE6CBB4A4E9A3B9A6E343E07FD2C3CEE38063CF277EFBD1058F53B5FC4D53C8041053DF9B5C4BBE8A7CEBD5D2D30BCA3ADC13B517F163CF331013E5D1CB4BD7054DDBDE8B54D3D88E643BE4BC2553BA2E2C7BCFEEF21BD53789ABBED1C9E3DDAC4C3BC6946983D724D333C5BB9483DDFBBFBBC58BA3FBDA6609F3C76D2373DBC41D13CE137A03B275642BD6E534BBD5608223C6AA206BA7DF3A3BC2FEABFBD6F06703D0C3FA0BD319476BC997AEE3B6494EBBB596310BE2CCA393CA7226DBB85F7A9BC79C8103DBE3B183D8F30673B418C893C7265AD3D1F1088BCF64E613C138B3ABD58DD3FBDD12A783D207F38BC2B3A38BD16029EBD219747BDACD089BC3B74FE3C7C6E053D1D5F893DBD247EB9BEF244BCC276AB3DEDFB17BDAD62C2BCE516F83BBFCB99BA50BE103B875F8B3CFE1D1D3920F7313E46A8AFBC901DF0BB049AAD3D928E73BDBF4D14BC0D8D433D19EC433BFA83CF33B86AB3BE44D28F3C3CF555BDDAE58EBA6CC08A3C07AE86BEA720333DB70F043DF9D85C3C927149BDD4AC343B954E60BD1677B3BC8F0BA4B909B36FBC05CC27BCEA6BB43B1C66CF3A7EBED3BDFA860E3DF7FD603B7E7918BEFA4D97BC6CF4C33C2E7DF8BCAA161F3D69E1AFBE715DB4BB74202CBD5906343DABE8503DF9F803BD6DE5503DF579EA3BA8902ABE4C3D033DC5C5203CA36693BD3F05B4BCF859B5BDCCBBD73BC1DC4EBD34690C3DFA5B873DF30AE73D3E29D93D693C09BDAED3643E684F4C3CD6A51E3957E58FBD1727173DD1354F3D896F523DD928A9BD3C46B0BBD556993DC95232BC300C6B3BD134793D1D2E2B3E2EC399BDC02C6DBE1331753C13F4013D925BBEB
B3EB3F7BD630FFB3D7393473D77D3013D5109D23C8B60F3BD3CBE8A3D3D6D89BCB0F3893D53E08BBC068363BD0143903D97E393BC2BB05E3A36B85B3CEAB879BD725D4D3707C2CE3B67AA813DF33B6CBA063633BD406723BDB9D7F33BF424813D8CEC813CDE23053D544222BE4DA7AFBD163884BCA305F93C79B8E0BCCDDA56BCE934463D26730D3D710BA53B963E243E611F02BC03D02BBDA45041BDA8B70B3C6A4F383ED3F8913C609F353DA2EE3CBD292AA53CF89DDCBCF7C81E3C83B8943C684F71BB1EE944BCD2A339BBD17CBABC033EBA3BC16D22BDA07500BD000EF0BC9A0A493D7908C3BD6252F9BB7D474CBCA2CC0C3D9C42843D1CC2C53CD88A9CBC012443BDCAFC093C41A9AD3C7E46CBBD349541BCB720C9BD6D1501BB6D3FEDBBB0F6263CFEDB5A3C68ED153C31D161BC33761FBD958F363DB04510BC732EA5BBB8B3A2BC1F75DBBC1D9CE53BE27D803AF8CE09BDB182943BF004E6BB83D9A6BE7DEED0BBACC830BDF43FA83CD08B483D9C47243A672345BDFEBBB038CB0F943DCB324BBC31D4EF3C67AF613BDF2EC53CD1B558BD3E79333C525768BC075B5EBD28B7AD3C2D5AAEBD33151F3B79EA54BDD1548EB9F461A43D031E51BE8FFE10BB923D833ACEAEA7BD3BA6913A86B1913C3551BD3DB4B28E3B91D2093B778B70BDF533B6B7E1CF51BDEA9BF3BB8D4621BC1D86D23A6320E23A24CF6F3CB911D13B4228D83BD39F16BF2FE5ADBC193B8BBD3B41D7BD08F3C03CA76759BE10CE36BDF79012BDB83685BC7782A3BC3BFACEBC6C92E23C753BCD3AC4EC56BC1C6E353D33E1FABCBAC85FBC2435F8BCE6B88ABCFAFDDE3CA9AE70BD5C06433CC21124BF08422B3D4560023CA99514BE9BD180BBE939E3BB4315313D334492BC81D3233D0E19193D6C5AB83B780227BB475AC8BCD1B390BCB669293D727E27BCE69AB5BCCBCC74BCCDC141BBFB4682BABBDC473D4EBCDE3D51C5343CD1426FBD38431E3D12C587BE3C877ABD65811EBDEDE34C3DDF080CBC1610F0BD83DDD43B45D39FBDAA699FBCC4DAF83D62BBDA3CE36803BB7AB423BC1744BB32BF4D77BCEAA81BBC5F3C913D4134853C36FF85BC0BA1AD3C082D27BC0616A63C3A571C3D9AC87C3D28CC303C1544733C7F920FBB2AB1843C093DCD3DF28D3EBCC03B5F3EBF79F3BE5B397F3DF662653C232C783C4B83DEBB59BB1F3DD5718FBC9753B9BD0451AABDAEC5FC3B5D19813AAEF2383C14D5183CE91547BE1C6E9BBD50CE593D7AE5BA3C4ABEF43D688391BD74C4063CA820F8BC0C5D01BDC4E4C0BDE6CF403D98C7333D12DAA13CEBB98A3CED39E93DE583E83CA3F9B6BCF1ED763B13449D3CF32A663CF6EC67BCEB577C3D1908593DCF38C0BA2C9EEEBCD49E21BB97E9B23D0A3F57BD8E39CEB9BF02633
C77C1423C7ECAF1BC762B063C946FB43BC2C1CCBC6F91AF3AD08374BC2E3E4FBC480173BB7BD3D93D0862973C4275343D25FAD03DBDA0D4BC9554AF3BFD1E21BDCF670D3EFA61F7BC1C018ABD04AFEC3A1E608F3DC52F9ABCBFEA15373A0334BDCCFF5BBB45BEE13C6BF961BC67438FBC6CA0E93C131D2EBBDEB7EEBB9BEEC3BCE2E841BC273D86BCB7F65D3D3F688E3CF69FB53D0AD6E33C3395813E48B5A9BC7C5AA33D709D283DE165BD3C6AD146BBDBA82738ED336D3DB0C6143CCFEA18BDE637EC3B0B58BC3C2F93D13CE8F616BD5E055E3E9DCE90BD3ECDD5BAF102E73C00D5CC3CA60A583D926F6E3DE4A6343C021EF23CB65655BC97DD65BC214309BC3F0E51BCB4BF373CECD3A2BC872BD9BC7B46DA3CBE55B23C655A6B3D66DEF43C1C7B7EBB0186B8BCC05BA4BC3CBFA0BCC85E60BB4E34CCBDC2D1393C3F50DB3CE713F43B7678793C2B4BDFBB2461A9BBBE6BE33B120293BC18B75ABD2A1B14BCBD4FCC3D81CC58BC5A1C2FBD5F5D49BB723E3D3DA7242BBC897094BBA70365BD01F1D9BCCA5B77BDAA766EBA3A8919BC0520A13B197A1EBC3374C7BE341923BD565B0E3E28F396BBB9722B3DCDD03A3C7E3079BDA9BA05B93B06903C99CEC9BD69491ABC7AB20EBCDDB252BC720CB13DB1033BBBD27DE6BBFFE4DC3D3E76063EDB41B9BC9617C8BC525C78BDF1F590BC0BDD0EBE158794BCEAE6173E05225C3C45E7D13C2539753D90D349BD32813FBB081FA0BC62CDB6BC5FB8013D756D5F3CDBAB9C3DD30C80BD785B263DCD5B3E3D3B3704BC33746C3C611C9BBB0160E83CFCA4CDBCF79B1BBCF65DABBC831AD4B9A48328BCD200153D522F1EBB074280BEE433D8BB7A860C3DD1B8AABC7B052F3BD3144F3C3F0EE1BB43F4BCBCFA377D3B4152583CCEC458BDBC172C3DC5A7AE3DA09B58BD0D9CAB3C44DE49BCA9901CBD731701BE5CA7A3B913BE343CB3B3DDBDACD753BD66F6E0BD88D93F3BDBB71F3D26D301BC0D48D63C53AE95BCA7F940BE2C1F8ABDFE4FD83C04CE1B3DC227A03D616A42BD2614A23D2EAF9A3C8C36E83D3219EFBDB4163B3C4C34C4BDB1D54EBD9DF98BBC785C3BBD5AA74E3D8DB10DB4B55C84BDB6951DBDD6DE293D7206313C1B0F8BBD28F361BE2CD6EA3C0327993D6500EA3B2A1D09BDA5917E3A51DA6ABC09A65BBD580A373CF89A0DBEC705B93DEBCFADBC079E1CBCDA3445BCF44582BD0BB58FBBE7E733BB56A1B83C4B54B4BDEA2C2A3DBC67CDBB40AEA63D839C12BD5EE8A83CAAB359BD2836B23C0C62B3BC452BDFBB83BB0D3EB6B9153D6391B2BDBFD993BD18B6DA37B755DBBC046671BB0E3DB8398AF1D73C8641CB3CB371B23D8FC639B973398EBD1CCCDFBDA80414BCD1B00FBDC3D6A23CED72DC3CAEF33A3E0902BEBC3E24A93
DA7ADB6B90F3F45398271C03DA1D38F3D1AC5FA3AEB1692BBFEAA453C2CB6B8BD97D4DC3C0DF0513CE4B2D2BC2C17193DCEC7FFBB9E0535BC5FF82D3C0D7B09BDF448BD3B0A292E3D8E49873BF8B4693C66DFE7BDC73F27BE42EECDBB78500B3D5C9F4EBCF716723BAFCF2BBE0A3088BC6602A23626A2AB391762FCBB8D901ABC2EABB03C7E8D07BDDC65B2B9741C17BEA276213B89ACE83B7C8208BEF002B9BD54C7823D54CF44BBC62FD73CA212E6BBCAD93B3D3D739B3C8461843C2CC783BD907C313D0896BF3CD0468DBDC46093BDF361C43C81621CBCB9DBBD3C7418D43C4C68B83D689A1BBC216146BCD5BC84BD265CC4BA26C0B33CE78D233D397F163B6EF38D3CDCE7AB3BBF33BA3DDF26413C266810BE82FC3EBDC8CEC83BBA511B3C6C96C13D0487DFBCCE8AAA3BBA8E92BC82D11F3C69717CBD8F88EFBD309E123EB0C87ABCB7A30FBC0F66AABBF1FF0DBB25A329BCEB1A403D302073BC15E961BCA9878B3BB094953CF94B26BC8B4454BBA92F72BDE093103C4A315D3B8966E53BD84DD63C2E6A023C9C89C73BD830943C8DF9FCBC700180BCCA6E113CDFC194BD84DD21BAAB9494BAFA66CDBC7B2AC0BDF7A23CBD2EC0833C3A7A51BDE61FBB3CFD236E3DE556A1BC66DC653C46F7D8BB692CAF3AAC13903DF6E0923CDADA043D6D1D7C3C49826F3C3E0ABD3C38696C3CBF20213DA145043E2C471A3CDFC53DBDD2DCB0BD9FFDC43B1BB772BDC90BB63C0267A0BCD1FB00BD177629BD2248C2BB1E9BDA3C24C8F93BC831C3BCF55A613D0E30923B0E3C263D49A23CBDB576CBBCEFD3023DC54B1D3D9C2895BCD979123D24158E3CCF7EA03CA04478BBDD9622BCA1AE05BC7E498BB9795DD33CE412EF3C99E9033D8F8EC3BD06C9163D80A3BA3C8D0B20BD48B14D3D7D59B33D2780623D454539BD66D5913B33A7E53BF8B90DBCFE9C3DBC3F92F4BD87E6B83946DCF9BBD6A56E3D5853A7BDF9AE8ABC7E8848BC848FD1BC9130853ECD439FBCEA098F3C1A8EF4BB2F74593CCE6D783CEBC5CC3C21D7CC3B382B6B3BEAA054BD71B3FABBA61B2DBDC438153E823E903CC260173DDD616A3C6266EABC3D84873B8CBB3C3CB4F5493D5B761FBD0EB7003D70BDF2BCC76BD7BC865B59B3E9E219BD7C371C3B6DDD91BCD639203D47B4C63C04A0093DDD261CBECF58873D077A89BB69BB5FBD58E3B1BB9D124BBC53AD1EBD9E69D7BBC6C3173E63C25D3CBE33DC3C4E0A823C6A8FA13D62D41ABD5EA7DFBCBC79CC3D66EC93BC0E82D5BC52B1D7BCE2EB3E3C2EBE333CE6C1093D486EBAB8B8FBF43C1CE580BCF1B2F43C2485243D59B3A5BC8F7DF63C22B4F23CABBCE5BB2F59B43AEECC08BC2704063ECED45DBCA26B803C7BBEA5BCCEBC1BBE518E0A3CED10D5BE01B30D3D1DF311B
D2363DF3CC7D6AEBB787933BDDB5A8DBD2D13A63C5BA8D9BD59549C3B3A91A0BB7AFED03D8462AA3D26B5A539D83C843CB29A6ABD616EA1BD343456BD5E6BCB3CEF5171BCB12733BC2DECBABA473C19BC7553A03C8A75033DADF7A23B6DDCF83A82190EBB4237A8BA3982703C1E972E3D648FBEBC652E893D825E9EBC3555B33A1BDF5FBEB704DA3C8A481937F8D0F83CD24AE7BC97CE95BCB9E4CA3CAD4E063C57E446BC412C0DBD649A003C40D2C63BD6F6E13D140BAB3DD0AA773D3D07F1BBB9413F3C95A8DD3A53B74EBB2978BE3C3CDD3B3CA93D88BCFEE81BBB7FA027BCCD42B2BD45F07DBCA6E897BD02302F3DC63C7ABCA5F335BC3E05F63C9CEBD13A4D9A96BC98E2293DBE6F37BA0F6C29BD7FD1413D3D327C3C19C4E03B9F454F3DB9A437BC459AEE3C5F3CD13CAA771ABE9EE0003CF9A8953C4EEF67BC9002233CD2086BBCF2DC5DBD5904B83C09B802BDDC7B14BD4E63D53CC592333D357B59BDAC669ABC5382AABD49C00C3C4A7C0BBB6B3B3E3C597250BDD048513B355E503D335D6A3B2347DABAB1A92CBD971CA3BB116C34BDA60C3CBCED68E53BDA37143BCA3705BCDAC150BD2EBBC6BC591E413BCAA82EBD6FCB5F3D81C61DBB930C243D9246CF3BA74BA8BC029E723CFF8A55BD9137BFBBCAB4F43AD478F5B96DB5F4BC2CB9DDBCBF9C84BAEEB152BDAA28433CE93C303DD5C9B63C42D3C0BCD26A00BD97405FBDA34850BCB2622ABDD9948A3DB9DF6D3B318606BC7C26AABD1C22363DE07D53BD64D0DEBC6717E03DE61424B932A7023C42D49C3CD84775BD0B0CD4BC98F4403BA488153C920161BD92A1693B0125B53BB3E15FBB96C8F93C5877ACBCD2FDE5BBC2AE6B3B3DDF463D87BF473E76D08EBC4EFF8E3C94F1963C587D8FBA7419FA3CCFCB453BEB2BAB3CA8D0383D5EB1C13BBA9DE1BABFDD2B3CB5F4973CF722A73B200492BB0E9E7339938BCABC45366BBD44AC35BB87D253BD392B70BAA63CA83CBB76A7BBBC018B3C5B591C3C073C7A3BF0FEC6BD74F810BD323E0E3D720AA8BA0BB666BCB1F6FBBC2957D1BB49ED073CA7D31E3E57F6C03C7460AB3CF7F938BDFD7D2CBC1A54B73C7D908CBBF6DEA8BBCFA03B3D1DA118BDE9FDDABCF8388ABDAADD1B3DA343343CF61496BD757A3EBDB36DB7BC2EB98D3CDB6F1A339C7932BC8D56193DF39FDE3C1A7DDFBC55A1C9BCB277DE3C536027BDF65F54BD79F564BC1EC0C7BCA504DCBB477D01BDCD71AFBB482EA2BBC6A340BB2990083D2E828BBDE87E073D28383EBCB67C63BCFF890F3FF1AE5FBC94C7E73CEC38693DF8E5F0BC236B9E3BD30616BC7C1910BD6AD719BD8398D4BB953E293D4BA40FBD99E68C3C26C148BBD97F30BD347F38BD7B60A0BD2DD2643B2F25293CBB10E03A89860FBBF75D4CB
D605516BB9C25723D9AB251BC3DEA383D6240E23C0D40D03C8D0F913DE77F133D547BFC3CBC5BC13C0E12D4BC5A0E09BD7FBBCE3C75704E3B75CDCD3D38411E3C88538939AB513A3C5DA12BBCF0622E3D0D9799BC17E91A3D38ACAFBCA4AA33BD59BFD13B86FF88BC80784FBC55D7A839F8FFB1BCDF7FB9BC215024BD1C11103DCE97C93CBA3832BD41607DBD1A828CBCF6B4B5BB7E45B9BAD41BF6BBAA4D4F3C3B08DFB6AE65B43C7C8AF2BCC7FC4CBC5545863C1528023A0F136BBB74F302BDD46E8BBC5319A03DB01BC1BB52388CBB6556F6BB735D893D9F603E3AD2D0B5BCC432513C5AFD7FBD003C343D3B262BBB138F273C8388363B570CACBC2EF9A33C0F0C33BCDE52953C290BAEBCD20701BC5E6C6D3DC885013D8E5E2F3AE1D3BCBCA6A1CABA2090D6BCE493E93CDFE28E3CDAE79F3C7B7D6ABCBBD7E6BBD16EDBBC804F2B3D1E4940BCE64C1F3D0ED48EBB64710ABC476DAC3D120B53BD281A5A3CB0EC0C3C47B29DBC585AF73CE010C4BB6A969FBB3FB59FBE36B4E3BB3FECE4BC95E408BDB6C5753CA2AFBB3CE0A88EBC3D57793DEDACAE3D04B032BBB1569BBAE75F3CBC55D44DBC030E073CBE54EE3A2FE6CCBB9DCC43B8010F043C14076B3D814209BD89A10D3D9EB5E9BBCB6FA1BDFB772BBA9BA9DC3BEB6BA8BC04A2813CD6844D3C316496BC34CF4CBBEA26103C3674CE3C9F98CEBC130EA9BAE08707BE9A91B1BC4CC1A63C8E35CB3CF666B23B4D98B1BC00BCE2BD116A533D1176233C20C9DD3B0277583C3A6F9DBA4CAB89BCAA3FA3BB5B64F7BC11E213BD3CDDDABD339AA5BBBDF6A53D2F569D3C663462BC4D5B103D1A54EF3C2F08F83C3C4A1A3CDE7CD1BC1AFA123DAC61F93BA13E563DE6B7E8BC1196E1BC388217BDE038263CF241BA3C9480823CE7EEE3BB5C3936BC56596A3C9A19CB39E3ADD1BCB8B185BA5A48E6BB39AE893C83FF1FBC0DC6B4BBE6C7373BB766083C46B3D4BC650E32BC75B5833C4F1D34BC4429613D4A4F1DBD56CBA63B7535683C5103103C5044003A8617A53C5AAD60BC5D8A0F3A2DBD523D87CF12BD4B248D3DC65C07BDFE20423C19CB19BCC1E60CBB1BEA663DF5627ABD27782C3EBF8DA7BB3B93A43C977B2F3DB65027BE6FE48B3C306883BCB547493D9E8DD53933508F3D749F90BB3CB2F6BC154226BE57D7C53D8AD9D93DE12FC0BC577110BDD9EF4DB3FDC9CBBD190CBE3CC6E1333D3761793C393C1E3CEA98D7BD33F53C3B2044883B5DEB553CA7FAF4BC88571B3A710994BC17459A3C364DCF3B5536F3BCD5BF8DBBFCD3073D60E862BE5B75153C93D7AFBB665F813CD891113D35C40EBB4A18213C1201F1BC2E67733C817F21BD3D9FEE3B8423E3BCA27CF2BBE54E58BE7C6217BEB55C8B3B227C913C625A19BD5E0C5D3
DF79EFD380017D0BC3DCD8C3B7AAE17BBF9F0E13C260D923EC7FDCE3B942866BB8351C03D4E1B543D419CACBBE44238BD4E0EFE3BE6ED68BD1ED19C39AA5727BD724C84BA409D06BD2DFC4FBC05D196BA7E884C3D290BF53C52EA6DBB781D67BD1CCE913B5BA647BC80277BBCFE867F3DEB7784BC8FD469BDFB9B29BCD52E5CBDEB1C9F3C12C1493D3AE574BCAC5817BC51DDA8BD9A0689BC5718273AAF78B8BCEBFF5F3E1AA3863C192FDF3DECBAD9BB9B0B39BDCD870ABD85468236774C4CBCED94F1BC3B113EBCEA36673953533CBC3DD8EDBAD402773C72232B3CC70E023D5E2378BB1F370BBD067BCE3C1B5D05BE8074A6BCBA164DBB9A2F2C3CEE793BBF80FFE3BABBAB553D7C553BBC602610BDF21187BC924C9EBA3DCCF23CBD5FF33C8E7460BBD10ED3BB68817CBB654EC6BE79BF28BD0F92E93C2DCF27BA7E0246BC83DD323CBFE385BE86F0403E17A72ABC1ADD82BCCC86AE3C628CA8BC5339A23C9F548BBC244B143D6C941F3C879F983D977A253C3065833BFA3A49BB56551ABD8C7D3C3C9A8755BCA77B60BC3674CDBB70882ABC4F68993C37B099BCC3E191BADD8AB6BC882A1CBD61CE06BCD66C10BC5AB5993C72BB3FBCA65298BC0ACCA6BA84BED43BBD1AAA3B6D82B53C305316BC35471DBDFAF13B3AA630D4BB2DE07BBB230E1B3D21AB21BD9BA423B9D7489BBC82625ABCF060ACBDABD63DBDFE86B63BD6D21BBCFABFEB3C9BB513BC2C6603BDCF21A0BA8BAB193C0CE01BBC6D1B223DF79C8B3CD388B0BBDF92B83B4D6405BDE10B233D7A8112BC61FCE2BA5EA286BDE61BE23BFDDD113AD79F16BBE4B91EBC1F82723C63CF373C505DCCBC1E1B923CF6AFB3BC10A97F3C2ED51C3A0403B33C830C64BC49FD093E49D4403C2ACBA63CA33620BD2875EB3CDBB9473CDCC21BBD1B3F463B4687333D6FF956BC9C9C44BCFD148EBC5E84F8BA9E9BB93B869B8F371993D4BBD4168FBC48A2673DDD27E4BC075004BA6CA763BC8C7482BC3091163DF8337ABDD3045CBCE13640BD46521CBDCFE15A3D4FEAAA3CAFA2013BDDA25FBE397393BC1F2186BC096C14BD1C20113DF1CF693CFF22B5BC1B28F6BAC34C9BBEAF51093DA32F853DE8B4813C025F84BCA1AD0FBB371C713B3FEC9C3B1F4F9B3B7276823DFAA9AF3B493E38BC48A0923DA630C6BCEBC2893DA126F23A5F6FD03D01F585BC1AF2C13CDCCE2E3C656C163CDEB7C43B3FFAB8BB7C5A48BC07C7532EB96A59BB2D77FB3B6239EE3BA30F4FBBB02574BC2D48603CAF0F2EBE8A269D3B81A7AC3C9EC11B3D6FF54DBB588FD4B9E9E4A9BB39FAC93C60C1B13D922DB43C5616833C1A73463C48F7953D41FBAF3C5BE8973C81940FBE888ADD3BB83050BDEB6A0EBC6A03903B66C084BC30290CBD9096673AF05C663
BABD319BDA0678DBCB53E043D8B93DC3D7A93D3BC81BA7EBCBE96D7BBB3A51A3C91C919BD7D98B23DDB1F0FBCE3AC8D3CD25569BC8F7A2E3EDBC1A23BC4651A3E952B88BBE1E293BC8321C23C8C16993C88D0C53C9D1D06BD79D3633A0C13903CF8BF59BC4E72A73815AD743D6314D4BDA50744BBEA8957BC6C46CD3CA684A53DA7568D3C22F942BC006805BC44E087BC7E937F3CEF478ABCA4580CBDA3241EBB2B16773B445DB53C72B129BCABD2A93C82760B3E83B5B8BA7EEF5239808B903B0CBC813CBE58C03A7F2ED83D9C6628BBCB9D4B366DF6F33C75A34F3CF51F1D3D3FA542BD30598ABC6F3843BB6C17C1BCCA14313CFAF992BC8B8CA13C0B6A81BE2D4DF2BDBC62113CFFB05DBC50C2063C8998393C0180B53B9BE433BC45D412BC475AC6BCE56A573CAE21EA3BB65FDA3DCCD10BBCB9A5873C64CB59BCD7598ABC03168ABC934E94BBFF67F53CBF90833C783AF4B830CB98BC8F754D3DF690DB3CD0CE55BC8C38343CA555BA3CF3D7CF3C6A4D94BDF2D6A9BCF4F4EB3A9F6A063DB9A773BCDEF174BCBC6BD83C4BE0C1BC1A5C10BBB31B003AC821843CFA5B213DF0B782BB7F689BBB6A4AFDBBC86B483E5E1139BC7AA76A3AB7A1BEBD590694BCACF3FE3C0F1B58BBD3C514BC4499B3BC59DE54BD580CC73CC7F622BC345719BDCA559DBD13AEB13CFA002B3BAA8311BDDF13CD3B15A61BBEA9D83E3D330ED93C0277E6BAD307D03C42A2E5BC805C90BCF2EB37BE9B9576BCDB3BC2BC21D3A6BE260EB1BDA3E570BDD1078FBDA12714BC13D953BCD86405BEE88680BD3B98B13A419F18BCD4AC38BDACAB823C254139BCB63BC0BC185DA43C26169D3D3C6F083D86BC35BDB5EC0EBD5A51E1BD869EA23CBC516E3D7E0E073B575FEB3C827E05BDF877DE3B54EABB3B29E49BBBE80737BCDA57113CA31C103D629680BDDC85163D6E3E1BBA49FC303DEC77F43A3A7BB3BD705BDE3C8FF51A3CCA1E32BDAF741FBCCCB18B3AC04CB2B75DC29E3C1DF6A0BC05379ABCE16503BEF2BF1E3D00FA3FBCF73C383EA9F6B6BAE95434BD874C7B3D4C27063DA17E28BACE3F73BD32E5C23B62E43E3C0474AB3D0AD6D8BCAF330F3CA3F65FBDDB1C55BDF207773C635B0BBC93B5A93C76E54CBCDE9022BDAFC694BCF6F9AD3B50ED64BDC3797BBC06DB813C5AD6A53C56E832BE870AD03DAF5DC03B6BB8F83C69FA383C125FAE3B7F23623DEC96BB3CAF36B4BD9B6F283C99D6163DBD8136BD917282BDBB1D0C3BBA4EACBDAC1343BDA4F8ED334BF754BD4A4EB83C5D7D943C337DC43D0906CDBB2ACF0ABE9978AF3D66192FBDB5F759BC9717E3BCC76450BA86E08F3C3C1C9ABA60D4BB3C30F29BBE41780BBD6F960CBDFF477BBAFEE925BE14AA98BDE7F3763C96BD93BD2D32583CDDB1513
D2FAC8FBD7C727FBC09732D3CFAC8463D3B492A3D442A50BC147AA53C130F50BBA19E4ABC82144FBE8359D7BC27DF963D7F37F03C9D2B9D3BEE50D1BB3F6B88BDE4CD82BC492FB33C3403EBBCD01226BD380AB03A9413603D4B8CA43D10E860BC32090BBDD6D78D3CEA5FFB3D9031B1BDD3B1FE3B95FE89BEE5FAC03CB642F53B0948033EB072193C5B79613AF82F7E3CC82CE9BD0AA2BF3D2F3B02BC5A2E893C84A8863B13D94E3D719DE8BB685DC03DA3E7B2BD9593C33D9F57A3BBB16A643C6E5C0D3C9FD33F3C0F80E03DE6979E3D718C0C3CC14E28BDC6A48C3CFB51BE3AEF02043D18B829BDB79C2E37E722B43CEFD069BC7E2E6DBC3C31F73DB329013D464030BB0E0397BD63319BBAB6FD2F3CD4FC33BD9F8063BC9B0E913CC69726BC35A611BC0EBF08BDE3B4863BEF5669BC19F0F73C552F12BD720D04BC01C61DBDF9F43C3DEE6A0ABBEAE8CC3CE77293BC6BEEBA3B14CFBBBAA596253D7E7013BCE0FBA23CA9419F3DC3C69CB82C5A73BD22BA053D00F540BB36EF013DD9B73CBC40B80DBDAEF0B43D56350E3E4D7E3BBD1F9F3D3B14295EBCC80A093C1B5500BC08C73DBB96EE133CCA9B32BBDB7CC3BBF63E563DFB404D3B517998B83C0D133ED0CFD4BCF28DD63DCA09D6BB243E95BCD6C7B63CEF498A3D7A8D07BE576AAC3DEF888BBC742C4CBC14DFE03D9C7FCC3CCA7227BDC57518BD7AC9023DAFD34EBCC41C373D2FF516BC7667263D8D7236BEB11289BD551BA9BDABD315BB280335BD6E40B93C6295CC3C2F6394BC84B5053BE649753C5A34CABD7146E83A9A6204BD8026BF3DAA1B173E5153353D93A800BC1D411A3CF05728BD67D4B63BC33D883D9AC1EABC29108EBD40971A3B7CB404BE723D183CEEC4AB3C55B6903D51A1B43D351556BD12B9823DEAED503C886991BCEC502E3D5E776CBDBFAAC83C67CDBF3CAD0A9ABCFBC6C13DC1C8AE3C82A719BECAA0D33A75ED383DA587A03D2641C0BBEE5D7EBD06EEA43DCE1DCCBD695DAC3DE7BB803D25E903BD1F240DBD3AF23FBB96120E3D699042BD68F72A3D8916A0BD76D4C1BD5D84F0BC08CE1EBDC9CE083D25726DBCFC3275BEA5F798BD1C1983BAD5447D3DB19D08BDD263863D1B0C94BD1F9BED3C073CB9BD53578E3D919C8FBD62A53ABBF9B8603D702480BB8D4514BE4E5C2BBD49A8F03CB9EC373CCF98E6BB04800D3D718E9BBD6BB930BEE5ACE53CEFB9263EDAACA33C3E3C4BBD396410BCF5A2023DA2A3883D815FB93C91E68D3D784B1D3D9A975D3BF287D3BDE56DEE3DCB1891BDB41933BDAA39DCBC5FECD833E7D6D9BC0F1C0B3B67A3D73A9250083DC326F23C5AF077BDFA57C03DFB4FB33DF106803C4E0DF83C9AB354BC547CB93DBAA1E3BB75B2FC3C1881C9BD299055BD57F20F3E6CC7DEB
D8DF8E03C47FEE238AF960EBD1E08EB3D931AD9BC288491BC9CD850BD1582CABDBA7B0C3E5C5F0EBD530636BD8E53033D863FD4BB404FD73C8E1ECBBDF23B30BD9683B6BC972D4E3D6742EB3B6229CB3CC9DD113E68A4393DC2AC1CBD7696E1BDF71C93BD2C4C27BD779C5BBDF917F7BD58D0D8BC962BFBBD47DC733DF38707BDCD473ABDA56E5CBE490A39BDC964253D8F6AB0BD46B377BC8745A23D1FE0A2BC0166843B2040613B5B453C3CC84A56BDB818AE3D1214F6BDFEE74EBC5F471F3CDC8B10BE72FAC53DEEDA08BDFD7F1EBE73B5483AACAFD13DB5C74E3E8F4353BCC0758E3D22EE143E0A03B8BC1A7695BDDF371EBDE048A0BB0825A43DCCA049BD2DCEA8B74B0F3DBD6D1928BD08CDA93C6CD80D3C2CE68C3CA85909BC0B0503BDD2B20C3C50F03F3C97A2783C935BC03D6558853DC049563D35D81B3DC3846BBD4146A83D83245E3E54D9E2BD905A9EBD87D1E73B92B270BC9FDC853DDF6B40BD04EB103D4A6AEF3D3F652CBC14E675BD853211BDAB8D3BBBE14D063E6222213E86C88EBBC5F81FBD44F76E3C928B623D9E324ABD65E36CBDA0B40D3A27B9573DC40F30BDF2BA0D3D53A245BB9818FABD35026BBC067A1C3E54F6443DD0C6243CDBD89BBCF0C8B2BCDC383EBD070E993B144376BD01F298BCF951A7BB0FCE993DFD326DBB4F85133DDFAEC5BC7F41BD3C457918BC49E9013C698A1BBD6396153B1432B7BDAE94853C41589FBAFB5219BCE40D72390E5B9A3A6C546CBD03F8423D06B001BD3B877FBC269BD73B3F3FE1BD9B6000BA1291D03C4C3100BC9795BBBD45BD14BD0534B4BC1371EFBD031FADBDA5A9283DAED491BC5A1E983ACCD304BD9EAC593C5F0033BEF8DD7D3BFFAC68BCF795783B806D813DD4A23CBC4CAF603B7ED0D23CCC98223E64D976BD059EAEBCF25735BE0C4C2ABB6C9703BD984CB9BD50B1373C3DD0FFBB438F5BBD2DB356BCAAD28C3D665A2D3A7EC7A7BCB85EB93BB48582BCBBDC183CBDA1053DAE18D2BCB885DCBC46013E3D727A1CBB6A40923C3B6F19BBD082933C5E84523C1FF5CB3BAB024A3C3AB2F6395E67C63CFACC9F3CCF9AD43C44F431BEB432B23C16F72D3D1F54B4BC2937A23D8C19093D853E193D5518BA3B97DEF73BF0BDD7BAF73419BBE3A4893A12EDFFBC59708D3B28E9BD3BCA32BD3C5FF271BD318BD6BC3EA5663BD5D021BDFCBC273E447B1F3D48D706BE7B8D6A3A099F29BDFB75603B50CD23BC7DA4A03CDAC9B53DD0BCCCBDA16939BC1FF6553CA213073E6A89FC3C5B76A53D519E663C3F23AC3C98A5C0BDFC99A0BA9E6E3C3DF8CA21BD8DFA803C8A39223C244106BD276D05B33078A63D3FDEE73C784F9BBC4275553D434B043E55B7B8BC439E49BD88E19E3DA757FABC20975CBCEFCBCBBB4BF181B
CFE58F0BD5E192FBCC188A43DE6FBDABA671235BC7458123D5D54173C1EACCD3C5D22F4B96AA3AF3CE43B93BC47AC06BE0A35263CF35110BCFFE3A13CE00F1F3D335F0EBDC9CC433D1D13873C3149E93B6D0F5C3DEDE851BD689D953D979D043C26B9573DD65056BDA2A32DBD125F4E3D7906B6BB99A28DBC5023B93C9A1B15BD4F8F133C4093AEBE84E6A13D57303E3D274709BCF1196C3B5896143CDCEB5CBC35A3B5BB5FD579BDC12C4EBC5715E6BCE8D7B23DDF52833D9B82783B24D2323CBDE0D6BCB1D602BED682CEBC12D37ABC389B44BBE619C3BD51E0A83CCF0E0A3D4C10853D50ABAA3C46B3113BFD5268BD0447A7BBE3ACDFBCA48FC53DDFA7F73B5B75DEBC0538413B89234FBC1B26243B59C562BE5E62D33C3AC423371F1954BC99CF5FBD3E2E1FBD93C4A7BDEAB4D8BC8BC09E3C936C8CBB2595AF3BB1B62E3AD4E481BE4C043BBD328B4F3D7907FF3C0B898E3C9CBFCC3C3149B0BCDD6808BB6DA4543DC0F0B23B9BB972BA27F98DBDDD4F3ABD0B96813B77065F3D999A6C3C8D0118BC222522BDB48D953D59EEE13C48DCABBCF8892E3D2A636E3AAC6898BDFEF2863CD8082B3CDCEF193CEF7FD63CD33504BEEE3432BD6FB85B3DDE288ABC0D532B3B2793833CEE46BDBCB552E53B7A4F633C086D51BDF95C1B3D9834F1BC440BC2BA707A223E6EBA92BC44CC523DB26506BDA0DDACBD284408BD870C75BC400C0E3E54988C3D917E0DBD3FC5DD3DB5C34ABDF21035BDAFC2DE3DCED505BDBB11DC3D4A0F84BC87BB3A3D2E5BEEBBED9055BD7E80433DD5966A3D81BD31BEEE9B06BDBE4C2CBD80B027BBDFCD2DBD37FDB03D18448A3D084011BEE07901BEA020E63C2C39BDBDE591BCBD2AED66BD8080653DF8381C3CFDABE83C6EECEBBC39C1503E485DBABC106306BC47397EBD570A6B3D7A3D8ABDD65E113EAB5653BDEDB5B8BD0CA7263CD07C913D9566A23D561AE33DA79A03BCDE1945BDA0E8473CF95181BD4A8459BD128DF23CD26201BDDFA40ABDC4400C3EA81A80BC70600ABE9C89C7BD6AACC3BD78CD833D0A7D8F3C8253953C4321053E7C63D5BC9EA3363D01D767BD9CBD133C93E83B3C52D91FBB41F796BD072598BDEC24F63D62D5D1BD23A903BE780503BD31E412BE35099FBCB08252BD8FF5B5BD4F85953CB491B13C0109173CEEBF8ABD1052713B84C283BD1B4C7ABD25CA01BDCBA3B0BD40909FBD1BB742BDE40EB13DBA9A5F3D1EBE18BE9C14AEBC0792E53D0D2C243D3A204B3D634B73BDD4E79ABDFEE074BDDC2B50BD491AB1BD0E5B3FBBF0204CBD62FDFDBD222D563DE39C9E3D8A5851BCF966C33D65E0CE3CFED41FBDC82182BDC259983BE1A52A3C77CE97BD4145FDBC73F4E33340CF1CBE8A21243D9E2725BD991A923C3179043D662D2FB
EADF6123EC621CC3DAEA25B3DB600963D027C85BC7F12373D981CC2BC69E5B93C0661A13E808EC2BDB00B5E3D3E1C63BD592C33BEB8540ABCB553823D5032E33CE6C095BDD51B0ABE16E0873D8115FA3D16A82B3E087E9BBC90BA93BC24CB543D68B0F73D6BE832BD06E0A7BDA573453D4697F63D470CD33D526FC2BC8BD7643D2D09103E96BA1B3DC91A8ABDF7116DBD5C5E69BDE02F193D2339B43D9E0513BD42BC56BD26B23CBE4C4CED3CA6DC7ABD611D1CBE5960CABD2F6D2EBDCF6B433E9F26F3BCCEBA00BD2308C03DD8CE4CBD672EDF3B4B23BE3D86E0F53C9F10FFBD3663373E15AD68BD5FF83D3D76C881BDA999BBBD5E853CBC81D0A139C672DFBD504CB63B555FA23DA1E16C3D1384D53D25D79BBD5F80E3BD7A35583D9487F43B9FD2803DFD5EA4BBCC82203E8AD254BD84290EB8579A7D3C4FA7683DA50C753D5ADC3D3EDE45BE3CE219033C9C26953DC9648D3C6F5FE3BC10A8183D5B8B053E12BCED3D09F9BD3DDA4BCEBCD401783CEDCC37BD89AFD63DFFB0153B0A9397BE5117A33D8B03FD3D510F393D5DEF93BDAA91533C5C4BC83DDE24A43C1D9D38BD36CE14BD014CBBBD168D2B3CB3BF3439BA171DBC6EAE66BD8537463D31F2E8BCD65C82BD8073BABDD45B9A3DCC24743C395146BE3EB6A43DC12DEA3B660F8FBD3618953C5FC2FA3D86EE7F3D66B097BDBA46D6BC93A972BD888384BD762465BD761A6CBD76BCF53C90703ABB790E15BE74E99E398CE8E33CB0CA4BBCC973B4BDDEF3E03B3877DB3B2532FF3C29147EBC3FF08E3C7B05493DB1BE9D3CE13C273ACF25EC3C59281F3C7467453CA3C370BCD829BB3CA75221BD14FB603DC22296BBE46F78B9E90953BD76DD58BBC0E50BBE242E073D2F0B63BD30479D3D5B04A13C270522BC7E6023BC6DA47F3D8F67C03C5494FFBB1DE5CCBD2422063DF943F4BC19E781BA1EBDD33D23331B3C50CEB83BD055DBBC0ED20BBEC29B43BC6DD8D53C68E918BD5E71FC3C76541E3DEEF33FBB5D9AC43DB4FCE2BCAB7A5D3C4034163D6E98253C5AD9983BBA5023BC6256F0BCB88D403CE1075F3D25FA06BDB5A385BC9CF25B39C7D096BC024F1ABC72170CBDDDE3B8BCFF7BA33CC1C21E3D84CCEF3A1ABB1E3CE47C223AED1F03BC5C48A13C96392D3D02C7A03C23E4793DA03F873CC5BE91BD8505373D7029E43B5979B6BC8DABF4BD23D3CD3CD32C16BD7EBFB13CFBE15A3C676D31BDF6368A3D3E8CF93BBC54583D8C3783BD7468273C52D0253CABC27F3CD1E3EABDF7E4243D1209023D353152BC3BE93FBD159B09BC6B3CD3BBF2C3903B120D7A3C12A475BC015B2D3CBC9BF0BC5DAB013E1F3C16BDF30D813D644D2C3DDB4CA03D743B6ABB83EC103C5B50CF3C636E85BCB90BA53B9CB83B3DAA5811BDD1DB9BB
07D6C03BC5FE913BD6572A83BFF34DABDE34E28BD4F7731BC921DC1BDE13D613EFEF8A7BB125F743D87A92B3B5F0EF0BBB8A8CA3C8A36643C6F2F0FBE99E3BEBD9F111B3CCE20FD3A0A8E803C3175DBBC51A8D83B029D2B3C0537EEBC9B8F60BE3660873CE9298E3CD3639C3D65AE46BE030B46BC21C300BD37B0BB3CDAA48FBAF1BCC5BC4E76783D3B3DB83C561353BED63BA0BC85E044BC052A223CC81F0C3EB798903CBA9B223AAA93A0BD28A5DC3D25AF973BDF331FBD3FA6B9BDC1C4F9BC8BF7AF3C78D992BBDA89723C6AE9DE3C58CCE63B3C1149BCDFA4ACBCD9D588BB7F2F033EC1D4FFBD67280438E5F6F2BD9741DCBD02D84EBD29C424BD50B9103B8FD2A1BA488CEEBAD63D683C676C89BC3D276DBD1EBCAABC6B48493E8E9853BCF29E6B3CD45D41BD1305143EA340383D74FEDF3BE813B3BBC1AC3E3DD4C3023BDD29753D29A5993A601F4E378F79EEBA7AF8853A1D1869BD0CA89CBDBD2C153B6E37303C17BAE9BC48FAF63C655289BCD844813D5A6DD9BED3027BBEC9B4FF3CA02E3A3C6E52AD3C15AB8C3C15EA873C9C1F853AB3D564BCA483B4BCE4AF333CA8FD30BC144D623D4A5D7E3CA12C3B3D9FCA9DBCCE7556BBF279AC3CB9DF10BBB6FE2C3CAEAF8E3D252CDA396FDF73BD8332CB3C6AED623B351C22BCCADB483D86B8623C4B5EF53C1C4A21BDD7E9473C4116013D53E7A4BD73638FBB26F8063CF7FE113CA464BABCFAFEA9B978211A3CD541AABD871AB83CFE6C4BBD"> : tensor<2x4x16x128xf32> + %1 = "mhlo.transpose"(%arg0) {permutation = dense<[0,2,3,1]> : tensor<4xi64>} : (tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> + %2 = mhlo.add %1, %0 : tensor<2x4x16x128xf32> + return %2 : tensor<2x4x16x128xf32> +} +// CHECK-LABEL: func.func @transpose_move_down_binary_case0 +// CHECK-NEXT: mhlo.constant +// CHECK-NEXT: mhlo.add +// CHECK-NEXT: mhlo.transpose +// CHECK-NEXT: return + +func.func @transpose_move_down_binary_case1(%arg0 : tensor<2x128x4x16xf32>,%arg1 : tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> { + %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0,2,3,1]> : tensor<4xi64>} : (tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> + %1 = "mhlo.transpose"(%arg1) {permutation = dense<[0,2,3,1]> : tensor<4xi64>} : (tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> + %2 = mhlo.add %1, %0 : tensor<2x4x16x128xf32> + return %2 : tensor<2x4x16x128xf32> +} +// 
CHECK-LABEL: func.func @transpose_move_down_binary_case1 +// CHECK-NEXT: mhlo.add +// CHECK-NEXT: mhlo.transpose +// CHECK-NEXT: return diff --git a/compiler/python/ByteIRModules.cpp b/compiler/python/ByteIRModules.cpp index 3ba11567f..076e24c6d 100644 --- a/compiler/python/ByteIRModules.cpp +++ b/compiler/python/ByteIRModules.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "bindings/c/Passes.h" #include "byteir-c/Dialects.h" #include "byteir-c/Passes.h" #include "byteir-c/Translation.h" @@ -30,6 +31,7 @@ static MlirStringRef toMlirStringRef(const std::string &s) { PYBIND11_MODULE(_byteir, m) { byteirRegisterAllPasses(); + mlirRegisterAllMhloPasses(); m.doc() = "byteir python extension"; diff --git a/compiler/python/byteir/compile.py b/compiler/python/byteir/compile.py index 70c1bc1f8..297beaeb3 100644 --- a/compiler/python/byteir/compile.py +++ b/compiler/python/byteir/compile.py @@ -88,6 +88,11 @@ def compile_cuda( _print_verbose(module, "// IR Dump After GPU Opt:") with context: PassManager.parse("builtin.module(func.func(remove-func-body{anchor-attr=__byteir_elementwise_fusion__}))").run(module.operation) + PassManager.parse("builtin.module(inline)").run(module.operation) + if useBarePtrCallConv: + PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre{use-bare-ptr-memref-call-conv=true}))").run(module.operation) + else: + PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre))").run(module.operation) PassManager.parse("builtin.module(func.func(set-op-space{" + entry_func_str + " space={}".format(target) + "}))").run(module.operation) PassManager.parse("builtin.module(set-arg-space{" + entry_func_str + " all-space={}".format(target) + "})").run(module.operation) if verbose: @@ -127,7 +132,7 @@ def compile_cuda_with_ait( name: str = "model", aggressive_mode: bool = False, parallelism: int = 1, - disable_ait_cache: bool = False, + disable_byteir_cache: bool = False, 
**kwargs, ): target = "cuda" @@ -143,7 +148,7 @@ def compile_cuda_with_ait( processor = IRProcessor(name, "./workspace", compile_parallelism=parallelism, - disable_ait_cache=disable_ait_cache, + disable_byteir_cache=disable_byteir_cache, verbose=verbose) with context: processor.load_from_file(input) @@ -202,6 +207,11 @@ def compile_cuda_with_ait( _print_verbose(processor.module, "// IR Dump After GPU Opt:") with context: PassManager.parse("builtin.module(func.func(remove-func-body{anchor-attr=__byteir_elementwise_fusion__}))").run(processor.module.operation) + PassManager.parse("builtin.module(inline)").run(processor.module.operation) + if useBarePtrCallConv: + PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre{use-bare-ptr-memref-call-conv=true}))").run(processor.module.operation) + else: + PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre))").run(processor.module.operation) PassManager.parse("builtin.module(func.func(set-op-space{" + entry_func_str + " space={}".format(target) + "}))").run(processor.module.operation) PassManager.parse("builtin.module(set-arg-space{" + entry_func_str + " all-space={}".format(target) + "})").run(processor.module.operation) if verbose: @@ -241,7 +251,7 @@ def compile( target: str = "cuda", verbose: bool = False, parallelism: int = 1, - disable_ait_cache: bool = False, + disable_byteir_cache: bool = False, **kwargs, ): if target == "cuda": @@ -252,7 +262,7 @@ def compile( entry_func, verbose, parallelism=parallelism, - disable_ait_cache=disable_ait_cache) + disable_byteir_cache=disable_byteir_cache) elif target == "cuda_with_ait_aggressive": compile_cuda_with_ait(input, output, @@ -260,6 +270,6 @@ def compile( verbose, aggressive_mode=True, parallelism=parallelism, - disable_ait_cache=disable_ait_cache) + disable_byteir_cache=disable_byteir_cache) else: raise NotImplemented("not implemented target: {}".format(target)) diff --git a/compiler/python/byteir/dialects/cat/ir_processor.py 
b/compiler/python/byteir/dialects/cat/ir_processor.py index a397eff0d..c1388385b 100644 --- a/compiler/python/byteir/dialects/cat/ir_processor.py +++ b/compiler/python/byteir/dialects/cat/ir_processor.py @@ -19,7 +19,6 @@ def func_hash_str(func, gpu_type): hash_str = gpu_type + "_" ops = func.entry_block.operations - # assert len(ops) == 2 for op in ops: hash_str += f"{op.get_asm(large_elements_limit=None)};" return hash_str @@ -29,7 +28,7 @@ def __init__(self, job_name, workdir, compile_parallelism = MAX_COMPILATION_PARALLELISM, - disable_ait_cache = False, + disable_byteir_cache = False, verbose = False): self.job_name = job_name self.workdir = workdir @@ -37,14 +36,11 @@ def __init__(self, self.ait_reuse_recorder = {} # key: hash str, value: Tuple(dll_name, ait_module_path) self.compile_parallelism = min(compile_parallelism, MAX_COMPILATION_PARALLELISM) self.pool = multiprocessing.Pool(compile_parallelism) - self.ait_cache = AITCache() + self.byteir_cache = AITCache() self.verbose = verbose - # ait_cache is enabled when ait_reuse is enabled - # in other words, once `ait_reuse` is set to False, - # we will orcely compile all ait ops with bo reuse or cache. 
- self.disable_ait_cache = disable_ait_cache - if not disable_ait_cache: - self.ait_cache.load_or_create_cache() + self.disable_byteir_cache = disable_byteir_cache + if not disable_byteir_cache: + self.byteir_cache.load_or_create_cache() def _get_builder(self, module, subgraph_name, backend="ait"): assert module != None @@ -139,11 +135,11 @@ def ait_opt_pass(self, anchor_only=False, dump_ir=False): self.ait_reuse_recorder[hash_str] = (builder.dll_name, builder.ait_module_path) libs_to_add_to_cache[hash_str] = builder.ait_module_path dedup_work_items.append((hash_str, func_ir_str)) - - # search in ait cache + + # search in byteir cache work_items_not_in_cache = [] for hash_str, func_ir_str in dedup_work_items: - cached_lib = self.ait_cache.find(gpu_type, hash_str) + cached_lib = self.byteir_cache.find(gpu_type, hash_str) if cached_lib != None: # hit, copy cached lib context = ir.Context() @@ -171,12 +167,12 @@ def ait_opt_pass(self, anchor_only=False, dump_ir=False): t_ed = time.time() print("compilation finished in {}s".format(t_ed-t_st)) - # update ait cache - if not self.disable_ait_cache: + # update byteir cache + if not self.disable_byteir_cache: for key, lib_path in libs_to_add_to_cache.items(): - self.ait_cache.add(gpu_type, key, lib_path, override=False) - self.ait_cache._save() - self.ait_cache.close_cache() + self.byteir_cache.add(gpu_type, key, lib_path, override=False) + self.byteir_cache._save() + self.byteir_cache.close_cache() with self.module.context: pm = PassManager.parse("builtin.module(func.func(gen-ait-config{{func-names={} ait-lib-paths={}}}))".format(funcNameArg, aitLibPathArg)) diff --git a/compiler/python/byteir/tools/compiler.py b/compiler/python/byteir/tools/compiler.py index 5385614af..51aedc583 100644 --- a/compiler/python/byteir/tools/compiler.py +++ b/compiler/python/byteir/tools/compiler.py @@ -11,7 +11,7 @@ parser.add_argument("--target", type=str, default="cuda", help="target device name") parser.add_argument("-v", "--verbose", 
action="store_true") parser.add_argument("--ait_parallelism", type=int, default=1, help="number of processes to compile ait op") - parser.add_argument("--disable_ait_cache", action="store_true") + parser.add_argument("--disable_byteir_cache", action="store_true") args = parser.parse_args() byteir.compile(args.input_mlir_path, @@ -20,6 +20,6 @@ args.target, args.verbose, args.ait_parallelism, - args.disable_ait_cache) + args.disable_byteir_cache) diff --git a/compiler/test/Conversion/HloToCat/fused_ops.mlir b/compiler/test/Conversion/HloToCat/fused_ops.mlir index 0b84fc4d5..92e0f2d1c 100644 --- a/compiler/test/Conversion/HloToCat/fused_ops.mlir +++ b/compiler/test/Conversion/HloToCat/fused_ops.mlir @@ -61,13 +61,22 @@ func.func @test_bmm_rcr_permute(%arg0: tensor<384x256x256xf32>, %arg1: tensor<38 // CHECK-NEXT: cat.bmm_rcr_permute // CHECK-NEXT: return +func.func @test_not_bmm_rrr_permute(%arg0: tensor<1x64x4096xf32>, %arg1: tensor<1x4096x4096xf32>) -> tensor<1x32x64x128xf32> { + %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<1x64x4096xf32>, tensor<1x4096x4096xf32>) -> tensor<1x64x4096xf32> + %1 = mhlo.reshape %0 : (tensor<1x64x4096xf32>) -> tensor<1x64x32x128xf32> + %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x64x32x128xf32>) -> tensor<1x32x64x128xf32> + return %2 : tensor<1x32x64x128xf32> +} +// CHECK-LABEL: func.func @test_not_bmm_rrr_permute +// CHECK-NOT: cat.bmm_rrr_permute + func.func @test_bmm_rrr_add_0(%arg0: tensor<384x256x256xf32>, %arg1: tensor<384x256x64xf32>, %arg2: tensor<384x256x64xf32>) -> tensor<384x256x64xf32> { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<384x256x256xf32>, tensor<384x256x64xf32>) -> tensor<384x256x64xf32> %1 = mhlo.add %0, %arg2 : (tensor<384x256x64xf32>, tensor<384x256x64xf32>) -> tensor<384x256x64xf32> return %1 : tensor<384x256x64xf32> } -// CHECK: func.func @test_bmm_rrr_add_0 +// CHECK-LABEL: func.func 
@test_bmm_rrr_add_0 // CHECK-NEXT: cat.bmm_rrr_add // CHECK-NEXT: return @@ -77,7 +86,7 @@ func.func @test_bmm_rrr_add_1(%arg0: tensor<384x256x256xf32>, %arg1: tensor<384x return %1 : tensor<384x256x64xf32> } -// CHECK: func.func @test_bmm_rrr_add_1 +// CHECK-LABEL: func.func @test_bmm_rrr_add_1 // CHECK-NEXT: cat.bmm_rrr_add // CHECK-NEXT: return @@ -88,7 +97,7 @@ func.func @test_bmm_crr_add(%arg0: tensor<384x256x256xf32>, %arg1: tensor<384x25 return %2 : tensor<384x256x64xf32> } -// CHECK: func.func @test_bmm_crr_add +// CHECK-LABEL: func.func @test_bmm_crr_add // CHECK-NEXT: cat.bmm_crr_add // CHECK-NEXT: return @@ -205,7 +214,7 @@ func.func @test_gemm_rrr_bias(%arg0: tensor<2x2048xf32>, %arg1: tensor<2048x1001 return %3 : tensor<2x1001xf32> } -// CHECK: func.func @test_gemm_rrr_bias +// CHECK-LABEL: func.func @test_gemm_rrr_bias // CHECK-NEXT: mhlo.constant // CHECK-NEXT: cat.gemm_rrr_bias // CHECK-NEXT: return @@ -216,7 +225,7 @@ func.func @test_bmm_crc(%arg0: tensor<512x1024x128xf16>, %arg1: tensor<512x1024x return %1 : tensor<512x1024x128xf16> } -// CHECK: func.func @test_bmm_crc +// CHECK-LABEL: func.func @test_bmm_crc // CHECK-NEXT: cat.bmm_crc // CHECK-NEXT: return @@ -226,7 +235,7 @@ func.func @test_bmm_rrc(%arg0: tensor<512x128x1024xf16>, %arg1: tensor<512x1024x return %1 : tensor<512x1024x128xf16> } -// CHECK: func.func @test_bmm_rrc +// CHECK-LABEL: func.func @test_bmm_rrc // CHECK-NEXT: cat.bmm_rrc // CHECK-NEXT: return @@ -237,7 +246,7 @@ func.func @test_transpose_reshape_bmm_rrr_to_reshape_bmm_rcr(%arg0: tensor<64x12 return %2 : tensor<64x128x128xf16> } -// CHECK: func.func @test_transpose_reshape_bmm_rrr_to_reshape_bmm_rcr +// CHECK-LABEL: func.func @test_transpose_reshape_bmm_rrr_to_reshape_bmm_rcr // CHECK-NEXT: mhlo.reshape // CHECK-NEXT: cat.bmm_rcr // CHECK-NEXT: return @@ -248,7 +257,7 @@ func.func @test_bmm_rrr_reshape_transpose_to_bmm_rrc_reshape(%arg0: tensor<64x12 %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 1, 3, 2]> : 
tensor<4xi64>} : (tensor<2x32x128x128xf16>) -> tensor<2x32x128x128xf16> return %2 : tensor<2x32x128x128xf16> } -// CHECK: func.func @test_bmm_rrr_reshape_transpose_to_bmm_rrc_reshape +// CHECK-LABEL: func.func @test_bmm_rrr_reshape_transpose_to_bmm_rrc_reshape // CHECK-NEXT: cat.bmm_rrc // CHECK-NEXT: mhlo.reshape // CHECK-NEXT: return @@ -259,7 +268,7 @@ func.func @test_bmm_crr_reshape_transpose_to_bmm_crc_reshape(%arg0: tensor<512x1 %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<16x32x128x128xf16>) -> tensor<16x32x128x128xf16> return %2 : tensor<16x32x128x128xf16> } -// CHECK: func.func @test_bmm_crr_reshape_transpose_to_bmm_crc_reshape +// CHECK-LABEL: func.func @test_bmm_crr_reshape_transpose_to_bmm_crc_reshape // CHECK-NEXT: cat.bmm_crc // CHECK-NEXT: mhlo.reshape // CHECK-NEXT: return @@ -268,8 +277,43 @@ func.func @test_softmax_f16(%arg0 : tensor<1x12x1024x1024xf16>) -> tensor<1x12x1 %0 = mhlo.custom_call @byteir.softmax(%arg0) {backend_config = "", byteir_attrs = {axis = 3 : i64}} : (tensor<1x12x1024x1024xf16>) -> tensor<1x12x1024x1024xf32> return %0 : tensor<1x12x1024x1024xf32> } - -// CHECK: func.func @test_softmax_f16 +// CHECK-LABEL: func.func @test_softmax_f16 // CHECK-NEXT: cat.softmax // CHECK-NEXT: mhlo.convert // CHECK-NEXT: return + +func.func @test_bmm_rrr_broadcast_to_reshape_gemm_rrr_reshape(%arg0: tensor<16x1024x4096xf16>, %arg1: tensor<4096x4096xf16>) -> tensor<16x1024x4096xf16> { + %0 = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<16x4096x4096xf16> + %1 = "cat.bmm_rrr"(%arg0, %0) : (tensor<16x1024x4096xf16>, tensor<16x4096x4096xf16>) -> tensor<16x1024x4096xf16> + return %1 : tensor<16x1024x4096xf16> +} +// CHECK-LABEL: func.func @test_bmm_rrr_broadcast_to_reshape_gemm_rrr_reshape +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: cat.gemm_rrr +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: return + +func.func 
@test_transpose_bmm_rrr_broadcast_to_gemm_rrr_permute(%arg0: tensor<16x1024x4096xf16>, %arg1: tensor<4096x4096xf16>) -> tensor<16x32x1024x128xf16> { + %0 = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<16x4096x4096xf16> + %1 = "cat.bmm_rrr"(%arg0, %0) : (tensor<16x1024x4096xf16>, tensor<16x4096x4096xf16>) -> tensor<16x1024x4096xf16> + %2 = mhlo.reshape %1 : (tensor<16x1024x4096xf16>) -> tensor<16x1024x32x128xf16> + %3 = "mhlo.transpose"(%2) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<16x1024x32x128xf16>) -> tensor<16x32x1024x128xf16> + return %3 : tensor<16x32x1024x128xf16> +} +// CHECK-LABEL: func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rrr_permute +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: cat.gemm_rrr_permute +// CHECK-NEXT: return + +func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rcr_permute(%arg0: tensor<16x1024x4096xf16>, %arg1: tensor<4096x4096xf16>) -> tensor<16x32x1024x128xf16> { + %t = "mhlo.transpose"(%arg1) {permutation = dense<[1,0]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<4096x4096xf16> + %0 = "mhlo.broadcast_in_dim"(%t) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<16x4096x4096xf16> + %1 = "cat.bmm_rrr"(%arg0, %0) : (tensor<16x1024x4096xf16>, tensor<16x4096x4096xf16>) -> tensor<16x1024x4096xf16> + %2 = mhlo.reshape %1 : (tensor<16x1024x4096xf16>) -> tensor<16x1024x32x128xf16> + %3 = "mhlo.transpose"(%2) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<16x1024x32x128xf16>) -> tensor<16x32x1024x128xf16> + return %3 : tensor<16x32x1024x128xf16> +} +// CHECK-LABEL: func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rcr_permute +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: cat.gemm_rcr_permute +// CHECK-NEXT: return \ No newline at end of file diff --git a/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir b/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir index 
9eec21e83..4dfa8f99a 100644 --- a/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir +++ b/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir @@ -1,4 +1,4 @@ -// RUN: byteir-opt -convert-lmhlo-to-byre %s | FileCheck %s +// RUN: byteir-opt -memref-to-byre --split-input-file %s | FileCheck %s module attributes {byre.container_module} { // CHECK: module attributes {byre.container_module} { @@ -8,4 +8,22 @@ module attributes {byre.container_module} { // CHECK: byre.copy(%arg0, %alloc) {callee = "cpu2gpu"} : memref<4xf32, "cpu">, memref<4xf32, "gpu"> return } -} \ No newline at end of file +} + +// ----- + +module attributes {byre.container_module} { +// CHECK: module attributes {byre.container_module} { + func.func @forward(%arg0: memref {byre.argname = "A", byre.argtype = 1 : i32}, %arg1: memref<2xi64, "cuda"> {byre.argname = "Out", byre.argtype = 2 : i32}) attributes { byre.entry_point } { + %expand_shape = memref.expand_shape %arg0 [] : memref into memref<1xi64, "cuda"> + // CHECK: byre.alias + %alloc = memref.alloc() : memref<2xi64, "cuda"> + %subview = memref.subview %alloc[0] [1] [1] : memref<2xi64, "cuda"> to memref<1xi64, strided<[1]>, "cuda"> + // CHECK: byre.alias + memref.copy %expand_shape, %subview : memref<1xi64, "cuda"> to memref<1xi64, strided<[1]>, "cuda"> + // CHECK: byre.copy + memref.copy %alloc, %arg1 : memref<2xi64, "cuda"> to memref<2xi64, "cuda"> + // CHECK: byre.copy + return + } +} diff --git a/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir b/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir index 8759d25ef..f8e0c1992 100644 --- a/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir +++ b/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir @@ -18,6 +18,8 @@ func.func @tensor_collapse(%arg0 : tensor<12x1024x1024xf32>, %arg1 : tensor<1x10 transform.sequence failures(propagate) { ^bb0(%arg0: !pdl.operation): - %0 = transform.structured.match 
ops{["linalg.generic"]} in %arg0 : (!pdl.operation) -> !pdl.operation - %1 = transform.structured.fold_unit_extent_dims %0 + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation + transform.apply_patterns to %0 { + transform.apply_patterns.linalg.fold_unit_extent_dims_via_reshapes + } : !pdl.operation } diff --git a/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir b/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir index 1b13f2851..2ed4c0d77 100644 --- a/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir +++ b/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir @@ -61,3 +61,22 @@ func.func @convert_rng_dynamic(%arg0: tensor) -> tensor { // CHECK-NEXT: call @NextOffsetFunc // CHECK-NEXT: mhlo.custom_call // CHECK-SAME: @byteir.rng_uniform + +// ----- + +func.func @flash_attn_fwd(%arg0: tensor<2x256x12x128xf16>, %arg1: tensor<2x256x12x128xf16>, %arg2: tensor<2x256x12x128xf16>) -> (tensor<2x256x12x128xf16>, tensor<2x12x256xf32>, tensor<2x12x256x256xf16>, tensor<2xi64>) { + %0:4 = mhlo.custom_call @byteir.flash_attn_fwd(%arg0, %arg1, %arg2) {backend_config = "", byteir_attrs = {casual = false, dropout_p = 1.000000e-01 : f64, return_softmax = false, softmax_scale = 1.000000e+00 : f64}} : (tensor<2x256x12x128xf16>, tensor<2x256x12x128xf16>, tensor<2x256x12x128xf16>) -> (tensor<2x256x12x128xf16>, tensor<2x12x256xf32>, tensor<2x12x256x256xf16>, tensor<2xi64>) + return %0#0, %0#1, %0#2, %0#3 : tensor<2x256x12x128xf16>, tensor<2x12x256xf32>, tensor<2x12x256x256xf16>, tensor<2xi64> +} + +// CHECK-LABEL: func.func private @NextOffsetFunc() -> tensor attributes {byre_compute_name = "NextOffset", byre_force_compute_name} +// CHECK-LABEL: func.func private @GetSeedFunc() -> tensor attributes {byre_compute_name = "GetSeed", byre_force_compute_name} +// CHECK-LABEL: func.func @flash_attn_fwd +// CHECK-NEXT: call @GetSeedFunc +// CHECK-NEXT: call @NextOffsetFunc +// CHECK-NEXT: 
mhlo.reshape +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: mhlo.concatenate +// CHECK-NEXT: mhlo.custom_call +// CHECK-SAME: @byteir.flash_attn_fwd +// CHECK-SAME: byteir_attrs = {casual = false, dropout_p = 1.000000e-01 : f64, return_softmax = false, softmax_scale = 1.000000e+00 : f64} diff --git a/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir b/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir index 3a46a008b..aa83789d5 100644 --- a/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir +++ b/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir @@ -34,6 +34,18 @@ func.func @transpose_move_down_binary_splat_const(%arg0 : tensor<31x20x32xf32>) // CHECK-NEXT: mhlo.transpose // CHECK-NEXT: return +func.func @transpose_move_down_binary_dense_const(%arg0 : tensor<3x2xf32>) -> tensor<2x3xf32> { + %0 = mhlo.constant dense<[[1.0,2.0,3.0],[4.0,5.0,6.0]]> : tensor<2x3xf32> + %1 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<3x2xf32>) -> tensor<2x3xf32> + %2 = mhlo.add %1, %0 : tensor<2x3xf32> + return %2 : tensor<2x3xf32> +} +// CHECK-LABEL: func.func @transpose_move_down_binary_dense_const +// CHECK-NEXT: mhlo.constant {{.*}} tensor<3x2xf32> +// CHECK-NEXT: mhlo.add +// CHECK-NEXT: mhlo.transpose +// CHECK-NEXT: return + func.func @transpose_move_down_unary_and_cancel(%arg0 : tensor<31x20x32xf32>) -> tensor<31x20x32xf32> { %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<31x20x32xf32>) -> tensor<20x31x32xf32> %1 = "mhlo.abs"(%0) : (tensor<20x31x32xf32>) -> tensor<20x31x32xf32> @@ -76,19 +88,17 @@ func.func @transpose_move_down_two_unary(%arg0 : tensor<31x20x32xf32>) -> tensor // CHECK-NEXT: return // MULTIUSER-LABEL: func.func @transpose_move_down_two_unary -// MULTIUSER-DAG{ABS}: mhlo.abs -// MULTIUSER-NEXT{ABS}: mhlo.transpose -// MULTIUSER-DAG{SINE}: mhlo.sine -// MULTIUSER-NEXT{SINE}: mhlo.transpose +// MULTIUSER-DAG: mhlo.abs +// MULTIUSER-DAG: mhlo.sine // MULTIUSER: mhlo.add +// 
MULTIUSER-NEXT: mhlo.transpose // MULTIUSER-NEXT: return // AllMULTIUSER-LABEL: func.func @transpose_move_down_two_unary -// AllMULTIUSER-DAG{ABS}: mhlo.abs -// AllMULTIUSER-NEXT{ABS}: mhlo.transpose -// AllMULTIUSER-DAG{SINE}: mhlo.sine -// AllMULTIUSER-NEXT{SINE}: mhlo.transpose +// AllMULTIUSER-DAG: mhlo.abs +// AllMULTIUSER-DAG: mhlo.sine // AllMULTIUSER: mhlo.add +// AllMULTIUSER-NEXT: mhlo.transpose // AllMULTIUSER-NEXT: return func.func @transpose_move_down_1_unary_1_invalid(%arg0 : tensor<31x20x32xf32>, %arg1 : tensor<20x31x32xf32>)-> tensor<20x31x32xf32> { diff --git a/compiler/test/Dialect/Tensor/canonicalizeExt.mlir b/compiler/test/Dialect/Tensor/canonicalizeExt.mlir index 83b828455..b0042aa23 100644 --- a/compiler/test/Dialect/Tensor/canonicalizeExt.mlir +++ b/compiler/test/Dialect/Tensor/canonicalizeExt.mlir @@ -34,3 +34,14 @@ func.func @extract_slice_and_collapse_shape_no_fold(%arg0: tensor<19x1024x1xi32> // CHECK: tensor.extract_slice // CHECK: tensor.collapse_shape +// ---- + +func.func @fold_zero_rank_from_elements_insert_slice(%arg0: tensor<1024xf32>, %scalar : f32) -> tensor<1024xf32> { + %0 = tensor.from_elements %scalar : tensor + %1 = tensor.insert_slice %0 into %arg0[256] [1] [1] : tensor into tensor<1024xf32> + return %1 : tensor<1024xf32> +} +// CHECK-LABEL: fold_zero_rank_from_elements_insert_slice +// CHECK: tensor.insert +// CHECK-NOT: tensor.from_elements +// CHECK-NOT: tensor.insert_slice \ No newline at end of file diff --git a/compiler/test/Transforms/canonicalizeExt.mlir b/compiler/test/Transforms/canonicalizeExt.mlir index abb11d500..a7dbb37cb 100644 --- a/compiler/test/Transforms/canonicalizeExt.mlir +++ b/compiler/test/Transforms/canonicalizeExt.mlir @@ -343,3 +343,40 @@ func.func @transpose_reshape_transpose(%arg0: tensor<2x32x128x256xf16>) -> (tens // CHECK-NEXT: mhlo.reshape // CHECK-NEXT: mhlo.reshape // CHECK-NEXT: return + +func.func @replace_gather_with_input_0() -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) { + %0 = 
mhlo.constant dense<1.000000e+00> : tensor<64x128xf16> + %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64> + %2 = "mhlo.gather"(%0, %1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16> + %3 = mhlo.reshape %2 : (tensor<64x128xf16>) -> tensor<1x64x128xf16> + %4 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16> + return %3, %4 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16> +} +// CHECK-LABEL: @replace_gather_with_input_0 +// CHECK-NEXT: mhlo.constant +// CHECK-NEXT: mhlo.constant +// CHECK-NEXT: return + +func.func @replace_gather_with_input_1(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) { + %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64> + %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16> + %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16> + %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16> + return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16> +} +// CHECK-LABEL: @replace_gather_with_input_1 +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: mhlo.broadcast_in_dim +// CHECK-NEXT: return + +func.func @replace_gather_with_input_2(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) { + %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<128xi64> + %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[64, 1]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<128xi64>) -> tensor<64x128xf16> + %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> 
tensor<1x64x128xf16> + %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16> + return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16> +} +// CHECK-LABEL: @replace_gather_with_input_2 +// CHECK-NEXT: mhlo.reshape +// CHECK-NEXT: mhlo.broadcast_in_dim +// CHECK-NEXT: return diff --git a/compiler/test/Transforms/memoryPlanning.mlir b/compiler/test/Transforms/memoryPlanning.mlir index 1edca2483..bb9abd835 100644 --- a/compiler/test/Transforms/memoryPlanning.mlir +++ b/compiler/test/Transforms/memoryPlanning.mlir @@ -1,7 +1,7 @@ -// RUN: byteir-opt %s -memory-planning --canonicalize --cse | FileCheck %s -// RUN: byteir-opt %s -memory-planning="alignment=64" --canonicalize --cse | byteir-stat --alloc-cnt | FileCheck %s --check-prefix CHECK-STAT -// RUN: byteir-opt %s -memory-planning="alloca" --canonicalize --cse | FileCheck %s --check-prefix CHECK-ALLOCA -// RUN: byteir-opt %s -memory-planning="alloca mem-space=2" --canonicalize --cse | FileCheck %s --check-prefix CHECK-SPACE +// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning,canonicalize,cse))' | FileCheck %s +// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning{alignment=64},canonicalize,cse))' | byteir-stat --alloc-cnt | FileCheck %s --check-prefix CHECK-STAT +// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning{alloca},canonicalize,cse))' | FileCheck %s --check-prefix CHECK-ALLOCA +// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning{alloca mem-space=2},canonicalize,cse))' | FileCheck %s --check-prefix CHECK-SPACE func.func @test_basic_reuse(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>) -> memref<256xf32> attributes {__placeholder__byre.entry_point} { %0 = memref.alloc() : memref<256xf32> @@ -203,9 +203,9 @@ func.func @test_reuse_sub_chunk_i1(%arg0 : memref<512xi1>, %arg1 : memref<512xi1 func.func 
@test_reuse_single_memory_space(%arg0 : memref<512xf32, 1>, %arg1 : memref<512xf32, 2>) { %0 = memref.alloc() : memref<512xf32, 1> - %1 = memref.alloc() : memref<512xf32, 2> + %1 = memref.alloca() : memref<512xf32, 2> %2 = memref.alloc() : memref<512xf32, 1> - %3 = memref.alloc() : memref<512xf32, 2> + %3 = memref.alloca() : memref<512xf32, 2> "lmhlo.add"(%arg0, %arg0, %0) : (memref<512xf32, 1>, memref<512xf32, 1>, memref<512xf32, 1>) -> () "lmhlo.add"(%arg1, %arg1, %1) : (memref<512xf32, 2>, memref<512xf32, 2>, memref<512xf32, 2>) -> () "lmhlo.add"(%0, %0, %arg0) : (memref<512xf32, 1>, memref<512xf32, 1>, memref<512xf32, 1>) -> () diff --git a/compiler/tools/byteir-opt/CMakeLists.txt b/compiler/tools/byteir-opt/CMakeLists.txt index 9c13a778e..f8b667c21 100644 --- a/compiler/tools/byteir-opt/CMakeLists.txt +++ b/compiler/tools/byteir-opt/CMakeLists.txt @@ -8,6 +8,7 @@ set(BYTEIR_LIBS MLIRCclTransformOps ByteIRAffinePasses ByteIRByrePasses + ByteIRGPUPasses ByteIRGPUPipelines ByteIRHostPipelines ByteIRLinalgPasses diff --git a/compiler/tools/byteir-opt/byteir-opt.cpp b/compiler/tools/byteir-opt/byteir-opt.cpp index 8cb60be72..ed3c15e51 100644 --- a/compiler/tools/byteir-opt/byteir-opt.cpp +++ b/compiler/tools/byteir-opt/byteir-opt.cpp @@ -21,6 +21,7 @@ #include "byteir/Dialect/Ccl/IR/CclOps.h" #include "byteir/Dialect/Ccl/Passes.h" #include "byteir/Dialect/Ccl/TransformOps/CclTransformOps.h" +#include "byteir/Dialect/GPU/Passes.h" #include "byteir/Dialect/Lace/LaceDialect.h" #include "byteir/Dialect/Linalg/IR/LinalgExtOps.h" #include "byteir/Dialect/Linalg/Passes.h" @@ -30,6 +31,7 @@ #include "byteir/Dialect/Shape/IR/ShapeExtOps.h" #include "byteir/Dialect/Shape/Passes.h" #include "byteir/Dialect/Tensor/IR/TilingInterfaceImpl.h" +#include "byteir/Dialect/Tensor/Passes.h" #include "byteir/Dialect/Transform/IR/TransformExtOps.h" #include "byteir/Dialect/Transform/Passes.h" #include "byteir/Dialect/Vector/Transforms/Passes.h" @@ -123,11 +125,13 @@ int main(int argc, 
char **argv) { registerByteIRAffinePasses(); registerByteIRByrePasses(); registerByteIRCclPasses(); + registerByteIRGPUPasses(); registerByteIRLinalgPasses(); registerByteIRMemRefPasses(); registerByteIRMhloPassesExt(); registerByteIRSCFPasses(); registerByteIRShapePasses(); + registerByteIRTensorPasses(); registerByteIRTransformPasses(); registerByteIRVectorPasses(); diff --git a/external/patches/AITemplate/logging.patch b/external/patches/AITemplate/logging.patch new file mode 100644 index 000000000..a5dec1b54 --- /dev/null +++ b/external/patches/AITemplate/logging.patch @@ -0,0 +1,17 @@ +diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp +index 5548a97..920ed60 100644 +--- a/static/csrc/model_container.cpp ++++ b/static/csrc/model_container.cpp +@@ -80,9 +80,9 @@ ModelContainer::ModelContainer( + useDebugLogging = true; + } + } +- LOG(INFO) +- << (useDebugLogging ? PrintDebugDeviceProperties(prop) +- : PrintInfoDeviceProperties(prop)); ++ //LOG(INFO) ++ // << (useDebugLogging ? 
PrintDebugDeviceProperties(prop) ++ // : PrintInfoDeviceProperties(prop)); + + LOG(INFO) << "Init AITemplate Runtime with " << num_models << " concurrency"; + models_.reserve(num_models); diff --git a/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp b/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp index dfda80cc2..a8918c43c 100644 --- a/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp +++ b/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp @@ -188,6 +188,7 @@ Value createL2NormWithoutEps(PatternRewriter &rewriter, Location loc, mhlo::CustomCallSchedule::NONE, nullptr, nullptr, rewriter.getArrayAttr(llvm::ArrayRef{})); DictionaryAttrWrapper attrs(rewriter.getContext()); + attrs.setAttr("epsilon", rewriter.getF64FloatAttr(0.0)); attrs.setAttr("axis", rewriter.getI64ArrayAttr({axis})); customCallOp->setAttr(BYTEIR_ATTRS, getCleanAttr(attrs)); diff --git a/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir b/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir index d0d986dc9..0e87cd743 100644 --- a/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir +++ b/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir @@ -175,7 +175,7 @@ func.func @test_l2_norm_pat2(%1146: tensor<12x128xf32>) -> tensor<12x128xf32> { return %1148 : tensor<12x128xf32> // CHECK-LABEL: @test_l2_norm_pat2 // CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<12x128xf32>) -> tensor<12x128xf32> { -// CHECK-NEXT: [[VAR_0_:%.+]] = mhlo.custom_call @byteir.l2_norm(%arg0) {backend_config = "", byteir_attrs = {axis = [1]}} : (tensor<12x128xf32>) -> tensor<12x128xf32> +// CHECK-NEXT: [[VAR_0_:%.+]] = mhlo.custom_call @byteir.l2_norm(%arg0) {backend_config = "", byteir_attrs = {axis = [1], epsilon = 0.000000e+00 : f64}} : (tensor<12x128xf32>) -> tensor<12x128xf32> // CHECK-NEXT: return [[VAR_0_]] : tensor<12x128xf32> 
} diff --git a/frontends/torch-frontend/examples/demo/README.md b/frontends/torch-frontend/examples/demo/README.md new file mode 100644 index 000000000..65ce80a6a --- /dev/null +++ b/frontends/torch-frontend/examples/demo/README.md @@ -0,0 +1,16 @@ +# ByteIR GPU Compiler for LLM on Torch 2.0 + +### Steps to run +1. Build docker image with [Dockerfile](../../../../docker/Dockerfile). +2. Download ByteIR release and unzip it. +3. Install ByteIR components: + * python3 -m pip install -r ByteIR/requirements.txt + * python3 -m pip install ByteIR/*.whl +4. Run training demo: + * python3 main.py \ <--flash> + * **model-name:** ["gpt2", "bloom-560m", "llama", "opt-1.3b", "nanogpt"] + * **--flash:** means enable flash attention +5. Run inference demo: + * python3 main.py \ --infer <--flash> + * **model-name:** ["llama"] + * **--flash:** means enable flash attention diff --git a/frontends/torch-frontend/examples/demo/backend.py b/frontends/torch-frontend/examples/demo/backend.py new file mode 100644 index 000000000..2e67b84d4 --- /dev/null +++ b/frontends/torch-frontend/examples/demo/backend.py @@ -0,0 +1,195 @@ +import os +import torch +import functools + +from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode +from torch.fx.passes.fake_tensor_prop import FakeTensorProp + +import brt +import byteir + +from torch_frontend import compile +from torch_frontend import list_decomposed_ops, preprocess_fx_graph, fx_replace_attn_pattern, replace_flash_attn, get_none_indices + + +TRACE = False + +cnt = 0 +MODEL_NAME = '' +FLASH = False + + +from functorch.compile import aot_module +from torch._decomp import get_decompositions + +from torch.cuda.memory import caching_allocator_alloc, caching_allocator_delete + +class ByteIRInferenceFunction: + def __init__(self, module_path): + self._session = brt.Session(alloc_func=caching_allocator_alloc, + free_func=caching_allocator_delete) + self._session.load(module_path) + self._req = self._session.new_request_context( + 
torch.cuda.current_stream()._as_parameter_.value) + + def __call__(self, *inputs): + device = inputs[0].device + from brt.utils import brt_dtype_to_torch_dtype + results = [torch.empty(self._session.get_static_shape(offset), + dtype=brt_dtype_to_torch_dtype(self._session.get_data_type(offset)), + device=device) for offset in self._session.get_output_arg_offsets()] + + for offset, input in zip(self._session.get_input_arg_offsets(), inputs): + self._req.bind_arg(offset, input.data_ptr()) + for offset, output in zip(self._session.get_output_arg_offsets(), results): + self._req.bind_arg(offset, output.data_ptr()) + self._req.finish_io_binding() + self._req.run() + self._req.sync() + return results + +class ByteIRFunction: + def __init__(self, module_path, output_shapes, output_dtypes, none_indices): + self._session = brt.Session( + alloc_func=caching_allocator_alloc, + free_func=caching_allocator_delete) + self._session.load(module_path) + self._output_shapes = output_shapes + self._output_dtypes = output_dtypes + self._req = self._session.new_request_context( + torch.cuda.current_stream()._as_parameter_.value) + self._none_indices = none_indices + + def __call__(self, *inputs): + if TRACE: + for i in range(len(inputs)): + input = inputs[i] + print("In ByteIRFunction, Inputs["+str(i)+"]", input) + + device = inputs[0].device + rets = [torch.empty(shape, dtype=dtype, device=device) + for shape, dtype in zip(self._output_shapes, self._output_dtypes)] + for offset, arg in zip(self._session.get_input_arg_offsets(), inputs): + assert list(self._session.get_static_shape(offset)) == list(arg.shape) + for offset, ret in zip(self._session.get_output_arg_offsets(), rets): + assert list(self._session.get_static_shape(offset)) == list(ret.shape) + + for i, tensor in zip(self._session.get_input_arg_offsets(), inputs): + self._req.bind_arg(i, tensor.data_ptr()) + for i, tensor in zip(self._session.get_output_arg_offsets(), rets): + self._req.bind_arg(i, tensor.data_ptr()) + 
self._req.finish_io_binding() + self._req.run() + self._req.sync() + + if TRACE: + for i in range(len(rets)): + r = rets[i] + print("In ByteIRFunction, Outputs["+str(i)+"]", r) + + # add None results to return values + results = [] + none_ptr = 0 + ret_ptr = 0 + for i in range(len(rets) + len(self._none_indices)): + if none_ptr < len(self._none_indices) and i == self._none_indices[none_ptr]: + results.append(None) + none_ptr += 1 + else: + results.append(rets[ret_ptr]) + ret_ptr += 1 + return results + +def byteir_compile_fx_inner(graph: torch.fx.GraphModule, inputs, is_backward, ban_lst=[]): + category = 'backward' if is_backward else 'forward' + + print("\n\n============") + print(f"{category} Part") + print("============\n\n") + none_indices = get_none_indices(graph) + fx_graph = preprocess_fx_graph(graph) + + compile_type = 'mhlo' + backend_legal_ops = [ + "aten._softmax", + "aten.softmax.int", + "aten.log_softmax.int", + "aten._log_softmax", + # "aten.native_layer_norm", + # "aten.layer_norm", + "aten.gelu", + "aten.argmax", + "aten.max.dim", + "aten.one_hot", + "aten.topk", + "byteir.flash_attn_fwd", + "byteir.flash_attn_bwd", + ] + with maybe_disable_fake_tensor_mode(): + compiled_graph = compile(fx_graph, inputs, compile_type, backend_legal_ops=backend_legal_ops) + + model_name = MODEL_NAME + global cnt + TEMP_FOLDER="./temp" + os.makedirs(TEMP_FOLDER, exist_ok=True) + os.makedirs(TEMP_FOLDER + f"/{model_name}_{category}", exist_ok=True) + mlir_file_name = f'{TEMP_FOLDER}/{model_name}_{category}_{cnt}.{compile_type}.mlir' + output_mlir_file_name = f'{TEMP_FOLDER}/{model_name}_{category}/{model_name}_{category}.rt.mlir' + cnt = cnt + 1 + with open(mlir_file_name, "w+") as fout: + compiled_graph.operation.print(file=fout, + large_elements_limit=None) + + with maybe_disable_fake_tensor_mode(): + byteir.compile(mlir_file_name, output_mlir_file_name, entry_func='forward', target='cuda_with_ait') + + outputs = FakeTensorProp(graph).propagate(*inputs) + 
mhlo_ret_dtypes = [t.dtype for t in outputs] + mhlo_ret_shapes = [t.shape for t in outputs] + + print(output_mlir_file_name) + return ByteIRFunction(output_mlir_file_name, mhlo_ret_shapes, mhlo_ret_dtypes, none_indices) + + +from torch._inductor.virtualized import V +from torch._dynamo.utils import detect_fake_mode +from torch._dynamo.backends.common import aot_autograd +from torch._inductor.fx_passes.joint_graph import joint_graph_passes + + +def fuse_aware_byteir_compile_fx(model_: torch.fx.GraphModule, example_inputs_): + from partitioners import fuse_aware_min_cut_partition + # TODO: can add logging before/after the call to create_aot_dispatcher_function + # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func + # once torchdynamo is merged into pytorch + fake_mode = detect_fake_mode(example_inputs_) or torch._subclasses.FakeTensorMode( + allow_non_fake_inputs=True + ) + tracing_context = ( + torch._guards.TracingContext.get() or torch._guards.TracingContext(fake_mode) + ) + decompose_list = list_decomposed_ops() + decompositions = get_decompositions(decompose_list) + + def partition_fn(graph, joint_inputs, **kwargs): + joint_graph_passes(graph) + return fuse_aware_min_cut_partition( + graph, joint_inputs, **kwargs, compiler="inductor" + ) + + if FLASH: + # preprocess flash attention + # replace attention pattern to scaled_dot_product_attention + model_ = fx_replace_attn_pattern(model_) + # replace scaled_dot_product_attention to byteir.flash_attn + model_ = replace_flash_attn(model_) + + with V.set_fake_mode(fake_mode), torch._guards.tracing(tracing_context): + return aot_autograd( + fw_compiler=functools.partial(byteir_compile_fx_inner, is_backward=False), + bw_compiler=functools.partial(byteir_compile_fx_inner, is_backward=True), + inference_compiler=functools.partial(byteir_compile_fx_inner, is_backward=False), + decompositions=decompositions, + partition_fn=partition_fn, + keep_inference_input_mutations=True, + 
)(model_, example_inputs_) diff --git a/frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py b/frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py new file mode 100644 index 000000000..1629f39f5 --- /dev/null +++ b/frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py @@ -0,0 +1,194 @@ +import torch +import torch.fx as fx + +from compile_utils import get_aten_target +from fx_match_utils import get_node_consumer, match_chain + +byteir_fusible_patterns = {} +aten = torch.ops.aten + +def register_byteir_pattern(name): + def register(pattern): + if name in byteir_fusible_patterns.keys(): + raise ValueError("Pattern " + name + " has already been registerd.") + byteir_fusible_patterns[name] = pattern + return pattern + return register + + +class ByteIRFusiblePattern: + + @classmethod + def match(cls, node, required_fw_nodes) -> bool: + raise NotImplementedError + + @classmethod + def get_pattern_recompute_nodes(cls, node, required_fw_nodes): + raise NotImplementedError + +@register_byteir_pattern("transpose_dot") +class TransposeDotPattern(ByteIRFusiblePattern): + + @classmethod + def match(cls, node, required_fw_nodes) -> bool: + post_fusible_ops = [aten.mm, aten.bmm] + if get_aten_target(node) in [aten.t, aten.transpose]: + can_fuse = all(get_aten_target(user) in post_fusible_ops for user in node.users) + all_fw_node = all(user in required_fw_nodes for user in node.users) + return (not all_fw_node) and can_fuse + return False + + @classmethod + def get_pattern_recompute_nodes(cls, node, required_fw_nodes): + if cls.match(node, required_fw_nodes): + return [node] + return [] + + +@register_byteir_pattern("transpose_reshape_transpose_dot") +class TransposeReshapeTransposeDotPattern(ByteIRFusiblePattern): + + @classmethod + def match(cls, node, required_fw_nodes) -> bool: + post_fusible_ops = [aten.mm, aten.bmm, aten.transpose] + if get_aten_target(node) not in [aten.transpose]: + return False + if match_chain(node, 
target_chain=(aten.transpose, aten.expand, aten.clone, aten._unsafe_view)): + expand_node = get_node_consumer(node, 0) + clone_node = get_node_consumer(expand_node, 0) + view_node = get_node_consumer(clone_node, 0) + all_fw_node = all(user in required_fw_nodes for user in view_node.users) + can_fuse = all(get_aten_target(user) in post_fusible_ops for user in view_node.users) + return (not all_fw_node) and can_fuse + return False + + + @classmethod + def get_pattern_recompute_nodes(cls, node, required_fw_nodes): + if cls.match(node, required_fw_nodes): + expand_node = get_node_consumer(node, 0) + clone_node = get_node_consumer(expand_node, 0) + view_node = get_node_consumer(clone_node, 0) + recompute_nodes = [node, expand_node, clone_node, view_node] + for user in view_node.users: + if user not in required_fw_nodes: + recompute_nodes.append(user) + return recompute_nodes + return [] + +@register_byteir_pattern("transpose_transpose") +class TransposeTransposePattern(ByteIRFusiblePattern): + + @classmethod + def match(cls, node, required_fw_nodes) -> bool: + if get_aten_target(node) in [aten.t, aten.transpose]: + for user in node.users: + if get_aten_target(user) in [aten.t, aten.transpose]: + all_fw_node = all(n in required_fw_nodes for n in user.users) + if not all_fw_node: + return True + return False + + + @classmethod + def get_pattern_recompute_nodes(cls, node, required_fw_nodes): + if cls.match(node, required_fw_nodes): + recompute_nodes = [node] + for user in node.users: + if get_aten_target(user) == aten.t: + recompute_nodes.append(user) + return recompute_nodes + return [] + + +@register_byteir_pattern("full_bitwise_not_expand") +class FullBitwiseNotExpandPattern(ByteIRFusiblePattern): + + @classmethod + def match(cls, node, required_fw_nodes) -> bool: + if match_chain(node, target_chain=(aten.full, aten.bitwise_not, aten.expand)): + return True + return False + + + @classmethod + def get_pattern_recompute_nodes(cls, node, required_fw_nodes): + if 
cls.match(node, required_fw_nodes): + bitwise_node = get_node_consumer(node, 0) + expand_node = get_node_consumer(bitwise_node, 0) + recompute_nodes = [node, bitwise_node, expand_node] + return recompute_nodes + return [] + + +# Note: This pattern is temporary. +# It is only used to fix issue that full op(dtype is bool) is not supported in byteir. +@register_byteir_pattern("copy_bitwise_not_expand") +class CopyBitwiseNotExpandPattern(ByteIRFusiblePattern): + + @classmethod + def match(cls, node, required_fw_nodes) -> bool: + if match_chain(node, target_chain=(aten._to_copy, aten.bitwise_not, aten.expand, aten.bitwise_or)): + bitwise_not_node = get_node_consumer(node, 0) + expand_node = get_node_consumer(bitwise_not_node, 0) + bitwise_or_node = get_node_consumer(expand_node, 0) + return True + return False + + + @classmethod + def get_pattern_recompute_nodes(cls, node, required_fw_nodes): + if cls.match(node, required_fw_nodes): + bitwise_not = get_node_consumer(node, 0) + expand = get_node_consumer(bitwise_not, 0) + bitwise_or = get_node_consumer(expand, 0) + recompute_nodes = [node, bitwise_not, expand, bitwise_or] + return recompute_nodes + return [] + + +def greedy_transpose_fusion(joint_graph, required_fw_nodes): + recompute_nodes = [] + post_fuse_ops = [aten.bmm, aten.mm] + transparent_ops = [aten.clone, aten._to_copy, aten.expand] + view_ops = [aten.view, aten._unsafe_view] + transpose_ops = [aten.t, aten.transpose] + fusible_tag = {} + + INIT_TAG = 0 + POST_FUSION_TAG = 1 + TRANSPOSE_TAG = 2 + + + for node in reversed(joint_graph.nodes): + fusible_tag[node] = INIT_TAG + + for node in reversed(joint_graph.nodes): + if get_aten_target(node) in post_fuse_ops and node not in required_fw_nodes: + fusible_tag[node] = POST_FUSION_TAG + + if get_aten_target(node) in transparent_ops: + for user in node.users: + if user in fusible_tag.keys() and fusible_tag[user] >= POST_FUSION_TAG: + fusible_tag[node] = POST_FUSION_TAG + recompute_nodes.append(node) + + if 
get_aten_target(node) in transpose_ops: + for user in node.users: + if user in fusible_tag.keys() and fusible_tag[user] >= POST_FUSION_TAG: + recompute_nodes.append(node) + fusible_tag[node] = INIT_TAG + + return recompute_nodes + + +def get_byteir_recompute_nodes(joint_graph, required_fw_nodes): + recompute_nodes = [] + recompute_nodes.extend(greedy_transpose_fusion(joint_graph, required_fw_nodes)) + for name, pattern in byteir_fusible_patterns.items(): + for node in joint_graph.nodes: + if node.op == 'output': + continue + recompute_nodes.extend(pattern.get_pattern_recompute_nodes(node, required_fw_nodes)) + recompute_nodes = list(set(recompute_nodes)) + return recompute_nodes diff --git a/frontends/torch-frontend/examples/demo/compile_utils.py b/frontends/torch-frontend/examples/demo/compile_utils.py new file mode 100644 index 000000000..e08df059e --- /dev/null +++ b/frontends/torch-frontend/examples/demo/compile_utils.py @@ -0,0 +1,92 @@ + +import torch +import torch.fx as fx +from torch.utils._pytree import tree_flatten + +aten = torch.ops.aten + + +def get_aten_target(node): + if hasattr(node.target, 'overloadpacket'): + return node.target.overloadpacket + return node.target + + +rand_ops = [aten.dropout, aten._fused_dropout, aten._standard_gamma, + aten.bernoulli, aten.multinomial, aten.native_dropout, + aten.normal, aten.poisson, aten.binomial, aten.rrelu, + aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm] + + +# return a new copy of torch.fx.graph.Graph with CSE applied to the input graph +def fx_graph_cse(fx_g: torch.fx.graph.Graph): + new_graph = fx.Graph() + env = {} # map from node in the old graph to node in the new graph + hash_env = {} # map from hash to a node in the new graph + token_map = {} # map from hash to token + for n in fx_g.nodes: + # The placeholder, output, and get_attr nodes are copied to the new grpah without change + # do not CSE away random operations + if n.op == 'placeholder' or n.op == 'output' or n.op == 
'get_attr' or get_aten_target(n) in rand_ops: + new_node = new_graph.node_copy(n, lambda x: env[x]) + env[n] = new_node + else: # n.op == 'call_function', should never see n.op == 'call_module' or 'call_method' + # substitute args and kwargs members to their mapping in env if exists + # specs can be used to reconstruct nested list/dictionaries + def substitute(arg_list): + arg_list, spec = tree_flatten(arg_list) + for i in range(len(arg_list)): + v = arg_list[i] + if isinstance(v, torch.fx.node.Node) and v in env: + arg_list[i] = env[v] + if isinstance(v, (torch.SymBool, torch.SymInt, torch.SymFloat)): + arg_list[i] = v.node + return tuple(arg_list), spec + args, args_spec = substitute(n.args) + kwargs, kwargs_spec = substitute(n.kwargs) + + # each token corresponds to a unique node + # nodes with the same token can be substituted + token = {"target": n.target, "args": args, "args_spec": args_spec, + "kwargs": kwargs, "kwargs_spec": kwargs_spec} + + # hash substituted args to a number, do not hash specs because specs are not hashable + hash_arg = hash((args, kwargs)) + hash_val = (n.target, hash_arg) + + # check if a node has a substitute and can be eliminated + hash_val_in_hash_env = hash_val in hash_env + if hash_val_in_hash_env and token_map[hash_val] == token: + env[n] = hash_env[hash_val] + continue + + new_node = new_graph.node_copy(n, lambda x: env[x]) + env[n] = new_node + if not hash_val_in_hash_env: + hash_env[hash_val] = new_node + token_map[hash_val] = token + + return new_graph + + +def strip_overloads(gm): + """ + Modifies the target of graph nodes in :attr:`gm` to strip overloads. 
+ + Args: + gm(fx.GraphModule): The input Fx graph module to be modified + """ + for node in gm.graph.nodes: + if isinstance(node.target, torch._ops.OpOverload): + node.target = node.target.overloadpacket + gm.recompile() + + +def get_placeholders(graph): + return list(filter(lambda x: x.op == 'placeholder', graph.nodes)) + +def get_outputs(graph): + for node in graph.nodes: + if node.op == 'output': + return tree_flatten(node.args[0])[0] + raise AssertionError("No output node found") diff --git a/frontends/torch-frontend/examples/demo/config.py b/frontends/torch-frontend/examples/demo/config.py new file mode 100644 index 000000000..811bce8cc --- /dev/null +++ b/frontends/torch-frontend/examples/demo/config.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Global flags for aot autograd +""" +import os +import sys + +# Converts torch rng ops to their functional philox rng equivalents. Note that +# we functionalize only CUDA rng ops today. +functionalize_rng_ops = False + +# can be useful for debugging if we are incorrectly creating meta fake tensors +fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", True) + +# Enables optional asserts in hotpath code to check for errors. If +# you are seeing weird accuracy problems, try turning this on. +# This is currently off by default as it will harm tracing time, +# but it is on by default for aot_eager. +debug_assert = False + +debug_partitioner = os.environ.get("AOT_PARTITIONER_DEBUG", False) + +static_weight_shapes = True + +# Applies CSE to the graph before partitioning +cse = True + +# Restricts the amount of computation AOTAutograd can do. 
+max_dist_from_bw = 3 + diff --git a/frontends/torch-frontend/examples/demo/fx_match_utils.py b/frontends/torch-frontend/examples/demo/fx_match_utils.py new file mode 100644 index 000000000..d7a209c76 --- /dev/null +++ b/frontends/torch-frontend/examples/demo/fx_match_utils.py @@ -0,0 +1,40 @@ +import torch +from compile_utils import get_aten_target + +aten = torch.ops.aten + +def is_used_by_specific_consumer(node, consumer_type=None): + if consumer_type == None: + return True + + all_users = list(node.users) + if len(all_users) != 1: + return False + consumer = all_users[0] + if not isinstance(consumer_type, (list, tuple)): + consumer_type = [consumer_type] + if get_aten_target(consumer) not in consumer_type: + return False + return True + + +def get_node_consumer(node, index): + all_users = list(node.users) + return all_users[index] + + +def match_chain(node, target_chain): + if len(target_chain) == 1: + return get_aten_target(node) in target_chain + + if len(list(node.users)) != 1: + return False + + specific_types = target_chain[0] + + if not isinstance(specific_types, (list, tuple)): + specific_types = [specific_types] + + if get_aten_target(node) in specific_types: + return match_chain(get_node_consumer(node, 0), target_chain[1:]) + return False diff --git a/frontends/torch-frontend/examples/demo/main.py b/frontends/torch-frontend/examples/demo/main.py new file mode 100644 index 000000000..c913ceb5c --- /dev/null +++ b/frontends/torch-frontend/examples/demo/main.py @@ -0,0 +1,220 @@ +from torch import nn +import torch +import transformers +import sys +import os +import functools +import torch._dynamo +import torch.nn.functional as F + +import transformers +import argparse + +MODEL_LIST = ["gpt2", "bloom-560m", "llama", "opt-1.3b", "nanogpt"] + +class InferLLAMAModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.config = transformers.LlamaConfig(num_hidden_layers=4, return_dict=False) + self.model = 
def make_model(model_name):
    """Instantiate the training model selected by ``model_name``.

    Each supported name builds a config object and constructs the matching
    model class from that config.

    Args:
        model_name: One of ``'llama'``, ``'opt-1.3b'``, ``'bloom-560m'``,
            ``'gpt2'`` or ``'nanogpt'``.

    Returns:
        The constructed model.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names. An
            explicit raise is used because an ``assert`` is removed under
            ``python -O``, which would let execution fall through to an
            unbound ``model``.
    """
    if model_name == 'llama':
        config = transformers.LlamaConfig(num_hidden_layers=4)
        model = transformers.LlamaForCausalLM(config=config)
    elif model_name == 'opt-1.3b':
        config = transformers.AutoConfig.from_pretrained("facebook/opt-1.3b")
        config.tie_word_embeddings = False
        model = transformers.OPTForCausalLM(config=config)
    elif model_name == 'bloom-560m':
        config = transformers.BloomConfig.from_pretrained('bigscience/bloom-560m')
        config.tie_word_embeddings = False
        model = transformers.BloomForCausalLM(config=config)
    elif model_name == 'gpt2':
        config = transformers.GPT2Config.from_pretrained('gpt2')
        # One label per vocabulary entry for the token-classification head.
        config.num_labels = config.vocab_size
        model = transformers.GPT2ForTokenClassification(config=config)
    elif model_name == 'nanogpt':
        # Imported lazily so the other models do not need this local package.
        from my_transformers.modeling_nanogpt import GPTConfig, GPT
        config_args = dict(n_layer=12, n_head=12, n_embd=768)
        config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints
        config_args['bias'] = True  # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        config_args['dropout'] = 0.
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
    else:
        raise ValueError(f"unsupported model name: {model_name!r}")
    return model
f"/{args.model_name}_inference_f16", exist_ok=True) + jit_file_name = TEMP_FOLDER + f"/{args.model_name}_inference.f16.jit" + mhlo_file_name = TEMP_FOLDER + f"/{args.model_name}_inference.f16.mhlo.mlir" + byre_file_name = TEMP_FOLDER + f"/{args.model_name}_inference_f16/{args.model_name}.rt.mlir" + + with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16): + # if not os.path.exists(jit_file_name): + # module = torch.jit.trace(model, trace_data, check_trace=False) + # torch.jit.save(module, jit_file_name) + # print("save jit to {}".format(jit_file_name)) + + if not os.path.exists(mhlo_file_name): + # module = torch.jit.load(jit_file_name) + if args.flash: + from torch.fx.experimental.proxy_tensor import make_fx + from torch_frontend import preprocess_fx_graph + module = make_fx(model)(*trace_data) + print("torch inputs:") + print(trace_data) + print("torch outputs:") + print(module(*trace_data)) + module = preprocess_fx_graph(module) + else: + module = torch.jit.trace(model, trace_data, check_trace=False) + print("torch inputs:") + print(trace_data) + print("torch outputs:") + print(module(*trace_data)) + import torch_frontend + mhlo_model = torch_frontend.compile(module, trace_data, "mhlo") + with open(mhlo_file_name, "w") as f: + print(mhlo_model.operation.get_asm(), file=f) + print("save mhlo to {}".format(mhlo_file_name)) + + if not os.path.exists(byre_file_name): + import byteir + print("begin byteir compile") + byteir.compile(mhlo_file_name, byre_file_name, entry_func='forward', target='cuda_with_ait', disable_byteir_cache=False, verbose=False) + print("byteir compile to {}".format(byre_file_name)) + + from backend import ByteIRInferenceFunction + runner = ByteIRInferenceFunction(byre_file_name) + print("byre inputs:") + print(trace_data) + print("byre outputs:") + print(runner(*trace_data)) + +def train_model(args): + torch._dynamo.reset() + torch._dynamo.disallow_in_graph(F.cross_entropy) + + model_name = args.model_name + 
if __name__ == "__main__":
    # Command-line entry point: pick a model, then run inference or one
    # training step depending on --infer.
    parser = argparse.ArgumentParser()
    # Let argparse reject unknown model names with a usage message; an
    # ``assert`` would be skipped under ``python -O`` and gives no hint about
    # the accepted values.
    parser.add_argument("model_name", choices=MODEL_LIST)
    parser.add_argument("--flash", action="store_true", help="use flash attention when possible")
    parser.add_argument("--infer", action="store_true", help="infer mode")
    parser.add_argument("--device_id", type=int, default=0)
    args = parser.parse_args()

    if args.infer:
        infer_model(args)
    else:
        train_model(args)
def must_recompute(node):
    """Return the node's ``"recompute"`` meta entry, or ``False`` if it has none."""
    return node.meta.get("recompute", False)


def has_recomputable_ops(fx_g):
    """Return ``True`` if any node of ``fx_g.graph`` has a truthy recompute tag."""
    return any(must_recompute(node) for node in fx_g.graph.nodes)


def has_recomputable_rng_ops(fx_g):
    """Return ``True`` if any recompute-tagged node is a seeded random op.

    A node qualifies when its recompute tag is truthy, its target exposes a
    ``tags`` attribute, and ``torch.Tag.nondeterministic_seeded`` is among
    those tags.
    """
    return any(
        must_recompute(node)
        and hasattr(node.target, "tags")
        and torch.Tag.nondeterministic_seeded in node.target.tags
        for node in fx_g.graph.nodes
    )
+ """ + new_graph = fx.Graph() + env = {} + + # Add new placeholder nodes in the order specified by the inputs + for node in inputs: + new_node = new_graph.placeholder(node.name) + # Can't use node_copy here as we may be turning previous call_function into placeholders + new_node.meta = node.meta + env[node] = new_node + + for node in joint_graph.nodes: + if node in inputs: + continue + elif node.op == 'placeholder': + env[node] = InvalidNode + elif node.op == 'call_function': + all_args = pytree.tree_flatten((node.args, node.kwargs))[0] + all_args = [isinstance(env[x], InvalidNodeBase) for x in all_args if isinstance(x, fx.Node)] + if any(all_args): + env[node] = InvalidNode + continue + env[node] = new_graph.node_copy(node, lambda x: env[x]) + elif node.op == 'get_attr': + env[node] = new_graph.node_copy(node, lambda x: env[x]) + elif node.op == 'output': + pass + output_values = [] + for x in outputs: + if isinstance(x, fx.Node): + if x not in env: + raise RuntimeError(f"Node {x} couldn't be found in env") + assert not isinstance(env[x], InvalidNodeBase), f"Node {x} was invalid, but is output" + output_values.append(env[x]) + else: + output_values.append(x) + new_graph.output(output_values) + + new_graph.eliminate_dead_code() + new_graph.lint() + return new_graph + + +def _is_primal(node): + return ( + node.op == "placeholder" + and "tangents" not in node.target + and not _is_bwd_seed_offset(node) + and not _is_fwd_seed_offset(node) + ) + +def _is_tangent(node): + return node.op == "placeholder" and "tangents" in node.target + +def _is_bwd_seed_offset(node): + return node.op == "placeholder" and ("bwd_seed" in node.target or "bwd_base_offset" in node.target) + +def _is_fwd_seed_offset(node): + return node.op == "placeholder" and ("fwd_seed" in node.target or "fwd_base_offset" in node.target) + + +def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule, *, num_fwd_outputs): + outputs = pytree.tree_flatten([node.args for node in joint_module.graph.nodes if 
def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values, saved_sym_nodes, *, num_fwd_outputs):
    """Split ``joint_module`` into a forward and a backward ``fx.GraphModule``.

    The forward graph takes the primal and forward seed/offset placeholders and
    returns the forward outputs followed by ``saved_values`` and
    ``saved_sym_nodes``. The backward graph takes ``saved_sym_nodes``,
    ``saved_values``, the tangent placeholders and the backward seed/offset
    placeholders, and returns the backward outputs.

    Args:
        joint_module: Joint forward/backward graph module.
        saved_values: Nodes handed from forward to backward. Mutated in place:
            entries whose backward placeholder has no users are removed.
        saved_sym_nodes: Symbolic nodes handed from forward to backward.
            Mutated in place: unused entries are removed, then the list is
            rebuilt with symbol-binding nodes first and derived nodes after.
        num_fwd_outputs: Number of leading joint outputs that belong to the
            forward pass.

    Returns:
        A ``(fwd_module, bwd_module)`` pair.
    """
    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
    primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
    tangent_inputs = list(filter(_is_tangent, joint_module.graph.nodes))
    fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes))
    bwd_seed_offset_inputs = list(filter(_is_bwd_seed_offset, joint_module.graph.nodes))

    # Construct the forward module
    # Keep symints separate from tensors, passed between fwd/bwd graphs, and in the right order.
    # NOTE(review): this first fwd_graph is reassigned below before it is read.
    fwd_graph = _extract_graph_with_inputs_outputs(
        joint_module.graph,
        primal_inputs + fwd_seed_offset_inputs,
        fwd_outputs + saved_values + saved_sym_nodes
    )
    bwd_graph = _extract_graph_with_inputs_outputs(
        joint_module.graph,
        saved_sym_nodes + saved_values + tangent_inputs + bwd_seed_offset_inputs,
        bwd_outputs
    )

    # This is to filter out saved values that don't actually end up being used by the backwards pass
    for node in bwd_graph.nodes:
        if node.op == 'placeholder' and not node.users:
            # Each inner loop stops right after its removal, so the list is
            # never iterated further once it has been modified.
            for saved_value in saved_values:
                if saved_value.name == node.name:
                    saved_values.remove(saved_value)
                    break

            for saved_sym in saved_sym_nodes:
                if saved_sym.name == node.name:
                    saved_sym_nodes.remove(saved_sym)
                    break

    # Now that we have the finalized list of saved values, we need to ensure
    # we propagate all symbols which are referenced by backwards inputs.
    # These are not directly used in the graph but are required for downstream
    # sizevar assignment
    saved_symbols: set[sympy.Symbol] = set()
    saved_sym_nodes_binding = []
    saved_sym_nodes_derived = []

    # Some symbols may already be bound in the directly saved_sym_nodes,
    # keep track of them so we don't re-bind them
    for node in saved_sym_nodes:
        symbol = is_symbol_binding_fx_node(node)
        if symbol:
            saved_symbols.add(symbol)
            saved_sym_nodes_binding.append(node)
        else:
            saved_sym_nodes_derived.append(node)

    # Now go through all of the prospective backward inputs and track any
    # other symbols we need to bind
    symbol_bindings = find_symbol_binding_fx_nodes(joint_module.graph)
    for node in itertools.chain(saved_sym_nodes_derived, saved_values, tangent_inputs):
        if "val" not in node.meta:
            continue
        new_symbols = free_symbols(node.meta["val"]) - saved_symbols
        # NB: Deterministic order please!
        for s in sorted(new_symbols, key=lambda s: s.name):
            # NB: For well formed graphs, the symbol should always be present,
            # but we also have ways to produce ill-formed graphs, e.g., direct
            # make_fx usages, so don't choke in this case
            if s not in symbol_bindings:
                continue
            saved_sym_nodes_binding.append(symbol_bindings[s])
        saved_symbols |= new_symbols

    # Update saved_sym_nodes that are now reordered to have all bindings at
    # front. This can also be used later on to figure out the position of saved
    # sym nodes in the output of fwd graph.
    saved_sym_nodes.clear()
    saved_sym_nodes.extend(saved_sym_nodes_binding + saved_sym_nodes_derived)

    # Now, we re-generate the fwd/bwd graphs.
    # NB: This might increase compilation time, but I doubt it matters
    fwd_graph = _extract_graph_with_inputs_outputs(
        joint_module.graph,
        primal_inputs + fwd_seed_offset_inputs,
        fwd_outputs + saved_values + saved_sym_nodes
    )
    bwd_graph = _extract_graph_with_inputs_outputs(
        joint_module.graph,
        saved_sym_nodes + saved_values + tangent_inputs + bwd_seed_offset_inputs,
        bwd_outputs
    )

    fwd_module = fx.GraphModule(joint_module, fwd_graph)
    bwd_module = fx.GraphModule(joint_module, bwd_graph)
    return fwd_module, bwd_module
+ """ + if has_recomputable_ops(joint_module): + return min_cut_rematerialization_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs) + primal_inputs = list(filter(_is_primal, joint_module.graph.nodes)) + fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes)) + inputs = primal_inputs + fwd_seed_offset_inputs + fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs) + forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, inputs, fwd_outputs) + forward_node_names = {node.name for node in forward_only_graph.nodes if node.op != 'output'} + saved_values = [] + saved_sym_nodes = [] + + for node in joint_module.graph.nodes: + if node.name not in forward_node_names: + continue + if is_sym_node(node): + # Symints must be kept separate from tensors so that PythonFunction only calls + # save_for_backward on tensors and stashes symints in autograd .ctx + saved_sym_nodes.append(node) + elif ( + 'tensor_meta' not in node.meta + and node.op == 'call_function' + ): + # Since we can't save tuple of tensor values, we need to flatten out what we're saving + users = node.users + assert all(user.target == operator.getitem for user in users) + for user in users: + saved_values.append(user) + else: + backward_usages = [n for n in node.users if n.name not in forward_node_names] + if 'tensor_meta' in node.meta and all(is_sym_node(n) for n in backward_usages): + # If we have a tensor in the forward, where only its sizes/strides are needed in the backward, + # and not the actual tensor data, + # then it will be a lot cheaper to save only the sizes/strides, and not the actual tensor. + # + # Note that saving the tensor could also cause compilation problems: + # If the user mutated an input in the forward and uses its sizes/strides in the backward, + # then we would be obligated to clone the input before saving it to appease autograd. 
+ # (This is how we originally found this bug). + for user in backward_usages: + saved_sym_nodes.append(user) + else: + saved_values.append(node) + saved_values = list({k: None for k in saved_values}.keys()) + saved_sym_nodes = list({k: None for k in saved_sym_nodes}.keys()) + + return _extract_fwd_bwd_modules(joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs) + + +def _prod(x): + s = 1 + for i in x: + s *= i + return s + +def _tensor_nbytes(numel, dtype): + sizes = { + torch.complex64: 8, + torch.complex128: 16, + torch.float16: 2, + torch.bfloat16: 2, + torch.float32: 4, + torch.float64: 8, + torch.int8: 1, + torch.int16: 2, + torch.int32: 4, + torch.int64: 8, + torch.uint8: 1, + torch.bool: 1, + } + if dtype not in sizes: + raise NotImplementedError("Don't know the size of dtype ", dtype) + + return numel * sizes[dtype] + +def _size_of(node: fx.Node) -> int: + if 'val' in node.meta: + val = node.meta['val'] + if isinstance(val, py_sym_types): + if isinstance(val, torch.SymInt): + return 1 + else: + return 999999 + elif isinstance(val, (list, tuple)): + return sum(_tensor_nbytes(hint_int(n.numel()), n.dtype) for n in val if isinstance(n, torch.Tensor)) + elif isinstance(val, torch.Tensor): + return _tensor_nbytes(hint_int(val.numel()), val.dtype) + + raise RuntimeError(f"Unknown metadata type {type(val)}") + + # Only needed since we don't always trace with fake tensors. 
def get_depth(node, depth_map):
    """Compute the depth of ``node`` and memoise it in ``depth_map``.

    Placeholders have depth 0. Any other non-output node has depth
    ``1 + max(depth of its input nodes)``, or 1 when it has no input nodes
    (factory ops like full, rand might not have any input args). For an
    ``output`` node the depths of the fx nodes in ``node.args[0]`` are
    computed, but the output node itself gets no entry.

    The walk uses an explicit stack rather than recursion so that long
    operator chains do not exceed Python's recursion limit.

    Args:
        node: The fx node to measure.
        depth_map: Dict from node to depth; read as a cache and filled in for
            every node visited.

    Returns:
        The depth of ``node``, or ``None`` when ``node`` is an output node.
    """
    if node in depth_map:
        return depth_map[node]

    is_output = node.op == "output"
    if is_output:
        pending = [arg for arg in node.args[0] if isinstance(arg, torch.fx.node.Node)]
    else:
        pending = [node]

    while pending:
        current = pending[-1]
        if current in depth_map:
            pending.pop()
            continue

        # Base case
        if current.op == "placeholder":
            depth_map[current] = 0
            pending.pop()
            continue

        producers = current.all_input_nodes
        unresolved = [producer for producer in producers if producer not in depth_map]
        if unresolved:
            # Resolve the inputs first; ``current`` stays on the stack and is
            # revisited once they all have a depth.
            pending.extend(unresolved)
            continue

        depth_map[current] = max((depth_map[producer] for producer in producers), default=0) + 1
        pending.pop()

    if is_output:
        return None
    return depth_map[node]


def sort_depths(args, depth_map):
    """Return ``(node, depth)`` pairs for the fx nodes in ``args``, deepest first.

    Entries of ``args`` that are not fx nodes are ignored. Every fx node in
    ``args`` must already have an entry in ``depth_map``.
    """
    arg_depths = {arg: depth_map[arg] for arg in args if isinstance(arg, torch.fx.node.Node)}
    return sorted(arg_depths.items(), key=lambda item: item[1], reverse=True)
looking at users of + tangents) and then reorders the graph by walking from this node to all the + way to the end of the graph. At each op in this traveral, we insert this op + in a new graph and try to bring only the relevant subgraph from the other + non-bwd edges relevant for this op. This closely mimics the behavior of + autograd engine. + + Why is this pass required in the first place? + + This is an artifact of how partitioners work today. The starting point of + partitioner is a joint graph, which is fwd and then bwd graph. In the case + of checkpointing, we keep portions of fwd graph in their original place in + the joint graph, while obtaining a bwd graph. As a result, the resulting bwd + graph has copies of recomputed fwd subgraphs followed by the original bwd + graph. If we run this naively, this leads to bad memory footprint, because + the fwd subgraphs are live for way longer duration than necessary. This pass + reorders the operations such that we prioritize the ops for the original bwd + graph while only realizing those ops from the fwd graph that are necessary + at any given point in the graph. + """ + + new_graph = fx.Graph() + env = {} + + # Add new placeholder nodes in the order specified by the inputs + for node in gm.graph.nodes: + if node.op == "placeholder": + new_node = new_graph.placeholder(node.name) + # Can't use node_copy here as we may be turning previous call_function into placeholders + new_node.meta = node.meta + env[node] = new_node + + + order = {} + for idx, node in enumerate(gm.graph.nodes): + order[node] = idx + + # Populate depth for the nodes. Depth is the distance from the inputs. + depths = {} + output_node = [node for node in gm.graph.nodes if node.op == "output"][0] + get_depth(output_node, depths) + + def insert_node_in_graph(node): + if node in env: + return env[node] + + # Bias traversal towards the nodes that have higher depth - prioritizes + # critical path first. 
+ for arg, _ in sort_depths(node.all_input_nodes, depths): + env[arg] = insert_node_in_graph(arg) + env[node] = new_graph.node_copy(node, lambda x: env[x]) + return env[node] + + # Find first bwd node in the graph + tangent_inputs = list(filter(_is_tangent, gm.graph.nodes)) + first_node_in_bwd = None + minimum_order = math.inf + for tangent in tangent_inputs: + for user in tangent.users: + if order[user] < minimum_order: + minimum_order = order[user] + first_node_in_bwd = user + assert first_node_in_bwd is not None + + # Build the graph op-by-op by starting from the node all the way to the end + for node in list(gm.graph.nodes)[order[first_node_in_bwd]:]: + insert_node_in_graph(node) + + # The output node is already built by the traversal. + new_gm = torch.fx.GraphModule(gm, new_graph) + return new_gm + + +def functionalize_rng_ops(joint_module, fw_module, bw_module, num_sym_nodes): + # During user-driven activation checkpointing, we have to ensure that a rng + # op in fwd yields the same output as the recomputed rng op in the bwd. To + # do this, we use functionalize wrappers to wrap the random ops and share + # rng state between the fwd and bwd graphs. + + # There are 3 main steps to do this + # Step 1 - Construct a mapping of rng node between the fwd and its counterpart in bwd. + # Step 2 - Modify the fwd pass such that + # 1) Replace rand with run_and_save_rng_state wrapper + # 2) Replace the users of the original op with the output[1] of this op. + # 3) Collect all the rng_state - output[0] of each op, and make them + # output nodes. Special care needs to be taken here because fwd outputs + # has symints at the very end. 
+ # Step 3 - Modify the bwd pass such that + # 1) Add the input nodes just before the tangents for the stashed rng states + # 2) Replace rand with run_with_save_rng_state wrappers + # 3) Use the stashed states as inputs to these ops + + # Unique id to generate name + uid = itertools.count() + + def get_rng_ops(gmod): + random_nodes = {} + for node in gmod.graph.nodes: + if ( + node.op == "call_function" + and hasattr(node.target, "tags") + and torch.Tag.nondeterministic_seeded in node.target.tags + ): + random_nodes[node.name] = node + return random_nodes + + # Step 1 - Construct a mapping of rng node between the fwd and its counterpart in bwd. + joint_graph_rng_ops = get_rng_ops(joint_module) + fw_graph_rng_ops = get_rng_ops(fw_module) + bw_graph_rng_ops = get_rng_ops(bw_module) + recomputable_rng_ops_map = dict() + for node in joint_module.graph.nodes: + if ( + must_recompute(node) + and hasattr(node.target, "tags") + and torch.Tag.nondeterministic_seeded in node.target.tags + ): + base_node = joint_graph_rng_ops[node.name] + fw_node = fw_graph_rng_ops[node.name] + bw_node = bw_graph_rng_ops[node.name] + recomputable_rng_ops_map[base_node] = {"fwd": fw_node, "bwd": bw_node} + + run_and_save_rng = torch._prims.rng_prims.run_and_save_rng_state + run_with_rng_state = torch._prims.rng_prims.run_with_rng_state + + for node in bw_module.graph.nodes: + if node.op == "placeholder" and "tangent" in node.name: + bw_tangent_start_node = node + break + + fw_rng_state_outputs = [] + for base_node, node_pair in recomputable_rng_ops_map.items(): + # Step 2 - Modify the fwd pass such that + fw_node = node_pair["fwd"] + bw_node = node_pair["bwd"] + fw_graph = fw_module.graph + with fw_graph.inserting_before(fw_node): + functional_fw_node = fw_graph.create_node( + "call_function", + run_and_save_rng, + args=(fw_node.target, *fw_node.args), + kwargs=fw_node.kwargs + ) + state = fw_graph.create_node("call_function", operator.getitem, args=(functional_fw_node, 0), kwargs={}) + 
def cleanup_recompute_tags(joint_module):
    """
    When two checkpointed blocks sit back to back with no operator between
    them, the tensor on their shared boundary should still be stashed. This
    pass clears the recompute tag (sets it to 0) on any tagged node that has a
    tagged user whose tag is greater than its own.

    The module is modified in place and returned.
    """
    for producer in joint_module.graph.nodes:
        own_tag = producer.meta.get("recompute", False)
        if not own_tag:
            continue
        feeds_later_block = any(
            consumer.meta.get("recompute", False) and consumer.meta["recompute"] > own_tag
            for consumer in producer.users
        )
        if feeds_later_block:
            producer.meta["recompute"] = 0
    return joint_module
+ """ + try: + import networkx as nx + except ImportError as e: + raise RuntimeError("Need networkx installed to perform smart recomputation " + "heuristics") from e + + joint_module.graph.eliminate_dead_code() + joint_module.recompile() + + fx_g = joint_module.graph + + # add the CSE pass + if config.cse: + cse_graph = fx_graph_cse(fx_g) + joint_module.graph = cse_graph + full_bw_graph = joint_module.graph + + graph_has_recomputable_ops = has_recomputable_ops(joint_module) + graph_has_recomputable_rng_ops = has_recomputable_rng_ops(joint_module) + if graph_has_recomputable_ops: + joint_module = cleanup_recompute_tags(joint_module) + + name_to_node = {} + for node in joint_module.graph.nodes: + name_to_node[node.name] = node + + def classify_nodes(joint_module): + required_bw_nodes = set() + for node in joint_module.graph.nodes: + if node.op == 'placeholder' and "tangents" in node.target: + required_bw_nodes.add(node) + if node in required_bw_nodes: + for user in node.users: + required_bw_nodes.add(user) + + primal_inputs = list(filter(_is_primal, joint_module.graph.nodes)) + fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes)) + inputs = primal_inputs + fwd_seed_offset_inputs + fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs) + required_bw_nodes.update(o for o in bwd_outputs if o is not None) + forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, inputs, fwd_outputs) + required_fw_nodes = {name_to_node[node.name] for node in forward_only_graph.nodes + if node.op != 'output'} + unclaimed_nodes = {node for node in joint_module.graph.nodes + if node not in required_fw_nodes and node not in required_bw_nodes} + return fwd_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes + + orig_fw_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes = classify_nodes(joint_module) + + # networkx blows up on graphs with no required backward nodes + # Since 
there's nothing to partition anyway, and the default partitioner can "handle" + # this case, send our graph over to the default partitioner. + if len(required_bw_nodes) == 0: + return default_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs) + + for node in reversed(joint_module.graph.nodes): + if node not in required_fw_nodes: + node.dist_from_bw = 0 + else: + node.dist_from_bw = int(1e9) + for user in node.users: + node.dist_from_bw = min(node.dist_from_bw, user.dist_from_bw + 1) + + aten = torch.ops.aten + prims = torch.ops.prims + + # compiler == "nvfuser" is the default set of recomputable ops + default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy] # noqa: E501,B950 + view_ops = [aten.squeeze, aten.unsqueeze, aten.alias] + if compiler == "inductor": + default_recomputable_ops += [prims.div, prims.convert_element_type, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, 
aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.arange, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.cumsum] # noqa: E501,B950 + view_ops += [aten.view, aten.slice, aten.permute, aten.t, aten.transpose, prims.broadcast_in_dim, aten.expand, aten.as_strided] + # Natalia said that we should allow recomputing indexing :) + default_recomputable_ops += [aten.index] + default_recomputable_ops += view_ops + + default_recomputable_ops += pointwise_ops() + + default_recomputable_ops += [ + aten.zeros_like, + ] + + default_recomputable_ops += [ + method_to_operator(m) + for m in magic_methods + ] + + recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops) + + random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like] + compute_intensive_ops = [aten.mm, aten.convolution, aten.convolution_backward, aten.bmm, aten.addmm, aten.upsample_bilinear2d, aten._softmax, aten._softmax_backward_data, aten.native_layer_norm, aten.native_layer_norm_backward, aten.native_batch_norm, aten.native_batch_norm_backward, aten._native_batch_norm_legit] # noqa: E501,B950 + + unrecomputable_ops = random_ops + compute_intensive_ops + + fusible_ops = recomputable_ops | set(random_ops) + + # The node match fusible pattern in byteir, it shoudle be recomputed. 
+ byteir_recompute_nodes = get_byteir_recompute_nodes(full_bw_graph, required_fw_nodes) + + if AOT_PARTITIONER_DEBUG: + joint_module_ops = { + str(node.target._overloadpacket) + for node in joint_module.graph.nodes + if node.op == "call_function" and hasattr(node.target, "_overloadpacket") + } + ops_ignored = joint_module_ops - {str(i) for i in recomputable_ops} + print("Ops banned from rematerialization: ", ops_ignored) + print() + + AGGRESSIVE_RECOMPUTATION = False + + def is_materialized_backwards(node): + cur_nodes = {node} + while len(cur_nodes) > 0: + cur = cur_nodes.pop() + for user in cur.users: + if user not in required_fw_nodes and not is_fusible(cur, user): + return True + if user not in required_fw_nodes and get_aten_target(user) in view_ops: + cur_nodes.add(user) + + return False + + def ban_recomputation(node): + if node in byteir_recompute_nodes: + return False + if "recompute" in node.meta: + return node.meta["recompute"] == 0 + elif AGGRESSIVE_RECOMPUTATION: + return (node.op == 'call_function' and get_aten_target(node) in unrecomputable_ops) + else: + if node.op != 'call_function': + return False + if get_aten_target(node) not in recomputable_ops: + return True + if node.target == operator.getitem: + return False + if node.target in [aten.lift_fresh_copy.default, aten.lift_fresh.default]: + return False + + # If a node *must* be materialized in the backwards pass, then we + # should never recompute it. This is a pretty subtle point. In + # general, the assumption we make is that recomputing a node in the + # backwards pass is "free". However, if a node must be materialized + # in the backwards pass, then recomputing it is never free. + if is_materialized_backwards(node): + return True + + # Arbitrary hack that sometimes seems to help things. The above + # modification appears to have made this heuristic a lot less critical + # for performance. + # TODO: Investigate why this hack helps. 
+ # TODO: Investigate the interaction with compiler assisted + # activation checkpointing. Removing the heuristic improves both + # memory footprint and speedup. + if not graph_has_recomputable_ops: + if compiler == "inductor" and node.dist_from_bw > config.max_dist_from_bw: + return True + # If the output of an op is 4x smaller (arbitrary choice), + # then we don't allow recomputation. + input_tensors_size = sum(_size_of(i) for i in node.args if isinstance(i, fx.Node)) + output_size = _size_of(node) + return (output_size * 4 < input_tensors_size) + + def is_fusible(a, b): + return get_aten_target(a) in fusible_ops and get_aten_target(b) in fusible_ops + + def is_materialized(node): + if node.op == 'placeholder': + return True + + return not all(is_fusible(node, user) for user in node.users) + + def is_byteir_fusible(node): + if get_aten_target(node) in [aten.transpose, aten.t]: + return all(get_aten_target(user) in compute_intensive_ops for user in node.users) + return False + + def get_node_weight(node) -> int: + + mem_sz = _size_of(node) + + # Heuristic to bias towards nodes closer to the backwards pass + # Complete guess about current value + mem_sz = int(mem_sz * (1.1 ** max(min(node.dist_from_bw, 100), 1))) + # mem_sz = int(mem_sz + node.dist_from_bw) + + if is_materialized(node): + return mem_sz + else: + return mem_sz * 2 + + nx_graph = nx.DiGraph() + + for node in full_bw_graph.nodes: + if node.op == 'output': + continue + + if node in required_bw_nodes: + nx_graph.add_edge(node.name + "_in", "sink", capacity=math.inf) + continue + + if _is_primal(node) or _is_fwd_seed_offset(node): + nx_graph.add_edge("source", node.name + "_in", capacity=math.inf) + + # If a node can't be recomputed (too expensive or involves randomness), + # we prevent it from being recomputed by adding an inf edge to the source + # We only need to ban nodes in the fw pass, as those are the only ones that would be recomputed. 
+ if ban_recomputation(node) and node in required_fw_nodes: + nx_graph.add_edge("source", node.name + "_in", capacity=math.inf) + + # Checks if a node is actually a tuple. Can be simplified to just an isisinstance check if we always use faketensors. + is_non_tensor_node = (('val' not in node.meta and 'tensor_meta' not in node.meta) or + ('val' in node.meta and not isinstance(node.meta['val'], torch.Tensor))) + + if is_sym_node(node): + weight = sym_node_size(node) + elif is_non_tensor_node: + weight = math.inf + else: + weight = get_node_weight(node) + + # Creates the weights on the "node" edge + nx_graph.add_edge(node.name + "_in", node.name + "_out", capacity=weight) + for user in node.users: + nx_graph.add_edge(node.name + "_out", user.name + "_in", capacity=math.inf) + + for node in byteir_recompute_nodes: + nx_graph.add_edge(node.name + "_in", "sink", capacity=math.inf) + + try: + cut_value, partition = nx.minimum_cut(nx_graph, "source", "sink") + except Exception: + print('Failed to compute min-cut on following graph:') + print('\n'.join(nx.readwrite.edgelist.generate_edgelist(nx_graph))) + raise + + reachable, non_reachable = partition + cutset = set() + for u, nbrs in ((n, nx_graph[n]) for n in reachable): + cutset.update((u, v) for v in nbrs if v in non_reachable) + + cut_nodes = set() + for node_in, node_out in cutset: + assert node_in[:-3] == node_out[:-4] + node_name = node_in[:-3] + cut_nodes.add(node_name) + + # To make this stuff deterministic + node_idx = {node: idx for idx, node in enumerate(joint_module.graph.nodes)} + saved_values = sorted((name_to_node[node] for node in cut_nodes), key=lambda x: node_idx[x]) + # save_for_backward on tensors and stashes symints in autograd .ctx + saved_sym_nodes = list(filter(lambda n: is_sym_node(n), saved_values)) + saved_values = list(filter(lambda n: not is_sym_node(n), saved_values)) + # NB: saved_sym_nodes will be mutated to reflect the actual saved symbols + fw_module, bw_module = _extract_fwd_bwd_modules( 
+ joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs) + + + if graph_has_recomputable_ops: + if graph_has_recomputable_rng_ops: + fw_module, bw_module = functionalize_rng_ops( + joint_module, fw_module, bw_module, len(saved_sym_nodes) + ) + bw_module = reordering_to_mimic_autograd_engine(bw_module) + + if AOT_PARTITIONER_DEBUG: + print("Theoretical Activations Stored: ", sum([_size_of(i) for i in saved_values]) / 1e9) + fw_module_nodes = {node.name for node in fw_module.graph.nodes if node.op == 'call_function'} + bw_module_nodes = {node.name for node in bw_module.graph.nodes if node.op == 'call_function'} + remat_nodes = fw_module_nodes & bw_module_nodes + + counts = defaultdict(int) + for node in fw_module.graph.nodes: + if node.name in remat_nodes and hasattr(node.target, '_overloadpacket'): + counts[str(node.target._overloadpacket)] += 1 + print(f"# remat/fw/bw: {len(remat_nodes)}/{len(fw_module_nodes)}/{len(bw_module_nodes)}") + print("Count of Ops Rematerialized: ", sorted(counts.items(), key=lambda x: x[1], reverse=True)) + return fw_module, bw_module + + +def draw_graph(traced: torch.fx.GraphModule, fname: str, figname: str = "fx_graph", clear_meta=True): + if clear_meta: + new_graph = copy.deepcopy(traced.graph) + traced = fx.GraphModule(traced, new_graph) + for node in traced.graph.nodes: + node.meta = {} + base, ext = os.path.splitext(fname) + if not ext: + ext = ".svg" + print(f"Writing FX graph to file: {base}{ext}") + g = graph_drawer.FxGraphDrawer(traced, figname) + x = g.get_main_dot_graph() + getattr(x, "write_" + ext.lstrip("."))(f"{base}{ext}") + + +def draw_joint_graph(graph, joint_inputs, file_name="full_graph.png"): + draw_graph(graph, file_name) + return default_partition(graph, joint_inputs) diff --git a/frontends/torch-frontend/third_party/patches/einsum.patch b/frontends/torch-frontend/third_party/patches/einsum.patch index 3b32cd0f5..4f7b913f9 100644 --- 
a/frontends/torch-frontend/third_party/patches/einsum.patch +++ b/frontends/torch-frontend/third_party/patches/einsum.patch @@ -1,8 +1,8 @@ diff --git a/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td b/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td -index 09147dc8..4b69e9cd 100644 +index efdb89fa..c87de79e 100644 --- a/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td +++ b/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td -@@ -7801,6 +7801,31 @@ def Torch_AtenOneHotOp : Torch_Op<"aten.one_hot", [ +@@ -4834,6 +4834,31 @@ def Torch_AtenAddmmOp : Torch_Op<"aten.addmm", [ }]; } @@ -31,27 +31,28 @@ index 09147dc8..4b69e9cd 100644 + }]; +} + - def Torch_AtenBucketizeTensorOp : Torch_Op<"aten.bucketize.Tensor", [ + def Torch_AtenMatmulOp : Torch_Op<"aten.matmul", [ AllowsTypeRefinement, HasValueSemantics, diff --git a/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp b/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp -index 4dafed1d..a3c908bf 100644 +index 558e31c6..fefc337e 100644 --- a/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp +++ b/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp -@@ -5061,6 +5061,311 @@ public: +@@ -5022,6 +5022,460 @@ public: }; } // namespace +namespace { -+// Decompose AtenEinsumOp to AtenMmOp or AtenBmmOp -+// Step 1: split input equation to input/result tokens and find batchingDims and -+// contractingDims for future use -+// Step 2: transpose the input tensors to [batchingDims[0,1,2], -+// otherDims[0,1,2], contractingDims[0,1,2]] -+// Step 3: reshape the input tensors, the final shape should -+// be[batchingDims, otherDims, contractingDims] -+// Step 4: use AtenMatmulOp to get the result, loop util we get the final -+// result ++// Decompose AtenEinsumOp to AtenMatmulOp, and supports possible reduce ++// operation and permute operation. Currently, this pass doesn't support ++// Hadamard product. 
The basic idea is that: ++// Step 1: split the string equation to input/result tokens and find ++// batchingDims, contractingDims, otherDims and reduceDims. ++// Step 2: permute and reshape input tensors suitable ++// for matmul operations. ++// Step 3: use AtenMatmulOp to get the result. ++// Step 4: iteratively execute step 2 & 3 until we get the final result. ++// Step 5: perform remaining permute and reduce operations. +// notice: support static shape only + +static bool parseEquation(const std::string &equation, @@ -85,95 +86,358 @@ index 4dafed1d..a3c908bf 100644 + return true; +} + -+// Prepare Tensor for Matmul Operations, we will transpose the input tensor -+// to make it in order as [batchingDims, otherDims, contractingDims] -+// example: bcwd,bcdh->bcwh -+// Step1 : [b,c,h,d] -+// Step2 : [b*c,h,d] -+// Step3 : [e(=b*c), h, d] -+static Value prepareTensorForMatmulOperations( -+ PatternRewriter &rewriter, Operation *op, Value inputTensor, -+ const SmallVector &shape, const SmallVector &contractingDims, -+ const SmallVector &batchingDims, SmallVector &finalShape, -+ const SmallVector &tokens) { -+ SmallVector otherDims; -+ Value middleDimProduct = -+ rewriter.create(op->getLoc(), rewriter.getI64IntegerAttr(1)); -+ for (size_t i = 0; i < shape.size(); ++i) { -+ if (std::find(batchingDims.begin(), batchingDims.end(), i) == -+ batchingDims.end() && -+ std::find(contractingDims.begin(), contractingDims.end(), i) == -+ contractingDims.end()) { -+ middleDimProduct = -+ rewriter.create(op->getLoc(), middleDimProduct, shape[i]); -+ otherDims.push_back(i); ++// classify every dim token into different categories. Note that although we ++// parse out reduce dims, we delay their execution until ++// `performLastPermuteAndReduce`. 
++static void parseDimTokens( ++ SmallVector &lhsTokens, SmallVector &rhsTokens, ++ SmallVector &finalResultTokens, SmallVector &contractingDims, ++ SmallVector &lhsReduceDims, SmallVector &rhsReduceDims, ++ SmallVector &batchingDims, SmallVector &lhsOtherDims, ++ SmallVector &rhsOtherDims) { ++ llvm::SmallDenseSet lhsTokenSet(lhsTokens.begin(), lhsTokens.end()); ++ llvm::SmallDenseSet rhsTokenSet(rhsTokens.begin(), rhsTokens.end()); ++ llvm::SmallDenseSet finalResultTokenSet(finalResultTokens.begin(), ++ finalResultTokens.end()); ++ ++ for (size_t i = 0; i < lhsTokens.size(); ++i) { ++ bool rhsContains = rhsTokenSet.contains(lhsTokens[i]); ++ bool finalResultConatins = finalResultTokenSet.contains(lhsTokens[i]); ++ // batching dim ++ if (rhsContains && finalResultConatins) { ++ batchingDims.push_back(lhsTokens[i]); ++ // reduce dim of lhs ++ } else if (!rhsContains && !finalResultConatins) { ++ lhsReduceDims.push_back(lhsTokens[i]); ++ // other dim of lhs ++ } else if (finalResultConatins) { ++ lhsOtherDims.push_back(lhsTokens[i]); ++ // contracting dim of lhs ++ } else if (rhsContains) { ++ contractingDims.push_back(lhsTokens[i]); + } + } -+ int64_t otherDimsSize = otherDims.size(); -+ if (!batchingDims.empty()) { -+ int64_t usedOtherDim = 0; -+ Value batchingDimProduct = -+ rewriter.create(op->getLoc(), rewriter.getI64IntegerAttr(1)); -+ int64_t batchingDimsRank = batchingDims.size(); -+ for (int64_t i = 0; i < batchingDimsRank; ++i) { -+ batchingDimProduct = -+ rewriter.create(op->getLoc(), batchingDimProduct, -+ shape[batchingDims[i]]); -+ if (batchingDims[i] != i) { -+ Value batchingDim = -+ rewriter.create(op->getLoc(), -+ rewriter.getI64IntegerAttr( -+ batchingDims[i])); -+ Value indexDim = rewriter.create( -+ op->getLoc(), rewriter.getI64IntegerAttr(otherDims[usedOtherDim])); -+ inputTensor = rewriter.create( -+ op->getLoc(), op->getResultTypes(), inputTensor, batchingDim, indexDim); -+ usedOtherDim += 1; -+ } ++ ++ for (size_t i = 0; i < rhsTokens.size(); 
++i) { ++ bool lhsContains = lhsTokenSet.contains(rhsTokens[i]); ++ bool finalResultConatins = finalResultTokenSet.contains(rhsTokens[i]); ++ // batching dim ++ if (lhsContains && finalResultConatins) { ++ // reduce dim of rhs ++ } else if (!lhsContains && !finalResultConatins) { ++ rhsReduceDims.push_back(rhsTokens[i]); ++ // other dim of rhs ++ } else if (finalResultConatins) { ++ rhsOtherDims.push_back(rhsTokens[i]); ++ // contracting dim of rhs ++ } else if (lhsContains) { ++ } ++ } ++} ++ ++static void generateIdealReusltDimTokens(SmallVector &batchingDims, ++ SmallVector &lhsOtherDims, ++ SmallVector &rhsOtherDims, ++ SmallVector &lhsReduceDims, ++ SmallVector &rhsReduceDims, ++ SmallVector &resultTokens) { ++ // generate ideal result dims, i.e., ++ // [*batchingDims, *lhsOtherDims, *lhsReduceDims, *rhsOtherDims, ++ // *rhsReduceDims] ++ resultTokens.insert(resultTokens.end(), batchingDims.begin(), ++ batchingDims.end()); ++ resultTokens.insert(resultTokens.end(), lhsOtherDims.begin(), ++ lhsOtherDims.end()); ++ resultTokens.insert(resultTokens.end(), lhsReduceDims.begin(), ++ lhsReduceDims.end()); ++ resultTokens.insert(resultTokens.end(), rhsOtherDims.begin(), ++ rhsOtherDims.end()); ++ resultTokens.insert(resultTokens.end(), rhsReduceDims.begin(), ++ rhsReduceDims.end()); ++} ++ ++static Value permuteTensorForMatmul(PatternRewriter &rewriter, Location loc, ++ Value input, SmallVector &dimTokens, ++ SmallVector &batchingDims, ++ SmallVector &contractingDims, ++ SmallVector &otherDims, ++ SmallVector &reduceDims, bool isLhs) { ++ auto inputType = input.getType().cast(); ++ llvm::SmallDenseMap dimTokenMap; ++ for (size_t idx = 0; idx < dimTokens.size(); ++idx) { ++ dimTokenMap[dimTokens[idx]] = idx; ++ } ++ ++ SmallVector permuteVec; ++ auto appendDims = [&](SmallVector dimTokens) { ++ for (auto d : dimTokens) { ++ permuteVec.push_back(rewriter.create( ++ loc, rewriter.getI64IntegerAttr(dimTokenMap[d]))); ++ } ++ }; ++ ++ appendDims(batchingDims); ++ if 
(!isLhs) ++ appendDims(contractingDims); ++ appendDims(otherDims); ++ appendDims(reduceDims); ++ if (isLhs) ++ appendDims(contractingDims); ++ ++ Value dstDims = rewriter.create( ++ loc, Torch::ListType::get(Torch::IntType::get(rewriter.getContext())), ++ permuteVec); ++ auto outType = inputType.getWithSizesAndDtype(std::nullopt, ++ inputType.getOptionalDtype()); ++ return rewriter.create(loc, outType, input, dstDims); ++} ++ ++// [*batchingDims, *lhsOtherDims, *lhsReduceDims, *lhsContractingDims] => ++// [batchingDimsProd, lhsOtherDimsProd, lhsContractingDimsProd] ++static Value collapseDimForMatmul(PatternRewriter &rewriter, Location loc, ++ Value input, int64_t batchDimsLength, ++ int64_t contractingDimsLength, ++ int64_t otherDimsLength, ++ int64_t reduceDimsLength, bool isLhs) { ++ auto inputType = input.getType().cast(); ++ auto inputRank = batchDimsLength + contractingDimsLength + otherDimsLength + ++ reduceDimsLength; ++ SmallVector inputShapeTensor; ++ for (auto i = 0; i < inputRank; ++i) { ++ inputShapeTensor.emplace_back(rewriter.create( ++ loc, input, ++ rewriter.create(loc, ++ rewriter.getI64IntegerAttr(i)))); ++ } ++ ++ SmallVector outShapeTensor; ++ Value constOne = ++ rewriter.create(loc, rewriter.getI64IntegerAttr(1)); ++ auto dimOffset = 0; ++ ++ auto appendDims = [&](int64_t dimLength) { ++ Value prod = constOne; ++ for (auto i = 0; i < dimLength; ++i) { ++ prod = rewriter.create(loc, prod, ++ inputShapeTensor[i + dimOffset]); + } -+ finalShape.push_back(batchingDimProduct); ++ outShapeTensor.emplace_back(prod); ++ dimOffset += dimLength; ++ }; ++ ++ appendDims(batchDimsLength); ++ if (!isLhs) ++ appendDims(contractingDimsLength); ++ appendDims(otherDimsLength + reduceDimsLength); ++ if (isLhs) ++ appendDims(contractingDimsLength); ++ ++ auto outShapeValue = rewriter.create( ++ loc, Torch::ListType::get(Torch::IntType::get(input.getContext())), ++ outShapeTensor); ++ ++ auto outType = inputType.getWithSizesAndDtype(std::nullopt, ++ 
inputType.getOptionalDtype()); ++ return rewriter.create(loc, outType, input, ++ outShapeValue); ++} ++ ++static LogicalResult performMatmul(PatternRewriter &rewriter, Location loc, ++ Value lhs, SmallVector &lhsTokens, ++ Value rhs, SmallVector &rhsTokens, ++ Value &result, ++ SmallVector &resultTokens, ++ SmallVector &finalResultTokens) { ++ auto lhsType = lhs.getType().cast(); ++ auto rhsType = rhs.getType().cast(); ++ Type promotedDType; ++ ++ // promote dtype ++ if (lhsType.hasDtype() && rhsType.hasDtype()) { ++ auto lhsDtype = Torch::getScalarTypeForType(lhsType.getOptionalDtype()); ++ auto rhsDtype = Torch::getScalarTypeForType(rhsType.getOptionalDtype()); ++ auto promotedDTypeInt = ++ torch_upstream::promote_skip_undefined(lhsDtype, rhsDtype); ++ auto promotedDTypeIntValue = rewriter.create( ++ loc, rewriter.getI64IntegerAttr((int)promotedDTypeInt)); ++ auto promotedDTypeInfo = ++ getTypeForScalarType(rewriter.getContext(), promotedDTypeInt, ++ mlir::IntegerType::SignednessSemantics::Signed); ++ if (failed(promotedDTypeInfo)) ++ rewriter.notifyMatchFailure(loc, "Failed to get type for promoted dtype"); ++ promotedDType = *promotedDTypeInfo; ++ ++ auto falseValue = ++ rewriter.create(loc, rewriter.getBoolAttr(false)); ++ auto noneValue = rewriter.create(loc); ++ lhs = rewriter.create( ++ loc, ++ lhsType.getWithSizesAndDtype(lhsType.getOptionalSizes(), promotedDType), ++ lhs, promotedDTypeIntValue, falseValue, falseValue, noneValue); ++ rhs = rewriter.create( ++ loc, ++ rhsType.getWithSizesAndDtype(rhsType.getOptionalSizes(), promotedDType), ++ rhs, promotedDTypeIntValue, falseValue, falseValue, noneValue); ++ } else { ++ promotedDType = lhsType.hasDtype() ? 
lhsType.getOptionalDtype() : rhsType.getOptionalDtype(); ++ } ++ ++ llvm::SmallDenseMap lhsDimShapeMap; ++ for (size_t idx = 0; idx < lhsTokens.size(); ++idx) { ++ char d = lhsTokens[idx]; ++ lhsDimShapeMap[d] = rewriter.create( ++ loc, lhs, ++ rewriter.create(loc, ++ rewriter.getI64IntegerAttr(idx))); + } -+ finalShape.push_back(middleDimProduct); -+ if (!contractingDims.empty()) { -+ int64_t usedOtherDim = 1; -+ int64_t rank = tokens.size(); -+ Value contractingDimProduct = -+ rewriter.create(op->getLoc(), rewriter.getI64IntegerAttr(1)); -+ int64_t contractingDimsRank = contractingDims.size(); -+ for (int64_t i = contractingDimsRank - 1; i > -1; --i) { -+ contractingDimProduct = -+ rewriter.create(op->getLoc(), contractingDimProduct, -+ shape[contractingDims[i]]); -+ if (contractingDims[i] != rank - contractingDimsRank + i) { -+ Value contractingDim = -+ rewriter.create(op->getLoc(), -+ rewriter.getI64IntegerAttr( -+ contractingDims[i])); -+ Value indexDim = rewriter.create( -+ op->getLoc(), rewriter.getI64IntegerAttr( -+ otherDims[otherDimsSize - usedOtherDim])); -+ inputTensor = rewriter.create( -+ op->getLoc(), op->getResultTypes(), inputTensor, contractingDim, indexDim); -+ usedOtherDim += 1; ++ llvm::SmallDenseMap rhsDimShapeMap; ++ for (size_t idx = 0; idx < rhsTokens.size(); ++idx) { ++ char d = rhsTokens[idx]; ++ rhsDimShapeMap[d] = rewriter.create( ++ loc, rhs, ++ rewriter.create(loc, ++ rewriter.getI64IntegerAttr(idx))); ++ } ++ ++ // parse batch, contracting, other, reduce dims of lhs and rhs ++ SmallVector contractingDims; ++ SmallVector lhsReduceDims; ++ SmallVector rhsReduceDims; ++ SmallVector lhsOtherDims; ++ SmallVector rhsOtherDims; ++ SmallVector batchingDims; ++ parseDimTokens(lhsTokens, rhsTokens, finalResultTokens, contractingDims, ++ lhsReduceDims, rhsReduceDims, batchingDims, lhsOtherDims, ++ rhsOtherDims); ++ ++ llvm::SmallDenseMap outDimShapeMap; ++ auto generateOutDimShapeMap = [&](SmallVector &dims) { ++ for (auto d : dims) { ++ bool 
lhsContains = lhsDimShapeMap.count(d) > 0; ++ bool rhsContains = rhsDimShapeMap.count(d) > 0; ++ if (lhsContains && rhsContains) { ++ outDimShapeMap[d] = rewriter.create( ++ loc, lhsDimShapeMap[d], rhsDimShapeMap[d]); ++ } else if (lhsContains) { ++ outDimShapeMap[d] = lhsDimShapeMap[d]; ++ } else if (rhsContains) { ++ outDimShapeMap[d] = rhsDimShapeMap[d]; + } + } -+ finalShape.push_back(contractingDimProduct); ++ }; ++ ++ generateOutDimShapeMap(contractingDims); ++ generateOutDimShapeMap(batchingDims); ++ generateOutDimShapeMap(lhsReduceDims); ++ generateOutDimShapeMap(rhsReduceDims); ++ generateOutDimShapeMap(lhsOtherDims); ++ generateOutDimShapeMap(rhsOtherDims); ++ ++ if (contractingDims.size() == 0 && lhsOtherDims.size() == 0 && ++ rhsOtherDims.size() == 0) { ++ return rewriter.notifyMatchFailure( ++ loc, "Hadamard product is currently not supported"); ++ } ++ ++ // shape: [*batchingDims, *lhsOtherDims, *lhsReduceDims, *lhsContractingDims] ++ lhs = permuteTensorForMatmul(rewriter, loc, lhs, lhsTokens, batchingDims, ++ contractingDims, lhsOtherDims, lhsReduceDims, ++ true); ++ // shape: [*batchingDims, *rhsContractingDims, *rhsOtherDims, *rhsReduceDims] ++ rhs = permuteTensorForMatmul(rewriter, loc, rhs, rhsTokens, batchingDims, ++ contractingDims, rhsOtherDims, rhsReduceDims, ++ false); ++ // shape: [batchingDimsProd, lhsOtherDimsProd, lhsContractingDimsProd] ++ lhs = collapseDimForMatmul(rewriter, loc, lhs, batchingDims.size(), ++ contractingDims.size(), lhsOtherDims.size(), ++ lhsReduceDims.size(), true); ++ // shape: [batchingDimsProd, rhsContractingDimsProd, rhsOtherDimsProd] ++ rhs = collapseDimForMatmul(rewriter, loc, rhs, batchingDims.size(), ++ contractingDims.size(), rhsOtherDims.size(), ++ rhsReduceDims.size(), false); ++ ++ // perform matmul ++ auto outType = ++ lhsType.getWithSizesAndDtype(std::nullopt, promotedDType); ++ result = rewriter.create(loc, outType, lhs, rhs); ++ ++ // generate ideal result dims. 
++ generateIdealReusltDimTokens(batchingDims, lhsOtherDims, rhsOtherDims, ++ lhsReduceDims, rhsReduceDims, resultTokens); ++ ++ // reshape matmul result to ideal shape: ++ // [batchingDimsProd, lhsOtherDimsProd, rhsOtherDimsProd] => ++ // [*batchingDims, *lhsOtherDims, *lhsReduceDims, *rhsOtherDims, ++ // *rhsReduceDims] ++ SmallVector outShapeTensors; ++ for (char d : resultTokens) { ++ outShapeTensors.emplace_back(outDimShapeMap[d]); + } -+ return inputTensor; ++ ++ auto outResultShape = rewriter.create( ++ loc, Torch::ListType::get(Torch::IntType::get(lhs.getContext())), ++ outShapeTensors); ++ result = rewriter.create( ++ loc, ++ lhsType.getWithSizesAndDtype(std::nullopt, promotedDType), ++ result, outResultShape); ++ return success(); +} + -+static Value createReshapedTensor(PatternRewriter &rewriter, Location loc, -+ Operation* op, Type tensorType, Value tensor, -+ SmallVector &shape) { -+ auto listType = Torch::ListType::get(Torch::IntType::get(op->getContext())); -+ Value reshapedDims = -+ rewriter.create(loc, listType, shape); -+ return rewriter.create(loc, tensorType, tensor, reshapedDims); ++static Value performLastReduceAndPermute(PatternRewriter &rewriter, ++ Location loc, Type outType, Value input, ++ SmallVector &inputTokens, ++ SmallVector &outTokens) { ++ auto inputType = input.getType().cast(); ++ ++ llvm::SmallDenseSet outTokenSet(outTokens.begin(), outTokens.end()); ++ SmallVector sumDims; ++ llvm::SmallDenseMap inputDimToIdx; ++ int64_t idx = 0; ++ for (size_t i = 0; i < inputTokens.size(); ++i) { ++ char d = inputTokens[i]; ++ if (!outTokenSet.contains(d)) { ++ sumDims.emplace_back(i); ++ } else { ++ inputDimToIdx[d] = idx++; ++ } ++ } ++ ++ if (sumDims.size() > 0) { ++ SmallVector sumDimsTensor; ++ for (auto d : sumDims) { ++ sumDimsTensor.emplace_back(rewriter.create( ++ loc, rewriter.getI64IntegerAttr(d))); ++ } ++ auto sumDimsListValue = rewriter.create( ++ loc, Torch::ListType::get(Torch::IntType::get(rewriter.getContext())), ++ 
sumDimsTensor); ++ auto falseValue = rewriter.create( ++ loc, rewriter.getBoolAttr(false)); ++ auto noneValue = rewriter.create(loc); ++ input = rewriter.create( ++ loc, ++ inputType.getWithSizesAndDtype(std::nullopt, ++ inputType.getOptionalDtype()), ++ input, sumDimsListValue, falseValue, noneValue); ++ } ++ ++ SmallVector permuteDimsTensor; ++ for (auto d : outTokens) { ++ permuteDimsTensor.emplace_back(rewriter.create( ++ loc, rewriter.getI64IntegerAttr(inputDimToIdx[d]))); ++ } ++ auto permuteDimsListValue = rewriter.create( ++ loc, Torch::ListType::get(Torch::IntType::get(input.getContext())), ++ permuteDimsTensor); ++ auto out = rewriter.create(loc, outType, input, ++ permuteDimsListValue); ++ return out; +} + + +class DecomposeAtenEinsumOp : public OpRewritePattern { -+ public: ++public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(AtenEinsumOp op, + PatternRewriter &rewriter) const override { @@ -185,162 +449,47 @@ index 4dafed1d..a3c908bf 100644 + SmallVector resultTokens; + SmallVector> inputTokens; + if (!parseEquation(equation, inputTokens, resultTokens)) { -+ return rewriter.notifyMatchFailure(op, "Unexpected character in equations encountered"); ++ return rewriter.notifyMatchFailure( ++ op, "Unexpected character in equations encountered"); + } + + SmallVector inputTensors; -+ SmallVector> inputShapes; + if (!getListConstructElements(op.getTensors(), inputTensors)) { + return rewriter.notifyMatchFailure( + op, "input should comes from a PrimListConstructOp"); + } + -+ for (size_t i = 0; i < inputTensors.size(); i++) { -+ BaseTensorType tensorType = -+ inputTensors[i].getType().cast(); -+ if (!tensorType.hasSizes()) { -+ return rewriter.notifyMatchFailure( -+ op, "unimplemented: input tensor must have known sizes"); -+ } -+ ArrayRef inputShape = tensorType.getSizes(); -+ SmallVector inputValueShape; -+ for (unsigned j = 0; j < inputShape.size(); j++) { -+ inputValueShape.push_back(rewriter.create( -+ loc, inputTensors[i], 
-+ rewriter.create( -+ loc, rewriter.getI64IntegerAttr(j)))); -+ } -+ inputShapes.push_back(inputValueShape); -+ } -+ -+ auto collectOperandDims = [resultTokens]( -+ const SmallVector operandShape, -+ const SmallVector operandTokens, -+ const SmallVector others, -+ SmallVectorImpl &contractingDims, -+ SmallVectorImpl &batchingDims, -+ SmallVector &dotResultTokens, -+ SmallVector &dotResultShape) { -+ llvm::SmallDenseSet othersSet(others.begin(), others.end()); -+ llvm::SmallDenseSet resultTokensSet(resultTokens.begin(), -+ resultTokens.end()); -+ for (const auto &en : llvm::enumerate(operandTokens)) { -+ bool isResultToken = resultTokensSet.contains(en.value()); -+ bool isOtherToken = othersSet.contains(en.value()); -+ if (!isResultToken && isOtherToken) { -+ contractingDims.push_back(en.index()); -+ } else if (isOtherToken) { -+ batchingDims.push_back(en.index()); -+ } else { -+ dotResultTokens.push_back(en.value()); -+ dotResultShape.push_back(operandShape[en.index()]); -+ } -+ } ++ auto allTensorHasSizes = [](Value tensor) { ++ auto type = tensor.getType().dyn_cast(); ++ if (!type || !type.hasSizes()) ++ return false; ++ return true; + }; + -+ Value constZero = -+ rewriter.create(loc, rewriter.getI64IntegerAttr(0)); -+ Value constOne = -+ rewriter.create(loc, rewriter.getI64IntegerAttr(1)); -+ Value constTwo = -+ rewriter.create(loc, rewriter.getI64IntegerAttr(2)); -+ if (inputTensors.size() == 1) { -+ return rewriter.notifyMatchFailure( -+ op, "unimplemented: single input tensor is not supported"); ++ if (!llvm::all_of(inputTensors, allTensorHasSizes)) { ++ return rewriter.notifyMatchFailure(op, ++ "all input tensors should have sizes"); + } -+ while (inputTensors.size() > 1) { -+ SmallVector lhsContractingDims, lhsBatchingDims, -+ rhsContractingDims, rhsBatchingDims; -+ SmallVector dotResultTokens; -+ SmallVector dotResultShape; -+ SmallVector lhsShape = inputShapes[0]; -+ SmallVector rhsShape = inputShapes[1]; -+ SmallVector lhsTokens = inputTokens[0]; -+ 
SmallVector rhsTokens = inputTokens[1]; -+ Value lhsTensor = inputTensors[0]; -+ Value rhsTensor = inputTensors[1]; -+ // Step 1: split input equation to input/result tokens -+ collectOperandDims(lhsShape, lhsTokens, rhsTokens, lhsContractingDims, -+ lhsBatchingDims, dotResultTokens, dotResultShape); -+ collectOperandDims(rhsShape, rhsTokens, lhsTokens, rhsContractingDims, -+ rhsBatchingDims, dotResultTokens, dotResultShape); -+ // Prepend batch tokens. -+ for (const auto &it : llvm::enumerate(lhsBatchingDims)) { -+ char batchingToken = lhsTokens[it.value()]; -+ Value batchingShapeDim = lhsShape[it.value()]; -+ dotResultTokens.insert(dotResultTokens.begin() + it.index(), -+ batchingToken); -+ dotResultShape.insert(dotResultShape.begin() + it.index(), -+ batchingShapeDim); -+ } -+ // Lowering to dot_general does not support a mismatch between the number -+ // of result dims and the number of non-contracting dims. -+ -+ SmallVector lhsFinalShape, rhsFinalShape; -+ SmallVector finalShape = dotResultShape; -+ // Step 2: transpose the input tensors to [batchingDims[0,1,2], -+ // otherDims[0,1,2], contractingDims[0,1,2]] -+ lhsTensor = prepareTensorForMatmulOperations(rewriter, op, lhsTensor, lhsShape, -+ lhsContractingDims, lhsBatchingDims, -+ lhsFinalShape, lhsTokens); -+ rhsTensor = prepareTensorForMatmulOperations(rewriter, op, rhsTensor, rhsShape, -+ rhsContractingDims, rhsBatchingDims, -+ rhsFinalShape, rhsTokens); -+ -+ // Step 3: reshape the input tensors, the final shape should -+ // be[batchingDims, otherDims, contractingDims] -+ auto listType = Torch::ListType::get(Torch::IntType::get(op->getContext())); -+ Value lhsReshapedDims = -+ rewriter.create(loc, listType, lhsFinalShape); -+ Value lhs = rewriter.create(loc, op.getType(), lhsTensor, lhsReshapedDims); -+ Value rhsReshapedDims = -+ rewriter.create(loc, listType, rhsFinalShape); -+ Value rhs = rewriter.create(loc, op.getType(), rhsTensor, rhsReshapedDims); -+ Value result; -+ -+ // Step 4: use AtenMatmulOp 
to get the result, loop util we -+ // get the final result -+ if (!rhsContractingDims.empty() && !rhsBatchingDims.empty()){ -+ rhs = rewriter.create(loc, op.getType(), rhs, constOne, constTwo); -+ } else if (!rhsContractingDims.empty()){ -+ rhs = rewriter.create(loc, op.getType(), rhs, constZero, constOne); -+ } -+ result = rewriter.create(loc, op.getType(), lhs, rhs); -+ result = createReshapedTensor(rewriter, loc, op, op.getType(), result, finalShape); -+ -+ inputTensors.erase(inputTensors.begin(), inputTensors.begin() + 2); -+ inputTokens.erase(inputTokens.begin(), inputTokens.begin() + 2); -+ inputShapes.erase(inputShapes.begin(), inputShapes.begin() + 2); -+ inputTensors.push_back(result); -+ inputTokens.push_back(dotResultTokens); -+ inputShapes.push_back(dotResultShape); -+ if (inputTokens.size() == 1) { -+ // Lowering to dot_general does not support a mismatch between the number -+ // of result dims and the number of non-contracting dims. -+ if (dotResultTokens.size() != resultTokens.size()) { -+ return rewriter.notifyMatchFailure(op, -+ "rank reducing einsum not supported"); -+ } -+ int64_t resultSize = 0; -+ for (char resultToken : resultTokens) { -+ auto *foundIt = std::find(dotResultTokens.begin(), dotResultTokens.end(), -+ resultToken); -+ if (foundIt == dotResultTokens.end()) { -+ return rewriter.notifyMatchFailure( -+ op, "result token not found in operands"); -+ } -+ auto resultIndex = std::distance(dotResultTokens.begin(), foundIt); -+ if (resultIndex > resultSize) { -+ Value first = rewriter.create(loc, rewriter.getI64IntegerAttr(resultSize)); -+ Value second = rewriter.create(loc, rewriter.getI64IntegerAttr(resultIndex)); -+ result = rewriter.create(loc, op.getType(), result, first, second); -+ } -+ resultSize += 1; -+ } -+ // The dot_general is already in an appropriate result order. 
-+ rewriter.replaceOp(op, ValueRange{result}); ++ ++ SmallVector lhsTokens = inputTokens[0]; ++ Value lhs = inputTensors[0]; ++ Value result; ++ ++ for (size_t i = 1; i < inputTensors.size(); ++i) { ++ auto rhs = inputTensors[i]; ++ auto rhsTokens = inputTokens[i]; ++ SmallVector outTokens; ++ if (failed(performMatmul(rewriter, loc, lhs, lhsTokens, rhs, rhsTokens, ++ result, outTokens, resultTokens))) { ++ return failure(); + } ++ lhs = result; ++ lhsTokens = outTokens; + } ++ ++ result = performLastReduceAndPermute(rewriter, loc, op.getType(), lhs, lhsTokens, ++ resultTokens); ++ rewriter.replaceOp(op, result); + return success(); + } +}; @@ -348,16 +497,16 @@ index 4dafed1d..a3c908bf 100644 + + namespace { - class DecomposeComplexOpsPass - : public DecomposeComplexOpsBase { -@@ -5164,6 +5469,7 @@ public: - addPatternIfTargetOpIsIllegal(patterns); - addPatternIfTargetOpIsIllegal(patterns); - addPatternIfTargetOpIsIllegal(patterns); + // Unconditionally decompose `aten.tile` into `aten.repeat`. 
+ class DecomposeAtenTileOp : public OpRewritePattern { +@@ -5221,6 +5675,7 @@ public: + addPatternIfTargetOpIsIllegal(patterns); + addPatternIfTargetOpIsIllegal(patterns); + addPatternIfTargetOpIsIllegal(patterns); + addPatternIfTargetOpIsIllegal(patterns); - addPatternIfTargetOpIsIllegal(patterns); - addPatternIfTargetOpIsIllegal(patterns); - addPatternIfTargetOpIsIllegal(patterns); + addPatternIfTargetOpIsIllegal(patterns); + addPatternIfTargetOpIsIllegal(patterns); + addPatternIfTargetOpIsIllegal(patterns); diff --git a/lib/Dialect/Torch/Transforms/LowerToBackendContract.cpp b/lib/Dialect/Torch/Transforms/LowerToBackendContract.cpp index 76119828..179440c6 100644 --- a/lib/Dialect/Torch/Transforms/LowerToBackendContract.cpp diff --git a/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp b/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp index a96b17087..3760e78c7 100644 --- a/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp +++ b/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp @@ -970,6 +970,10 @@ class ConvertFlashAttnFwdOp : public OpConversionPattern { Type softmaxLseTy = op.getResult(5).getType(); Type softmaxTy = op.getResult(6).getType(); Type rngTy = op.getResult(7).getType(); + // Do not need softmax return if there's no use + if (op.getResult(6).use_empty()) + returnSoftmax = false; + SmallVector resultTypes; if (failed(getTypeConverter()->convertTypes( {outputPadTy, softmaxLseTy, softmaxTy, rngTy}, resultTypes))) { diff --git a/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py b/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py index 99bf2fadd..ad64e0508 100644 --- a/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py +++ b/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py @@ -183,3 +183,21 @@ def 
test_flash_attn_opt_pattern(): torch.testing.assert_close(golden_loss, flash_loss, atol=1e-4, rtol=1e-6) torch.testing.assert_close(golden_logits, flash_logits, atol=3e-3, rtol=1e-6) + + +def test_flash_attn_llama_inference_pattern(): + config = transformers.LlamaConfig(num_hidden_layers=4) + model = transformers.LlamaForCausalLM(config=config).to("cuda") + model.eval() + + input, label = make_data(model, "cuda") + trace_data = [input] + + from torch.fx.experimental.proxy_tensor import make_fx + from torch_frontend import preprocess_fx_graph + # module = torch.jit.trace(model, trace_data, check_trace=False) + with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16): + fx_g = make_fx(model)(*trace_data) + fx_g = preprocess_fx_graph(fx_g) + all_formatted = "\n".join([n.format_node() for n in fx_g.graph.nodes]) + FileCheck().check("call_function").check("torch.ops.byteir.flash_attn_fwd").run(all_formatted) diff --git a/frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py b/frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py new file mode 100644 index 000000000..d6143c3a1 --- /dev/null +++ b/frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py @@ -0,0 +1,18 @@ +import torch +import torch.fx as fx +import torch_frontend +from torch_frontend.fx_utils import _replace_aten_full_arugment + +class FullModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.ops.aten.full(x.shape, True, dtype=torch.bool) + return y + + +def test_full_bool_pattern(): + fx_g = fx.symbolic_trace(FullModule()) + fx_g = _replace_aten_full_arugment(fx_g) + module = torch.jit.script(fx_g) diff --git a/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py b/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py index 7c6fda4a8..86d885215 100644 --- a/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py +++ 
b/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py @@ -25,8 +25,7 @@ del importlib del _torch_frontend_registry -from .ts_utils import register_decomposition_in_torchscript -from .fx_utils import list_decomposed_ops, preprocess_fx_graph +from .fx_utils import list_decomposed_ops, preprocess_fx_graph, get_none_indices from .convert_to_mhlo import convert_to_mhlo_via_torch_mlir, compile from .flash_attn_op import replace_flash_attn from .fx_rewrite import fx_replace_attn_pattern diff --git a/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py b/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py index eb075a648..20d95e2b5 100644 --- a/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py +++ b/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py @@ -76,6 +76,34 @@ def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]: return removed_indexes +# note: torch.jit.script doesn't support torch.ops.aten.full([2, 1, 1, 128], True, dtype = torch.bool), replace it with torch.ops.aten.full([2, 1, 1, 128], 1, dtype = torch.bool) +def _replace_aten_full_arugment(fx_g: torch.fx.GraphModule) -> torch.fx.GraphModule : + def get_aten_target(node): + if hasattr(node.target, 'overloadpacket'): + return node.target.overloadpacket + return node.target + + nodes = [] + for node in fx_g.graph.nodes: + if get_aten_target(node) == torch.ops.aten.full: + if node.args[1] == True or node.args[1] == False: + nodes.append(node) + for node in nodes: + if node.args[1] == True: + with fx_g.graph.inserting_after(node): + new_node = fx_g.graph.call_function(torch.ops.aten.full, args=(node.args[0], 1), kwargs=node.kwargs) + node.replace_all_uses_with(new_node) + fx_g.graph.erase_node(node) + if node.args[1] == False: + with fx_g.graph.inserting_after(node): + new_node = fx_g.graph.call_function(torch.ops.aten.full, args=(node.args[0], 0), kwargs=node.kwargs) + 
node.replace_all_uses_with(new_node) + fx_g.graph.erase_node(node) + fx_g.graph.lint() + fx_g.recompile() + return fx_g + + def threshold_backward_pattern(grad_output, inp, threshold): return torch.ops.aten.threshold_backward(grad_output, inp, threshold) @@ -96,6 +124,61 @@ def unsafe_index_put_pattern(self, indices, values, accumulate): def unsafe_index_put_replacement(self, indices, values, accumulate): return torch.ops.aten.index_put_.hacked_twin(self, indices, values, accumulate) +# LLaMA aten attention op pattern +def LLaMAAttnPattern(query, key, value, attn_mask, min_val, inv_scale, batch, num_head, fused_batch, seq_len, head_dim): + transpose_3 = torch.ops.aten.transpose.int(key, 2, 3) + expand_2 = torch.ops.aten.expand.default(query, [batch, num_head, seq_len, head_dim]) + clone = torch.ops.aten.clone.default(expand_2, memory_format = torch.contiguous_format) + _unsafe_view_3 = torch.ops.aten._unsafe_view.default(clone, [fused_batch, seq_len, head_dim]) + expand_3 = torch.ops.aten.expand.default(transpose_3, [batch, num_head, head_dim, seq_len]) + clone_1 = torch.ops.aten.clone.default(expand_3, memory_format = torch.contiguous_format) + _unsafe_view_4 = torch.ops.aten._unsafe_view.default(clone_1, [fused_batch, head_dim, seq_len]) + bmm = torch.ops.aten.bmm.default(_unsafe_view_3, _unsafe_view_4) + _unsafe_view_5 = torch.ops.aten._unsafe_view.default(bmm, [batch, num_head, seq_len, seq_len]) + div = torch.ops.aten.div.Tensor(_unsafe_view_5, inv_scale) + add_5 = torch.ops.aten.add.Tensor(div, attn_mask) + maximum = torch.ops.aten.maximum.default(add_5, min_val) + _softmax = torch.ops.aten._softmax.default(maximum, -1, False) + _to_copy_10 = torch.ops.aten._to_copy.default(_softmax, dtype = torch.float16) + expand_4 = torch.ops.aten.expand.default(_to_copy_10, [batch, num_head, seq_len, seq_len]) + view_8 = torch.ops.aten.view.default(expand_4, [fused_batch, seq_len, seq_len]); expand_4 = None + expand_5 = torch.ops.aten.expand.default(value, [batch, 
num_head, seq_len, head_dim]) + clone_2 = torch.ops.aten.clone.default(expand_5, memory_format = torch.contiguous_format) + _unsafe_view_6 = torch.ops.aten._unsafe_view.default(clone_2, [fused_batch, seq_len, head_dim]) + bmm_1 = torch.ops.aten.bmm.default(view_8, _unsafe_view_6) + _unsafe_view_5 = torch.ops.aten._unsafe_view.default(bmm_1, [batch, num_head, seq_len, head_dim]) + return _softmax, _unsafe_view_5 + + +def LLaMAAttnReplacement(query, key, value, attn_mask, min_val, inv_scale, batch, num_head, fused_batch, seq_len, head_dim): + out, q_pad, k_pad, v_pad, out_pad, softmax_lse, S_dmask, rng_state = torch.ops.byteir.flash_attn_fwd( + query, + key, + value, + 0.0, + 1.0/inv_scale, + True, + True + ) + return S_dmask, out + + +def get_none_indices(fx_g: torch.fx.GraphModule) -> List[int]: + none_indices = [] + for node in fx_g.graph.nodes: + if node.op == "output": + assert len(node.args) == 1, "Output node must have a single argument" + node_arg = node.args[0] + if isinstance(node_arg, (list, tuple)): + node_arg = list(node_arg) + node_args_len = len(node_arg) + for i in range(node_args_len): + if node_arg[i] is None: + none_indices.append(i) + break + return none_indices + + def list_decomposed_ops(): return [ torch.ops.aten._native_batch_norm_legit_functional, @@ -108,15 +191,18 @@ def list_decomposed_ops(): torch.ops.aten.tril ] + def preprocess_fx_graph(fx_graph: torch.fx.GraphModule): if _returns_nothing(fx_graph): return fx_graph torch.fx.replace_pattern(fx_graph, squeeze_dims_pattern, squeeze_dims_replacement) torch.fx.replace_pattern(fx_graph, unsafe_index_put_pattern, unsafe_index_put_replacement) + torch.fx.replace_pattern(fx_graph, LLaMAAttnPattern, LLaMAAttnReplacement) was_unwrapped = _unwrap_single_tuple_return(fx_graph) was_list_replaced = _list_return_to_tuple_return(fx_graph) removed_none_indexes = _remove_nones(fx_graph) strip_overloads(fx_graph) torch.fx.replace_pattern(fx_graph, threshold_backward_pattern, threshold_backward_replacement) 
+ fx_graph = _replace_aten_full_arugment(fx_graph) return fx_graph diff --git a/runtime/include/brt/core/framework/op_accessor.h b/runtime/include/brt/core/framework/op_accessor.h index 8e3bb0c5d..352a63228 100644 --- a/runtime/include/brt/core/framework/op_accessor.h +++ b/runtime/include/brt/core/framework/op_accessor.h @@ -67,6 +67,9 @@ class OpAccessor { template T GetAttrAsSplatValue(const std::string &name) const; + template + std::vector GetAttrAsVector(const std::string &name) const; + std::string GetUID() const; static int64_t GetNumElementsOfShape(const Shape &shape); diff --git a/runtime/lib/backends/cuda/providers/default/ait/ait.cc b/runtime/lib/backends/cuda/providers/default/ait/ait.cc index e0e9e56e8..728a9aa70 100644 --- a/runtime/lib/backends/cuda/providers/default/ait/ait.cc +++ b/runtime/lib/backends/cuda/providers/default/ait/ait.cc @@ -370,7 +370,8 @@ AITOpKernel::AITOpKernel(const OpKernelInfo &info) std::string lib_path = brt::ir::GetParentPath(ir_path); lib_path += accessor.GetAttrAsString(std::string("ait_lib_file")); aitLibHdl = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - BRT_ENFORCE(aitLibHdl != nullptr, "AIT lib .so load failed"); + std::string msg = std::string("AIT lib ") + lib_path + " load failed"; + BRT_ENFORCE(aitLibHdl != nullptr, msg); std::string space = accessor.GetAttrAsString("device"); IAllocator *alloc = info_.GetAllocator(space); workspaceSizeInBytes = diff --git a/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc b/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc index 3193bf51d..39c77d5a3 100644 --- a/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc +++ b/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc @@ -37,8 +37,12 @@ using namespace mlir; #define FILE_NAME_ATTR "device_file_name" #define KERNEL_NAME_ATTR "kernel_name" -#define GRID_SIZE_ATTR "GridSize.x" -#define BLOCK_SIZE_ATTR "BlockSize.x" +#define GRID_SIZE_X_ATTR "GridSize.x" +#define GRID_SIZE_Y_ATTR "GridSize.y" 
+#define GRID_SIZE_Z_ATTR "GridSize.z" +#define BLOCK_SIZE_X_ATTR "BlockSize.x" +#define BLOCK_SIZE_Y_ATTR "BlockSize.y" +#define BLOCK_SIZE_Z_ATTR "BlockSize.z" #define ARG_RANKS_ATTR "arg_ranks" #define CALL_CONVENTION_ATTR "call_convention" @@ -119,29 +123,57 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info) impl_->call_convention = "all"; // static assignment for config // TODO extend to support dynamic - if (!info.GetOperation()->hasAttrOfType(GRID_SIZE_ATTR)) { + if (!info.GetOperation()->hasAttrOfType(GRID_SIZE_X_ATTR)) { BRT_THROW_EX(std::runtime_error, "no GridSize.x attr"); } - if (!info.GetOperation()->hasAttrOfType(BLOCK_SIZE_ATTR)) { + if (!info.GetOperation()->hasAttrOfType(BLOCK_SIZE_X_ATTR)) { BRT_THROW_EX(std::runtime_error, "no BlockSize.x attr"); } - if (!info.GetOperation()->hasAttrOfType(ARG_RANKS_ATTR)) { - BRT_THROW_EX(std::runtime_error, "no arg_ranks attr"); + int gx = static_cast(info.GetOperation() + ->getAttrOfType(GRID_SIZE_X_ATTR) + .getInt()), + gy = 1, gz = 1; + if (info.GetOperation()->hasAttrOfType(GRID_SIZE_Y_ATTR)) { + gy = static_cast(info.GetOperation() + ->getAttrOfType(GRID_SIZE_Y_ATTR) + .getInt()); + } + if (info.GetOperation()->hasAttrOfType(GRID_SIZE_Z_ATTR)) { + gz = static_cast(info.GetOperation() + ->getAttrOfType(GRID_SIZE_Z_ATTR) + .getInt()); } - int gx = static_cast( - info.GetOperation()->getAttrOfType(GRID_SIZE_ATTR).getInt()); int bx = static_cast(info.GetOperation() - ->getAttrOfType(BLOCK_SIZE_ATTR) - .getInt()); - std::vector ranks = GetIntArrayAttr( - info.GetOperation()->getAttrOfType(ARG_RANKS_ATTR)); + ->getAttrOfType(BLOCK_SIZE_X_ATTR) + .getInt()), + by = 1, bz = 1; + if (info.GetOperation()->hasAttrOfType(BLOCK_SIZE_Y_ATTR)) { + by = static_cast(info.GetOperation() + ->getAttrOfType(BLOCK_SIZE_Y_ATTR) + .getInt()); + } + if (info.GetOperation()->hasAttrOfType(BLOCK_SIZE_Z_ATTR)) { + bz = static_cast(info.GetOperation() + ->getAttrOfType(BLOCK_SIZE_Z_ATTR) + .getInt()); + } + + std::vector ranks; + 
if (info.GetOperation()->hasAttrOfType(ARG_RANKS_ATTR)) { + ranks = GetIntArrayAttr( + info.GetOperation()->getAttrOfType(ARG_RANKS_ATTR)); + } else { + for (unsigned int i = 0; i < GetOpArgNum(info_); ++i) { + ranks.push_back(GetRankFromOpArgIndex(info_, i)); + } + } auto num_arg = GetOpArgNum(info_); - impl_->grid = dim3(gx, 1, 1); - impl_->block = dim3(bx, 1, 1); + impl_->grid = dim3(gx, gy, gz); + impl_->block = dim3(bx, by, bz); impl_->shared_size = 0; impl_->arg_reserve_size = 3; // initial 3 for grid/block/shared_size diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc index 7dfb2baf3..4f9da6ae5 100644 --- a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc +++ b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc @@ -85,10 +85,10 @@ common::Status FlashAttnBwdOpKernel::RunImpl(const ExecutionContext &ctx) { } // dropout check - bool is_dropout = p_dropout > 0.0; - if (is_dropout) { - return InvalidArgs("currently, we only support p_dropout == 0"); - } + // bool is_dropout = p_dropout > 0.0; + // if (is_dropout) { + // return InvalidArgs("currently, we only support p_dropout == 0"); + // } // type check const auto dout_type = accessor.GetArgDTypeEnum(0); @@ -288,6 +288,7 @@ common::Status FlashAttnBwdOpKernel::RunImpl(const ExecutionContext &ctx) { /* seqlen_k */ seqlen_k, /* seqlen_q_rounded */ seqlen_q_rounded, /* seqlen_k_rounded */ seqlen_k_rounded, + /* p_dropout */ p_dropout, /* is_causal */ is_causal, /* stream */ stream); diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc index 1702623e4..5a588ac82 100644 --- a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc +++ b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc @@ -50,10 +50,16 @@ 
common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) { void *q_ptr = accessor.GetArgAsyncValueRef(0); void *k_ptr = accessor.GetArgAsyncValueRef(1); void *v_ptr = accessor.GetArgAsyncValueRef(2); - void *o_ptr = accessor.GetArgAsyncValueRef(3); - void *softmax_lse_ptr = accessor.GetArgAsyncValueRef(4); - void *softmax_ptr = accessor.GetArgAsyncValueRef(5); - void *rng_state_ptr = accessor.GetArgAsyncValueRef(6); // TODO : handle rng + void *rng_state_ptr = accessor.GetArgAsyncValueRef(3); + void *o_ptr = accessor.GetArgAsyncValueRef(4); + void *softmax_lse_ptr = accessor.GetArgAsyncValueRef(5); + void *softmax_ptr = accessor.GetArgAsyncValueRef(6); + + // check rng_state + // uint64_t *h_rng_state = new uint64_t[2]; + // cudaMemcpy(h_rng_state, rng_state_ptr, 2 * sizeof(uint64_t), + // cudaMemcpyDeviceToHost); std::cout << h_rng_state[0] << "," << + // h_rng_state[1] << std::endl; cudaDeviceSynchronize(); // attr const bool is_causal = accessor.GetAttrAsBool("causal"); @@ -66,7 +72,7 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) { const auto q_shape = accessor.GetArgShape(0); const auto k_shape = accessor.GetArgShape(1); const auto v_shape = accessor.GetArgShape(2); - const auto o_shape = accessor.GetArgShape(3); + const auto o_shape = accessor.GetArgShape(4); int64_t o_rank = o_shape.size(); int64_t q_rank = q_shape.size(); int64_t k_rank = k_shape.size(); @@ -115,7 +121,7 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) { DTypeEnum q_dtype = accessor.GetArgDTypeEnum(0); DTypeEnum k_dtype = accessor.GetArgDTypeEnum(1); DTypeEnum v_dtype = accessor.GetArgDTypeEnum(2); - DTypeEnum o_dtype = accessor.GetArgDTypeEnum(3); + DTypeEnum o_dtype = accessor.GetArgDTypeEnum(4); if (o_dtype != q_dtype || q_dtype != k_dtype || k_dtype != v_dtype) { return InvalidArgs( "query, key, value, and output must have the same dtype"); @@ -194,6 +200,7 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const 
ExecutionContext &ctx) { /* seqlen_k */ seqlen_k, /* seqlen_q_rounded */ seqlen_q_rounded, /* seqlen_k_rounded */ seqlen_k_rounded, + /* p_dropout */ p_dropout, /* is_causal */ is_causal, /* stream */ stream); diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu index e2981b63a..dce323437 100644 --- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu +++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu @@ -142,7 +142,7 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, uint32_t seqlen_q, uint32_t seqlen_k, uint32_t seqlen_q_rounded, uint32_t seqlen_k_rounded, - int is_causal, cudaStream_t stream) { + float p_dropout, int is_causal, cudaStream_t stream) { Flash_fwd_params params; // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -187,7 +187,7 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, params.scale_softmax = softmax_scale; params.scale_softmax_log2 = softmax_scale * M_LOG2E; - params.p_dropout = 1.; // probability to keep + params.p_dropout = 1.f - p_dropout; // probability to keep params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0)); params.rp_dropout = 1.f / params.p_dropout; params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax; @@ -195,6 +195,8 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, params.cu_seqlens_q = cu_seqlens_q_ptr; params.cu_seqlens_k = cu_seqlens_k_ptr; params.p_ptr = softmax_ptr; // used for `return_softmax`. 
+ params.rng_state = static_cast(rng_state_ptr); + // print_Flash_fwd_params(params); FP16_SWITCH(!params.is_bf16, [&] { @@ -225,7 +227,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, uint32_t seqlen_q, uint32_t seqlen_k, uint32_t seqlen_q_rounded, uint32_t seqlen_k_rounded, - int is_causal, cudaStream_t stream) { + float p_dropout, int is_causal, cudaStream_t stream) { Flash_bwd_params params; // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -293,7 +295,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, params.scale_softmax = softmax_scale; params.scale_softmax_log2 = softmax_scale * M_LOG2E; - params.p_dropout = 1.; // probability to keep + params.p_dropout = 1.f - p_dropout; // probability to keep params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0)); params.rp_dropout = 1.f / params.p_dropout; params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax; @@ -302,6 +304,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, params.cu_seqlens_k = cu_seqlens_k_ptr; params.p_ptr = nullptr; // used for `return_softmax`, no use in bwd params.dsoftmax_sum = dsoftmax_sum_ptr; + params.rng_state = static_cast(rng_state_ptr); // print_Flash_bwd_params(params); diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h index 205ee88cd..5c6e6f70c 100644 --- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h +++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h @@ -29,7 +29,7 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, uint32_t seqlen_q, uint32_t seqlen_k, uint32_t seqlen_q_rounded, uint32_t seqlen_k_rounded, - int is_causal, cudaStream_t stream); + float p_dropout, int is_causal, cudaStream_t stream); void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, void 
*dout_ptr, void *dq_ptr, void *dk_ptr, void *dv_ptr, @@ -53,7 +53,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr, uint32_t seqlen_q, uint32_t seqlen_k, uint32_t seqlen_q_rounded, uint32_t seqlen_k_rounded, - int is_causal, cudaStream_t stream); + float p_dropout, int is_causal, cudaStream_t stream); } // namespace kernel } // namespace cuda diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h index b7f9d95a4..91bac6590 100644 --- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h +++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h @@ -957,8 +957,18 @@ compute_dq_dk_dv_1colblock(const Params ¶ms, const int bidb, const int bidh, // auto offset = params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % // 32; - unsigned long long seed = 0; - unsigned long long offset = 0; + // deprecated: no rng support. + // unsigned long long seed = 0; + // unsigned long long offset = 0; + + unsigned long long seed = params.rng_state[0]; + unsigned long long offset = + params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % 32; + + // if (block_id == 0 && tidx == 0) { + // printf("seed:%lu\n",seed); + // printf("offset:%lu\n",offset); + // } clear(acc_dv); clear(acc_dk); @@ -1693,8 +1703,18 @@ compute_dq_dk_dv_1rowblock(const Params ¶ms, const int bidb, const int bidh, // auto offset = params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % // 32; - unsigned long long seed = 0; - unsigned long long offset = 0; + // deprecated: no rng support. 
+ // unsigned long long seed = 0; + // unsigned long long offset = 0; + + unsigned long long seed = params.rng_state[0]; + unsigned long long offset = + params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % 32; + + // if (block_id == 0 && tidx == 0) { + // printf("seed:%lu\n",seed); + // printf("offset:%lu\n",offset); + // } clear(acc_dq); diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h index e024129f4..d89242b36 100644 --- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h +++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -376,8 +377,18 @@ inline __device__ void compute_attn_1rowblock(const Params ¶ms, // unsigned long long offset = std::get<1>(seeds) + (bidb * params.h + bidh) * // 32 + tidx % 32; - unsigned long long seed = 0; - unsigned long long offset = 0; + // deprecated: no rng support. + // unsigned long long seed = 0; + // unsigned long long offset = 0; + + unsigned long long seed = params.rng_state[0]; + unsigned long long offset = + params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % 32; + + // if (block_id == 0 && tidx == 0) { + // printf("seed:%lu\n",seed); + // printf("offset:%lu\n",offset); + // } // Save seed and offset for backward. 
// if (block_id == 0 && tidx == 0) { diff --git a/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc b/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc index 582bc8ea0..e04e7a751 100644 --- a/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc +++ b/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc @@ -40,14 +40,20 @@ common::Status FillOpKernel::RunImpl(const ExecutionContext &ctx) { static_cast(ctx.work_queue)->GetComputeStream(); void *device_p = accessor.GetArgAsyncValueRef(0); size_t length = accessor.GetNumElementsOfShape(accessor.GetArgShape(0)); + // TODO: common helper for dtype dispatch #define CASE(dtype, ctype, mlir_type) \ - case DTypeEnum::dtype: \ - kernel::Fill( \ - stream, static_cast(device_p), \ - static_cast(accessor.GetAttrAsSplatValue("value")), \ - length); \ - return common::Status::OK() + case DTypeEnum::dtype: { \ + if (accessor.HasAttrOfSplatValue("value")) { \ + kernel::Fill(stream, static_cast(device_p), \ + static_cast( \ + accessor.GetAttrAsSplatValue("value")), \ + length); \ + return common::Status::OK(); \ + } \ + break; \ + } + switch (dtype) { CASE(Float32, float, float); CASE(Int64, int64_t, int64_t); @@ -55,11 +61,30 @@ common::Status FillOpKernel::RunImpl(const ExecutionContext &ctx) { CASE(Float16, __half, float); #undef CASE default: - return common::Status(common::StatusCategory::BRT, - common::StatusCode::NOT_IMPLEMENTED, - "not supported dtype"); + break; }; - return common::Status::OK(); + +#define CASE(dtype, ctype) \ + case DTypeEnum::dtype: { \ + std::vector value = accessor.GetAttrAsVector("value"); \ + cudaMemcpyAsync(device_p, value.data(), value.size() * sizeof(ctype), \ + cudaMemcpyHostToDevice, stream); \ + return common::Status::OK(); \ + } + + switch (dtype) { + CASE(Float32, float); + CASE(Int64, int64_t); + CASE(Float64, double); + CASE(Float16, half_float::half); +#undef CASE + default: + break; + }; + + return 
common::Status(common::StatusCategory::BRT, + common::StatusCode::NOT_IMPLEMENTED, + "not supported FillOp"); } common::Status FillOpKernel::ProloguePerFrame(const ExecutionContext &) { diff --git a/runtime/lib/core/framework/op_accessor.cc b/runtime/lib/core/framework/op_accessor.cc index 18176432e..975f0982e 100644 --- a/runtime/lib/core/framework/op_accessor.cc +++ b/runtime/lib/core/framework/op_accessor.cc @@ -171,6 +171,31 @@ T OpAccessor::GetAttrAsSplatValue(const std::string &name) const { BRT_THROW("Attribute " + name + " is not set"); } +// GetDenseAttrAsVector will iterate every elements in dense attibutes. +// If you want to avoid iterating, consider use getRawData() but special handle +// for i1 ??? +template +std::vector OpAccessor::GetAttrAsVector(const std::string &name) const { + std::vector results; + if (auto attr = + info_.GetOperation()->getAttrOfType(name)) { + results.reserve(attr.size()); + for (APInt &&i : attr) { + results.push_back(static_cast(i.getSExtValue())); + } + return results; + } else if (auto attr = + info_.GetOperation()->getAttrOfType( + name)) { + results.reserve(attr.size()); + for (APFloat &&i : attr) { + results.push_back(static_cast(i.convertToDouble())); + } + return results; + } + BRT_THROW("Attribute " + name + " is not supported to get as vector"); +} + std::string OpAccessor::GetUID() const { auto byre_op = llvm::cast(info_.GetOperation()); return ByREHandle::GetOpUID(byre_op); @@ -211,6 +236,18 @@ INST_ATTR_METH(double) INST_ATTR_METH(StringView) #undef INST_ATTR_METH +#define INST_DENSE_ATTR_METH(T) \ + template std::vector OpAccessor::GetAttrAsVector(const std::string &) \ + const; +INST_DENSE_ATTR_METH(float) +INST_DENSE_ATTR_METH(int32_t) +INST_DENSE_ATTR_METH(int64_t) +INST_DENSE_ATTR_METH(uint8_t) +INST_DENSE_ATTR_METH(uint32_t) +INST_DENSE_ATTR_METH(double) +INST_DENSE_ATTR_METH(half_float::half) +#undef INST_DENSE_ATTR_METH + #define INST_SCALAR_METH(T) \ template T OpAccessor::GetArgScalar(size_t); \ 
template common::Status OpAccessor::SetResultScalar(size_t result_idx, \ diff --git a/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc b/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc index 6c870d93b..9f8c30335 100644 --- a/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc +++ b/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc @@ -32,8 +32,6 @@ using namespace brt::cuda; using namespace brt::test; TEST(CUDATestFillOp, Basic) { - constexpr size_t length = 512 * 128; - Session session; auto status_allocator = CUDAAllocatorFactory(&session); BRT_TEST_CHECK_STATUS(status_allocator); @@ -54,8 +52,16 @@ TEST(CUDATestFillOp, Basic) { auto status_sync = request->Sync(); BRT_TEST_CHECK_STATUS(status_sync); + size_t length = 512 * 128; CheckCUDAValues(static_cast(request->GetArg(0)), length, 0.f); CheckCUDAValues(static_cast(request->GetArg(1)), length, 1.f); CheckCUDAValues<__half>(static_cast<__half *>(request->GetArg(2)), length, static_cast<__half>(1.f)); + length = 3; + std::vector results = {static_cast(1.f), + static_cast(2.f), + static_cast(3.f)}; + EXPECT_TRUE(CheckCUDAValuesWithCPUValues( + static_cast<__half *>(request->GetArg(3)), + reinterpret_cast<__half *>(results.data()), length)); } diff --git a/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc b/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc index f5a2d9a3a..38a0bc87b 100644 --- a/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc +++ b/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc @@ -51,6 +51,7 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) { size_t head_dims = 32; size_t input_len = b * seq_len * num_heads * head_dims; size_t softmax_len = b * seq_len * num_heads; + // size_t rng_state_len = 2; Session session; auto status_allocator = CUDAAllocatorFactory(&session); @@ -71,12 +72,22 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) { __half *d_v; 
float *d_softmax_lse; + // rng_state + // uint64_t *d_rng_state; + // uint64_t h_rng_state[2]; + // h_rng_state[0] = 0UL; + // h_rng_state[1] = 3000UL; + cudaMalloc(&d_o, input_len * sizeof(__half)); cudaMalloc(&d_q, input_len * sizeof(__half)); cudaMalloc(&d_k, input_len * sizeof(__half)); cudaMalloc(&d_v, input_len * sizeof(__half)); cudaMalloc(&d_softmax_lse, softmax_len * sizeof(float)); + // cudaMalloc(&d_rng_state, rng_state_len * sizeof(uint64_t)); + // cudaMemcpy(d_rng_state, h_rng_state, rng_state_len * sizeof(uint64_t), + // cudaMemcpyHostToDevice); + ReadCUDAFloatValues(d_q, input_len, input_q_file); ReadCUDAFloatValues(d_k, input_len, input_k_file); ReadCUDAFloatValues(d_v, input_len, input_v_file); @@ -96,6 +107,7 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) { request->BindArg(2, d_v); request->BindArg(3, d_o); request->BindArg(4, d_softmax_lse); + // request->BindArg(6, d_rng_state); request->FinishIOBinding(); @@ -104,7 +116,7 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) { auto status_sync = request->Sync(); BRT_TEST_CHECK_STATUS(status_sync); - // PrintCUDAValues(d_o, input_len, input_len); + PrintCUDAValues(d_o, input_len, input_len); CheckCUDABuffer<__half>( (__half *)d_o, /* size */ input_len, [&](__half *h_ptr) { diff --git a/runtime/test/include/brt/test/common/cuda/util.h b/runtime/test/include/brt/test/common/cuda/util.h index 6e63144e8..3def1a9af 100644 --- a/runtime/test/include/brt/test/common/cuda/util.h +++ b/runtime/test/include/brt/test/common/cuda/util.h @@ -146,6 +146,19 @@ template return passed; } +template +[[nodiscard]] bool CheckCUDAValuesWithCPUValues(T *first, T *second, + size_t size, + size_t print_count = 10) { + cudaDeviceSynchronize(); + T *h_first = (T *)malloc(size * sizeof(T)); + cudaMemcpy(h_first, first, size * sizeof(T), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + bool passed = CheckCPUValues(h_first, second, size, print_count); + free(h_first); + return passed; +} + // print floating point values template 
::value, int> = 0> diff --git a/runtime/test/test_files/fill_cuda.mlir b/runtime/test/test_files/fill_cuda.mlir index 5c25bf916..81b78c438 100644 --- a/runtime/test/test_files/fill_cuda.mlir +++ b/runtime/test/test_files/fill_cuda.mlir @@ -1,10 +1,12 @@ module attributes {byre.container_module} { func.func @test_fill(%arg0 : memref<512x128xf32, "cuda"> {byre.argname = "Fill0", byre.argtype = 2: i32}, %arg1 : memref<512x128xf32, "cuda"> {byre.argname = "Fill1", byre.argtype = 2: i32}, - %arg2 : memref<512x128xf16, "cuda"> {byre.argname = "Fill1FP16", byre.argtype = 2: i32}) attributes {byre.entry_point} { + %arg2 : memref<512x128xf16, "cuda"> {byre.argname = "Fill1FP16", byre.argtype = 2: i32}, + %arg3 : memref<3xf16, "cuda"> {byre.argname = "FillNonSplat", byre.argtype = 2: i32}) attributes {byre.entry_point} { byre.compute @FillOp(%arg0) {value = dense<0.000000e+00> : tensor<512x128xf32>} : memref<512x128xf32, "cuda"> byre.compute @FillOp(%arg1) {value = dense<1.000000e+00> : tensor<512x128xf32>} : memref<512x128xf32, "cuda"> byre.compute @FillOp(%arg2) {value = dense<1.000000e+00> : tensor<512x128xf16>} : memref<512x128xf16, "cuda"> + byre.compute @FillOp(%arg3) {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf16>} : memref<3xf16, "cuda"> return } } \ No newline at end of file diff --git a/runtime/test/test_files/flash_attn_fwd.mlir b/runtime/test/test_files/flash_attn_fwd.mlir index eab8e694a..358b551f9 100644 --- a/runtime/test/test_files/flash_attn_fwd.mlir +++ b/runtime/test/test_files/flash_attn_fwd.mlir @@ -6,7 +6,7 @@ module attributes {byre.container_module} { %arg4 : memref<1x3x128xf32, "cuda"> {byre.argname = "SoftmaxLse", byre.argtype = 2: i32}, %arg5 : memref<1x3x128x128xf32, "cuda"> {byre.argname = "SoftmaxPtr", byre.argtype = 2: i32}, %arg6 : memref<2xi64, "cuda"> {byre.argname = "RngState", byre.argtype = 2: i32}) attributes {byre.entry_point} { - byre.compute @byteir.flash_attn_fwd(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, 
%arg6) {causal = true, dropout_p = 0.000000e+00 : f32, return_softmax = false, softmax_scale = 0.500000e+00 : f32} : memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x3x128xf32, "cuda">, memref<1x3x128x128xf32, "cuda">, memref<2xi64, "cuda"> + byre.compute @byteir.flash_attn_fwd(%arg0, %arg1, %arg2, %arg6, %arg3, %arg4, %arg5) {causal = true, dropout_p = 0.000000e+00 : f32, return_softmax = false, softmax_scale = 0.500000e+00 : f32} : memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<2xi64, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x3x128xf32, "cuda">, memref<1x3x128x128xf32, "cuda"> return } -} \ No newline at end of file +} diff --git a/tests/numerical_test/execute.py b/tests/numerical_test/execute.py index 46e67351a..c265fd1d7 100644 --- a/tests/numerical_test/execute.py +++ b/tests/numerical_test/execute.py @@ -91,6 +91,7 @@ def compile_and_run_mlir(mhlo_file, target): interp = Interpreter.load_from_file(mhlo_file) np_inputs = generate_np_inputs(interp) func_name = get_entry_func_name(interp) + unique_name = os.path.basename(mhlo_file).split('.')[0] # run golden golden_outputs = interp.call_function(func_name, np_inputs) @@ -98,8 +99,8 @@ def compile_and_run_mlir(mhlo_file, target): # byteir compile TEMP_FOLDER = "./local_test" os.makedirs(TEMP_FOLDER, exist_ok=True) - os.makedirs(TEMP_FOLDER + f"/{func_name}", exist_ok=True) - output_mlir_file_name = f'{TEMP_FOLDER}/{func_name}/{func_name}.rt.mlir' + os.makedirs(TEMP_FOLDER + f"/{unique_name}", exist_ok=True) + output_mlir_file_name = f'{TEMP_FOLDER}/{unique_name}/{unique_name}.rt.mlir' byteir.compile(mhlo_file, output_mlir_file_name, entry_func=func_name, target=target) except Exception as e: diff --git a/tests/numerical_test/main.py b/tests/numerical_test/main.py index ae2c416e4..ed46a05cd 100644 --- a/tests/numerical_test/main.py +++ 
b/tests/numerical_test/main.py @@ -31,7 +31,7 @@ parser.add_argument("--target", type=str, default="cuda_with_ait", choices=["ait", "cuda", "cuda_with_ait_aggressive"], help="target device name") parser.add_argument("-c", "--config", default="all", - choices=["all", "mlir", "torch"], help="test sets to run.") + choices=["all", "mlir", "torch", "dynamo"], help="test sets to run.") args = parser.parse_args() EXCLUDE_MLIR_TESTS = [] @@ -40,10 +40,11 @@ SM80_PLUS_TESTS = [ "dot_f32.mlir", + "bmm_rrr_permute_f16.mlir", "bmm_rrr_permute_f32.mlir", "MatmulF32Module_basic", "BatchMatmulAddF32Module_basic", - "BatchMatmulF32Module", + "BatchMatmulF32Module_basic", ] @@ -115,13 +116,18 @@ def main(): if args.config == 'all': results = run_mlir_test(arch) results = results + run_torch_test(arch) + # TODO(zzk): disable flash attn test for now + # run_torch_dynamo_tests(arch) elif args.config == 'mlir': results = run_mlir_test(arch) elif args.config == 'torch': results = run_torch_test(arch) + elif args.config == 'dynamo': + # TODO(zzk): use test infra for dynamo tests + # TODO(zzk): disable flash attn test for now + # run_torch_dynamo_tests(arch) + pass failed = report_results(results) - # TODO(zzk): disable flash attn test for now - # run_torch_dynamo_tests(arch) sys.exit(1 if failed else 0) diff --git a/tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir b/tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir new file mode 100644 index 000000000..3c0cb7a86 --- /dev/null +++ b/tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir @@ -0,0 +1,6 @@ +func.func @bmm_rrr_permute(%arg0: tensor<32x64x64xf16>, %arg1: tensor<32x64x128xf16>) -> tensor<1x64x32x128xf16> { + %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x64x64xf16>, tensor<32x64x128xf16>) -> tensor<32x64x128xf16> + %1 = mhlo.reshape %0 : (tensor<32x64x128xf16>) -> tensor<1x32x64x128xf16> + %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : 
tensor<4xi64>} : (tensor<1x32x64x128xf16>) -> tensor<1x64x32x128xf16> + return %2 : tensor<1x64x32x128xf16> +} \ No newline at end of file diff --git a/tests/numerical_test/mlir_tests/ops/concat2.mlir b/tests/numerical_test/mlir_tests/ops/concat2.mlir new file mode 100644 index 000000000..fe7d7779a --- /dev/null +++ b/tests/numerical_test/mlir_tests/ops/concat2.mlir @@ -0,0 +1,6 @@ +func.func @concat2(%arg0: tensor, %arg1: tensor) -> (tensor<2xi64>) { + %0 = mhlo.reshape %arg0 : (tensor) -> tensor<1xi64> + %1 = mhlo.reshape %arg1 : (tensor) -> tensor<1xi64> + %2 = "mhlo.concatenate"(%0, %1) {dimension = 0 : i64} : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64> + return %2 : tensor<2xi64> +} diff --git a/tests/numerical_test/torch_dynamo_e2e_testing/backend.py b/tests/numerical_test/torch_dynamo_e2e_testing/backend.py index 0a7ed28fc..a76663efd 100644 --- a/tests/numerical_test/torch_dynamo_e2e_testing/backend.py +++ b/tests/numerical_test/torch_dynamo_e2e_testing/backend.py @@ -18,7 +18,7 @@ import byteir from torch_frontend import compile -from torch_frontend import list_decomposed_ops, preprocess_fx_graph, fx_replace_attn_pattern, replace_flash_attn +from torch_frontend import list_decomposed_ops, preprocess_fx_graph, fx_replace_attn_pattern, replace_flash_attn, get_none_indices from functorch.compile import aot_module from torch._decomp import get_decompositions @@ -67,22 +67,6 @@ def __call__(self, *inputs): ret_ptr += 1 return results -def get_none_indices(fx_g: torch.fx.GraphModule) -> List[int]: - none_indices = [] - for node in fx_g.graph.nodes: - if node.op == "output": - assert len(node.args) == 1, "Output node must have a single argument" - node_arg = node.args[0] - if isinstance(node_arg, (list, tuple)): - node_arg = list(node_arg) - node_args_len = len(node_arg) - for i in range(node_args_len): - if node_arg[i] is None: - none_indices.append(i) - break - - return none_indices - def byteir_compile_fx_inner(graph: torch.fx.GraphModule, inputs, 
is_backward, ban_lst=[]): category = 'backward' if is_backward else 'forward'