From 4a225330f8f43386d9865a8a9e280ab6a1e43e3c Mon Sep 17 00:00:00 2001
From: Li-Wen Chang <120213201+liwenchangbdbz@users.noreply.github.com>
Date: Mon, 25 Sep 2023 23:19:18 -0700
Subject: [PATCH] [Release] Official Release ByteIR 1.4.0 (#71)

* [Sync] internal a6ef5f00...22d6dee6
* [AIT] Disabled hw info log, added error handling
* [CAT] Adjusted layout support
* [compiler/doc] Added codegen doc
* [frontend/torch] Added demo code, added more fx pattern for llm, fixed einsum, updated to 23b72244b1e1eaa0511cece8535b32810c1d0d7a
* [GPU] Added reduction codegen for PTX
* [Mhlo] Fixed transpose movedown, Added canonicalizer for gather with iota
* [Release] released 1.4.0 package
* [Runtime] Supported non-splat value in FillOp, added dropout support for flashV2
* [Util] Fixed bugs
---
 compiler/doc/codegen.md                       | 245 +++++
 compiler/include/byteir/Analysis/UseRange.h   |  15 +
 .../byteir/Conversion/FuncToByre/FuncToByre.h |   6 +
 compiler/include/byteir/Conversion/Passes.td  |  26 +
 .../byteir/Conversion/ToLinalg/ToLinalg.h     |   3 +-
 .../include/byteir/Conversion/ToPTX/ToPTX.h   |   3 +-
 .../include/byteir/Dialect/CMakeLists.txt     |   2 +
 .../include/byteir/Dialect/Cat/IR/CatOps.td   |  11 +
 .../include/byteir/Dialect/GPU/CMakeLists.txt |   3 +
 compiler/include/byteir/Dialect/GPU/Passes.h  |  37 +
 compiler/include/byteir/Dialect/GPU/Passes.td |  36 +
 .../Dialect/GPU/Transforms/Transforms.h       |  32 +
 .../include/byteir/Dialect/Linalg/Passes.td   |  10 +
 .../TransformOps/LinalgExtTransformOps.td     |  39 +
 .../Linalg/Transforms/LinalgCollapseLoops.h   |   7 +-
 .../byteir/Dialect/Tensor/CMakeLists.txt      |   3 +
 .../include/byteir/Dialect/Tensor/Passes.h    |  31 +
 .../include/byteir/Dialect/Tensor/Passes.td   |  35 +
 .../Transforms/TensorPadSpecialization.h      |  30 +
 .../byteir/Dialect/Transform/Passes.td        |  36 +
 .../Transform/Transforms/TransformInsertion.h |  10 +
 .../Dialect/mhlo/Transforms/CanonicalizeExt.h |   3 +
 .../mhlo/Transforms/ConvertOpToCustomCall.h   |   4 +-
 .../mhlo/Transforms/GenericFusionCommon.h     |   4 +
 .../byteir/Dialect/mhlo/Transforms/HloFuser.h |   6 +
 .../byteir/Dialect/mhlo/Util/CustomCallUtil.h |   8 +
 .../byteir/Pipelines/GPU/MappingForall.h      |  53 +
 .../byteir/Pipelines/GPU/ReductionCodegen.h   | 153 +++
 .../byteir/Pipelines/InitAllPipelines.h       |   4 +
 .../byteir/Transforms/MemoryPlanning.h        |   7 +-
 compiler/include/byteir/Transforms/Passes.td  |   2 +-
 compiler/lib/Analysis/UseRange.cpp            |   7 +-
 compiler/lib/CAPI/CMakeLists.txt              |   1 +
 compiler/lib/CAPI/Passes.cpp                  |   4 +
 .../lib/Conversion/FuncToByre/FuncToByre.cpp  |  78 ++
 .../lib/Conversion/HloToCat/FuseHloToCat.cpp  |  45 +-
 .../HloToCat/FuseHloToCatPattern.td           |  30 +-
 .../Conversion/MemrefToByre/MemrefToByre.cpp  |  25 +-
 compiler/lib/Conversion/ToByre/ToByre.cpp     |  15 +-
 .../ToLinalg/MemrefCopyToLinalg.cpp           | 140 +--
 .../lib/Conversion/ToPTX/CollectGPUKernel.cpp |  33 +-
 compiler/lib/Dialect/CMakeLists.txt           |   1 +
 compiler/lib/Dialect/Cat/IR/CatDialect.cpp    |   6 +
 compiler/lib/Dialect/GPU/CMakeLists.txt       |   1 +
 .../lib/Dialect/GPU/Transforms/CMakeLists.txt |  19 +
 .../Transforms/ShmAllocaToWorkgroupArg.cpp    |  86 ++
 .../TransformOps/LinalgExtTransformOps.cpp    | 132 +++
 .../Linalg/Transforms/FuseElementwise.cpp     |  25 +-
 .../Linalg/Transforms/LinalgCollapseLoops.cpp |  45 +-
 .../Dialect/Tensor/Transforms/CMakeLists.txt  |   3 +
 .../Tensor/Transforms/CanonicalizeExt.cpp     |  42 +
 .../Dialect/Tensor/Transforms/PassDetail.h    |  40 +
 .../Transforms/TensorPadSpecialization.cpp    | 242 +++++
 .../Transforms/TransformInsertion.cpp         | 101 ++
 .../mhlo/Transforms/CanonicalizeExt.cpp       |  55 +
 .../lib/Dialect/mhlo/Transforms/CatFusion.cpp |  16 +-
 .../mhlo/Transforms/ConvertOpToCustomCall.cpp |  91 +-
 .../Dialect/mhlo/Transforms/GenericFusion.cpp | 104 +-
 .../mhlo/Transforms/HloAggressiveFusion.cpp   |   5 +-
 .../Dialect/mhlo/Transforms/HloMoveDown.cpp   |  54 +-
 compiler/lib/Pipelines/BufferizeOpt.cpp       |   2 +
 compiler/lib/Pipelines/ByreOpt.cpp            |   5 +-
 compiler/lib/Pipelines/GPU/CMakeLists.txt     |   4 +
 compiler/lib/Pipelines/GPU/GPUOpt.cpp         |  39 +-
 compiler/lib/Pipelines/GPU/MappingForall.cpp  | 148 +++
 compiler/lib/Pipelines/GPU/NVVMCodegen.cpp    |   4 +
 .../lib/Pipelines/GPU/ReductionCodegen.cpp    | 942 ++++++++++++++++++
 compiler/lib/Pipelines/HloOpt.cpp             |   3 +
 compiler/lib/Pipelines/LinalgMemrefOpt.cpp    |   4 +-
 compiler/lib/Pipelines/LinalgTensorOpt.cpp    | 138 ++-
 compiler/lib/Transforms/Bufferize.cpp         | 238 ++++-
 compiler/lib/Transforms/MemoryPlanning.cpp    |  47 +-
 compiler/lib/Utils/Utils.cpp                  |  11 +-
 compiler/numerical/hlo/canonicalize_ext.mlir  |  37 +
 compiler/numerical/hlo/hlo_move_down.mlir     |  22 +
 compiler/python/ByteIRModules.cpp             |   2 +
 compiler/python/byteir/compile.py             |  20 +-
 .../byteir/dialects/cat/ir_processor.py       |  30 +-
 compiler/python/byteir/tools/compiler.py      |   4 +-
 .../test/Conversion/HloToCat/fused_ops.mlir   |  66 +-
 .../ToByre/convertMemRefToByre.mlir           |  22 +-
 .../transform-op-fold-unit-extent-dims.mlir   |   6 +-
 .../transforms/ConvertOpToCustomCall.mlir     |  19 +
 .../Dialect/Mhlo/transforms/hloMoveDown.mlir  |  26 +-
 .../test/Dialect/Tensor/canonicalizeExt.mlir  |  11 +
 compiler/test/Transforms/canonicalizeExt.mlir |  37 +
 compiler/test/Transforms/memoryPlanning.mlir  |  12 +-
 compiler/tools/byteir-opt/CMakeLists.txt      |   1 +
 compiler/tools/byteir-opt/byteir-opt.cpp      |   4 +
 external/patches/AITemplate/logging.patch     |  17 +
 .../src/Conversion/OFRewriteToCustomCall.cpp  |   1 +
 .../test/of_rewrite_to_custom_call.mlir       |   2 +-
 .../torch-frontend/examples/demo/README.md    |  16 +
 .../torch-frontend/examples/demo/backend.py   | 195 ++++
 .../examples/demo/byteir_fusible_pattern.py   | 194 ++++
 .../examples/demo/compile_utils.py            |  92 ++
 .../torch-frontend/examples/demo/config.py    |  35 +
 .../examples/demo/fx_match_utils.py           |  40 +
 .../torch-frontend/examples/demo/main.py      | 220 ++++
 .../examples/demo/partitioners.py             | 940 +++++++++++++++++
 .../third_party/patches/einsum.patch          | 633 +++++++-----
 .../Conversion/ConvertTorchToCustomCall.cpp   |   4 +
 .../python/test/test_attn_rewrite.py          |  18 +
 .../python/test/test_fx_utils.py              |  18 +
 .../python/torch_frontend/__init__.py         |   3 +-
 .../python/torch_frontend/fx_utils.py         |  86 ++
 .../include/brt/core/framework/op_accessor.h  |   3 +
 .../cuda/providers/default/ait/ait.cc         |   3 +-
 .../cuda/providers/default/codegen/ptx.cc     |  60 +-
 .../default/flash_attn/flash_attn_bwd.cc      |   9 +-
 .../default/flash_attn/flash_attn_fwd.cc      |  19 +-
 .../default/flash_attn/kernels/flash_api.cu   |  11 +-
 .../default/flash_attn/kernels/flash_api.h    |   4 +-
 .../flash_attn/kernels/flash_bwd_kernel.h     |  28 +-
 .../flash_attn/kernels/flash_fwd_kernel.h     |  15 +-
 .../providers/default/tensor_generate/fill.cc |  45 +-
 runtime/lib/core/framework/op_accessor.cc     |  37 +
 .../providers/default/kernel/fill_test.cc     |  10 +-
 .../default/kernel/flash_attn_fwd_test.cc     |  14 +-
 .../test/include/brt/test/common/cuda/util.h  |  13 +
 runtime/test/test_files/fill_cuda.mlir        |   4 +-
 runtime/test/test_files/flash_attn_fwd.mlir   |   4 +-
 tests/numerical_test/execute.py               |   5 +-
 tests/numerical_test/main.py                  |  14 +-
 .../mlir_tests/ops/bmm_rrr_permute_f16.mlir   |   6 +
 .../mlir_tests/ops/concat2.mlir               |   6 +
 .../torch_dynamo_e2e_testing/backend.py       |  18 +-
 127 files changed, 6472 insertions(+), 584 deletions(-)
 create mode 100644 compiler/doc/codegen.md
 create mode 100644 compiler/include/byteir/Dialect/GPU/CMakeLists.txt
 create mode 100644 compiler/include/byteir/Dialect/GPU/Passes.h
 create mode 100644 compiler/include/byteir/Dialect/GPU/Passes.td
 create mode 100644 compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h
 create mode 100644 compiler/include/byteir/Dialect/Tensor/CMakeLists.txt
 create mode 100644 compiler/include/byteir/Dialect/Tensor/Passes.h
 create mode 100644 compiler/include/byteir/Dialect/Tensor/Passes.td
 create mode 100644 compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h
 create mode 100644 compiler/include/byteir/Pipelines/GPU/MappingForall.h
 create mode 100644 compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h
 create mode 100644 compiler/lib/Dialect/GPU/CMakeLists.txt
 create mode 100644 compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt
 create mode 100644 compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp
 create mode 100644 compiler/lib/Dialect/Tensor/Transforms/PassDetail.h
 create mode 100644 compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp
 create mode 100644 compiler/lib/Pipelines/GPU/MappingForall.cpp
 create mode 100644 compiler/lib/Pipelines/GPU/ReductionCodegen.cpp
 create mode 100644 external/patches/AITemplate/logging.patch
 create mode 100644 frontends/torch-frontend/examples/demo/README.md
 create mode 100644 frontends/torch-frontend/examples/demo/backend.py
 create mode 100644 frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py
 create mode 100644 frontends/torch-frontend/examples/demo/compile_utils.py
 create mode 100644 frontends/torch-frontend/examples/demo/config.py
 create mode 100644 frontends/torch-frontend/examples/demo/fx_match_utils.py
 create mode 100644 frontends/torch-frontend/examples/demo/main.py
 create mode 100644 frontends/torch-frontend/examples/demo/partitioners.py
 create mode 100644 frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py
 create mode 100644 tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir
 create mode 100644 tests/numerical_test/mlir_tests/ops/concat2.mlir

diff --git a/compiler/doc/codegen.md b/compiler/doc/codegen.md
new file mode 100644
index 000000000..9299a6900
--- /dev/null
+++ b/compiler/doc/codegen.md
@@ -0,0 +1,245 @@
+# Codegen pipeline
+
+## hlo-opt
+
+This pass pipeline is mainly used for clustering fusion groups on the mhlo dialect. Each fusion group is expected to be fused into a single kernel in the later codegen pipeline and is outlined as an independent kernel function.
+
+- `ReductionFusionPass` reduction fusion in producer direction
+
+- `ElementFusionPass` elementwise/broadcast/collapse_shape/expand_shape/etc. producer-consumer bi-directional fusion
+
+- `FusionOutliningPass` fusion group outlining
+
+## linalg-tensor-opt
+
+### reduction codegen transformations
+
+```
+  func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} {
+    %0 = mhlo.constant dense<0.000000e+00> : tensor<f32>
+    %1 = mhlo.convert %arg0 : (tensor<8192x50257xf16>) -> tensor<8192x50257xf32>
+    %2 = mhlo.reduce(%1 init: %0) across dimensions = [0] : (tensor<8192x50257xf32>, tensor<f32>) -> tensor<50257xf32>
+     reducer(%arg1: tensor<f32>, %arg2: tensor<f32>)  {
+      %3 = mhlo.add %arg1, %arg2 : tensor<f32>
+      mhlo.return %3 : tensor<f32>
+    }
+    return %2 : tensor<50257xf32>
+  }
+```
+
+This pass pipeline first converts the outlined mhlo fusion groups into the linalg dialect and then tries to fuse each linalg op with its producers/consumers.
+
+- `createLinalgElementwiseFusionExtPass` linalg fusion pass with our extension, see [linalg pass](linalg.md) for more details
+
+```
+func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} {
+	%cst = arith.constant 0.000000e+00 : f32
+	%0 = tensor.empty() : tensor<50257xf32>
+	%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<50257xf32>) -> tensor<50257xf32>
+	%2 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<8192x50257xf16>) outs(%1 : tensor<50257xf32>) {
+	^bb0(%in: f16, %out: f32):
+		%3 = arith.extf %in : f16 to f32
+		%4 = arith.addf %out, %3 : f32
+		linalg.yield %4 : f32
+	} -> tensor<50257xf32>
+	return %2 : tensor<50257xf32>
+}
+```
+
+[optional] Split grid-level reduction on `reduction` dimensions
+
+```
+func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} {
+	%cst = arith.constant 0.000000e+00 : f32
+	%0 = tensor.empty() : tensor<50257xf32>
+	%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<50257xf32>) -> tensor<50257xf32>
+	%expanded = tensor.expand_shape %arg0 [[0, 1], [2]] : tensor<8192x50257xf16> into tensor<32x256x50257xf16>
+	%2 = tensor.empty() : tensor<32x50257xf32>
+	%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<32x50257xf32>) -> tensor<32x50257xf32>
+	%4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%expanded : tensor<32x256x50257xf16>) outs(%3 : tensor<32x50257xf32>) attrs =  {__grid_reduction__} {
+	^bb0(%in: f16, %out: f32):
+		%6 = arith.extf %in : f16 to f32
+		%7 = arith.addf %out, %6 : f32
+		linalg.yield %7 : f32
+	} -> tensor<32x50257xf32>
+	%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["reduction", "parallel"]} ins(%4 : tensor<32x50257xf32>) outs(%1 : tensor<50257xf32>) attrs =  {__grid_reduction__} {
+	^bb0(%in: f32, %out: f32):
+		%6 = arith.addf %in, %out : f32
+		linalg.yield %6 : f32
+	} -> tensor<50257xf32>
+	return %5 : tensor<50257xf32>
+}
+```
+
+- Tiling reduction on `parallel` dimension and mapping tiled reductions to thread blocks
+
+```
+func.func private @Unknown0(%arg0: tensor<8192x50257xf16>) -> tensor<50257xf32> attributes {__byteir_reduction_fusion__} {
+	%cst = arith.constant 0.000000e+00 : f32
+	%0 = tensor.empty() : tensor<50257xf32>
+	%expanded = tensor.expand_shape %arg0 [[0, 1], [2]] : tensor<8192x50257xf16> into tensor<32x256x50257xf16>
+	%1 = tensor.empty() : tensor<32x50257xf32>
+	%2 = scf.forall (%arg1, %arg2) in (32, 1571) shared_outs(%arg3 = %1) -> (tensor<32x50257xf32>) {
+		%4 = affine.min #map(%arg2)
+		%5 = affine.apply #map1(%arg2)
+		%extracted_slice = tensor.extract_slice %expanded[%arg1, 0, %5] [1, 256, %4] [1, 1, 1] : tensor<32x256x50257xf16> to tensor<256x?xf16>
+		%extracted_slice_0 = tensor.extract_slice %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<32x50257xf32> to tensor<?xf32>
+		%6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<?xf32>) -> tensor<?xf32>
+		%7 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice : tensor<256x?xf16>) outs(%6 : tensor<?xf32>) {
+		^bb0(%in: f16, %out: f32):
+			%8 = arith.extf %in : f16 to f32
+			%9 = arith.addf %out, %8 : f32
+			linalg.yield %9 : f32
+		} -> tensor<?xf32>
+		scf.forall.in_parallel {
+			tensor.parallel_insert_slice %7 into %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<?xf32> into tensor<32x50257xf32>
+		}
+	} {mapping = [#gpu.block<y>, #gpu.block<x>]}
+	%3 = scf.forall (%arg1) in (1571) shared_outs(%arg2 = %0) -> (tensor<50257xf32>) {
+		// ...
+	} {mapping = [#gpu.block<x>]}
+	return %3 : tensor<50257xf32>
+}
+```
+
+- Block-level reduction codegen
+
+```
+%2 = scf.forall (%arg1, %arg2) in (32, 1571) shared_outs(%arg3 = %1) -> (tensor<32x50257xf32>) {
+	%4 = affine.min #map(%arg2)
+	%5 = affine.apply #map1(%arg2)
+	%extracted_slice = tensor.extract_slice %expanded[%arg1, 0, %5] [1, 256, %4] [1, 1, 1] : tensor<32x256x50257xf16> to tensor<256x?xf16>
+	%6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<32xf32>
+	%7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x32xf32>
+	%8 = scf.forall (%arg4, %arg5) in (16, 32) shared_outs(%arg6 = %7) -> (tensor<16x32xf32>) {
+		%17 = affine.min #map2(%arg4)
+		%18 = affine.min #map3(%arg4)
+		%19 = affine.apply #map4(%18, %17)
+		%20 = affine.min #map5(%arg5, %arg2)
+		%21 = affine.min #map6(%arg5, %arg2)
+		%22 = affine.apply #map4(%21, %20)
+		%23 = affine.apply #map7(%21, %20)
+		%extracted_slice_6 = tensor.extract_slice %extracted_slice[%17, %20] [%19, %22] [1, 1] : tensor<256x?xf16> to tensor<?x?xf16>
+		%padded = tensor.pad %extracted_slice_6 low[0, 0] high[0, %23] {
+		^bb0(%arg7: index, %arg8: index):
+			tensor.yield %cst : f16
+		} : tensor<?x?xf16> to tensor<16x1xf16>
+		%extracted_slice_7 = tensor.extract_slice %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor<16x32xf32> to tensor<f32>
+		%collapsed = tensor.collapse_shape %padded [[0, 1]] : tensor<16x1xf16> into tensor<16xf16>
+		%24 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_7 : tensor<f32>) -> tensor<f32>
+		%25 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["reduction"]} ins(%collapsed : tensor<16xf16>) outs(%24 : tensor<f32>) {
+		^bb0(%in: f16, %out: f32):
+			%26 = arith.extf %in : f16 to f32
+			%27 = arith.addf %out, %26 : f32
+			linalg.yield %27 : f32
+		} -> tensor<f32>
+		scf.forall.in_parallel {
+			tensor.parallel_insert_slice %25 into %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor<f32> into tensor<16x32xf32>
+		}
+	} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+	%expanded_1 = tensor.expand_shape %8 [[0, 1], [2]] : tensor<16x32xf32> into tensor<8x2x32xf32>
+	%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<8x32xf32>
+	%10 = scf.forall (%arg4, %arg5) in (8, 32) shared_outs(%arg6 = %9) -> (tensor<8x32xf32>) {
+		// ...
+	} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+	%expanded_2 = tensor.expand_shape %10 [[0, 1], [2]] : tensor<8x32xf32> into tensor<4x2x32xf32>
+	%11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<4x32xf32>
+	%12 = scf.forall (%arg4, %arg5) in (4, 32) shared_outs(%arg6 = %11) -> (tensor<4x32xf32>) {
+		// ...
+	} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+	%expanded_3 = tensor.expand_shape %12 [[0, 1], [2]] : tensor<4x32xf32> into tensor<2x2x32xf32>
+	%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x32xf32>
+	%14 = scf.forall (%arg4, %arg5) in (2, 32) shared_outs(%arg6 = %13) -> (tensor<2x32xf32>) {
+		// ...
+	} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+	%15 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %6) -> (tensor<32xf32>) {
+		// ...
+	} {mapping = [#gpu.thread<x>]}
+	%extracted_slice_4 = tensor.extract_slice %15[0] [%4] [1] : tensor<32xf32> to tensor<?xf32>
+	%extracted_slice_5 = tensor.extract_slice %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<32x50257xf32> to tensor<?xf32>
+	%16 = scf.forall (%arg4) in (512) shared_outs(%arg5 = %extracted_slice_5) -> (tensor<?xf32>) {
+		// ...
+	} {mapping = [#gpu.linear<x>]}
+	scf.forall.in_parallel {
+		tensor.parallel_insert_slice %16 into %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<?xf32> into tensor<32x50257xf32>
+	}
+} {mapping = [#gpu.block<y>, #gpu.block<x>]}
+```
+
+- Detensorize scalar linalg ops to arith ops and specialize `tensor.pad`
+
+```
+%2 = scf.forall (%arg1, %arg2) in (32, 1571) shared_outs(%arg3 = %1) -> (tensor<32x50257xf32>) {
+	%4 = affine.min #map(%arg2)
+	%5 = affine.apply #map1(%arg2)
+	%6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<32xf32>
+	%7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x32xf32>
+	%8 = scf.forall (%arg4, %arg5) in (16, 32) shared_outs(%arg6 = %7) -> (tensor<16x32xf32>) {
+		%17 = affine.min #map2(%arg5, %arg2)
+		%18 = affine.min #map3(%arg5, %arg2)
+		%19 = affine.apply #map4(%18, %17)
+		%20 = arith.cmpi ugt, %19, %c0 : index
+		%21 = scf.if %20 -> (f16) {
+			%84 = affine.apply #map5(%arg4)
+			%85 = affine.apply #map6(%arg2)[%17]
+			%extracted = tensor.extract %expanded[%arg1, %84, %85] : tensor<32x256x50257xf16>
+			scf.yield %extracted : f16
+		} else {
+			scf.yield %cst : f16
+		}
+		// ...
+		%78 = arith.extf %77 : f16 to f32
+		%79 = arith.addf %75, %78 : f32
+		%80 = arith.cmpi ugt, %19, %c0 : index
+		%81 = scf.if %80 -> (f16) {
+			%84 = affine.apply #map21(%arg4)
+			%85 = affine.apply #map6(%arg2)[%17]
+			%extracted = tensor.extract %expanded[%arg1, %84, %85] : tensor<32x256x50257xf16>
+			scf.yield %extracted : f16
+		} else {
+			scf.yield %cst : f16
+		}
+		%82 = arith.extf %81 : f16 to f32
+		%83 = arith.addf %79, %82 : f32
+		%extracted_slice_5 = tensor.extract_slice %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor<16x32xf32> to tensor<f32>
+		%inserted = tensor.insert %83 into %extracted_slice_5[] : tensor<f32>
+		scf.forall.in_parallel {
+			tensor.parallel_insert_slice %inserted into %arg6[%arg4, %arg5] [1, 1] [1, 1] : tensor<f32> into tensor<16x32xf32>
+		}
+	} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+	
+	// ...
+	%extracted_slice = tensor.extract_slice %15[0] [%4] [1] : tensor<32xf32> to tensor<?xf32>
+	%extracted_slice_4 = tensor.extract_slice %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<32x50257xf32> to tensor<?xf32>
+	%16 = scf.forall (%arg4) in (512) shared_outs(%arg5 = %extracted_slice_4) -> (tensor<?xf32>) {
+		%17 = affine.min #map22(%arg4)[%4]
+		%18 = affine.max #map23(%17)
+		%19 = affine.apply #map24(%arg4)[%4]
+		%extracted_slice_5 = tensor.extract_slice %extracted_slice[%19] [%18] [1] : tensor<?xf32> to tensor<?xf32>
+		%extracted_slice_6 = tensor.extract_slice %arg5[%19] [%18] [1] : tensor<?xf32> to tensor<?xf32>
+		%20 = linalg.copy {__byteir_gpu_tile_block_reduction_10} ins(%extracted_slice_5 : tensor<?xf32>) outs(%extracted_slice_6 : tensor<?xf32>) -> tensor<?xf32>
+		scf.forall.in_parallel {
+			tensor.parallel_insert_slice %20 into %arg5[%19] [%18] [1] : tensor<?xf32> into tensor<?xf32>
+		}
+	} {mapping = [#gpu.linear<x>]}
+	scf.forall.in_parallel {
+		tensor.parallel_insert_slice %16 into %arg3[%arg1, %5] [1, %4] [1, 1] : tensor<?xf32> into tensor<32x50257xf32>
+	}
+} {mapping = [#gpu.block<y>, #gpu.block<x>]}
+```
+
+- `structured.split_reduction` splits the reduction op along a `reduction` dimension to increase parallelism
+
+- `structured.tile_to_forall_op` tiles the reduction op along `parallel` dimensions into a `forall` op and maps it to block/linear/thread
+
+- `structured.fuse_into_containing_op` fuse init and pad operands into `scf.forall`
+
+- `structured.annotate` attaches any attribute to target ops; used to annotate the reduction op and to attach a memory space to `alloc_tensor`
+
+- `structured.tile` tile reduction op along `reduction` dimension to sequential for loop
+
+- `structured.detensorize` inlines the computation region of a linalg op whose operands have scalar tensor type
+
+- `LinalgCollapseLoopsPass` collapses consecutive `parallel` and `reduction` loops; this pass can work on both tensors and memrefs
+
+- `TensorPadSpecializationPass` specialize `tensor.extract` of pad op to conditional read
diff --git a/compiler/include/byteir/Analysis/UseRange.h b/compiler/include/byteir/Analysis/UseRange.h
index 07b5588ae..704afcf16 100644
--- a/compiler/include/byteir/Analysis/UseRange.h
+++ b/compiler/include/byteir/Analysis/UseRange.h
@@ -104,9 +104,24 @@ class UserangeAnalysis {
   using UsePosition = std::pair<size_t, mlir::Operation *>;
   using UsePositionList = std::vector<UsePosition>;
 
+  using AllocsIterator = mlir::bufferization::BufferPlacementAllocs::
+      AllocEntryList::const_iterator;
+  using AllocsIteratorRange = llvm::iterator_range<AllocsIterator>;
+
   UserangeAnalysis(Liveness *liveness) : liveness(liveness) {}
   UserangeAnalysis(mlir::Operation *op, Liveness *liveness,
                    const mlir::bufferization::BufferPlacementAllocs &allocs,
+                   const mlir::BufferViewFlowAnalysis &aliases)
+      : UserangeAnalysis(op, liveness, make_range(allocs.begin(), allocs.end()),
+                         aliases) {}
+  UserangeAnalysis(
+      mlir::Operation *op, Liveness *liveness,
+      const mlir::bufferization::BufferPlacementAllocs::AllocEntryList &allocs,
+      const mlir::BufferViewFlowAnalysis &aliases)
+      : UserangeAnalysis(op, liveness, make_range(allocs.begin(), allocs.end()),
+                         aliases) {}
+  UserangeAnalysis(mlir::Operation *op, Liveness *liveness,
+                   AllocsIteratorRange &&allocs,
                    const mlir::BufferViewFlowAnalysis &aliases);
   virtual ~UserangeAnalysis() {}
 
diff --git a/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h b/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h
index 230ea0e1a..92e29fbac 100644
--- a/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h
+++ b/compiler/include/byteir/Conversion/FuncToByre/FuncToByre.h
@@ -27,9 +27,15 @@ class ModuleOp;
 void populateFuncToByreTensorPattern(RewritePatternSet &patterns,
                                      bool appendArgTypes);
 
+void populateGPULaunchFuncToByrePattern(RewritePatternSet &patterns,
+                                        bool useBarePtrCallConv);
+
 std::unique_ptr<OperationPass<ModuleOp>>
 createConvertFuncToByreTensorPass(bool appendArgTypes = false);
 
+std::unique_ptr<Pass>
+createConvertGPULaunchFuncToByrePass(bool useBarePtrCallConv = false);
+
 } // namespace mlir
 
 #endif // BYTEIR_CONVERSION_FUNCTOBYRE_FUNCTOBYRE_H
diff --git a/compiler/include/byteir/Conversion/Passes.td b/compiler/include/byteir/Conversion/Passes.td
index 79c6acbb8..e5ee1c148 100644
--- a/compiler/include/byteir/Conversion/Passes.td
+++ b/compiler/include/byteir/Conversion/Passes.td
@@ -253,6 +253,9 @@ def CollectGPUKernel : Pass<"collect-gpu-kernel", "ModuleOp"> {
     Option<"moduleName", "module-name", "std::string",
             /*default=*/"\"unified\"",
             "Optional name for GPUModule to put all gpu kernels">,
+    Option<"removeHost", "remove-host", "bool",
+            /*default=*/"true",
+            "Whether to remove host part">,
   ];
 }
 
@@ -349,6 +352,26 @@ def ConvertFuncToByreTensor : Pass<"func-to-byre-tensor", "ModuleOp"> {
 }
 
 
+//===----------------------------------------------------------------------===//
+// GPULaunchFuncToByre
+//===----------------------------------------------------------------------===//
+
+def ConvertGPULaunchFuncToByre : Pass<"gpu-launch-func-to-byre"> {
+  let summary = "Convert gpu.launch_func op to byre compute op.";
+  let constructor = "mlir::createConvertGPULaunchFuncToByrePass()";
+  let dependentDialects = [
+    "mlir::byre::ByreDialect",
+    "mlir::gpu::GPUDialect"
+  ];
+
+  let options = [
+    Option<"useBarePtrCallConv", "use-bare-ptr-memref-call-conv", "bool",
+          /*default=*/"false",
+          "Replace memref arguments in GPU functions with bare pointers."
+          "All memrefs must have static shape">,
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // MemrefToByre
 //===----------------------------------------------------------------------===//
@@ -398,6 +421,9 @@ def MemrefCopyToLinalgPass :
     Option<"attachAttr", "attach-attr", "std::string",
             /*default=*/"",
             "An optional unit attribute attaching on target functions: <attr_name>">,
+    Option<"outlining", "outlining", "bool",
+            /*default=*/"true",
+            "Whether to outline the copy op to a new function">,
   ];
 }
 
diff --git a/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h b/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h
index eaed2dbcc..8e64ce9d3 100644
--- a/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h
+++ b/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h
@@ -55,7 +55,8 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLinalgExtToLinalgPass();
 
 std::unique_ptr<OperationPass<ModuleOp>>
 createMemrefCopyToLinalgPass(std::string anchorTag = "",
-                             std::string attachAttr = "");
+                             std::string attachAttr = "",
+                             bool outlining = true);
 
 } // namespace mlir
 
diff --git a/compiler/include/byteir/Conversion/ToPTX/ToPTX.h b/compiler/include/byteir/Conversion/ToPTX/ToPTX.h
index de932d857..c5185df6c 100644
--- a/compiler/include/byteir/Conversion/ToPTX/ToPTX.h
+++ b/compiler/include/byteir/Conversion/ToPTX/ToPTX.h
@@ -33,7 +33,8 @@ createGenPTXConfigPass(bool useBarePtrCallConv = false);
 
 // TODO move to general GPU
 std::unique_ptr<OperationPass<ModuleOp>>
-createCollectGPUKernelPass(const std::string &name = "unified");
+createCollectGPUKernelPass(const std::string &name = "unified",
+                           bool removeHost = true);
 
 } // namespace mlir
 
diff --git a/compiler/include/byteir/Dialect/CMakeLists.txt b/compiler/include/byteir/Dialect/CMakeLists.txt
index c7113baae..3e8627a7e 100644
--- a/compiler/include/byteir/Dialect/CMakeLists.txt
+++ b/compiler/include/byteir/Dialect/CMakeLists.txt
@@ -3,11 +3,13 @@ add_subdirectory(Affine)
 add_subdirectory(Byre)
 add_subdirectory(Cat)
 add_subdirectory(Ccl)
+add_subdirectory(GPU)
 add_subdirectory(Lace)
 add_subdirectory(Linalg)
 add_subdirectory(MemRef)
 add_subdirectory(mhlo)
 add_subdirectory(SCF)
 add_subdirectory(Shape)
+add_subdirectory(Tensor)
 add_subdirectory(Transform)
 add_subdirectory(Vector)
diff --git a/compiler/include/byteir/Dialect/Cat/IR/CatOps.td b/compiler/include/byteir/Dialect/Cat/IR/CatOps.td
index be953e7a6..c8ef07b19 100644
--- a/compiler/include/byteir/Dialect/Cat/IR/CatOps.td
+++ b/compiler/include/byteir/Dialect/Cat/IR/CatOps.td
@@ -281,6 +281,17 @@ def Cat_GemmRCRPermuteOp : Cat_Op<"gemm_rcr_permute", [Cat_CatOpInterface, Pure]
   let hasVerifier = 1;
 }
 
+def Cat_GemmRRRPermuteOp : Cat_Op<"gemm_rrr_permute", [Cat_CatOpInterface, Pure]> {
+  let summary = "gemm_rrr + permute0213 operator, output layout is [m / t1, t1, t2, n / t2]";
+  let arguments = (ins AnyTensor : $lhs,
+                       AnyTensor : $rhs,
+                       I64Attr : $t1,
+                       I64Attr : $t2);
+  let results = (outs AnyTensor : $output);
+
+  let hasVerifier = 1;
+}
+
 def Cat_LayerNormOp : Cat_Op<"layernorm", [Cat_CatOpInterface, Pure]> {
   let summary = "layernorm operator";
   let arguments = (ins AnyTensor : $input,
diff --git a/compiler/include/byteir/Dialect/GPU/CMakeLists.txt b/compiler/include/byteir/Dialect/GPU/CMakeLists.txt
new file mode 100644
index 000000000..53b17ff2e
--- /dev/null
+++ b/compiler/include/byteir/Dialect/GPU/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRGPU)
+add_public_tablegen_target(ByteIRGPUPassIncGen)
diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h
new file mode 100644
index 000000000..6a86b80b6
--- /dev/null
+++ b/compiler/include/byteir/Dialect/GPU/Passes.h
@@ -0,0 +1,37 @@
+//===- Passes.h ----------------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_PASSES_H
+#define BYTEIR_DIALECT_GPU_PASSES_H
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace gpu {
+class GPUFuncOp;
+} // namespace gpu
+
+#define GEN_PASS_DECL
+#include "byteir/Dialect/GPU/Passes.h.inc"
+
+/// Generate the code for registering transforms passes.
+#define GEN_PASS_REGISTRATION
+#include "byteir/Dialect/GPU/Passes.h.inc"
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_PASSES_H
diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td
new file mode 100644
index 000000000..862df14a2
--- /dev/null
+++ b/compiler/include/byteir/Dialect/GPU/Passes.td
@@ -0,0 +1,36 @@
+//===- Passes.td - Transforms pass definition file -------*--- tablegen -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef BYTEIR_DIALECT_GPU_PASSES
+#define BYTEIR_DIALECT_GPU_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+//===----------------------------------------------------------------------===//
+// ShmAllocaToWorkgroupArg
+//===----------------------------------------------------------------------===//
+
+def ShmAllocaToWorkgroupArg : Pass<"shm-alloca-to-workgroup-arg", "gpu::GPUModuleOp"> {
+  let summary = "Hoist shared memory alloca in gpu kernel to workgroup argument";
+  let dependentDialects = [
+    "gpu::GPUDialect",
+    "memref::MemRefDialect"
+  ];
+}
+
+#endif // BYTEIR_DIALECT_GPU_PASSES
diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h b/compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h
new file mode 100644
index 000000000..042d045dd
--- /dev/null
+++ b/compiler/include/byteir/Dialect/GPU/Transforms/Transforms.h
@@ -0,0 +1,32 @@
+//===- Transforms.h -------------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H
+#define BYTEIR_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+
+namespace mlir {
+namespace gpu {
+
+/// Hoist shared memory alloca in the gpu kernel `func` to workgroup argument.
+void hoistShmAllocaToWorkgroup(gpu::GPUFuncOp func);
+
+} // namespace gpu
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H
\ No newline at end of file
diff --git a/compiler/include/byteir/Dialect/Linalg/Passes.td b/compiler/include/byteir/Dialect/Linalg/Passes.td
index 81c411b53..5393ef410 100644
--- a/compiler/include/byteir/Dialect/Linalg/Passes.td
+++ b/compiler/include/byteir/Dialect/Linalg/Passes.td
@@ -166,6 +166,16 @@ def LinalgCollapseLoops : Pass<"linalg-collapse-loops", "func::FuncOp"> {
     "tensor::TensorDialect",
     "memref::MemRefDialect"
   ];
+
+  let options = [
+    Option<"iteratorType", "iterator-type", "mlir::utils::IteratorType",
+           /*default=*/"mlir::utils::IteratorType::parallel", "iterator type",
+           [{llvm::cl::values(
+             clEnumValN(mlir::utils::IteratorType::parallel, "parallel",
+                        "parallel iterator type"),
+             clEnumValN(mlir::utils::IteratorType::reduction, "reduction",
+                        "reduction iterator type"))}]>,
+  ];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td b/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td
index ac7d80126..dc918c661 100644
--- a/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td
+++ b/compiler/include/byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.td
@@ -72,6 +72,18 @@ def CollapseDimsOp : Op<Transform_Dialect, "structured.collapse_dims",
   }];
 }
 
+def DetensorizeOp : Op<Transform_Dialect, "structured.detensorize",
+    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
+     DeclareOpInterfaceMethods<TransformOpInterface>]> {
+  let description = [{
+    Detensorize linalg ops.
+  }];
+
+  let arguments = (ins PDL_Operation:$target);
+
+  let assemblyFormat = "$target attr-dict";
+}
+
 def FoldUnitExtentDimsOp : Op<Transform_Dialect, "structured.fold_unit_extent_dims",
        [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
         DeclareOpInterfaceMethods<TransformOpInterface>]> {
@@ -245,4 +257,31 @@ def FuseOperandsOp : Op<Transform_Dialect, "structured.fuse_operands",
   let hasVerifier = 1;
 }
 
+
+def InsertSliceToCopyExtOp :
+  Op<Transform_Dialect, "structured.insert_slice_to_copy_ext",
+    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
+     TransformEachOpTrait, TransformOpInterface]> {
+  let description = [{
+    insert_slice_to_copy_ext extension.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target);
+  let results = (outs TransformHandleTypeInterface:$transformed);
+
+  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) ";
+
+  let builders = [
+    OpBuilder<(ins "Value":$target)>,
+  ];
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::Operation *target,
+        ::mlir::transform::ApplyToEachResultList &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
+
 #endif // BYTEIR_DIALECT_LINALG_TRANSFORMOPS_LINALG_EXT_TRANSFORMOPS
\ No newline at end of file
diff --git a/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h b/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h
index 556a43c41..b5ef16af4 100644
--- a/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h
+++ b/compiler/include/byteir/Dialect/Linalg/Transforms/LinalgCollapseLoops.h
@@ -18,6 +18,7 @@
 #ifndef BYTEIR_DIALECT_LINALG_TRANSFORMS_LINALGCOLLAPSELOOPS_H
 #define BYTEIR_DIALECT_LINALG_TRANSFORMS_LINALGCOLLAPSELOOPS_H
 
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/Pass/Pass.h"
 #include <memory>
 
@@ -26,7 +27,11 @@ namespace func {
 class FuncOp;
 } // namespace func
 
-std::unique_ptr<OperationPass<func::FuncOp>> createLinalgCollapseLoops();
+#define GEN_PASS_DECL_LINALGCOLLAPSELOOPS
+#include "byteir/Dialect/Linalg/Passes.h.inc"
+
+std::unique_ptr<OperationPass<func::FuncOp>> createLinalgCollapseLoops(
+    utils::IteratorType iteratorType = utils::IteratorType::parallel);
 
 } // namespace mlir
 
diff --git a/compiler/include/byteir/Dialect/Tensor/CMakeLists.txt b/compiler/include/byteir/Dialect/Tensor/CMakeLists.txt
new file mode 100644
index 000000000..a4f8266f9
--- /dev/null
+++ b/compiler/include/byteir/Dialect/Tensor/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTensor)
+add_public_tablegen_target(ByteIRTensorPassIncGen)
diff --git a/compiler/include/byteir/Dialect/Tensor/Passes.h b/compiler/include/byteir/Dialect/Tensor/Passes.h
new file mode 100644
index 000000000..be0c09f87
--- /dev/null
+++ b/compiler/include/byteir/Dialect/Tensor/Passes.h
@@ -0,0 +1,31 @@
+//===- Passes.h ---------------------------------------------------- C++ --===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_TENSOR_PASSES_H
+#define BYTEIR_DIALECT_TENSOR_PASSES_H
+
+#include "byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h"
+
+namespace mlir {
+
+/// Generate the code for registering transforms passes.
+#define GEN_PASS_REGISTRATION
+#include "byteir/Dialect/Tensor/Passes.h.inc"
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_TENSOR_PASSES_H
diff --git a/compiler/include/byteir/Dialect/Tensor/Passes.td b/compiler/include/byteir/Dialect/Tensor/Passes.td
new file mode 100644
index 000000000..cdfa73c4d
--- /dev/null
+++ b/compiler/include/byteir/Dialect/Tensor/Passes.td
@@ -0,0 +1,35 @@
+//===- Passes.td - Transforms pass definition file -------*--- tablegen -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_TENSOR_PASSES
+#define BYTEIR_DIALECT_TENSOR_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+//===----------------------------------------------------------------------===//
+// TensorPadSpecialization
+//===----------------------------------------------------------------------===//
+
+def TensorPadSpecialization : Pass<"tensor-pad-specialization", ""> {
+  let summary = "Specialize tensor.pad op";
+  let constructor = "mlir::createTensorPadSpecializationPass()";
+  let dependentDialects = [
+    "scf::SCFDialect",
+  ];
+}
+
+#endif // BYTEIR_DIALECT_TENSOR_PASSES
diff --git a/compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h b/compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h
new file mode 100644
index 000000000..72f38cd03
--- /dev/null
+++ b/compiler/include/byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h
@@ -0,0 +1,30 @@
+//===- TensorPadSpecialization.h ---------------------------------- C++ --===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_TENSOR_TRANSFORMS_TENSORPADSPECIALIZATION_H
+#define BYTEIR_DIALECT_TENSOR_TRANSFORMS_TENSORPADSPECIALIZATION_H
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace mlir {
+/// Creates the "tensor-pad-specialization" pass (specializes tensor.pad ops).
+std::unique_ptr<Pass> createTensorPadSpecializationPass();
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_TENSOR_TRANSFORMS_TENSORPADSPECIALIZATION_H
diff --git a/compiler/include/byteir/Dialect/Transform/Passes.td b/compiler/include/byteir/Dialect/Transform/Passes.td
index 6e82c4b3f..49a471a71 100644
--- a/compiler/include/byteir/Dialect/Transform/Passes.td
+++ b/compiler/include/byteir/Dialect/Transform/Passes.td
@@ -34,6 +34,24 @@ def TransformDialectInterpreter : Pass<"transform-dialect-interpreter", "ModuleO
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// DetensorizeTransformInsertion
+//===----------------------------------------------------------------------===//
+
+def DetensorizeTransformInsertion : Pass<"insert-detensorize-transform", "ModuleOp"> {
+  let summary = "Insert detensorize transform IR to functions.";
+  let constructor = "mlir::createDetensorizeTransformInsertionPass()";
+  let options = [
+    Option<"funcAnchorAttr", "func-anchor", "std::string",
+            /*default=*/"",
+            "An optional Unit attribute anchoring on target functions.">,
+    Option<"matchPrefix", "match-prefix", "std::string",
+            /*default=*/"\"__byteir_detensorize\"",
+            "An optional match prefix attribute on target ops.">,
+  ];
+}
+
+
 //===----------------------------------------------------------------------===//
 // FuseExtTransformInsertion
 //===----------------------------------------------------------------------===//
@@ -60,4 +78,22 @@ def FuseExtTransformInsertion : Pass<"insert-fuse-ext-transform", "ModuleOp"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// RewriteInDPSTransformInsertion
+//===----------------------------------------------------------------------===//
+// Keep option defaults in sync with createRewriteInDPSTransformInsertionPass().
+def RewriteInDPSTransformInsertion : Pass<"insert-rewrite-in-dps-transform", "ModuleOp"> {
+  let summary = "Insert rewrite in destination-passing-style transform IR to functions.";
+  let constructor = "mlir::createRewriteInDPSTransformInsertionPass()";
+  let options = [
+    Option<"funcAnchorAttr", "func-anchor", "std::string",
+            /*default=*/"",
+            "An optional Unit attribute anchoring on target functions.">,
+    Option<"matchPrefix", "match-prefix", "std::string",
+            /*default=*/"\"__byteir_rewrite_in_dps\"",
+            "An optional match prefix attribute on target ops.">,
+  ];
+}
+
+
 #endif // BYTEIR_DIALECT_TRANSFORM_PASSES
diff --git a/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h b/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h
index 931b6d0bc..5741663ac 100644
--- a/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h
+++ b/compiler/include/byteir/Dialect/Transform/Transforms/TransformInsertion.h
@@ -37,12 +37,22 @@ struct TransformInsertionConfig {
 std::unique_ptr<OperationPass<ModuleOp>>
 createGenericTransformInsertionPass(const TransformInsertionConfig &config);
 
+std::unique_ptr<OperationPass<ModuleOp>>
+createDetensorizeTransformInsertionPass(
+    const std::string &funcAnchor = "",
+    const std::string &matchPrefix = "__byteir_detensorize");
+
 std::unique_ptr<OperationPass<ModuleOp>> createFuseExtTransformInsertionPass(
     const std::string &funcAnchor = "",
     const std::string &matchPrefix = "unknown",
     const std::string &tileSizeAttrName = "",
     const std::string &tileInterchangeAttrName = "",
     const bool keepIntermediates = false);
+
+std::unique_ptr<OperationPass<ModuleOp>>
+createRewriteInDPSTransformInsertionPass(
+    const std::string &funcAnchor = "",
+    const std::string &matchPrefix = "__byteir_rewrite_in_dps");
 } // namespace mlir
 
 #endif // BYTEIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINSERTION_H
\ No newline at end of file
diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h b/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h
index fa0a1b94c..78e3aa1bd 100644
--- a/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h
+++ b/compiler/include/byteir/Dialect/mhlo/Transforms/CanonicalizeExt.h
@@ -45,6 +45,7 @@ class ReshapeOp;
 class MulOp;
 class SliceOp;
 class ReverseOp;
+class GatherOp;
 
 // Most of these will push back to upstream
 // So this file only includes patterns, not a pass.
@@ -143,6 +144,8 @@ LogicalResult simplifyTransposeReshapeTranspose(mhlo::TransposeOp op,
 LogicalResult foldReverseWithConstant(mhlo::ReverseOp op,
                                       PatternRewriter &rewriter);
 
+LogicalResult foldGatherWithInput(mhlo::GatherOp op, PatternRewriter &rewriter);
+
 // populate canonicalizeExt patterns
 void populateCanonicalizeExtPatterns(RewritePatternSet &patterns,
                                      MLIRContext *context,
diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h b/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h
index e84497390..b36ebc426 100644
--- a/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h
+++ b/compiler/include/byteir/Dialect/mhlo/Transforms/ConvertOpToCustomCall.h
@@ -1,4 +1,4 @@
-//===- ConvertRngToCustomCall.h -------------------------------*--- C++ -*-===//
+//===- ConvertOpToCustomCall.h --------------------------------*--- C++ -*-===//
 //
 // Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,6 +27,8 @@ class ModuleOp;
 
 void populateRngPatternToCustomCall(RewritePatternSet &patterns);
 
+void populateFlashFwdRewritePattern(RewritePatternSet &patterns);
+
 std::unique_ptr<OperationPass<ModuleOp>>
 createConvertOpToCustomCallPass(llvm::StringRef anchor = "");
 
diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h b/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h
index be1a10a93..df10ecfdc 100644
--- a/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h
+++ b/compiler/include/byteir/Dialect/mhlo/Transforms/GenericFusionCommon.h
@@ -41,6 +41,7 @@ struct GenericFuserConfig {
   std::function<bool(Operation *)> fuse_trigger;
   std::function<bool(Operation *, Operation *)> fuse_with;
   std::function<bool(Operation *)> valid_single_op;
+  std::function<bool(MhloFusionPattern)> valid_fusion_pattern;
 };
 
 //===----------------------------------------------------------------------===//
@@ -115,6 +116,9 @@ class GenericFusionPass : public GenericFusionBase<DerivedT> {
 
     for (auto it = plan.rbegin(); it != plan.rend(); ++it) {
       auto &pattern = *it;
+      // An unset predicate accepts every pattern (no bad_function_call).
+      if (const auto &ok = fuse_config.valid_fusion_pattern; ok && !ok(pattern))
+        continue;
       if (pattern.size() > 1) {
         applyMhloFusionPattern(pattern, fuse_config.fuse_attr);
       } else if (this->clusterSingleOp.getValue()) {
diff --git a/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h b/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h
index f49315bbe..6a0e10cb7 100644
--- a/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h
+++ b/compiler/include/byteir/Dialect/mhlo/Transforms/HloFuser.h
@@ -46,6 +46,10 @@ constexpr StringRef getByteIRMatmulEpilogueFusionAttrName() {
   return "__byteir_matmul_epilogue_fusion__";
 }
 
+constexpr StringRef getByteIRReductionFusionAttrName() {
+  return "__byteir_reduction_fusion__";
+}
+
 constexpr StringRef getByteIRTrivialFusionAttrName() {
   return "__byteir_trivial_fusion__";
 }
@@ -102,6 +106,8 @@ std::unique_ptr<OperationPass<func::FuncOp>> createTrivialFusionPass();
 
 std::unique_ptr<OperationPass<func::FuncOp>> createHloAggressiveFusionPass();
 
+std::unique_ptr<OperationPass<func::FuncOp>> createReductionFusionPass();
+
 } // namespace mlir
 
 #endif // BYTEIR_DIALECT_MHLO_TRANSFORMS_HLOFUSER_H
diff --git a/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h b/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h
index af68cf38a..83a53e329 100644
--- a/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h
+++ b/compiler/include/byteir/Dialect/mhlo/Util/CustomCallUtil.h
@@ -85,6 +85,14 @@ constexpr llvm::StringRef getRngUniformName() {
   return CUSTOM_CALL_NAME_PREFIX "rng_uniform";
 }
 
+constexpr llvm::StringRef getFlashAttnFwdName() {
+  return CUSTOM_CALL_NAME_PREFIX "flash_attn_fwd";
+}
+
+constexpr llvm::StringRef getFlashAttnBwdName() {
+  return CUSTOM_CALL_NAME_PREFIX "flash_attn_bwd";
+}
+
 constexpr llvm::StringRef getDynamicPartitionName() {
   return TF_NAME_PREFIX "DynamicPartition";
 }
diff --git a/compiler/include/byteir/Pipelines/GPU/MappingForall.h b/compiler/include/byteir/Pipelines/GPU/MappingForall.h
new file mode 100644
index 000000000..202cc1e40
--- /dev/null
+++ b/compiler/include/byteir/Pipelines/GPU/MappingForall.h
@@ -0,0 +1,53 @@
+//===- MappingForall.h ---------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H
+#define BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+
+namespace mlir {
+struct GPUMappingForallOptions
+    : public PassPipelineOptions<GPUMappingForallOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_split_grid_reduction")};
+  // TODO: option for grid/block dims hint
+};
+
+void createGPUMappingForallTransform(OpPassManager &pm,
+                                     const GPUMappingForallOptions &options);
+
+inline void registerGPUMappingForallPipelines() {
+  PassPipelineRegistration<GPUMappingForallOptions>(
+      "insert-gpu-mapping-forall-transform",
+      "Insert transformation IR to mapping forall to corresponding blocks and "
+      "threads",
+      createGPUMappingForallTransform);
+}
+
+} // namespace mlir
+
+#endif // BYTEIR_PIPELINES_GPU_MAPPING_FORALL_H
diff --git a/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h b/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h
new file mode 100644
index 000000000..7aea80d51
--- /dev/null
+++ b/compiler/include/byteir/Pipelines/GPU/ReductionCodegen.h
@@ -0,0 +1,153 @@
+//===- ReductionCodegen.h -----------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_PIPELINES_GPU_REDUCTION_CODEGEN_H
+#define BYTEIR_PIPELINES_GPU_REDUCTION_CODEGEN_H
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Pass/PassRegistry.h"
+
+namespace mlir {
+struct GPUSplitGridReductionOptions
+    : public PassPipelineOptions<GPUSplitGridReductionOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_split_grid_reduction")};
+  Option<int64_t> splitFactor{*this, "split-factor",
+                              llvm::cl::desc("split factor"),
+                              llvm::cl::init(32)};
+};
+
+struct GPUTileGridReductionOptions
+    : public PassPipelineOptions<GPUTileGridReductionOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_grid_reduction")};
+  Option<int64_t> warpSize{*this, "warp-size", llvm::cl::desc("warp size"),
+                           llvm::cl::init(32)};
+  Option<int64_t> blockSize{*this, "block-size", llvm::cl::desc("block size"),
+                            llvm::cl::init(256)};
+  Option<bool> usingForall{*this, "using-forall",
+                           llvm::cl::desc("using forall"),
+                           llvm::cl::init(true)};
+};
+
+struct GPUSplitBlockReductionOptions
+    : public PassPipelineOptions<GPUSplitBlockReductionOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_split_block_reduction")};
+  Option<int64_t> splitFactor{*this, "split-factor",
+                              llvm::cl::desc("split factor"),
+                              llvm::cl::init(32)};
+  Option<int64_t> warpSize{*this, "warp-size", llvm::cl::desc("warp size"),
+                           llvm::cl::init(32)};
+};
+
+struct GPUTileBlockReductionOptions
+    : public PassPipelineOptions<GPUTileBlockReductionOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_block_reduction")};
+  Option<int64_t> warpSize{*this, "warp-size", llvm::cl::desc("warp size"),
+                           llvm::cl::init(32)};
+  Option<int64_t> blockSize{*this, "block-size", llvm::cl::desc("block size"),
+                            llvm::cl::init(256)};
+  Option<bool> usingForall{*this, "using-forall",
+                           llvm::cl::desc("using forall"),
+                           llvm::cl::init(true)};
+};
+
+struct GPUTileThreadReductionOptions
+    : public PassPipelineOptions<GPUTileThreadReductionOptions> {
+  Option<std::string> funcAnchor{
+      *this, "func-anchor",
+      llvm::cl::desc(
+          "An optional Unit attribute anchoring on target functions."),
+      llvm::cl::init("")};
+  Option<std::string> annotatePrefix{
+      *this, "annotate-prefix",
+      llvm::cl::desc("An optional annotate prefix attribute on target ops."),
+      llvm::cl::init("__byteir_gpu_tile_thread_reduction")};
+};
+
+void createGPUSplitGridReductionTransform(
+    OpPassManager &pm, const GPUSplitGridReductionOptions &options);
+void createGPUTileGridReductionTransform(
+    OpPassManager &pm, const GPUTileGridReductionOptions &options);
+void createGPUSplitBlockReductionTransform(
+    OpPassManager &pm, const GPUSplitBlockReductionOptions &options);
+void createGPUTileBlockReductionTransform(
+    OpPassManager &pm, const GPUTileBlockReductionOptions &options);
+void createGPUTileThreadReductionTransform(
+    OpPassManager &pm, const GPUTileThreadReductionOptions &options);
+
+inline void registerGPUReductionCodegenPipelines() {
+  PassPipelineRegistration<GPUSplitGridReductionOptions>(
+      "insert-gpu-split-grid-reduction-transform",
+      "Insert transformation IR to split linalg reduction op",
+      createGPUSplitGridReductionTransform);
+
+  PassPipelineRegistration<GPUTileGridReductionOptions>(
+      "insert-gpu-tile-grid-reduction-transform",
+      "Insert transformation IR to tile linalg reduction op",
+      createGPUTileGridReductionTransform);
+
+  PassPipelineRegistration<GPUSplitBlockReductionOptions>(
+      "insert-gpu-split-block-reduction-transform",
+      "Insert transformation IR to split linalg reduction op",
+      createGPUSplitBlockReductionTransform);
+
+  PassPipelineRegistration<GPUTileBlockReductionOptions>(
+      "insert-gpu-tile-block-reduction-transform",
+      "Insert transformation IR to tile linalg reduction op",
+      createGPUTileBlockReductionTransform);
+
+  PassPipelineRegistration<GPUTileThreadReductionOptions>(
+      "insert-gpu-tile-thread-reduction-transform",
+      "Insert transformation IR to tile linalg reduction op",
+      createGPUTileThreadReductionTransform);
+}
+
+} // namespace mlir
+
+#endif // BYTEIR_PIPELINES_GPU_REDUCTION_CODEGEN_H
diff --git a/compiler/include/byteir/Pipelines/InitAllPipelines.h b/compiler/include/byteir/Pipelines/InitAllPipelines.h
index 0cdf1e7ce..2a653f898 100644
--- a/compiler/include/byteir/Pipelines/InitAllPipelines.h
+++ b/compiler/include/byteir/Pipelines/InitAllPipelines.h
@@ -35,7 +35,9 @@
 #include "byteir/Pipelines/GPU/ElementwiseCodegen.h"
 #include "byteir/Pipelines/GPU/GPUOpt.h"
 #include "byteir/Pipelines/GPU/LinalgMemrefGPU.h"
+#include "byteir/Pipelines/GPU/MappingForall.h"
 #include "byteir/Pipelines/GPU/NVVMCodegen.h"
+#include "byteir/Pipelines/GPU/ReductionCodegen.h"
 
 #include "byteir/Pipelines/Host/Codegen.h"
 #include "byteir/Pipelines/Host/HostOpt.h"
@@ -65,6 +67,8 @@ inline void registerAllByteIRGPUPipelines() {
   registerLinalgMemrefGPUPipeline();
   registerMatmulEpilogueGPUPipeline();
   registerGPUElementwiseCodegenPipelines();
+  registerGPUReductionCodegenPipelines();
+  registerGPUMappingForallPipelines();
 }
 
 inline void registerAllByteIRHostPipelines() {
diff --git a/compiler/include/byteir/Transforms/MemoryPlanning.h b/compiler/include/byteir/Transforms/MemoryPlanning.h
index 74a83b88c..c1ad2a181 100644
--- a/compiler/include/byteir/Transforms/MemoryPlanning.h
+++ b/compiler/include/byteir/Transforms/MemoryPlanning.h
@@ -23,18 +23,19 @@
 #include <memory>
 
 namespace mlir {
+class FunctionOpInterface;
 class Value;
 namespace func {
 class FuncOp;
 } // namespace func
 
-std::unique_ptr<OperationPass<func::FuncOp>> createMemoryPlanningPass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createMemoryPlanningPass();
 
 /// couldReuseBuffer is a user provided callback which receives a Value as
 /// parameter and returns whether the allocation corresponding to the Value can
 /// be reused
-std::unique_ptr<OperationPass<func::FuncOp>>
-createMemoryPlanningPass(size_t alignment,
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createMemoryPlanningPass(size_t alignment, bool alloca, size_t memSpace,
                          std::function<bool(Value)> couldReuseAllocation);
 
 } // namespace mlir
diff --git a/compiler/include/byteir/Transforms/Passes.td b/compiler/include/byteir/Transforms/Passes.td
index 4c6a9cb1b..8ac2f3e7c 100644
--- a/compiler/include/byteir/Transforms/Passes.td
+++ b/compiler/include/byteir/Transforms/Passes.td
@@ -237,7 +237,7 @@ def LoopTag : Pass<"loop-tag", "func::FuncOp"> {
 //===----------------------------------------------------------------------===//
 // Memory planning
 //===----------------------------------------------------------------------===//
-def MemoryPlanning: Pass<"memory-planning", "mlir::func::FuncOp"> {
+def MemoryPlanning: InterfacePass<"memory-planning", "mlir::FunctionOpInterface"> {
   let summary = "Pass to perform static memory planning";
   let constructor = "mlir::createMemoryPlanningPass()";
   let dependentDialects = [
diff --git a/compiler/lib/Analysis/UseRange.cpp b/compiler/lib/Analysis/UseRange.cpp
index f8a6e1076..88708c154 100644
--- a/compiler/lib/Analysis/UseRange.cpp
+++ b/compiler/lib/Analysis/UseRange.cpp
@@ -383,10 +383,9 @@ void UseInterval::mergeAndEraseContiguousIntervals(
     iter = interval.erase(std::next(iter), next);
 }
 
-UserangeAnalysis::UserangeAnalysis(
-    Operation *op, byteir::Liveness *liveness,
-    const bufferization::BufferPlacementAllocs &allocs,
-    const BufferViewFlowAnalysis &aliases)
+UserangeAnalysis::UserangeAnalysis(Operation *op, byteir::Liveness *liveness,
+                                   AllocsIteratorRange &&allocs,
+                                   const BufferViewFlowAnalysis &aliases)
     : liveness(liveness) {
   // Walk over all operations and map them to an ID.
   op->walk([&](Operation *operation) {
diff --git a/compiler/lib/CAPI/CMakeLists.txt b/compiler/lib/CAPI/CMakeLists.txt
index d8bc56463..1ae8a97f2 100644
--- a/compiler/lib/CAPI/CMakeLists.txt
+++ b/compiler/lib/CAPI/CMakeLists.txt
@@ -33,6 +33,7 @@ add_mlir_public_c_api_library(ByteIRCAPI
   # dialect specific passes
   ByteIRAffinePasses
   ByteIRByrePasses
+  ByteIRGPUPasses
   ByteIRLinalgPasses
   ByteIRMemRefPasses
   ByteIRMhloPasses
diff --git a/compiler/lib/CAPI/Passes.cpp b/compiler/lib/CAPI/Passes.cpp
index e686dab22..6707875c9 100644
--- a/compiler/lib/CAPI/Passes.cpp
+++ b/compiler/lib/CAPI/Passes.cpp
@@ -21,10 +21,12 @@
 #include "byteir/Dialect/Ace/Passes.h"
 #include "byteir/Dialect/Affine/Passes.h"
 #include "byteir/Dialect/Byre/Passes.h"
+#include "byteir/Dialect/GPU/Passes.h"
 #include "byteir/Dialect/Linalg/Passes.h"
 #include "byteir/Dialect/MemRef/Passes.h"
 #include "byteir/Dialect/SCF/Passes.h"
 #include "byteir/Dialect/Shape/Passes.h"
+#include "byteir/Dialect/Tensor/Passes.h"
 #include "byteir/Dialect/Transform/Passes.h"
 #include "byteir/Dialect/mhlo/Passes.h"
 #include "byteir/Pipelines/InitAllPipelines.h"
@@ -45,11 +47,13 @@ void byteirRegisterAllPasses() {
   registerByteIRAcePasses();
   registerByteIRAffinePasses();
   registerByteIRByrePasses();
+  registerByteIRGPUPasses();
   registerByteIRLinalgPasses();
   registerByteIRMemRefPasses();
   registerByteIRMhloPassesExt();
   registerByteIRSCFPasses();
   registerByteIRShapePasses();
+  registerByteIRTensorPasses();
   registerByteIRTransformPasses();
 
   // pipelines
diff --git a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp
index 45e87ca16..bd1dce952 100644
--- a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp
+++ b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp
@@ -19,7 +19,9 @@
 #include "byteir/Dialect/Byre/ByreDialect.h"
 #include "byteir/Dialect/Byre/Common.h"
 #include "byteir/Utils/Utils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -86,6 +88,53 @@ class ConvertCallOpToByreTensorPattern : public OpRewritePattern<func::CallOp> {
   bool appendArgTypes;
 };
 
+// Rewrites gpu.launch_func into byre.compute "PTXOp", recording the kernel name
+// and the grid/block sizes (which must be arith index constants) as attributes.
+class ConvertGPULaunchFuncToByrePattern
+    : public OpRewritePattern<gpu::LaunchFuncOp> {
+public:
+  ConvertGPULaunchFuncToByrePattern(MLIRContext *ctx, bool useBarePtrCallConv)
+      : OpRewritePattern<gpu::LaunchFuncOp>(ctx),
+        useBarePtrCallConv(useBarePtrCallConv) {}
+
+  LogicalResult matchAndRewrite(gpu::LaunchFuncOp launchOp,
+                                PatternRewriter &rewriter) const override {
+    // Resolve every launch dimension before creating anything: a dimension
+    // that is not a constant index must fail the match instead of crashing.
+    auto grid = launchOp.getGridSizeOperandValues();
+    auto block = launchOp.getBlockSizeOperandValues();
+    const std::pair<StringRef, Value> dims[] = {
+        {"GridSize.x", grid.x},   {"GridSize.y", grid.y},
+        {"GridSize.z", grid.z},   {"BlockSize.x", block.x},
+        {"BlockSize.y", block.y}, {"BlockSize.z", block.z}};
+    SmallVector<int64_t, 6> sizes;
+    for (const auto &dim : dims) {
+      auto cst = dim.second.getDefiningOp<arith::ConstantIndexOp>();
+      if (!cst)
+        return rewriter.notifyMatchFailure(launchOp, "dynamic launch size");
+      sizes.push_back(cst.value());
+    }
+    auto computeOp = rewriter.create<byre::ComputeOp>(
+        launchOp->getLoc(), TypeRange(), "PTXOp", launchOp.getKernelOperands(),
+        /*memEffects*/ ArrayAttr());
+    computeOp->setAttr(
+        rewriter.getStringAttr("kernel_name"),
+        rewriter.getStringAttr(launchOp.getKernelName().getValue()));
+    for (size_t i = 0; i < sizes.size(); ++i)
+      computeOp->setAttr(dims[i].first, rewriter.getI32IntegerAttr(sizes[i]));
+    // Mark the op with the bare-pointer call convention when requested.
+    if (useBarePtrCallConv) {
+      computeOp->setAttr(byre::getKernelCallConventionAttrName(),
+                         rewriter.getStringAttr("bare_ptr"));
+    }
+    rewriter.eraseOp(launchOp);
+    return success();
+  }
+
+private:
+  bool useBarePtrCallConv;
+};
+
 struct ConvertFuncToByreTensorPass
     : public ConvertFuncToByreTensorBase<ConvertFuncToByreTensorPass> {
 public:
@@ -104,6 +153,24 @@ struct ConvertFuncToByreTensorPass
     }
   }
 };
+
+// Pass that greedily applies the gpu.launch_func -> byre.compute rewrite.
+struct ConvertGPULaunchFuncToByrePass
+    : public ConvertGPULaunchFuncToByreBase<ConvertGPULaunchFuncToByrePass> {
+  ConvertGPULaunchFuncToByrePass(bool useBarePtrCallConv) {
+    // Copy the constructor argument into the base-class member of that name.
+    this->useBarePtrCallConv = useBarePtrCallConv;
+  }
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateGPULaunchFuncToByrePattern(patterns, useBarePtrCallConv);
+    LogicalResult status =
+        applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    // Propagate a failure of the greedy driver as a pass failure.
+    if (failed(status))
+      signalPassFailure();
+  }
+};
 }; // namespace
 
 void mlir::populateFuncToByreTensorPattern(RewritePatternSet &patterns,
@@ -112,7 +179,18 @@ void mlir::populateFuncToByreTensorPattern(RewritePatternSet &patterns,
                                                  appendArgTypes);
 }
 
+void mlir::populateGPULaunchFuncToByrePattern(RewritePatternSet &patterns,
+                                              bool useBarePtrCallConv) {
+  patterns.add<ConvertGPULaunchFuncToByrePattern>(patterns.getContext(),
+                                                  useBarePtrCallConv);
+}
+
 std::unique_ptr<OperationPass<ModuleOp>>
 mlir::createConvertFuncToByreTensorPass(bool appendArgTypes) {
   return std::make_unique<ConvertFuncToByreTensorPass>(appendArgTypes);
 }
+
+std::unique_ptr<Pass>
+mlir::createConvertGPULaunchFuncToByrePass(bool useBarePtrCallConv) {
+  return std::make_unique<ConvertGPULaunchFuncToByrePass>(useBarePtrCallConv);
+}
\ No newline at end of file
diff --git a/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp b/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp
index e78709194..452075ce4 100644
--- a/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp
+++ b/compiler/lib/Conversion/HloToCat/FuseHloToCat.cpp
@@ -283,6 +283,48 @@ struct ConvertBmmReshapeTransposeToBmmReshape
   }
 };
 
+// bmm_rrr(x, broadcast_in_dim(y)) => reshape(gemm_rrr(reshape(x), y))
+struct ConvertBmmRRRBroadcastToReshapeGemmRRRReshape
+    : public OpRewritePattern<cat::BMMRRROp> {
+  using OpRewritePattern<cat::BMMRRROp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(cat::BMMRRROp op,
+                                PatternRewriter &rewriter) const override {
+    auto bCastOp = op.getRhs().getDefiningOp<mhlo::BroadcastInDimOp>();
+    if (!bCastOp)
+      return failure();
+    auto lhsType = op.getLhs().getType().cast<ShapedType>();
+    auto rhsType = op.getRhs().getType().cast<ShapedType>();
+    // Dims 0..2 are indexed below, so insist on static rank-3 operands.
+    if (!lhsType.hasStaticShape() || !rhsType.hasStaticShape() ||
+        lhsType.getRank() != 3 || rhsType.getRank() != 3)
+      return failure();
+    SmallVector<int64_t> broadcastDimensions;
+    getValuesFromDenseIntElementsAttr(bCastOp.getBroadcastDimensions(),
+                                      broadcastDimensions);
+    if (broadcastDimensions.size() != 2 || broadcastDimensions[0] != 1 ||
+        broadcastDimensions[1] != 2)
+      return failure();
+    // Reject y whose size-1 dims get expanded; it is not a full rhs slice.
+    auto srcType = bCastOp.getOperand().getType().cast<ShapedType>();
+    if (!srcType.hasStaticShape() ||
+        srcType.getDimSize(0) != rhsType.getDimSize(1) ||
+        srcType.getDimSize(1) != rhsType.getDimSize(2))
+      return failure();
+    RankedTensorType firstReshapeType = RankedTensorType::get(
+        {lhsType.getDimSize(0) * lhsType.getDimSize(1), lhsType.getDimSize(2)},
+        lhsType.getElementType());
+    RankedTensorType gemmType = RankedTensorType::get(
+        {firstReshapeType.getDimSize(0), rhsType.getDimSize(2)},
+        lhsType.getElementType());
+    auto firstReshape = rewriter.create<mhlo::ReshapeOp>(
+        op.getLoc(), firstReshapeType, op.getLhs());
+    auto gemm = rewriter.create<cat::GemmRRROp>(
+        op.getLoc(), gemmType, firstReshape, bCastOp.getOperand());
+    rewriter.replaceOpWithNewOp<mhlo::ReshapeOp>(op, op.getType(), gemm);
+    return success();
+  }
+};
+
 struct FuseMhloToCatPass : public FuseMhloToCatBase<FuseMhloToCatPass> {
 public:
   FuseMhloToCatPass() = default;
@@ -317,7 +359,8 @@ void populateFuseMhloToCatPattern(RewritePatternSet &patterns) {
                ConvertBmmReshapeTransposeToBmmReshape<cat::BMMRRCOp, cat::BMMRRROp>,
                ConvertBmmReshapeTransposeToBmmReshape<cat::BMMRCCOp, cat::BMMRCROp>,
                ConvertBmmReshapeTransposeToBmmReshape<cat::BMMCRCOp, cat::BMMCRROp>,
-               ConvertBmmReshapeTransposeToBmmReshape<cat::BMMCCCOp, cat::BMMCCROp>
+               ConvertBmmReshapeTransposeToBmmReshape<cat::BMMCCCOp, cat::BMMCCROp>,
+               ConvertBmmRRRBroadcastToReshapeGemmRRRReshape
                >(patterns.getContext());
   // clang-format on
 }
diff --git a/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td b/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td
index c984993e2..ba101c1af 100644
--- a/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td
+++ b/compiler/lib/Conversion/HloToCat/FuseHloToCatPattern.td
@@ -30,6 +30,7 @@ def OneRank : Constraint<CPred<"$0.getType().cast<ShapedType>().getRank() == 1">
 def TwoRank : Constraint<CPred<"$0.getType().cast<ShapedType>().getRank() == 2">, "two rank">;
 def ThreeRank : Constraint<CPred<"$0.getType().cast<ShapedType>().getRank() == 3">, "three rank">;
 def FourRank : Constraint<CPred<"$0.getType().cast<ShapedType>().getRank() == 4">, "four rank">;
+def Permute10Check : Constraint<CPred<"$0.getValues<int64_t>()[0] == 1 && $0.getValues<int64_t>()[1] == 0">, "transpose <[1, 0]>">;
 def Permute021Check : Constraint<CPred<"$0.getValues<int64_t>()[0] == 0 && $0.getValues<int64_t>()[1] == 2 && $0.getValues<int64_t>()[2] == 1">, "bmm 3d permute check (for transpose before bmm)">;
 def Permute0213Check : Constraint<CPred<"$0.getValues<int64_t>()[0] == 0 && $0.getValues<int64_t>()[1] == 2 && $0.getValues<int64_t>()[2] == 1 && $0.getValues<int64_t>()[3] == 3">, "bmm 4d permute check (for transpose after bmm)">;
 def TransposeCheck : Constraint<CPred<"$0.getValues<int64_t>()[0] == 1 && $0.getValues<int64_t>()[1] == 0">, "matrix transpose check">;
@@ -116,6 +117,26 @@ def MhloCatGemmRCRTransToCatGemmRCRPermutePattern
           (Cat_GemmRCRPermuteOp $lhs, $rhs, (getDim1Attr $reshape_out), (getDim2Attr $reshape_out)),
           [(TwoRank $lhs), (TwoRank $rhs), (FourRank $reshape_out), (GemmPermuteShapeCheck $reshape_out, $gemm_out), (Permute0213Check $permute)]>;
 
+def MhloCatGemmRRRTransToCatGemmRRRPermutePattern
+    : Pat<(MHLO_TransposeOp
+            (MHLO_ReshapeOp : $reshape_out
+              (Cat_GemmRRROp : $gemm_out
+                $lhs, $rhs)
+            ),
+            $permute),
+          (Cat_GemmRRRPermuteOp $lhs, $rhs, (getDim1Attr $reshape_out), (getDim2Attr $reshape_out)),
+          [(TwoRank $lhs), (TwoRank $rhs), (FourRank $reshape_out), (GemmPermuteShapeCheck $reshape_out, $gemm_out), (Permute0213Check $permute)]>;
+
+def CatGemmRRRPermuteTransToCatGemmRCRPermutePattern
+    : Pat<(Cat_GemmRRRPermuteOp
+            $lhs,
+            (MHLO_TransposeOp $rhs, $permute),
+            $t1,
+            $t2
+          ),
+          (Cat_GemmRCRPermuteOp $lhs, $rhs, $t1, $t2),
+          [(TwoSize $permute), (Permute10Check $permute)]>;
+
 def LayoutFrom3DDotGeneralDimNums : NativeCodeCall<"GetLayoutFrom3DDotGeneralDimNums($0, &$_builder)">;
 
 def CheckRRRLayoutFrom3DDotGeneralDimNums
@@ -140,6 +161,12 @@ def CheckCCRLayoutFrom3DDotGeneralDimNums
           CPred<"$0.getLhsContractingDimensions().size() == 1 && $0.getRhsContractingDimensions().size() == 1 && $0.getLhsContractingDimensions()[0] == 1 && $0.getRhsContractingDimensions()[0] == 2">,
           "is bmm ccr dimension">;
 
+// Holds when dim0($0) * dim1($0) == dim0($1) and dim2($0) == dim1($1).
+def CheckBMMPermuteShapeSplitOnBatch
+    : Constraint<
+          CPred<"$0.getType().cast<ShapedType>().getShape()[0] * $0.getType().cast<ShapedType>().getShape()[1] == $1.getType().cast<ShapedType>().getShape()[0] && $0.getType().cast<ShapedType>().getShape()[2] == $1.getType().cast<ShapedType>().getShape()[1]">,
+          "bmm rrr Shape Split On Batch">;
+
 def MhloDotGeneralReshapeTransposeToBMMRRRPermutePattern
     : Pat<(MHLO_TransposeOp
             (MHLO_ReshapeOp : $reshape_out
@@ -153,7 +180,8 @@ def MhloDotGeneralReshapeTransposeToBMMRRRPermutePattern
             (ThreeRank $rhs), 
             (FourSize $permute), 
             (Permute0213Check $permute),
-            (CheckRRRLayoutFrom3DDotGeneralDimNums $dimension_numbers)
+            (CheckRRRLayoutFrom3DDotGeneralDimNums $dimension_numbers),
+            (CheckBMMPermuteShapeSplitOnBatch $reshape_out, $lhs)
           ]>;
 
 def MhloDotGeneralReshapeTransposeToBMMRCRPermutePattern
diff --git a/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp b/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp
index 49676e2b1..f759d2d0b 100644
--- a/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp
+++ b/compiler/lib/Conversion/MemrefToByre/MemrefToByre.cpp
@@ -69,6 +69,27 @@ class ConvertViewOpToByrePattern : public OpConversionPattern<memref::ViewOp> {
   }
 };
 
+// Replaces memref.subview with byre.alias (literal 0: presumably the offset)
+// when the source and the result memref types both have identity layouts.
+class ConvertSubViewOpToByrePattern
+    : public OpConversionPattern<memref::SubViewOp> {
+public:
+  ConvertSubViewOpToByrePattern(MLIRContext *ctx)
+      : OpConversionPattern<memref::SubViewOp>(ctx) {}
+  LogicalResult
+  matchAndRewrite(memref::SubViewOp op, memref::SubViewOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto resultType = op.getType();
+    auto sourceType = op.getSource().getType();
+    if (!resultType.getLayout().isIdentity() ||
+        !sourceType.getLayout().isIdentity())
+      return failure();
+    rewriter.replaceOpWithNewOp<byre::AliasOp>(op, resultType,
+                                               adaptor.getSource(), 0);
+    return success();
+  }
+};
+
 class ConvertMemrefCopyOpToByrePattern
     : public OpConversionPattern<memref::CopyOp> {
 public:
@@ -174,8 +195,8 @@ void mlir::populateMemrefToByrePattern(RewritePatternSet &patterns) {
   patterns.add<ConvertViewOpToByrePattern, ConvertMemrefCopyOpToByrePattern,
                ConvertGetGlobalOpToByrePattern,
                ConvertReshapeLikeOpToByrePattern<memref::CollapseShapeOp>,
-               ConvertReshapeLikeOpToByrePattern<memref::ExpandShapeOp>>(
-      patterns.getContext());
+               ConvertReshapeLikeOpToByrePattern<memref::ExpandShapeOp>,
+               ConvertSubViewOpToByrePattern>(patterns.getContext());
 }
 
 std::unique_ptr<OperationPass<func::FuncOp>>
diff --git a/compiler/lib/Conversion/ToByre/ToByre.cpp b/compiler/lib/Conversion/ToByre/ToByre.cpp
index 1c83a25b3..31e43449c 100644
--- a/compiler/lib/Conversion/ToByre/ToByre.cpp
+++ b/compiler/lib/Conversion/ToByre/ToByre.cpp
@@ -1074,10 +1074,11 @@ static bool isRewritablePrivateFunc(func::FuncOp func) {
 }
 
 // identify EntryPoint funciton
-static void identifyEntryPointFuncAndCalls(
-    ModuleOp m, llvm::SmallVector<func::FuncOp, 4> &entries,
-    llvm::SmallVector<func::CallOp, 16> &calls,
-    llvm::SmallVector<func::FuncOp, 16> &removeFuncs) {
+static void
+identifyEntryPointFuncAndCalls(ModuleOp m,
+                               llvm::SmallVector<func::FuncOp, 4> &entries,
+                               llvm::SmallVector<func::CallOp, 16> &calls,
+                               llvm::SetVector<func::FuncOp> &removeFuncs) {
   // get first entry func
 
   llvm::SmallPtrSet<Operation *, 16> callSet;
@@ -1094,7 +1095,7 @@ static void identifyEntryPointFuncAndCalls(
       if (isRewritablePrivateFunc(calleeFuncOp) && !callSet.contains(callOp)) {
         calls.push_back(callOp);
         callSet.insert(callOp);
-        removeFuncs.push_back(calleeFuncOp);
+        removeFuncs.insert(calleeFuncOp);
       }
     }
   }
@@ -1273,7 +1274,7 @@ void ConvertFuncAndCallToByrePass::runOnOperation() {
   MLIRContext &ctx = getContext();
   llvm::SmallVector<func::FuncOp, 4> entryCollector;
   llvm::SmallVector<func::CallOp, 16> callCollector;
-  llvm::SmallVector<func::FuncOp, 16> removeFuncCollector;
+  llvm::SetVector<func::FuncOp> removeFuncCollector;
 
   identifyEntryPointFuncAndCalls(m, entryCollector, callCollector,
                                  removeFuncCollector);
@@ -1330,7 +1331,7 @@ void ConvertFuncAndCallToByrePass::runOnOperation() {
     return signalPassFailure();
   }
 
-  for (auto func : removeFuncCollector) {
+  for (auto func : removeFuncCollector.takeVector()) {
     func->erase();
   }
 }
diff --git a/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp b/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp
index 8ceae81d9..2fe676adb 100644
--- a/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp
+++ b/compiler/lib/Conversion/ToLinalg/MemrefCopyToLinalg.cpp
@@ -39,8 +39,9 @@ namespace {
 struct MemrefCopyOpToLinalg : public OpRewritePattern<memref::CopyOp> {
   using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
   MemrefCopyOpToLinalg(MLIRContext *ctx, std::string anchorTag,
-                       std::string attachAttr)
-      : OpRewritePattern(ctx), anchorTag(anchorTag), attachAttr(attachAttr) {}
+                       std::string attachAttr, bool outlining)
+      : OpRewritePattern(ctx), anchorTag(anchorTag), attachAttr(attachAttr),
+        outlining(outlining) {}
 
   LogicalResult matchAndRewrite(memref::CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
@@ -56,84 +57,101 @@ struct MemrefCopyOpToLinalg : public OpRewritePattern<memref::CopyOp> {
     auto dstType = llvm::dyn_cast<MemRefType>(dst.getType());
     if (!srcType || !dstType)
       return failure();
-    if (srcType.getLayout().isIdentity() && dstType.getLayout().isIdentity())
-      return failure();
 
-    SmallVector<Operation *> ops;
-    auto getViewSource = [&](Value value) {
-      while (auto viewOp = value.getDefiningOp<ViewLikeOpInterface>()) {
-        ops.push_back(viewOp);
-        value = viewOp.getViewSource();
+    if (outlining) {
+      if (srcType.getLayout().isIdentity() && dstType.getLayout().isIdentity())
+        return failure();
+
+      SmallVector<Operation *> ops;
+      auto getViewSource = [&](Value value) {
+        while (auto viewOp = value.getDefiningOp<ViewLikeOpInterface>()) {
+          ops.push_back(viewOp);
+          value = viewOp.getViewSource();
+        }
+        return value;
+      };
+      Value callSrc = getViewSource(src);
+      Value callDst = getViewSource(dst);
+
+      auto symbolTableOp = SymbolTable::getNearestSymbolTable(copyOp);
+      SymbolTable symbolTable(symbolTableOp);
+      auto funcType =
+          rewriter.getFunctionType({callSrc.getType(), callDst.getType()}, {});
+
+      OpBuilder::InsertionGuard guard(rewriter);
+      // Insert before module terminator.
+      rewriter.setInsertionPoint(parentOp);
+      func::FuncOp funcOp = rewriter.create<func::FuncOp>(
+          copyOp->getLoc(), "memref_copy_kernel", funcType);
+      symbolTable.insert(funcOp);
+      funcOp.setPrivate();
+
+      Block *entryBlock = funcOp.addEntryBlock();
+      rewriter.setInsertionPointToStart(entryBlock);
+      IRMapping mapping;
+      mapping.map(ValueRange{callSrc, callDst}, entryBlock->getArguments());
+      for (auto &&op : llvm::reverse(ops)) {
+        auto newOp = rewriter.clone(*op, mapping);
+        mapping.map(op, newOp);
+      }
+      AffineMap id = AffineMap::getMultiDimIdentityMap(dstType.getRank(),
+                                                       rewriter.getContext());
+      SmallVector<utils::IteratorType> iteratorTypes(
+          dstType.getRank(), utils::IteratorType::parallel);
+      rewriter.create<linalg::GenericOp>(
+          copyOp->getLoc(), mapping.lookup(copyOp.getSource()),
+          mapping.lookup(copyOp.getTarget()), llvm::ArrayRef({id, id}),
+          iteratorTypes,
+          [](OpBuilder &b, Location loc, ValueRange args) {
+            b.create<linalg::YieldOp>(loc, args.front());
+          },
+          copyOp->getAttrs());
+      rewriter.create<func::ReturnOp>(copyOp->getLoc());
+      if (!attachAttr.empty()) {
+        funcOp->setAttr(attachAttr, rewriter.getUnitAttr());
       }
-      return value;
-    };
-    Value callSrc = getViewSource(src);
-    Value callDst = getViewSource(dst);
-
-    auto symbolTableOp = SymbolTable::getNearestSymbolTable(copyOp);
-    SymbolTable symbolTable(symbolTableOp);
-    auto funcType =
-        rewriter.getFunctionType({callSrc.getType(), callDst.getType()}, {});
-
-    OpBuilder::InsertionGuard guard(rewriter);
-    // Insert before module terminator.
-    rewriter.setInsertionPoint(parentOp);
-    func::FuncOp funcOp = rewriter.create<func::FuncOp>(
-        copyOp->getLoc(), "memref_copy_kernel", funcType);
-    symbolTable.insert(funcOp);
-    funcOp.setPrivate();
-
-    Block *entryBlock = funcOp.addEntryBlock();
-    rewriter.setInsertionPointToStart(entryBlock);
-    IRMapping mapping;
-    mapping.map(ValueRange{callSrc, callDst}, entryBlock->getArguments());
-    for (auto &&op : llvm::reverse(ops)) {
-      auto newOp = rewriter.clone(*op, mapping);
-      mapping.map(op, newOp);
-    }
-    AffineMap id = AffineMap::getMultiDimIdentityMap(dstType.getRank(),
-                                                     rewriter.getContext());
-    SmallVector<utils::IteratorType> iteratorTypes(
-        dstType.getRank(), utils::IteratorType::parallel);
-    rewriter.create<linalg::GenericOp>(
-        copyOp->getLoc(), mapping.lookup(copyOp.getSource()),
-        mapping.lookup(copyOp.getTarget()), llvm::ArrayRef({id, id}),
-        iteratorTypes,
-        [](OpBuilder &b, Location loc, ValueRange args) {
-          b.create<linalg::YieldOp>(loc, args.front());
-        },
-        copyOp->getAttrs());
-    rewriter.create<func::ReturnOp>(copyOp->getLoc());
-    if (!attachAttr.empty()) {
-      funcOp->setAttr(attachAttr, rewriter.getUnitAttr());
-    }
 
-    rewriter.setInsertionPoint(copyOp);
-    auto callOp = rewriter.replaceOpWithNewOp<func::CallOp>(
-        copyOp, funcOp, ValueRange{callSrc, callDst});
-    callOp->setAttr(byre::getByreCallOpReadonlyOperandNumAttrName(),
-                    rewriter.getIndexAttr(1));
+      rewriter.setInsertionPoint(copyOp);
+      auto callOp = rewriter.replaceOpWithNewOp<func::CallOp>(
+          copyOp, funcOp, ValueRange{callSrc, callDst});
+      callOp->setAttr(byre::getByreCallOpReadonlyOperandNumAttrName(),
+                      rewriter.getIndexAttr(1));
+    } else {
+      AffineMap id = AffineMap::getMultiDimIdentityMap(dstType.getRank(),
+                                                       rewriter.getContext());
+      SmallVector<utils::IteratorType> iteratorTypes(
+          dstType.getRank(), utils::IteratorType::parallel);
+      rewriter.replaceOpWithNewOp<linalg::GenericOp>(
+          copyOp, src, dst, llvm::ArrayRef({id, id}), iteratorTypes,
+          [](OpBuilder &b, Location loc, ValueRange args) {
+            b.create<linalg::YieldOp>(loc, args.front());
+          },
+          copyOp->getAttrs());
+    }
     return success();
   }
 
 private:
   std::string anchorTag;
   std::string attachAttr;
+  bool outlining;
 };
 
 struct MemrefCopyToLinalgPass
     : public MemrefCopyToLinalgPassBase<MemrefCopyToLinalgPass> {
-  MemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr)
+  MemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr,
+                         bool outlining)
       : MemrefCopyToLinalgPassBase() {
     this->anchorTag = anchorTag;
     this->attachAttr = attachAttr;
+    this->outlining = outlining;
   }
 
   void runOnOperation() override {
     MLIRContext *context = &getContext();
     RewritePatternSet patterns(&getContext());
     patterns.insert<MemrefCopyOpToLinalg>(context, this->anchorTag,
-                                          this->attachAttr);
+                                          this->attachAttr, this->outlining);
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
       return signalPassFailure();
@@ -144,8 +162,10 @@ struct MemrefCopyToLinalgPass
 } // namespace
 
 std::unique_ptr<OperationPass<ModuleOp>>
-createMemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr) {
-  return std::make_unique<MemrefCopyToLinalgPass>(anchorTag, attachAttr);
+createMemrefCopyToLinalgPass(std::string anchorTag, std::string attachAttr,
+                             bool outlining) {
+  return std::make_unique<MemrefCopyToLinalgPass>(anchorTag, attachAttr,
+                                                  outlining);
 }
 
 } // namespace mlir
diff --git a/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp b/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp
index d2556024f..7ccd11036 100644
--- a/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp
+++ b/compiler/lib/Conversion/ToPTX/CollectGPUKernel.cpp
@@ -37,8 +37,10 @@ namespace {
 struct CollectGPUKernelPass
     : public CollectGPUKernelBase<CollectGPUKernelPass> {
 
-  CollectGPUKernelPass(const std::string &name) : CollectGPUKernelBase() {
+  CollectGPUKernelPass(const std::string &name, bool removeHost)
+      : CollectGPUKernelBase() {
     this->moduleName = name;
+    this->removeHost = removeHost;
   }
 
   void runOnOperation() override {
@@ -49,20 +51,20 @@ struct CollectGPUKernelPass
     bool found = false;
     GPUModuleOp dst;
 
-    for (auto &op : m.getBody()->without_terminator()) {
-      if (auto gm = dyn_cast<gpu::GPUModuleOp>(op)) {
-        if (gm.getName() == moduleName) {
-          found = true;
-          dst = gm;
-        } else {
-          gmCollector.push_back(gm);
-        }
+    for (auto gm : m.getOps<gpu::GPUModuleOp>()) {
+      if (gm.getName() == moduleName) {
+        found = true;
+        dst = gm;
+      } else {
+        gmCollector.push_back(gm);
       }
     }
 
     // Note FuncOps not in m.getBody()->without_terminator()
-    for (auto func : m.getOps<func::FuncOp>()) {
-      removeOps.push_back(func);
+    if (removeHost) {
+      for (auto func : m.getOps<func::FuncOp>()) {
+        removeOps.push_back(func);
+      }
     }
 
     if (gmCollector.size() == 0) {
@@ -78,12 +80,13 @@ struct CollectGPUKernelPass
     }
 
     SymbolTable dstTable(dst);
-
     for (auto gm : gmCollector) {
       for (auto &op : gm.getBody()->without_terminator()) {
         auto newOp = op.clone();
-        dstTable.insert(newOp);
+        auto newName = dstTable.insert(newOp);
+        (void)SymbolTable::replaceAllSymbolUses(&op, newName, m);
       }
+      (void)SymbolTable::replaceAllSymbolUses(gm, dst.getNameAttr(), m);
       gm.erase();
     }
 
@@ -96,6 +99,6 @@ struct CollectGPUKernelPass
 } // namespace
 
 std::unique_ptr<OperationPass<ModuleOp>>
-mlir::createCollectGPUKernelPass(const std::string &name) {
-  return std::make_unique<CollectGPUKernelPass>(name);
+mlir::createCollectGPUKernelPass(const std::string &name, bool removeHost) {
+  return std::make_unique<CollectGPUKernelPass>(name, removeHost);
 }
diff --git a/compiler/lib/Dialect/CMakeLists.txt b/compiler/lib/Dialect/CMakeLists.txt
index fe905afe9..3e8627a7e 100644
--- a/compiler/lib/Dialect/CMakeLists.txt
+++ b/compiler/lib/Dialect/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(Affine)
 add_subdirectory(Byre)
 add_subdirectory(Cat)
 add_subdirectory(Ccl)
+add_subdirectory(GPU)
 add_subdirectory(Lace)
 add_subdirectory(Linalg)
 add_subdirectory(MemRef)
diff --git a/compiler/lib/Dialect/Cat/IR/CatDialect.cpp b/compiler/lib/Dialect/Cat/IR/CatDialect.cpp
index 24570d4cc..966a22e22 100644
--- a/compiler/lib/Dialect/Cat/IR/CatDialect.cpp
+++ b/compiler/lib/Dialect/Cat/IR/CatDialect.cpp
@@ -158,3 +158,9 @@ LogicalResult GemmRCRPermuteOp::verify() {
                                      this->getOutput(), this->getT1(),
                                      this->getT2(), "rcr");
 }
+
+LogicalResult GemmRRRPermuteOp::verify() {
+  // Same 0213-permute layout verifier as the rcr variant, tagged "rrr".
+  return VerifyGemmPermute0213Layout(getLhs(), getRhs(), getOutput(), getT1(),
+                                     getT2(), "rrr");
+}
diff --git a/compiler/lib/Dialect/GPU/CMakeLists.txt b/compiler/lib/Dialect/GPU/CMakeLists.txt
new file mode 100644
index 000000000..5c919f7df
--- /dev/null
+++ b/compiler/lib/Dialect/GPU/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Transforms)
\ No newline at end of file
diff --git a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt
new file mode 100644
index 000000000..733282ba9
--- /dev/null
+++ b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_mlir_dialect_library(ByteIRGPUPasses
+  ShmAllocaToWorkgroupArg.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU
+  ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU/Transforms
+
+  DEPENDS
+  ByteIRGPUPassIncGen
+  ByteIRUtils
+  MLIRGPUDialect
+
+  LINK_LIBS PUBLIC
+  ByteIRUtils
+  MLIRIR
+  MLIRGPUDialect
+  MLIRMemRefDialect
+  MLIRSupport
+)
diff --git a/compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp b/compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp
new file mode 100644
index 000000000..808ed797f
--- /dev/null
+++ b/compiler/lib/Dialect/GPU/Transforms/ShmAllocaToWorkgroupArg.cpp
@@ -0,0 +1,86 @@
+//===- ShmAllocaToWorkgroupArg.cpp ----------------------------*--- C++ -*-===//
+//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include "byteir/Dialect/GPU/Passes.h"
+#include "byteir/Dialect/GPU/Transforms/Transforms.h"
+#include "byteir/Transforms/MemoryPlanning.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/PassManager.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include <utility>
+
+#define DEBUG_TYPE "shm-alloca-to-workgroup-arg"
+
+namespace mlir {
+#define GEN_PASS_DEF_SHMALLOCATOWORKGROUPARG
+#include "byteir/Dialect/GPU/Passes.h.inc"
+} // namespace mlir
+
+using namespace llvm;
+using namespace mlir;
+
+namespace {
+/// Runs gpu::hoistShmAllocaToWorkgroup on every kernel of a gpu.module.
+struct ShmAllocaToWorkgroupArgPass
+    : public impl::ShmAllocaToWorkgroupArgBase<ShmAllocaToWorkgroupArgPass> {
+  void runOnOperation() override {
+    gpu::GPUModuleOp m = getOperation();
+    WalkResult walkResult = m->walk([&](gpu::GPUFuncOp func) {
+      // Only kernels are rewritten; other gpu.func ops are skipped.
+      if (!func.isKernel())
+        return WalkResult::advance();
+
+      // Disabled: memory planning on the kernel before hoisting.
+      //   OpPassManager pm(func.getOperationName());
+      //   pm.addPass(createMemoryPlanningPass(/*alignment=*/1, /*alloca=*/true,
+      //                                       /*memorySpace=*/0, nullptr));
+      //   if (mlir::failed(runPipeline(pm, func)))
+      //     return WalkResult::interrupt();
+
+      gpu::hoistShmAllocaToWorkgroup(func);
+      return WalkResult::advance();
+    });
+    // Only the disabled block above would interrupt the walk.
+    if (walkResult.wasInterrupted()) {
+      m->emitError() << "ShmAllocaToWorkgroupArgPass failed";
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+void mlir::gpu::hoistShmAllocaToWorkgroup(gpu::GPUFuncOp func) {
+  // Hoist static-shape workgroup allocas to attributions; skip dynamic ones,
+  // whose size operands a workgroup attribution cannot carry.
+  func->walk([&](memref::AllocaOp alloca) {
+    auto type = alloca.getType();
+    auto space =
+        llvm::dyn_cast_or_null<gpu::AddressSpaceAttr>(type.getMemorySpace());
+    if (!space || !type.hasStaticShape() ||
+        space.getValue() != gpu::GPUDialect::getWorkgroupAddressSpace())
+      return;
+    Value workgroup = func.addWorkgroupAttribution(type, alloca.getLoc());
+    alloca.getMemref().replaceAllUsesWith(workgroup);
+    alloca->erase();
+  });
+}
diff --git a/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp b/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp
index 64e456af3..bcb8df731 100644
--- a/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp
+++ b/compiler/lib/Dialect/Linalg/TransformOps/LinalgExtTransformOps.cpp
@@ -56,6 +56,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/SymbolTable.h"
+#include "mlir/Interfaces/DestinationStyleOpInterface.h"
 #include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "mlir/Transforms/RegionUtils.h"
@@ -63,6 +64,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Debug.h"
+
 #include <numeric>
 
 using namespace mlir;
@@ -150,6 +152,76 @@ transform::CollapseDimsOp::apply(transform::TransformRewriter &rewriter,
   return DiagnosedSilenceableFailure::success();
 }
 
+//===----------------------------------------------------------------------===//
+// DetensorizeOp
+//===----------------------------------------------------------------------===//
+namespace {
+LogicalResult detensorizeLinalgOp(OpBuilder &b, linalg::LinalgOp linalgOp) {
+  // Only loop-free linalg ops on tensors can be turned into scalar code.
+  if (!linalgOp.hasTensorSemantics() || linalgOp.getNumLoops() != 0)
+    return failure();
+  // Reject unsupported operands up front so that failure leaves the IR intact.
+  auto needsExtract = [&](OpOperand &operand) {
+    return linalgOp.payloadUsesValueFromOperand(&operand) &&
+           !linalgOp.isScalar(&operand);
+  };
+  for (OpOperand &operand : linalgOp->getOpOperands()) {
+    auto tensorType = llvm::dyn_cast<TensorType>(operand.get().getType());
+    if (needsExtract(operand) &&
+        (!tensorType || !tensorType.hasRank() || tensorType.getRank() != 0))
+      return failure();
+  }
+
+  // One value per payload block argument; unused operands stay null.
+  Location loc = linalgOp->getLoc();
+  SmallVector<Value> scalars;
+  scalars.reserve(linalgOp->getNumOperands());
+  for (OpOperand &operand : linalgOp->getOpOperands()) {
+    Value scalar;
+    if (needsExtract(operand))
+      scalar = b.create<tensor::ExtractOp>(loc, operand.get(), ValueRange());
+    else if (linalgOp.payloadUsesValueFromOperand(&operand))
+      scalar = operand.get();
+    scalars.push_back(scalar);
+  }
+  // Clone the payload (without its terminator) onto the scalar values.
+  IRMapping map;
+  map.map(linalgOp.getBlock()->getArguments(), scalars);
+  for (Operation &op : linalgOp.getBlock()->without_terminator())
+    b.clone(op, map);
+
+  // Wrap each yielded scalar in a rank-0 tensor and redirect the result's uses.
+  for (OpOperand *init : linalgOp.getDpsInitOperands()) {
+    Value element =
+        map.lookupOrDefault(linalgOp.getMatchingYieldValue(init)->get());
+    Value tensor = b.create<tensor::FromElementsOp>(
+        loc, RankedTensorType::get({}, element.getType()), ValueRange(element));
+    linalgOp.getTiedOpResult(init).replaceAllUsesWith(tensor);
+  }
+  linalgOp->erase();
+  return success();
+}
+} // namespace
+
+DiagnosedSilenceableFailure
+transform::DetensorizeOp::apply(transform::TransformRewriter &rewriter,
+                                transform::TransformResults &results,
+                                transform::TransformState &state) {
+  // Every payload op must be a linalg op that detensorizeLinalgOp accepts.
+  for (Operation *payload : state.getPayloadOps(getTarget())) {
+    auto linalgOp = dyn_cast_or_null<linalg::LinalgOp>(payload);
+    if (!linalgOp)
+      return emitDefaultDefiniteFailure(payload)
+             << " detensorize transformation should be applied on linalg op";
+
+    OpBuilder builder(payload); // inserts right before the payload op
+    if (failed(detensorizeLinalgOp(builder, linalgOp)))
+      return emitDefaultDefiniteFailure(linalgOp)
+             << " failed to detensorize op";
+  }
+  return DiagnosedSilenceableFailure::success();
+}
+
 //===----------------------------------------------------------------------===//
 // replace unit extent dims
 //===----------------------------------------------------------------------===//
@@ -1498,6 +1570,66 @@ LogicalResult transform::FuseOperandsOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// InsertSliceToCopyExtOp
+//===----------------------------------------------------------------------===//
+template <typename OpTy>
+DiagnosedSilenceableFailure
+insertSliceToCopyImpl(RewriterBase &rewriter, OpTy target,
+                      transform::ApplyToEachResultList &results,
+                      transform::TransformState &state) {
+  static_assert(llvm::is_one_of<OpTy, tensor::InsertSliceOp,
+                                tensor::ParallelInsertSliceOp>::value,
+                "wrong op type");
+  // A source already produced by linalg.copy is returned as the result as-is.
+  if (auto copySource =
+          target.getSource().template getDefiningOp<linalg::CopyOp>()) {
+    results.push_back(copySource);
+    return DiagnosedSilenceableFailure::success();
+  }
+
+  // If we are inside an InParallel region, temporarily set the insertion point
+  // outside: only tensor.parallel_insert_slice ops are allowed in there.
+  if constexpr (std::is_same_v<OpTy, tensor::ParallelInsertSliceOp>) {
+    rewriter.setInsertionPoint(
+        target->template getParentOfType<scf::InParallelOp>());
+  }
+  // Copy the source into the destination slice it is going to be inserted at.
+  Value extracted = rewriter.create<tensor::ExtractSliceOp>(
+      target.getLoc(), target.getSourceType(), target.getDest(),
+      target.getMixedOffsets(), target.getMixedSizes(),
+      target.getMixedStrides());
+  Value copied = rewriter
+                     .create<linalg::CopyOp>(target.getLoc(),
+                                             target.getSource(), extracted)
+                     .getResult(0);
+  // Reset the insertion point.
+  rewriter.setInsertionPoint(target);
+  rewriter.replaceOpWithNewOp<OpTy>(
+      target, copied, target.getDest(), target.getMixedOffsets(),
+      target.getMixedSizes(), target.getMixedStrides());
+
+  results.push_back(copied.getDefiningOp());
+  return DiagnosedSilenceableFailure::success();
+}
+
+DiagnosedSilenceableFailure transform::InsertSliceToCopyExtOp::applyToOne(
+    transform::TransformRewriter &rewriter, Operation *targetOp,
+    transform::ApplyToEachResultList &results,
+    transform::TransformState &state) {
+  rewriter.setInsertionPoint(targetOp);
+  if (auto insertSlice = dyn_cast<tensor::InsertSliceOp>(targetOp))
+    return insertSliceToCopyImpl(rewriter, insertSlice, results, state);
+  if (auto parallelInsert = dyn_cast<tensor::ParallelInsertSliceOp>(targetOp))
+    return insertSliceToCopyImpl(rewriter, parallelInsert, results, state);
+  // Any other payload op is reported as a silenceable error with a note.
+  DiagnosedSilenceableFailure unsupported =
+      emitSilenceableError()
+      << "only InsertSliceOp and ParallelInsertSliceOp ops are supported";
+  unsupported.attachNote(targetOp->getLoc()) << "target op";
+  return unsupported;
+}
+
 //===----------------------------------------------------------------------===//
 // Transform op registration
 //===----------------------------------------------------------------------===//
diff --git a/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp b/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp
index 1b874e9d2..fc0af1829 100644
--- a/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp
+++ b/compiler/lib/Dialect/Linalg/Transforms/FuseElementwise.cpp
@@ -544,9 +544,7 @@ static bool isFusableWithReshapeByDimExpansion(GenericOp genericOp,
                         else
                           return isProjectedPermutationAndAllowConst(map);
                       }) &&
-         genericOp.getMatchingIndexingMap(fusableOpOperand).getNumResults() >
-             0 &&
-         llvm::all_of(genericOp.getIteratorTypesArray(), isParallelIterator);
+         genericOp.getMatchingIndexingMap(fusableOpOperand).getNumResults() > 0;
 }
 
 class ExpansionInfo {
@@ -568,6 +566,9 @@ class ExpansionInfo {
   ArrayRef<int64_t> getExpandedShapeOfDim(unsigned i) const {
     return expandedShapeMap[i];
   }
+  ArrayRef<utils::IteratorType> getIteratorTypes() const {
+    return iteratorTypes;
+  }
   ArrayRef<int64_t> getOriginalShape() const { return originalLoopExtent; }
 
 private:
@@ -579,6 +580,8 @@ class ExpansionInfo {
   SmallVector<SmallVector<int64_t>> expandedShapeMap;
   /// Extent of the loop in the original operation.
   SmallVector<int64_t> originalLoopExtent;
+  /// Parallel types of the expanded loops
+  SmallVector<utils::IteratorType> iteratorTypes;
   unsigned expandedOpNumDims;
 };
 
@@ -591,6 +594,7 @@ LogicalResult ExpansionInfo::compute(LinalgOp linalgOp,
   if (reassociationMaps.empty())
     return failure();
   AffineMap fusedIndexMap = linalgOp.getMatchingIndexingMap(fusableOpOperand);
+  auto origIteratorTypes = linalgOp.getIteratorTypesArray();
 
   SmallVector<int64_t, 4> originalLoopRange = linalgOp.getStaticLoopRanges();
   originalLoopExtent.assign(originalLoopRange.begin(), originalLoopRange.end());
@@ -621,8 +625,11 @@ LogicalResult ExpansionInfo::compute(LinalgOp linalgOp,
     auto seq = llvm::seq<int64_t>(sum, sum + numFoldedDim.value());
     reassociation.emplace_back(seq.begin(), seq.end());
     sum += numFoldedDim.value();
+    iteratorTypes.append(numFoldedDim.value(),
+                         origIteratorTypes[numFoldedDim.index()]);
   }
   expandedOpNumDims = sum;
+
   return success();
 }
 
@@ -871,15 +878,11 @@ fuseWithReshapeByExpansion(GenericOp genericOp, Operation *reshapeOp,
     }
   }
 
-  // The iterator types of the expanded op are all parallel.
-  SmallVector<utils::IteratorType> iteratorTypes(
-      expansionInfo.getExpandedOpNumDims(), utils::IteratorType::parallel);
-
   TypeRange resultTypes = ValueRange(outputs).getTypes();
-  auto fusedOp =
-      rewriter.create<GenericOp>(genericOp.getLoc(), resultTypes,
-                                 /*inputs=*/expandedOpOperands, outputs,
-                                 expandedOpIndexingMaps, iteratorTypes);
+  auto fusedOp = rewriter.create<GenericOp>(genericOp.getLoc(), resultTypes,
+                                            /*inputs=*/expandedOpOperands,
+                                            outputs, expandedOpIndexingMaps,
+                                            expansionInfo.getIteratorTypes());
   Region &fusedRegion = fusedOp->getRegion(0);
   Region &originalRegion = genericOp->getRegion(0);
   rewriter.cloneRegionBefore(originalRegion, fusedRegion, fusedRegion.begin());
diff --git a/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp b/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp
index defd678af..081599f13 100644
--- a/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp
+++ b/compiler/lib/Dialect/Linalg/Transforms/LinalgCollapseLoops.cpp
@@ -64,11 +64,12 @@ namespace {
 /// dimensions. It only applies these to "parallel" loops without mixing them
 /// with "reduction" types.
 static SmallVector<ReassociationIndices>
-getCollapsibleLoops(linalg::GenericOp genericOp) {
+getCollapsibleLoops(linalg::GenericOp genericOp,
+                    utils::IteratorType iteratorType) {
   SmallVector<ReassociationIndices> contiguousLoops;
 
   SmallVector<unsigned> pDims;
-  genericOp.getParallelDims(pDims);
+  findPositionsOfType(genericOp.getIteratorTypesArray(), iteratorType, pDims);
   if (pDims.size() < 2)
     return contiguousLoops;
 
@@ -76,15 +77,18 @@ getCollapsibleLoops(linalg::GenericOp genericOp) {
 
   auto hasAllMapsSameSequence = [&](AffineExpr preExpr, AffineExpr nextExpr) {
     for (AffineMap map : genericOp.getIndexingMapsArray()) {
-      bool foundSeq = false;
-      for (auto [index, resultExpr] : llvm::enumerate(map.getResults())) {
-        if (resultExpr == nextExpr) {
-          foundSeq = (index > 0 && preExpr == map.getResult(index - 1));
-          break;
-        }
+      auto prePos = map.getResultPosition(preExpr);
+      auto nextPos = map.getResultPosition(nextExpr);
+      if (!prePos.has_value()) {
+        if (nextPos.has_value())
+          return false;
+      } else {
+        if (!nextPos.has_value())
+          return false;
+
+        if (prePos.value() + 1 != nextPos.value())
+          return false;
       }
-      if (!foundSeq)
-        return false;
     }
     return true;
   };
@@ -519,13 +523,17 @@ FailureOr<SmallVector<Value>> collapseGenericOpIterationDimsEx(
 class CollapseLoopsOnGenericOp : public OpRewritePattern<linalg::GenericOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
+  CollapseLoopsOnGenericOp(MLIRContext *context,
+                           utils::IteratorType iteratorType)
+      : OpRewritePattern(context), iteratorType(iteratorType) {}
+
   LogicalResult matchAndRewrite(linalg::GenericOp op,
                                 PatternRewriter &rewriter) const override {
     // Collect collapsible loops
     // TODO: All rules come from iree project, add our own
     if (!isEligibleForCollapse(op))
       return failure();
-    auto loops = getCollapsibleLoops(op);
+    auto loops = getCollapsibleLoops(op, iteratorType);
     if (loops.empty())
       return failure();
 
@@ -542,22 +550,31 @@ class CollapseLoopsOnGenericOp : public OpRewritePattern<linalg::GenericOp> {
     rewriter.replaceOp(op, *replacements);
     return success();
   }
+
+private:
+  utils::IteratorType iteratorType;
 };
 
 struct LinalgCollapseLoopsPass
     : public impl::LinalgCollapseLoopsBase<LinalgCollapseLoopsPass> {
+  LinalgCollapseLoopsPass(utils::IteratorType iteratorType)
+      : LinalgCollapseLoopsBase() {
+    this->iteratorType = iteratorType;
+  }
+
   void runOnOperation() override {
     auto op = getOperation();
     auto context = op->getContext();
 
     RewritePatternSet patterns(context);
-    patterns.add<CollapseLoopsOnGenericOp>(context);
+    patterns.add<CollapseLoopsOnGenericOp>(context, iteratorType);
     if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
       signalPassFailure();
   }
 };
 } // namespace
 
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::createLinalgCollapseLoops() {
-  return std::make_unique<LinalgCollapseLoopsPass>();
+std::unique_ptr<OperationPass<func::FuncOp>>
+mlir::createLinalgCollapseLoops(utils::IteratorType iteratorType) {
+  return std::make_unique<LinalgCollapseLoopsPass>(iteratorType);
 }
diff --git a/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt
index 524713f46..47cd8bab0 100644
--- a/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt
+++ b/compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 
 add_mlir_dialect_library(ByteIRTensorPasses
   CanonicalizeExt.cpp
+  TensorPadSpecialization.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/mhlo
@@ -8,9 +9,11 @@ add_mlir_dialect_library(ByteIRTensorPasses
 
   DEPENDS
   ByteIRUtils
+  ByteIRTensorPassIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRSupport
+  MLIRSCFDialect
   ByteIRUtils
   )
\ No newline at end of file
diff --git a/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp b/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp
index 2feff7437..c35039404 100644
--- a/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp
+++ b/compiler/lib/Dialect/Tensor/Transforms/CanonicalizeExt.cpp
@@ -27,6 +27,7 @@
 #include "byteir/Utils/AttrUtils.h"
 #include "byteir/Utils/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/TypeUtilities.h"
@@ -121,6 +122,46 @@ struct RankReducedExtractSliceCollapseShape
     return success();
   }
 };
+
+/// Fold zero rank from_elements + insert_slice into insert
+///
+/// Example:
+///
+/// %0 = tensor.from_elements %scalar : tensor<f32>
+/// %2 = tensor.insert_slice %0 into %1[%c256] [1] [1] : tensor<f32> into
+/// tensor<1024xf32>
+///
+/// will be folded into
+///
+/// %2 = tensor.insert %scalar into %1[%c256] : tensor<1024xf32>
+struct FoldZeroRankFromElementsInsertSlice
+    : public OpRewritePattern<tensor::InsertSliceOp> {
+  using OpRewritePattern<tensor::InsertSliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::InsertSliceOp insertSliceOp,
+                                PatternRewriter &rewriter) const override {
+    // The inserted value must be a rank-0 tensor built by from_elements.
+    if (insertSliceOp.getSourceType().getRank() != 0)
+      return failure();
+    auto fromElementsOp =
+        insertSliceOp.getSource().getDefiningOp<tensor::FromElementsOp>();
+    if (!fromElementsOp)
+      return failure();
+
+    // Its single element becomes the scalar that is inserted directly.
+    auto elements = fromElementsOp.getElements();
+    if (elements.size() != 1)
+      return failure();
+
+    // The slice offsets become the indices of the inserted element.
+    Location loc = insertSliceOp->getLoc();
+    SmallVector<Value> indices = getValueOrCreateConstantIndexOp(
+        rewriter, loc, insertSliceOp.getMixedOffsets());
+    rewriter.replaceOpWithNewOp<tensor::InsertOp>(
+        insertSliceOp, elements[0], insertSliceOp.getDest(), indices);
+    return success();
+  }
+};
 } // namespace
 
 void mlir::tensor::populateCanonicalizeExtPatterns(RewritePatternSet &patterns,
@@ -132,6 +173,7 @@ void mlir::tensor::populateCanonicalizeExtPatterns(RewritePatternSet &patterns,
   }
 
   patterns.add<RankReducedExtractSliceCollapseShape>(ctx);
+  patterns.add<FoldZeroRankFromElementsInsertSlice>(ctx);
 }
 
 void mlir::tensor::getCanonicalizationExtPatterns(RewritePatternSet &patterns,
diff --git a/compiler/lib/Dialect/Tensor/Transforms/PassDetail.h b/compiler/lib/Dialect/Tensor/Transforms/PassDetail.h
new file mode 100644
index 000000000..4214a74dc
--- /dev/null
+++ b/compiler/lib/Dialect/Tensor/Transforms/PassDetail.h
@@ -0,0 +1,40 @@
+//===- PassDetail.h -------------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BYTEIR_DIALECT_TENSOR_TRANSFORMS_PASSDETAIL_H
+#define BYTEIR_DIALECT_TENSOR_TRANSFORMS_PASSDETAIL_H
+
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/Pass/Pass.h"
+
+// Forward declarations of dialect classes used by the tensor passes.
+namespace mlir {
+
+namespace scf {
+class SCFDialect;
+} // namespace scf
+
+namespace tensor {
+class TensorDialect;
+} // namespace tensor
+
+#define GEN_PASS_CLASSES
+#include "byteir/Dialect/Tensor/Passes.h.inc"
+
+} // namespace mlir
+
+#endif // BYTEIR_DIALECT_TENSOR_TRANSFORMS_PASSDETAIL_H
diff --git a/compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp b/compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp
new file mode 100644
index 000000000..43f750233
--- /dev/null
+++ b/compiler/lib/Dialect/Tensor/Transforms/TensorPadSpecialization.cpp
@@ -0,0 +1,242 @@
+//===- TensorPadSpecialization.cpp ---------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include "byteir/Dialect/Tensor/Transforms/TensorPadSpecialization.h"
+#include "byteir/Utils/AttrUtils.h"
+#include "byteir/Utils/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Transforms/TransformUtils.h"
+#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Debug.h"
+
+#include "./PassDetail.h"
+
+#define DEBUG_TYPE "tensor-pad-specialization"
+
+using namespace mlir;
+
+namespace {
+static LogicalResult
+resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
+                                  tensor::CollapseShapeOp collapseShapeOp,
+                                  ValueRange indices,
+                                  SmallVectorImpl<Value> &sourceIndices) {
+  // Delinearization divides by the static sizes of every dimension but the
+  // outermost one of a group: bail out on dynamic sizes before creating IR.
+  RankedTensorType srcType = collapseShapeOp.getSrcType();
+  for (ArrayRef<int64_t> group : collapseShapeOp.getReassociationIndices())
+    for (int64_t dim : group.drop_front())
+      if (srcType.isDynamicDim(dim))
+        return failure();
+
+  int64_t cnt = 0;
+  for (ArrayRef<int64_t> group : collapseShapeOp.getReassociationIndices()) {
+    assert(!group.empty() && "association indices groups cannot be empty");
+    OpFoldResult collapsedIndex = indices[cnt++];
+    // Calculate suffix product for all collapse op source dimension sizes.
+    SmallVector<int64_t> sizes;
+    for (int64_t dim : group)
+      sizes.push_back(srcType.getDimSize(dim));
+    SmallVector<int64_t> suffixProduct = computeSuffixProduct(sizes);
+
+    // Delinearize the collapsed index into one index per source dimension.
+    auto d0 = rewriter.getAffineDimExpr(0);
+    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, suffixProduct);
+
+    // Construct the AffineApplyOp for each delinearizingExpr.
+    for (AffineExpr expr : delinearizingExprs) {
+      OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
+          rewriter, loc, AffineMap::get(/*numDims=*/1, /*numSymbols=*/0, expr),
+          {collapsedIndex});
+      sourceIndices.push_back(
+          getValueOrCreateConstantIndexOp(rewriter, loc, ofr));
+    }
+  }
+
+  // Without reassociation groups, every source dimension is indexed with 0.
+  if (collapseShapeOp.getReassociationIndices().empty()) {
+    auto zeroAffineMap = rewriter.getConstantAffineMap(0);
+    for (int64_t i = 0, e = srcType.getRank(); i < e; ++i) {
+      OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
+          rewriter, loc, zeroAffineMap, ArrayRef<OpFoldResult>());
+      sourceIndices.push_back(
+          getValueOrCreateConstantIndexOp(rewriter, loc, ofr));
+    }
+  }
+  return success();
+}
+
+/// Rewrites tensor.extract(tensor.collapse_shape(%src)) to extract from %src.
+struct FoldExtractOfCollapseShape : public OpRewritePattern<tensor::ExtractOp> {
+  using OpRewritePattern<tensor::ExtractOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ExtractOp extractOp,
+                                PatternRewriter &rewriter) const override {
+    auto collapseShapeOp =
+        extractOp.getTensor().getDefiningOp<tensor::CollapseShapeOp>();
+    if (!collapseShapeOp)
+      return failure();
+
+    // Translate the indices into the collapsed tensor back to source indices.
+    SmallVector<Value> sourceIndices;
+    if (failed(resolveSourceIndicesCollapseShape(
+            extractOp->getLoc(), rewriter, collapseShapeOp,
+            extractOp.getIndices(), sourceIndices)))
+      return failure();
+    rewriter.replaceOpWithNewOp<tensor::ExtractOp>(
+        extractOp, extractOp.getType(), collapseShapeOp.getSrc(),
+        sourceIndices);
+    return success();
+  }
+};
+
+/// Rewrites tensor.extract(tensor.extract_slice(%src)) to extract from %src.
+struct FoldExtractOfExtractSlice : public OpRewritePattern<tensor::ExtractOp> {
+  using OpRewritePattern<tensor::ExtractOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ExtractOp extractOp,
+                                PatternRewriter &rewriter) const override {
+    auto sliceOp =
+        extractOp.getTensor().getDefiningOp<tensor::ExtractSliceOp>();
+    if (!sliceOp)
+      return failure();
+
+    // Fold the slice's offsets, strides and dropped dims into the indices.
+    SmallVector<Value> srcIndices;
+    affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+        rewriter, extractOp->getLoc(), sliceOp.getMixedOffsets(),
+        sliceOp.getMixedStrides(), sliceOp.getDroppedDims(),
+        extractOp.getIndices(), srcIndices);
+    rewriter.replaceOpWithNewOp<tensor::ExtractOp>(
+        extractOp, extractOp.getType(), sliceOp.getSource(),
+        srcIndices);
+    return success();
+  }
+};
+
+/// Rewrites an extract from a constant-padded tensor into a guarded scf.if.
+struct FoldExtractOfPad : public OpRewritePattern<tensor::ExtractOp> {
+  using OpRewritePattern<tensor::ExtractOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ExtractOp extractOp,
+                                PatternRewriter &rewriter) const override {
+    auto padOp = extractOp.getTensor().getDefiningOp<tensor::PadOp>();
+    if (!padOp)
+      return failure();
+
+    // Only constant padding value supported.
+    Value padValue = padOp.getConstantPaddingValue();
+    if (!padValue)
+      return failure();
+
+    // A rank-0 pad has no dimension to check and would leave `inBound` null.
+    int64_t rank = padOp.getSourceType().getRank();
+    if (rank == 0)
+      return failure();
+
+    // Helpers for the index arithmetic and comparisons emitted below.
+    Location loc = padOp->getLoc();
+    AffineExpr dim0, dim1;
+    bindDims(rewriter.getContext(), dim0, dim1);
+    // Add two integers.
+    auto addMap = AffineMap::get(2, 0, {dim0 + dim1});
+    auto add = [&](OpFoldResult v1, OpFoldResult v2) {
+      return affine::makeComposedFoldedAffineApply(rewriter, loc, addMap,
+                                                   {v1, v2});
+    };
+    // Subtract two integers.
+    auto subMap = AffineMap::get(2, 0, {dim0 - dim1});
+    auto sub = [&](OpFoldResult v1, OpFoldResult v2) {
+      return affine::makeComposedFoldedAffineApply(rewriter, loc, subMap,
+                                                   {v1, v2});
+    };
+    // Compare two integers with the given predicate.
+    auto cmp = [&](OpFoldResult v1, OpFoldResult v2,
+                   arith::CmpIPredicate pred) {
+      return rewriter.create<arith::CmpIOp>(
+          loc, pred, getValueOrCreateConstantIndexOp(rewriter, loc, v1),
+          getValueOrCreateConstantIndexOp(rewriter, loc, v2));
+    };
+
+    auto offsets = getAsOpFoldResult(extractOp.getIndices());
+    SmallVector<OpFoldResult> newOffsets;
+    Value inBound;
+    for (unsigned dim = 0; dim < rank; ++dim) {
+      auto low = padOp.getMixedLowPad()[dim];
+      bool hasLowPad = !isConstantIntValue(low, 0);
+      auto offset = offsets[dim];
+      auto srcSize =
+          tensor::getMixedSize(rewriter, loc, padOp.getSource(), dim);
+      OpFoldResult newOffset = hasLowPad ? sub(offset, low) : offset;
+      newOffsets.push_back(newOffset);
+      auto lbcheck = cmp(low, offset, arith::CmpIPredicate::ule);
+      auto ubcheck = cmp(offset, hasLowPad ? add(low, srcSize) : srcSize,
+                         arith::CmpIPredicate::ult);
+      auto check = rewriter.create<arith::AndIOp>(loc, lbcheck, ubcheck);
+      if (inBound)
+        inBound = rewriter.create<arith::AndIOp>(loc, inBound, check);
+      else
+        inBound = check;
+    }
+
+    rewriter.replaceOpWithNewOp<scf::IfOp>(
+        extractOp, inBound,
+        [&](OpBuilder &b, Location loc) {
+          Value inSource = b.create<tensor::ExtractOp>(
+              loc, padOp.getSource(),
+              getValueOrCreateConstantIndexOp(b, loc, newOffsets));
+          b.create<scf::YieldOp>(loc, inSource);
+        },
+        [&](OpBuilder &b, Location loc) {
+          b.create<scf::YieldOp>(loc, padValue);
+        });
+    return success();
+  }
+};
+
+struct TensorPadSpecializationPass
+    : public TensorPadSpecializationBase<TensorPadSpecializationPass> {
+  void runOnOperation() override {
+    // Greedily fold tensor.extract through collapse_shape, extract_slice, pad.
+    RewritePatternSet patterns(&getContext());
+    patterns.add<FoldExtractOfCollapseShape>(patterns.getContext());
+    patterns.add<FoldExtractOfExtractSlice>(patterns.getContext());
+    patterns.add<FoldExtractOfPad>(patterns.getContext());
+    FrozenRewritePatternSet frozen(std::move(patterns));
+    if (failed(applyPatternsAndFoldGreedily(getOperation(), frozen)))
+      signalPassFailure();
+  }
+};
+} // namespace
+/// Returns a new TensorPadSpecializationPass instance.
+std::unique_ptr<Pass> mlir::createTensorPadSpecializationPass() {
+  return std::make_unique<TensorPadSpecializationPass>();
+}
diff --git a/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp b/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
index cf5ff1e69..be6485a85 100644
--- a/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
+++ b/compiler/lib/Dialect/Transform/Transforms/TransformInsertion.cpp
@@ -25,6 +25,7 @@
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h"
 #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/IR/Builders.h"
@@ -89,6 +90,61 @@ void insertTransformIR(ModuleOp m, const TransformInsertionConfig &config) {
   }
 }
 
+struct DetensorizeTransformInsertionPass
+    : public DetensorizeTransformInsertionBase<
+          DetensorizeTransformInsertionPass> {
+  explicit DetensorizeTransformInsertionPass(const std::string &funcAnchor,
+                                             const std::string &matchPrefix)
+      : DetensorizeTransformInsertionBase() {
+    this->funcAnchorAttr = funcAnchor;
+    this->matchPrefix = matchPrefix;
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<transform::TransformDialect>();
+    linalg::registerTransformDialectExtension(registry);
+  }
+
+  static bool isScalarTensorOp(linalg::LinalgOp linalgOp) {
+    if (!linalgOp.hasTensorSemantics())
+      return false;
+    // Only ops without any iteration dimension qualify.
+    if (linalgOp.getNumLoops() != 0)
+      return false;
+    // Every operand must be a scalar or a rank-0 ranked tensor.
+    auto isScalarOrScalarTensorOperand = [&](OpOperand &operand) {
+      if (linalgOp.isScalar(&operand))
+        return true;
+
+      auto tensorType =
+          llvm::dyn_cast<RankedTensorType>(operand.get().getType());
+      if (!tensorType)
+        return false;
+
+      return tensorType.getRank() == 0;
+    };
+    return llvm::all_of(linalgOp->getOpOperands(),
+                        isScalarOrScalarTensorOperand);
+  }
+
+  void runOnOperation() override {
+    auto opFilter = [](Operation *op) {
+      if (auto linalgOp = llvm::dyn_cast_or_null<linalg::LinalgOp>(op)) {
+        return isScalarTensorOp(linalgOp);
+      }
+      return false;
+    };
+    // Builds a transform::DetensorizeOp on the value given for a matched op.
+    auto transformBuilder = [](ImplicitLocOpBuilder &b, Operation *,
+                               Value pdlValue) {
+      b.create<transform::DetensorizeOp>(pdlValue);
+    };
+
+    insertTransformIR(getOperation(), {funcAnchorAttr, matchPrefix, opFilter,
+                                       transformBuilder});
+  }
+};
+
 struct FuseExtTransformInsertionPass
     : public FuseExtTransformInsertionBase<FuseExtTransformInsertionPass> {
   explicit FuseExtTransformInsertionPass(
@@ -166,8 +222,46 @@ struct GenericTransformInsertionPass
 protected:
   TransformInsertionConfig config;
 };
+
+struct RewriteInDPSTransformInsertionPass
+    : public RewriteInDPSTransformInsertionBase<
+          RewriteInDPSTransformInsertionPass> {
+  explicit RewriteInDPSTransformInsertionPass(const std::string &funcAnchor,
+                                              const std::string &matchPrefix)
+      : RewriteInDPSTransformInsertionBase() {
+    this->funcAnchorAttr = funcAnchor;
+    this->matchPrefix = matchPrefix;
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<transform::TransformDialect>();
+    linalg::registerTransformDialectExtension(registry);
+  }
+
+  void runOnOperation() override {
+    auto opFilter = [](Operation *op) {
+      return llvm::isa<tensor::FromElementsOp>(op);
+    };
+    // Requests a destination-passing-style rewrite of each matched op.
+    auto transformBuilder = [](ImplicitLocOpBuilder &b, Operation *,
+                               Value pdlValue) {
+      b.create<transform::RewriteInDestinationPassingStyleOp>(
+          pdlValue.getType(), pdlValue);
+    };
+    // Hand anchor, prefix, filter and builder to the shared insertion helper.
+    insertTransformIR(getOperation(), {funcAnchorAttr, matchPrefix, opFilter,
+                                       transformBuilder});
+  }
+};
 } // namespace
 
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createDetensorizeTransformInsertionPass(const std::string &funcAnchor,
+                                              const std::string &matchPrefix) {
+  return std::make_unique<DetensorizeTransformInsertionPass>(funcAnchor,
+                                                             matchPrefix);
+}
+
 std::unique_ptr<OperationPass<ModuleOp>>
 mlir::createFuseExtTransformInsertionPass(
     const std::string &funcAnchor, const std::string &matchPrefix,
@@ -182,4 +276,11 @@ std::unique_ptr<OperationPass<ModuleOp>>
 mlir::createGenericTransformInsertionPass(
     const TransformInsertionConfig &config) {
   return std::make_unique<GenericTransformInsertionPass>(config);
+}
+
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createRewriteInDPSTransformInsertionPass(const std::string &funcAnchor,
+                                               const std::string &matchPrefix) {
+  return std::make_unique<RewriteInDPSTransformInsertionPass>(funcAnchor,
+                                                              matchPrefix);
 }
\ No newline at end of file
diff --git a/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp b/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp
index 277174dfa..f5ae6ffea 100644
--- a/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp
+++ b/compiler/lib/Dialect/mhlo/Transforms/CanonicalizeExt.cpp
@@ -1698,6 +1698,60 @@ LogicalResult mlir::mhlo::foldReverseWithConstant(mhlo::ReverseOp op,
   return success();
 }
 
+// Folds a GatherOp whose start_indices is a rank-1 iota that picks every
+// slice of the operand in order, i.e. whose result is the operand itself.
+LogicalResult mlir::mhlo::foldGatherWithInput(mhlo::GatherOp gatherOp,
+                                              PatternRewriter &rewriter) {
+  auto operand = gatherOp.getOperand();
+  auto operandTy = operand.getType().cast<ShapedType>();
+  if (!operandTy.hasRank()) {
+    return failure();
+  }
+
+  // Equal types are necessary but not sufficient: a gather that swaps two
+  // equally sized dims (e.g. of a square tensor) keeps the type as well.
+  auto resultTy = gatherOp.getType().cast<ShapedType>();
+  if (resultTy != operandTy) {
+    return failure();
+  }
+
+  auto startIndices = gatherOp.getStartIndices();
+  auto startIndicesTy = startIndices.getType().cast<ShapedType>();
+  auto iotaOp = startIndices.getDefiningOp<mhlo::IotaOp>();
+  if (!iotaOp || !startIndicesTy.hasRank()) {
+    return failure();
+  }
+
+  // Only rank-1 indices with an implicit trailing index vector are handled.
+  int64_t indexVectorDim = startIndicesTy.getRank();
+  auto dimensionNumbers = gatherOp.getDimensionNumbers();
+  if (dimensionNumbers.getIndexVectorDim() != indexVectorDim ||
+      indexVectorDim != 1) {
+    return failure();
+  }
+  if (dimensionNumbers.getStartIndexMap().size() != 1) {
+    return failure();
+  }
+
+  int64_t startIndexMap = dimensionNumbers.getStartIndexMap()[0];
+  bool mapToCollapsedDim = false;
+  for (auto dim : dimensionNumbers.getCollapsedSliceDims()) {
+    mapToCollapsedDim |= (dim == startIndexMap);
+  }
+  // The single batch dim of the result is the one missing from offset_dims.
+  // It must be the indexed operand dim, otherwise the gather transposes.
+  for (auto dim : dimensionNumbers.getOffsetDims()) {
+    if (dim == startIndexMap) {
+      return failure();
+    }
+  }
+  if (!mapToCollapsedDim) {
+    return failure();
+  }
+  rewriter.replaceOp(gatherOp, operand);
+  return success();
+}
+
 void mlir::mhlo::populateCanonicalizeExtPatterns(RewritePatternSet &patterns,
                                                  MLIRContext *ctx,
                                                  bool blindFold) {
@@ -1725,6 +1779,7 @@ void mlir::mhlo::populateCanonicalizeExtPatterns(RewritePatternSet &patterns,
   patterns.add(mlir::mhlo::simplifyCumsumToIota);
   patterns.add(mlir::mhlo::simplifyTransposeReshapeTranspose);
   patterns.add(mlir::mhlo::foldReverseWithConstant);
+  patterns.add(mlir::mhlo::foldGatherWithInput);
   if (blindFold) {
     patterns.add(mlir::mhlo::foldLargeConcatenate);
   }
diff --git a/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp b/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp
index 6be432b7d..6eac9b56c 100644
--- a/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp
+++ b/compiler/lib/Dialect/mhlo/Transforms/CatFusion.cpp
@@ -67,6 +67,8 @@ bool isFusibleWith(Operation *target, Operation * /*start*/) { return true; }
 
 bool isValidSingleOp(Operation *op) { return true; }
 
+bool isValidFusionPattern(const MhloFusionPattern &) { return true; }
+
 bool isFusibleCandidateAggressive(Operation *op) {
   if (isa<cat::CatOpInterface>(op))
     return true;
@@ -99,14 +101,16 @@ bool isValidSingleOpAggressive(Operation *op) {
 }
 
 static GenericFuserConfig config{
-    getByteIRCatFusionAttrName(), cat_fusion::isFusibleCandidate,
-    cat_fusion::isFusibleStart,   cat_fusion::isFusibleTrigger,
-    cat_fusion::isFusibleWith,    cat_fusion::isValidSingleOp};
+    getByteIRCatFusionAttrName(),    cat_fusion::isFusibleCandidate,
+    cat_fusion::isFusibleStart,      cat_fusion::isFusibleTrigger,
+    cat_fusion::isFusibleWith,       cat_fusion::isValidSingleOp,
+    cat_fusion::isValidFusionPattern};
 
 static GenericFuserConfig aggressiveConfig{
-    getByteIRCatFusionAttrName(), cat_fusion::isFusibleCandidateAggressive,
-    cat_fusion::isFusibleStart,   cat_fusion::isFusibleTrigger,
-    cat_fusion::isFusibleWith,    cat_fusion::isValidSingleOpAggressive};
+    getByteIRCatFusionAttrName(),    cat_fusion::isFusibleCandidateAggressive,
+    cat_fusion::isFusibleStart,      cat_fusion::isFusibleTrigger,
+    cat_fusion::isFusibleWith,       cat_fusion::isValidSingleOpAggressive,
+    cat_fusion::isValidFusionPattern};
 
 } // namespace cat_fusion
 
diff --git a/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp b/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp
index 0f7bd48f5..bca092517 100644
--- a/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp
+++ b/compiler/lib/Dialect/mhlo/Transforms/ConvertOpToCustomCall.cpp
@@ -1,4 +1,4 @@
-//===- ConvertRngToCustomCall.cpp -----------------------------*--- C++ -*-===//
+//===- ConvertOpToCustomCall.cpp ------------------------------*--- C++ -*-===//
 //
 // Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -74,6 +74,20 @@ func::CallOp getOrCreateCallGetSeedOp(func::FuncOp func,
   return callGetSeedOp;
 }
 
+llvm::SmallVector<NamedAttribute> getDefaultAttrs(PatternRewriter &rewriter) {
+  // Baseline attributes for a newly built mhlo.custom_call.
+  auto apiVersion = static_cast<int>(
+      mhlo::CustomCallApiVersion::API_VERSION_ORIGINAL);
+  return {
+      rewriter.getNamedAttr("has_side_effect", rewriter.getBoolAttr(false)),
+      rewriter.getNamedAttr("backend_config", rewriter.getStringAttr("")),
+      rewriter.getNamedAttr("api_version",
+                            rewriter.getI32IntegerAttr(apiVersion)),
+      rewriter.getNamedAttr("called_computations",
+                            rewriter.getArrayAttr({})),
+  };
+}
+
 struct ConvertRngUniformToCustomCall : public OpRewritePattern<mhlo::RngOp> {
   using OpRewritePattern<mhlo::RngOp>::OpRewritePattern;
 
@@ -120,6 +134,76 @@ struct ConvertRngUniformToCustomCall : public OpRewritePattern<mhlo::RngOp> {
     return success();
   }
 };
+
+struct ConvertFlashFwdToCustomCall
+    : public OpRewritePattern<mhlo::CustomCallOp> {
+  using OpRewritePattern<mhlo::CustomCallOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(mhlo::CustomCallOp op,
+                                PatternRewriter &rewriter) const override {
+    // All checks come first so that no IR is created on a failed match.
+    if (op.getCallTargetName() != getFlashAttnFwdName())
+      return rewriter.notifyMatchFailure(op, "op name not match");
+    if (op.getNumResults() != 4)
+      return rewriter.notifyMatchFailure(op, "op result num not match");
+    if (op->getNumOperands() < 3)
+      return rewriter.notifyMatchFailure(op, "op operand num not match");
+    auto dictAttr =
+        op->template getAttrOfType<DictionaryAttr>(getCustomCallAttrName());
+    if (!dictAttr)
+      return rewriter.notifyMatchFailure(op, "custom call attrs not found");
+
+    auto q = op.getOperand(0);
+    auto k = op.getOperand(1);
+    auto v = op.getOperand(2);
+    Type outType = op.getResult(0).getType();
+    Type softmaxLseType = op.getResult(1).getType();
+    Type softmaxType = op.getResult(2).getType();
+    TensorType seedOrOffsetType =
+        RankedTensorType::get({}, rewriter.getI64Type());
+    ModuleOp module = op->getParentRegion()->getParentOfType<ModuleOp>();
+    auto functionType = FunctionType::get(module.getContext(), {},
+                                          ArrayRef<Type>{seedOrOffsetType});
+    func::FuncOp getSeedFunc = getOrCreatePrivateFunctionDeclare(
+        module, "GetSeedFunc", "GetSeed", functionType);
+    func::FuncOp nextOffsetFunc = getOrCreatePrivateFunctionDeclare(
+        module, "NextOffsetFunc", "NextOffset", functionType);
+
+    // Reuse one GetSeed call per function instead of calling it for every op.
+    auto getSeedOp = getOrCreateCallGetSeedOp(
+        op->getParentRegion()->getParentOfType<func::FuncOp>(), getSeedFunc,
+        rewriter);
+    auto getOffsetOp = rewriter.create<func::CallOp>(
+        op->getLoc(), nextOffsetFunc, ArrayRef<Value>{});
+    // The rng state operand is concat(seed, offset) : tensor<2xi64>.
+    TensorType seedOrOffsetReshapedType =
+        RankedTensorType::get({1}, rewriter.getI64Type());
+    TensorType rngStateType = RankedTensorType::get({2}, rewriter.getI64Type());
+    auto reshapeSeedOp = rewriter.create<mhlo::ReshapeOp>(
+        op.getLoc(), seedOrOffsetReshapedType, getSeedOp.getResult(0));
+    auto reshapeOffsetOp = rewriter.create<mhlo::ReshapeOp>(
+        op.getLoc(), seedOrOffsetReshapedType, getOffsetOp.getResult(0));
+    auto concatOp = rewriter.create<mhlo::ConcatenateOp>(
+        op.getLoc(), rngStateType,
+        ValueRange{reshapeSeedOp.getResult(), reshapeOffsetOp.getResult()}, 0);
+    SmallVector<Value> bufferArgs{q, k, v, concatOp.getResult()};
+    auto attrs = getDefaultAttrs(rewriter);
+    attrs.emplace_back(rewriter.getStringAttr("call_target_name"),
+                       rewriter.getStringAttr(getFlashAttnFwdName()));
+    attrs.emplace_back(rewriter.getStringAttr(getCustomCallAttrName()),
+                       dictAttr);
+    auto customCallOp = rewriter.create<mhlo::CustomCallOp>(
+        op->getLoc(), ArrayRef<Type>{outType, softmaxLseType, softmaxType},
+        bufferArgs, ArrayRef<NamedAttribute>{attrs});
+    // Owning storage: a ValueRange built from a braced list would dangle.
+    SmallVector<Value> results{customCallOp.getResult(0),
+                               customCallOp.getResult(1),
+                               customCallOp.getResult(2), concatOp.getResult()};
+    rewriter.replaceOp(op, results);
+    return success();
+  }
+};
+
 struct ConvertOpToCustomCallPass
     : public ConvertOpToCustomCallBase<ConvertOpToCustomCallPass> {
 
@@ -140,6 +224,7 @@ struct ConvertOpToCustomCallPass
 
       RewritePatternSet patterns(context);
       populateRngPatternToCustomCall(patterns);
+      populateFlashFwdRewritePattern(patterns);
 
       FrozenRewritePatternSet frozenPatterns(std::move(patterns));
       if (failed(applyPatternsAndFoldGreedily(funcOp, frozenPatterns))) {
@@ -155,6 +240,10 @@ void mlir::populateRngPatternToCustomCall(RewritePatternSet &patterns) {
   patterns.add<ConvertRngUniformToCustomCall>(patterns.getContext());
 }
 
+void mlir::populateFlashFwdRewritePattern(RewritePatternSet &patterns) {
+  patterns.add<ConvertFlashFwdToCustomCall>(patterns.getContext());
+}
+
 std::unique_ptr<OperationPass<ModuleOp>>
 mlir::createConvertOpToCustomCallPass(llvm::StringRef anchor) {
   return std::make_unique<ConvertOpToCustomCallPass>(anchor);
diff --git a/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp b/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp
index 0caf95fbf..894c5866c 100644
--- a/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp
+++ b/compiler/lib/Dialect/mhlo/Transforms/GenericFusion.cpp
@@ -96,10 +96,13 @@ bool isValidSingleOp(Operation *op) {
          isCustomMhloRngOp(op);
 }
 
+bool isValidFusionPattern(const MhloFusionPattern &) { return true; }
+
 static GenericFuserConfig config{
     getByteIRElementwiseFusionAttrName(), elementwise::isFusibleCandidate,
     elementwise::isFusibleStart,          elementwise::isFusibleTrigger,
-    elementwise::isFusibleWith,           elementwise::isValidSingleOp};
+    elementwise::isFusibleWith,           elementwise::isValidSingleOp,
+    elementwise::isValidFusionPattern};
 
 } // namespace elementwise
 
@@ -126,15 +129,85 @@ bool isFusibleWith(Operation * /*target*/, Operation * /*start*/) {
 
 bool isValidSingleOp(Operation *op) { return false; }
 
+bool isValidFusionPattern(const MhloFusionPattern &) { return true; }
+
 static GenericFuserConfig config{getByteIRMatmulEpilogueFusionAttrName(),
                                  matmul_epilogue::isFusibleCandidate,
                                  matmul_epilogue::isFusibleStart,
                                  matmul_epilogue::isFusibleTrigger,
                                  matmul_epilogue::isFusibleWith,
-                                 matmul_epilogue::isValidSingleOp};
+                                 matmul_epilogue::isValidSingleOp,
+                                 matmul_epilogue::isValidFusionPattern};
 
 } // namespace matmul_epilogue
 
+namespace reduction {
+// TODO: maybe support non-splat constants on device in the future
+bool isFusibleCandidate(Operation *op) {
+  return isMhlo(op) && (op->hasTrait<::mlir::OpTrait::Elementwise>() ||
+                        op->hasTrait<hlo::OpTrait::BroadcastingElementwise>() ||
+                        isSplatMhloConstantLike(op) ||
+                        isa<mhlo::BroadcastInDimOp, mhlo::BroadcastOp,
+                            mhlo::ReshapeOp, mhlo::ReduceOp>(op));
+}
+
+// Every candidate may start a fusion cluster.
+bool isFusibleStart(Operation *op) { return true; }
+
+bool isFusibleTrigger(Operation *op) {
+  if (op->hasTrait<::mlir::OpTrait::Elementwise>() ||
+      op->hasTrait<hlo::OpTrait::BroadcastingElementwise>() ||
+      isa<mhlo::ReshapeOp>(op)) {
+    return true;
+  }
+
+  // For a broadcast, the decision depends on how its source is produced.
+  if (isa<mhlo::BroadcastInDimOp, mhlo::BroadcastOp>(op)) {
+    auto src = op->getOperand(0);
+    // foldable source is allowed (NOTE(review): src may be a block argument)
+    if (isDeepMhloFoldable(src.getDefiningOp())) {
+      return true;
+    }
+    // otherwise it should only trigger when src has a single use:
+    // return useCount(src) == 1;
+    // LWC FIXME: change back to above after broadcast fusion is resolved.
+    return false;
+  }
+
+  if (isa<mhlo::ReduceOp>(op))
+    return true;
+
+  return false;
+}
+
+bool isFusibleWith(Operation *target, Operation * /*start*/) {
+  return (target->hasTrait<::mlir::OpTrait::Elementwise>() ||
+          target->hasTrait<hlo::OpTrait::BroadcastingElementwise>() ||
+          isSplatMhloConstantLike(target) ||
+          isa<mhlo::BroadcastInDimOp, mhlo::BroadcastOp, mhlo::ReshapeOp>(
+              target)) &&
+         target->hasOneUse();
+}
+
+bool isValidSingleOp(Operation *op) { return isa<mhlo::ReduceOp>(op); }
+
+bool isValidFusionPattern(const MhloFusionPattern &pattern) {
+  SmallVector<Value, 4> outputs = getOutputsOfCluster(pattern);
+  if (outputs.size() == 1) {
+    if (outputs[0].getDefiningOp<mhlo::ReduceOp>())
+      return true;
+  }
+  return false;
+}
+
+static GenericFuserConfig config{
+    getByteIRReductionFusionAttrName(), reduction::isFusibleCandidate,
+    reduction::isFusibleStart,          reduction::isFusibleTrigger,
+    reduction::isFusibleWith,           reduction::isValidSingleOp,
+    reduction::isValidFusionPattern};
+
+} // namespace reduction
+
 // a derived fusion pass for elementwise
 struct ElementwiseFusionPass : public GenericFusionPass<ElementwiseFusionPass> {
 
@@ -188,6 +261,29 @@ struct MatmulEpilogueFusionPass
   ::llvm::StringRef getName() const override { return "MatmulEpilogueFusion"; }
 };
 
+// A GenericFusionPass specialization configured with reduction::config.
+struct ReductionFusionPass : public GenericFusionPass<ReductionFusionPass> {
+  // Hand-written pass boilerplate: type id, argument, description and name.
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReductionFusionPass)
+
+  ReductionFusionPass() : GenericFusionPass(reduction::config, false) {}
+
+  /// Returns the command-line argument attached to this pass.
+  static constexpr ::llvm::StringLiteral getArgumentName() {
+    return ::llvm::StringLiteral("fuse-reduction");
+  }
+  ::llvm::StringRef getArgument() const override { return "fuse-reduction"; }
+  /// Returns the one-line description of this pass.
+  ::llvm::StringRef getDescription() const override {
+    return "Fuse reduction with its producer";
+  }
+
+  /// Returns the derived pass name.
+  static constexpr ::llvm::StringLiteral getPassName() {
+    return ::llvm::StringLiteral("ReductionFusion");
+  }
+  ::llvm::StringRef getName() const override { return "ReductionFusion"; }
+};
 } // namespace
 
 std::unique_ptr<OperationPass<func::FuncOp>>
@@ -199,3 +295,7 @@ std::unique_ptr<OperationPass<func::FuncOp>>
 mlir::createMatmulEpilogueFusionPass() {
   return std::make_unique<MatmulEpilogueFusionPass>();
 }
+
+std::unique_ptr<OperationPass<func::FuncOp>> mlir::createReductionFusionPass() {
+  return std::make_unique<ReductionFusionPass>();
+}
diff --git a/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp b/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp
index 678d6bc9c..6db573e5f 100644
--- a/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp
+++ b/compiler/lib/Dialect/mhlo/Transforms/HloAggressiveFusion.cpp
@@ -47,12 +47,15 @@ bool isFusibleWith(Operation *, Operation *) { return true; }
 
 bool isValidSingleOp(Operation *) { return true; }
 
+bool isValidFusionPattern(const MhloFusionPattern &) { return true; }
+
 static GenericFuserConfig config{getByteIRHloAggressiveFusionAttrName(),
                                  aggressive_fusion::isFusibleCandidate,
                                  aggressive_fusion::isFusibleStart,
                                  aggressive_fusion::isFusibleTrigger,
                                  aggressive_fusion::isFusibleWith,
-                                 aggressive_fusion::isValidSingleOp};
+                                 aggressive_fusion::isValidSingleOp,
+                                 aggressive_fusion::isValidFusionPattern};
 
 } // namespace aggressive_fusion
 
diff --git a/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp b/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp
index 55b74ef26..7bcbd1371 100644
--- a/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp
+++ b/compiler/lib/Dialect/mhlo/Transforms/HloMoveDown.cpp
@@ -59,12 +59,21 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern<mhlo::TransposeOp> {
   LogicalResult matchAndRewrite(mhlo::TransposeOp op,
                                 PatternRewriter &rewriter) const override {
     auto value = op.getResult();
-    auto operandType = op.getOperand().getType(); // T1 as Transpose: T1 -> T2
-
     // early termination if not allMultiUser nor multiUser but has multi users
     if (!allMultiUser && !multiUser && userCount(value) != 1) {
       return failure();
     }
+    auto permutationAttr = op.getPermutation();
+
+    auto isTransposeWithSamePermutation =
+        [&permutationAttr](Value val) -> bool {
+      auto op = val.getDefiningOp<mhlo::TransposeOp>();
+      if (!op) {
+        return false;
+      } else {
+        return op.getPermutation() == permutationAttr;
+      }
+    };
 
     llvm::SetVector<Operation *> users;
     for (auto user : value.getUsers()) {
@@ -94,13 +103,19 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern<mhlo::TransposeOp> {
       // isElementwiseOneResult(user) == true
       bool failed = false;
       for (auto operand : user->getOperands()) {
-        if (operand != value && !isSplatMhloConstantValue(operand)) {
-          if (allMultiUser)
-            return failure();
-          failed = true;
-          break;
+        if (operand == value) {
+          continue;
+        } else if (isDenseMhloConstantValue(operand)) {
+          continue;
+        } else if (isTransposeWithSamePermutation(operand)) {
+          continue;
         }
+        if (allMultiUser)
+          return failure();
+        failed = true;
+        break;
       }
+
       if (failed)
         continue;
       users.insert(user);
@@ -119,8 +134,10 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern<mhlo::TransposeOp> {
           if (!bvm.contains(value)) {
             bvm.map(value, op.getOperand());
           }
+        } else if (isTransposeWithSamePermutation(operand)) {
+          bvm.map(operand, operand.getDefiningOp<TransposeOp>().getOperand());
         } else {
-          // isSplatMhloConstantValue(operand) == true
+          // isDenseMhloConstantValue(operand) == true
           // since it has been checked when collecting users
           if (!constInputs.contains(operand)) {
             constInputs.insert(operand);
@@ -130,14 +147,19 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern<mhlo::TransposeOp> {
 
       // create all const and put into bvm
       for (auto input : constInputs) {
-        ElementsAttr oldConstAttr =
-            input.getDefiningOp<mhlo::ConstantOp>().getValue();
-        auto newConstAttr = reshapeSplatElementsAttr(oldConstAttr, operandType);
-        auto newConstOp =
-            rewriter.create<mhlo::ConstantOp>(op->getLoc(), *newConstAttr);
-        bvm.map(input, newConstOp.getOutput());
+        SmallVector<uint64_t> newPermutation(permutationAttr.size());
+        std::for_each(permutationAttr.value_begin<APInt>(),
+                      permutationAttr.value_end<APInt>(),
+                      [i = 0, &newPermutation](auto e) mutable {
+                        newPermutation[e.getSExtValue()] = (uint64_t)i++;
+                      });
+        auto newPermutationAttr = DenseIntElementsAttr::get(
+            permutationAttr.getType(), newPermutation);
+        auto ConstOp = input.getDefiningOp<ConstantOp>();
+        auto newTransposeOp = rewriter.create<mhlo::TransposeOp>(
+            ConstOp.getLoc(), ConstOp.getOutput(), newPermutationAttr);
+        bvm.map(input, newTransposeOp.getResult());
       }
-
       auto maybeResultTypes =
           mixTypes(/*cloneFromElementTypes*/ user->getResultTypes(),
                    /*cloneFromShapes*/ op->getOperandTypes());
@@ -145,6 +167,8 @@ struct TransposeMoveDownPattern : public HloMoveDownPattern<mhlo::TransposeOp> {
       // maybeResultTypes should always have value
       assert(maybeResultTypes.has_value());
 
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPointAfter(user);
       // clone an elementwise op as producer
       auto newProducer =
           cloneAndReplaceResultTypes(rewriter, user, bvm, *maybeResultTypes);
diff --git a/compiler/lib/Pipelines/BufferizeOpt.cpp b/compiler/lib/Pipelines/BufferizeOpt.cpp
index dcbb11485..4a5f2c5bf 100644
--- a/compiler/lib/Pipelines/BufferizeOpt.cpp
+++ b/compiler/lib/Pipelines/BufferizeOpt.cpp
@@ -23,6 +23,7 @@
 #include "byteir/Transforms/Passes.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Transforms/Passes.h"
 #include "transforms/passes.h"
 
@@ -37,6 +38,7 @@ void mlir::createByteIRBufferizeOptPipeline(
         pm.addPass(byteir::createOneShotBufferizePass());
         addCleanUpExtPassPipeline(pm);
 
+        pm.addNestedPass<func::FuncOp>(memref::createFoldMemRefAliasOpsPass());
         // clean-up possible redundant copy from bufferization
         // perform twice, since cse is not greedy-based
         pm.addNestedPass<func::FuncOp>(createRemoveCopyPass());
diff --git a/compiler/lib/Pipelines/ByreOpt.cpp b/compiler/lib/Pipelines/ByreOpt.cpp
index 0e78f8742..ad4bba814 100644
--- a/compiler/lib/Pipelines/ByreOpt.cpp
+++ b/compiler/lib/Pipelines/ByreOpt.cpp
@@ -48,7 +48,10 @@ void createByreOptPipelineImpl(OpPassManager &pm, const std::string &entryFunc,
   OpPassManager anchoredPM(func::FuncOp::getOperationName());
   if (!disableMemoryPlanning) {
     // underlying memory of constant op cannot be reused
-    anchoredPM.addPass(createMemoryPlanningPass(128, nullptr));
+    anchoredPM.addPass(createMemoryPlanningPass(/* alignment */ 128,
+                                                /* alloca */ false,
+                                                /* memory space */ 0,
+                                                /* callback */ nullptr));
     anchoredPM.addPass(createCanonicalizerPass());
   }
   anchoredPM.addPass(createConvertMemrefToByrePass());
diff --git a/compiler/lib/Pipelines/GPU/CMakeLists.txt b/compiler/lib/Pipelines/GPU/CMakeLists.txt
index 4ab1ad0a7..8eea9ad17 100644
--- a/compiler/lib/Pipelines/GPU/CMakeLists.txt
+++ b/compiler/lib/Pipelines/GPU/CMakeLists.txt
@@ -2,7 +2,9 @@ add_mlir_library(ByteIRGPUPipelines
   ElementwiseCodegen.cpp
   GPUOpt.cpp
   LinalgMemrefGPU.cpp
+  MappingForall.cpp
   NVVMCodegen.cpp
+  ReductionCodegen.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/GPU
@@ -15,6 +17,7 @@ add_mlir_library(ByteIRGPUPipelines
   MLIRBufferTransforms
 
   LINK_LIBS PUBLIC
+  ByteIRGPUPasses
   ByteIRLinalgPasses
   ByteIRPipelineCommon
   ByteIRUtils
@@ -22,4 +25,5 @@ add_mlir_library(ByteIRGPUPipelines
   ByteIRToPTX
   MLIRIR
   MLIRTransforms
+  MLIRLinalgExtTransformOps
 )
\ No newline at end of file
diff --git a/compiler/lib/Pipelines/GPU/GPUOpt.cpp b/compiler/lib/Pipelines/GPU/GPUOpt.cpp
index e5cf725b8..842901b04 100644
--- a/compiler/lib/Pipelines/GPU/GPUOpt.cpp
+++ b/compiler/lib/Pipelines/GPU/GPUOpt.cpp
@@ -20,10 +20,14 @@
 #include "byteir/Conversion/ToGPU/ToGPU.h"
 #include "byteir/Conversion/ToPTX/ToPTX.h"
 #include "byteir/Dialect/Affine/Passes.h"
+#include "byteir/Dialect/GPU/Passes.h"
 #include "byteir/Dialect/SCF/Passes.h"
+#include "byteir/Dialect/Transform/Transforms/TransformDialectInterpreter.h"
 #include "byteir/Dialect/mhlo/Passes.h"
 #include "byteir/Pipelines/Common/Utils.h"
+#include "byteir/Pipelines/GPU/MappingForall.h"
 #include "byteir/Transforms/Passes.h"
+#include "byteir/Transforms/RemoveFuncBody.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
@@ -35,8 +39,9 @@ using namespace mlir;
 using namespace mlir::bufferization;
 
 namespace {
-void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv,
-                              const std::string &target) {
+void createElementwiseGPUOptPipelineImpl(OpPassManager &pm,
+                                         const bool &useBarePtrCallConv,
+                                         const std::string &target) {
   // apply PromotoBufferStack to func's with
   // getByteIRElementwiseFusionAttrName
   {
@@ -73,6 +78,36 @@ void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv,
   pm.addNestedPass<func::FuncOp>(createGenPTXConfigPass(useBarePtrCallConv));
 }
 
+void createReductionGPUOptPipelineImpl(OpPassManager &pm) {
+  GPUMappingForallOptions options;
+  options.funcAnchor = getByteIRReductionFusionAttrName().str();
+  createGPUMappingForallTransform(pm, options);
+  pm.addPass(createTransformDialectInterpreter(true));
+  pm.addPass(createCSEPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createGpuLauchSinkIndexComputationsPass());
+  // Promote buffers nested in a gpu.launch to stack via an anchored pipeline.
+  {
+    OpPassManager anchoredPM(func::FuncOp::getOperationName());
+    // Any value whose parent region is inside a gpu::LaunchOp is "small".
+    anchoredPM.addPass(createPromoteBuffersToStackPass(
+        /*isSmallAlloc =*/[](Value value) {
+          return value.getParentRegion()->getParentOfType<gpu::LaunchOp>();
+        }));
+    // Nest it through the anchored pipeline keyed by the reduction attr name.
+    pm.addNestedPass<func::FuncOp>(createAnchoredPipelinePass(
+        getByteIRReductionFusionAttrName(), anchoredPM));
+  }
+  pm.addPass(createGpuKernelOutliningPass());
+}
+
+void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv,
+                              const std::string &target) {
+  createElementwiseGPUOptPipelineImpl(pm, useBarePtrCallConv, target);
+  createReductionGPUOptPipelineImpl(pm);
+  pm.addPass(createCollectGPUKernelPass("unified", false));
+}
+
 } // namespace
 
 void mlir::createGPUOptPipeline(OpPassManager &pm,
diff --git a/compiler/lib/Pipelines/GPU/MappingForall.cpp b/compiler/lib/Pipelines/GPU/MappingForall.cpp
new file mode 100644
index 000000000..633214bee
--- /dev/null
+++ b/compiler/lib/Pipelines/GPU/MappingForall.cpp
@@ -0,0 +1,148 @@
+//===- MappingForall.cpp --------------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include "byteir/Pipelines/GPU/MappingForall.h"
+
+#include "byteir/Conversion/ToGPU/ToGPU.h"
+#include "byteir/Conversion/ToLLVM/ToLLVM.h"
+#include "byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.h"
+#include "byteir/Dialect/Transform/IR/TransformExtOps.h"
+#include "byteir/Dialect/Transform/Transforms/TransformInsertion.h"
+#include "byteir/Pipelines/Common/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "llvm/ADT/SmallSet.h"
+
+#include <optional>
+
+using namespace mlir;
+
+namespace {
+
+static constexpr int64_t kMaximumBlockDim = 1024; // upper bound on blockDims[0]*blockDims[1]*blockDims[2] accepted by getMappingForallConfig
+
+struct MappingForallConfig { // value returned by getMappingForallConfig
+  SmallVector<int64_t> blockDims; // passed as block_dims to MapNestedForallToThreads; built with 3 entries, indexed by thread mapping id
+};
+
+// TODO: move to common helper
+bool isMappedToGPUBlocks(scf::ForallOp forallOp) { // true iff at least one mapping entry is a gpu::GPUBlockMappingAttr
+  if (auto mapping = forallOp.getMappingAttr()) { // a forall without a mapping attribute is never block-mapped
+    if (llvm::any_of(mapping.getValue(), [](Attribute attr) {
+          return isa<gpu::GPUBlockMappingAttr>(attr);
+        })) {
+      return true;
+    }
+  }
+  // no mapping attribute, or none of its entries targets GPU blocks
+  return false;
+}
+
+bool isMappedToGPUThreads(scf::ForallOp forallOp) { // true iff at least one mapping entry is a gpu::GPUThreadMappingAttr
+  if (auto mapping = forallOp.getMappingAttr()) { // a forall without a mapping attribute is never thread-mapped
+    if (llvm::any_of(mapping.getValue(), [](Attribute attr) {
+          return isa<gpu::GPUThreadMappingAttr>(attr);
+        })) {
+      return true;
+    }
+  }
+  // no mapping attribute, or none of its entries targets GPU threads
+  return false;
+}
+
+void updateBlockDims(scf::ForallOp forallOp, SmallVector<int64_t> &blockDims) { // grows blockDims (in place) to cover forallOp's thread-mapped constant trip counts
+  for (auto &&[lb, ub, step, mappingAttr] : llvm::zip(
+           forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(),
+           forallOp.getMixedStep(), forallOp.getMappingAttr().getValue())) { // getMappingAttr() is used unchecked: caller must pass a forall that has a mapping
+    if (auto threadMapping =
+            llvm::dyn_cast_or_null<gpu::GPUThreadMappingAttr>(mappingAttr)) { // entries that are not thread mappings are skipped
+      auto numIterations = constantTripCount(lb, ub, step);
+      auto threadIdx = threadMapping.getMappingId(); // used as an index into blockDims; assumes it is < blockDims.size() - TODO confirm
+      if (numIterations.has_value()) { // loops without a constant trip count leave blockDims untouched
+        blockDims[threadIdx] =
+            std::max(blockDims[threadIdx], numIterations.value());
+      }
+    }
+  }
+}
+
+std::optional<MappingForallConfig>
+getMappingForallConfig(scf::ForallOp forallOp) { // block dims for a block-mapped forall; nullopt if not block-mapped or too many threads
+  if (!isMappedToGPUBlocks(forallOp))
+    return std::nullopt;
+  // Start from 1x1x1 and grow it from the thread-mapped foralls found in the body block.
+  SmallVector<int64_t> blockDims{1, 1, 1};
+  auto &&block = forallOp.getRegion().front();
+  for (auto &&nestedForall : block.getOps<scf::ForallOp>()) { // iterates the ops of the body block itself
+    if (isMappedToGPUThreads(nestedForall)) {
+      updateBlockDims(nestedForall, blockDims);
+    }
+  }
+  // Reject configurations whose total thread count exceeds kMaximumBlockDim.
+  if (blockDims[0] * blockDims[1] * blockDims[2] > kMaximumBlockDim) {
+    return std::nullopt;
+  }
+  return MappingForallConfig{blockDims};
+}
+
+void createGPUMappingForallTransformImpl(OpPassManager &pm,
+                                         const std::string &anchor,
+                                         const std::string &prefix) { // adds a transform-insertion pass that maps foralls to blocks and threads
+  TransformInsertionConfig config;
+  config.funcAnchor = anchor;
+  config.matchPrefix = prefix;
+  config.opFilter = [=](Operation *op) { // accept only scf.forall ops for which a mapping config can be computed
+    if (auto forallOp = llvm::dyn_cast_or_null<scf::ForallOp>(op)) {
+      return getMappingForallConfig(forallOp).has_value();
+    }
+    return false;
+  };
+  // Builder: map the forall to blocks (generating the launch), then map nested foralls to threads.
+  config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op,
+                                Value pdlV) {
+    auto mappingConfig =
+        getMappingForallConfig(llvm::cast<scf::ForallOp>(op)).value(); // assumes op already passed opFilter - TODO confirm
+    auto pdlType = pdl::OperationType::get(b.getContext());
+    auto launchOp = b.create<transform::MapForallToBlocks>(
+        /* result type */ pdlType,
+        /* target */ pdlV,
+        /* grid_dims */ llvm::ArrayRef<int64_t>{}, // no explicit grid dims are supplied
+        /* generate_gpu_launch */ true);
+
+    b.create<transform::MapNestedForallToThreads>(
+        /* result type*/ pdlType,
+        /* target */ launchOp.getResult(), // operates on the result handle of the block-mapping op
+        /* block_dims */ mappingConfig.blockDims,
+        /* warp_dims */ llvm::ArrayRef<int64_t>{},
+        /* sync_after_distribute*/ true);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(config));
+}
+} // namespace
+
+void mlir::createGPUMappingForallTransform(
+    OpPassManager &pm, const GPUMappingForallOptions &options) { // public entry point; hands the Impl and its arguments to invokeOpPassPipelineBuilder
+  invokeOpPassPipelineBuilder(createGPUMappingForallTransformImpl, pm,
+                              options.funcAnchor, options.annotatePrefix); // funcAnchor -> anchor, annotatePrefix -> prefix
+}
diff --git a/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp b/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp
index aecfc5b84..d23567a58 100644
--- a/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp
+++ b/compiler/lib/Pipelines/GPU/NVVMCodegen.cpp
@@ -19,6 +19,7 @@
 
 #include "byteir/Conversion/GPUToNVVM/GPUToNVVM.h"
 #include "byteir/Conversion/ToPTX/ToPTX.h"
+#include "byteir/Dialect/GPU/Passes.h"
 #include "byteir/Dialect/MemRef/Transforms/ExtractAddressComputation.h"
 #include "byteir/Dialect/MemRef/Transforms/SimplifyLinearizedIndex.h"
 #include "byteir/Dialect/mhlo/Passes.h"
@@ -39,6 +40,9 @@ void createNVVMCodegenPipelineImpl(OpPassManager &pm,
   // TODO add target for supporting different SMs
   // TODO use target to decide passes
   pm.addPass(createCollectGPUKernelPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createShmAllocaToWorkgroupArg());
+  pm.addPass(createCSEPass());
+  pm.addPass(createCanonicalizerPass());
   pm.addPass(createConvertSCFToCFPass());
   pm.addPass(createExtractAddressComputationPass());
   pm.addPass(memref::createExpandStridedMetadataPass());
diff --git a/compiler/lib/Pipelines/GPU/ReductionCodegen.cpp b/compiler/lib/Pipelines/GPU/ReductionCodegen.cpp
new file mode 100644
index 000000000..664fce40e
--- /dev/null
+++ b/compiler/lib/Pipelines/GPU/ReductionCodegen.cpp
@@ -0,0 +1,942 @@
+//===- ReductionCodegen.cpp ---------------------------------*--- C++ -*-===//
+//
+// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include "byteir/Pipelines/GPU/ReductionCodegen.h"
+
+#include "byteir/Conversion/ToGPU/ToGPU.h"
+#include "byteir/Conversion/ToLLVM/ToLLVM.h"
+#include "byteir/Dialect/Linalg/TransformOps/LinalgExtTransformOps.h"
+#include "byteir/Dialect/Transform/IR/TransformExtOps.h"
+#include "byteir/Dialect/Transform/Transforms/TransformInsertion.h"
+#include "byteir/Pipelines/Common/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "llvm/ADT/SmallSet.h"
+
+#include <optional>
+
+using namespace mlir;
+
+namespace {
+//----------------------------------------------------------------------------//
+// common helpers
+//----------------------------------------------------------------------------//
+// TODO: move to common header
+
+constexpr bool isPowerOf2(int64_t n) { return n > 0 && !(n & (n - 1)); } // true iff n is a positive power of two; 0 and negatives are rejected before n - 1 is evaluated
+
+constexpr int64_t nextPowerOf2(int64_t n) { // smallest power of two >= n; returns 1 for every n <= 1
+  return (n <= 1) ? 1 : (isPowerOf2(n) ? n : (2 * nextPowerOf2((n + 1) / 2))); // otherwise recurse on ceil(n / 2) and double the result
+}
+
+bool isMappedToGPUBlocks(scf::ForOp forOp) { // true iff the loop's loop-to-SIMT string attribute equals one of the block-id names
+  if (auto loopToSIMTAttr =
+          forOp->getAttrOfType<StringAttr>(getLoopToSIMTAttrName())) { // attribute missing or not a StringAttr => false
+    auto mappingTo = loopToSIMTAttr.getValue();
+    if (mappingTo == getBlockIdXName() || mappingTo == getBlockIdYName() ||
+        mappingTo == getBlockIdZName()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool isMappedToGPUBlocks(scf::ForallOp forallOp) { // true iff at least one mapping entry is a gpu::GPUBlockMappingAttr
+  if (auto mapping = forallOp.getMappingAttr()) { // a forall without a mapping attribute is never block-mapped
+    if (llvm::any_of(mapping.getValue(), [](Attribute attr) {
+          return isa<gpu::GPUBlockMappingAttr>(attr);
+        })) {
+      return true;
+    }
+  }
+  // no mapping attribute, or none of its entries targets GPU blocks
+  return false;
+}
+
+bool isMappedToGPUBlocks(Operation *op) { // dispatches to the scf.for / scf.forall overloads; null or any other op kind => false
+  if (auto forOp = llvm::dyn_cast_or_null<scf::ForOp>(op)) {
+    return isMappedToGPUBlocks(forOp);
+  }
+  if (auto forallOp = llvm::dyn_cast_or_null<scf::ForallOp>(op)) {
+    return isMappedToGPUBlocks(forallOp);
+  }
+  return false;
+}
+
+bool isMappedToGPUThreads(scf::ForOp forOp) { // true iff the loop's loop-to-SIMT string attribute equals one of the thread-id names
+  if (auto loopToSIMTAttr =
+          forOp->getAttrOfType<StringAttr>(getLoopToSIMTAttrName())) { // attribute missing or not a StringAttr => false
+    auto mappingTo = loopToSIMTAttr.getValue();
+    if (mappingTo == getThreadIdXName() || mappingTo == getThreadIdYName() ||
+        mappingTo == getThreadIdZName()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool isMappedToGPUThreads(scf::ForallOp forallOp) { // true iff at least one mapping entry is a gpu::GPUThreadMappingAttr
+  if (auto mapping = forallOp.getMappingAttr()) { // a forall without a mapping attribute is never thread-mapped
+    if (llvm::any_of(mapping.getValue(), [](Attribute attr) {
+          return isa<gpu::GPUThreadMappingAttr>(attr);
+        })) {
+      return true;
+    }
+  }
+  // no mapping attribute, or none of its entries targets GPU threads
+  return false;
+}
+
+bool isMappedToGPUThreads(Operation *op) { // dispatches to the scf.for / scf.forall overloads; null or any other op kind => false
+  if (auto forOp = llvm::dyn_cast_or_null<scf::ForOp>(op)) {
+    return isMappedToGPUThreads(forOp);
+  }
+  if (auto forallOp = llvm::dyn_cast_or_null<scf::ForallOp>(op)) {
+    return isMappedToGPUThreads(forallOp);
+  }
+  return false;
+}
+
+uint64_t getNumTiledLoops(ArrayRef<int64_t> tileSizes) { // number of strictly positive entries in tileSizes
+  return llvm::count_if(tileSizes,
+                        [](int64_t tileSize) { return tileSize > 0; }); // a tile size of 0 (or below) is not counted as a tiled loop
+}
+
+std::optional<int64_t> getReductionDim(linalg::GenericOp genericOp) { // index of the op's only reduction loop, else nullopt
+  SmallVector<unsigned> reductionDims;
+  genericOp.getReductionDims(reductionDims);
+  if (reductionDims.size() == 1) { // zero or several reduction loops are both reported as nullopt
+    return reductionDims[0];
+  }
+  return std::nullopt;
+}
+
+std::optional<int64_t> getOperandReductionDim(OpOperand &operand) { // position of the owner's reduction loop among this operand's indexing-map results
+  auto genericOp = llvm::dyn_cast<linalg::GenericOp>(operand.getOwner());
+  if (!genericOp) // only operands of linalg.generic are handled
+    return std::nullopt;
+  // The owner must have exactly one reduction loop.
+  auto dim = getReductionDim(genericOp);
+  if (!dim.has_value())
+    return std::nullopt;
+  // The operand's indexing map must be a projected permutation.
+  auto affineMap = genericOp.getIndexingMapsArray()[operand.getOperandNumber()];
+  if (!affineMap || !affineMap.isProjectedPermutation())
+    return std::nullopt;
+  // Return the index of the first map result that is exactly the reduction dim expression.
+  for (auto &&en : llvm::enumerate(affineMap.getResults())) {
+    if (auto dimExpr = en.value().dyn_cast<AffineDimExpr>()) {
+      if (dimExpr.getPosition() == *dim) {
+        return en.index();
+      }
+    }
+  }
+  // the reduction dim does not appear in this operand's map
+  return std::nullopt;
+}
+
+SmallVector<int64_t> getDynamicDims(linalg::GenericOp genericOp) { // loop indices whose static range is dynamic, in ascending order
+  auto staticLoopRanges = genericOp.getStaticLoopRanges();
+  SmallVector<int64_t> ret;
+  for (int64_t i = 0; i < staticLoopRanges.size(); ++i) {
+    if (ShapedType::isDynamic(staticLoopRanges[i])) {
+      ret.push_back(i); // stores the loop index, not the range value
+    }
+  }
+  return ret;
+}
+
+//----------------------------------------------------------------------------//
+// configuration structs
+//----------------------------------------------------------------------------//
+
+static constexpr StringLiteral kGridReduction = "__grid_reduction__"; // attribute name set by GridSplitConfig::apply and tested by isGridReductionOp
+static constexpr StringLiteral kBlockReduction = "__block_reduction__"; // attribute name set by BlockSplitConfig::apply and tested by isBlockReductionOp
+static constexpr StringLiteral kWarpReduction = "__warp_reduction__"; // NOTE(review): no use is visible in this part of the file
+static constexpr StringLiteral kThreadReduction = "__thread_reduction__"; // attribute name tested by isThreadReductionOp
+
+struct ProducerSelector { // names one producer of a consumer op: which operand it feeds and which op kind defines it
+  uint64_t operandNumber; // operand index on the consumer whose producer is selected
+  llvm::StringRef opName; // operation name of the producer (non-owning llvm::StringRef)
+  std::vector<ProducerSelector> producerSelectors; // selectors applied, in turn, to the selected producer's own operands
+
+  ProducerSelector(uint64_t operandNumber, llvm::StringRef opName)
+      : operandNumber(operandNumber), opName(opName) {}
+  // Appends a selector and returns true iff opOperand is defined by a linalg.fill.
+  static bool detectFillOperand(OpOperand *opOperand,
+                                std::vector<ProducerSelector> &selectors) {
+    if (opOperand->get().getDefiningOp<linalg::FillOp>()) {
+      selectors.emplace_back(opOperand->getOperandNumber(),
+                             linalg::FillOp::getOperationName());
+      return true;
+    }
+    return false;
+  }
+  // Appends a selector chain and returns true iff opOperand reaches a tensor.pad, possibly through expand/collapse_shape ops.
+  static bool detectPadOperand(OpOperand *opOperand,
+                               std::vector<ProducerSelector> &selectors) {
+    Operation *definingOp = opOperand->get().getDefiningOp();
+    if (!definingOp) // no defining op (getDefiningOp() returned null): nothing to select
+      return false;
+
+    if (llvm::isa<tensor::ExpandShapeOp, tensor::CollapseShapeOp>(definingOp)) {
+      ProducerSelector selector(opOperand->getOperandNumber(),
+                                definingOp->getName().getStringRef());
+      if (detectPadOperand(&definingOp->getOpOperand(0),
+                           selector.producerSelectors)) { // look through the reshape via its operand 0
+        selectors.emplace_back(std::move(selector)); // the reshape is recorded only when a pad was found behind it
+        return true;
+      }
+    } else if (llvm::isa<tensor::PadOp>(definingOp)) {
+      selectors.emplace_back(opOperand->getOperandNumber(),
+                             tensor::PadOp::getOperationName());
+      return true;
+    }
+    return false;
+  }
+};
+
+struct GridSplitConfig { // parameters for splitting a grid-level reduction
+  int64_t splitFactor; // forwarded as splitFactor to transform.split_reduction; 0 makes apply() only annotate the target
+  int64_t dimension; // forwarded as insertSplitDimension
+
+  void apply(ImplicitLocOpBuilder &b, Value pdlV); // emits the transform ops on target handle pdlV
+};
+
+struct GridTileConfig { // parameters for tiling a grid-level reduction onto GPU blocks
+  SmallVector<int64_t> tileSizes; // one entry per loop; entries > 0 count as tiled loops
+  SmallVector<gpu::Blocks> mapping; // block dimension assigned to each tiled loop
+  std::vector<ProducerSelector> fuseCandidates; // producers to fuse; consumed only by the forall path of apply()
+
+  void apply(ImplicitLocOpBuilder &b, Value pdlV, bool usingForall); // usingForall selects scf.forall tiling over scf.for tiling
+};
+
+struct BlockSplitConfig { // parameters for padding and repeatedly splitting a block-level reduction
+  SmallVector<int64_t> splitFactors; // one split_reduction is emitted per entry, paired with dimensions
+  SmallVector<int64_t> dimensions; // insertSplitDimension for the split at the same index
+  SmallVector<int64_t> padDims; // forwarded as padding_dimensions; empty disables padding
+  SmallVector<Attribute> padValues; // forwarded as padding_values
+
+  void apply(ImplicitLocOpBuilder &b, Value pdlV); // emits the transform ops on target handle pdlV
+};
+
+struct BlockTileConfig { // parameters for tiling a block-level reduction onto GPU threads
+  SmallVector<int64_t> tileSizes; // one entry per loop; entries > 0 count as tiled loops
+  SmallVector<gpu::Threads> mapping; // thread dimension assigned to each tiled loop
+  std::vector<ProducerSelector> fuseCandidates; // producers to fuse; consumed only by the forall path of apply()
+
+  void apply(ImplicitLocOpBuilder &b, Value pdlV, bool usingForall); // usingForall selects scf.forall tiling over scf.for tiling
+};
+
+struct ThreadTileConfig { // parameters for tiling and unrolling a thread-level reduction
+  SmallVector<int64_t> parallelTileSizes; // tile_sizes for the transform.fuse step over parallel loops
+  SmallVector<int64_t> reductionTileSizes; // tile sizes for the transform.tile step over the reduction loop
+  SmallVector<int64_t> unrollFactors; // zipped with the produced loops and passed to transform.loop_unroll
+  std::vector<ProducerSelector> initOperands; // NOTE(review): not read by apply() in the visible code
+
+  void apply(ImplicitLocOpBuilder &b, Value pdlV); // emits the transform ops on target handle pdlV
+};
+
+void processProducerSelectors( // emits one GetProducerOfOperand per selector (depth-first) and appends each producer handle to `selected`
+    ImplicitLocOpBuilder &b,
+    const std::vector<ProducerSelector> &producerSelectors, Value fuseInto,
+    SmallVector<Value> &selected, Type producerType = nullptr) { // producerType, when set, overrides the handle type at this level only
+  for (const auto &selector : producerSelectors) { // by reference: a copy would duplicate the nested selector vector each iteration
+    auto producer = b.create<transform::GetProducerOfOperand>(
+        /* producer type */ producerType
+            ? producerType
+            : transform::OperationType::get(b.getContext(), selector.opName),
+        /* target */ fuseInto,
+        /* operand number */ selector.operandNumber);
+    selected.push_back(producer.getProducer());
+    processProducerSelectors(b, selector.producerSelectors, selected.back(),
+                             selected); // nested selectors target the producer just found; producerType is not forwarded
+  }
+}
+
+void tileToForallAndFuseImpl( // tiles `toTile` to an scf.forall and fuses the selected producers into it
+    ImplicitLocOpBuilder &b, Value toTile,
+    const SmallVector<int64_t> &tileSizes,
+    const SmallVector<Attribute> &mapping,
+    const std::vector<ProducerSelector> &fuseCandidates) {
+  SmallVector<Value> toBeFused;
+  processProducerSelectors(b, fuseCandidates, toTile, toBeFused); // producer handles are taken from `toTile` before the tiling op is created
+
+  auto tileOp = b.create<transform::TileToForallOp>(
+      /* target */ toTile,
+      /* staticTileSizes */ tileSizes,
+      /* ctor tag */ transform::TileSizesSpec(),
+      /* mapping */ b.getArrayAttr(mapping));
+  for (auto &&producerOp : toBeFused) { // fused in the order the selectors produced them
+    b.create<transform::FuseIntoContainingOp>(
+        /* producerOp */ producerOp,
+        /* containingOp */ tileOp.getForallOp());
+  }
+}
+
+void tileToSCFForAndFuseImpl(ImplicitLocOpBuilder &b, Value toTile, // tiles via transform.fuse and tags each produced loop with its mapping
+                             const SmallVector<int64_t> &tileSizes,
+                             const SmallVector<Attribute> &mapping) {
+  auto pdlType = pdl::OperationType::get(b.getContext());
+  auto fuseOp = b.create<transform::FuseOp>(
+      /* transformed */ pdlType,
+      /* loops */
+      SmallVector<Type>(getNumTiledLoops(tileSizes), pdlType), // one loop result per tile size > 0
+      /* target */ toTile,
+      /* tile_sizes */ b.getI64ArrayAttr(tileSizes),
+      /* tile_interchange */ ArrayAttr());
+  for (auto &&[loop, mapTo] : llvm::zip(fuseOp.getLoops(), mapping)) { // pairs the i-th loop with the i-th mapping attribute
+    Value paramV = b.create<transform::ParamConstantOp>(
+        /* type */ pdl::AttributeType::get(b.getContext()),
+        /* value */ mapTo);
+    b.create<transform::AnnotateOp>(
+        /* target */ loop,
+        /* name */ getLoopToSIMTAttrName(), // the attribute name read back by the scf::ForOp isMappedToGPU* overloads
+        /* param */ paramV);
+  }
+}
+
+void GridSplitConfig::apply(ImplicitLocOpBuilder &b, Value pdlV) { // splits the reduction (if splitFactor != 0) and tags results with kGridReduction
+  if (splitFactor) {
+    auto splitted = b.create<transform::SplitReductionOp>(
+        /* target */ pdlV,
+        /* splitFactor */ splitFactor,
+        /* insertSplitDimension */ dimension,
+        /* innerParallel */ false,
+        /* useScalingAlgorithm */ false,
+        /* useAlloc */ false);
+    b.create<transform::AnnotateOp>( // tag the split op
+        /* target */ splitted.getSplitLinalgOp(),
+        /* name */ kGridReduction,
+        /* param */ Value());
+    b.create<transform::AnnotateOp>( // tag the combining op as well
+        /* target */ splitted.getCombiningLinalgOp(),
+        /* name */ kGridReduction,
+        /* param */ Value());
+  } else { // splitFactor == 0: no split, only tag the original target
+    b.create<transform::AnnotateOp>(
+        /* target */ pdlV,
+        /* name */ kGridReduction,
+        /* param */ Value());
+  }
+}
+
+void GridTileConfig::apply(ImplicitLocOpBuilder &b, Value pdlV,
+                           bool usingForall) { // tiles pdlV onto GPU blocks, via scf.forall or annotated scf.for loops
+  if (usingForall) { // forall path: mapping entries become gpu::GPUBlockMappingAttr and producers are fused
+    auto mappingAttrs = llvm::to_vector(
+        llvm::map_range(mapping, [&](gpu::Blocks dim) -> Attribute {
+          return gpu::GPUBlockMappingAttr::get(b.getContext(), dim);
+        }));
+    tileToForallAndFuseImpl(b, pdlV, tileSizes, mappingAttrs, fuseCandidates);
+  } else { // scf.for path: mapping entries become block-id name strings; fuseCandidates is not used
+    static constexpr std::array<StringRef, 3> mappings{
+        getBlockIdXName(), getBlockIdYName(), getBlockIdZName()};
+    auto mappingAttrs = llvm::to_vector(
+        llvm::map_range(mapping, [&](gpu::Blocks dim) -> Attribute {
+          return b.getStringAttr(mappings[static_cast<int64_t>(dim)]); // enum value indexes the 3-entry name table
+        }));
+    tileToSCFForAndFuseImpl(b, pdlV, tileSizes, mappingAttrs);
+  }
+}
+
+void BlockSplitConfig::apply(ImplicitLocOpBuilder &b, Value pdlV) { // pads, splits and tags a block-level reduction, then retargets tensor.empty ops to workgroup memory
+  if (!padDims.empty()) { // step 1: pad; later steps operate on the padded op handle
+    auto padOp = b.create<transform::PadOp>(
+        TypeRange{pdlV.getType(), pdlV.getType()}, pdlV,
+        /*padding_values=*/b.getArrayAttr(padValues),
+        /*padding_dimensions=*/
+        b.getI64ArrayAttr(padDims),
+        /*padToMultipleOf=*/ArrayAttr{},
+        /*pack_paddings=*/ArrayAttr{},
+        /*transpose_paddings=*/ArrayAttr{},
+        /*copyBack=*/false);
+    pdlV = padOp.getPadded();
+  }
+  if (!splitFactors.empty()) { // step 2: one split_reduction per (splitFactor, dimension) pair
+    Value toSplit = pdlV;
+    for (auto &&[splitFactor, redDim] : llvm::zip(splitFactors, dimensions)) {
+      auto splitted = b.create<transform::SplitReductionOp>(
+          /* target */ toSplit,
+          /* splitFactor */ splitFactor,
+          /* insertSplitDimension */ redDim,
+          /* innerParallel */ false,
+          /* useScalingAlgorithm */ false,
+          /* useAlloc */ false);
+      b.create<transform::AnnotateOp>( // tags the init/alloc op and the combining op; the split op itself is not tagged here
+          /* target */ splitted.getInitOrAllocOp(),
+          /* name */ kBlockReduction,
+          /* param */ Value());
+      b.create<transform::AnnotateOp>(
+          /* target */ splitted.getCombiningLinalgOp(),
+          /* name */ kBlockReduction,
+          /* param */ Value());
+      toSplit = splitted.getCombiningLinalgOp(); // the next split is applied to this combining op
+    }
+    pdlV = toSplit;
+  } else { // no split requested: only tag the (possibly padded) target
+    b.create<transform::AnnotateOp>(
+        /* target */ pdlV,
+        /* name */ kBlockReduction,
+        /* param */ Value());
+  }
+  auto func = b.create<transform::GetParentOp>( // step 3: run canonicalization patterns on the enclosing func.func
+      pdlV.getType(), pdlV,
+      /* isolated_from_above */ true,
+      /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()),
+      /* deduplicate */ false);
+  b.create<transform::ApplyPatternsOp>(func, [](OpBuilder &b, Location loc) {
+    b.create<transform::ApplyCanonicalizationPatternsOp>(loc);
+  });
+  auto forall = b.create<transform::GetParentOp>( // step 4: handle to the scf.forall parent of the current target
+      pdlV.getType(), pdlV,
+      /* isolated_from_above */ false,
+      /* op_name */ b.getStringAttr(scf::ForallOp::getOperationName()),
+      /* deduplicate */ false);
+  if (!padDims.empty()) { // only when padding was applied: rewrite parallel_insert_slice ops found under that forall
+    auto parallelInsertSliceType = transform::OperationType::get(
+        b.getContext(), tensor::ParallelInsertSliceOp::getOperationName());
+    auto parallelInsertSlice = b.create<transform::MatchOp>(
+        parallelInsertSliceType, forall,
+        tensor::ParallelInsertSliceOp::getOperationName());
+    b.create<transform::InsertSliceToCopyExtOp>(pdlV.getType(),
+                                                parallelInsertSlice);
+  }
+  auto emptyTensorType = transform::OperationType::get( // step 5: tensor.empty ops under the forall become alloc_tensor ops
+      b.getContext(), tensor::EmptyOp::getOperationName());
+  auto emptyTensor = b.create<transform::MatchOp>(
+      emptyTensorType, forall, tensor::EmptyOp::getOperationName());
+  auto allocTensorType = transform::OperationType::get(
+      b.getContext(), bufferization::AllocTensorOp::getOperationName());
+  auto allocTensor = b.create<transform::EmptyTensorToAllocTensorOp>(
+      allocTensorType, emptyTensor);
+  auto memorySpaceAttrName =
+      bufferization::AllocTensorOp::getMemorySpaceAttrName(OperationName(
+          bufferization::AllocTensorOp::getOperationName(), b.getContext()));
+  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get( // step 6: set their memory-space attribute to the GPU workgroup address space
+      b.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
+  Value paramV = b.create<transform::ParamConstantOp>(
+      /* type */ pdl::AttributeType::get(b.getContext()),
+      /* value */ workgroupMemoryAddressSpace);
+  b.create<transform::AnnotateOp>(
+      /* target */ allocTensor,
+      /* name */ memorySpaceAttrName,
+      /* param */ paramV);
+}
+
+void BlockTileConfig::apply(ImplicitLocOpBuilder &b, Value pdlV,
+                            bool usingForall) { // tiles pdlV onto GPU threads, via scf.forall or annotated scf.for loops
+  if (usingForall) { // forall path: mapping entries become gpu::GPUThreadMappingAttr and producers are fused
+    auto mappingAttrs = llvm::to_vector(
+        llvm::map_range(mapping, [&](gpu::Threads dim) -> Attribute {
+          return gpu::GPUThreadMappingAttr::get(b.getContext(), dim);
+        }));
+    tileToForallAndFuseImpl(b, pdlV, tileSizes, mappingAttrs, fuseCandidates);
+  } else { // scf.for path: mapping entries become thread-id name strings; fuseCandidates is not used
+    static constexpr std::array<StringRef, 3> mappings{
+        getThreadIdXName(), getThreadIdYName(), getThreadIdZName()};
+    auto mappingAttrs = llvm::to_vector(
+        llvm::map_range(mapping, [&](gpu::Threads dim) -> Attribute {
+          return b.getStringAttr(mappings[static_cast<int64_t>(dim)]); // enum value indexes the 3-entry name table
+        }));
+    tileToSCFForAndFuseImpl(b, pdlV, tileSizes, mappingAttrs);
+  }
+}
+
+void ThreadTileConfig::apply(ImplicitLocOpBuilder &b, Value pdlV) { // tiles parallel loops, then the reduction loop, then unrolls the produced loops
+  auto pdlType = pdl::OperationType::get(b.getContext());
+  auto numTiledParallelLoops = getNumTiledLoops(parallelTileSizes);
+  SmallVector<Value> loops;
+  if (numTiledParallelLoops > 0) { // the fuse step is skipped when no parallel loop has a positive tile size
+    auto fuseOp = b.create<transform::FuseOp>(
+        /* transformed */ pdlType,
+        /* loops */
+        SmallVector<Type>(getNumTiledLoops(parallelTileSizes), pdlType),
+        /* target */ pdlV,
+        /* tile_sizes */ b.getI64ArrayAttr(parallelTileSizes),
+        /* tile_interchange */ ArrayAttr());
+    loops = fuseOp.getLoops();
+    pdlV = fuseOp.getTransformed(); // continue with the tiled op
+  }
+  // Tile the reduction loop; only its first resulting loop is kept for unrolling.
+  auto tileOp = b.create<transform::TileOp>(
+      /* target */ pdlV,
+      /* tileSizes */ reductionTileSizes);
+  loops.push_back(tileOp.getLoops()[0]);
+  for (auto &&[loop, factor] : llvm::reverse(llvm::zip(loops, unrollFactors))) { // NOTE(review): pairs loops[i] with unrollFactors[i] positionally; confirm the orders agree when the reduction loop is not last
+    b.create<transform::LoopUnrollOp>(loop, factor);
+  }
+}
+
+//----------------------------------------------------------------------------//
+// codegen strategies
+//----------------------------------------------------------------------------//
+
+bool isReductionOp(linalg::GenericOp genericOp) { // true iff the op has exactly one reduction loop and only projected-permutation indexing maps
+  if (genericOp.getNumReductionLoops() != 1)
+    return false;
+  // every indexing map must be a projected permutation (zero results not allowed)
+  if (!llvm::all_of(genericOp.getIndexingMapsArray(), [](AffineMap affineMap) {
+        return affineMap.isProjectedPermutation(/* allowZeroInResults */ false);
+      }))
+    return false;
+
+  return true;
+}
+
+bool isGridReductionOp(linalg::GenericOp genericOp) { // reduction op that is tagged kGridReduction or sits inside a func.func
+  if (!isReductionOp(genericOp))
+    return false;
+
+  // early return for manual tag
+  if (genericOp->hasAttr(kGridReduction))
+    return true;
+
+  // top level generic op in function
+  if (genericOp->getParentOfType<func::FuncOp>()) // NOTE(review): getParentOfType accepts any enclosing func, not only a direct parent - confirm against the comment above
+    return true;
+
+  return false;
+}
+
+bool isBlockReductionOp(linalg::GenericOp genericOp) { // reduction op that is tagged kBlockReduction or whose direct parent is block-mapped
+  if (!isReductionOp(genericOp))
+    return false;
+
+  // early return for manual tag
+  if (genericOp->hasAttr(kBlockReduction))
+    return true;
+
+  // nested in op which is mapped to GPU blocks
+  if (isMappedToGPUBlocks(genericOp->getParentOp())) // only the immediate parent op is inspected
+    return true;
+
+  return false;
+}
+
+bool isThreadReductionOp(linalg::GenericOp genericOp) { // reduction op that is tagged kThreadReduction or whose direct parent is thread-mapped
+  if (!isReductionOp(genericOp))
+    return false;
+
+  // early return for manual tag
+  if (genericOp->hasAttr(kThreadReduction))
+    return true;
+
+  // nested in op which is mapped to GPU threads
+  if (isMappedToGPUThreads(genericOp->getParentOp())) // only the immediate parent op is inspected
+    return true;
+
+  return false;
+}
+
+std::optional<GridSplitConfig> getGridSplitConfig(linalg::GenericOp genericOp, // split config for a grid-level reduction, or nullopt if it should not be split
+                                                  int64_t splitFactor) {
+  if (splitFactor <= 0 || !isGridReductionOp(genericOp)) // a zero factor would make the `%` below undefined; non-positive factors are rejected
+    return std::nullopt;
+  // isGridReductionOp guarantees exactly one reduction loop, so the dereference below is safe.
+  auto redDim = *getReductionDim(genericOp);
+  auto staticLoopRanges = genericOp.getStaticLoopRanges();
+  if (ShapedType::isDynamic(staticLoopRanges[redDim]) || // dynamic reduction size: no split
+      staticLoopRanges[redDim] % splitFactor != 0 || // reduction size must be a multiple of splitFactor
+      staticLoopRanges[redDim] <= 1024) // reductions of at most 1024 elements are not split
+    return std::nullopt;
+  // The split dimension is inserted just before the reduction loop (or at 0 when it is already first).
+  return GridSplitConfig{splitFactor, redDim ? redDim - 1 : redDim};
+}
+
+std::optional<GridTileConfig> getGridTileConfig(linalg::GenericOp genericOp, // tile sizes and block mapping for a grid-level reduction, or nullopt
+                                                int64_t warpSize,
+                                                int64_t blockSize) { // blockSize is not read in this function
+  if (!isGridReductionOp(genericOp))
+    return std::nullopt;
+
+  int64_t numLoops = genericOp.getNumLoops();
+  SmallVector<int64_t> tileSizes(numLoops, 1); // default: every loop tiled by 1
+  auto loopSizes =
+      cast<linalg::LinalgOp>(genericOp.getOperation()).computeStaticLoopSizes();
+  // For the first permutation map whose last result dim is larger than warpSize, scale that dim's tile size by warpSize.
+  for (auto &&affineMap : genericOp.getIndexingMapsArray()) {
+    if (affineMap.isPermutation()) {
+      auto dim = affineMap.getDimPosition(numLoops - 1);
+      if (loopSizes[dim] > warpSize) { // TODO: padding
+        tileSizes[dim] *= warpSize;
+        break;
+      }
+    }
+  }
+  // The reduction loop is left untiled (tile size 0).
+  auto redDim = getReductionDim(genericOp).value();
+  tileSizes[redDim] = 0;
+
+  std::vector<ProducerSelector> fuseCandidates;
+  for (OpOperand *opOperand : genericOp.getDpsInitOperands()) { // linalg.fill producers of init operands are fuse candidates
+    ProducerSelector::detectFillOperand(opOperand, fuseCandidates);
+  }
+
+  auto numTiledLoops = getNumTiledLoops(tileSizes);
+  if (numTiledLoops >= 1 && numTiledLoops <= 3) { // only 1 to 3 tiled loops are mapped
+    SmallVector<int64_t> mapping(numLoops, -1); // indexed by loop; -1 marks an unmapped loop
+    int64_t dimMapping = static_cast<int64_t>(gpu::Blocks::DimX);
+    for (auto &&affineMap : genericOp.getIndexingMapsArray()) {
+      if (affineMap.isPermutation()) { // only the first permutation map is used (note the break below)
+        for (int64_t i = numLoops - 1; i >= 0; i--) { // walk map results from last to first, assigning DimX first
+          auto dim = affineMap.getDimPosition(i);
+          if (tileSizes[dim] > 0) {
+            mapping[dim] = dimMapping++;
+          }
+        }
+        break;
+      }
+    }
+    mapping.erase(std::remove(mapping.begin(), mapping.end(), -1),
+                  mapping.end()); // drop unmapped loops, keeping loop order
+    if (mapping.size() != numTiledLoops) // e.g. no permutation map was found, so nothing was mapped
+      return std::nullopt;
+
+    return GridTileConfig{
+        tileSizes,
+        llvm::to_vector(llvm::map_range(
+            mapping, [](int64_t i) { return static_cast<gpu::Blocks>(i); })),
+        fuseCandidates};
+  }
+  return std::nullopt;
+}
+
+std::optional<BlockSplitConfig> getBlockSplitConfig(linalg::GenericOp genericOp, // pad/split plan for a block-level reduction, or nullopt
+                                                    int64_t splitFactor,
+                                                    int64_t warpSize) {
+  if (splitFactor <= 0 || !isBlockReductionOp(genericOp)) // a zero factor would make the `%` operations below undefined; non-positive factors are rejected
+    return std::nullopt;
+
+  SmallVector<int64_t> padDims = getDynamicDims(genericOp); // every dynamic loop is a padding dimension
+  SmallVector<Attribute> padValues;
+
+  SmallVector<int64_t> splitFactors;
+  SmallVector<int64_t> dimensions;
+  auto redDim = *getReductionDim(genericOp); // safe: isBlockReductionOp guarantees exactly one reduction loop
+  auto staticLoopRanges = genericOp.getStaticLoopRanges();
+  if (ShapedType::isDynamic(staticLoopRanges[redDim])) // a dynamic reduction size cannot be split
+    return std::nullopt;
+  // If the reduction dim sits at position numLoops - 1 of the first input's map, double the factor while it still divides the size.
+  if (auto redPos = getOperandReductionDim(*genericOp.getDpsInputOperand(0))) {
+    if (redPos.value() == genericOp.getNumLoops() - 1) {
+      auto newSplitFactor = splitFactor * 2;
+      while (staticLoopRanges[redDim] % newSplitFactor == 0 &&
+             newSplitFactor <= splitFactor * warpSize) { // growth is capped at splitFactor * warpSize
+        newSplitFactor *= 2;
+      }
+      splitFactor = newSplitFactor / 2; // last candidate that passed the loop condition (or the original factor)
+    }
+  }
+  // A reduction smaller than the factor gets no leading split; otherwise the size must be a multiple of the factor.
+  if (staticLoopRanges[redDim] < splitFactor) {
+    splitFactor = staticLoopRanges[redDim];
+  } else {
+    if (staticLoopRanges[redDim] % splitFactor != 0)
+      return std::nullopt;
+
+    splitFactors.push_back(splitFactor);
+    dimensions.push_back(redDim ? redDim - 1 : redDim); // split dim goes just before the reduction loop, or at 0
+  }
+  // One zero padding value per operand; an operand that is not a ShapedType aborts the plan.
+  mlir::Builder b(genericOp.getContext());
+  for (auto &&operand : genericOp->getOperands()) {
+    if (auto shapedType = llvm::dyn_cast<ShapedType>(operand.getType())) {
+      padValues.push_back(b.getZeroAttr(shapedType.getElementType()));
+    } else {
+      return std::nullopt;
+    }
+  }
+  // Append successively halved factors (splitFactor / 2, / 4, ...) while splitFactor > 2.
+  for (; splitFactor > 2; splitFactor >>= 1) {
+    splitFactors.push_back(splitFactor / 2);
+    dimensions.push_back(redDim ? redDim - 1 : redDim);
+  }
+
+  return BlockSplitConfig{splitFactors, dimensions, padDims, padValues};
+}
+
+std::optional<BlockTileConfig> getBlockTileConfig(linalg::GenericOp genericOp, // tile sizes and thread mapping for a block-level reduction, or nullopt
+                                                  int64_t warpSize, // not read in this function
+                                                  int64_t blockSize) {
+  if (!isBlockReductionOp(genericOp))
+    return std::nullopt;
+
+  int64_t numLoops = genericOp.getNumLoops();
+  SmallVector<int64_t> tileSizes(numLoops, 0); // default: no loop tiled
+  auto loopSizes =
+      cast<linalg::LinalgOp>(genericOp.getOperation()).computeStaticLoopSizes();
+  // Distribute the thread budget over the non-reduction loops, in loop order, until it is used up.
+  int64_t remainBlockSize = blockSize;
+  auto redDim = getReductionDim(genericOp).value();
+  for (int64_t idx = 0; idx < numLoops && remainBlockSize > 1; ++idx) {
+    if (idx == redDim)
+      continue;
+    int64_t curLoopSize2 = nextPowerOf2(loopSizes[idx]); // loop size rounded up to a power of two
+    int64_t curBlockSize = std::min(curLoopSize2, remainBlockSize); // threads given to this loop
+    tileSizes[idx] = curLoopSize2 / curBlockSize; // elements each of those threads covers
+    remainBlockSize /= curBlockSize;
+  }
+  // If no thread budget was consumed above, tile the reduction loop with its full size instead.
+  if (remainBlockSize == blockSize) {
+    tileSizes[redDim] = loopSizes[redDim];
+  }
+
+  std::vector<ProducerSelector> fuseCandidates;
+  for (OpOperand *opOperand : genericOp.getDpsInputOperands()) { // tensor.pad producers (possibly behind reshapes) of inputs
+    ProducerSelector::detectPadOperand(opOperand, fuseCandidates);
+  }
+  for (OpOperand *opOperand : genericOp.getDpsInitOperands()) { // linalg.fill producers of init operands
+    ProducerSelector::detectFillOperand(opOperand, fuseCandidates);
+  }
+
+  auto numTiledLoops = getNumTiledLoops(tileSizes);
+  if (numTiledLoops >= 1 && numTiledLoops <= 3) { // only 1 to 3 tiled loops are mapped
+    SmallVector<int64_t> mapping(numLoops, -1); // indexed by loop; -1 marks an unmapped loop
+    int64_t dimMapping = static_cast<int64_t>(gpu::Threads::DimX);
+    for (auto &&affineMap : genericOp.getIndexingMapsArray()) {
+      if (affineMap.isPermutation()) { // only the first permutation map is used (note the break below)
+        for (int64_t i = numLoops - 1; i >= 0; i--) { // walk map results from last to first, assigning DimX first
+          auto dim = affineMap.getDimPosition(i);
+          if (tileSizes[dim] > 0) {
+            mapping[dim] = dimMapping++;
+          }
+        }
+        break;
+      }
+    }
+    mapping.erase(std::remove(mapping.begin(), mapping.end(), -1),
+                  mapping.end()); // drop unmapped loops, keeping loop order
+    if (mapping.size() != numTiledLoops) // e.g. no permutation map was found, so nothing was mapped
+      return std::nullopt;
+
+    return BlockTileConfig{
+        tileSizes,
+        llvm::to_vector(llvm::map_range(
+            mapping, [](int64_t i) { return static_cast<gpu::Threads>(i); })),
+        fuseCandidates};
+  }
+  return std::nullopt;
+}
+
+/// Thread-level config: parallel tile sizes are 1 except 0 on the reduction
+/// loop, reduction tile sizes are 0 except 1 there; nullopt if not applicable.
+std::optional<ThreadTileConfig>
+getThreadTileConfig(linalg::GenericOp genericOp) {
+  if (!isThreadReductionOp(genericOp))
+    return std::nullopt;
+
+  const int64_t loopCount = genericOp.getNumLoops();
+  const auto redDim = *getReductionDim(genericOp);
+
+  SmallVector<int64_t> parTiles(loopCount, 1);
+  SmallVector<int64_t> redTiles(loopCount, 0);
+  parTiles[redDim] = 0;
+  redTiles[redDim] = 1;
+
+  auto linalgOp = cast<linalg::LinalgOp>(genericOp.getOperation());
+  SmallVector<int64_t> loopSizes = linalgOp.computeStaticLoopSizes();
+
+  std::vector<ProducerSelector> fillProducers;
+  for (OpOperand *init : genericOp.getDpsInitOperands())
+    ProducerSelector::detectFillOperand(init, fillProducers);
+  // The static loop sizes are passed as the unroll factors.
+  return ThreadTileConfig{parTiles, redTiles, loopSizes, fillProducers};
+}
+
+//----------------------------------------------------------------------------//
+// transform insertion impl
+//----------------------------------------------------------------------------//
+
+/// Adds a transform-insertion pass: the filter accepts linalg.generic ops with
+/// a grid split config, and the builder applies that config to the handle.
+void createGPUSplitGridReductionTransformImpl(OpPassManager &pm,
+                                              const std::string &anchor,
+                                              const std::string &prefix,
+                                              int64_t splitFactor) {
+  TransformInsertionConfig insertionConfig;
+  insertionConfig.funcAnchor = anchor;
+  insertionConfig.matchPrefix = prefix;
+  insertionConfig.opFilter = [=](Operation *candidate) {
+    auto generic = llvm::dyn_cast_or_null<linalg::GenericOp>(candidate);
+    return generic && getGridSplitConfig(generic, splitFactor).has_value();
+  };
+
+  insertionConfig.transformBuilder = [=](ImplicitLocOpBuilder &builder,
+                                         Operation *matched, Value handle) {
+    // Assumes the pass only calls this for ops accepted by opFilter.
+    auto generic = llvm::cast<linalg::GenericOp>(matched);
+    auto gridSplit = getGridSplitConfig(generic, splitFactor).value();
+    gridSplit.apply(builder, handle);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(insertionConfig));
+}
+
+/// Adds a transform-insertion pass: the filter accepts linalg.generic ops with
+/// a grid tile config, and the builder applies that config to the handle.
+void createGPUTileGridReductionTransformImpl(
+    OpPassManager &pm, const std::string &anchor, const std::string &prefix,
+    int64_t warpSize, int64_t blockSize, bool usingForall) {
+  TransformInsertionConfig insertionConfig;
+  insertionConfig.funcAnchor = anchor;
+  insertionConfig.matchPrefix = prefix;
+  insertionConfig.opFilter = [=](Operation *candidate) {
+    auto generic = llvm::dyn_cast_or_null<linalg::GenericOp>(candidate);
+    return generic &&
+           getGridTileConfig(generic, warpSize, blockSize).has_value();
+  };
+
+  insertionConfig.transformBuilder = [=](ImplicitLocOpBuilder &builder,
+                                         Operation *matched, Value handle) {
+    auto generic = llvm::cast<linalg::GenericOp>(matched);
+    auto gridTile = getGridTileConfig(generic, warpSize, blockSize).value();
+    gridTile.apply(builder, handle, usingForall);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(insertionConfig));
+}
+
+/// Adds a transform-insertion pass: the filter accepts linalg.generic ops with
+/// a block split config, and the builder applies that config to the handle.
+void createGPUSplitBlockReductionTransformImpl(OpPassManager &pm,
+                                               const std::string &anchor,
+                                               const std::string &prefix,
+                                               int64_t splitFactor,
+                                               int64_t warpSize) {
+  TransformInsertionConfig insertionConfig;
+  insertionConfig.funcAnchor = anchor;
+  insertionConfig.matchPrefix = prefix;
+  insertionConfig.opFilter = [=](Operation *candidate) {
+    auto generic = llvm::dyn_cast_or_null<linalg::GenericOp>(candidate);
+    return generic &&
+           getBlockSplitConfig(generic, splitFactor, warpSize).has_value();
+  };
+
+  insertionConfig.transformBuilder = [=](ImplicitLocOpBuilder &builder,
+                                         Operation *matched, Value handle) {
+    auto generic = llvm::cast<linalg::GenericOp>(matched);
+    auto maybeSplit = getBlockSplitConfig(generic, splitFactor, warpSize);
+    maybeSplit.value().apply(builder, handle);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(insertionConfig));
+}
+
+/// Tiles block-level reductions: linalg.generic ops via getBlockTileConfig, and
+/// single-loop linalg.copy ops by mapping them onto `blockSize` linear threads.
+void createGPUTileBlockReductionTransformImpl(
+    OpPassManager &pm, const std::string &anchor, const std::string &prefix,
+    int64_t warpSize, int64_t blockSize, bool usingForall) {
+  TransformInsertionConfig config;
+  config.funcAnchor = anchor;
+  config.matchPrefix = prefix;
+  config.opFilter = [=](Operation *op) {
+    if (auto genericOp = llvm::dyn_cast_or_null<linalg::GenericOp>(op))
+      return getBlockTileConfig(genericOp, warpSize, blockSize).has_value();
+    if (auto copyOp = llvm::dyn_cast_or_null<linalg::CopyOp>(op))
+      return copyOp.getNumLoops() == 1;
+    return false;
+  };
+
+  config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op,
+                                Value pdlV) {
+    if (auto genericOp = llvm::dyn_cast_or_null<linalg::GenericOp>(op)) {
+      // Assumes opFilter accepted this op, so the optional holds a value.
+      auto tileConfig = getBlockTileConfig(genericOp, warpSize, blockSize);
+      tileConfig.value().apply(b, pdlV, usingForall);
+    } else if (llvm::isa_and_nonnull<linalg::CopyOp>(op)) {
+      // With the NumThreadsSpec tag the sizes are thread counts, not tiles.
+      b.create<transform::TileToForallOp>(
+          /* target */ pdlV,
+          /* staticNumThreads */ SmallVector<int64_t>(1, blockSize),
+          /* ctor tag */ transform::NumThreadsSpec(),
+          /* mapping */ b.getArrayAttr(gpu::GPULinearIdMappingAttr::get(
+              b.getContext(), gpu::LinearId::DimX)));
+    }
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(config));
+}
+
+/// Adds a transform-insertion pass: the filter accepts linalg.generic ops with
+/// a thread tile config, and the builder applies that config to the handle.
+void createGPUTileThreadReductionTransformImpl(OpPassManager &pm,
+                                               const std::string &anchor,
+                                               const std::string &prefix) {
+  TransformInsertionConfig insertionConfig;
+  insertionConfig.funcAnchor = anchor;
+  insertionConfig.matchPrefix = prefix;
+  insertionConfig.opFilter = [=](Operation *candidate) {
+    auto generic = llvm::dyn_cast_or_null<linalg::GenericOp>(candidate);
+    return generic && getThreadTileConfig(generic).has_value();
+  };
+
+  insertionConfig.transformBuilder = [=](ImplicitLocOpBuilder &builder,
+                                         Operation *matched, Value handle) {
+    auto generic = llvm::cast<linalg::GenericOp>(matched);
+    auto threadTile = getThreadTileConfig(generic).value();
+    threadTile.apply(builder, handle);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(insertionConfig));
+}
+} // namespace
+// Public entry points: each forwards its option fields to the matching Impl.
+void mlir::createGPUSplitGridReductionTransform(
+    OpPassManager &pm, const GPUSplitGridReductionOptions &options) {
+  invokeOpPassPipelineBuilder(createGPUSplitGridReductionTransformImpl, pm,
+                              options.funcAnchor, options.annotatePrefix,
+                              options.splitFactor);
+}
+
+void mlir::createGPUTileGridReductionTransform(
+    OpPassManager &pm, const GPUTileGridReductionOptions &options) {
+  invokeOpPassPipelineBuilder(createGPUTileGridReductionTransformImpl, pm,
+                              options.funcAnchor, options.annotatePrefix,
+                              options.warpSize, options.blockSize,
+                              options.usingForall);
+}
+
+void mlir::createGPUSplitBlockReductionTransform(
+    OpPassManager &pm, const GPUSplitBlockReductionOptions &options) {
+  invokeOpPassPipelineBuilder(createGPUSplitBlockReductionTransformImpl, pm,
+                              options.funcAnchor, options.annotatePrefix,
+                              options.splitFactor, options.warpSize);
+}
+
+void mlir::createGPUTileBlockReductionTransform(
+    OpPassManager &pm, const GPUTileBlockReductionOptions &options) {
+  invokeOpPassPipelineBuilder(createGPUTileBlockReductionTransformImpl, pm,
+                              options.funcAnchor, options.annotatePrefix,
+                              options.warpSize, options.blockSize,
+                              options.usingForall);
+}
+
+void mlir::createGPUTileThreadReductionTransform(
+    OpPassManager &pm, const GPUTileThreadReductionOptions &options) {
+  invokeOpPassPipelineBuilder(createGPUTileThreadReductionTransformImpl, pm,
+                              options.funcAnchor, options.annotatePrefix);
+}
diff --git a/compiler/lib/Pipelines/HloOpt.cpp b/compiler/lib/Pipelines/HloOpt.cpp
index b80e815de..7aa45267e 100644
--- a/compiler/lib/Pipelines/HloOpt.cpp
+++ b/compiler/lib/Pipelines/HloOpt.cpp
@@ -21,6 +21,7 @@
 #include "byteir/Pipelines/Common/Utils.h"
 #include "byteir/Transforms/CanonicalizeExt.h"
 #include "mhlo/transforms/passes.h"
+#include "mlir/Dialect/Func/Transforms/Passes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Transforms/Passes.h"
 
@@ -50,6 +51,7 @@ void addGenericHloFusionPatterns(OpPassManager &pm, const std::string &entry,
   pm.addPass(createCSEPass());
   pm.addNestedPass<func::FuncOp>(createFlattenTuplePass());
 
+  pm.addNestedPass<func::FuncOp>(createReductionFusionPass());
   // Element fusion (always last?)
   // Note: if outlineSingleElemwiseOp is set, element fusion must be the last
   // pass, since it will cluster every elemenwise op which is not fused yet into
@@ -106,6 +108,7 @@ void createHloOptPipelineImpl(OpPassManager &pm, const std::string &entryFunc,
   pm.addPass(createCSEPass());
   pm.addPass(createCanonicalizeExtPass());
   pm.addPass(createSymbolDCEPass());
+  pm.addPass(func::createDuplicateFunctionEliminationPass());
 }
 } // namespace
 
diff --git a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp
index f6a4d55b6..6b11d2503 100644
--- a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp
+++ b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp
@@ -33,7 +33,9 @@ void addGenericLinalgMemrefOptPasses(OpPassManager &pm) {
   pm.addPass(createMemrefCopyToLinalgPass(
       getAttrPlaceholderName(
           byre::ByreDialect::getEntryPointFunctionAttrName()),
-      getByteIRElementwiseFusionAttrName().str()));
+      getByteIRElementwiseFusionAttrName().str(), true));
+  pm.addPass(createMemrefCopyToLinalgPass(
+      getByteIRReductionFusionAttrName().str(), "", false));
 }
 
 void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,
diff --git a/compiler/lib/Pipelines/LinalgTensorOpt.cpp b/compiler/lib/Pipelines/LinalgTensorOpt.cpp
index c220a382a..88c1ab2b9 100644
--- a/compiler/lib/Pipelines/LinalgTensorOpt.cpp
+++ b/compiler/lib/Pipelines/LinalgTensorOpt.cpp
@@ -17,58 +17,39 @@
 
 #include "byteir/Pipelines/LinalgTensorOpt.h"
 #include "byteir/Pipelines/GPU/ElementwiseCodegen.h"
+#include "byteir/Pipelines/GPU/ReductionCodegen.h"
 #include "byteir/Pipelines/Host/Codegen.h"
 
 #include "byteir/Conversion/ToLinalg/ToLinalg.h"
 #include "byteir/Dialect/Linalg/Passes.h"
+#include "byteir/Dialect/Tensor/Passes.h"
 #include "byteir/Dialect/Transform/Transforms/TransformDialectInterpreter.h"
+#include "byteir/Dialect/Transform/Transforms/TransformInsertion.h"
 #include "byteir/Dialect/mhlo/Passes.h"
 #include "byteir/Dialect/mhlo/Transforms/HloFuser.h"
 #include "byteir/Pipelines/Common/Utils.h"
+#include "byteir/Transforms/AnchoredPipeline.h"
 #include "byteir/Transforms/CanonicalizeExt.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Transforms/Passes.h"
 
 using namespace mlir;
 
 namespace {
-void collectBroadcastOperands(
-    mlir::Operation *op,
-    mlir::DenseMap<mlir::Value, dataPlaceType> &collection) {
-
-  auto tensorSlice = dyn_cast<tensor::ExtractSliceOp>(op);
-  if (!tensorSlice) {
-    return;
-  }
-
-  for (Value res : op->getResults()) {
-    bool isBroadcast = false;
-    for (auto &&use : res.getUses()) {
-      if (auto genericOp = dyn_cast<linalg::GenericOp>(use.getOwner())) {
-        auto affineMap =
-            genericOp.getIndexingMapsArray()[use.getOperandNumber()];
-        if (!affineMap.isPermutation() &&
-            affineMap.isProjectedPermutation(/*allowZeroInResults*/ true)) {
-          isBroadcast = true;
-        }
-      }
-    }
-    if (isBroadcast) {
-      collection.insert(std::make_pair(res, std::make_pair(Attribute(), true)));
-    }
-  }
-}
-
-void addGenericLinalgElementwisePasses(OpPassManager &pm) {
+void addGenericLinalgPasses(OpPassManager &pm) {
   pm.addNestedPass<func::FuncOp>(
       createHloFusionToLinalgPass(getByteIRElementwiseFusionAttrName()));
+  pm.addNestedPass<func::FuncOp>(
+      createHloFusionToLinalgPass(getByteIRReductionFusionAttrName()));
   pm.addNestedPass<func::FuncOp>(createUnrealizedCastToLinalgPass());
   pm.addPass(createLinalgElementwiseFusionExtPass(
       /*enableSharedInput*/ true, /*enableDiffShapes*/ false));
   pm.addPass(createCSEPass());
-  {
+  { // elementwise codegen
+    auto elementwiseAnchor = getByteIRElementwiseFusionAttrName().str();
     GPUTileElementwiseOptions options;
-    options.funcAnchor = getByteIRElementwiseFusionAttrName().str();
+    options.funcAnchor = elementwiseAnchor;
     // set to 1 for fully fusion & unroll, and all tiled loops will be coalesced
     // and mapping to LinearIdx.x in later pipeline
     // FIXME: set to real blockSize and mapping tiled loops to the corresponding
@@ -77,13 +58,98 @@ void addGenericLinalgElementwisePasses(OpPassManager &pm) {
     options.warpSize = 32;
     createGPUTileElementwiseTransform(pm, options);
     pm.addPass(createTransformDialectInterpreter(true));
+    {
+      OpPassManager anchoredPM(func::FuncOp::getOperationName());
+      anchoredPM.addPass(createCanonicalizerPass());
+      anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass());
+      anchoredPM.addPass(createLinalgElementwiseFusionExtPass(
+          /*enableSharedInput*/ true, /*enableDiffShapes*/ false));
+      anchoredPM.addPass(createCSEPass());
+      anchoredPM.addPass(createCanonicalizerPass());
+      pm.addNestedPass<func::FuncOp>(
+          createAnchoredPipelinePass(elementwiseAnchor, anchoredPM));
+    }
+  }
+  { // reduction codegen
+    auto reductionAnchor = getByteIRReductionFusionAttrName().str();
+    {
+      OpPassManager anchoredPM(func::FuncOp::getOperationName());
+      anchoredPM.addPass(
+          createLinalgCollapseLoops(utils::IteratorType::reduction));
+      anchoredPM.addPass(
+          createLinalgCollapseLoops(utils::IteratorType::parallel));
+      pm.addNestedPass<func::FuncOp>(
+          createAnchoredPipelinePass(reductionAnchor, anchoredPM));
+    }
+
+    GPUSplitGridReductionOptions splitGridRedOptions;
+    splitGridRedOptions.funcAnchor = reductionAnchor;
+    createGPUSplitGridReductionTransform(pm, splitGridRedOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
     pm.addPass(createCanonicalizerPass());
+
+    GPUTileGridReductionOptions tileGridRedOptions;
+    tileGridRedOptions.funcAnchor = reductionAnchor;
+    tileGridRedOptions.blockSize = 512;
+    createGPUTileGridReductionTransform(pm, tileGridRedOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+    {
+      OpPassManager anchoredPM(func::FuncOp::getOperationName());
+      anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass());
+      anchoredPM.addPass(createCanonicalizerPass());
+      anchoredPM.addPass(createCSEPass());
+      pm.addNestedPass<func::FuncOp>(
+          createAnchoredPipelinePass(reductionAnchor, anchoredPM));
+    }
+
+    GPUSplitBlockReductionOptions splitBlockRedOptions;
+    splitBlockRedOptions.funcAnchor = reductionAnchor;
+    splitBlockRedOptions.splitFactor = 16;
+    createGPUSplitBlockReductionTransform(pm, splitBlockRedOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+    pm.addPass(createCanonicalizerPass());
+
+    GPUTileBlockReductionOptions tileBlockRedOptions;
+    tileBlockRedOptions.funcAnchor = reductionAnchor;
+    tileBlockRedOptions.blockSize = 512;
+    createGPUTileBlockReductionTransform(pm, tileBlockRedOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+    {
+      OpPassManager anchoredPM(func::FuncOp::getOperationName());
+      anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass());
+      anchoredPM.addPass(createCanonicalizerPass());
+      anchoredPM.addPass(createCSEPass());
+      pm.addNestedPass<func::FuncOp>(
+          createAnchoredPipelinePass(reductionAnchor, anchoredPM));
+    }
+
+    GPUTileThreadReductionOptions tileThreadRedOptions;
+    tileThreadRedOptions.funcAnchor = reductionAnchor;
+    createGPUTileThreadReductionTransform(pm, tileThreadRedOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+    {
+      OpPassManager anchoredPM(func::FuncOp::getOperationName());
+      anchoredPM.addPass(createLinalgFoldUnitExtentDimsPass());
+      anchoredPM.addPass(createCanonicalizerPass());
+      anchoredPM.addPass(createCSEPass());
+      pm.addNestedPass<func::FuncOp>(
+          createAnchoredPipelinePass(reductionAnchor, anchoredPM));
+    }
+
+    pm.addPass(createDetensorizeTransformInsertionPass(reductionAnchor));
+    pm.addPass(createTransformDialectInterpreter(true));
+    pm.addPass(createCanonicalizeExtPass());
+    pm.addPass(createRewriteInDPSTransformInsertionPass(reductionAnchor));
+    pm.addPass(createTransformDialectInterpreter(true));
+    pm.addPass(createCanonicalizerPass());
+    {
+      OpPassManager anchoredPM(func::FuncOp::getOperationName());
+      anchoredPM.addPass(createTensorPadSpecializationPass());
+      anchoredPM.addPass(bufferization::createEmptyTensorEliminationPass());
+      pm.addNestedPass<func::FuncOp>(
+          createAnchoredPipelinePass(reductionAnchor, anchoredPM));
+    }
   }
-  pm.addPass(createLinalgFoldUnitExtentDimsPass());
-  pm.addPass(createLinalgElementwiseFusionExtPass(
-      /*enableSharedInput*/ true, /*enableDiffShapes*/ false));
-  pm.addPass(createCSEPass());
-  pm.addPass(createCanonicalizerPass());
 }
 
 void addCPULinalgOptPasses(OpPassManager &pm) {
@@ -110,7 +176,7 @@ void createLinalgTensorOptPipelineImpl(OpPassManager &pm,
   if (target == "CPU") {
     addCPULinalgOptPasses(pm);
   } else {
-    addGenericLinalgElementwisePasses(pm);
+    addGenericLinalgPasses(pm);
   }
 }
 } // namespace
diff --git a/compiler/lib/Transforms/Bufferize.cpp b/compiler/lib/Transforms/Bufferize.cpp
index c93e964ef..0d0d14c4a 100644
--- a/compiler/lib/Transforms/Bufferize.cpp
+++ b/compiler/lib/Transforms/Bufferize.cpp
@@ -17,7 +17,6 @@
 
 #include "byteir/Transforms/Bufferize.h"
 
-#include "./PassDetail.h"
 #include "byteir/Dialect/Ace/AceDialect.h"
 #include "byteir/Dialect/Byre/ByreDialect.h"
 #include "byteir/Dialect/Byre/Transforms/BufferizableOpInterfaceImpl.h"
@@ -40,6 +39,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Func/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
@@ -62,6 +62,8 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Operation.h"
 
+#include "./PassDetail.h"
+
 using namespace mlir;
 using namespace mlir::bufferization;
 
@@ -93,6 +95,28 @@ struct OneShotBufferizePass
     vector::registerBufferizableOpInterfaceExternalModels(registry);
   }
 
+  /// Returns true iff the memory space of `type` is a gpu::AddressSpaceAttr
+  /// whose value equals GPUDialect::getWorkgroupAddressSpace(); any other
+  /// memory space yields false.
+  static bool isGPUSharedMem(MemRefType type) {
+    // dyn_cast_or_null tolerates an absent (null) memory-space attribute.
+    auto space =
+        llvm::dyn_cast_or_null<gpu::AddressSpaceAttr>(type.getMemorySpace());
+    return space &&
+           space.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace();
+  }
+
+  /// Creates an `AllocT` op for `type`/`dynShape` and returns its result; an
+  /// i64 alignment attribute is passed only if `bufferAlignment` is non-zero.
+  template <typename AllocT>
+  static auto createAlloc(OpBuilder &b, Location loc, MemRefType type,
+                          ValueRange dynShape, size_t bufferAlignment) {
+    if (bufferAlignment == 0)
+      return b.create<AllocT>(loc, type, dynShape).getResult();
+    auto alignment = b.getI64IntegerAttr(bufferAlignment);
+    return b.create<AllocT>(loc, type, dynShape, alignment).getResult();
+  }
+
   void runOnOperation() override {
     bufferization::OneShotBufferizationOptions opts;
     opts.allowReturnAllocs = true;
@@ -101,6 +125,29 @@ struct OneShotBufferizePass
         bufferization::LayoutMapOption::IdentityLayoutMap);
     opts.createDeallocs = false;
     opts.bufferAlignment = 0;
+    opts.allocationFn = [](OpBuilder &b, Location loc, MemRefType type,
+                           ValueRange dynShape,
+                           unsigned int bufferAlignment) -> FailureOr<Value> {
+      if (isGPUSharedMem(type)) {
+        return createAlloc<memref::AllocaOp>(b, loc, type, dynShape,
+                                             bufferAlignment);
+      }
+      return createAlloc<memref::AllocOp>(b, loc, type, dynShape,
+                                          bufferAlignment);
+    };
+    opts.deallocationFn = [](OpBuilder &b, Location loc,
+                             Value allocatedBuffer) -> LogicalResult {
+      if (auto bufferType =
+              llvm::dyn_cast_or_null<MemRefType>(allocatedBuffer.getType())) {
+        if (isGPUSharedMem(bufferType)) {
+          return success();
+        }
+      }
+
+      // Default buffer deallocation via DeallocOp.
+      b.create<memref::DeallocOp>(loc, allocatedBuffer);
+      return success();
+    };
 
     // deny some corner cases
     opts.opFilter.denyOperation([&](Operation *op) {
@@ -272,6 +319,180 @@ LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
   return success();
 }
 } // namespace CallOpBufferizableOpInterfacePatch
+
+// ------------------------------------------------------------------------ //
+// Patch of TensorInsertOp
+// ------------------------------------------------------------------------ //
+namespace TensorInsertPatch {
+// Override: operand #1 is not a memory read when it is a rank-0 tensor
+// (presumably because the insert replaces its only element); all else reads.
+bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                            const AnalysisState &state) {
+  assert(isa<DestinationStyleOpInterface>(op) &&
+         "expected that op implements DestinationStyleOpInterface");
+
+  if (opOperand.getOperandNumber() != 1)
+    return true;
+  auto destType = opOperand.get().getType().cast<RankedTensorType>();
+  return destType.getRank() != 0;
+}
+
+} // namespace TensorInsertPatch
+
+/// NOTE: despite its name, this returns true when `insertSliceOp` does NOT
+/// overwrite its whole destination. It returns false only if all offsets are
+/// 0, all strides are 1 and every size equals the matching dest dim, e.g.:
+///   tensor.insert_slice %a into %t[0][10][1] : ... into tensor<10xf32>
+template <typename OpTy> static bool overwriteEntireTensor(OpTy insertSliceOp) {
+  auto isZero = [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); };
+  auto isOne = [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); };
+  if (!llvm::all_of(insertSliceOp.getMixedOffsets(), isZero))
+    return true;
+
+  // A non-constant size gives nullopt, which never equals a dest dim size.
+  RankedTensorType destType = insertSliceOp.getDestType();
+  for (const auto &it : llvm::enumerate(insertSliceOp.getMixedSizes()))
+    if (getConstantIntValue(it.value()) != destType.getDimSize(it.index()))
+      return true;
+  return !llvm::all_of(insertSliceOp.getMixedStrides(), isOne);
+}
+
+/// Returns true when `extractSliceOp` and `insertSliceOp` are both non-null,
+/// address equivalent buffers, and use the same offsets, sizes and strides.
+template <typename OpTy>
+static bool areEquivalentSlices(const AnalysisState &state,
+                                tensor::ExtractSliceOp extractSliceOp,
+                                OpTy insertSliceOp) {
+  if (!(extractSliceOp && insertSliceOp))
+    return false;
+  // Identical ops skip the buffer-equivalence query on source vs. dest.
+  const bool sameBuffer =
+      extractSliceOp == insertSliceOp ||
+      state.areEquivalentBufferizedValues(extractSliceOp.getSource(),
+                                          insertSliceOp.getDest());
+  return sameBuffer &&
+         sameOffsetsSizesAndStrides(extractSliceOp, insertSliceOp,
+                                    isEqualConstantIntOrValue);
+}
+
+/// Returns true if every value returned by the reverse use-def chain search
+/// from `value` is produced by an ExtractSliceOp matching `insertSliceOp`.
+template <typename OpTy>
+static bool matchesInsertDestination(const AnalysisState &state, Value value,
+                                     OpTy insertSliceOp) {
+  // Predicate: `candidate` is the result of a matching extract_slice.
+  auto isMatchingExtract = [&](Value candidate) -> bool {
+    auto producer = candidate.getDefiningOp<tensor::ExtractSliceOp>();
+    return producer && areEquivalentSlices(state, producer, insertSliceOp);
+  };
+
+  auto chainEnds =
+      state.findValueInReverseUseDefChain(value, isMatchingExtract);
+  return llvm::all_of(chainEnds, isMatchingExtract);
+}
+
+/// Returns true if the read `uRead` and the write `uConflictingWrite` match one
+/// of the insert-slice patterns below and therefore do not conflict; `OpTy` is
+/// the insert-slice-like op type. The `op` argument is not inspected.
+template <typename OpTy>
+static bool isNotConflictingInsertSliceLikeOp(Operation *op, OpOperand *uRead,
+                                              OpOperand *uConflictingWrite,
+                                              const AnalysisState &state) {
+  Operation *readingOp = uRead->getOwner();
+  Operation *conflictingWritingOp = uConflictingWrite->getOwner();
+
+  // Rules for matching ExtractSliceOp/InsertSliceOp pairs; uRead side first.
+  if (auto insertSliceOp = dyn_cast<OpTy>(readingOp)) {
+    // As an example, consider the following IR.
+    //
+    // %0 = tensor.extract_slice %t[%a, %b][%c, %d][1, 1] {inplace = [true] }
+    // %1 = linalg.fill %cst, %0 {inplace= [true] }
+    // %2 = tensor.insert_slice %1 into %t[%a, %b][%c, %d][1, 1]
+    //     {inplace= [true] }
+
+    // TODO: Use insertSliceOp.getDestOpOperand etc. when available.
+    if (uRead == &insertSliceOp->getOpOperand(1) /*dest*/ &&
+        matchesInsertDestination(state, uConflictingWrite->get(),
+                                 insertSliceOp))
+      // Case 1: The main insight is that InsertSliceOp reads only part of
+      // the destination tensor. The overwritten area is not read. If
+      // uConflictingWrite writes into exactly the memory location that is
+      // being read by uRead, this is not a conflict.
+      //
+      // In the above example:
+      // uRead             = OpOperand 1 (%t) of tensor.insert_slice
+      // uConflictingWrite = OpOperand 1 (%0) of linalg.fill
+      //
+      // The read of %t does not conflict with the write of the FillOp
+      // (same aliases!) because the area that the FillOp operates on is
+      // exactly the one that is *not* read via %t.
+      return true;
+
+    if (uRead == &insertSliceOp->getOpOperand(0) /*source*/ &&
+        uConflictingWrite == &insertSliceOp->getOpOperand(1) /*dest*/ &&
+        (overwriteEntireTensor(insertSliceOp) ||
+         matchesInsertDestination(state, uRead->get(), insertSliceOp)))
+      // Case 2: The read of the source tensor and the write to the dest
+      // tensor via an InsertSliceOp is not a conflict if the read is
+      // reading exactly that part of an equivalent tensor that the
+      // InsertSliceOp is writing.
+      //
+      // In the above example:
+      // uRead             = OpOperand 0 (%1) of tensor.insert_slice
+      // uConflictingWrite = OpOperand 1 (%t) of tensor.insert_slice
+      // NOTE(review): overwriteEntireTensor() returns true when the dest is
+      // NOT fully overwritten (see its return statement) - confirm intent.
+      return true;
+  }
+
+  // If uConflictingWrite is an InsertSliceOp...
+  if (auto insertSliceOp = dyn_cast<OpTy>(conflictingWritingOp))
+    // Example: take the IR shown in the first branch (%0 = extract_slice,
+    // %1 = linalg.fill, %2 = insert_slice of %1 into %t) and append
+    //   %3 = vector.transfer_read %1, %cst
+    //
+    // Then:
+    // uRead             = OpOperand 0 (%1) of vector.transfer_read
+    // uConflictingWrite = OpOperand 1 (%t) of tensor.insert_slice
+    // definition        = %1
+    //
+    // This is not a conflict because the InsertSliceOp overwrites the
+    // memory segment of %1 with the exact same data. (Effectively, there
+    // is no memory write here.)
+    if (uConflictingWrite == &insertSliceOp->getOpOperand(1) /*dest*/ &&
+        state.areEquivalentBufferizedValues(uRead->get(),
+                                            insertSliceOp.getSource()) &&
+        matchesInsertDestination(state, insertSliceOp.getSource(),
+                                 insertSliceOp))
+      return true;
+
+  return false;
+}
+
+// ------------------------------------------------------------------------ //
+// Patch of TensorParallelInsertSlice
+// ------------------------------------------------------------------------ //
+namespace TensorParallelInsertSlicePatch {
+/// The source operand is always read; the destination counts as read exactly
+/// when overwriteEntireTensor() returns true for the op.
+bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                            const AnalysisState &state) {
+  auto parallelInsert = cast<tensor::ParallelInsertSliceOp>(op);
+  OpOperand *source = &op->getOpOperand(0);
+  if (&opOperand == source)
+    return true;
+  // Any operand other than the source is expected to be the destination.
+  assert(&opOperand == &parallelInsert->getOpOperand(1) && "expected dest");
+  return overwriteEntireTensor(parallelInsert);
+}
+/// Forwards to the conflict rules shared by insert-slice-like ops.
+bool isNotConflicting(Operation *op, OpOperand *uRead,
+                      OpOperand *uConflictingWrite,
+                      const AnalysisState &state) {
+  return isNotConflictingInsertSliceLikeOp<tensor::ParallelInsertSliceOp>(
+      op, uRead, uConflictingWrite, state);
+}
+} // namespace TensorParallelInsertSlicePatch
 } // namespace
 
 // TODO: removed this once upstrem fixed it
@@ -279,6 +500,21 @@ RegisterOpInterfaceOverride(
     /*Op=*/func::CallOp, /*Interface=*/BufferizableOpInterface,
     /*InterfaceMethod=*/bufferize,
     /*Impl=*/&CallOpBufferizableOpInterfacePatch::bufferize);
+RegisterOpInterfaceOverride(
+    /*Op=*/tensor::InsertOp, /*Interface=*/BufferizableOpInterface,
+    /*InterfaceMethod=*/bufferizesToMemoryRead,
+    /*Impl=*/
+    &TensorInsertPatch::bufferizesToMemoryRead);
+RegisterOpInterfaceOverride(
+    /*Op=*/tensor::ParallelInsertSliceOp, /*Interface=*/BufferizableOpInterface,
+    /*InterfaceMethod=*/bufferizesToMemoryRead,
+    /*Impl=*/
+    &TensorParallelInsertSlicePatch::bufferizesToMemoryRead);
+RegisterOpInterfaceOverride(
+    /*Op=*/tensor::ParallelInsertSliceOp, /*Interface=*/BufferizableOpInterface,
+    /*InterfaceMethod=*/isNotConflicting,
+    /*Impl=*/
+    &TensorParallelInsertSlicePatch::isNotConflicting);
 
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 byteir::createOneShotBufferizePass() {
diff --git a/compiler/lib/Transforms/MemoryPlanning.cpp b/compiler/lib/Transforms/MemoryPlanning.cpp
index b00ae8e8b..70e9cce2c 100644
--- a/compiler/lib/Transforms/MemoryPlanning.cpp
+++ b/compiler/lib/Transforms/MemoryPlanning.cpp
@@ -220,10 +220,11 @@ template <typename CompareT> class SortedPackingStrategy {
       : windowSize(windowSize), alignment(alignment), compare(compare) {}
 
   /// Optimize the buffer allocations.
-  void optimze(const mlir::bufferization::BufferPlacementAllocs &allocs,
-               const UserangeAnalysis &userangeAnalysis,
-               std::vector<PackedBuffer> &packedBuffers,
-               std::function<bool(Value)> isValidAllocation) {
+  void optimze(
+      const mlir::bufferization::BufferPlacementAllocs::AllocEntryList &allocs,
+      const UserangeAnalysis &userangeAnalysis,
+      std::vector<PackedBuffer> &packedBuffers,
+      std::function<bool(Value)> isValidAllocation) {
     AllocInfoList allocInfos;
     allocInfos.reserve(std::distance(allocs.begin(), allocs.end()));
 
@@ -344,7 +345,7 @@ template <typename CompareT> class SortedPackingStrategy {
   /// maximal userange.
   size_t computeAllocationInfos(
       AllocInfoList &allocInfos, const UserangeAnalysis &userangeAnalysis,
-      const mlir::bufferization::BufferPlacementAllocs &allocs,
+      const mlir::bufferization::BufferPlacementAllocs::AllocEntryList &allocs,
       std::function<bool(Value)> isValidAllocation) {
     // Create allocInformations and store them in allocInfos.
     size_t maxUserangeId = 0;
@@ -405,13 +406,15 @@ template <typename CompareT> class SortedPackingStrategy {
 /// argument.
 template <typename AllocOpT>
 class BufferPacking : bufferization::BufferPlacementTransformationBase {
+  static constexpr bool is_alloca = std::is_same_v<AllocOpT, memref::AllocaOp>;
+
 public:
   template <typename StrategyT>
   BufferPacking(Operation *op, StrategyT strategy,
                 std::function<bool(Value)> couldReuseAllocation)
       : BufferPlacementTransformationBase(op), liveness(op),
-        userangeAnalysis(op, &liveness, initAllocs(op), aliases),
-        dominators(op) {
+        allocs(initAllocs(op)),
+        userangeAnalysis(op, &liveness, allocs, aliases), dominators(op) {
     std::vector<PackedBuffer> packedBuffers;
     strategy.optimze(allocs, userangeAnalysis, packedBuffers,
                      couldReuseAllocation);
@@ -434,6 +437,7 @@ class BufferPacking : bufferization::BufferPlacementTransformationBase {
 
 private:
   byteir::Liveness liveness;
+  bufferization::BufferPlacementAllocs::AllocEntryList allocs;
   UserangeAnalysis userangeAnalysis;
   /// The current dominance info.
   DominanceInfo dominators;
@@ -451,13 +455,24 @@ class BufferPacking : bufferization::BufferPlacementTransformationBase {
                                dominators);
   }
 
+  /// Builds the list of allocation entries this transformation operates on.
+  /// For memref::AllocaOp, every alloca found by walking `op` is recorded
+  /// with a null second element; for any other AllocOpT the entries held by
+  /// BufferPlacementTransformationBase are copied. The list is returned by
+  /// value.
-  const bufferization::BufferPlacementAllocs &initAllocs(Operation *op) {
+  bufferization::BufferPlacementAllocs::AllocEntryList
+  initAllocs(Operation *op) {
     if constexpr (std::is_same_v<AllocOpT, memref::AllocaOp>) {
+      bufferization::BufferPlacementAllocs::AllocEntryList ret;
       op->walk([&](memref::AllocaOp alloca) {
-        allocs.registerAlloc({alloca.getResult(), nullptr});
+        ret.emplace_back(alloca.getResult(), nullptr);
       });
+      return ret;
+    } else {
+      // Qualified on purpose: this class declares its own `allocs` member.
+      auto &&baseAllocs = BufferPlacementTransformationBase::allocs;
+      return {baseAllocs.begin(), baseAllocs.end()};
     }
-    return allocs;
   }
 
   void createBufferAndViews(const PackedBuffer &packedBuffer) {
@@ -505,7 +514,7 @@ class BufferPacking : bufferization::BufferPlacementTransformationBase {
 };
 
 template <typename AllocOp>
-inline void doBufferPacking(mlir::func::FuncOp func, size_t alignment,
+inline void doBufferPacking(FunctionOpInterface func, size_t alignment,
                             std::function<bool(Value)> couldReuseAllocation) {
   SortedPackingStrategy<AllocInfoMemSizeCompare> strategy(
       0,         // windowSize
@@ -517,10 +526,14 @@ inline void doBufferPacking(mlir::func::FuncOp func, size_t alignment,
 
 struct MemoryPlanningPass : public MemoryPlanningBase<MemoryPlanningPass> {
   MemoryPlanningPass() = default;
+  /// Constructs the pass from explicit values; each argument is stored in the
+  /// pass member of the same name.
-  MemoryPlanningPass(size_t alignment,
+  MemoryPlanningPass(size_t alignment, bool alloca, size_t memSpace,
                      std::function<bool(Value)> couldReuseAllocation)
       : MemoryPlanningBase() {
     this->alignment = alignment;
+    this->alloca = alloca;
+    this->memSpace = memSpace;
     this->couldReuseAllocation = couldReuseAllocation;
   }
 
@@ -559,11 +570,18 @@ struct MemoryPlanningPass : public MemoryPlanningBase<MemoryPlanningPass> {
 };
 } // namespace
 
+/// Creates a MemoryPlanningPass through its default constructor.
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::createMemoryPlanningPass() {
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+mlir::createMemoryPlanningPass() {
   return std::make_unique<MemoryPlanningPass>();
 }
 
+/// Creates a MemoryPlanningPass, forwarding `alignment`, `alloca`, `memSpace`
+/// and `couldReuseAllocation` to its constructor.
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::createMemoryPlanningPass(
-    size_t alignment, std::function<bool(Value)> couldReuseAllocation) {
-  return std::make_unique<MemoryPlanningPass>(alignment, couldReuseAllocation);
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+mlir::createMemoryPlanningPass(
+    size_t alignment, bool alloca, size_t memSpace,
+    std::function<bool(Value)> couldReuseAllocation) {
+  return std::make_unique<MemoryPlanningPass>(alignment, alloca, memSpace,
+                                              couldReuseAllocation);
 }
diff --git a/compiler/lib/Utils/Utils.cpp b/compiler/lib/Utils/Utils.cpp
index aae017171..0dc02bb91 100644
--- a/compiler/lib/Utils/Utils.cpp
+++ b/compiler/lib/Utils/Utils.cpp
@@ -452,12 +452,19 @@ Value mlir::getSlice(OpBuilder &b, Location loc, Value source,
 
 OpFoldResult mlir::canonicalizeOpFoldResult(OpFoldResult ofr, bool enableFold) {
   if (auto val = ofr.dyn_cast<Value>()) {
-    SmallVector<Value> foldResults;
     if (enableFold) {
-      OpBuilder builder(val.getContext());
-      Operation *op = val.getDefiningOp();
-      if (op && !failed(builder.tryFold(val.getDefiningOp(), foldResults))) {
-        val = foldResults[0];
+      // Only an op result has a defining op to fold; any other value is
+      // returned as-is below.
+      if (auto opResult = llvm::dyn_cast<OpResult>(val)) {
+        OpBuilder builder(opResult.getOwner());
+        SmallVector<Value> foldResults;
+        // tryFold can report success without filling `foldResults` (in-place
+        // fold), so bounds-check before picking this result's folded value.
+        unsigned resultNumber = opResult.getResultNumber();
+        if (succeeded(builder.tryFold(opResult.getOwner(), foldResults)) &&
+            resultNumber < foldResults.size()) {
+          val = foldResults[resultNumber];
+        }
       }
     }
     return getAsOpFoldResult(val);
diff --git a/compiler/numerical/hlo/canonicalize_ext.mlir b/compiler/numerical/hlo/canonicalize_ext.mlir
index 924cfa162..dd961caee 100644
--- a/compiler/numerical/hlo/canonicalize_ext.mlir
+++ b/compiler/numerical/hlo/canonicalize_ext.mlir
@@ -273,3 +273,46 @@ func.func @fold_large_constant_reverse_float_1(%arg0: tensor<1x3x3x128x64xf32>)
 // CHECK-NEXT: mhlo.constant
 // CHECK-NEXT: mhlo.constant
 // CHECK-NOT: mhlo.reverse
+
+// Gather from a constant operand using iota indices along dimension 0; the
+// expected output contains only two constants before the return.
+func.func @replace_gather_with_input_0() -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) {
+  %0 = mhlo.constant dense<1.000000e+00> : tensor<64x128xf16>
+  %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64>
+  %2 = "mhlo.gather"(%0, %1) {dimension_numbers = #mhlo.gather<offset_dims = [1], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 1>, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16>
+  %3 = mhlo.reshape %2 : (tensor<64x128xf16>) -> tensor<1x64x128xf16>
+  %4 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16>
+  return %3, %4 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16>
+}
+// CHECK-LABEL: @replace_gather_with_input_0
+// CHECK-NEXT: mhlo.constant
+// CHECK-NEXT: mhlo.constant
+// CHECK-NEXT: return
+
+// Same gather pattern with a function argument as the operand; the expected
+// output keeps only the reshape and broadcast_in_dim before the return.
+func.func @replace_gather_with_input_1(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) {
+  %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64>
+  %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather<offset_dims = [1], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 1>, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16>
+  %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16>
+  %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16>
+  return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16>
+}
+// CHECK-LABEL: @replace_gather_with_input_1
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: mhlo.broadcast_in_dim
+// CHECK-NEXT: return
+
+// Gather along dimension 1 (start_index_map = [1]) with iota indices; the
+// expected output keeps only the reshape and broadcast_in_dim before the return.
+func.func @replace_gather_with_input_2(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) {
+  %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<128xi64>
+  %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather<offset_dims = [0], collapsed_slice_dims = [1], start_index_map = [1], index_vector_dim = 1>, indices_are_sorted = false, slice_sizes = dense<[64, 1]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<128xi64>) -> tensor<64x128xf16>
+  %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16>
+  %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16>
+  return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16>
+}
+// CHECK-LABEL: @replace_gather_with_input_2
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: mhlo.broadcast_in_dim
+// CHECK-NEXT: return
diff --git a/compiler/numerical/hlo/hlo_move_down.mlir b/compiler/numerical/hlo/hlo_move_down.mlir
index 0676ae6f5..0afdd1ce3 100644
--- a/compiler/numerical/hlo/hlo_move_down.mlir
+++ b/compiler/numerical/hlo/hlo_move_down.mlir
@@ -179,3 +179,25 @@ func.func @broadcast_reshape_dot_with_concat_and_add(%arg0 : tensor<1x64xf16>, %
 // CHECK-NEXT: mhlo.broadcast_in_dim
 // CHECK-NEXT: return
 
+func.func @transpose_move_down_binary_case0(%arg0 : tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> {
+    %0 = mhlo.constant dense<"0x9A4830BD21F00B3C7DFEC9BDF7DAD33C1F0C483DDC57B5BC81D312BEB409853CF9F2F0BCF858843C183DAB3C11C6F5BCAA1CDDBD1F2D073D784D0B3DB0A0FCBDE0458F3C2621DBBDA7D3E3BCA99608BC7E39723C3FD8A63D0671DE3D2C29043B45059DBD057A51BD3157293C086EDC3DF0789FBCFE4A0F3D3F29423DD5D756BAF0C60F3D2FFEA6BD695477BC6A98ADBDD45BEC3D1532593DE3AB2D3D4D23193B0C52A4BDBFC474BC43F4223DB2A1F83A55FA433D5F06093D52146CBD9B1A37BD6A7617BDCA91363E305EDDBCB3DDC0BDDA82B5BC64D8163CDE519C3DE7768CBCBF778D3DC8AFD4BA2269503DB9832E3CEDE6A7BB8AF5F73C658639BB7EDC51BBF6F9BFBB603F063C335269BE0B91EC3B1F248DBC1C51B2BDAB9964BC6E43813CD4872E3B7AD3333DEC88273C428481BC4FCB713D56630B3EC37B0F3C374041BDE5CCCCBDC0685E3DD3D18B3D27ADC23DB086E23B88825BBE3B5425BCA6FE383B5CF0CC3D80FC803D24CDEF3C21DCD0BB6762843C9CDD013EF2BAC23A6552D8BC4460A33D3E314F3D2D69E9BC642193BCB8DB213D8BE53D3C4A653B3DFBF5313D836617BC70FE49BDD6ECA43C8856AB3B96C85C3D6D51BEBDF40248BDAFCE25BCF78108BEB9DD08BE4263233B789B543C6A86C6BD0EBAA13B28A3A03C2B50B53CFD66C6B30D788B3D883134BD6069723D642641BD1D5F0D3E80D58F3D84AB833D559023BD7F7FB7BC2BD7B13CB70B1B3C294693BBB5B2AE3C742B3F3BE099983E6308583CB849A2BC06851B3D60AB86BD505014BDC1BF883B2B436BBD3B6B073D5A65BFBC92C4043D35C882BCAF3A12BE92CBC5BDA2340F3D6508ABBD89D910BC04AD8B3C9E2F693D9D2459BC8D099BBBA4DAE3BC4EB30BBD188C22BDA4F843BD377528BEA6EC253D2759713C7FA1EFBAAFC720BD51B7C93C34E6673D8602E83C871F763DD766CDBC05978A3C0F98FB3C960F8F3D3E49253DAB9A643D5D1E6F3DB0ABDA3A10A15C3D66BDA1BA02CE81BB39B456BEAA35E7BD723D36BD121434BE10C3513DDD10AABD89A172BB47CA2EBDFFFA44BC0E72F93C65CEDA3B257E4C3D0890ADBD63A63DBDE51D17BE90DD52BEF7BEECBD02EB5DBD63BCAD3DDE533CBD7DFD033B591B24BD633FDC3CB39ED93787DD163DEEF841BCF7CF69BC2A975ABD580CA4BDB0F93E3C30D49F3D7EC3B83C3C4BBE3B6BD322BD81EC963DA056933BE7698DBC7E3960BD8F591D3D73CCA139A7843CBC248A623CD871A93DBA5E17BD67D5D33DB35B45BC8FA0823CADB2043D79E8D43CD59FF1BC0ABB783DA14F1E3C9C016E3DFEBE30BC253C98BD85E64B3BEA4A973C9D389DBC674A37BBB05BD03C4C7BC03D5CB6A3BC8E63623D74A3DABC775E90BC41FB833
C206E4A3D3104AC3C0D1070BD053005BBB861A6BDEF768EBC4920303D0E00173DBCB70ABD1BB81FBD536AA73A069144BB267A5B3D5EB7583DB24EDD3BDA2FB3BC89FA41BD3CC37D3BF17204BCDEC6B33C92FD85BBABCB16BD437331BD9857533B8614B1BCCE4FBC3BA44AEFBB11C96B3B7DB8E2BC8262FB3BED9DF93B32AEC53AB35589BC72641BBB51FEF0BCD5C2AFBB94187BBC72C61F3D4853A93D65DC023CE04083BDC9F0E93C5CACCFBB8F6C85BD3E7C1C3CBAEFA4BDF71A91BC34EE2A3CF235883D76189ABC0979F13C6657F53C32E895BC011F0C3DBFDF8F3DE147C13C987B26BCAD41713B6EF78B3CC00A573D9E876D3BC4EBD5BC42F03BBCD3917E3D25C384BC1C189C3CA9252F3ED57D833C378B4F3B96200FBCE6D503BC71A0E23C2DE6353DAF9AF1BB5425163C5DA7BE3B20E26BBD31FD1B3E3AE1663DAD26F03C5DE4243D5CD30FBCCFBA8CB8C1A07FBCE877553CDC176EBA59FF3CBD7F67063E977B173D78F4E9BB4B8B1DBAB311A83C653B4EBCD674113D7DCA35BD584041BD8E86033D9A5E1FBDCD620B3D7F8EC5BA1C789DBC2C3C653D308045BDF9F5B53C75A668BD1DE708BDBBF41ABDD4E2F93CE15040BCC5EC85BC97AF64BC4177543C0831033D5428E6BB4AD9CCBC7518923C76F9BB3D674C513CE163CD3C2F08E1BC01AF8ABD5F2E97BB7E8FBB3CB77C0FBC2C7E023EC0D9F53CA8554BBDE38372BC46D7913CEBE7FF3C4BED28B30A8D473D33332C3CA4E6213DFE9CFFBB94399E3C7907BEBCA5B96FBD3A3B9BBC231262BD587717BC9FC1B73B1E378D3C5B2C31BC97A8263CCA6528BCF2DD01BD477AD33B4692B7BCB9F7EC3DCD95963CE6F9893C7EE3323D50A0AD3DE9F8523D88B102BA8A3B11BD3F0B8BBCCD61A4BC9A75893DCF6C4B3BF605CDBCDDBBC03B284214BD3FD13A382C6763BD8400863C39263FBD1DC8D13B6F302EBDC34911BDED527ABC819A9E3B84DF903CBB8DB03C5C7F853BF8779FBC7C2B10BB13259A3D39DD21BC1E1B86BBF76A983DA42F51BC2D5707BCB6CACF3B7DD20CBD06CED23B2585913D35C07C3CF8331BBBF42942BCC6FB04BDB8BD813C0145D5BC76CF9E3DA2DA7DBCF53DE5BB8489523C01D008BDF9FC6FBC55CF643DD0AE213CF04A0CBD152720BD65FE77BD5A0A5E3CAA7F53BCC807E5BB712FC53D62A544BD8C27EC3950A19A3BC56E993D6B5F3435D99F223E3C3422BD96BD0F3BA18E86BC0A452ABCC4421EBBC3BD7D3DFC89F0BBAA28163EE973243D7BE058BBCF4B84BC1293CCBD3FD121BD391EC3BCA8C6203C80CF833CDB40143DEEC0AABCEBB8F4BB8565883D115E37BC85D860BDA36A8ABDDF61D83C4EC03D3E42DA34BD5744A0BCF2A037BD1D700E3DD7D7D739BAAA333A9D0CD33C66699CBB6038313D7661B7B
C509D743D5D82CB3CDFD7973C1FB71B3DB98C08BC7B23C2BB60E4113CA2C112BDE0970BBC82F1193CFF8AA03A2792E43C6048063DFF23863D76C2513D5D2708BA9D562B3C616615BCB71C123C50DF0BBD30966A3CE832D4BC3E5224BD1EC6803DDAD805BCCE359ABC1E98ADBA9866B23CB924863C461CF33DFE415DBC16FB88BB6F70DCBCC3EA033DF5F0953C9E7E263C0A25193C7CAFF0BC9958763C60807B3930D82E3D5A955ABB77B4BC3D4A8424BE82026B3D238FA03C0987D03CB6A9E53C1E43E03CFF90A63C5E45083D873E3B3C4E2E343D6E9D3D3BB82A65BBCE79BCBB681BBC3DF67139BD5E1D9F3C5A32053DD17097BC782160BDADA846BDFC4A793CBB42F1BCC6C1A9BD8A074D3D8CC3B23DF0DF4A3C4E98433A277BA23BE94F6EBD0BFFCA3C99B29DBCF16E3EBEB34809BD0AECF03CCA16803D3A3EBABC511CA8BCECDA1D3CCAC0073C8D5EC73CBFFDC33C82410CBCF013263E4481D1BC75E6893CE6FA133A658BA53C78F59EBD735381BB7EFDA6BD0CC57A39222F9DBC9825233D8046BEBCEA5ECDBC23E4C23C7FDDE53C3DECAD3C588CD23CB764CABCBAF92DBC173BB3BC1EBED33BFAD54EBC01EB40BC50EA43BD77EC06BE6FB2AFBCBEED1A3BB251063E725600BD3F2348BC663036BB96361E3C8D7F5FBCCAB8BB3BACFF5B3C37CA3CBD7CCC54BD99AE98BBC9F4B93C96D874BE15C9F63C7879C73CC4FAC83C8F5385BDCD4703BE7C0C113CAE69383DD1B414BDAD93D83C568DD83CBA29513DA7430DB4489B8DBC964383BC7C9908BB96D8593A2276A4BDBD8C243DC3009FBD189B8FBE486E7CBC6A2F763D5CDAB4B9F2D07E3C734688BC974877BB2424B3BD8FC579BB884DCD3CE7AD8F3C9CFCD53C0F73E5BC7E8D9BBCF1460BBE136457BCA5AC703E50D2B1BD377EE33B0572D3BBC7C313BDE946F3BC258D20BD31E00EBCA0FCE63B8F97A33CECFCA7BBE5A28EBD044BAF3D9EE2233C2353A33C576344BC39F3A53C3472843BE63FC23CFEE38BBDE4EA92BC5ADE133D826DF13D4C9332BD3AC4E63CB2CE4E3C082DD03BDFB1883C5F3F203ECADF003D7249ED3D26D782BC30652EBBBB4E083EE6AD583D4E26FCB989FEBABCB273173D53297DBD56585EBDA272683C02B4CEBC32B6BF3C64C1423DDF5F82BB17C4053D8BE4C63C23890BBDCA326F3D454692BCF7206DBCD6592A3DA44E23BE3BFD34BB388FA7BA39D7633C7F9E64BB599228BBAE34E23CCAB3D7B522BE36BC231B063D28593C3C130C693D4B0323BD41DBCA3BB52CCFBC431D0D3C7DF62B3C86AD03BB2AE6DC3C7F4F5ABD2D03C8BC6D4EE23BB9D49CBDF8002ABDD79D23BC89A74D3DDAA573BD2985BBBCE5D5AF3C9D1C41BD2BE1A5BDEDC13EBDA2F92FBEFC68143C6268653CBEF63C3D00D8063C0D1D6EB
CED8688BEC422F1BA252AF53C78751C3DD021FBBBD8AC233DBAA1CD3CCB73BBBC09EB5CBD47FB19BE657090BD1FBBBBBC731BA03C2E35D93D85BD2F39E7B9C4BCFF36363D292F08BD6E7049BD787907BE1B661EBD8AD4B239E1D7EFBA0A9D9A3BF985AF3D42269F3B6D33443D3D6F05BD8F14BF3C12F31ABB12D0ACBC6C6D603C8502D83CF6321CBC103891BC9D417E3C05C9E53BCA3A673C11A89D3C696301BDE7BAC43CE4947FBC632B9B3D7428B23D3F3FAEBC2F3D963AB81B9DBC04DA49BD4FFB273C5AFC973B9011293D1098363C3C4E77BDB55155BB2B67923C130D79BDFF2D70BDC42F623C31DD04BE0D2F883D5274753C742A8F3B6A4706BE2CDA78BC4CF7A93CE544F13C8E35413E3941493DC4AE7FBDBEF425BE9A5AB0BCE46E0F3EFD6B04BEBB74C1BD702F44BD5B72AB3CE719883DB8C7043D9E8A8D3D0947943C179923BC7943123C2027473D718D3F3D8A760B3D392832BDA0DC053D27DDA0BC78E0F7BC5D5A1A3DE2AA37BC874747BDBC91B23CA1305F3CFDA26D3AF85B003B8579843DFC9888BC026820BD0086703D9CF3133D889D153DDF0BB03D358DA9BC97B9873C75B2873C9203063C89E3103D4F3F7C3C00B8CF3B5335F43D2EC22EBD858D893A3332C7BCC11D8A3CAD31BB3B3C9840BC826492BD3B3C17BEC2F5BE3D50607DBE19FBF83CBCFAC5BC2689FF3CEEFE3A3D95C7E53CE08C373EA5F0FCBC28D9793C5996333DD423B13DA37C0B3D238A80BC3FCFE7BC7E4C083E1F54E0BDA3ADDD3CB397E43C70E2ECBD444E88B965843C3C8C1AC73CE49A51B4C7F0F53D66CA37BC4C711F3C901142BC983C953D01315BBECB283C3D29CE5A3D2B5A36BD11DF66BDF1398C3C53B224BD8E5B82BC6E44A4BCE9A104BE3B3C833DE551E7BC1A81613C0361A0BCEF5024BB1B5E7A3A21E6ADBD5E953B3D45461BBD891002BD650252BCFF3C24BEA305043CCD79FE3C71914EBC04D98B3C4D0DFF3CD04C463D39F335BD8C29023DE6E86639779BB03B711D61BD5B9652BDC7CBA1BDC532633C3C0E9CBAE434B53D6672063EE178CD3B6E16723DB5CA023D559A973D9CFE0EBB0FC4213D322E68BD9A76973DEE640DBDB08653BC23F4163CE7969A3CD4A19F3D35BFDF3C78AAE2BAF756213D25B76D3BA7B145BD8DB603BEB7AFCE3C908A9DBC345CA53C6198AC3C3395A4BBAE34163DAAE4153D0111C73C5D6D0FBEC68E643D93C3C7BDB52E8B3D6B636CBD2AF7BDBCF119053DD9563CBD7FFDCB3BC87407BEDFE2FABB600D0138F869083DD55213BCFBD14FBDAFC226BDD0D19FBD3734803C8FE3BF3CF371933A55E39A392570B3BE0AB622BEA5C23FBCBBA5223CE6CBC43C2D1AA23C08F1D93CE6C86DBD684E4A3C46AE5D3DB5FAAB3DCA84E83C471CF5BCFB49C8BA150447B
D7A84A53BF04F47BCF9E5643D1375743B1DF8853DDB07BDBCC625593C2245CE3A8E5AEF3CCE5BF6BC273C6C3C17213B3CC575A03D2A8A8FBD3D055DBD9FEC553D2712ACBDAEB1443C6EE8083DB63ECBBB7D000DBC3F5C1C3D7EF910BD920B06BC824BBB3C7C0BE83C0866033E175C11BCFF7ABDBC55BC0FBCA5C39EBAD6FEC43CEF24CF3C11EDF8BC460F31BDC4340EBD6C1E87BA6E42CA3BDA50D2BBC750003D3CAD22BD1A34E4BA14B60DBDA6EA883B59D2243CABBC23BC93B7ECBB89E6BABC926B8D3DE995063D14591FBC644C06BAFB30CC38D98102BCE4DC1E3C28078F3D5414BB3DCD1BE5BCA0ABD7BCFC5258BB1B79433A1198B8BD9CCAD7BC7AA774BD90ED803C27AA013DD6ED343C502AB4BC01288EBD70B9CEBC5776163C25C81C3D7738C33C7F389C3C8F160DBC56C007BD9E98CCBC571DAE3DAA0B24BDEF7870BCCD1F93BC98C46D3C0FCCCB3AB46F373DC511E63DD45B43BCAB777A3AB7AACABCE3C2F6BA51E3033D01CCB3BBE7E08BBB5DEC983C637E46392F97FCBB7E9029BC579A813D4B84B43C9ABE983C9DF5C1BCA3182A399B2D15BDC0EB37B95F7D5EBDFF87D83CD1DF9E3DE183E83C548B8A3C5B7F153CA849983D76246B3CE4D5F63ABDAED9BC8F2EDD3B8E25573C81A692BBDEE4AC3CD77B6B3D4712213CBC4B4A3DD73E9CBCB799433C549CA73C01341BBD1A32AA39506606BC9E7959BCC13DE3BCD7C7B3BC56AC793D932D0F3EAE6432BCD7F0B8BD3DB1CCBCC6639F3D2106A13D030D18BA554829BDBAB73ABD3BA31D3B2DC465BC668201BC2A0924BD48E21D3D78C9073D0718EF3C9A848C3CE606CE3B45C4B1B3555B053C37F19DBC491B25BCCEAB47BD5D3C553D4639C83B6B6F1CBD1AD7A7BCB44B88BCD55DD3BC624CFC3A486DA2BC7DC84F3D960A0ABDE018B7BC6951CE37BB3ADD3C83C178BDFB13E73DE917A53BE2FB00BC609C8F3C0CA5CC3CA814CCBB516B833CBF7CF3BB9022CABDDCF6063DEA02873D2182653C140635BC5739413C6829B23A8F5306BD317C32BD9E00863B944BF83BEC5B22BCF2A3A93C2767833CB765413B2E6185BCEA56BC3CC99A813C8C600BBDE8FA9D3BBF83B73C9FDD1C3EEC719DBC6EB0DE3AF384B4BDB9E9B53C7A890FBC9017183C59851ABC4FB0B1BB42EC8B3C6935973CAD0024BBBA91043CA2DBCBB95D628B39ECA107BC3E869C3DED0F88BB12684C3CC97109BDAEA1073CB031093D9A43EABB46B8033C7E5B7CBDBC3721BC72ECDFBBA3EE4DBCDAF7AFBCC46B24BDB0450B3D21E7913C12C7133BE7C6643C0CF3923CC1461EB550EE013ED9F8B4BC7F8DF83BBA9F48BD0D95F4BB65DE59BC54A77B3D83B49BBBD3A5A93D15EB893B7DC24E3A2F16BE3B589B083D2C13533C2249A8BB79CDAB3DC8D8BD3CC708C83
C10325F3CAA4BF5BCEAC7AE3C42E744BDA6D8C0BAD796953CE872A83B3C85AA3DEA2C033DB3B11CBCAFCF4C3C90A420BD48A7CF3B52A8603A0D0B673CE3EA79BC6BFD0FBDA1A6E9BAB2A7883D63CECFBDA068583D02F6953C2314DDBCB915393D1ADE45BD6F25DABC3B02C53B47F34ABD179DFE3BF759003D9BA70E3D5E05233DC67DE53C7E68003DB788ABBCF77164BBF7EC843CDDE643BC6A2D5E3D116DF3BDC86D31BEC3052EBCFB8D35BC23EF263C72E4D7BB536B2C3CF74A2D3CA31E21BC48C590BD13720BBEA71DEE3B94972EBD08D0F73B09BE313C4D5B9CBDF0AD123E6AABD13D5F014CBA5BCC143DB19712BC36B89A3CA8A2603D11DD37BAE7D5083D9D809DBDA28A82BE781241BDE6B6323D9C2D083CF70C49BDC1F5263D8C2123BD8BD8A5BB2096F23BA8223A3D64FB39BC319D023CFED9A73C1CC717BEFAAB2B3CB0E2053C2C677ABDD33B2FBC5D532B3D75B66BBBF846B43D338DAA3B8AA3D23CCAC14E3B03B8F53B72478CBC91E08CBCD5847BBCEB3EB53C8E136F3CB66E15BD070A89BC415541BCC737263DCF70FA3BA5B54CBC076478BB5666693C0E5BC6BC70A335BD70BE173B34E40F3995C4023C334CE2BAF9AF0E3D9DD9823D49CE6C3C6AA94FBDD554F63BA1B022BE6454533D07BA4C39D33A90BC5DF454BC581477BE8915A5BC98031EBC382FD23D24B7713DCA679A3CA25E1E3DCE360EBD87E6283AA1B7E2BCC40C6D3CFB29193E1691BF3D3727AB3C8543EBBC0DDEBCBD020202BDDDCB563C68BDCC3C46B3CFBCC117B3BD81E2B3BB432C0EBC9DE6BB3D4125963D9BF5D03CBA62733CE1B778BD0EEA01BDFE31BDBB91463BBC7D91FC3BC7CB6D3CFCB7D4BC1DFBA5BC36DFF0330E102BBCB795953D1F72A93C27270E3E1B38B6BC94D5993D4BF5983C21BA753DC79AA13D8853BF3DDE9A66BBAF95223C0EC24D3CB151633C2DB38EBCAA162FBE000828BC503557BC9F3DDFBD389351BD7D40E5BCF763753D48B2923C257AD3BD534646BD7AF3F7BC30FB54BC48E1A03B538C60BD3D97DABD7D4D623DD4DD8C3BF91143BD4C1099BDAA3D203BB60B89BD084A60BCE2DF8B3CE276023D07113F3CF1E184BD3D45053B71F219BE892EEDBDCA4FA03C0740D2BD1ACD273C0DA11E3D4C5AD33CB8DE0F3CB314183D30A38C3DC2A762BBF5084E3BCF0E393C731EA439D96BD83D903CCCBD38E5CDB9F5999F3DB784563D7EC401BD87B1ECBC2C0704BB625B923B92BE22BD79F484BB8A1638BCBC0389BD4326EC3CADBD98BD522993BDAF49133D6F40AF3DF1DB92BD91734FBD863D063C60D604BE7AE882BCBE447BBA0724843D78BA03BD53974937D59091BB3E3ED43A94843B3D03A0FF3CA61D8B3B7144A5BBB99D123D288FB53C84DB0ABCC2AC883D33F849BD01CA90B
DD35922BC04F5BABCD83CB1380A56C1BB1D2E663B90A398BCE2EE85BD619E0ABD40EFC9BDDE46FEBC6F2CE43D443003BCAE8EB93BD4A3AFBC851E9C3C8263AD3CCCEC393DC4C5DA3CA6C1A8BC65BDD0BA30F4C5BC0324373D3A8FFFBBB41AE33CB53699BB60AC8B3CC51F1FBC77F053BE15CD77BCD8CF4D3CFB4BD6BB0B93C2BC28339ABA017395BD86B2E6BC994BD8BC82F43F3CCE9ACFBC54B4A43A10EADCBC06AA643D8996D5BC59C679BCE572FABD5FFC02BC39AB133BCE2CFE3BC0BB29BC577BB5BEC2BE84BCB567CF3B9A78C13CC902F63AB8FD94BCC50650BD8512363C9A6895BC11AEDC3BE6CB503B5A90793B9A5C6EBCBA2F54BC59292B3CAD8294BA8D7DCE3B578EC23C2300B03C5AC785BCFA5083BB4502B13B88634FBD6A047BBC1D7753BC0BDA37BC49C62D3CE02D7E3CC61F65BBA67FCA3B104EB83CDBFD51BBEADEB23CAE10A73CFB86B03BB8F80C3DA81813BB550B20BD2E1A8D3CBE6265BD3D8230BD85FE58BCE64F98BC8CEA1B3D9C2FC0BB93AF9CBBE977EBBC0B59313C3A55C33D209585BBDAAA683DF517B8BC3809E8BC6FDB70BC7F713EBC1E33763CB28D4F3BDB46CB3B0EEEFE3CF0AC503EDC854CBC12214EBCEEB44C3CC8DF14BD0B6C48B995700FBD33ECBDBCABA5F43CF902F4BB9F01A2BC97AF413B3323383C32A5BEBBB060CBBC853C42BCE0F3FDB970B9163BE185013C2C5A94BAB0A7AEBC4612D0BBE64A0CBD861502BCF22A253B68BEC0BCFF9C16BCA2BD0BBD1A8702BD9BAE2FBDC1E89A3C7CC5A3BCA7573EBC0043163C19A1A3BD7FECB2BD6188243D67C9FEBCC88B66BC7213C43C558CFBBDEFDC45BC502F1E3D3013063DA1A17FBC30CCA83C1AEEC63C02EB5B3E9126DBBC1149853D0E93433C0D0E9EBC0966D7BAEF81453350FD20BD185B783C5E7551BE1299E33BD8C8803BCA09ECBCE5BB4D3DAF6E14BDAE06CC3CC322E73CC0225D3A773D5EBC828C32BC91BA2D3C5A13543C3DF282BB0EC50C3D6081953C1BD7D43D8B69B7BAB1479F3D3D1EFA3C40DC83BE93E58CBD8872313C712B293DC531503D71E5AABBD033CA3AFA5A973BB2F8A73BBB79383C71661CBD5406C4BB6597E03C1577E2BC11C9F9BA967FC93A247C913C2A31843B096961BC9A6838BDF1AF113C7379EF3BF90FC13C98FB11BD0CB81E3C766894BC52C9AA3C6556C53D9CB6E63B5B0A843C5BB58B3B4996AC3C32319DBD6D9B62BB09A7233DA9AE0B3CF9D09E3AA5E2FA3939B6063D8CAFABBC2E6BE33B5F6816BF2D764ABA96024DBCDC55D9BCD184C03C64BE8CBCA89380BD41BFBDBC696AE33C6BB1DF3B7380EF3BBA01073DD1005CBB22719B3CA92A09BDC3F602BEAC10DC38D074233D5805CBBDF9E3C6B47EE32FBEAB5896BBDB9677BADB2E5A3C1FC8913CA5E3C1B
C74B9C53B770BEA3A7A74E03C060171BCEEB4DF3CE397833C0A0A97BED960B9BDC4445EBAEF5B46BCC22F573D9F0FD7BDFA4521BD659E093CBE48ECBB4A368DBC7B0602BD3A7CCE3C381F8C3B3323563C4F44A1BC289983BCBB2D28BD44DD113D7F423A3D2C9FB4BA06E53FBB9C2761BA9226CD3C331EADBDCD2A593BA102893C78313EBB5367163C5706D83B23F384BDE9BBF2BC6394F8B8F9A4CF3DAB3C243CB850A93CEDB759B997DF92BC38C285BC7F674C3DE29A003C3C7D603DF2B7EDBA86FA42BC172D3B3C078F033D6C67313C775249BD456AD63CC4998C3CB42E583CC90184BC5CBE22BD1D5003BDE4FC44BC58076EBC3F55E43CFB3CE03AEAF230BCB90627BD57C3A93B89544F3D58CBF53B82F8E83A82F1C2BA4577F63C02CD37BC77BCD83A7314793CCF31423DE392A53B18872FBBA670A6BC5C9E293C0D85A0BC63A1053D64B810BD52274F3DC42C0C3DBABB4A3DBA0BF9BBEB32DF3C11EB89BC602C88BDB816033D96DDA93C93431C3DBFB14EBBE82C97BC882C6E3DEE4BA0BBB8F6F3BCBE18093DA4C126BDBBF4703C8E72553C3F263F3C1C63123E0F43BD3C775AA8BCF9FB92BC148BAB3C79C88E3BFD780B3D73AFE8BC159E97BC9640F5BB1ECE9EBCD0467F3D27925D3DDFB2FFBC9B88CC3C18EC1EBD8CA38BB973BBADBC122E9DBC68B1ACBCEC4D5D3B90F4963D145E3F3C6F7098BC9824443C956E6E3C347C57BCE256D2BBA9B8A7BD77E4753CFF15BF3B6EA138BDF1BF7A3D7205703C19B179BD01D2093DF5260EBDFCF9213C7651D9BCB16BF9BB02215CBC94E81A3D2B5430BCBEA1F53B511007BD4246EDBD072D35BE8748663D371E18BD89758BBCB958A93DA4A7BDBDCF9B3E3D531D413CB18D6FBC716F58BC906C2C3D1576F63C82942E3E25FF283B3ADF453BB9F823BD701FDCBBA048153BD9E1BEB32FABD63CA791E5BCEDCE80BD9DEB5A3C72F1C5BB0947AB3C010F55BC50C625BDB3CD2FBC42061B3CFDF8E43B683A0CBD2F44BBBCF136A7BAE7CE95BB8F85B93CAE5480BC4A99913D99BD1F3DC76E69BCA0DD5BBDAFF363BCF653A93D11A786BCE669153D500AA8BBB65DF3BB93A0803A04A58C3C54C6803C9EE68DBD7CD225BDB81BD9377AC896BC14FA0DB80DE2BB3CB4E383BCD99897BB5DE95CBC7C0EAB3C760535BC173AD93BA3DC1C3D8DD6A0BC117E1E3B1E65C7BC3FDF64BCDA5B5F3D48D91BBB8029FC3C0527383C3EB7C03CD1CD9C3C4604863C1055D1BDB5A3263B12A6783DB6E4EC3BE01109BA20BCDC3B40D7873B4B80BB3C49FD253BF58B313DC31095BC87E0CDB9A130893D10AF63BCA7D9D93C4431DF3B8A11E9BCE635463BDC1E21BD0C4A2F3D624B49BC10F120BD832C193B9CECD53C313D0EBEDC1C64BA03A0A73CC0DE903DA062EBB
34D031BBD76A75CBD8880FEBC56CD253DF637553B8D9A92BC051D4F3DC19F0B39185D793DB3CE553DF03094BC0C629EBCCEA031BE3F6AC6BD796942BCE27A0ABCD0AE3F3C7C0CABBCE1B0AEBB9200A7BB7EC2103DA457E2BC47E23B39435A0CBCEA06063DAFB9A23D3AC0F03B14EA82BC2E05CDBDE5348C3B69F21F3D178775BA31FCD23CD3BBC3398781B13D3EDD8ABC0E781D3D8125943C67391DBD14E3083D096497BB3B32CDBDDF883FBDE98233BD22F61EBE472D07BDE462D2BB8DC643BD1D45E23C0F45543DBB97C03BAACE6E3C24F53BBC701F6F3AECA645BE5A0C8C3C437AE83CFB9686BCE215B5BCE885C6BBC7FBBA3BADB7AE3BA98FBCBC6C9480BC719FF53C66EB133D51C2923B9A5691BCF8CD84BC116C6E3B52B2AC3D5C1F0F3BD0418BBC91A3B9BB90250FBE55FC38BA868CAB3D57D601BDB2B52ABE5F06A0BBDA5463BD440C983AE62297BDE14976BD57828DBCDEB5C7BC5992603B90E9BEBC0603F2BDD4FC173DE35EC33B12E1BABACF102CBDD68E07BC781FCA3C4B14C0BADF4726BDACA2CDBB9EAE073D4EB5AA3DEA4907B9CBFB593D5AAA5FBCD8089C3D5AADE2BCA03EE2BD03DDD6BD024EA13CDA95B3BA8B7789BC72A81DBEEA04B4BA6C64C4BA758985BD0A4543BBA8FA02BDD58A83BC5080C23BB953C23C8898A03C7E8B883CEBFCB43D02BE4E3D1FF7C53C5890983A01D0A53C92488BBBD85C3D3D629B36BD7FC2A23C43F20DBCC5FEC1BC7945823C27BA34BD90CCD8BBFF36C0BD55EF973C0F07DC3C466AD03CBE6EDEBBD1D7C53C2FBC3DBD281BD83CE894063DA93D92BD3135353DB39056BA443BC83C058005BDC4069DBCCFF869BBF916C63B530EBBBB7657DA3CB854213C78C5143D4AE63E3DF8E3C53D2BE458BB86B16C3C0821BE3D093A17BB8C1E133DCBD3CA3CA2D2113DB4719A3DAB6B19BC9AE9823D9F3E153A154B973CC552273B0BCA29BDF9DC81B22B2F383D6E9ACA3B3832913CE01D4D3CABB6633D1316753DBEA37DBDA0578CBE625B4FBBA47027BC30C819BC32A4B4BC6B31B9BCB2E297BCFBE5D83D0D1482BD3452983970187E3C5294CF3CFA96293CE4D3ADBCE6D495BD4DC4943C34016E3D8D2B353CE33250BB6A4B2BBD693A683DFBBE12BDAD5DAB3C1AC5A2BCD819D6BBEEE3453D8639093E687687BC857DA53D2502B03D29E657BD658817BB0F2F723DF9190FBA526E87BB4D263ABD36C62C3D3C3FF83B1F56FC3DF68478BD634F573DC10DB03C793D423B982B2EBDDADE3DBE12F9AA3C613A2EBEEEA0093C78386EBCC86D103E36AFC5BCF56B4439EAEF95BC147711BD386D553D71526D3D23DEC03B41BB303DF98A33BDE6292A3D4BDC783C5A6C77BCFB167FBB9A5A9BBC775C6B3BFD39D7BC3055453CFB62AABD2454083CCE7B2E3
C0A0F0E3D3A73AFBC27EB28BBC6A6F93C7E4F253C07F1F7B61CC46A3B7ABDB9BB575F51BDB184BEBC1944DD3CD2D28D3CA6620DBD29B611BAF84D8A3C7DFF043D1E86B13C45C2BEBDF198CE3CD2EB873CEBBAD23D05AF22BB182F1A3AA7548A3C66601A3DB1D350BDE6A6E039859BC1BBB0C17F3DC71F493DEAB5483DC6548D3C156ABFBC25C3D93BA206B53C05BA88BCA35D113D2FEF3CBA709AB4BD5914163C54DB333BA21EBB3C0777433DADB139BD761641BE0317D93C4DE2B5BB651E77BCD95FB13C561A933CBF093BBC5B6AC73C414D49BD1782DEBB79E1673B3DE5073ED17F9EBDFF8C39BB6AC1163C59954ABB11E8DBBDEEA8CA3C03247EBC2E2CC4BD795F77BD661FA03B101DB63CEBB24ABCBA34983AB64DDDBCA287703D2CAD19BB6A8FE9BB4A296BBC03F407BCA4F8563C20921B3C08E1BE3CA874993CC3F4743D268984BD146D77BAE9C76B3C6BCB943C8DAE7F3D9ED089BDD900D1BC63D79ABD2F19533A0727893C82DD9E3C9C574DBB06F8B33C33A697BC22894E3D1041D1BC568D09BD8D29DCB977A9643D045F4BBCC9A5DD3BC912B33DF14B103D5C933BBD903456BDF2CFEC3D45DB1E3CCE8E1E3D372F683CB078613DF8F0E73C9E8BBDBC02A9283DB437913D9F8831BBD5C4E1BC643CAFBD3F6F25BCD269E4BC76FCB13DB933CE3C962EA4BB29FFE7BCD3C2933CD9D1C5BC6AA790BCF9D7083DFAD4853DC6F67DBDFCCB703AA1AF0C3AC6B9B03C8CCC7FBDEA03203D049F0EBE0EE6D93BBBCA50BBF531293C4EA405BE11C9A53CA6A89A3CEF4CAA3DAD0D013D4DC999BB3797ECBC063BF3B9708B8D3D9BAECA3DF1FCE0BB39AB603D5F9DC33A85E6E9BC5AA82FBC3BCE7E3BDF7E66BD4F6F91BDE38D16BD4360703C59B9833CE906C43C523217BA7EEBD63C626E4D3D1BDCA7BCD04BEBBCB5B1C9BC556FFFBD98E71CBD64D70D3DFAD52E3D9E4983BDBBD14CBE7D9EBCBC11D9613D89B9A73BE2C4F53BE52D083DF2C5923DC5E1D0B3DC740ABDA7A03FBCC10D303DA8A9753D69135E3A974736BD91DC413D5C5028BE6B1AEEBBF285603DC62B49BC4D67E03B890E21BDDFF5D3BBB21F96BD509166BD1773C2BB7892903BD41508BDD32938BD272C7EBC79904B39963E803B20A70B3D6A367CBD5F01EB3C7E4C96BC88FA66BDFAED5CBD788459BDE4496A3C6CFB50BCF44AC8BCBB98C13DF65C7D3B142F953D8BFC0A3DD50E3E3C24D0983CD0ADEFBC25DC333C2552393CBCB9D6BCADEB6DBC066923BC96D1A73D478E8FBD603B053D10BB183DDE5A60BC4814CBBCCA69CE3DAF537D3D504A083E15DFFEBB55E848BC4389203EAFB8DF3CACA54ABBDE3F82BD7DEBFB3C1F6EF4BDEE0989BD8A46653C1E4A6ABDEF92623C415E2A3D037BA5BBCCE07EBC824BBBBA311105B
D5BF8253D9F3AA43CAB0347BDBFD038BED5F888BE1D3333BD9097A0BD88DE9ABBBA2B48BBF03ED4BC77BE8DBC075DC436DE16883B3A871FBCE927013D4E510ABE39CF8FBD122C9EBB8362FD3C9AE9553CF582163BC1DF6EBC8428AE3DB080F83C7E5CC13C88C861BC94CC3CBDBE4ACABCEB3472BCE3C1F0B905A122BDAAA68BBDBB1D07BDA0C098BD92DED4BD88D6D33C27089ABDDEC9F8BA4DB7803C166C0C3DC491653C1A01153D5F22A2BE9A76F6BA5731CE3CFEC5B43CDA1412BBA85C053DBDEA18BD2303F3BDDC06893C327F3EBE6760CABD5692BDBC9D3109BC56B14BBD7908703CC528BCBC9078C63C316B1DBD781A10BDB78582BD662D4FBD2A3655BDDF052ABD5F4F06BC4EA07B3D15219CBD7406253BBDA2A7BCEE4548BC24B5A0BD03C662BD2D4E213C75298E3CEA1B16BC008919BC9FCC513CA3D90FBB9E3070BC4AB272BC0D83DE3B7F8A653C6EF07A3C31B1DFBCD6CD3FBB39F42EBDA6611BB984567C3B8ECA12BCFE758DBCCFF6A8BC7FF97EBCC141D4BC402F723A9C79923C0ACACEBCF853283CA90D843DCB5C9BBA06DDC03B75C2A4BB744907BD172CFB3C52547ABD7738DB3C84923E3D17EEC239A38FEABB125D2FBD60D0B43B25DBC13D849E933DAC931E3C66C5FDBD09C78F3C307357BEF82DA0BC474EE3BCE8C31A3B2A9F0ABD4315E5BCA640AE3CDC08BC3CE914943D72C31B3CE5417DBDD0EE833BEBA7AF3C85D559BC2A11CD3C2142C6BDB9B5E0BC8E8E8E3C15ED2ABC4FDD3C3BE363F4B6129151BD2763213D2F3482BC0DF2433C90D3043B39198C3C4F8A9BBC2A24D7BC44CC0ABB5C7E8FBDBC26733CE1A535BC386CCDBB038937BC407A1FBFA597C83C9E5BFA3A7CFA96BE0DBD10BC671D68BD0229133DA5494E3D296414BD62E677BD778303BDAA0371BC4BA4C2BB0F42EB3CE6B2503DB69FC93C742523BE81E2B23C39E0A9BCD9E77F3C5800DF3D0AD10F3D07D9EFBBAF59ABBBA3A4D43B8574D03DD839983BD8E5823DCFA82FBDBFCE663DE2E59B3ED216F43BB758C4BCD543D3B1F0D2533DF3AA1B3C8DDA13BD604D423C15C526BCC8E0E0BDC522033C3FB2F33CF486D03A9E9D473C1904CDBB6E8A233E18FCDC3B7786813C15F661BB63FE85BB33BB34BD5EF2E2BC826F4B3DCEAA263CEEB65CBC45B55E3CE83805BDBF7EC53CD2F3CB3C232180BE3A5A193C923E263C0045B13B0FCA3E3C9E7AE8BDCCF3B5BE4437FE3C705420BC339870BEE306A13B3A99073C4D5B88BAD98404BC95BFB8BC0CC92A3D53B23F3E059DCD3CA72A093D8CCA91BD481E13BD38B11D3B4F0B26BC1E9FE63C2B03023C1898C53CC7A899BBF851EFBD4F32863C03A911BDF3F68BBC3D841B3DF137333DA448A13A8CFAFFBCC53CEB3C613E6DBB614C263C761CD3BDAD01143
CC474D4BC3E41BC3B16C02BBC7B1B923B402D013EECEBA53B697EC53B383E4A3DD1922ABCC9B388BC8F85BD3CFDA593BD258C77BB06640FBEE8E2BABBB2B18ABC7B4C7F3DFCAF4A34FE8127BC958339BEA547513C7C308E3C83BD86BB67B9D73CF30AFEBB9DE0BC3B0BABB2BCDCE9A03B9E87853DAFC0A7BC6621913DFCA5A1BC6FB6283CA0EC29BD344B493C0E8B823CEA08563CD4B82A3AEB01C1BC5735E5BCEFA4D6BCCB79DCBCF53B2E3D14C32EBB97B4CF3CF7ED57BC0E22933D8FB2C1BCC3EC983C1457603A99D7B63CB6D2AABC4FA139BDCE07293EAB790DBCB58F563C67A104BD447EFFBC020B2ABC1DA5ABBC2BF6E63C7C54A23BBC36C9BD474A803B9EDB9BBBAE17DABCF77938BDBE6B29BC381C59BC83A69EB96C12393D367F90BBEA7C213D1314DDBC99FB2EBC1547133CB480053D08551EBAB485AC3D5DB3703CB02A5CBC4F7D4C3DDDE2833C9EF9E2BDCA60453C54DCA5BC086C36BCFFAB673C17F0023D2F6C6D3BD2EFE8BD535531BD6BF6EABC9BBE6338EAC5B5BD6EF1493B8225CB3A683BA4BDE4BA433E3465953DFA1EC9BD8E97C7BBD6A34D3C435505BF8D05BEBBD7C6A73BC56A7BBD578614BE5587513D237C7BBB59A218BECB72C73CCBCD10BDE679123C34FB043D540D193B64FFA83CEE4CAFBCDB96A63C85AA15BEBF77AA3DF6B103BD30700DBBB12D023EA08CDD3B667D0C3C445EAABB7C49E73C3971843D9C56E53C34F9913D1B9903BDD88DEFBCB8B1DB3CE3CBF4BCA1E0CA3B8D45243DABC5E0BB214968BD9537A9BDF8D4D33C3C2F80BCC6035CBA229013BC2967D23C9655A9BB1CF5523CE381923C71654D3CE3DE173CAF708E3DCD33493CFE77A2BB68B0763D25A1923C0EBCF03C32D58BBC03952F3C91BD6A3DE6139DBC68D081BC7E6BD8BCFC4E2FBDF2CB10BDB97683BC6808243D2FD513BD86AA47BD7CA4F1BD6BEE083C40AA613C392C79BDD614A638A14B133DB9E573BC8684D0BBD3F1BA3CB4C4253C7D423FBDC5693F3DADA472BDB7CE73BC9319BC3DF1B0043DA3EE2FBDD502653D9B0CB4BDF1100D3D9A6547BD349E7EBBB78F98B3EF5D1A3EC8AFDD3CEF594FBCE195A03CC1E21FBE7B7525BE73D8563DD066913C19290EBDF21FBF3C6C951CBC453BA73D650514BD74B36A3C8956193C02DD6B3D6ACA8C3C43A6153C233926BD17C0FBBBF39FE53B445457BE91CB8CBA6227FC3C732C3A3D5B62B53DCED5623E7ED9513DE4C1873D7C6F60BDFE193E3D6272683D440F803DAB3F00BB588C8DBD24F39E3CBCF3FB3C6CF30D3DA33EFBBCFFCEF5BD3DE070BCCC04B63C1B1CE2BCA733C53C32E433BD3DB66F3DBFABEE3B89849DBE239C08BD495535BD417FD53D12A020BD665C1D3C8618503D024D4A3DC34045BCB9A9E63D1BF498BB6A4F18B
A1D33A83DB4292EBEC8C5DCBB647F0F3ED2CFCF39709C94BC1C6C013D7AB6C83D3EFA9CBCE4136BBC1C8B343CD716403D39C90E3E4990FEBD69E1453DE835373D7ABA0B3DB13BA53D5D3763BCE8F9833DF276BE3B111A08BDAA22253D50C08637930FEA3C47F3EA3CF6B6D9B9EC00BA3D2CE677BD52153A3CB5E648BD4E0D123CF454023D9D18AE3D4E86A4BC719B36BB158C953C346C8CBC79F305BDD737013DEC64E5BC40B66E3C5DA77CBE82A5A53D3B7C043E98693ABCB6C69F3C87B82B3E58A2AC3C04C3C9BA9B50A4BCDEE2463CEA511A3DBF8A3B3CE95E8C3D2F2609BAF44B723CE206B83BA2158CBC3017373DDE9707BD0D2CB33DDCEC38BD097C0D3D023A16BD4A5E163D983EECBC7C254FBDB0AD4A3D7B0286BC68FEABBD82A8D3BBE2D2C3BB203E2C3C0B0703BEF6995FBCA5D4BD3CDF079D3CBA0735BD9C285EBC1944E93C71B600BC9CC3DD3BE68903BDB012FF3CEECC33BD39A3CE3BC3BBFD3CE18B99BBFF4296BCD6B7833C80A3F23BF1C3333C512F023A0A1696BC41207EBCA43478BCEEA0253D6560A6BCCC6AAA3AD64D4C3CE3B60F3CF9DF713D3E9DB4BC58304DB9E9B488BBD9B6833C5875523C302FAEBB061C513D5BD81ABD8D5B94BB9C51233DC9FBED3AF02127BE3F5D9A3AB316883DEEBC92BDFDF0FD3D694D203C13E72E3D9475BBBCAB19B9BC73E94CBEACF8FEBE5DF02ABCDD1909BDF1F28D3D1675093DC288ECBCBA08B73C0CC51FBD6F8CB83BA558163DE5444C3D5485863CFE87ABBD3465873B846BB3BC377695BD16A8463D7BFCE13CACBF86BCA44E27BDBF2FC93D194874BDEFA74C3BE87A1C3A9B0F3938DF908EBDF3027CBD0BA8F03C643EFD3C0FE585BC37C244BC8BF7F73B73CD76BC10F9DEBC78E0033B61C351BCC1548A3D1773173D99042BBDE099A43DE9B12FBC3FF622BD12522F3C6DED953D25EA04BCA248E4BB9A9A0F3D0A9EC3BB71574A3D0B43233C6FE3E03C3FCB68BC2DB283BC835B0C3E77E7833E625CA4BDBE6DA4BCEDAC203C44E4513C8729A0BC9CD92B3DF0C300BCA631BE3D29070F3CBCB2A5BBF6FD833D264B34BEE5B01ABDC704593EAB8284BC09B04FBDA9FD0ABC9F1DC5B3587821BC336AD4BC125866BE48AC00BD1023553C7319233D3AFB653C0F8FDD3CC59D373D92E0943C151C25BCCD30B4BC2F1372BCD2A471BC545431BC291B8ABC85B0AEBCBF4813BE912ADD3D03491BBB2426ADBD169AA0BBFDBB93BE78DA5FBDE70F183DFAAA00BEEA56B9BB21151FBDA67890BC0DDCC2BA7ABAB0BEE71401BCEE11BEBCD1ACE3BA4E8A783C931AA1BBB51874BBDD57EF3AE393CABB3640793CDC491D3DEA2AEB3CDA50BF3C2C130BBC1FFE72BDF8B0853D5996D33C8A5830BCDFB7833D55031CBFEF77AF3C14014E3D12DDD33
C5430863C5D65383EE2C069BCA31B203E23735FBBDE2B19BB4653B03B4ADFBA3B425C853C3E75C53C2A00583E1AD8D3BC0548BFBC3D4A513B627577BA1E07A4BCF47F54BE4532F5BCFBBBBB3CD388A53C3BA3D0BC8269A43C26C655BD235AEB3AFCAE16BDB6A9D23D1D5427BB47EEA33DBCEF98BE67D1A73629F018BDEAFC8ABDA74982BCEFBED1BBCF745BBBD44DC53C431FB3BBF1B4A83B41935DBE03EC30BCFDA3823CBB1A1E3DE0CC953E49AD013EDF20993CCD48703C503EA9BD8A17C63D7868FABBD9A29A3C8B8606BDA6F587BC0EC92E3C9C9C303C535E3CBB2316B2BCEFAB613D5EEA703D0A60CBBC6B1FB1BC281C5F3D99F9353A5395BEBB84C8203DA34801BEF3C108BDAA3B473D6125B5BC51F997BAFA3E993C1477C03C1B0CE63DA977153C7F1B49BCEABBBE3DE1595C3C7D1DB53CF2ACB23D1437CBBC5356B53C86DA093C0EB1E33A31FF4E3C8385A5BBD4AD5D3E61952FBC4439F13C1C9C69BC9DA2BABDB98CC1BC79C40CBADCC16FBC365368BCC995003D34916DBDECC7E5BDF9AD83BCC2FD2FBD922BC93C9A087DBDDF80313BB7D0C83C34C47439BEFEC63D1DFBDB3DD330983A218D1BBD8EBFF93B3822373EB56B59BC81983EBD9D8966BD62591C3DFE0F1ABD23BD1D3D9FB8703DEEB4B639BE9FDBBC3E2EA93D0BF03FBD0A6B63BDE2458CBBA2162BBE37CFB03D6B4F2EBC04E9BFBDA0016CBD0382BDBC6130D4BC8075C43D9B5FE03CA25A5C3D4AF0233A2FCCE4BC6648CFBCC4043C3D36BF52BD14CE213DCC3D1C3D057D523AA8B454BD46C98D3C6A6DBFBB50BDBCBCBF143A3DA08C2F3D017D93BCD75BAEBC957643BDDF5B06BDD3A859BC9AC9133D1FEB54BC0F041ABD9D441CBA97310CBDD915EBBD3518833C978289BC9CF106BD5292D8BC9D882A3D54963F3DB43D03BD320FA5BB7CE128BC0F1C37BDB8A46EBDBC6B97BCBDE3BB3CAC9C32BDCBC830BDEEB5B13C65C20C3DA759B6BC103194BD4B92023D5F72CE3CD5B80DBEFCAAC1BCADDF853D3068763DFC5C353CEF82F93C72E3ABBCEDC9F7BC0C6BA03C9998F7BD5352183C92A7ACBC82EAAFBE0E63A8BC096CEC3C12BB073DB19F16BE23E1E6BD770D6CB93ADA2EBD5E16663CC11F54BB37C9B63BB3BD103D029D7FB3C7EE943DB49ABABD32BDE8BC686412BE29DB4DBDB15D573CA21E4A3D39209C3BDE57D33BD91A0D3C8A1FEA3A358AF73BF923CDBCE54F803B6727103E368B12BDEE60573C9B7CA7BCDFDA143DC345253CA546EC3BDCD66B3E18F95FB9143669BD2B00843D9D15683BCAB4303DBE6592BDBB8A863C7C3586BD085ADA3CC2931B3B90D0B2BC3B174DBE555A483C4AD0F33DB51F1E3D04AF10B9E7689E3B336B6F3C8243113C599594BCE87DE7BDCE2D91BD4CD091BC6279043DFA30093
E80DA69BD6015483CB41482B927F6A1BAEDFE6FBC3AAEE43CB94BF63D35253D3C824416BC2478043EEE8E74BD0EE8993A6BDC40BEC6A7153DADD655BE952200BD66A870BCF3BF13BDE9594F3CEC12ADBDA25FD0BC2B201F3D2E1AFBBCB822CC3D59C432BC32FA843D60C539BA45DDFF3DDEACD4BD7518B53CDBF9E5BC2158233DB4CE313AFFC2B23D82608B3B8C7C1237D8A71A3C8C0D303C63C9B73D41FE1D3D58A19DBDA60DE63B3D7AA83D50B2E43C436FF7BB19A90EBD9CD43D3D58EC30BDA8E0BC3C307787BCEC6068BD7DCA033DEA7B813C6535DDBDA831CABC58E9AABCAEFB48BD1F73733C887EF3BA530C7EBB316F1FBD8ED71FBACC471F3D889F32BDF4C55FBC7DA27B3CB2810CBD146A223B355C313D6F28193CF4E225BD546304BD6D0C9DBC54D8583D0EEE893DEB59B13C5373213CC48C873D0ECC4FBD8C558CBB5C31183D393A903B691D3B3D3FAD93BBE2C7EEBCB2E7DFBD3731CABC7B51BFBCC644DB3BD80D11BC6EAB103E2E1957BCE3B0AC3CECF067BC5C423A3D47BFD1BC60C96DBAC98884BCF571B23B14B05EBC7D84FF3BC28933BD59951FBDEC1910B9A209B73A5F38C4BD4515D13C044DBBBB7B6FF53C9C2DE53C9E07A33A84B48CBA78340CBC334EB7BCBD4476BDE6DD6C3DAC5FACBC754A2DBD212F96BDC86F07BC160E0FBCFE3C0DBDE3DB91BCB8267C3CF9E0C73BB0AA10BD064777BCA38DB83B23B315BE546D1A3DB2AB58BB6FCB9EBD5F49EF3D6913413C90201EBD5A5BA5BDE940173C48D78EBD144491BD9C4BC6BCAC9FCBBC24F10FBCFABA87BDA4C358BCE0E7223DB00B563C8CFD66BDB310A53CF4321FBDB371A7BD3D30123D7557ED3C11BC9A3D200637BCEB15ED3C1630A7BC3C1395BB6FD7513D06749A3C3AA03EBCF609ECBA3AE039BC419D8ABCA8A8783CCD94F73C60902A3CF581CE3BACC81ABEA30BAD3C85A27C39471A90BCA334E5BDC3A12EB9EA27393D42820C3DAFB1213C80FD6BBD8AD51C3B5E410CBD765F1C3D00C676BD96FDB4BB6041D03C773561BD1884043D5AF2BBBDB74A79BE7BF3823D6DA4C6BC981FBB3B8818AEBC2EF463BC8620483D467191BD8A7E9F3C4550B2BCE55E933D8900AB3D002C8B3DFB99F4BB4947AD3AE273153D596C113B699A12BC0CDD7FBC30633BBB03B5AFBC78AB85BD30C3CEB32E0BD23C03F5FD3C61EA58BC16B8BC3C1E28EE3C6ABA0C3D976D2FBD4A31B83C1453843CB0999FBDF5C32B3C06D6DBBAADD7D83D2B57073CE9F03D3E8C4629BD2ABEE43C9CE68F3B637EB43C85911D3D36DC103CE771063EF2E168BC2165BBBDB9C5063DFC0385BCA7CB3CBDC3985E3D70B65D3D7C508B3D6F2AE63C5897383BA6912FBDB0043CBE78749C3DDEE6F73D83DD813CFBFB58BD65B51ABD096C163D792233B
C727C9ABCF23DAC3BF166B7BD5C75F3BC769C40BE4BBDE43DFE6562BC1738D8BBB0852EBC59C885BD448780BE0FAA94BC9EB747BE7F2FC23A99DC393CEB6CD23D32F6E03DCCBF65390D4F7B3D537DF5BDD2E7D8BDF1093CBC5427123C68FA093D68BB173DDABC263C83958FBCDC1A9FBCC6E49BBBFAD2763CC85E55BC89188F3C31BB50BC98B7243ED3BFDE3D6692A23CF070DABCE1C0353C2BDF573B0A165DBEF01196BAA06B92378AC43139896EEBBC4EE376BDAA80643D4318A0BC3B38653CD01A27BC96B5A03CA5D540BC1C34B0BD5564D6BB9619A63DD5DD1C3AD524EBBB4791103D7C5D9D3C5AF0BF3C80D13EBD3A210E3DC388713DE8BDBBBC2C7DB6BD244B403D1A12DFBD6F3DB03D85EA30BCDAA579BC90AA4E3BFEA737BBB9E5A13C538F533E30E09D39B7F856BC643B8C3BD740313B43570BBC88DF0C3CEEEB673D6E0C9A3D214F433DB5682EBE343C073D6AB5963A40303E3C3A016D3C1AD2E63B30361CBD1E982B3B592524BAB159AC3D899621BDEF3EF2BB6926E03A7E06AEB929D7373CBDC028BCBC7B9CBAAAA871BCCC6A97BC1BD6983C08619CBDF642043C3B60583AEA7694BC9DEDBABC82B9643B60FF95BA8C67573A034026BB21EAE43B119BCEBC57C9263BCCEA013DAAFBF6BABCF0A53B817ADDBA8A8A2ABB411E913BFBDA7FBCFED2B13C0723A63CC037C33B005616B90544183C01B66E3C2031D8BB49306C3BDFF937BBF0C70B3C3196BC3C9AED1E3EA0C94CBB8F428ABB5EB0FCBB999A2C3C4CC1BC3AD382AE3BB43B843CC9A4E63A3CC5633CBB90C33C45D0683C5F45ECBBCD7F6FBDEB4AD5BB2B5AC83C5F59033CFE5E81BBC14DDA3D3E5A5A399C22A73CD8E094BB8225BABCB2A1683A4F7EE53B9F991BBCE30AF5BB38758BBC492B0B3ACDD2B83D5B28343B1FEA48BC0CF0A83CA2EC3DBCC813B0B975AA9EBC54CBD7B850CAC0BC81D43BBC54B7273DA39CA03C4048AE3AFC1919BB201ED03C6B5CD6BC2683943BF5169ABC365938BCAAA2353CB705E7BB6FD7183DC132813C49B88ABC07F6D8BB03130D3C3500A63CF5D0BABCC12457BCEB2B04BC22B8803C2BA0CDBACD7FDDBA000740BB64624BBDB93E00BD670F32BB6C4F7B3B5212D63A6B21E33CC26E61BDF4729E3C6A2B12BC1A9619BC5D05023B9CD79E3AD511AFBB048E063EE68C2F3C54EDA8BC958D6E3CAF8A033C1F71AE3C96B17F330119C03CDB9C1E3CB011C1BC12ED8B3B58A08D3C419B9DBCDFC4003CB7F2AA3C35F251BC6E6312BC2529D43B08613D3B90E7C53A4F36F2BB204A183C500AB33BD3D5B9BC5CA2573DBC9B8A3BDF7F84BBAFAC833C8432A93B2CE1CEBC8743DB3BFE14323BEC7D8C3BB3D127BC7E1197384953FF3B4F033C3C984793BCC27296BCF5738CBCAF8648BC3941C1B
CB3FCC4BC09F5A4B832FB113CF30596BB6BE91BBD5D4D9FBCE9867039DE7A9C3C9C5EE2391E48713B45E799BABDE505BCB2DBE33C30C237BB270E133D47A2893BC6CD87BC716C17BC24E20ABC7A1746BD5A00153C1BD3013CD358D1B91201B0BA80351F3B296709BC89D1CC3BD99ECDBB399EEABDF485C5BA2781C2BBD25393BA79497C3B9C24753C79A1553CD9A864BCA945853A97C46CBC1519E6BBF4F3BDBA66C0B43B0F7B77BBCD62883C6A0AC4BDAEEB6F39878409BC55A34CBCC33BF9B59D69D5BDE0798C3C9FBACF3B6B2778BB1D36EC3B76D9953A59CA9A3C919827BBECB4E43D97BB35BC2FA5B83B06613CBC51B3EABDC02265BDD80A00BA1AF82EBC414B0ABCE36C93BC3679D53C329204BC9461473C583160BB3E6496BB8B6268BC2E2E0E3CBF89853E703642BC4812933A8E2DEBBCD50FDF3C51DE24BC2A358A3A8192113C71FD8D3B00C8643D2B9E06BCBD602E3B6B2BAEBACC128537313A883C462649BC51515DBBDC32F9BBC69814BCA0E4D43C93539C3C9E5B433CCB0A8DBC09ADE63B2FC8433CC4AD5B3C0DB02B3CB40924BBB068D8BB1FD897BEF0F4313C01C88F3C120DA2BCC93ABFBB61DCB43C6FE4F33B4C5E693C4F7888BCD681BEBCD64FC63BDA7CF43CA9C2D93B59C2763CFF82B9BCFE17CB3C51077D3D59BCD63B6B4BF9BB630C74BCF40816BE4B5D93BA3DA3393DB130DFBCDB8879BEFC4D11BB20BC96BD32CB903D10AA23BDD23102BE657232BD39493F3BAA740A3D534C553BF1C504BED49DDDBC1C5D313CF240C1BB5D87333DEA39A4BCD515243DEA9E283CE02FABBD73039EBA300C613D9302003DEA1D023CE19D063BDF8B7BB91C7DEE3D4E890B3C976F05BD8DBA3FBD3DD1DDBC7DAC443C7E29B4BCE33829BEFFAE11BCE8CFBBBBC67BCFBDBDD905BDBF6D83BDA517C3BBEA9D203C1299DE3C36E8A53C714994BA8A77F93D377F373D2CBFF63C5E8F463AC5B9B4BBFD153BBD6989563C63925CBC66D2B63CE6FAACBC401511BD8379813C08B9ABBD636CEBBB5FA07DBD5F38C23C58B1F83C83D0BC3C245EA3BC4FE024BD7973503DCAA7D4BB140707BC2C9A5ABD103F4B3D6297B9BC731D843C3040EDBCCD040ABD2A74663B216C87BCDAA7A5BC7EB7C63B50907DBC0D1ED83C692E1DBDAFA7B83DB91728BC6B5DE8BBF81E2A3E40090FBC6DF1FE3CBC0A8EBB6CD08F3D46A0823D4993E2BC4379DF3CFA9811BDEEBBD93CFAA7903B224B04BD7F514BB20038163C8CC17B3C60A30A3DCA00823A4DE93F3D269EBA3DC86FE9BD841164BEB14179BA6F56A83CB18907BC7B26C1BCA3069EBC8DFE90BBB4E72F3E20564ABDC71393B96F620A3C79F60C3C57F286BDB7250EBDCE16F9BD6B672FBBB1F71A3ECBAB04BD01E7013DF73A3ABC5F02733DE5BD4EB
C3E00C9BBE927D93BDD78CDBBA39C313D0ACF0A3E193863BD3D6F683D49466BBB34399DBCB61244BC915E883D73AF8DBC8423033C282798BD0F19973DEB9D193D01380A3E8921A7BD41DAAF3CB5E3A13BE273B93C6BA69CBC63CC06BE7470383DC4B004BEF8018A3CA8750DBB2539FF3DA77008BD0EAAFBBAB98920BBFE7E88BD71AB913DC996313DF9AE433C91662F3DF6A784BBD0F83E3D8B13C2BA4E61A1BC8DBDCEBCA0E8F6BC897B67BB438962BC8F3F213CD2B8B9BDBFA2203C6E13ECBB7E2E353C3558B83B9D0808BBF5CD623C9BE99E3C526A0E376A5E033C737C22BCC11644BD1ED44E3C97AA7C3C317F10BC86CC5DBD0A7BD93B641494BC80CACF3C6CEDF4BC47961EBE985C46BB3D8E7FBB4F0CB73DFDE74FBDC166A73AABE3613C4239803C260C78BD81724B3D99E0ECBCD7975D3D7DE9293BACC5743DE08FFE3BDD47AE3A63F5B33C381F753C554C2CBB1F1D2E3C9CE9E5BA9A2C99BDBFC9AB3C0C43D33B2518273D60ADB83C126781BC47EB00BE2A71A0BC22AAD03BA65336BCB78503BDF51E963B955AD9BCA53144BD449953BD3207F9BB404ECB3BFF62FA3D46E0ADBC93EB81BB961423BDCFE648BA25BF06BD00746DBC3C8A1F3CE5C2D13DBC68DE3DA7EB9E3DFD6EEE3CE26ACD3B961BC4BC1C1A84BC3598E83C543B0CBDD2AEF93CD39C3EBC59BF84BC09E750BD66206ABEAD620ABC8A2D193E73A620BC17BC8F3D89EC893A85F21A3E994FF6BC105FD63B66F8713D81A2743DE295C0BCF0BF9DBC005CDCBBF0BF0CBC444B083D574EDF3CF4BD94BDC6A8A23DBCD9A0BBC2CC2D3DF98101BB93ACBC3DA17D2CBCA952A2BC13C926BDABD14FBD673B74BDA974043CC0D0AA3CCE85343CBC3AAEBD08FB9D3C82530CBE97793A3D826F77BDC46D7C3CC31AA03C9E95FABCA5BDAE3CD3ACAB3D21B28C3C082DB3BCE5E1223BD284883B32505F3C857A963C517FF0BA399C853ED96737BCCCBBF1BC0F7161BD94A92A3C9E3E353CB5B6803A3761643DEAF840BCE93848BCDA91B43CEEA77DBE664B2BBC102CE43C04BE36BB75BD71BCFB6F183DDB3C46BEF49576BB43C69CBEBC9F14BC9BBFBE3C5CF8FB3BCD661ABD36A3F33B0C2818BD566016BD3F5B9BBDCC7A02BC4B3D253D0EC9DF3D30BD293D4E8AD93C97E44FBB298F933C87AD873C23A5BDBB9895093C8839653B17312F3D49BCBC3CB86190BAA65157BD1F44E63CD173FEBB01C602BD77D9CCBDCEFEB93C0979CBBCC5680A3C7EFD86BD1D1E823C11BD1DBCFB908ABC819CF3B24A40473C570AECBCAF1A95BCF84494B9D447A4BD2562B33D12FF7B3DAC103EBDE428BEBBD1333E3D505DCE3B1B8BD4BC1E00F93C90F2A2BC1E7BC03D2F0C4DBED587863DB7FFAD3C5E079EBDAAFEEB3C636D403C99457CBDDF8E8AB
C08D4623D714BB03D32D1B43BE7A2A53C63E92E3D6E11073DEB07A8BE20CF7A3C1375EEBBA3D8C6BDF670623B0DDC4B3D411EDFBCBD3217BC4FB7AEBCDCB9F83A4E04DABD35B09EBCFAEB3BBC4C41D1BD3CDA0BBD537E02BD4434B8BC8A02A1BBA44AAE3CC1CE53BD8F96BBBC7C48173D9AFBE4BCE0C6A23CC31C19BC42FA8EBCF6F2043C2D7BBE3D1F09653B14CC0BBBDA3B2EBE754E0E3E6524D83CCC2C293ECD64323C4E70BC3CF4B0693C5216B73D8D172B3D0BD0DF3DD209063CF67B83BC1001A6BE9A1DB03D76BC403E1201D2BD2651CFBBDFB7493CFC511B3EF03B9E3BD872D5B75548A13CAD82283D3C5B0037E02F3E3CC3974E3C8FEAE1BC0443843D4B76D1BC651BD43C765F8EBD8203013C1F55DE3CA09134BBCF62403E211FFF3D3B28F93C9C55DC3DCF94C6BC46FBDB3D8F29DF3C5DDF3CBBC85686BE168A493CBE22093E6282A2BC2C80F7BBF93C0A3D9D97C5BC9199F2BA2B72793D83790ABDEC26EC3CAF0927BD4AF5C93BCDB71A3B6F610A3D58EB4ABC53F3D73C4043BB3C8D83AABC7F186FBD6C9408BE034A04BDF2DB2F3DBDDCAF3C58B206BC11655EBC9AC3903CB5AF293D849BB0BD252C413C01FCFB3C7AB17BBC720BD03B9213A73D24E645BD9706A4BBC055FEBC27F0A5BCF90C393D88F1AD3CD55747BDEAE826BAABDD41BC7089B63C61E584BC270C033D1966B03AFCEC023DFFAA25BD26A3BFBDD61A9F3B007D283EBE626E3D3370543C75DF103DF44C65BC74F9EE3D3B073539B504FA3D215FC6BC0A0C943CB00E083D1DF390BDC9246F3CB5CFB9BD485058BDFF61E73B6B5BF73C165D2D3C6235223BEE197ABAD0FF3DBDAF31253A633ED23B1760713D93CA5DBC88077BBB16C022BD645824BE4F15CBBD63DA903C38FA65BD9BA32CBC85423ABD9ED6D03C2B5DE93D20BF603ABD05E1BDC19E4DBE8043F13CD12CF63B4D361ABC05C8E43C53DD9F3CA928B6BCEEB047BD3E1A8E3CBD6D18B9761B613DBFB40BBD25FA473C98A1813CAB47083D0EAABBBD078FC83C14ABA0BABD3AF3397EC5A4BBDBAE923DE8E7713D5D74803D81B07DBD853660BDD9AE303CB8A31CBEECA191BE451DD53AF64886BD09C7F7BBA0DA55BE298181BB914384BB3F43BC3D277184BD3D28D03C3616EBBB568D85BDCC1754BDF1FA83BCA169083DDBF6083EC5DEE53D47A391BC4994A6BC08A38DBDFF1A05BD4327863C3969EF3C546B88BC1C88A33CB7C6253C2E964A3DEAD39D3DFC63593CFFCF753DA325D7BB42E8E9BD18684ABDADD1AD3A198E0CBDED2197BD8906B23B5F2934BD1BE6483B68415F336816923C2732B53B385F2E3DF2F2ABBCCB51DCBA4FF6CD3DD5771C3DE488693D882C093D695F863D792224BC43698EBCF22D2C3D317FB1B8AD9C8EBD561D58BEF2D1A6B
C5245AF3BCE37ACBD0792EEBCC19F103C084A943D223D0E3C5DC8EBBDBE90173DA4C11EBD1BF43CBC8DD5403DA9543CBD0CA40ABE25D8313C049A3D3D3390173C9C3F04BBFA1E51BBD7894CBE76ECFEBACAD50DBC0414D5BC1DC368BDA0F1EABCD418163CF6548ABD18EB1DBE05E56E3BC1EFB3BD14EC3B3C1EC3933B7BFB05BD3382713CD4FEE63C54F01A3D24A601BB18A7A3BDFE8542BCAE29C7BB7843EC3DF140BBBD9CAB1B3BF783933B2197B3BCE3D5A93DCCDD9E3CE1181C3CC05E4F3C31EDC6BDF32DDB3C44A87D3D6E501D3D47FED03CBD80C5BDFA8ED2BD3479363D6124193D36E02ABEBA2323BD2361A63CF71A483DED6541BC5C4AF4BA287B933D6B5EFBB978DDD735D35D0CBCB8BDCFBB72DADD3CD85D973D687C553D1539C93C4C07B6BD8018EF3C9174493C6490043CD720653D31096EBDBA4791BA497F223D6EED4F3C8036A7BB83C810BB83362ABB51C38FBD92C9943B0904203D59F64E3C4974113D5B2A293D8AD7BCBCAC2B4DBBA9B5D4BB4006BA3AC118F33BF9D8C1BC9A062E3CD9FB91BAD0EA5FBC4ABECE3C6024813C624AEE3B1C68523DC892C3BCCA5D3CBEDFDF49BE717BA1BC8AE6BC3BFB92A6BC99A31F3C0E8225BD4F7763BDD1B4DEBC3CD843BCA3631C3C40E944BD2578B43D5E27773C5D5169BDFACC603C2D1BC3BC1530CEBB0901F83C603346BCA80830BD5707373CA09428BC713DD53A2E3F103D9A6A22BDE934A4BC7F5B3A3CCCF4ED3CE9E0ACBCB0BDC43C74B283BCA88B02BE63E00ABD06ADA63D28413C3DDA6158BDB70CEB3AF52395BC7F74F4BC7CB89CBD146417BAB47EC1BC9D20EBBC82C4A6BC2AD6B63CDF94473B4AF996BDC412AC3DBB17D6BC9255013E58522CBDC88CA63C3A60973C37DE73BC6DE067BE23364C3DAEB64EBD3318883DF5F0143D276BC9BC502EEDBDE41BE7BD2D379B3CB1CC09BCF9296CBD1F4369BDF77472BB4BDFE43D62D406BD4716F9BB230F093DA7AF6EBDA24FD4BC77899F3C83EBDABB4C4D92BDF9A57EBD80EABF3BAE07CCBDFFB9AFBD51236D3DABC8F7BC4AD545BCDB799ABB3FCD32BA53B8DC3A1EB4B93DBECF0B3DF63C34BC0E84953DDC37653D2397473A2FE4253C0862AF3C6E02B23C674ADCBD012C57BC1BAFCA3A53492EBDC09BAC3C790D0A3D4ABCF13D7AEA10BDB5123CBD95477ABC9C9237BBBF09003DA2D539BD3505D8BD01C32E3E68C3043CF8D467BD5F7488BBBCCE06BDF1F84BBD0279353D007141BE5EAE27BD699F1D3DBBFFE63CCEBA5A3D30FB543E5F6EF53B01974ABD359F8DBCF5896FBDEC0EACBDA66F4D3D2181433E10D8823D852314BE54B113BC2ABCAE3CCCF416B465C6123E1D3B35BCADD11BBC180E3DBC109A593B76745BBA4A27A2BD891779BDA7D0C0BC7FC885BD14D60D3
C9826C63AD6DE84BCDBCF9DBBAF7E09BE852BA43CCD2FD5BC6F58C13BF8848D3D244108BD2BA0073DA38A81BDAB25543D0F86193E7D8FD2BDED0A77BE5D4D09BE2B6FB33DC8C29D3CB8446ABC2CE30DBEAD5A863DA0AD0C3EF5215CBD75157EBEB7A1463CF0C3AE3C76319E3B2D4C08BE7EF16ABDB1457F3C2012B2BDA0DC393D0D25993CB32100BE839F0D3E49D3423DA42DF53D0C30BF3C78DCBB3CBB8954BC1B64EBBD994D6EBDCD492CBE1809253CF850FBBA504BBF3D765DAB3CD7F096BBA98C5DBD67EFA6BDB2A6843DD35457BDCEF3813C3F0B24BDD23BD83CDB31CF3D334033BDDAE7BB3C2420C1BDBC6578BCE9F1D5BC5CDEE73CD9DE953CC79FBB3C4E9158BD2DD21EBE236618BBFF2A13BE9790EC3AA87FBFBDD665E73C3134063844DFFEBC79F184BD8D2025BD748D2D3C2F5E65BD9ABBD43C6E8FC33CF5B0433B33920FBC87963ABC8CC8A7BD6BF1B1BDD024E0BCAE9F573D20A24ABA81F80E3DBB3BF8BCE34F383EF97C8C3D460E803B018964BCFE2615BDB9C2443D7BEBAABC0AC7283DD446033C44FABD3D6714543C2A320B3E36CF28BDE01974BCF4EAF73B6D80D23C8B62F7BAE541BE3DB3DC923C02D2C93D5682FABC559A14BD7279393CBDF412BD2575A13CF9F83C3DB70EEF3C9D21C2BBBD674FBD9A9CF4BA4774673BE579CB3CC278DF3D1E67113C4561BF3C6005C0BD40E9B9BB2F8615BDCA259A3C33DC7A3CD582DD3C637A3A3C0BE8A1BD8FD5A83DD346B3BB9565F73CC6F5133D2389D73C0D01AB3C6CDD893D8D89A4BCEC93B03BA86BDFBBDEC59F3D97D8EEBC6BB1113C74BF163C0A30433605098B39FCB4073D623EC8BC796E013E3195493CB83729BD25E32C3D8187753DA544123B5FF72DBDEF84543DBA62403E8B8981BC30A593BD52FBEDBC47DBA0BBC7128C3CC9B33CBE96C040BDA19784BCAC1D2E3D9540283DC31A69BC382EB3BC87A342BC3D67EABB71C4783CDA20063DD4AC403C11CBB3BD3C13CE3C594F8CBDADE313BA15DE0F3D669DBB3CA30DDC3C85E6B73CFC6CBD3C873C25BD9FBB88BD8DDAB23DA5319A3C69B32A3E14D5B03CB654013C0F53713BD282D0BB7802163B37A1FEB9C58EA4397CAA19BE2D0FF7BCF4914D3CC30BF23CA37F563CF84186BCFB3B713C841E4DBD381A4D3CAB4CB9BD3696373DF619DF3ADDFDF23C200591BC86551C3EFD17253D49825F3BE2020F3DA84D233CD2CB0FBA5EA252BCF7DB3C3C915920BDE84F7E3D4347D8BCB9278E3B1DA4843BECEB203CF6A929BD0FED83BD1CF0CDBE049A7D3DABEB0DBE1D22173DF85BD63D65090EBE9898F3BCB1286B3D1A9D95BB04B7E73D6090143DBDD9A13D7757953D5433063ED69623BE69D0E2BC62FD2ABDD1BBA7B30B95953D739F1FBC11E20B3D584769B9DC8D88B
B5047FFBD4FAC063D285AC43C1D349ABBD0C1C1BBB9E835BCC41B36BDB2DF05BA2983123DBC2BE53D018F113BC84E47BE83313E3D78D0A03D48FA0B3D3CC3AC3C952C6D3D16AEC93CA3F430BDF50BDC3DA4E7E2BEB4DF47BD4A505ABD105E28BC0550A2BCA00530BE548D903DC995D9BD72FCA63D886AA9BED302B03C81FB8EBCFDCC6E3C69A42D3C8CDB8CBDBE20043D6B5147BE9FCD17BC1ABD30BA52ED48BED7BCAD3D473030BC0931193DEB7C1C3D0C86FB3ADCC1013D8E9E513E3414A6BD28A1003D3350E0BB6F3804BCFD8E353D6EE0683DECE25BBB2E99D1BC2AE4A1BC2A3D4CBD24E1FF3CF3CFABBD85F268BC4F93F6BBF06C373C44BE203C36980C3C3D3CBBBE7433A9BC322CC13C234BE93C7999E2BCBCF610BD402CCABDD58A91BD0BBD003CE8592EBE1E9E8BBB2FEEF4BC23C117BD43331937CD32F4BC9A202ABE62E10CBC96EA59BC809E8CBCF7CB313C3D6F45BC96B691BBC099183CD7A4403CD656CBBC07522A3DDFB331BDDADE67BD0A3F963CBB962FBDEFF7163EB36A12BC2EE09CBB276E8FBBBB27E7BAA03D22BD9F4F0DBDA4B63BBC58590DBD3617BFBBAC41D33BB29DD7BC9023C83DFEAE65BD75C6A7BD0B4BD23947426C3C9707B63CBD72B5BC9F95D83C4278A4BC7B1BAE3D240BB53C9DCEDCBC7FF97DBCBFC6E43A4779443DEA0751BC5258833D9AD18EBC9372B63C8E4A173C8CEE1EBDB8C220BD8536693C3CA7273C540ACEBC3DAFD2BAFD663BBC258FC23B5DD9EC3B1D3752BB887750BC196CE93BF663243D34B8C93C83FD29BC6C8E213C838B53BC73FB4E3B486796BCA574853B6BD6C33B1D56D33BE8F7C8BCE957BABA42EEAB3C7B5F0E3D38D1C13C1B6E03BB7EFAEF3B81D103BCA427ED3C0D0F49BCFFD1C53C7E5B8ABBF484FABC66F23F3C300D233C4651BDBC525F45BCD4E13ABCF748C83C62F75E3C8B4FF33CFA6912BC53F6F0BBD4BE5DBCE95CD7BC87B01B3DC878E13CF138883C3743B8B9107B21BD3ECE453DD4B15D3DE7115EBB1E5D2EBC8087583B848A1F3C5815FFBB152294BBA427833D716607BA8288A73B78B359BBB4772ABC3C4B833C29FE95BA2F96EDBBA8B2453CB60AADBB452B3C3BAE18EFBC9D697D3CA652913B20469C3CE19C76BBA7F902BA70253CBCD97080BB7C2983BC3E4ADDBCE10A9A3DF7F4B83CEF07B13B0BF503BD390EDDBC21FA0ABDE1D8F83CF0C66DBD72FA203C0918BD3C9BBC3DBC2587843C4352E6BB1880D93A069B473C2D5CE6BCA2DD883C67F8973CCD78C5BC3AF7353CA71EF43CB8787D3C7C0722BA6987343A281E6FBD240996BDCA3A05BD21550BBD43FABB3CA488D13CCFE8AE35D2073D3C9522B8BB939B3B3D90CB74BA08FA643CBE7C27BB1084DCBC0AFB093B85CB4BBD1F6C1BBCF2E1613C3BB2113
D665732B34792C43B1D73083CFBAF80BD6EB931BCFD6A413DA490B9BAB57A89BBEDBD20BCDDB14739D3D6CE396815DF3B24B1783BD590D2BC239E6CBB6E4A6FBA294BDBBC424D833949F557BC8E85043D40300EBCCD5467B947DF943B4097A43D5E9289BC4749BF3BAEA9003CE45C74BD50A1AF3BF748343DBE26723C150D7EBC7676513CF84AD4BC0AE230BC439ED2BC6C28C23B68E510BA703D283C6E9E87BC317003BDC8B413BD2C6B92BA5D8E0D3BC98FC8BC75AD773B824C7EBCEB2E043C4476723D8BEBE1BB2EAEBC3DACC69E3B7C88263C82647EBCA18A423DBA6893BDDB48053C11B6853D59DD2FBC426109BBEAD49C3BB19E9EBC09ABC23C66A098BC002A853D44A7F23B842BAEBBB425B2BCA2BA8FBC4C02303D13D2253CA70E3EBD0681DDBC9CBAFCBC7E42033C14295CBCEB8F9A3C7EF7C5BCC45EA33CD85D5DBD61439FBB0014E23CC0CFE53D2B5ECBB693A08E3D86120ABC72F125BCFAC9003CEB3D253BA834CEBAF103173D3E07D539E65AAF3DBA6A87BC26AEB4BB7FB0DFBC2301CABC6AE310BD71019F3BA43D36BDB4DB253C0566383C7DD8303C9767E2BBB982E23CB127B8BC0944AEBBAC08BCBCF5BEC13CC76FC83E32F7F1B8F061C53B096EF8BC804FFABBF3CD0B3C3ACDD43937CE003DA031FC3BC0D397BC5463CFBCB9528A3D1F7BB8BB9FD8503AB7FC413CBD5077BC3FE0D53990D2AEBC6C4AB5BC77E2E3BD94600C3C0DCED53BE46EC4BBA6D8823C86A1713DEB05E63CD0C39F3C4026253A0D3BECBB10ADA83DDED05B3CF1E1093D891103BD0BD8BDBB68183BBD14F5E5B9F7329EBB28494CBC02EF89388178483DDBCAC43BF099C3BCE49C95BD2BDD8EBBBCABACBD9E40853D57CC13BCA1C926BD96A0AC3AFC5D26BD2848F2B8DD659C3DBC5312BDEC9B2ABC2D0800BC1F579ABC573F2ABD9D3E18BE497257BC146416BB5F3FC0B9D5BCE1BC7A876BBCEB9A2ABE20F7B338A4AED6BC66892DBBCBE910BE03948D3D7AE867BC0346C1BDC931D53C8001FD3C2255E83C5BBB093E76F9F53B9D64DCBCF92C7DBD792229BDB0290E3B60AA8DBD167C13BEF17C973DA6512A3D21477DBB558F8DBD86F42A3D589BA3BD0B5230BDB10BA03DB8DFB13CE75CE9BC3D76073C83A41B3C864BB0BC8ED44ABB132F273D5E654E3DD6BFF3BC3F004AB953F014BCDFE160BC4D8000BCBBDD77BC02E774BDF939323CC718C1BD9CD6F83BE58FCA3CA7D04FBC94AB38BE716294BCF51702BD32FB053DB025423D13CAC43C75C259BDA7BCAC3BDE29523D2114C1BC8E01FB3CD3673B3D1CB70E3C087E1CBEE25335BC920BCABD4F26503D735EB4BC842DE33CB31969BC1199F93C66F8B03D00F8F63D224A973C25D0E33C538462BD765E023DE6741E3D0F21193DB1A9CDBD002D403
D1C837FBB788F0ABD88C5433B4386A2BB42C73ABC231289BD19B2C232D3278E3DA479533C712B853CD31AC6BD58D96B3DCDB7CDBC9C649A3DE266B8BD6767663D6E171DBD1681273C7332FCBC6144C23C49BB03BC998421BDEBE0B1BDD2DF3E3CB2D09F3AB2286E3D59991F3D685673BCB7201C3E7E0D753B06FCFF3C70C9AF3D39901EBC87C047BDBE8F4C3D1FE8B63C20A61F3C23734B3DEE35ACBBEC3596BDBC2442BD8660843DDE4CBB3D0534403E013DABBDB0EFC13B79ECD7BCBD044CBBB317E0BB8E8277BD42FBD0BD187C25BC6050163D9F69F53C7AAD643CA00307BCA43074BBAD5815BE47A89EBE45AC993CBF0D2EBECB58F33B091AFF3AAEB00E3EBC79073DE298D339649F2CBD56538FBD97514E3D24F5C83CB17BE2BCC4512F3DDA2F0F3CD89E903BB4AA173C131BE6BC7445DF3C36BAAC3BB8BFB8BD226EF7BCE116033D0300D6BC7BB8B03D07A9AF3CFE9BFCBB6D7C703B151319BB519EA8BC8CC734BCB54E793781C99E3CE86A1CBC4B40A2BD465662BDDE88143DA2C9ED3C78180ABC41F4183B08C4C3BC6569C7BDE7320C3E4C4B5ABCC7ABB13C2C41D03C19CC333DF5D8353D061FE93CD1599BBD6AC8403D24828FBCFF665CBCAF64D43ADB260D3D08B392BC85832D3D13DBB6BC68ED80BC502E11BDEFBC2DBC827F74BC6F3AF13DCA416E39ECB3E7BC5067123C8C380DBD8D1461BCBDF553BC58147BBDB47F57BDF719BC3D4678383C1AFC3E3D890C44BD9D1118BD42DE843CD8AA163B7FFEDFBCAB96D0BBA2241D3CF82A1D3EB1E02BBD412E053C8D0FE93D1DB842BDA5AA623DEC8F9ABDB19EE2BCA391383DC91E083E68E5A6BDAE33833E50C327BD9FF1EABC4AC8273EFD1538BA0062B1BDDA26AEBDB9C2093EB5DE12BD2D27803D04E8D23D5F4AF83D173B81BE7002D8BDB03821BE09A777BBBD2C8E3CB488733DD0D7F93D365CA8BD559FBEBD00998ABDAA8D16BE1F4476BD7CCCB6BD69F2593E2EDF223D2B5B103E41C514BE241C583E6ACE98BDEEEB15BC3C230A3E0F529D3D3BA403BE0A47D53D27378DBE4C53C3BCBF58713D629A233C6C1B443ED7AF05BEF6169A3DEAB88A3BE6F1C43DAAD275BD091836BE78A617BD3500EDBDA99F13BC8266103E73CF8CBDE2434BBE969787BDE698FFBDFBFB6F3C5A6D813DAA18013EB4409A3E679CEBBD844ECA3DD1312F3ECC22A8BDC368EC3CB828ABBB8EE228BE015D2ABE20700F3EFDA955BEDD3155BE1B11A9BD70C5A7BD957599BDD46C41BDF5D930BE4A62E2BD27D7923D9BB60F3E34D3A3BD3414C03DF5484DBE8C6A55BD88EFEFBD070F83BDAA9DF3BD6E49DFBDB2A8213ED473453D6EB3BBBCF793F4BD73A9163EF6BE81BB9D43FB3C9AE599BD441F33BE956F27BED7144F3D1EEAAE3DE67DE4BDE16A1FB
E9D2C38BEB128733B22FF233E601D00BCEF9B1E3E00DF533E7E901EBE902DA2BDC1D5053E255F723D2E29A1BD3B6C8ABDA19945347FE582BE3B4D4E3C8F0008BE9819193C7A68123D354858BE50DB433E456C543E502BDD3DCEFD243DC12FA6BC297D533D79BA89BC0FE1193C960F2B3E2B5745BD6D840A3E019EE6BD0226D6BDBC1FC13C8C5DDEBC4346703D8D40F0BD1206DDBD41B414BE731F8A3EBFFBB13E0EED26BE0691D0BD42AA263E1019473E38103CBECF8887BE5518FA3D2B41053EB6D2F13CC5E02C3C2F747C3D3C0D4A3E9C0E093E4EFF77BDE8AFFEBD967B31BE92CC743C9AFA0C3E429E34BE55D1D2BD9AB987BEDFC5D23D1C3236BE0E6D05BE3BB36ABE2380DBBDB4598C3ED9CE93BD251C90BC300EC83DED0427BEAFE1393C618E9D3DC7FD093E807517BE90A93F3E356423BEE783353D1884A3BD321D8FBEC8F9373CB62D89BD6BBE12BE135A6CBCD8BE943E5CC0813D3EC08C3D025B093E989C7B3E191CA93D5DC834BE5BC4323E70D984BC01AD3B3E6C547DBDDE8423B85B5EE5BD12201D3E42C4303DE342743C982D053DD1073C3C25F190BD06C7503CBD82AEBD5869323EFF67213E77AD543ED618283E5085B4BD5EEBCEBC0A9697BEF4DBB93D3B4A13BDFFBF5CBE159F953D4EE133BE3874AC3D1042C8BD661EA53DBFBE923DE9DECBBC9E8F21BE0FDF32BC64278DBECBE6083D22936F3E02CF48BCD82CAABD31BBCA3C97EC1EBE32610FBE133F06BE4FCE1C3DA000793BAD1A38BE559B013EB90DD83AC28188BE5D91A5BC68D28B3EAFA6E23DFC718A3DFF6106BD56936EBDD0D637BE9D888BBD543D60BD2503943B6170F9BBE80800BC29B89CBCE2CE27BCFE07933B2F7BFFBC977A43BCD80DBBBC76A8FDBAA1EA0BBC3549A2BBA21A89BCEB2D68BA81BA67BC46D385BC1CA4563A4AEEEABBB90E0FBDAFDADCBB9BAD2E3B7AE55DBC02A830BD975DA0B83304DB3C8306EA3BA073D2BC02412D3DCBA7123D6307863CE2DF11BD9B47FF3CD1A57A3BB1A4213B8BD40CBD9DC30CBC86C5A73AFCBAE93C4A1AF6BDD117D6BB248F8CBCDCFFCFBB3E196A3D4CD142BCCD87A73C326E903C414098BAC93384BD2D43A8BD133E05BC1B0C1BBD088058BC8C153F3C5C37083DA8FE8F3B0857F5B9C52DA2BC2AAB9ABA1359A83BF24D11BC52010FBB89E02F3C0FEE863B7482E4BA157F34BB6A55E9BC9BFE3DBB8C3888BC22F799BCD8917EBC7517553C090D4ABBA86188B94AE50ABC527CA1BC1A780BBD138E043CCD1D973C1452143C02DEA1BCAB76213D13EB933C4F57DF3CDE853E3B9F6D0B3DECBC0BBDD6799B3BE97A193CA454A23BC402E43B6D2AF33B58E2473C8B54CF3A71C7F2B92AD9643DB1A333BD1072743C9BEC4F3CD9EAFCBCD1C39D3AD53E05BCE634213
ED5478C3E77CE5CBD18FA953CFB9F483B2B9A3D3D5B75343B91AD073DA1C5CCBCFCDE88BC3E759B3BD2305CBCAE3B0DBDBA8B5CBDFD0B693C4DEC16BC4562B438EC14863B241E2A3CCF4EF5B288F8003D24628E3A875B0EBCADF292BC8A068E3CF1F1803BA244963CB784DBBCC415DEBBEB1084BCD55BA23AA7005F3AAC5CAABCCE4256BC21A827BDC57E673C20FD363DE0B747BD80A401BD42C99C3B95F9AB3CB446C4BD76314EBDFF9BD33BDB99F43B1F82A5BB41DDAE3B957CBC3CC78AD43CB274DFBA8B2939BD299368BAA87C763C581C89392CC5C3BCBD245A3B000F3D3B0EFC4A3CEDCC84BCEC53553CAC115B3C722888BCC9D6FF3C1DC0153C35781F3C632D6E3C568CF5BC6D06DA3C6048D8BB31C988BD1A644A3C148AE9BB21AA5BBBD5A54A3957BBCE3D1FCC75BB707BE73BD174C13CBCB74DBB25E3C4BC93D063BB04DDF93CF902BC3CA92BD83DA617F5BB34FCE1BB180B34BC40BCA23A37DB5CBBB22554BDA174DC3B070C6BBC14A6D4BABF9FA6BBDEE97FBC906801BD08104BBD63211FBB195CCB3DB4D42CBB9CBBFABB856378BD02E46D35CC10013EBF0392BC83CA64BB69842B3DA4F7C63BF24B0E3B0E84183C2C851139486446BD1D199A3AFDD70F3D2AB5843BF527303E8333B33D8F2385BA4DD9923C88178FBCB6AE99BB11A29A3C046F203B7FDEB93B4C5C82BC574610BD8730A1BC9F995F3CA8789D3E0484AFBB5A7A733C6FDC053D33211FBC2DD1163DC13BEE3AA141823CCB4FB13A2EC158BD2825E5BC758FA53CE6CC81BC255E6ABCC21C453D414D41BC7C62F93DE6FF88BBAFBB1FBCBE7D833DB43B87BBE7B5BB3C7B311F3E15AA5F3CBE992F3D119B523C1ED6453C43F8583C234F05BCC9D245BE5FFF903C8189133C348E97BDDB1793BD16FC843BF723CEBBD061B13C491721BC052E543C0FB0533D6008143CFE50D2BCDE59833C7F92283CEC4213BDAACA2ABDDB8E463C70E314BAA8D5953DA9BAC5BD2D8C82BAE0C2C0BD3020FEBCC393E0BD8E3ADD3CA04E1DBDE8EA843D343CD7BDA6215EBDACF35EBD16400C3D2C2B0F3CC928DCBD2DA601BE7EFF72BD29C469BCD21FE0BAD3CDED3DC0EBC13C1DADDF3B459114BC1F30E6BD83F8263D2E1EA63CFF80B83D1539033D65E3953DAEDA8E3C58F3743DC342353CA909A73C2AA018BECF31CC3B10128E3B0E81A03B709980BC1457AE3C2158963D2C4B41BD2A5FAFBC7703043CC273A8BC0652AD3CCE7DBABC857F9CBCF360983C8AEF673CA5398E3CAB3BE83B77CA2EB988CCFBBB1D298A3BB5AA9B3DA3C2EB3B325D873C553803BDDF5110BD833047BC9286B93D7F4527BC72E8213C8B3913BD30E4B0BCFC53EB3B57866DBCBF5CA6BBEE9C5C3DD7917A3CFA395C3D56B035BBD686413D12728DBC1EA6263
CF5DA4BBE990EC93A740F853C82F8AC3C8132EBBBC6F03C3CF2374C3CA72B01BD14A1123CAA66453EAFDA8E39CF84353BCF99103E1FCB3DBD1F310F3D25AFF13CBCBBE1BB48D2143D5DA695BCFCF622BD1D4547BD0B59403B1EB821BD58454DBDA14E813214A8B8BBCC14F6BCAA48033D3F6B9F3D516436BC54931C3D85241A3C5E8E823DF4F50EBD9D76A73C29DA443A11B6153C48B007BC188D0E3DDA483DBE38C21DBC42F638BCFBDF30BD1438753D96F5F13C5DAB3FBCEB78B73DE8FD92BC9B8E8ABDA99C28BDDC50E33C345F393D31CEE2BD2F66603BE59582BDFBD4B23B82F6643CF9DE2CBB42861B3E7364D3BC6931CABD16DEE2BD39E838BD27E2523CD53C8E3D26C72BBDE1920E3C73BD93BDF348A33CE6E26C3C8B33C83A92532FBEEC3190BC11C433BBDE67A2BCB914AF3D81CE93BDEC163DBC558119BEE8B8623C55A7A23A11A6083EE44527BD1107BF3A6F88D5BD0FBA09BE31E01C3EE920803BEC6DD03B00B015BBD89C4C3DD0754ABC4494273C90CE8ABD525BE7BC7610A0BDA0E3A63C4A8E74BC030D1BBEDF3645BE2904FB3C8CDC9FBAF5BE19BD5890D63CB2EFE9BAB11F4F3D9BA878BB1DCC9837D620A03CE1F09ABC1BFA1BBD20B697BD21A84E3D10090CBC20EE91BCAA71883C5384B1BC6E75D23BF4F42F3DA5FB1EBE5F4EF0BBAB9A9EBC0FCC703D7D436B3D5F3EAC3B948F6DBC73947A3DED9747BC51B697BB64B334BD54C03A3DDAFF61BCCE8B203D9C7BC4BA020C02BDA1C5AABC678CEDBC7182563C0197FE3C9AE672B9D82E4BBD185E16BC18B6C73BB2E4B13B2FB2073DFB1D9C3D5767633EFCE301BE69673F3D2ED0F93C6F32FE3A2E52463CBFFCA93BC7D8E2BA34941BBD1EE900BCA674553DA50D023C5758A5BC799B8BBCB0846F3D5499BEBCB8A7AEBD5C43E2BB1C12E13BB3452BBC8F2F9D3BC7319FBBA9E228BC80A81A3C90D728BC1D2E713B93ECEF3C991977BC16FAA6BBA25FE23BBA59D93B4A688DB95A6FD83CBBC106BCE2205ABCAA17C83CD3AD7D3B9808EAB6E913083C4CE495BB248ADDBB2A22C9BC2693663C418080BB4D1481BCAC40D7BC01DDA4BB05063C3B9AB159BC0D8E783BEC3DB13C2DF37FBB135F6D3D39F5303C1D48D23BE1FDF8BB56973CBEE028183D983086BBF8E49F3CBC89503CB4974BBE6DD8F93E543951BCDF522FBDCA54883D2F172BBBDE229DBCB875CF3B442FDFBC7BEB96BDBA3F29BC40B0EE3C1546F9BBB7A965BCF5909C3CCEE7273C6801063D63780ABD4CFB80BC14FC093DD37BB2BED979D5BC5284C4BC95677EBC3D17E7BAF57A8139788902BD46FF7EBD4EAF913CCDD3573C57092D3C1CBE87BC19C20FBCDA9C053D3DBC66BCF98296BC42FFF63A4355E1BD5DAFC33CF7EBB03B470FDA3D50B7DCBBD4A1CA3
CA39C833C82544CBCA0DE4D3CA8F7A23C6C8BE53DA7C995BCD108C53C448FF63C577B56BCCD63A63C99ABB5BC36ED1ABE39EA07BFB891CABC76D4153CE0997E3C68A9F83CE4AF6ABDF508683C2F759ABB5E41A23BCC459ABC14EB92BC2A87193C8B1E5B3D6E952F3DE62CB43DF71403BD41C05A3A79E45CBC441907B47DE241BC09F4A6BB4EC0D3BE22A685BC0734BE3B9DAF303CDB87203DC81F1BBD553B8EBCD32183BC5028B0B9AB95AA3BF5BCBE3BCE28143C0F93C3BCF591FE396CA1753D00C854BD0F87F6BC3595FEBA861C0D3ED23E58BD101EC53D896124BDE75017BCDEDF193D5A991F3CCC2746BCF12CBF3C0CE7313C3857EEBCFA9E873BB87A263D977C113C2D91AC3C0306243D8C49D9BBCE0C583C7F1747BC3829853CC08CCBBCBF33693C2FCBB9BCE6A81BBCFCF31A3D4815433D3BC8EA3B556B023DA068BC3CB1F3DC3ED6AD6B3C7811083BDCF623BB7422913CA4EF1EBEAACAF2BB3CDFCD3B7FD3B1BCFEC1B839A82D8C3C74C2D9BB3621963C424E46BAC4E440BE91BEAE3B6CAF173DD1D57BBBCFDDE53B4F621E3B116A2ABD6079B73C9ADF783C275C5E3C76F67D3C11FAF2BCC89C2DBD2226863A6A31933B20FDABBD9533B1BB12151F3C83911C3E5B9BC2B12CFFAEBBB8E6F0BB7C98AA3BF930823C76F8D33C08EA6BBB7FBEA0BC889299B8CCF36A3D28A2DA3CFAEC6A3C2E40293DA4E090BE7408AFBD6EBB07BB8ABF9DBC06C197BD5CC83C3C588E543AD92AE03C1551A73C0B41F2BB79B5D3BB42DC433B81B9FBBCB0A6203C07313B3C88BE543C5CAC55BC5B5D02BC271B3E3D4F231ABB829E0F3C66AA92BC71D62CBDAAFCBC3C0D29183D3FEF19BD374F453CC459233DD994AD3B9B1F08BE762CB13B456A1FBB91ACF2BE17B4413C916D74BA842292BD93EA763C59B5B23C0A48A7BB1A506BBB6E16DDBC8EEE36BCF5B9923D16C4FEBBF8735F3D420409BD5F8DDE3B1CA3C0BC179B19BC1A006FBCDCE08BBC06E42ABED21BF4BB83F2CE3D5D0FC6BC60D90ABDE9311EBC9E3A9BBCA9EAE53AF57A6BBB0E9734BDE754C1BC45BCBFBD7982DABA8DD09F3CD6E4163A8C22A9BCC1DECCBD043FDBBD9844183CE06B7BBDA8325E3D183F13BD76C40BBE9A692C3C50A3A238F6792EBCA4487B3D51F09DBC59A37DBBD364983D6727B23B06C64CBBD8352E3D6B6CDD3D464874BD98A1073C7AB6B6BDCB0EDBBC79FBF3BCF4BF4E3C602F123E16A0A03C6B7F08BD395D683BB726533D52301DBC600525BD0A7ED6BC5BF7D53C7194BD3C41F5AD3D10B32F3DD92BBD3B1003E53DB02765BCCCD6F93B83104FBD110F8F3A860D3ABD3C07BEBC497083BCF805573A49654EBCE252383D7C868FB819E299BEE4D9033D37A9063DA8CE833D1B58A7BA0EF1743A067A2A3C1DB1AFB
BBD9CFC3CC791AF3CC99D57BCB630A93C7776CA3D297141BC6DF7DF3CECE7C83B565AF9BC63EB413CA68F31BDA620343B1A7021BDF94CE0BDEC18BF3D385463BCE49384BD225A2B3A3BF0DC3C1F418B3C2D49D3BDC4F79FBD1AA48E3B400D263C4503F03DC90653BD1230ED3CE004BA3C2556E63DE011D43BA4B4453B83F24CBC4A2C173CA7C98CBCB6BAB5BC8ABBCC3C9DE044B286D207BEB19AF5BCDF76813DBF5F523D9110B43B947E9CBE48AC0A3BAC68963D8470893D064FC33C95C0A8BA055B0A3CB7D359BC722A803BE7A712BECC00153DBDAEF0BC850EA03CF81F95BC04394EBC74BB37BCDD7796BC99F7B63BD1B6CBBD25092B3D7A3662BD4F1DA03C8E19D1BC79CAF1BAB402F03A9008453C4622213B1A5399BB9DB50EBCC7D1B6BC43671ABDCFF69CBCD5F70C3D60FCF0BB5712AFBC8444BC3CA7267FBB57764D3DFBCCBA3C97561CBCD4DF5EBD561C193C21DCD0BB31D8AD3CC410CA3C201F15BDE32D073EB62AFCBCAE37C03B53CE3FBD643AE6BBB1E0893D04D4323DB33D453B92C2123CA2F358BD77D715BEC9847DBC46833F3CCBB78DBBEEE13ABD5898E93CBCDBFD3C96FEB4BC2B13393D8892DD3C1133883D4A283C3D3196053BDA12F03C133A7CBB7C2FB33B178B51BDBC181CBDCD4B513B7FE408BEF76D8BBD6FEC8DB4B249903CCD7DAABC6BC1C03BA666733D5785503CE08F603BB80436BDD821693B76C081BC89BFD83D469D0BBEE7096A3D129732BDB5959ABC4E8F773C92B3113D54CEE33BF0CD88BC129A07BDE362A4BDEC3658BD38A94ABD4D5C25BC3DD5783C19A3963CA059C6B96FE006BD4F3B053E48FFA53CF3C5243CCAAF963CB82C6EBA53F174BC2307243DB960D43C591A413D54ECFFBBAF837E3D0D700EBCE0AE2C3C1AFBA2BD27E8563C1CE112BC8A8263BCCC4C04BA5F5CFC3C88A79BBCD22E89BBDECA39BDB340553DAA75613DAAF3573A67E053BDB1746D3CE32870BDBA36AA3CD80FA63D220B8B3D69E621BE66593C3DC82B66BD982EF23C87F32B3CB05356BD0A09F2BD01C8853CB6A8353C0DF4FDBD8EC14A3DCDD29DBDD9E8E3BD09E099BC4B129F3DA0099D3D013E2C3E4627423BBB28C33B4A2353BDE9591ABDD674FA3CEE05EE3CD66137BC932D81BC72C61E3EA8A1873C6979C2BD68FC3DBD24A50CBE8D0DA63D2481C0BD8B90863D2ABB353B77AC3C3E367C71BD260EF33C803A17BED1D9BFBC3FE0A33D5B3BF0BCDE86B6BDC0D388BD85718D3CE252E0BBC4AECFBDD87E3FBD55EA6A3D0FB1473CDBFD75BA3C9EBC3DCC83D53C7B19493C009F8EBACA34033ECB022F3D1138DF3CFD4562BDE81C14BDCD5C8DBDF5DD40BEF1BEC13D92A7ABBCB5CA9EBD6C3DACBB974923BC231D083B9F4A8E3DA4DC603DB79A24BD4795C73DDA1B013
EFA1F86BCEE673B3CC3C5293C11B2223E853FB43DA295B03DB4DD84BC31C97EBEE941843A2E0BFFBC3566003ED4A8893D5231673C1AB9B5BC17A81B3DF2893A3D867F23BDD03938BC47EB403EC97BD23D102523BDE9138AB97FA792BD4123193C196C5E3D4CD5523DAA1F80BC1224A83C0C20743DD5EF593D2B50043EF445A5BD2DD3D0BDD53DDEBC372E42BEC35D83BDA1A10B3D5675163D4FAB26BE292419BDA2E60E3C7519AB3C56CF0BB47544A83DBB9FB8BD5F42213D540CEBBDA8EBB5BDB4001E3E832E02BDD59AB0BCE3AE4FBC5AA680BD96C0653C793046BD5B51493D7C8CB03BB52F12BE9F54223D81E479BDEC09333DB0D32BBD506CDCBB8377F13CA00E9ABD9B6A7C3D81D17C3D1374183D8F5C84BD36E218BE0973C2BCFE349B3B9AD0FBBDFE2F15BD685FC73CA51FEA3D5148EB3C1F3675BDB58368BE19D226BC02F668BD8DACD5BD51CA2ABE99B0C0BB818EFB3C149A9C3B6FED5ABDDC648FBDC370DB3C9AF0A0BD0478EA3D16593EBDC074FA3C5E90C33C35E9893DFC612FBC1A5C16BEEBDF863DD27A243CCF44B73D4EC581BC286396BBC84772BE33E08BBD19523A3E8C0217BEFEC84F3D6C0141BDF9A7493DAAAB663D8559B3BDD2CC243D92EC993DE3C3843C407518BE3450BFBDFA2F9EBC9DCA15BEFE2D3BBD5CBE06BDD09B923D0A7F89BD850CAB3B22CAE6BC9F8F253D41A2D8371AB0923C303130BD08182FBD28376BBD3EEC3BBD557A703C690FC43D9D247B3C4C34EB3C57306D3C31BCAB3D19401BBD7C486FBDFC98CA3C2C3E1E3D3C9DAF3D2E9DB8BD4CFC593D4908873DFF4F60BDD561E73C129E5DBDBF97373D4AC282BD6CDC5EBDCE5B38BD81A2753DC1431C3D4B45293D6ECE91BD85DA10BD235FA33B23CF153D90DC29BD04F4863CC4FC043DB0F2273E5E3474BD58DA4F3D3862A13D9D309DBD5AAAF6BB38E61C3DB82BB73C90F063BD2862C9BD2F2D4DBDE69232BC48C4F83CF4932D3D28EDBEBCC62A543D20672D3CA010ADBBE7BA6DBD916AB83B1F34EB3B3524ACBB8BE7993B1D5DF8BC8CD2DD3CFAE73B3C76A968BB9018E23B6D69E03CA938BFBD911213BC5D01583D213E993C59F7F33DA314D23CC017533BADAD90BA7F9BA1BC7F08D7BCC59798357644923C09D958BCCA9891BC9B119BBDADDE2ABD2664073D8A6FDFBB54E1FB3C9D949FBC2517713C3CDE263C13FC093CA6434FBD5D4B543CF32507BD12DB2F3CA965FB3D7AC8C6BBD560313CDA6DDE3D908AD33D9BE050BC1823CBBC60BD30BD34A4A13C3747BEBCE09BC1BC4449DEBBECB9313C58E6773D08F01BBDF49F6C3D697BC93C4AD69E3B790F8F3DDA5F0CBB281EE13C6FF36F3DB281ABBB413E2F3D2DDAB03C3D41BB3C8666063D1AAA9E3C4106F53B6737E33CFB5D65BD2F3DAE3
B96F784BAF126DC3C5D9F903CBEF04E3DEBEDACBC40E6DBBCA92719BB3B4CB53CF54A863C464E8E3D7BBC0B3C2F3221BC56B0BB3BCCE4023DAC47D4BC30A034BC07DF27BED809FCBCB891A03C117F723DB65B9BBDA05895BD3CD2E13AAC1113BD1032383ED4E6D5BC9B180BBE21459B3BD699BD3C03AFEFBA280E05BC90773BBBE797CB3C2D0F9E3C4302823B297CBFBB297C30BDACBD4CBC5CB5913D4FE44EBB559BA63BDA17C4BD21A57C3B6BACD83CD1933FBDBC39233B5CF79B3B46FDAA3A2C30FAB308AA063C2B1C21BC347404BD2466713C5BF1AEBC5887133DD42640BEE148073E89337BBCC482A7BD6C9E9F3C79D514BDEF8D8A3D74DE00BC2AD305BEF2FE9B3C1E7E173DAE06BF3AEE4FA23D79D2053D0FBA5FBC74EDBD3DB54000BDBCF7453D988B50BC43A0F53C751AB7BBE36E0BBD2DE0AF3BBEDA743C315E783B9D6B10BB49FC303DD7401A3E54207B3DE073B4BD6CAA35BBE2C5A2BC9A9E0FBDDC5E113E1C75A3BB88A2743C30281DBD49320CBEFE33103C7392B7BE687717BDB55C8E3CD0B7B9BB199177BBEC96D83C3A58073EE0B72EBC8C8B043D140B063CF8315A3DDED2E23D2CB6F03D78EE98BAFC8CD73C5A52ABBC25E9FD3D1461F3BC753B5DBCC7AE03BD688E96BB46E6C9BBDDCCB53CB3566C3C513512BD161625BC2AC911BD8695DC3B29CD203CBDA599BD68983DBDA498BABB1C19A4BC4D761C3C4AC9BA388A2E8BBE0F3B293D371BF8B676CEB7B87B6D293C350E89BDA9B6F2BD3EEB26BC50A2853BA9C808BDA11F183A032080BB47CC8EBD9FA5183E3F9F463DE63B373C757B6DBC3E807FBD38D6BCBCFD55903BC7F5613C4ACB933C2636C4BCF352DDBA68B3D6BDF47004BEAE9B0FBD74B329BDA8E488BC81D185BB73352B3D7E9F22BC6499F33A2AD516BE18EE0DBB43919B3C195B573D24D8B03C353B7BBCF70FFA3CE176823CCFCF013D7A809DBCB740C8BD7185943BDDDEDCBB360FAF3BB7450F3B36371E38F422093DF59BBA3C76BFBCBC75D65ABE197243BC1356873C542F02BB8807983BE80EEA3D17C3D53BCCC370BC1E7B54BC650CE83B0775F03BBD429F3B8D97453C84FD0BBBD7A6F93B8E4ACA3CC88C46BDA2B285BC8DE70A3D1D6293BC2AA853BC2E5A113DDE8300BDAFA55C3B712316BDE25DB23C98E7F439A4F0893C2B87B03AC869083E2A66413CB116F73B597BBC3C43FB82BC142A39BD82FB63BC32BB203D617DA8BA9E0BD83C23C212BC0704EEBB72D3EABB095D6C3B4EDD8E3D344C89BC548C183C917280BC7D6DEBBC353E2CBC3CC10DBC3057473D50C2C63CE9E76A3A573E183C6DBED9BCF6A6F9BCD18A94BC8DE529BDA27E7B3B505EB33B73DAB5BB81A4363E4D34F5BB8B51EEBC2B4DDF3AA01996BC95CE35BC6D606B3C113E0CB
C34C0DABC4CA00F3C0CBA44BC29A98BBE67D9C43BDA12B13C02858A38BCA226BBC93B8DBD8E48453C1F1E32BD863937BD4E61DABC7FA9963C91E2F73C373E43BDF47C7B3B2B99383D71104BBAA9AC143D92E006BC009A753CA6D5B1BE333B533A0933C33BA519BABC35D8CEBC2E7CA43C6E75233DCF15BB3C690522BCE197C0BD007B70BD15E67D3CE312153B93D2883C1DA285BCEF013C3AFED4ADBB3EC2FF3CAE5F6DB8BC5148BB649A27BE42F03CBC2CA9983D56ABF1BB5ED017BDB7BF103DB073143C283DEEBB61E8C23BE770623BD8BB30BBE489FCBCF11B1EB39E7D0F3C4483E53CB18EAE3BF0FE28BDB03766BCB4BA183D544175BEC9C391BE5A4F8B3B48DCCCBB924A80BBF73DC3BACB1C3CBD86C100BC990CA8BDFA72053DE593683B07D8B0BBBBCB7A3D5BECBA3C2ACAA4BB6EF77BBC1BCC5B3BC525EA3D7C2C9EBCCA7095BACE550BBDDCAA12BDFA2CA6BCF33A933C224137BC2C4D413BCDBBD4BC33EBBEBD136A1BBD4D69C33D3F667E3C43F324BAA4C00B3CBEDBA93DCC85B13B7F0047BCCC982BBD7934AA3B05BD8E3B5CADF83D1593E53CD167AEBC3766B43C8479B4BC467EAD3B812B1F3E354D86B9FEE53B3DAB06EC3B401D5A3BDD589E3DCD9D353C288BDAB9EB589DBD60A4293D9891043EC8522A3D2A70873C2D45A53B3FAA34BCF382F73C5E87C8BB31B7873BB441EA3B5DC57FBC5DA9113DFD5D803CB86C9FB884BB3DBD123E1A3DEFC2623B457DCDBB6054913CD443D8BA5166163C6D0E0BBC8F8796357E9FF2BC1538C33A18B450BCFFF81DBC5E0E54BC289A52BCBE71103CF8917EBAB56F3CBCCB7BFF3C3D21533D3D56E2BDD32B453A93D319BBA2B54EBD241747BBBC7B9FBCE336CB3BF13C513D680741BB8FD2383B7B764DBDFFBFD4BDA5F326BC866C8ABDA7FBB4BCD1396A3D3A3D0CBAFF2CE3BA44129BBA5D54E0BC1037C3B9C998083C4C073B3CD052C7BB140085BC04E3AD3D865D0B3BB9A207BA255AC53D5B07BDBD85F5B4BA8A292CBD7FF8473A104B04BCEDE9F2BC467707BC1462C13BA12768BCA98798BDB008913B5D66BDBBD70F99BB0885643980F880BD9521973CE5D9D8BAF256B13BF815DB3CB98D203DD03501BD98EAF2BCD4D388BAC3F01A3DF2B0D73B1302363D2829A2BC8634F33C873DDDBA5ED7F4BD9CCFEEBCF7D45EBCC3A99E3C8F9D39BB32B38FBD6A1160BA53E693BDF1EF85BCD0E50EBEC6E8E7BD3E4A253D47DBFD3C054588BDB2CA5E3D7540B8BC4D974E3D2FC46EBCE5DDA93C5D893FBD8E7E793C18A01E3D558620BC4F13A4BDFB8CF7BC3E7A5ABC45F183BDDA1B383D93A5993CAFC81CBC775D0A3A5BE270BBFCEB67BD631F87BB0183E33C6AE1CD3BB1D0E3BCAF711DBDA27AC3BD77C1BE3B110939BC1843FFBDE3A4ACB
C6C70023CE2ECD2BD9DEE52BD648007BD5542F93B4313613CC8F044BD919EEE3C78730BBC5C18E03D75BA263DDD20A93CA6338EB9621167BA3B5B8ABD145AE3BB7B7C19BE2CE1143DA1C0143C6386BFBD24A6B3BC503E88BC517BF63C4C7676BD5D105FB9503AA03CC118453CC8DC9EBC0B9835BDEBCD863D4795CD3B7FAEEEBC8D4D79BD3B40293D6BF0FC3BEFAA34BC457C233EC9CBB3BDBEB1AEBD8BCB1D3CA304E7BB7F42F3BB1F63773C4149A3BAB4C425BEA7096CBD420E22BC146650BC35696B3DAAD89B3DA6B6423D8002D13B1B0E09BD3B7C8A3DBD0E583B8DD053BDE49D2CBD5E98853BC982A6BC17DB0CBDF12579B3A75F00BDADD24E3DE266503C4283A93C76EF13BCCC967B3D38633ABD836033BE0E8364BBD13AA8BC47C0C6BBB1AE1DBD7D7288BDD04BC73C781D8A3E26109FBCAAF257BC374A173D848C84BBFA271DBDC3CD46BB5D01B8BC87129F3C12D2E53DF80694BCC8A6A3BB120C703D7098733D85591E3D076A133D366B4CBC090483BC14F64CBB1D0D13BE8548FC3C5153043E4A79D5BD7BD125BDEF9DDC3C8AF8A43C8D6682BC483CFBBC75A532BC4078AFBC045B153D7DB2673CC6CE943D42720C3C0A6D08BCCA13213CCDAB033D7DEB46BEE893273C2E8025BE913FA33C0D2B79BCF121023EC859753DC34DC2BA4571AF3CAC8DC2BD5D3204BE4C749D3C6A31563DCF75733CD0004E3D5AE1AB3D39E20F3DDCCA5FBDCD097ABC647F5CBC8D055D3C42BDB1BAAE147FBC27C70A3E3E01AF3D8A3769B907A0F03C5D6C6A3C4C9B0D371E9DBEBD8F5CA03C1243B136D2598CBC19D0DFBB9BB433BD32DDFD3DB066F13BFEFC55BC2ED6A63B0F47263CEA2D84BCEF9B943BD3345EBB21FCDF3C2F58C2BC8E5FB8BC18BD4A3D0D4EF4BCA3EA003D81B40C3D29F5533BC3840A3D7ACD99BCFA2C0BBD66ABA03C303924BD4408803D9F7A45BB9D48CE3C9816453D344C72BBE43F8B3C96DA0C3EBCC4C7BA11C489BDEF74F83C64203DBACA04333CD779313DD84BEF39681E983DB88F053E424F6ABD994CA1BCFE05E0BACF9D2E3DB93859BC298009BD6F5937BD4F6ABA3CBD0B06BDD178133EFE8C95BC6098CEBC1D9B743703346FBB587F52BDC8B53A3BD9B4C93CCFBD013D2F7A80BD1D26AB3C65284CBC8C65043CE695B6BB0B2D00BC2AC52ABD6B82D0BB810E753C8AFFB0BCDE93E13C02240E3C48C5153DD333483A53DD833CCE29723D7AB3643D955996392489EE3C4C3D2EBCC5E309BD9C68D93C2612C93C8E284CBC190599BCC7EDCC3DD7E46EBC9D53F63C855D4ABC55580CBD23E8B4BDF03F49BD7E1B2A3C5A66B0BAE432533DAA25173B8B9F703CC2FFE0BDE04127BE59DE67BD76EE4B3C83533FBD0CFC8A39D678D8398880AABC1F18883DDCC93CBB4717B1B
CF2ED7BBE70B3053DFC34F33CD887D43C7A734F3C2172FD3CBA73DE3C5E2EA3BC839BE03BCCC6A23C29A4843C2744AD3B6206E93CC14186392CD0DB3BBA41D0BCAEBF2ABA062ADC3A1CEBF33A41E08BBBB2BF123D670D2C3CED8F543CA5DF2DBDA66DC9BCAA4439BD3D1BBEBBFBED6CBEBA8C9FBCFA4F42BEAAF132BB01D318BC14EC513CE766BC3BB299723B445670BD15D920BB8ACF913C833D1BBD734F9CBD95F31E3D4549533C837994BA734D9E3D4E88253CD0CDD83A1AB30E3B21FA673CCCAB7DBC17B3643B32CFA43A63369CBCA51A123C0531A83B96F42E3DDD89EEBB4592663D8BA5D8BBE2E5FE3BECED4CBD1940C83C9BB4F53C7CBAE7BCD9688ABC877A883B644BBBBCED04CB32A77B593DB975B7BDA4EB093CB7D129BEA64A0EBDCBB1FD3CC558343C49B5673EDA30ADBC51F008BB2288B3BAFC3434BC6DA4B23BB31A78BCC4BED4BDE27F7C3C409B173C9249C23C3E09C7BAA991BD3DA7E0ABBCD69EC33D3AEC093C9353A3BE073C8CBC1AF612BC1BDD5DBC742F3DBDDA42A5BB907300BC4FD10A3DC02002BB6420883D0D22B23D349C843DFF8C85BEC69D973D0E2AF5BB7EBF36BD0AABCF3C9FC09CBC6876313CE5FE0CBCFAF22ABBB61C0DBC1A0C68BD86A484BC769E1A3C90FE6DBC81A01CBCC9E6AABD8E2D263DD24A15BDA70E24BC4A67E53B25D9FABB02CC043E0D640ABE5D4B1B3B823A1FBE91B20DBD9977D3BC871C1D3CFDB76D3CFEDF5BBB4E34B5BBA152E0BC1EF070BD9704DCBC302483BC1EC161BE4F7E77BC30B7C2BC77ABA43D2573B43D0680573D2A4F1B3C25C2AC3CBAE07C3C9B1BB839172A823DE0FE853C43ED8235AE81CD3B8506273C3F6C74BCFED31A3D623CB63B09393E3D064799BD96BBBD3C76BBAE3B93EB02BCD8F773BEAC864CBE5B2DF53A4448EB3C297169BC26A3DFBB88E0A13C517538BC18E1343C6845953C4F6AD93C6A74F2BC8675903D0856003CAC66C0BCAA23F9BA83F361BC7DDB36BC31259BBC4854EEBB63D8713DADCC5DBB53BA27BC4429F03A97519ABCF552E8BC7512853D04199B3C0BE980BD32A606BE9857923D8027123D9A0F69BCC95E8C3C2F9422BC596999BC317A39BC92F2933C9160DC3C2E5758BD1EA7563C5BD137BCA496D7BAA9DD043C573B03BEBD4D6DBB293C8D3DE03E053DBE83B4BC4770A3BC54D11ABC2DC262BAB6718F3BFFE529BD69BFCBBD1ADC3DBEE283733D44047EBCEB53B43B6B0DC23CF08557BDC65D993C0E24EB3DCA79873D44DBDF3BFE78023B495529BD8DB155BDE3D4143B9F185C3D847C20BDA5FFEABBA596263D327F0C3C020AE13C642619BE1DEFAFBC0FF31ABC774D3D3DDA04C4BD7678AC3B474EE63A8673CFBDF9590B3DA081B43B912A963D824AF93D0F8316BC17BA8EBD49A7263
C55BC85BCEAC84A3D15F63CBD435881BB0B85C9BCB4310D3D625EB83D9F51D73C7917323DC3539D3C621A25BD24E2033C3827473D7F118E3D9A9CDCBC7FD8B53CD4736E3DF7F883BC535497BDBBC65C3CA00463BDA2013BBD5CBEEDBB0F0607BB755383B935B1E03CF30C413D6582C03CBC78103D545F7B3C1E4E013DA74033BD5C49533C49FE103D8ECA7E3C1982843C6A61863C0FEB333C118DA8BCBA05813B1A66D23D7BCB13BCEA27E83C0DF737BD388005BD2C2529BDEFF5A43982D476BC1B5E2FBEA3BB22BDB89D5DBE739B8A3CEB81283C1129393C9D25033D54918C3C80455C3C3F0CA3BD8DF0C33CABC1553BEE19013D761186BD1ACE88BB85778CBCAD43D93D3407BABD69002B39C40A5F3D6AB7DEBDCA6E72BC2FFB943C24814D3D9B221FB4C7F3483DC644B4BD647B953D82E3DABC66EE0DBD53C822BEC800353DB6B9983D6F4E123DAC7885BDFF5E453AAD1090BC384A183DF0432BBC4835C7BD6B9EC73DF6F04B3BCD98043D7F47A2BB18ABA9BC6BD2703A2F0F10BD93F2CB3CBD5497BD8892F43C0982573A316505BECBB943BC9DD7B73D6232C6BD7A7DC63C4B9223BAB74E463DD25CA13D2CCA283BA28105BDE0EEF9BC9820CC3C3EB1A4BDD7B5A5BD24F59E3C750ECF3BCA4A7D3D81DDAF3D491D65BD34359F3D5F27D1BDF886D73C974A9FBC7494A73CA60018BDCF53083EF166DCBA57387E3D9FE7B1BB190440BCB31ECC3D66AF503DD54D81BBE42E33BC84A114BD31070CBE3944D7BC5875473DB6F83ABD691A003D4A397EBC06953ABE249F2D3D831D033C9EED433DFDEF47BD6FF942BDE349413D6D7D2DBE5553DEBD93F4CFBCA4DA77BC22D00F3DBBD21B3C7D32E1BD648DB53CA7DFEC37CEC9183D901496BCAED006BD71F0A83D74F2C0BD4941903CD22AD63C13184E3C6022A73C2D8D2CBDB0A543BD7A38293C4A322A3C091F163D707E2E3D40A5C53CBFF893BC30B0233D01188D3C901E9D3DB3361B3C7079DDBDD91376BD724E0EBE51B513BDEB7A953B2A24873DED33773C9EDC123D7DCD43BDC04454BDC599253BBDC0803CCF7CB3BC23E8AD3CDD39A13C6ADC503D86C8EF3D31347BB93330B8BDBEE0D8BD5B902A3D2990273D5ED19FBB8B415ABD59AF4EBCB20830BD076AC9BCAB2FBE3CC222ECBDB315903A71137A3C9771E83BE6AAA3BA017384BDFF31E03BC0CC2EBBB393053CF316543CD894A23B0F2C2A3B8954A9BB55478FBBAA0DDABBC914903C293CEB3BDDB4BABC22B5353C4685A8BC56508FBBDAA8413C5FE595BC63E4B33C2CDAFEBCF438AABCC4CE49BAC6B69CBBE78D943B9ABA15BDFFD278BC47CA71BCF5A2463CA5C6C33D988126BC51FD15BCD8622F3D5B125B3C43FBE1BC0BD5023D5C63C83B3694F8BC80414CBB32B04CBD6C31403
C968A57BCB8747BBC77123DBDACE8C33C91C7D2BCE01FC73C4B5B003C2E2F073CB6BE10BC27ABEEBC55430BBB827A66B99011BBBB391C8F3C8BA379BCB320D13AFF1C623E37957138B1BA0CBDBB04FEBC18F1A83B83AE6EBB42704ABC934F4CBDFFC3EEBC03D22CBCAFEACD3B81D4AFBE194B243CC193773CA17F3CBA2152143C742F0CBD469F573DA30467BD27B21FBDAA33CBB85D09C2BBF64706BDE010CDBCE8AA18BBBAD3D53B528B7E3CFA80EC3CCCD49ABA3EF0CEBB6371C8BE0BA7033CD8688F3B2982A03BBF2F2BBD9BE4D93A16661D3D623A0DBCFB19B6BD5334A4BD8916C2BC4BC42E3C6422DBBC24670BBB15016BBC0C3204BC1AD1913C3596CA3CE15B5D3C8A87AF3BD486783DD5EC0BBCDA2C6E3D85B7843C9600323DAA980E3D21E9913BBB6126BB1252853C3476593B4838173CC4154A3D6A06DDB34743293CBA5EFBBADFC3553DC3F07ABD3B9D3CBB2038943D8A5757BE7FAC7EBE0FE9ABBCB8C336BCA520AFBB168EC8BC0665E0BC52E023BC0D51AE3DF02800BC4639A0BBA1E486BCB8338E3D528D843B144FB239B7B1D3BC69BB4EBC0EAB733D27165ABBD2F6503CFFC0B4BC0BA3913C820D12BCD3DB843CAE95B4BA07A3713CCDD910BC190FE3BB40920B3DE4E0943DCBADE83B4046453CAFAF4BBBDB275A3DD07A7ABCAE6ADABAF83B83BDCDD422BD1B52303D7774A33DFD89EBBC3A6DCCBC5CBE393BA5BB893AE69844BCF4FD66BDE9A16EBB4FD2203D3FCA8EBB05D911BB7F20823DCDB42F3C2A82EFBAFDC89FBD1FF0F73C7BC98EBDC84D5D3CB166A53942CED0BCCED5C4BC9AD3023D1986163CAADCF4BA27F2063B3BA109BD1254ED3CD1E114BC5C9EACBB4BEF7ABDD3F50DBCA5F971BC068782BC0985533C407245BB36DB7F3806997ABB7A161037A86E43BBD6F1A3BB1856003DC11F7E3C8904E0BCB747DEBAB8671C3C2149193B0B8B4C3A7325353B111FD73DB1A49EBD07EB31BBC6A976BC631C4E3B73B471BA9B70EB3C3BEA1D3973FF593C026798BCB7C33DBC025D11BD3C53203C7C17A7BCA4F83E3D90A06ABC5443763A735DA3BB7D22163C2A6E43BBB262D43D97B931B9F8B923BDCD1F053D226963398E5CBCBCF61A843D1A77D43C6A970DBC567156BC3B3EA8BD5FB89ABCD5D5D4BC7650A4BCAA5AE0BC3BAE2FBD60AE94BBABC2A73C013BE9BC48DB203EA9D6F6BB8B9A8EBCA0BC163D4791A7B86D8D753D3261E03A1580DA3B065B4DBD10EF7CBD1228FB3B6A78713C22AC493CECEFA7BC0E94B8BC66A9D0BCF099FB3CB07B2C3BC39F8A3D4F60F73BB85FF93C349B2E3DCEFBC0BB31A2153CFFC4233D35B33E3DAFCF9CB8956A4A3D63FEC13CFB7D4EBC8AA04A3C8865693D3F3A9FB9B5E3143D9B1434BD16BCC5BB47B9BB3B7D8CC43B9627D43
CB8D77FBC20B2D83BC6779339FFBCABBB9F717CBD3DF7DE3BFB33C13C780B84BD79AA28BEC89B28BD89A2ABB9D54F94BD422E873A4AEC85BDD8A313BCC2C2C43C6FD883BCD50EBEBCBE1FDCBD74AE3D3C2DC5243C964815BB6D51583D62C12E3D5FF2323D67E83CBD76F505BC8437D5BCED744D3BC5D59BBBB7456B3D8331E53C2FEB933CA116253DC2C108BDEC7C7E3CA190A038C530143CCE585CBC85DB8DBCA012403CD936A23B37D61ABD9BF5B3BCB076BB3D7098BABD5A359F3CAA02D3BDBE58653A2014603D285217BC8F8DF8BA79A6C4BDD7AF293D6BE9613AED46643C02AC29BD351F2ABD781764BB2E89E53C16CBE8BDB089B93DC88A0F3D5A90473CB7F3B4BC7E8E933C7CEE9F3BD4E0D0BA5EB312BBFE8632BD3D0C7EBC295A903CAFED7DBC33DD743BB1749E3DD6EE973C616FB23D7DB689BD3145093C38FA6D3C7ED92DBD483152BC50A5B03CFDE811BDBAC05AB3A91213BDC1A58B3CFE27A5BC60253DBD337A103D0C5A0EBC8DCD1EBE6E3D403ED44264BC3FCB2E3DEDB67ABBDAA6BEBC82566CBC766932BCEEAC23BDC1FB95BBDD89C13CCF36A13C98B3163D676A203C21AE9BBC08A5A4BDECEF98BB368B38BE39773A3DF725B43CD63A513C3D47B6BD03AAE9BCB58E0B3CC5C32ABCECD7253C5C70123C79358BBD0BF25E3DF2BAEFBD3F7D7B3C55C6BF3BAFBCC9BC0E679F3DAAFCCFBC3B618ABC17E6B1BC8F1A163EC18CCFBC8A4E063DA100BA3C3716073D98431D3C9B1FA63C88788EBB664EE93D34FB42BC19393C3D0E66CEBC77F09ABB035BAE3D0E2D03BEAF05D8B935CF203CE2E1B0BC66AA2BBC040EA1BCD48AA13C2C2B2CBCC0EA373C824DD73B8B15423CB5B87FBC45D5B63C220251BDF460B23C4464E1BB827E9EBDAD19993EF02B7B3C07FB6F3CB3BCAE3ABD84B13BC8BBAFBA4F92073ECBC330BC42798CB6498BFE3AA5EAF13AD8F6193DAEBE4D3DB6B713BD8C730A3B0BCF273C27EBDB3C5EA9F23CB5064D3D321606BF7F227FBE4BF97BBBD47F293C1DAD98BDFEDE0B3D7435B63C4A1D65BC5E6E40BD73DCBABCC84C053D05871ABCAD655C3D5DF6FC3CD84AAEBCA6D8AF3B7AF6283C7792EEBBBC3615BC0DB0B33B8D773EBDE026943A4D08833B0427233D129ED53CB425533CC6A7453DF2A125BD129D8BBD311F44BC7EC4C93C4E57C33C4457C0BCF52AD4BBF7E5423B12404B3CDA7B953C322D043D975027BCBFD3F1BD132D7D3CD12C1FBD5D8C823C4F53B4BC538E4CBD6E4E45BEE2952CBC5CF9F2B8EB809B3B0F596ABC16AB39BFE4FE01BD25E1113D3FA3003C8AADD0BA66D59C3B5DD109BDC66EBD3C32DADEBC44C2E13B1A9DFBBC17B3F53B1583BDBCD58C50BA6642863C309F8FB9489AD53AFB50CD3B0F1BAFBBF0F62EBBBF959D3C9403AA3
B7BF1B9BCF6D8F4BC3CE6E6BC9CCE02BC66D126BDB7F09B3B2AA8BCBBFD37883CA5D504BDDF8D243BB731D13C3C83AABCEF0A233E9A521E3D986BF8BCEC5028BD8D1F5ABA312507BEE4AFBEBE7D96D8BC4D4957BCA671533D9045A13C159DD7BB1C82ECBC1A4BBE3BB27CE53DF26E163B985AA53C4525B8BC185A3DBBB377C2BCAB418BBCD37242BD07EBB8BC008D3FBD68D6EE3CEA6C6D3E188F7B3D3032323D97C0DE3CE6AA15BD49516FBA36D4E7BD301FB1BD286551BC02CC8CBB881782BBB08383BC52C1323C32D361BC1F7073BC0C91183C83B1FCBC2B5B583D323018BAFAB4A2BC2D8DB3BD7C58A6BB84A32E3CC1AC2B3D578EA9BCC673A73A6A057E3C0F74263C731D35BDCCA2B5BD9E73023CE7596FBC63D6B8BCC33C0CBD7AB56DBC6009223E94E671BC48B1EE3A7562B93CE1FF243C8346BABCBFECBCBC9FE41A3BAA7EA3BB6488FEBBD12707BD0634C33C3F82093E215A2CBCF94EAD3D5709093E5514A9BC24B4673B2BB35233EA6C25BDACDC9E3C6DE0E8BE4C69E0BCBA22103C57524E3DB1F8183DC7409BBBF1A3ED3C0CE9543D2DC24BBB2B8B17B84FEE0A3C6BB835BCDCAB1A3DA2C76DBC08CCC03C7CC7F0BD0138CD3CDA47F13A71E9603C56ED02BDA7FF1EBF2816FEBC1044823BE68FD5BD46B31E3CDE3C873A74D7B1BB2A286F3C3B52DDBDCBA7FD3CCD51FF3B10E8813CCEDC803D5F8BB4BCB23072BC8BFDE23C0E56493CCC63BA3B6AF441BC7B879BBC30C5F9BB1C06CA3CD01C983D2933293BFF94463CD279B9BC3A067F3CA9B414BE7A974F3C2BC492BC853C543C215E483D4477A63CFF077339C335503DF0949ABC84CED1B9571FF23C56A8183DFAC79DBCF49EA03C2A70AFBE3C244F3B0476D3BCA6FD693B0231073D22C901BD7C5A29BD9E8AFB3CA57A7F3C3F1811BB667B823CF0FC2EBBA3447A3B445798BC956FDEBC4B21763D2FEBA1BA61CC5B3DE7F770BE08BF0537B61B2BBEA1F31DBD9558503B5A28DD3C518AB53CD5DB26BC51D202BB220CDCBB1DF4C5BDF8A5D13CFCBE933D8BF7F23B256ABDBCA3A52BBDA47E55BCA35384BD1991183D2086E4BD11EFEFBC1DBD69BB288FBFBA48C409BD46E749BD9C30953C6DC26DBCEA5EDCBC3BD99A3B54A6933C12F2303DA7F0E63AF92C013D35AEE8B9B7E374BBCF64913C5C6A3EBDAEFC0CBED9F6F63C2A81AE3C2EE6093D6D0BD63C753D46BC225B3EBCA04CBDBB205B823CB4B4093EEE12853A33A6F3BA8443023DDB81C6BCD7205A3C8EA7DCBC55C0053D4CB33F3D6CA36E3C6CCCEDBCD4A8D9BC068093BA52888ABBA75A94BC73BBECBC507C87BD7DF41F3C12AD11B97FFEA8BC834B88BB5D9E4639FFB789BB3C5A4F3D3F9E013DB401383CF8A1883D7D09373CC903BC3C4BF8663C79A868BD64529E3
A78DD193DD641193D20A492BCC8F7503CCBAB073D1DAA033B9B6C06BC0430153CFFECAE3C867B123C0D8B01BE9D775EBD1E99653D869EF4BAE8F84ABE6189AABB7D361BBD6D53B83B602F403D6D30583C6A83D8BB81998FBAF673323C6B2887BDADBF47BD4F933F3C279F09BDB35D4F3DCFF0B1BCBC26643D31A2913C752C90BCDD889DBDF4C5733BF88E57BD4166E13BFF83163CBDA03D3CE2A8983942731F3B2A7B253C4C0413BD8B7F333B726BF7BD2A7B593B8860EABD0927CB3B00B13DBB8DE88C392E54813CD35F0DBD7C20EFBCE2F42D3D4AE34C3DAF6ABD3C50AE83BC11872C3D3C05813DF8BDB63DB8AFFD3C542A393D54E2443B751B98BA9163993DE648B23CD810DF3AB082A13BCAA4023DDEA8B13C10B282BC4E67BD3DFEAB003CE54099B9EF41433DFBBF22BD9AE683BADDEC81BC7BDA053EA9C4493E3828A8BD6BAB283DA4D2773C1850B1BC7666D4BB405F0E3D74409EBB3BED86BBF55A9F3B6F951EBDF1CDED3C0CB73CBE77F24E3D5C48883D5D8EDD39D4921B3B86A23A3A339496B3978DA33C5937883CD1F495BD15B539BC3F27423CDFED663D7364113D1BD1A3BDBC08653C3A40FBBBB11250BA4D693DBCFC9A6E3B350253BC4C9EBCBDBD1102BA2583DC3B30889CBDB97A1FBD7AF6153CD02D383DD17008BE49161CBD0B491FBDC73F17BC3A5087390018973CCAC0F23B104B183DF58EAE3CA5D699BD582B513C5FA1F53CE2665FBD3720733DA824573D683DC5BC168C48B9B5AC60BD8D4EFD3C752455BCD6C956BC4CDD1A3DB2E012BD284D48BD7F4A073D1222A53CD56B653C9F63D9BC16AC273DDA6B2E3D9B23BB3CFC79513C677035BCCA33123EDCFE1ABAF5C27CBD1E6D693C061C3EBB871C573B97DA18BC69EE4F3D5A77663C2230773E34A0403BD837D13C6FB550BCF69039BB82BE823BC8CAD5BD45BE333D5FA944BC4D4ABFBC69856A3BEB3F66BDCB899FBD59778FBD838D153DBA81153EE532663AC314E73A5DD7E1BD4C508E368C0F033E75BCB3BB10D15EBC0AE8BF3CB035BA3C9487D93C950A923C9DE752BABB34A3BDC613943D2623593CCC16A9BC9E5D823E21CD6C3D724FB43C8CB1373C5ED09DBDC09F95BCC69AE23B66E6853DE49CD53C3EBA373C55C5A03BDA84BCBCE47F483B536EAB3DCDC982BCC18BD23C22B0003D3AB221BDC5915D3D6FA3B33AC2F2763B60A8C6BB1EB7B7BD8A1E553DF8BD0B3C19936CBC0BC3053CCD4A213DC3EAA13C5BD20E3EFB77B5BC6018883C784260BD9AC509BD5DF7713CEE9E853D3A889B3C966B5A3D440638BD0054863C53620B3D69280BBCB9DD14BD5EAC20BE76C5493B5EF8D23C631B463B46D2193C9ACB6BBE44617CBC011D62BA3C3A9C3C2730C03C6B350EBBA1A580BCB0E6683D314A83BBC8A04BB
B0FE1003D84D2043DD48EA63909A5D13C696FB1BCDE3F77BA7C4EAF3CABB8063D4E49003DAF09EEBA01CD1FBC4609A5BC120B233C381576BC8208BD3B4079483DAFF614BD3507623BEA66673DAAD22E3DAA08B13D6BE4BABAFC9458BAADA68B3AEB3967BD6C6D4B3DF4BE9CBC41A99CBC626B733C7EA9683C81B034BD0449F13CC5A636BD90ADAE3DA3FBD5BCA5F5DDBC3564253B6AA92FBCFEE9BF3D2C87BA3B6391F4BC9FC783BD118781BCEAEBB5BACC406EBD6B15E4BCD9AEA53C455FA5BB6FF0E83C6BC6D83D5FF1ECBD28E69ABD8C3153BCB9E233BD35949A39A0041C3C9BC468BD7956253D3FE4333C2E26C2BB98B3373C9DD38E3C9A23A33B3821B33CC36F04BDB002D03ADCB969BD033F6D3D97C751BB8404633D400267BB9806BF3943E1A1BD36B4073B54A1C7BC7BACFABBF90801BC19AB8BBA0BE8813C5605A23CDFFC693AC367D2BB91979B3CA60DB0BD52045DBEEF38063D12F3973CC293853CCD2C7CBCFC754DBE54959E3B9DD40E3D58FC183DAEB38CBC3AF2BE3CD7ED8E3DF19E903EF090BE3C6D96EC3CBA28D1BC44CA5A3C95EEEABA2F6E67B36E0D36BD723654BCF8FB33BDDBC9FE3B11190B3DE6CBF13CE6D5873D389928BDD472B73CB679E63C3557933BD1979CBD21B5173A6FF62C3C49E604BD78592FBC0307543DE8BC11BD4D5E4BBD367E64397820173E430369BDA27C62BE85F99DBD797C9CBDA77B433DF3DE4D3DD26105BD532C9E3CEF9DF13CCB3C94BBD0F075BD74FF663C370407BD5F9BB73DFA164E3D5BCFBBBCDA86B7BC5FFDA83BD713763D88F579BC2D040EBB8E4E713DE3650DBD4A0C0FBDA5D833BC483BC83B05E639BC59ABD9BB1CBC7A3EA829103DF458C53C19113ABC944E033D4562B9BD2109B5B95BB9F1BA2F66753C4D2A813A357E183A844C9DBAC22CA33CFE5E013D308800BF709969BC3863F83895CD2A3CB61A013DC88D723CF30CCEBD2F06B13C140A1F3D804C933C6B2B183DE8C5C3BC48B472BD727D543B476A2D3C495A00BEE7E47BBA2CA3613D42013DBE1949A0B6A0FE53BEBD30E2BCCB3FF7BC4B03ADBB51A1053DA2291DBC80C6CD3C9470863B859AC93D64C3343D05751E3D4BD9DC3C06399FBE2D9CEFBDDC7D4BBADDA17E3C5EB6193DD0EA03BE192B9EBA979B4B3DBDA52D3C34662C3C814C57BCF3A6223D3A71B83CAFA75C3CDC262ABD570F9DBB0E6AC1BD8099593C381B4E3D93E6FCBA1270FA3A47CA4CBCD6FB283AFBEBD6BC75B443BCA547BE3A5DB9AB3C38B3933CCA4A4B3DD54B4FBDDD7581BD6946F93B4F31DDBC9F9A30BAA4185C3C5B9E21BDB26F68BC1AE03D3C084624BD337E8ABC3D98013CBBD9A7BAB3CB63BC3FEFAC3CC9A0D03BFCF7563BF6B7AEBC9D76CF3CEDC420BD393F683B03064ABCB253A3B
CE61258BC5EA4FEBBAE3B5ABCF54806BCB3DFF939CE73E0BB329907BDA529FF399291013DC755DF3A89B4B83C95F7CBBAFD04AD3C0544A1BBB2F7A13C8D7E293BCD9B993CDC54B53BBBC284BCA243243C8D04AA3CBC74D8BC1D0DA13C30AC14BCD7605E3C31C42EBCD392EC3D8E86E9BB3D5142BC2920D2BB2C1A81BCDF4CA93C73B3D4BA14A0A03C2286C63AC44DC6BCD2EDA53D5B2BFDBC3FFE05BC857693BC94DF75BC7A8B86BAD08D49BBC962B2BCAB75DC3D6272143B2FE44C3C801870BCF2553ABC3876823B63F984BCD02D903A2ED332BC51AB2EBB5C5589BBD51C013E14FDA73C0F4763BC220DCA3C1473A7BCA74006BA93BC44BCB767A93BB69517BDE67787BB1558303C5572783C3034053CC4110BBD837A1DBCF76E8BBCD55FF0BCC3A8C4BC0919E2BC4B175B3C1FCB17BDFBDB413D5C64473CA3C53BBC8646DB3BDEF6C8BC62F3953AD44D09BD2F78ABBC9F0BC2BA6AA9083D4FEAFFBB8208A93BEE5101BC4A9595BD859B22BD35209F3CDF9FF7BC606D20BDFCF5F23C6F9F8BBDFC5F11BD9B299D3B07B77EBC04F141BB0B3719BCCEB5F9BBAA7D303E7BC529BBC56303BD7AF650BAA6EF833B53D4D33C0B958932C64A063D697B27BB35D44FBD9B7EB33BBBD6F4BC31B706BD4F91443B37428C3B94C46CBCDED4A0BC8FC8073C40A8CEBB67BEA5BBA09669BC96CEA93C8A0F963C70C91DBD39CC843D6AE0B43C6B1885BCD56F863C1657BEBBF8B3C43C91C89ABCEB28973C410E2ABCBF4DE7BB47379BBC165C9B3C600B28BCF6032DBD2753F8BCA3487BBCF91BFABB16C7E8BC0E0DD9BC35E7A1BB067D5D3BFF94F1BCA3C32BBDC7C198BC01DF21BBFCD17B3C5A9F143B1276533CA0C28B3CB8FB4DBC0F50FB3B66EDBEBA9B643B3D1D6805BC4341003D91CB9ABA9728593C07AE95BDD5D5253C1CAE333DCEA96ABCC3B1CDBA4DFE1BBC5F3F303CE1A0C6BC5B86F1B935A7A9BC0F3089BB561C4B3B5A9A633DE08B62BCC361433D4E5D4B3CA82613BD66FF9CBCBA722ABD5A723D3D9EE1A83A05877FBCBED9693C36A2003B4A4890BD99D6BEBA10361B3CEC99E43C0913E4B52BA9F6BC69089E3B75B63EBC9820E53C61C8333C7C3BD2BB7AF3EE3B3A5B78BBB8D7CC3D45E71ABBFA91BDBB679CB8BCBA42C9BD6AFE4CBD9D2AE0BBD6A43EBC1699C7BCA5483E3C1D5EE53B169C63392CE3B03CB7B4E7BC6ACA52BC262B08BDA6BF92BC144A473E8AA7323CDADBBEBB9F8E41BD8C4002BC0CC54ABD6334C0395751FA3C1EFF583A73D3FF3C88C562BCF7F6453C91BC023DCAC2A8BC4B1F9EBC9791B4BC418680BC82B966BBB603253A160964BCE716EBBB9CFA733C01D260BC7142983C35AB7ABC102988BB2A34303C7EABB93DB6EF02BDC26C163E05AD70BD939D95BBE194B03
C7F8BC13C841C1C3B77BBC73D77DBEBBCDBAB0DBDC64B0C3CA18D4E3B6816EDBDACE333BD28C4D53C1131FDBBFBBA913DB0BAEE3CEA8B893D211A85BD3D6205BCE7099ABDF4F248BBE2F3E73DAB93E839A87EE93C482CA9BC940485BCDB39E6BD160A60BE8D573EBDA19A89BDAEC8CA3D7B57353DCA1AB83C7578B8BDFA616A3DAC59AB3CA91BC6BB32FEED3D76A7593D2A6F7EBDFBD2893CEC8F54BD3BF79FBD3586623D2ABCC23C42D7933D5474B0BD179F8F3D2771D2BCEAFCA53D776235BEDFCB49BD73DAC43BAA55F1BCB47ABFBC04E5093E847203BD2DD1D2BD147A38BDF3D8323DD0D7593C4B76533C1E730E3DF01EDD3DF1C5B3BC825E623DE647883C8121B1BDEB192E3CA3AC05BBE19779BDBF0CB1BD2451CE3D0C050DBE87531EBEA84147BD578D363D74C11DBE806F5EBDB38E61BD95F9ACBDCEFD103DA8DD92BCDB1396BC3CBEAA3CD19385BD8CE11DBE655A23BD3A25E5BDCFD48BBDB32C55BDC941733D8DFD2B3DEBF047BDCE2508BD69C15D3D513DFC3BD0EE98BC980EBFBA6E6680BD0E68ABBDDCF2653DD202163EAC07DDBC0E056FBD9020D3BDF683543BC41FB83D8C35E4BBCEA7F93C4EDC8C3DE65C23BDCE40023DDE470B3C3C7E1A3CD8B4C3BC83F15CBDD9FE5D3431EE43BEEB8ACF3CEF5F82BD99BC453D821A1D3D10730CBE2A54E73D1AA2853DE15BA03CE5E962BDE0FB95BC8CD1DD3C3DC69DBC6226383CCC9845BE1BF548BE1F936C3DB6E990BD778739BE14C8573D08B3873D468438BD34CA9CBD39E91D3D79B994BD420F8C3D8460F13D875FA0BCE0BC0EBD03B8EA3CE312B33DEE945DBD4E069CBD667360BD40ED283D4954DE3DCF45063DDC4F903CCDDECA3D779FC53C28F005BD6ACD1FBDFB09BBBDCCD02DBD7263043D3071B7BDFB5C8C3B6E3C2EBD9ABA123DC82C92BD36C6AF3D8FB035BE05DB25BC6173B2BD3D1734BD000203BDC4C0D83DD0C5C7BC51BB003C9B94953D05B9153E1FB14CBCC444BE3D2B6CE6BDFC46063C6A9026BDF65031BE521F6E3DAE240FBD6AEE0EBB80EC203CD50AA03DD01C623DC05F0CBC194ABB3DFF89033EC3E4293D84CF763D5F407A3DEEE5A9BB650D1A3E8F0E7EBC83948CB77AF7003A59E2EE3C869F0C3D8513283D2E3A103D8B48003DDF03A7BD16E6D4BB799B42BC7B1AD73D23D81A3E6025E33DCAC6183DE7C135BDE87096BD7B499EBD2169843D431F0A3C524E2BBE9D57033D658EB2BDDC3FF33C2E60B6BD083D863D3B32583DAC671D3DD096E4BDA7BC22BC4763D2BD4472003D10B4113EBCD9E5BB39E6AABDE92F2A3D348534BD921DA0BD9286A9BDAB44FDBCDF09663DB4176A3D9317B23DDC2DF13C1922C9BDA94645BB9D51073E6598A83D019C213CC83A38BDE7F0D3BC8675EBBC20EFB7BD86CCF4B
C6C54B1BCF97619BCE9A652BD6B1CFB3C8FF58EBC5D3B8C3B1A99CCBC6E37A1BC1D1FBCBCE615223CB7B4AEBC964892BCCE745EBAD4BEB4BA296FC0BC2F9FD7BC5CC8213CC2BC8ABB208604BDA384033C45F02139300E0D3CC579BBBA7D9D64B93BAB1A3D3649163CF85705BA713FD33BB5A5E53CC04BDA3B150AA8BCD5321A3C200EC43B1519E1BCC0FF8EBD5700ACBCD2F5033DE5A111BD475AF9BD4A023CBC74B3CEBC214576BCBE16023D5533DB3C5235CF3C98579B3A19146B3B6C1D96BD10C958BD4A9E3FBD7D36C7BC9714933CF923A83C4491FF3CDAF84FBA8C1CA9BB670D8E3C59AE303B78629F3BC78DD7BC336CF7BC9BC8F23CAB8507BDC61E2BBCCDB581BC67846BBD00A5E6BCAAED08BC702E1E3D5868C0BCD7A5C23CE093C6BCD0A6E2B9D53766BC94CA443B6BF861BDAEF91ABC35395D3BE2E9913C21789A3C0632673C29A949BCB1F24A3DC893A73A9A8D49BCB02CCFBC68C7EDBA963F05BC08F85E3CC2C5483C1C7E1F3D6AA91D3D3BCE6EBC4B7308BDEDFB1C3D1BFA39BD533F3FBC034C193D961AC7BCCDEF553B7F2B6CBC47C1C93D99AD803E2D9643BD2E9622BC36CCA5BD6A61713D91DFAD3A284D18BB77C14DBCBD80F5BC25FEAE3AE72E2BBDA2ABA6BB8DF782BD0C4397BC577586BCB4BF7FBCAA62DEBAEC66ED3CE4813CB3B750DDBA52F05B3CA9AB3D3B86CAADBB49558ABD2DF16D3C7527A5BB7DA05FBD647ACB3B9E541ABC7563EA3B1348CC3B59D2C5BC0C88DEBC1878723C7158023D1E09283DFF4AFABC3C04A13BFAAE1EBCD3624D3DF335A4BDF807283D26C1F9BCB2A70F3DF5D6E9BC5AE4A2BC45EBDA3C3357283CE4CFD3BC7F5AB2BC18819D3CA98F203D41FA9E3C7C42EB3C2B5EA5BB7E7AA2B92A8FDA3C03C154BD71E88A3C5727BDBBCB2290BCACCAE63C8249F53C324508BB9003C63C985ADBBC6C89853CFA461C3B8506B8BB3FB6D63C2B36573D7E93CA3CD81CA23C8B037C3DC198F63BB80B763B2AB6BA3B872B7FBBABBCD8BCD9826D3C231081BBD07B303CFBA9263E8153673B3A1D64BB6CD93F3D891400BD0F964A3D6B0341BDE79219BDB32408BDAD73F0BCA9987F3D52A9063BBE1E23BDD7E53D3AC0FA5FBCBE7AF73D98F04ABA8985823BE592E13B1DAF0B360D410F3ECB7C47BC46F8123BB9F4913DA996D63CD1DC24BB60B0123C727780BB5610ABBC40CEEB3BD906003C17B9C4BCE6B3213E826EBA3D3CDEAA3B987A863DB3F326BDAFD7063D8043933CCD130E3C3C74F83BF198D5BC3C25CCBCC41C91BC871F27BD5C3E803EF2183BBBB23EDABAF003643C49609EBBD2B6A53B10510B3B6FC81D3DFD0B29BBD74893BD3392C2BC990A3C3C79B7323D55FF29BD6DA8EABBEAA4A4BC3412043E328F96BA895342BB4F54813CC2F9B4B
C5D99033D1139D73D8F84C33CBF2D84BBAE24223B5E0B083BCEDBD03CAF8CE8BBC083533E9EDE183C85895DBB2E849BBDE43E0DBE340831BB87F4883C123F14BCB04C04BB941DBD3C582C3E3DA0A21BBDEDE3C8BB4923F93CD2F2BD3C3419D7BD1C8DA53CD48A003D89A4603BEFC2093EA821F83D54DA3AB9B0A75EBDC13E3F3C7A21F43DA8D972BDC894123CAB88ACBC1987003DC855D5BD7C585ABCABDDDB3CAC1C2DBCE3668EBCFA09AA3D6225833D0002E53CC2F22C3AE4360FBE79CBCDBA66B0AC3C36E4DFBD082CECBD527FC83BBF9D88BC91E1B2BDE6AF4E3CC484993C515D043C5E1D163DD1A10FBCB11B8D3DC85E48BEC84B153D4C77373C1376A939028D6EBDA176833BF8D6BC3DBA8198BCACA8F5BB2B0F0ABDE1A3933CB245ED3BBEDCD1BC50F810BC27A88B3B0FB72C3DBE064FBD9683563C8F847F3AA90D85BC1524BFBD185C0A3DD882573C0234F3B9E4D73BBD2E10923D8C71B43B7881A03CB138FB3B2FAB7F3D75CAE93C9F281ABD742351BC611B943B93FE503B6522F33D195F07BD2C76843CF25A34BCA2A3063902B942BC4B4260BBF076C73DE486183DE34F4F3D06A4C4BB51634ABDB98BAC3C5C73DF3B6AE2643C2DF30C3D44A5FEBD3C729BBCD8A5B5BCDF7B14BE3358B93CE4BD643D05931B3DFD63FDBDD32335BE01E364BBB885EB3C914B13BD205B073C0A21463B083B393BC799C5B3EE5C64BCC20A3EBC03EBB1BC9F5DBEBC6281043D3D30843DC84A383D2864633D11F908BDCFD7D33DFDEAB0BBF329133C07597F3CA4BBD43ACF201C3E3BAD6ABDCAFF97BC6EDECDBCBF8E17BDBC1AD03AB8A9453C04234F3DEC10DB3CFECBEEBDDF0063BDFFED003BDADFC23C34901BBED259A7BC6ABDD8BC12FE493DB26683BCA23416BB2A4983BEA553733C81A2253D12AE50BC55C844BC614C9B3C6CFD01BDECC1ABBC17A38DBA2440A8BD4CE267BD425E873CD9F4383CB607033E6BC284BCDBCF6C3C88A0B9BB46A01D3D3FB9B03D1EE6EE3C28A0BB3C314FFA3C3D6DF7BB6359F83D7649A9BD5A39C739015DD4BD53EC78BCA52A2CBE97971FBD1E640C3B137A07BD593520BC0A2E383D3685D63CDAF644BDBCF90B3C8F8B69BD95EBAF3C5980183DFDBB1FBEE6425E3E327FF8BDCF5CB4BA3DD1A6BD882F253CFE8183BA7455D83DC75CD33BDB36CC367BF8883BBC06D0BCBB6A883D39F907BEF05964BD07FDC7BB6D90973DE5700D3D675DE2BB458A1ABC955201BE87232ABE883C833CE04C04BD92B301BD2E96FA3CA98B543CC511A53CF11A36BDAFA598BD282CB9BC4FF7C43CC70E503DD2CC923D6E0B74BCB41FA4BC4ADC913C61F9313CB468B43C08C0CABB738D80BD8E779A39AC369E3C351C823B334CAF3BA9528A3B553A823C67AE19BD5CBC2B3DD226A23
C54E54DBC23356E3CC77E88BCD7F4C8BD804FB93B6176B5B93922753DAEEE533C2D55B3393C4CB2BDEE7439BCFC3818BDF6E0B3BD8FCA7C3C55D46DBC6A4B5F3E198D223CA94EC9BC112BC5BA69F7A3BCEF23843AEA1A383CBF2EF5BCC7A924BB8A5AE6BC28ACDF3B7D3E68BA68FF193C2FD0D73C92D34C3CEBCD3ABD85EBF2BCE5998FBCEEE0763DC8858F3C5AB0BE396EF3803C95B984BBFE95D03BDD0629BC4C8B483C97705ABB90A0DFBA5019113C01A7A53B3E5692BB7566AFBC0B4431BC3AE029BC85A79B3B06CDF4BD911E5F3CD623A83DDCC08BBDCE0948BE0F9B593B18EA0A3C8797A73BEABBBF3A88A2B4BD2BE48B3E0D8CC23C8F773EBD98559C3DB8C9723CEA8CFFBC725E943B7C68C5BC9DE1F6BD1F49C7BC4BE3323D10968ABCDEC46DBD0A5656BC297C6A3B34DF08BC3847213DDE57F2BB9DCF77BC698615BFA902B63D68F0953CFC90FBBC34BA6ABB3FF1EDB99569E2BD063080BC30942F3DEC85FB3BAB47003B88CD5C3B22CCF93C7188423BD7F8C8BC5ABB80BDE05E4A3C4C684EBE3AEBFDBB1608D23AF44E0DBE64FD533C490882BC21DA8E3C77915C3D9F5832BD9A3483BBD10DD4BD445AEDBB53317ABDA790463D2447BABBC4CD813B29A7C13C871E8F3C96E28FBEAF45ACBD243BF6BC7B4FF73C04568DBC843309B97197303AD55B413C1EF6513C86B93F3CB905063CD88B013D2E5BCFBED40769BC1FC0063ECD71AB3D711EB83C6F3E78BC80D41EB37493E3BC3B22FFBCAD100BBF1E51F53ACED5763B2D155E3D6C1DDF3CEB27E13CD932BE3B38D5E73B23975ABC66A7E93D696A01BDA1DA2D3B539BAB3C36C7F3BC60A679BDAA860BBE9D17F03DEFBCC5BCC8938CBD7531403DD43D2F3D8D6661BDEE15443DE844ADBD0F1504BD82D128BD4102B4BCF747623B6B1F51BD11F6D73CF0066DBD478ABC3CDE66BCBC7CC441BDDB1A413BC41A023C67B30BBD7F9F90BC1EEC7FBB0BA9283CB4C1D03CD378A43CC07494BB95060ABD2740F53C01058FBB270D993DB312593DA5FA853C215E7A3DDA51933C5C57C43CF0BB953CD70D21BCA6FDCA3DAD88E0B8E949ED39E512023DE292343C62460B3C8FEE103CF8C0B73DAD6291BC1825453CA445B13C7E2C0C3D74EA02BB15F5363D9B711ABD911D273B37163EBD749D10BC197E6F3DAFB977BB1FD142BD72E1DFBC96258FBDA311ACBBA164B53D8026543C346653330B0B3C3D62F413BD623C3F3B6B215CBC1DCD34BB0F629A3C406BB5BA0BED463C6F69DBBD52621EBDBB8F103D14918A3BF1EF4CBDF84D633D7796C33C163816BDC1E28C3D768E873E66B6DEBC9B5E413C5379E13A615B5FBD6E6BB23CA6AFBBBCF938FD3C6F9E653BA3362D3DDCCA613D2A661ABDC4CA2FBC4CC3E03C9B8D4BB5D1AD293CC4B3373
CDC6704BE7A07C3BD2C9C793D6169A8BC6A0E153B19E25D3D600A103C44D9ECBC16EFE13CA3DAE3BB942D16BF63D3B1BC868538BC2D6960BBF59E9F3C633E19BCB7DC333C36D75B3A8226033D3EBBD03848471C3E6937CE3B8B034D3CE3CD67BD1BF248BD1537193C29F799BCA9BE89BC4A8204BC7743663DB792F7BC4C082B3D5E3AE9BC849C74BC7B515BBA718685BCA6F2973DB9C408BC86ED033D1F1CA4BC7E43BE3DD7AE753A8BD37D3DF4F3593C4B4F0F3E79C92FBDD6F2843D017D9C3C421E153D23AAACBDC0DF253D22855B3B7579B1BBFB6EC0BC9BF3983DB57176BDBA2F9F3A0FD7FBBA37809BBDE663EBBCBB01CF3CC51EFBBC7CCF2CBDA61870BD86BA3DBDD39FAB3DBCDC533B1B66BCBD96A1D03B22F4C43D0A399F3BEBEC9C3C65C816BDDF5484BD73EF123D68AEA1BBE3C739BE780E163DB9A0723C1D8B513CE67226BBBD70573C768A0D3C7D77CBBC204C573DA2F2633CCFE9033DB9E7FB3DCC79BCBCFD171F3AB58307BB88E98C3C8FD2CCBDC880F93B6A7F53BCAE6B91BC0BED82BCEB5A0A3D5E0A863B5DE8E8BD3F7DEE3CFCBE0FBD367B343BCD2E153DE903F4BCBE0295BA44C26EBC075B16BE74179C3B98E6D03C2F886EBD154613BE388E35BCD624CB3C532D4F3D173E0B3D2DA8A93BC622D83BEF67833DA9323C3C09F9593CA5FC4FBB70204DBDBF7AFFBD4A729D3BCD7DF93CDE9C94BEB5B38D3C919E803D824B9A3C0A0F90BD4EE203BEAD205C3CABFB1DBD606EEBBA3FE9213C8B0CABBA01FA0C3D1FE3D1B31CF324BCBAE2B4BB37A850BC252AC9BCD0B00FBE2CD9563DB57D1DBD60FC23BE0DE3C63CDACAA93DC11186BAF16ACF3C0041D1392F17803B5AF9353DD42F813DEFA11ABBA7103A3CE736893B1EC5063D910776BCCA85CABDBB1BE93CA4C70E3EFBDAFBBC6668773B1FCC833C3590FEBC9BABDE3B56DEB5BC5AC8AABC63A7AC3C18BF563D92672CBE3250A3BDBA2CEE3DA0729EBB11D2223C2AD437BDE370D33CAAEA203C7F607A3C4CD98BBD82F1FAB9D5ECB23B0F30B73D0826913D60BBB2BBFAEBB6BCFFDD843C652A093D4527AD3D64DB18BC164ABF3DF9D4273CFA5CC4BB20AE283EF0E3083DE21123B912ED8CBD210C523DBEA6EABDE321A73B129EAA3C132C21BD3ECB1ABCA65E6C3CE271BE3C6728713D986F65BBE93BD6BCA11CDB3C13CD1C3B969BC03C40510C3EFC092EBE0FE81FBC52C65A3D70EC003D4C6B66B87B7F703DCEACB03CF2D370B6D588DBBB22BE233D2798363D0DF4303DC9F080BD76F11A3D0F22BFBD3E90A13B44934EBA87A905BD5A048CBCFA56D2BDE33F02BD2748A33CE1E25EBD1FE19CBC7CE165BC772B093DE284013A2A853CBC790598BA812BC4BC799D6DBC5CFC8ABD876FFABDA62782BCA4E78A3C09501C3
D907943BB9F4704BDC0C524BEE1F20EBAD911283D483DB63CF00E91BC99D023BB839F063DF6162BBA23F777BD52A3E4BD55F780BCA811483B7C56903CE2B6153E1B38963B9F94233C8E40143D384B263C0B5337BD2AD324BE2D9022BD39D85D3B39ABE83CAF9D7EB9E59397390C4DBC3A3AA8B73B92F2523C8C896ABDDB9DDC3BBC817F3C54D011BCE87984BB1F3D903D8EFB263DBA761CBBB46EF0BA71DE32BD765DBD3A83FFA3BD593B5DBD9EF01D3CD3E9F0BC28097C3CD914C3BD9A6738BA879480BD5E2EDFBC066D163C37E33C3D706D8B3CAC22CFBC8EED24BE7C46F93DC11D1D3D5418003934467D3CD2B9D1BC568931BE859FD8BDCC2DFA3ACE3F5ABB8DF392BD4D53073D92729BBB63FCC2BD788B45BDC17FD73B81A7A83CBD18D53DF1EEDA3CF6AA32BC2060323C543687BCB4C2313B8EE601BD7A6C4DBE028666BCB6117C3D3787F43B4165A6BD830A7DBCEF70103CE0FEA1BCD39F1B3CFD4DBA3CC21E14BC51A9B63CC69D5BBD0A79A73C3936C6BCEBAC743D7896563C5F123BBA083A433ACC80FCBB8CB098BC9F74123DE3F58ABA4910DEBC24C933BC2CCB34BE3948A7BC921DF93C505294BC6C9D20BD5D7183BC1AAE09BD12C7D33C68BADE3C360B0D3C1DDED1BC57D5A1BBF36E94BC407624BDF17E763C2D8AE93C6628B33C43CB28BEE60A6ABD4A8B9CBD90D8CE3CDDB14DBC26766EB6312CC1BB5D39EABA2E0C0FBC7913543E080B6F3A4E5A133D3BC580BD51E4A2BC3C5D3E3D274D1A3DBCC5A6BD5497663D995129BC8DD7473B1E3F83BDDE62C83B61FF7FBC1BAE71BD1DE979B326378F3C1A32ADBC7AA60E3C2AFE8CBDF3FD3EBDFB5E81BC3BD3353D492462BD8406BF3C7E7354BDCE37A3BB84D550BC01645C3C5A3CBE3CCA0CCDBD0EFA8EBC1797873CFC70D8BBDF2E683CA4F0703C37B0393C97226A3D63BD2EBC27F7573D1AB94D3A469A6B3C3A5B8F3C908F9CBBE6E83A3D7E6008BCFB764A3C646E00BB02FEB0BB0DF0713D71B97BBB00E1193DBFBB983C9B096ABD5C2E2F3CE4E4B2BC1C432BBC4942C2BB502F96BD821695BD34D50638538CA43DE951A6BDC98F90BCD758EE3A07A38DBC681EB1BCBE9C62BEB90601BDBBAC3BBE1713053B4B4BFFBB6847103E410D80BC14208937F64783BD52FB3DBED0BF043E0ADE5A3C866EC6BAD152E33C7F8F413D11FD5A3CD92996BD2A8B50BD6FDF8B3C5976E63C22D2D7BCE67382BDF6A1E4BD3D79DEBDCF486D3D0F2FD83C2A41A13CA6BBD13C0BBB223A57D9473D9352DF3B0AFFC337AC7FA53C82E62E37D1E09FBD13547ABC9446453D2DE4443C5544103CF7BE7D3BBBEBB7BC39389CBC1376093EF5BF9CBC94A75C3CF279433BED94513D9189FC3CA307243C859FB53C572E953D7652723C6502A7BC2EE3493
CC204E2BCB500CCBD57E0723DA34771BC7CE473BCF77115BCD19B9FBC8372B43B486A8B3D64D2DEB912F8A8BB056D55BB84C7E2BC2A8EE1BC7498A93C77C05E3DE3ED153E54469D3C579707BC9EF4803C5791C3BCD2EE14BC5774113C72F282BA88E8CBBCB71BD8BC6CABC63C9B01843DCD3152BCAA83D6BB10FCF1BEA0549BBA869CBE3D5C46153DFB319D3C2EC01C3D68FD09BC6B7E6A3D4E5384BDB569973CC700BFBC74DE293CC70927BC327FFE3CA4204BBCF019ADBCC76C72BC1914273DCCFF9E3C91ADD5BC4B61C63CBB482C3CD92686BD5B1277B90323133D9166EABCA40B813CC06F7ABBE647DABAF3F980B93FCD68BC7A1A13BB352CA4BC1186CABCB54D1CBD2670993CF043003D47658CBB579284BD75FDD13A218509BEA0D0483D9A60E03C89EE6E3AFD8F373DE98E99BA3222E0BC3993A33D4710A43D1B0F11BBD0C0353DEF884EBC2A70373E430BB13BB46607BD10BB933C6864E2BB0A5AA43CD746283DA3C0C8BCDA82573D524021BC793BC53CD7DD46BDA33F6A3DAD1CDDBC88ABE3B8BEB138BD0410E03C3B7D7C3C2AF80D3C2D26573A2203F639BB1E27BDA890023D4D8E363D5407C73CC7B364BC895B1C3D75DA733CD3B080BC54B6EC3CDE761FBDCB0E33BB18FF893D06670E3DF254F6BC740B803D42B390BAE562FA3CB95F1F3E09ED003D8F1805BD64DE0E3B301734BFC82B87BD2B6949BD86D834BD72AC12BD76737CBC090C573CB88B6CBC58328B3C7212AE3C38ED143DBC60C1BAE649F43B040E03BE86A4A13C0CB6053D7EDB123D0A2A203C1F7BDD3D179518BCF534CBBDF045A33D2A01DC3DF0604EBE6157A5BC73F369BC8F0BF5B36B23BEBD8C9F033B9FF83B3C485B173DC6EBA33A953FC9BDDB229E3CEC4CDE3C98A1C03C26692D3C87CF88BAA26BDDBD8AB5523C3310A83BBAFB14BC333E58BC258804BCE4DFE2BD004DAF3D39D75DBC7949B5BC8549853DAB83B73B46C39B3CA1BC0DBDC7FF68BDADFEBE3CAE1A093DDAF0DBBC909C81BBC31208BE77D2643E3674A1BC0706393B6FAB90BDCAA77D3CE10B99BAE1F37ABC1761BEBC6DAB0FBD7561143DC65DFDBD8E81423C86A0573CAAA2293E35E406BDD80D5E3CA2EA85BDC3B1973C9545F33C198072BCAFD5253CE6BD5E3DB4E88B3CD96A543C53B0D43A2ABA193D3BD8553D9ABFE2BA07DB65BC24B2163D019FAFBC714D36BCDB7BD5BB702C98BC6C3F05BCE4DC743C7CE04BBCA3698F3CB151123DDA0DB0BC08CE903CA5B9CEBDB165B63A573DA8BC6535593C1E70B93DB83304BC9CA3CB3C45BA8BBA8D3FE03A0CC589BB2D480DB7F68B0FBDCB77503B8DC3E7BC311046BCAA6AA4BCD251B33C8D1A5B3C629E963BD43DC5BC013A8BBB4BFF303DFA6B67BC5EBECC3BA832F9BCC1E18F3CFA25B6B
C7E39AC3D2CBD9B3C3A31883B0C364BBD2A93CF3BA2D301BDA72C43BDA4BED2BC9223B53C11ACAB3A8C80AABBF73D11BD03FFA6BEE2A351BC78319C3C7BBD6B3B37F7B63C6D26C6BA2320D4BDB44FE5BE3EA55BBCA56FDB3CB4F79C3B1F4E7EBD89AC78BBB3C698BB64A0953C9433123CD29492BDE84CC13B4CF6B73A0951BF3BC436EEBC4BFAE23CFB277B3C497AFDBB2D90C23D3A14BB3CFF1E13BD50DC0B3D8B75CDBC9939203C804913BD7B2F6D3C5C60373DDDAFD33A70CFFEBAD85CBDBB71AF5A3B94353BBC7FFF53BC2F09BB3CDDFA773C6A33163DAB0931BDCF1A553B56E42F3DC71E12BDF4D67BBCF17533BA48161D3DF1714DBB29D481BB92B2943C37C90ABDA155D5BB87766EBCE7AB023C8C8E633CE11E87BC318B933CAD53C8BBCB894A3D9AB06A3D6524273DB5C00ABB0119A2BD6F3A74BC0FCE4DBD19F51E3D73820A3C5ABBAA3C6490B83BB0B3C03C11326D3DA4C9A23C2AFF50BCE246923D9A1363BD63390ABDE76150BCB67806BDCF8E3CBC10DAD03BD3FA963B48F026BDAFE73E3CEBEC5B3CF0E2843C3870EBBB58E76CBBB4BCB4BBC2E8B43DF3FB94BD4071F83EC6BD9CBDF1D434BCF961873B17FF4CBAEA3EA03B65BCCEBCD844E53C4DB493BA0F01993CBD34323C44371F3CAB9908BC3972173C14C5B13C0ACDCF39F7339ABD4D9D1BBDBA1738BC6DD609BD557E9D3982CDF13C329E9DBD39E9883DF258C03B204E13BD74B209BD01DCE73C79AD86BD37296FBABC3C39BD310917BA9C79D83BBD5480BDC7E229BD21AAA03DB327923C1C755CBD6EF8EE3BE9A260BD1A8A14BDB6BBA83CD4BE073D8105C43C1F42D8BBCE12DF3CA5AB0CBDE906F63BDA854F3D682129BD641A59BCA395953BA3FFDC32D0B2243C8208F5BA32EB51BDE7193D3DA343303D7618AC3B287FA93CC30FA4BD2B90693B376E94BBAEB949BBF4B169BDAA17A43CBA870EBC1C19E9BC94CDD73C5C6E7CBDB0E53F3DA38624BC55B8ACBCDEDB693D50CF01BC53A6C03CD94A58BDFAAF82BB2DB6943CB87F203D12B2D9BCD3E5B8BB790DCB3C33E7783CEDEFACBD52290E3DA82FC4BC32925D3D33C941BC0A8BFDBCF040183B6704B4BBC9228C3D325909BC3F09E6BCB3CA2BBBD69F123CBB30093DEC039BBCF680F13C7BB2943BC244BD3D10E344BBFC795B3D3F6104BB5D2E733C65D76A3B67978DBC87549E39AF114A3C7E8E633B467964B9B5C5353BB5F6A93C3207B13CC1CED13C2D77293ED25837BC4088DC3BA37DB53C9743353C7941FF3B6481153D7E69043C42333B3D54A83BBD837CA83D542BD7BBA46930BDDB733BBCCAE6A6BCBE36ADBD7853DEBA19BA0B3C0618D53A719CF7B634F83FBBFE9A8BBC53C394BC7F87DD3C5236533CAF86FB3AF93EB73C2AF01F3BFDA25C3C2050593
D7DB2183DF6731C3C007C043D5FBF69BD48120BBD99F32ABDE5087E3D8D97843C72685F3CBF87A83CB03762BBA26E9A3CFBB4903B36EEC03CB6683C3D86CDEBBA124EBBBCFA7905B7DAB178BDA9DC223D07E0C33C95E6D6BA0597A03CEE3D093DC6A7973DD51AA6BC2EA6CE3C655CD3BB0169273C61575F3DA3EE1C3B1FBB3B3C74FC86BDE04392BCEE21BABD9E9B213CD9190ABCB9484BBD4B4C303C2BD6553BC96F87BC05BF00BC47D28ABBCB12FFBB4AA2A4BD9B0687BCDF8890BC52C599BC37471DBD66EBC0BCAAFC7A3C3BF121BB6FEC363C33BE3D3D5E48813C3AA465BE14950BBD0351F93C286A333BD07C023E5984D1BC806BA43BE216523D6653DC3C4F710CBDDBC313BAB60D11BCD71324BCB022BF3D6C59E93D78F341BDD184BABCC2679E3C0F7276BA2CAB763BC2C6203C6793683CBDE741BCC7DD27BDE2CE1ABDB62522BD9CA8AD3B1403D33D1752343D440CC5BBC205FE3DFD1B963DF70D46BD752695BCCDC90E3E97613E3D80DE033E26452F3B236DB8BCD9BA5B3C020C823B02BEFE3C7461343DB5B49A3CC1296ABC37625BBCCF64E7BBC95C46BD885E7B3DC5162A3DD685953DB371ADBC2D2ACB375346B53CDBB0563B0E2B76BBAEF9533D634886BD07C0B2BBE4160DBA915785BB150C1EBD1122143DA8E437BD2A92CFBB8296A13CB0066E3C128455BCAE939F3C32238ABC53E7953DCCBA41BD73DD173DE4831CBD8A016ABBE0DCAEBD1ABCA2BB8B26383C50F2083DE5DD9D3B17FFE6BCFA52F03CB790AFBCBC0893BD6A98F0BD913754BE73284E3D6C384E3D96DA6E38FC6D64BC0A6BDABC06B66C3C109C953DE234F13B477B84BBB1B24DBEA644B2BC8BE5323DC1D31C3C27AF0CBD9210D7BD153A0EBCBF3D06BD7968623C1FF9483C82A8D6B95D07D73C4EF397320F99573D60DBE3BCC5BCA2BC3163D33CE0BE963C152B133D455F2EBDC0601C3B9A05923B3FF353BD99D509BB77E8F53B558CC03B6E2687BC1E4E54BEDB739B3CD43A003DEBFCB5BC72DD903DE9BCED3C7404D539A435513E7F7149BC074D5A3D7AFE68BCA707543C28ED1D3C4AEE97BD7E0787BC136F4EBC8E3A983CA68AD83B8EB767BD5B07133E9129923DF96DF3BCFDEC013D3C14E5BC54B20D3DA32F9A3D3C98E9BB967E463C507D62BDEAEDE3BDC68C2CBD2BC8B9BD202E3FBDB0D532BB19FAC33C5BDAB7BC2A67C6B911D2D43D3CC4BB3AB317EE3DA87640BC15F9043C627D083E995EDB3D4CED9BBAF5B536BD4BF1183DC35A893D1F058A3C51FDAEBC156169BDB1E38F3C47794DBD682682BC9C13733C5BC1B9BC381D293C3616AE3C2565D73C8277B2BD8398EEBDB448E4BDBA6EF53B70AE173C93E7BA3C0E4822BB462A47BEE7D02B3C0E9D2837D177533BEB7C333C9B6024BD0C7FBAB
DC396F2BC26575E3B2DDD693C6321373CDFCCA93A827E7CBD055E1A3E4D7A063D28D99D3CEF7E3C3C17EF8ABD981BAABB835C113D2E3407BD3948363DD19A2B3CE07491BD3D5804BE7E4A41BE4BE2343C47F581BDD752ADBC27151A3D239ACEBC150EBDBB8FBF783D44F839BE05CD65BA45829B3C19E3273C55ED133CBC8477BDFBF98839BC556ABD71629C3D49E384BD095DC9BD506D663D2A925CBD4755A7BD20CEBD3C429185BCE3875F3D61CD9EBC8C4B3EBCC99A23BED8D0BBBC75DFAEBCB48F843BF02BB6BBC17F833CA1F95CBB5DF69D3CCCF502BCF7A920BCF66E573D2878403BC63A9A3BD269243C663DB23B197522BC201B123D2DD7043D14F439BD0D414A3CFDF9B5BC4C97DA3DB71558BB8512003E7CFA963DC08E5A3C2E18173A975A9D3C904772BC5CE5823AD202663BBFAB1C3D9E9F27BBDE3173BC189C03BD409F91BBE6ADA53C5FD787BDF84EDA3C88B7DFBCD958AB3BAF2566BD44CDD6BB98782F3E8F677B3EA5C4DCBCB923E0BC04AD9ABD1400653C679E66BB3430603D15FC1FBCEC610B3D4FF965BD952458BB0279B43DD4B2C33C2B9E3B3D2203793AD6CA3C3C823D243CB18C793C141F2C3D0C3881BA8917C9BBAC11A13DFB44BD3DDC6E94BB57296B3E525C56BC33F1D5BCADC23B3BBA86E3BA11B6933C5BA372BB68A8433AD69B18BFF51AF83C92A5FABC6BDF8FBAE3C9FB3C3C8636BD26C2813C1771C0BC496DAE3C103BD23D6F1399BCAB0A093D457B2ABE563AC6BC6067403CA7790E3D9E818DBB0E6913BDA3DBEABC6DDC293CF3A798B94F009B3CCD0F3ABD14CF583C3812373C2ECF44BC3D1531BDD6FE2BBD2F2EBABC9A737EBA326532BC063CCBBC96294D3DC1D89C3CDB0A5E3D66F8683D7F25E7BC92D7063D604DB5BB9CCB18BE51A0043C9658713D341E4239D614213EA366C93C0A222FBD0FE0FF3B81F1D0B3ECEEB33CB9B1ADBCE07A103D7DB95A3C47E087BC93C4E83C133A86BB5DF0953CF807F13B62A6313D856E833BAE2BD5BD24C3043C5F76B43CD108033CC25B6FBD41C0F4BEBD53953DDD0F953DBF569B3B8E84693CD11CAFBCD29BCD3CB03EAA3CDA9C003E7B37003D627CE93C0FF3613D7AE9C53B02550CBD397244BEAD91613CF42626BF8732BBBC3B9CB53DE10FBEBD4999B23C0CB901BAF2B7C1BCA4ECCBBD3B83433D5E86823D2DEFF43CA143E03AF141D6BDD4020C3C1CC605BC4F77083C3949DD3C24B7263DE6E6D43A27308DBD1AFDC0BC4E1044BDD45E9A3C1CAECF3BE7A2C63D0B8255BDC9D2FDBACBC8EDBAF996ACBB37A116BDA6E3DF3C00E58DBCEA196C3CDCF01A3C2D403C3D79A6463CD0E172BD2D836CBD769D343D8D570CBD2E8F2FBE600CB3BC5888BE3C169BB03B0FE1BDBDBB0A39BC868ECEBDE318F5BBFDC6713
D264A783C77DED23606CF20BDBAF56F3C8202A0BBF8CE39BC0EEB28BDE7B7E83B6D9F263B59E4FE3C5F0CC63BAA8593BC3202B83C586D633D69BDBEBD47BE1BBEC31D03BDD965B4BE92A13C3C035191BDC55EC13B6EFF923B1394E1BC986F0A3CCBBF423D73C4423DCCFCB9BC3262B5BC04C3673C104023BB9211ABBD55AD20BE4C07AB3C1606F5BA48BE0BBD5DD4DA3C3B2980BE5E8FBE3CF06BCA3B2D1B67BD9F48FEB7417D90BCD718B1BC6B26433CED8A7E3CF5BC823A86FE3E3A2553963CB0A8313DD753103D5FA7BA3ACCEAFD3A8E0026BC8D440E3C08F63DBD5291CDBB86734BBC60E3F1BC952989BCD075DFBD197684BE47CCBD3CA173C43CD96F92BBE54AD43C2AED533C8199F8BCC127D3BAEA40AFBD249D97BE1B679DBCCC2CD83B0010B83DCBF0813C127499BE16897CBB5654A83D4D061CBA49DC93BD3DADAA3C19E0F9BCD4CF3A3DFBCFFC3D87FBACBB9F39EBBD1F4E30BDFEE44CBD1CF0DC3C0ACB3E3C1B5304BD5809AD3D9FCB193C9012053C6AC0173BB1D4B23DCABDAB3C135FA63B3C3FDDBC28B3A6BDFD6DB23D2B0C31BDB659E03C692DCAB9E9365F3C68F7073D761906BED249E43CED50183E5496493D5EC938BCE715A4BC6BE2B63B55BFB23D1010EFBC2D1223BEC536FB3B09E5D8BC00A3DBBC735D4B3DBF307F3D5E93C3BE1191C13C998FFDBBC06141BD7EE00BBDE35F0A3D0827F6B83E8B8F3C285369BDED4179BD585A253C1D13583E34B772BD716263BD070606BEA1F5213D5ED52F3DB563B53D4BF5DB3C77AC27BE349E1DBD6018F83B2A9CECBB05A1C73D9A08DF3C383E55BD05B230BCC69B0B3D8950343D61B82E3CEAB0C03DF05D633C4BC8F53C893A72BBD39A0E3DB07409BD8CD68CBC9DD70F3CF1F942BCAA9A453DB9D403BD0930CC38C328D4BCB953283C1417AF3CF14F5BBC96B8D3BCF358FFBB09D75FBC5A71133CEE6322BDB96831BC9B0B37BD8B6928BC7FD4F43355734A3DC3693A3DD9A910BD5CF2AF3C1A0E523D04F7BE3D3FA3CA3C2EF95CBDC67A243D69482F3C24ABE339E335233D7200343BDDF67C3C9A939D3DCB2F153E47D8593CE156DA3C73860FBE0E7BCF3C426A04BC0BA3D8BC3D19DABC072A503D422D03BE3202CC3A2BD5813DD41221BEB4121F3C6342E33D8842D13C30EE313CF2F87A3BCBE7313D7678113D3A5995BD1D0BBA3C773A81BD47FF3E3DDD4568BDF9D3BEBC07CF7D3C447CE0BDF8C98EBC55C5843D69136DBDFA8DD83B53D4823B86EF833C00AB753B23AE3E3D89CE7FBC4C83D73B3223CCBC4B97533B8DCAD83B2EF7D33D9F2FABBCC677513A3B5371BD27EAA4BDB800213DAFA741BEBE4E803BAA1C9EBD44BB073D4F8B07BEF347833D6F8110BE1018AA3C1048D3BD9F445B3E2E36A3BD9415CE3DBB4F81B
D1A76103CEFEFACBC72D2EFBCEAB38BBC436B91BBF8C2283CA11C72BCE31501370CDE503C845559BADE86563CDA1ED1BD8BDA093D5F2CC63C1BD6823DC79FD63B10FA343DACD7853CF06F133E9DF2CE3DB44BBE3C8639B1BDAD485DBDF6327FBE6A24423DB33B53BB40A7583EB58E17BD83A613BDC0004ABB986D50BC16591A3D28BEABBB73A7433D9226F13C68373EBAD105413C50D7063EE01C7EBB1DF12DBBF3DD363CF19C693C97D304BD3CDB253B433D71BC6B98883C4921E73DDE0E12BD2D43813CD3DD2B3CB230C4BD59F33B3D594B8C3B3365DFBC37C46CBD9DD0B3BBFBF0BF3C3EC04F3CCCA7D63B6BAB06BDCEADD5BD7458F73CAC9207BB40AC443D79EC2C3D6CC6E7BD7D66A7BD57508D3BE36611BE7EBBDE3CB065EC3C4E512BBE8BD1CDBDB03BA13DBF72243DB18AC5BC8567163C0F4689BD0ADE4EBEE30672BD036C5A3E8537253DEAB9113EC56C803B71D5CBBDB43DB2BDEFD733BEB8E7143ECF8AB43D17BEBC3C883D0F3EE185E83CE1E5E33CD4AE8DBEB00BF4BDA308C1BDFCBB2A3E954F44BD71778D3DFB2D2C3C952DCDBDD6CE13BEE68F353DD97FC3BDF51D523E11DF843C82758CBD8665B2BD3F6312BE7BB236BDF295FBBD13ECBDBECB45FFBD105AEB3DD0D6823E60B4E3B91398323DB225083DB5A208BE1A6E0D3D3576F83D50567C3D8F068D3DCB52B5BDC432ACBD34A524BEDBAD68BE866D453E3AA99ABDFB39B8BD28CE513DB1894EBD43C6573B78962A3EFE079F3D3338EDBD96E8673EBEFF673E489BFE3CB3F8893D0E0AF83D23E3693D4F15643E68E9943D9367B2BDE427D1BD8A85E13C41302ABEC22B8F3EDAF68D3B36E3633DDAF4763D81BBC53DAF909D3D5E3D43BE8872BEBDACB7133EB9E6933D4C53F4BD22AE213D6CEFB6BD1D1D853D18CCA43D4C1D643EFFEA29BE003341BE09B1203E7708DF3D287D5D3E371E66BD50185DBE859BA7BC7ABF92BD3B6EE4BD7D8F223EBA4CE63CD2963EBED1FEC3BD836B853C2912A63D82265DB46CC0213E238B8DBDF79FC83D212866BD088E27BEE2CE293EDCB490BE2AB607BE7A8CA9BD912FC4BDC14AC53C4743AABDE08387BC37770EBD68A5C2BC8A182A3ECDBE45BE6D76643EACF6AE3DAB7475BD062D78BC5616AFBD99DB0B3EC0537A3E15EF7E3D26F179BDF76763BE7648E83DCDD24DBCB4AF88BDBA2BCABD2828C63C8BDC113EE19798BD38C03FBE613518BE774B62BD14A067BC172464BEF2E45FBE87A6743D6CCD5F3D8712393ECDC6423E429036BECB84563EB5FC2A3D2ECE8A3E237DC1BD2CF4383E3EF5233DBEFA873EF287A13D236C34BEB518D53CB09EC63BD995B2BC08D2E23D16D6BBBB530BC4BDF9FAC6BDEABD653EB21D14BEF1611C3E42FB81BD7501553D11DD6E3EEB71C8BD5C46733
D445D103EE9F3173CBAD275BEB185E9BD2D708CBD1E720A3C4098E4BD5212C4BDBFAA2D3DC78712BE6829D53B17A341BEB837E63D4B0119384CDE523D648795BD6CFBACBDA08982BDF8CCBCBD0217D1BC1C22803C5CEA8E3CE3DE4F3D12B01CBD431C1FBE0A9320BEDD1E25BEEE03E93D2B59A03DEF93003E1C8689BE2239CC3B31764F3E91D0B6BD0B888E3CA431ACBD6041713D4F51B0BD2131ADBD31AD82BCA1C15D3E0109133CFFECD43DD87910BE0B4926BE2B5A143C5960C13CECFEF03CBE4B263EF1A2953DC9C55B3E3A7EB0BDAF3528BE9952683EC49001BEDA84A53B53E6053E5962DDBCA25DCFBDF645BDBDB9A767BD99577E3DD835263D021A1A3EC2F0313C36704A3DD7966D3DE36E39BC3A6EE93C558B1E3C536AFCBBCFAC9FBDEE90043C17F6ECBBFD4BE43C71AB1E3BB376D23B51A078BDDE6AD5BC80D4AE3CF891EFBCD3EFA5BB010A8D3C116615BD39328A3D6B99643C0FE8FD3D4325423CC89FB5BD2FD7CA36384D173D6786043C150F8A3C533FD93D52CD0B3EDED3D5BD7293063B86ED393C0B1E023DFAD3CD3D1E80573C85BE6CBB4A4E9A3B9A6E343E07FD2C3CEE38063CF277EFBD1058F53B5FC4D53C8041053DF9B5C4BBE8A7CEBD5D2D30BCA3ADC13B517F163CF331013E5D1CB4BD7054DDBDE8B54D3D88E643BE4BC2553BA2E2C7BCFEEF21BD53789ABBED1C9E3DDAC4C3BC6946983D724D333C5BB9483DDFBBFBBC58BA3FBDA6609F3C76D2373DBC41D13CE137A03B275642BD6E534BBD5608223C6AA206BA7DF3A3BC2FEABFBD6F06703D0C3FA0BD319476BC997AEE3B6494EBBB596310BE2CCA393CA7226DBB85F7A9BC79C8103DBE3B183D8F30673B418C893C7265AD3D1F1088BCF64E613C138B3ABD58DD3FBDD12A783D207F38BC2B3A38BD16029EBD219747BDACD089BC3B74FE3C7C6E053D1D5F893DBD247EB9BEF244BCC276AB3DEDFB17BDAD62C2BCE516F83BBFCB99BA50BE103B875F8B3CFE1D1D3920F7313E46A8AFBC901DF0BB049AAD3D928E73BDBF4D14BC0D8D433D19EC433BFA83CF33B86AB3BE44D28F3C3CF555BDDAE58EBA6CC08A3C07AE86BEA720333DB70F043DF9D85C3C927149BDD4AC343B954E60BD1677B3BC8F0BA4B909B36FBC05CC27BCEA6BB43B1C66CF3A7EBED3BDFA860E3DF7FD603B7E7918BEFA4D97BC6CF4C33C2E7DF8BCAA161F3D69E1AFBE715DB4BB74202CBD5906343DABE8503DF9F803BD6DE5503DF579EA3BA8902ABE4C3D033DC5C5203CA36693BD3F05B4BCF859B5BDCCBBD73BC1DC4EBD34690C3DFA5B873DF30AE73D3E29D93D693C09BDAED3643E684F4C3CD6A51E3957E58FBD1727173DD1354F3D896F523DD928A9BD3C46B0BBD556993DC95232BC300C6B3BD134793D1D2E2B3E2EC399BDC02C6DB
E1331753C13F4013D925BBEBB3EB3F7BD630FFB3D7393473D77D3013D5109D23C8B60F3BD3CBE8A3D3D6D89BCB0F3893D53E08BBC068363BD0143903D97E393BC2BB05E3A36B85B3CEAB879BD725D4D3707C2CE3B67AA813DF33B6CBA063633BD406723BDB9D7F33BF424813D8CEC813CDE23053D544222BE4DA7AFBD163884BCA305F93C79B8E0BCCDDA56BCE934463D26730D3D710BA53B963E243E611F02BC03D02BBDA45041BDA8B70B3C6A4F383ED3F8913C609F353DA2EE3CBD292AA53CF89DDCBCF7C81E3C83B8943C684F71BB1EE944BCD2A339BBD17CBABC033EBA3BC16D22BDA07500BD000EF0BC9A0A493D7908C3BD6252F9BB7D474CBCA2CC0C3D9C42843D1CC2C53CD88A9CBC012443BDCAFC093C41A9AD3C7E46CBBD349541BCB720C9BD6D1501BB6D3FEDBBB0F6263CFEDB5A3C68ED153C31D161BC33761FBD958F363DB04510BC732EA5BBB8B3A2BC1F75DBBC1D9CE53BE27D803AF8CE09BDB182943BF004E6BB83D9A6BE7DEED0BBACC830BDF43FA83CD08B483D9C47243A672345BDFEBBB038CB0F943DCB324BBC31D4EF3C67AF613BDF2EC53CD1B558BD3E79333C525768BC075B5EBD28B7AD3C2D5AAEBD33151F3B79EA54BDD1548EB9F461A43D031E51BE8FFE10BB923D833ACEAEA7BD3BA6913A86B1913C3551BD3DB4B28E3B91D2093B778B70BDF533B6B7E1CF51BDEA9BF3BB8D4621BC1D86D23A6320E23A24CF6F3CB911D13B4228D83BD39F16BF2FE5ADBC193B8BBD3B41D7BD08F3C03CA76759BE10CE36BDF79012BDB83685BC7782A3BC3BFACEBC6C92E23C753BCD3AC4EC56BC1C6E353D33E1FABCBAC85FBC2435F8BCE6B88ABCFAFDDE3CA9AE70BD5C06433CC21124BF08422B3D4560023CA99514BE9BD180BBE939E3BB4315313D334492BC81D3233D0E19193D6C5AB83B780227BB475AC8BCD1B390BCB669293D727E27BCE69AB5BCCBCC74BCCDC141BBFB4682BABBDC473D4EBCDE3D51C5343CD1426FBD38431E3D12C587BE3C877ABD65811EBDEDE34C3DDF080CBC1610F0BD83DDD43B45D39FBDAA699FBCC4DAF83D62BBDA3CE36803BB7AB423BC1744BB32BF4D77BCEAA81BBC5F3C913D4134853C36FF85BC0BA1AD3C082D27BC0616A63C3A571C3D9AC87C3D28CC303C1544733C7F920FBB2AB1843C093DCD3DF28D3EBCC03B5F3EBF79F3BE5B397F3DF662653C232C783C4B83DEBB59BB1F3DD5718FBC9753B9BD0451AABDAEC5FC3B5D19813AAEF2383C14D5183CE91547BE1C6E9BBD50CE593D7AE5BA3C4ABEF43D688391BD74C4063CA820F8BC0C5D01BDC4E4C0BDE6CF403D98C7333D12DAA13CEBB98A3CED39E93DE583E83CA3F9B6BCF1ED763B13449D3CF32A663CF6EC67BCEB577C3D1908593DCF38C0BA2C9EEEBCD49E21BB97E9B23
D0A3F57BD8E39CEB9BF02633C77C1423C7ECAF1BC762B063C946FB43BC2C1CCBC6F91AF3AD08374BC2E3E4FBC480173BB7BD3D93D0862973C4275343D25FAD03DBDA0D4BC9554AF3BFD1E21BDCF670D3EFA61F7BC1C018ABD04AFEC3A1E608F3DC52F9ABCBFEA15373A0334BDCCFF5BBB45BEE13C6BF961BC67438FBC6CA0E93C131D2EBBDEB7EEBB9BEEC3BCE2E841BC273D86BCB7F65D3D3F688E3CF69FB53D0AD6E33C3395813E48B5A9BC7C5AA33D709D283DE165BD3C6AD146BBDBA82738ED336D3DB0C6143CCFEA18BDE637EC3B0B58BC3C2F93D13CE8F616BD5E055E3E9DCE90BD3ECDD5BAF102E73C00D5CC3CA60A583D926F6E3DE4A6343C021EF23CB65655BC97DD65BC214309BC3F0E51BCB4BF373CECD3A2BC872BD9BC7B46DA3CBE55B23C655A6B3D66DEF43C1C7B7EBB0186B8BCC05BA4BC3CBFA0BCC85E60BB4E34CCBDC2D1393C3F50DB3CE713F43B7678793C2B4BDFBB2461A9BBBE6BE33B120293BC18B75ABD2A1B14BCBD4FCC3D81CC58BC5A1C2FBD5F5D49BB723E3D3DA7242BBC897094BBA70365BD01F1D9BCCA5B77BDAA766EBA3A8919BC0520A13B197A1EBC3374C7BE341923BD565B0E3E28F396BBB9722B3DCDD03A3C7E3079BDA9BA05B93B06903C99CEC9BD69491ABC7AB20EBCDDB252BC720CB13DB1033BBBD27DE6BBFFE4DC3D3E76063EDB41B9BC9617C8BC525C78BDF1F590BC0BDD0EBE158794BCEAE6173E05225C3C45E7D13C2539753D90D349BD32813FBB081FA0BC62CDB6BC5FB8013D756D5F3CDBAB9C3DD30C80BD785B263DCD5B3E3D3B3704BC33746C3C611C9BBB0160E83CFCA4CDBCF79B1BBCF65DABBC831AD4B9A48328BCD200153D522F1EBB074280BEE433D8BB7A860C3DD1B8AABC7B052F3BD3144F3C3F0EE1BB43F4BCBCFA377D3B4152583CCEC458BDBC172C3DC5A7AE3DA09B58BD0D9CAB3C44DE49BCA9901CBD731701BE5CA7A3B913BE343CB3B3DDBDACD753BD66F6E0BD88D93F3BDBB71F3D26D301BC0D48D63C53AE95BCA7F940BE2C1F8ABDFE4FD83C04CE1B3DC227A03D616A42BD2614A23D2EAF9A3C8C36E83D3219EFBDB4163B3C4C34C4BDB1D54EBD9DF98BBC785C3BBD5AA74E3D8DB10DB4B55C84BDB6951DBDD6DE293D7206313C1B0F8BBD28F361BE2CD6EA3C0327993D6500EA3B2A1D09BDA5917E3A51DA6ABC09A65BBD580A373CF89A0DBEC705B93DEBCFADBC079E1CBCDA3445BCF44582BD0BB58FBBE7E733BB56A1B83C4B54B4BDEA2C2A3DBC67CDBB40AEA63D839C12BD5EE8A83CAAB359BD2836B23C0C62B3BC452BDFBB83BB0D3EB6B9153D6391B2BDBFD993BD18B6DA37B755DBBC046671BB0E3DB8398AF1D73C8641CB3CB371B23D8FC639B973398EBD1CCCDFBDA80414BCD1B00FBDC3D6A23CED72DC3
CAEF33A3E0902BEBC3E24A93DA7ADB6B90F3F45398271C03DA1D38F3D1AC5FA3AEB1692BBFEAA453C2CB6B8BD97D4DC3C0DF0513CE4B2D2BC2C17193DCEC7FFBB9E0535BC5FF82D3C0D7B09BDF448BD3B0A292E3D8E49873BF8B4693C66DFE7BDC73F27BE42EECDBB78500B3D5C9F4EBCF716723BAFCF2BBE0A3088BC6602A23626A2AB391762FCBB8D901ABC2EABB03C7E8D07BDDC65B2B9741C17BEA276213B89ACE83B7C8208BEF002B9BD54C7823D54CF44BBC62FD73CA212E6BBCAD93B3D3D739B3C8461843C2CC783BD907C313D0896BF3CD0468DBDC46093BDF361C43C81621CBCB9DBBD3C7418D43C4C68B83D689A1BBC216146BCD5BC84BD265CC4BA26C0B33CE78D233D397F163B6EF38D3CDCE7AB3BBF33BA3DDF26413C266810BE82FC3EBDC8CEC83BBA511B3C6C96C13D0487DFBCCE8AAA3BBA8E92BC82D11F3C69717CBD8F88EFBD309E123EB0C87ABCB7A30FBC0F66AABBF1FF0DBB25A329BCEB1A403D302073BC15E961BCA9878B3BB094953CF94B26BC8B4454BBA92F72BDE093103C4A315D3B8966E53BD84DD63C2E6A023C9C89C73BD830943C8DF9FCBC700180BCCA6E113CDFC194BD84DD21BAAB9494BAFA66CDBC7B2AC0BDF7A23CBD2EC0833C3A7A51BDE61FBB3CFD236E3DE556A1BC66DC653C46F7D8BB692CAF3AAC13903DF6E0923CDADA043D6D1D7C3C49826F3C3E0ABD3C38696C3CBF20213DA145043E2C471A3CDFC53DBDD2DCB0BD9FFDC43B1BB772BDC90BB63C0267A0BCD1FB00BD177629BD2248C2BB1E9BDA3C24C8F93BC831C3BCF55A613D0E30923B0E3C263D49A23CBDB576CBBCEFD3023DC54B1D3D9C2895BCD979123D24158E3CCF7EA03CA04478BBDD9622BCA1AE05BC7E498BB9795DD33CE412EF3C99E9033D8F8EC3BD06C9163D80A3BA3C8D0B20BD48B14D3D7D59B33D2780623D454539BD66D5913B33A7E53BF8B90DBCFE9C3DBC3F92F4BD87E6B83946DCF9BBD6A56E3D5853A7BDF9AE8ABC7E8848BC848FD1BC9130853ECD439FBCEA098F3C1A8EF4BB2F74593CCE6D783CEBC5CC3C21D7CC3B382B6B3BEAA054BD71B3FABBA61B2DBDC438153E823E903CC260173DDD616A3C6266EABC3D84873B8CBB3C3CB4F5493D5B761FBD0EB7003D70BDF2BCC76BD7BC865B59B3E9E219BD7C371C3B6DDD91BCD639203D47B4C63C04A0093DDD261CBECF58873D077A89BB69BB5FBD58E3B1BB9D124BBC53AD1EBD9E69D7BBC6C3173E63C25D3CBE33DC3C4E0A823C6A8FA13D62D41ABD5EA7DFBCBC79CC3D66EC93BC0E82D5BC52B1D7BCE2EB3E3C2EBE333CE6C1093D486EBAB8B8FBF43C1CE580BCF1B2F43C2485243D59B3A5BC8F7DF63C22B4F23CABBCE5BB2F59B43AEECC08BC2704063ECED45DBCA26B803C7BBEA5BCCEBC1BBE518E0A3
CED10D5BE01B30D3D1DF311BD2363DF3CC7D6AEBB787933BDDB5A8DBD2D13A63C5BA8D9BD59549C3B3A91A0BB7AFED03D8462AA3D26B5A539D83C843CB29A6ABD616EA1BD343456BD5E6BCB3CEF5171BCB12733BC2DECBABA473C19BC7553A03C8A75033DADF7A23B6DDCF83A82190EBB4237A8BA3982703C1E972E3D648FBEBC652E893D825E9EBC3555B33A1BDF5FBEB704DA3C8A481937F8D0F83CD24AE7BC97CE95BCB9E4CA3CAD4E063C57E446BC412C0DBD649A003C40D2C63BD6F6E13D140BAB3DD0AA773D3D07F1BBB9413F3C95A8DD3A53B74EBB2978BE3C3CDD3B3CA93D88BCFEE81BBB7FA027BCCD42B2BD45F07DBCA6E897BD02302F3DC63C7ABCA5F335BC3E05F63C9CEBD13A4D9A96BC98E2293DBE6F37BA0F6C29BD7FD1413D3D327C3C19C4E03B9F454F3DB9A437BC459AEE3C5F3CD13CAA771ABE9EE0003CF9A8953C4EEF67BC9002233CD2086BBCF2DC5DBD5904B83C09B802BDDC7B14BD4E63D53CC592333D357B59BDAC669ABC5382AABD49C00C3C4A7C0BBB6B3B3E3C597250BDD048513B355E503D335D6A3B2347DABAB1A92CBD971CA3BB116C34BDA60C3CBCED68E53BDA37143BCA3705BCDAC150BD2EBBC6BC591E413BCAA82EBD6FCB5F3D81C61DBB930C243D9246CF3BA74BA8BC029E723CFF8A55BD9137BFBBCAB4F43AD478F5B96DB5F4BC2CB9DDBCBF9C84BAEEB152BDAA28433CE93C303DD5C9B63C42D3C0BCD26A00BD97405FBDA34850BCB2622ABDD9948A3DB9DF6D3B318606BC7C26AABD1C22363DE07D53BD64D0DEBC6717E03DE61424B932A7023C42D49C3CD84775BD0B0CD4BC98F4403BA488153C920161BD92A1693B0125B53BB3E15FBB96C8F93C5877ACBCD2FDE5BBC2AE6B3B3DDF463D87BF473E76D08EBC4EFF8E3C94F1963C587D8FBA7419FA3CCFCB453BEB2BAB3CA8D0383D5EB1C13BBA9DE1BABFDD2B3CB5F4973CF722A73B200492BB0E9E7339938BCABC45366BBD44AC35BB87D253BD392B70BAA63CA83CBB76A7BBBC018B3C5B591C3C073C7A3BF0FEC6BD74F810BD323E0E3D720AA8BA0BB666BCB1F6FBBC2957D1BB49ED073CA7D31E3E57F6C03C7460AB3CF7F938BDFD7D2CBC1A54B73C7D908CBBF6DEA8BBCFA03B3D1DA118BDE9FDDABCF8388ABDAADD1B3DA343343CF61496BD757A3EBDB36DB7BC2EB98D3CDB6F1A339C7932BC8D56193DF39FDE3C1A7DDFBC55A1C9BCB277DE3C536027BDF65F54BD79F564BC1EC0C7BCA504DCBB477D01BDCD71AFBB482EA2BBC6A340BB2990083D2E828BBDE87E073D28383EBCB67C63BCFF890F3FF1AE5FBC94C7E73CEC38693DF8E5F0BC236B9E3BD30616BC7C1910BD6AD719BD8398D4BB953E293D4BA40FBD99E68C3C26C148BBD97F30BD347F38BD7B60A0BD2DD2643B2F25293
CBB10E03A89860FBBF75D4CBD605516BB9C25723D9AB251BC3DEA383D6240E23C0D40D03C8D0F913DE77F133D547BFC3CBC5BC13C0E12D4BC5A0E09BD7FBBCE3C75704E3B75CDCD3D38411E3C88538939AB513A3C5DA12BBCF0622E3D0D9799BC17E91A3D38ACAFBCA4AA33BD59BFD13B86FF88BC80784FBC55D7A839F8FFB1BCDF7FB9BC215024BD1C11103DCE97C93CBA3832BD41607DBD1A828CBCF6B4B5BB7E45B9BAD41BF6BBAA4D4F3C3B08DFB6AE65B43C7C8AF2BCC7FC4CBC5545863C1528023A0F136BBB74F302BDD46E8BBC5319A03DB01BC1BB52388CBB6556F6BB735D893D9F603E3AD2D0B5BCC432513C5AFD7FBD003C343D3B262BBB138F273C8388363B570CACBC2EF9A33C0F0C33BCDE52953C290BAEBCD20701BC5E6C6D3DC885013D8E5E2F3AE1D3BCBCA6A1CABA2090D6BCE493E93CDFE28E3CDAE79F3C7B7D6ABCBBD7E6BBD16EDBBC804F2B3D1E4940BCE64C1F3D0ED48EBB64710ABC476DAC3D120B53BD281A5A3CB0EC0C3C47B29DBC585AF73CE010C4BB6A969FBB3FB59FBE36B4E3BB3FECE4BC95E408BDB6C5753CA2AFBB3CE0A88EBC3D57793DEDACAE3D04B032BBB1569BBAE75F3CBC55D44DBC030E073CBE54EE3A2FE6CCBB9DCC43B8010F043C14076B3D814209BD89A10D3D9EB5E9BBCB6FA1BDFB772BBA9BA9DC3BEB6BA8BC04A2813CD6844D3C316496BC34CF4CBBEA26103C3674CE3C9F98CEBC130EA9BAE08707BE9A91B1BC4CC1A63C8E35CB3CF666B23B4D98B1BC00BCE2BD116A533D1176233C20C9DD3B0277583C3A6F9DBA4CAB89BCAA3FA3BB5B64F7BC11E213BD3CDDDABD339AA5BBBDF6A53D2F569D3C663462BC4D5B103D1A54EF3C2F08F83C3C4A1A3CDE7CD1BC1AFA123DAC61F93BA13E563DE6B7E8BC1196E1BC388217BDE038263CF241BA3C9480823CE7EEE3BB5C3936BC56596A3C9A19CB39E3ADD1BCB8B185BA5A48E6BB39AE893C83FF1FBC0DC6B4BBE6C7373BB766083C46B3D4BC650E32BC75B5833C4F1D34BC4429613D4A4F1DBD56CBA63B7535683C5103103C5044003A8617A53C5AAD60BC5D8A0F3A2DBD523D87CF12BD4B248D3DC65C07BDFE20423C19CB19BCC1E60CBB1BEA663DF5627ABD27782C3EBF8DA7BB3B93A43C977B2F3DB65027BE6FE48B3C306883BCB547493D9E8DD53933508F3D749F90BB3CB2F6BC154226BE57D7C53D8AD9D93DE12FC0BC577110BDD9EF4DB3FDC9CBBD190CBE3CC6E1333D3761793C393C1E3CEA98D7BD33F53C3B2044883B5DEB553CA7FAF4BC88571B3A710994BC17459A3C364DCF3B5536F3BCD5BF8DBBFCD3073D60E862BE5B75153C93D7AFBB665F813CD891113D35C40EBB4A18213C1201F1BC2E67733C817F21BD3D9FEE3B8423E3BCA27CF2BBE54E58BE7C6217BEB55C8B3
B227C913C625A19BD5E0C5D3DF79EFD380017D0BC3DCD8C3B7AAE17BBF9F0E13C260D923EC7FDCE3B942866BB8351C03D4E1B543D419CACBBE44238BD4E0EFE3BE6ED68BD1ED19C39AA5727BD724C84BA409D06BD2DFC4FBC05D196BA7E884C3D290BF53C52EA6DBB781D67BD1CCE913B5BA647BC80277BBCFE867F3DEB7784BC8FD469BDFB9B29BCD52E5CBDEB1C9F3C12C1493D3AE574BCAC5817BC51DDA8BD9A0689BC5718273AAF78B8BCEBFF5F3E1AA3863C192FDF3DECBAD9BB9B0B39BDCD870ABD85468236774C4CBCED94F1BC3B113EBCEA36673953533CBC3DD8EDBAD402773C72232B3CC70E023D5E2378BB1F370BBD067BCE3C1B5D05BE8074A6BCBA164DBB9A2F2C3CEE793BBF80FFE3BABBAB553D7C553BBC602610BDF21187BC924C9EBA3DCCF23CBD5FF33C8E7460BBD10ED3BB68817CBB654EC6BE79BF28BD0F92E93C2DCF27BA7E0246BC83DD323CBFE385BE86F0403E17A72ABC1ADD82BCCC86AE3C628CA8BC5339A23C9F548BBC244B143D6C941F3C879F983D977A253C3065833BFA3A49BB56551ABD8C7D3C3C9A8755BCA77B60BC3674CDBB70882ABC4F68993C37B099BCC3E191BADD8AB6BC882A1CBD61CE06BCD66C10BC5AB5993C72BB3FBCA65298BC0ACCA6BA84BED43BBD1AAA3B6D82B53C305316BC35471DBDFAF13B3AA630D4BB2DE07BBB230E1B3D21AB21BD9BA423B9D7489BBC82625ABCF060ACBDABD63DBDFE86B63BD6D21BBCFABFEB3C9BB513BC2C6603BDCF21A0BA8BAB193C0CE01BBC6D1B223DF79C8B3CD388B0BBDF92B83B4D6405BDE10B233D7A8112BC61FCE2BA5EA286BDE61BE23BFDDD113AD79F16BBE4B91EBC1F82723C63CF373C505DCCBC1E1B923CF6AFB3BC10A97F3C2ED51C3A0403B33C830C64BC49FD093E49D4403C2ACBA63CA33620BD2875EB3CDBB9473CDCC21BBD1B3F463B4687333D6FF956BC9C9C44BCFD148EBC5E84F8BA9E9BB93B869B8F371993D4BBD4168FBC48A2673DDD27E4BC075004BA6CA763BC8C7482BC3091163DF8337ABDD3045CBCE13640BD46521CBDCFE15A3D4FEAAA3CAFA2013BDDA25FBE397393BC1F2186BC096C14BD1C20113DF1CF693CFF22B5BC1B28F6BAC34C9BBEAF51093DA32F853DE8B4813C025F84BCA1AD0FBB371C713B3FEC9C3B1F4F9B3B7276823DFAA9AF3B493E38BC48A0923DA630C6BCEBC2893DA126F23A5F6FD03D01F585BC1AF2C13CDCCE2E3C656C163CDEB7C43B3FFAB8BB7C5A48BC07C7532EB96A59BB2D77FB3B6239EE3BA30F4FBBB02574BC2D48603CAF0F2EBE8A269D3B81A7AC3C9EC11B3D6FF54DBB588FD4B9E9E4A9BB39FAC93C60C1B13D922DB43C5616833C1A73463C48F7953D41FBAF3C5BE8973C81940FBE888ADD3BB83050BDEB6A0EBC6A03903B66C084B
C30290CBD9096673AF05C663BABD319BDA0678DBCB53E043D8B93DC3D7A93D3BC81BA7EBCBE96D7BBB3A51A3C91C919BD7D98B23DDB1F0FBCE3AC8D3CD25569BC8F7A2E3EDBC1A23BC4651A3E952B88BBE1E293BC8321C23C8C16993C88D0C53C9D1D06BD79D3633A0C13903CF8BF59BC4E72A73815AD743D6314D4BDA50744BBEA8957BC6C46CD3CA684A53DA7568D3C22F942BC006805BC44E087BC7E937F3CEF478ABCA4580CBDA3241EBB2B16773B445DB53C72B129BCABD2A93C82760B3E83B5B8BA7EEF5239808B903B0CBC813CBE58C03A7F2ED83D9C6628BBCB9D4B366DF6F33C75A34F3CF51F1D3D3FA542BD30598ABC6F3843BB6C17C1BCCA14313CFAF992BC8B8CA13C0B6A81BE2D4DF2BDBC62113CFFB05DBC50C2063C8998393C0180B53B9BE433BC45D412BC475AC6BCE56A573CAE21EA3BB65FDA3DCCD10BBCB9A5873C64CB59BCD7598ABC03168ABC934E94BBFF67F53CBF90833C783AF4B830CB98BC8F754D3DF690DB3CD0CE55BC8C38343CA555BA3CF3D7CF3C6A4D94BDF2D6A9BCF4F4EB3A9F6A063DB9A773BCDEF174BCBC6BD83C4BE0C1BC1A5C10BBB31B003AC821843CFA5B213DF0B782BB7F689BBB6A4AFDBBC86B483E5E1139BC7AA76A3AB7A1BEBD590694BCACF3FE3C0F1B58BBD3C514BC4499B3BC59DE54BD580CC73CC7F622BC345719BDCA559DBD13AEB13CFA002B3BAA8311BDDF13CD3B15A61BBEA9D83E3D330ED93C0277E6BAD307D03C42A2E5BC805C90BCF2EB37BE9B9576BCDB3BC2BC21D3A6BE260EB1BDA3E570BDD1078FBDA12714BC13D953BCD86405BEE88680BD3B98B13A419F18BCD4AC38BDACAB823C254139BCB63BC0BC185DA43C26169D3D3C6F083D86BC35BDB5EC0EBD5A51E1BD869EA23CBC516E3D7E0E073B575FEB3C827E05BDF877DE3B54EABB3B29E49BBBE80737BCDA57113CA31C103D629680BDDC85163D6E3E1BBA49FC303DEC77F43A3A7BB3BD705BDE3C8FF51A3CCA1E32BDAF741FBCCCB18B3AC04CB2B75DC29E3C1DF6A0BC05379ABCE16503BEF2BF1E3D00FA3FBCF73C383EA9F6B6BAE95434BD874C7B3D4C27063DA17E28BACE3F73BD32E5C23B62E43E3C0474AB3D0AD6D8BCAF330F3CA3F65FBDDB1C55BDF207773C635B0BBC93B5A93C76E54CBCDE9022BDAFC694BCF6F9AD3B50ED64BDC3797BBC06DB813C5AD6A53C56E832BE870AD03DAF5DC03B6BB8F83C69FA383C125FAE3B7F23623DEC96BB3CAF36B4BD9B6F283C99D6163DBD8136BD917282BDBB1D0C3BBA4EACBDAC1343BDA4F8ED334BF754BD4A4EB83C5D7D943C337DC43D0906CDBB2ACF0ABE9978AF3D66192FBDB5F759BC9717E3BCC76450BA86E08F3C3C1C9ABA60D4BB3C30F29BBE41780BBD6F960CBDFF477BBAFEE925BE14AA98BDE7F3763
C96BD93BD2D32583CDDB1513D2FAC8FBD7C727FBC09732D3CFAC8463D3B492A3D442A50BC147AA53C130F50BBA19E4ABC82144FBE8359D7BC27DF963D7F37F03C9D2B9D3BEE50D1BB3F6B88BDE4CD82BC492FB33C3403EBBCD01226BD380AB03A9413603D4B8CA43D10E860BC32090BBDD6D78D3CEA5FFB3D9031B1BDD3B1FE3B95FE89BEE5FAC03CB642F53B0948033EB072193C5B79613AF82F7E3CC82CE9BD0AA2BF3D2F3B02BC5A2E893C84A8863B13D94E3D719DE8BB685DC03DA3E7B2BD9593C33D9F57A3BBB16A643C6E5C0D3C9FD33F3C0F80E03DE6979E3D718C0C3CC14E28BDC6A48C3CFB51BE3AEF02043D18B829BDB79C2E37E722B43CEFD069BC7E2E6DBC3C31F73DB329013D464030BB0E0397BD63319BBAB6FD2F3CD4FC33BD9F8063BC9B0E913CC69726BC35A611BC0EBF08BDE3B4863BEF5669BC19F0F73C552F12BD720D04BC01C61DBDF9F43C3DEE6A0ABBEAE8CC3CE77293BC6BEEBA3B14CFBBBAA596253D7E7013BCE0FBA23CA9419F3DC3C69CB82C5A73BD22BA053D00F540BB36EF013DD9B73CBC40B80DBDAEF0B43D56350E3E4D7E3BBD1F9F3D3B14295EBCC80A093C1B5500BC08C73DBB96EE133CCA9B32BBDB7CC3BBF63E563DFB404D3B517998B83C0D133ED0CFD4BCF28DD63DCA09D6BB243E95BCD6C7B63CEF498A3D7A8D07BE576AAC3DEF888BBC742C4CBC14DFE03D9C7FCC3CCA7227BDC57518BD7AC9023DAFD34EBCC41C373D2FF516BC7667263D8D7236BEB11289BD551BA9BDABD315BB280335BD6E40B93C6295CC3C2F6394BC84B5053BE649753C5A34CABD7146E83A9A6204BD8026BF3DAA1B173E5153353D93A800BC1D411A3CF05728BD67D4B63BC33D883D9AC1EABC29108EBD40971A3B7CB404BE723D183CEEC4AB3C55B6903D51A1B43D351556BD12B9823DEAED503C886991BCEC502E3D5E776CBDBFAAC83C67CDBF3CAD0A9ABCFBC6C13DC1C8AE3C82A719BECAA0D33A75ED383DA587A03D2641C0BBEE5D7EBD06EEA43DCE1DCCBD695DAC3DE7BB803D25E903BD1F240DBD3AF23FBB96120E3D699042BD68F72A3D8916A0BD76D4C1BD5D84F0BC08CE1EBDC9CE083D25726DBCFC3275BEA5F798BD1C1983BAD5447D3DB19D08BDD263863D1B0C94BD1F9BED3C073CB9BD53578E3D919C8FBD62A53ABBF9B8603D702480BB8D4514BE4E5C2BBD49A8F03CB9EC373CCF98E6BB04800D3D718E9BBD6BB930BEE5ACE53CEFB9263EDAACA33C3E3C4BBD396410BCF5A2023DA2A3883D815FB93C91E68D3D784B1D3D9A975D3BF287D3BDE56DEE3DCB1891BDB41933BDAA39DCBC5FECD833E7D6D9BC0F1C0B3B67A3D73A9250083DC326F23C5AF077BDFA57C03DFB4FB33DF106803C4E0DF83C9AB354BC547CB93DBAA1E3BB75B2FC3C1881C9B
D299055BD57F20F3E6CC7DEBD8DF8E03C47FEE238AF960EBD1E08EB3D931AD9BC288491BC9CD850BD1582CABDBA7B0C3E5C5F0EBD530636BD8E53033D863FD4BB404FD73C8E1ECBBDF23B30BD9683B6BC972D4E3D6742EB3B6229CB3CC9DD113E68A4393DC2AC1CBD7696E1BDF71C93BD2C4C27BD779C5BBDF917F7BD58D0D8BC962BFBBD47DC733DF38707BDCD473ABDA56E5CBE490A39BDC964253D8F6AB0BD46B377BC8745A23D1FE0A2BC0166843B2040613B5B453C3CC84A56BDB818AE3D1214F6BDFEE74EBC5F471F3CDC8B10BE72FAC53DEEDA08BDFD7F1EBE73B5483AACAFD13DB5C74E3E8F4353BCC0758E3D22EE143E0A03B8BC1A7695BDDF371EBDE048A0BB0825A43DCCA049BD2DCEA8B74B0F3DBD6D1928BD08CDA93C6CD80D3C2CE68C3CA85909BC0B0503BDD2B20C3C50F03F3C97A2783C935BC03D6558853DC049563D35D81B3DC3846BBD4146A83D83245E3E54D9E2BD905A9EBD87D1E73B92B270BC9FDC853DDF6B40BD04EB103D4A6AEF3D3F652CBC14E675BD853211BDAB8D3BBBE14D063E6222213E86C88EBBC5F81FBD44F76E3C928B623D9E324ABD65E36CBDA0B40D3A27B9573DC40F30BDF2BA0D3D53A245BB9818FABD35026BBC067A1C3E54F6443DD0C6243CDBD89BBCF0C8B2BCDC383EBD070E993B144376BD01F298BCF951A7BB0FCE993DFD326DBB4F85133DDFAEC5BC7F41BD3C457918BC49E9013C698A1BBD6396153B1432B7BDAE94853C41589FBAFB5219BCE40D72390E5B9A3A6C546CBD03F8423D06B001BD3B877FBC269BD73B3F3FE1BD9B6000BA1291D03C4C3100BC9795BBBD45BD14BD0534B4BC1371EFBD031FADBDA5A9283DAED491BC5A1E983ACCD304BD9EAC593C5F0033BEF8DD7D3BFFAC68BCF795783B806D813DD4A23CBC4CAF603B7ED0D23CCC98223E64D976BD059EAEBCF25735BE0C4C2ABB6C9703BD984CB9BD50B1373C3DD0FFBB438F5BBD2DB356BCAAD28C3D665A2D3A7EC7A7BCB85EB93BB48582BCBBDC183CBDA1053DAE18D2BCB885DCBC46013E3D727A1CBB6A40923C3B6F19BBD082933C5E84523C1FF5CB3BAB024A3C3AB2F6395E67C63CFACC9F3CCF9AD43C44F431BEB432B23C16F72D3D1F54B4BC2937A23D8C19093D853E193D5518BA3B97DEF73BF0BDD7BAF73419BBE3A4893A12EDFFBC59708D3B28E9BD3BCA32BD3C5FF271BD318BD6BC3EA5663BD5D021BDFCBC273E447B1F3D48D706BE7B8D6A3A099F29BDFB75603B50CD23BC7DA4A03CDAC9B53DD0BCCCBDA16939BC1FF6553CA213073E6A89FC3C5B76A53D519E663C3F23AC3C98A5C0BDFC99A0BA9E6E3C3DF8CA21BD8DFA803C8A39223C244106BD276D05B33078A63D3FDEE73C784F9BBC4275553D434B043E55B7B8BC439E49BD88E19E3DA757FAB
C20975CBCEFCBCBBB4BF181BCFE58F0BD5E192FBCC188A43DE6FBDABA671235BC7458123D5D54173C1EACCD3C5D22F4B96AA3AF3CE43B93BC47AC06BE0A35263CF35110BCFFE3A13CE00F1F3D335F0EBDC9CC433D1D13873C3149E93B6D0F5C3DEDE851BD689D953D979D043C26B9573DD65056BDA2A32DBD125F4E3D7906B6BB99A28DBC5023B93C9A1B15BD4F8F133C4093AEBE84E6A13D57303E3D274709BCF1196C3B5896143CDCEB5CBC35A3B5BB5FD579BDC12C4EBC5715E6BCE8D7B23DDF52833D9B82783B24D2323CBDE0D6BCB1D602BED682CEBC12D37ABC389B44BBE619C3BD51E0A83CCF0E0A3D4C10853D50ABAA3C46B3113BFD5268BD0447A7BBE3ACDFBCA48FC53DDFA7F73B5B75DEBC0538413B89234FBC1B26243B59C562BE5E62D33C3AC423371F1954BC99CF5FBD3E2E1FBD93C4A7BDEAB4D8BC8BC09E3C936C8CBB2595AF3BB1B62E3AD4E481BE4C043BBD328B4F3D7907FF3C0B898E3C9CBFCC3C3149B0BCDD6808BB6DA4543DC0F0B23B9BB972BA27F98DBDDD4F3ABD0B96813B77065F3D999A6C3C8D0118BC222522BDB48D953D59EEE13C48DCABBCF8892E3D2A636E3AAC6898BDFEF2863CD8082B3CDCEF193CEF7FD63CD33504BEEE3432BD6FB85B3DDE288ABC0D532B3B2793833CEE46BDBCB552E53B7A4F633C086D51BDF95C1B3D9834F1BC440BC2BA707A223E6EBA92BC44CC523DB26506BDA0DDACBD284408BD870C75BC400C0E3E54988C3D917E0DBD3FC5DD3DB5C34ABDF21035BDAFC2DE3DCED505BDBB11DC3D4A0F84BC87BB3A3D2E5BEEBBED9055BD7E80433DD5966A3D81BD31BEEE9B06BDBE4C2CBD80B027BBDFCD2DBD37FDB03D18448A3D084011BEE07901BEA020E63C2C39BDBDE591BCBD2AED66BD8080653DF8381C3CFDABE83C6EECEBBC39C1503E485DBABC106306BC47397EBD570A6B3D7A3D8ABDD65E113EAB5653BDEDB5B8BD0CA7263CD07C913D9566A23D561AE33DA79A03BCDE1945BDA0E8473CF95181BD4A8459BD128DF23CD26201BDDFA40ABDC4400C3EA81A80BC70600ABE9C89C7BD6AACC3BD78CD833D0A7D8F3C8253953C4321053E7C63D5BC9EA3363D01D767BD9CBD133C93E83B3C52D91FBB41F796BD072598BDEC24F63D62D5D1BD23A903BE780503BD31E412BE35099FBCB08252BD8FF5B5BD4F85953CB491B13C0109173CEEBF8ABD1052713B84C283BD1B4C7ABD25CA01BDCBA3B0BD40909FBD1BB742BDE40EB13DBA9A5F3D1EBE18BE9C14AEBC0792E53D0D2C243D3A204B3D634B73BDD4E79ABDFEE074BDDC2B50BD491AB1BD0E5B3FBBF0204CBD62FDFDBD222D563DE39C9E3D8A5851BCF966C33D65E0CE3CFED41FBDC82182BDC259983BE1A52A3C77CE97BD4145FDBC73F4E33340CF1CBE8A21243D9E2725B
D991A923C3179043D662D2FBEADF6123EC621CC3DAEA25B3DB600963D027C85BC7F12373D981CC2BC69E5B93C0661A13E808EC2BDB00B5E3D3E1C63BD592C33BEB8540ABCB553823D5032E33CE6C095BDD51B0ABE16E0873D8115FA3D16A82B3E087E9BBC90BA93BC24CB543D68B0F73D6BE832BD06E0A7BDA573453D4697F63D470CD33D526FC2BC8BD7643D2D09103E96BA1B3DC91A8ABDF7116DBD5C5E69BDE02F193D2339B43D9E0513BD42BC56BD26B23CBE4C4CED3CA6DC7ABD611D1CBE5960CABD2F6D2EBDCF6B433E9F26F3BCCEBA00BD2308C03DD8CE4CBD672EDF3B4B23BE3D86E0F53C9F10FFBD3663373E15AD68BD5FF83D3D76C881BDA999BBBD5E853CBC81D0A139C672DFBD504CB63B555FA23DA1E16C3D1384D53D25D79BBD5F80E3BD7A35583D9487F43B9FD2803DFD5EA4BBCC82203E8AD254BD84290EB8579A7D3C4FA7683DA50C753D5ADC3D3EDE45BE3CE219033C9C26953DC9648D3C6F5FE3BC10A8183D5B8B053E12BCED3D09F9BD3DDA4BCEBCD401783CEDCC37BD89AFD63DFFB0153B0A9397BE5117A33D8B03FD3D510F393D5DEF93BDAA91533C5C4BC83DDE24A43C1D9D38BD36CE14BD014CBBBD168D2B3CB3BF3439BA171DBC6EAE66BD8537463D31F2E8BCD65C82BD8073BABDD45B9A3DCC24743C395146BE3EB6A43DC12DEA3B660F8FBD3618953C5FC2FA3D86EE7F3D66B097BDBA46D6BC93A972BD888384BD762465BD761A6CBD76BCF53C90703ABB790E15BE74E99E398CE8E33CB0CA4BBCC973B4BDDEF3E03B3877DB3B2532FF3C29147EBC3FF08E3C7B05493DB1BE9D3CE13C273ACF25EC3C59281F3C7467453CA3C370BCD829BB3CA75221BD14FB603DC22296BBE46F78B9E90953BD76DD58BBC0E50BBE242E073D2F0B63BD30479D3D5B04A13C270522BC7E6023BC6DA47F3D8F67C03C5494FFBB1DE5CCBD2422063DF943F4BC19E781BA1EBDD33D23331B3C50CEB83BD055DBBC0ED20BBEC29B43BC6DD8D53C68E918BD5E71FC3C76541E3DEEF33FBB5D9AC43DB4FCE2BCAB7A5D3C4034163D6E98253C5AD9983BBA5023BC6256F0BCB88D403CE1075F3D25FA06BDB5A385BC9CF25B39C7D096BC024F1ABC72170CBDDDE3B8BCFF7BA33CC1C21E3D84CCEF3A1ABB1E3CE47C223AED1F03BC5C48A13C96392D3D02C7A03C23E4793DA03F873CC5BE91BD8505373D7029E43B5979B6BC8DABF4BD23D3CD3CD32C16BD7EBFB13CFBE15A3C676D31BDF6368A3D3E8CF93BBC54583D8C3783BD7468273C52D0253CABC27F3CD1E3EABDF7E4243D1209023D353152BC3BE93FBD159B09BC6B3CD3BBF2C3903B120D7A3C12A475BC015B2D3CBC9BF0BC5DAB013E1F3C16BDF30D813D644D2C3DDB4CA03D743B6ABB83EC103C5B50CF3C636E85BCB90BA53
B9CB83B3DAA5811BDD1DB9BB07D6C03BC5FE913BD6572A83BFF34DABDE34E28BD4F7731BC921DC1BDE13D613EFEF8A7BB125F743D87A92B3B5F0EF0BBB8A8CA3C8A36643C6F2F0FBE99E3BEBD9F111B3CCE20FD3A0A8E803C3175DBBC51A8D83B029D2B3C0537EEBC9B8F60BE3660873CE9298E3CD3639C3D65AE46BE030B46BC21C300BD37B0BB3CDAA48FBAF1BCC5BC4E76783D3B3DB83C561353BED63BA0BC85E044BC052A223CC81F0C3EB798903CBA9B223AAA93A0BD28A5DC3D25AF973BDF331FBD3FA6B9BDC1C4F9BC8BF7AF3C78D992BBDA89723C6AE9DE3C58CCE63B3C1149BCDFA4ACBCD9D588BB7F2F033EC1D4FFBD67280438E5F6F2BD9741DCBD02D84EBD29C424BD50B9103B8FD2A1BA488CEEBAD63D683C676C89BC3D276DBD1EBCAABC6B48493E8E9853BCF29E6B3CD45D41BD1305143EA340383D74FEDF3BE813B3BBC1AC3E3DD4C3023BDD29753D29A5993A601F4E378F79EEBA7AF8853A1D1869BD0CA89CBDBD2C153B6E37303C17BAE9BC48FAF63C655289BCD844813D5A6DD9BED3027BBEC9B4FF3CA02E3A3C6E52AD3C15AB8C3C15EA873C9C1F853AB3D564BCA483B4BCE4AF333CA8FD30BC144D623D4A5D7E3CA12C3B3D9FCA9DBCCE7556BBF279AC3CB9DF10BBB6FE2C3CAEAF8E3D252CDA396FDF73BD8332CB3C6AED623B351C22BCCADB483D86B8623C4B5EF53C1C4A21BDD7E9473C4116013D53E7A4BD73638FBB26F8063CF7FE113CA464BABCFAFEA9B978211A3CD541AABD871AB83CFE6C4BBD"> : tensor<2x4x16x128xf32>
+    %1 = "mhlo.transpose"(%arg0) {permutation = dense<[0,2,3,1]> : tensor<4xi64>} : (tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32>
+    %2 = mhlo.add %1, %0 : tensor<2x4x16x128xf32>
+    return %2 : tensor<2x4x16x128xf32>
+}
+// CHECK-LABEL: func.func @transpose_move_down_binary_case0
+// CHECK-NEXT: mhlo.constant
+// CHECK-NEXT: mhlo.add
+// CHECK-NEXT: mhlo.transpose
+// CHECK-NEXT: return
+
+func.func @transpose_move_down_binary_case1(%arg0 : tensor<2x128x4x16xf32>,%arg1 : tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32> {
+    %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0,2,3,1]> : tensor<4xi64>} : (tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32>
+    %1 = "mhlo.transpose"(%arg1) {permutation = dense<[0,2,3,1]> : tensor<4xi64>} : (tensor<2x128x4x16xf32>) -> tensor<2x4x16x128xf32>
+    %2 = mhlo.add %1, %0 : tensor<2x4x16x128xf32>
+    return %2 : tensor<2x4x16x128xf32>
+}
+// CHECK-LABEL: func.func @transpose_move_down_binary_case1
+// CHECK-NEXT: mhlo.add
+// CHECK-NEXT: mhlo.transpose
+// CHECK-NEXT: return
diff --git a/compiler/python/ByteIRModules.cpp b/compiler/python/ByteIRModules.cpp
index 3ba11567f..076e24c6d 100644
--- a/compiler/python/ByteIRModules.cpp
+++ b/compiler/python/ByteIRModules.cpp
@@ -15,6 +15,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "bindings/c/Passes.h"
 #include "byteir-c/Dialects.h"
 #include "byteir-c/Passes.h"
 #include "byteir-c/Translation.h"
@@ -30,6 +31,7 @@ static MlirStringRef toMlirStringRef(const std::string &s) {
 
 PYBIND11_MODULE(_byteir, m) {
   byteirRegisterAllPasses();
+  mlirRegisterAllMhloPasses();
 
   m.doc() = "byteir python extension";
 
diff --git a/compiler/python/byteir/compile.py b/compiler/python/byteir/compile.py
index 70c1bc1f8..297beaeb3 100644
--- a/compiler/python/byteir/compile.py
+++ b/compiler/python/byteir/compile.py
@@ -88,6 +88,11 @@ def compile_cuda(
         _print_verbose(module, "// IR Dump After GPU Opt:")
     with context:
         PassManager.parse("builtin.module(func.func(remove-func-body{anchor-attr=__byteir_elementwise_fusion__}))").run(module.operation)
+        PassManager.parse("builtin.module(inline)").run(module.operation)
+        if useBarePtrCallConv:
+            PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre{use-bare-ptr-memref-call-conv=true}))").run(module.operation)
+        else:
+            PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre))").run(module.operation)
         PassManager.parse("builtin.module(func.func(set-op-space{" + entry_func_str + " space={}".format(target) +  "}))").run(module.operation)
         PassManager.parse("builtin.module(set-arg-space{" + entry_func_str + " all-space={}".format(target) + "})").run(module.operation)
     if verbose:
@@ -127,7 +132,7 @@ def compile_cuda_with_ait(
     name: str = "model",
     aggressive_mode: bool = False,
     parallelism: int = 1,
-    disable_ait_cache: bool = False,
+    disable_byteir_cache: bool = False,
     **kwargs,
 ):
     target = "cuda"
@@ -143,7 +148,7 @@ def compile_cuda_with_ait(
     processor = IRProcessor(name, 
                             "./workspace", 
                             compile_parallelism=parallelism,
-                            disable_ait_cache=disable_ait_cache,
+                            disable_byteir_cache=disable_byteir_cache,
                             verbose=verbose)
     with context:
         processor.load_from_file(input)
@@ -202,6 +207,11 @@ def compile_cuda_with_ait(
         _print_verbose(processor.module, "// IR Dump After GPU Opt:")
     with context:
         PassManager.parse("builtin.module(func.func(remove-func-body{anchor-attr=__byteir_elementwise_fusion__}))").run(processor.module.operation)
+        PassManager.parse("builtin.module(inline)").run(processor.module.operation)
+        if useBarePtrCallConv:
+            PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre{use-bare-ptr-memref-call-conv=true}))").run(processor.module.operation)
+        else:
+            PassManager.parse("builtin.module(func.func(gpu-launch-func-to-byre))").run(processor.module.operation)
         PassManager.parse("builtin.module(func.func(set-op-space{" + entry_func_str + " space={}".format(target) +  "}))").run(processor.module.operation)
         PassManager.parse("builtin.module(set-arg-space{" + entry_func_str + " all-space={}".format(target) + "})").run(processor.module.operation)
     if verbose:
@@ -241,7 +251,7 @@ def compile(
     target: str = "cuda",
     verbose: bool = False,
     parallelism: int = 1,
-    disable_ait_cache: bool = False,
+    disable_byteir_cache: bool = False,
     **kwargs,
 ):
     if target == "cuda":
@@ -252,7 +262,7 @@ def compile(
                               entry_func, 
                               verbose, 
                               parallelism=parallelism, 
-                              disable_ait_cache=disable_ait_cache)
+                              disable_byteir_cache=disable_byteir_cache)
     elif target == "cuda_with_ait_aggressive":
         compile_cuda_with_ait(input, 
                               output, 
@@ -260,6 +270,6 @@ def compile(
                               verbose, 
                               aggressive_mode=True, 
                               parallelism=parallelism,
-                              disable_ait_cache=disable_ait_cache)
+                              disable_byteir_cache=disable_byteir_cache)
     else:
         raise NotImplemented("not implemented target: {}".format(target))
diff --git a/compiler/python/byteir/dialects/cat/ir_processor.py b/compiler/python/byteir/dialects/cat/ir_processor.py
index a397eff0d..c1388385b 100644
--- a/compiler/python/byteir/dialects/cat/ir_processor.py
+++ b/compiler/python/byteir/dialects/cat/ir_processor.py
@@ -19,7 +19,6 @@
 def func_hash_str(func, gpu_type):
     hash_str = gpu_type + "_"
     ops = func.entry_block.operations
-    # assert len(ops) == 2
     for op in ops:
         hash_str += f"{op.get_asm(large_elements_limit=None)};"
     return hash_str
@@ -29,7 +28,7 @@ def __init__(self,
                  job_name, 
                  workdir, 
                  compile_parallelism = MAX_COMPILATION_PARALLELISM,
-                 disable_ait_cache = False,
+                 disable_byteir_cache = False,
                  verbose = False):
         self.job_name = job_name
         self.workdir = workdir
@@ -37,14 +36,11 @@ def __init__(self,
         self.ait_reuse_recorder = {} # key: hash str, value: Tuple(dll_name, ait_module_path)
         self.compile_parallelism = min(compile_parallelism, MAX_COMPILATION_PARALLELISM)
         self.pool = multiprocessing.Pool(compile_parallelism)
-        self.ait_cache = AITCache()
+        self.byteir_cache = AITCache()
         self.verbose = verbose
-        # ait_cache is enabled when ait_reuse is enabled
-        # in other words, once `ait_reuse` is set to False, 
-        # we will orcely compile all ait ops with bo reuse or cache.
-        self.disable_ait_cache = disable_ait_cache
-        if not disable_ait_cache:
-            self.ait_cache.load_or_create_cache()
+        self.disable_byteir_cache = disable_byteir_cache
+        if not disable_byteir_cache:
+            self.byteir_cache.load_or_create_cache()
 
     def _get_builder(self, module, subgraph_name, backend="ait"):
         assert module != None
@@ -139,11 +135,11 @@ def ait_opt_pass(self, anchor_only=False, dump_ir=False):
                 self.ait_reuse_recorder[hash_str] = (builder.dll_name, builder.ait_module_path)
                 libs_to_add_to_cache[hash_str] = builder.ait_module_path
                 dedup_work_items.append((hash_str, func_ir_str))
-        
-        # search in ait cache
+
+        # search in byteir cache
         work_items_not_in_cache = []
         for hash_str, func_ir_str in dedup_work_items:
-            cached_lib = self.ait_cache.find(gpu_type, hash_str)
+            cached_lib = self.byteir_cache.find(gpu_type, hash_str)
             if cached_lib != None:
                 # hit, copy cached lib
                 context = ir.Context()
@@ -171,12 +167,12 @@ def ait_opt_pass(self, anchor_only=False, dump_ir=False):
         t_ed = time.time()
         print("compilation finished in {}s".format(t_ed-t_st))
 
-        # update ait cache
-        if not self.disable_ait_cache:
+        # update byteir cache
+        if not self.disable_byteir_cache:
             for key, lib_path in libs_to_add_to_cache.items():
-                self.ait_cache.add(gpu_type, key, lib_path, override=False)
-            self.ait_cache._save()
-            self.ait_cache.close_cache()
+                self.byteir_cache.add(gpu_type, key, lib_path, override=False)
+            self.byteir_cache._save()
+            self.byteir_cache.close_cache()
         
         with self.module.context:
             pm = PassManager.parse("builtin.module(func.func(gen-ait-config{{func-names={} ait-lib-paths={}}}))".format(funcNameArg, aitLibPathArg))
diff --git a/compiler/python/byteir/tools/compiler.py b/compiler/python/byteir/tools/compiler.py
index 5385614af..51aedc583 100644
--- a/compiler/python/byteir/tools/compiler.py
+++ b/compiler/python/byteir/tools/compiler.py
@@ -11,7 +11,7 @@
     parser.add_argument("--target", type=str, default="cuda", help="target device name")
     parser.add_argument("-v", "--verbose", action="store_true")
     parser.add_argument("--ait_parallelism", type=int, default=1, help="number of processes to compile ait op")
-    parser.add_argument("--disable_ait_cache", action="store_true")
+    parser.add_argument("--disable_byteir_cache", action="store_true")
 
     args = parser.parse_args()
     byteir.compile(args.input_mlir_path, 
@@ -20,6 +20,6 @@
                    args.target, 
                    args.verbose, 
                    args.ait_parallelism,
-                   args.disable_ait_cache)
+                   args.disable_byteir_cache)
 
 
diff --git a/compiler/test/Conversion/HloToCat/fused_ops.mlir b/compiler/test/Conversion/HloToCat/fused_ops.mlir
index 0b84fc4d5..92e0f2d1c 100644
--- a/compiler/test/Conversion/HloToCat/fused_ops.mlir
+++ b/compiler/test/Conversion/HloToCat/fused_ops.mlir
@@ -61,13 +61,22 @@ func.func @test_bmm_rcr_permute(%arg0: tensor<384x256x256xf32>, %arg1: tensor<38
 // CHECK-NEXT: cat.bmm_rcr_permute
 // CHECK-NEXT: return
 
+func.func @test_not_bmm_rrr_permute(%arg0: tensor<1x64x4096xf32>, %arg1: tensor<1x4096x4096xf32>) -> tensor<1x32x64x128xf32> {
+    %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot<lhs_batching_dimensions = [0], rhs_batching_dimensions = [0], lhs_contracting_dimensions = [2], rhs_contracting_dimensions = [1]>} : (tensor<1x64x4096xf32>, tensor<1x4096x4096xf32>) -> tensor<1x64x4096xf32>
+    %1 = mhlo.reshape %0 : (tensor<1x64x4096xf32>) -> tensor<1x64x32x128xf32>
+    %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x64x32x128xf32>) -> tensor<1x32x64x128xf32>
+    return %2 : tensor<1x32x64x128xf32>
+}
+// CHECK-LABEL: func.func @test_not_bmm_rrr_permute
+// CHECK-NOT: cat.bmm_rrr_permute
+
 func.func @test_bmm_rrr_add_0(%arg0: tensor<384x256x256xf32>, %arg1: tensor<384x256x64xf32>, %arg2: tensor<384x256x64xf32>) -> tensor<384x256x64xf32> {
     %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot<lhs_batching_dimensions = [0], rhs_batching_dimensions = [0], lhs_contracting_dimensions = [2], rhs_contracting_dimensions = [1]>} : (tensor<384x256x256xf32>, tensor<384x256x64xf32>) -> tensor<384x256x64xf32>
     %1 = mhlo.add %0, %arg2 : (tensor<384x256x64xf32>, tensor<384x256x64xf32>) -> tensor<384x256x64xf32>
     return %1 : tensor<384x256x64xf32>
 }
 
-// CHECK: func.func @test_bmm_rrr_add_0
+// CHECK-LABEL: func.func @test_bmm_rrr_add_0
 // CHECK-NEXT: cat.bmm_rrr_add
 // CHECK-NEXT: return
 
@@ -77,7 +86,7 @@ func.func @test_bmm_rrr_add_1(%arg0: tensor<384x256x256xf32>, %arg1: tensor<384x
     return %1 : tensor<384x256x64xf32>
 }
 
-// CHECK: func.func @test_bmm_rrr_add_1
+// CHECK-LABEL: func.func @test_bmm_rrr_add_1
 // CHECK-NEXT: cat.bmm_rrr_add
 // CHECK-NEXT: return
 
@@ -88,7 +97,7 @@ func.func @test_bmm_crr_add(%arg0: tensor<384x256x256xf32>, %arg1: tensor<384x25
     return %2 : tensor<384x256x64xf32>
 }
 
-// CHECK: func.func @test_bmm_crr_add
+// CHECK-LABEL: func.func @test_bmm_crr_add
 // CHECK-NEXT: cat.bmm_crr_add
 // CHECK-NEXT: return
 
@@ -205,7 +214,7 @@ func.func @test_gemm_rrr_bias(%arg0: tensor<2x2048xf32>, %arg1: tensor<2048x1001
     return %3 : tensor<2x1001xf32>
 }
 
-// CHECK: func.func @test_gemm_rrr_bias
+// CHECK-LABEL: func.func @test_gemm_rrr_bias
 // CHECK-NEXT: mhlo.constant
 // CHECK-NEXT: cat.gemm_rrr_bias
 // CHECK-NEXT: return
@@ -216,7 +225,7 @@ func.func @test_bmm_crc(%arg0: tensor<512x1024x128xf16>, %arg1: tensor<512x1024x
     return %1 : tensor<512x1024x128xf16>
 }
 
-// CHECK: func.func @test_bmm_crc
+// CHECK-LABEL: func.func @test_bmm_crc
 // CHECK-NEXT: cat.bmm_crc
 // CHECK-NEXT: return
 
@@ -226,7 +235,7 @@ func.func @test_bmm_rrc(%arg0: tensor<512x128x1024xf16>, %arg1: tensor<512x1024x
     return %1 : tensor<512x1024x128xf16>
 }
 
-// CHECK: func.func @test_bmm_rrc
+// CHECK-LABEL: func.func @test_bmm_rrc
 // CHECK-NEXT: cat.bmm_rrc
 // CHECK-NEXT: return
 
@@ -237,7 +246,7 @@ func.func @test_transpose_reshape_bmm_rrr_to_reshape_bmm_rcr(%arg0: tensor<64x12
     return %2 : tensor<64x128x128xf16>
 }
 
-// CHECK: func.func @test_transpose_reshape_bmm_rrr_to_reshape_bmm_rcr
+// CHECK-LABEL: func.func @test_transpose_reshape_bmm_rrr_to_reshape_bmm_rcr
 // CHECK-NEXT: mhlo.reshape
 // CHECK-NEXT: cat.bmm_rcr
 // CHECK-NEXT: return
@@ -248,7 +257,7 @@ func.func @test_bmm_rrr_reshape_transpose_to_bmm_rrc_reshape(%arg0: tensor<64x12
     %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<2x32x128x128xf16>) -> tensor<2x32x128x128xf16>
     return %2 : tensor<2x32x128x128xf16>
 }
-// CHECK: func.func @test_bmm_rrr_reshape_transpose_to_bmm_rrc_reshape
+// CHECK-LABEL: func.func @test_bmm_rrr_reshape_transpose_to_bmm_rrc_reshape
 // CHECK-NEXT: cat.bmm_rrc
 // CHECK-NEXT: mhlo.reshape
 // CHECK-NEXT: return
@@ -259,7 +268,7 @@ func.func @test_bmm_crr_reshape_transpose_to_bmm_crc_reshape(%arg0: tensor<512x1
     %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<16x32x128x128xf16>) -> tensor<16x32x128x128xf16>
     return %2 : tensor<16x32x128x128xf16>
 }
-// CHECK: func.func @test_bmm_crr_reshape_transpose_to_bmm_crc_reshape
+// CHECK-LABEL: func.func @test_bmm_crr_reshape_transpose_to_bmm_crc_reshape
 // CHECK-NEXT: cat.bmm_crc
 // CHECK-NEXT: mhlo.reshape
 // CHECK-NEXT: return
@@ -268,8 +277,43 @@ func.func @test_softmax_f16(%arg0 : tensor<1x12x1024x1024xf16>) -> tensor<1x12x1
   %0 = mhlo.custom_call @byteir.softmax(%arg0) {backend_config = "", byteir_attrs = {axis = 3 : i64}} : (tensor<1x12x1024x1024xf16>) -> tensor<1x12x1024x1024xf32>
   return %0 : tensor<1x12x1024x1024xf32>
 }
-
-// CHECK: func.func @test_softmax_f16
+// CHECK-LABEL: func.func @test_softmax_f16
 // CHECK-NEXT: cat.softmax
 // CHECK-NEXT: mhlo.convert
 // CHECK-NEXT: return
+
+func.func @test_bmm_rrr_broadcast_to_reshape_gemm_rrr_reshape(%arg0: tensor<16x1024x4096xf16>, %arg1: tensor<4096x4096xf16>) -> tensor<16x1024x4096xf16> {
+  %0 = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<16x4096x4096xf16>
+  %1 = "cat.bmm_rrr"(%arg0, %0) : (tensor<16x1024x4096xf16>, tensor<16x4096x4096xf16>) -> tensor<16x1024x4096xf16>
+  return %1 : tensor<16x1024x4096xf16>
+}
+// CHECK-LABEL: func.func @test_bmm_rrr_broadcast_to_reshape_gemm_rrr_reshape
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: cat.gemm_rrr
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: return
+
+func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rrr_permute(%arg0: tensor<16x1024x4096xf16>, %arg1: tensor<4096x4096xf16>) -> tensor<16x32x1024x128xf16> {
+  %0 = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<16x4096x4096xf16>
+  %1 = "cat.bmm_rrr"(%arg0, %0) : (tensor<16x1024x4096xf16>, tensor<16x4096x4096xf16>) -> tensor<16x1024x4096xf16>
+  %2 = mhlo.reshape %1 : (tensor<16x1024x4096xf16>) -> tensor<16x1024x32x128xf16>
+  %3 = "mhlo.transpose"(%2) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<16x1024x32x128xf16>) -> tensor<16x32x1024x128xf16>
+  return %3 : tensor<16x32x1024x128xf16>
+}
+// CHECK-LABEL: func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rrr_permute
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: cat.gemm_rrr_permute
+// CHECK-NEXT: return
+
+func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rcr_permute(%arg0: tensor<16x1024x4096xf16>, %arg1: tensor<4096x4096xf16>) -> tensor<16x32x1024x128xf16> {
+  %t = "mhlo.transpose"(%arg1) {permutation = dense<[1,0]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+  %0 = "mhlo.broadcast_in_dim"(%t) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4096x4096xf16>) -> tensor<16x4096x4096xf16>
+  %1 = "cat.bmm_rrr"(%arg0, %0) : (tensor<16x1024x4096xf16>, tensor<16x4096x4096xf16>) -> tensor<16x1024x4096xf16>
+  %2 = mhlo.reshape %1 : (tensor<16x1024x4096xf16>) -> tensor<16x1024x32x128xf16>
+  %3 = "mhlo.transpose"(%2) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<16x1024x32x128xf16>) -> tensor<16x32x1024x128xf16>
+  return %3 : tensor<16x32x1024x128xf16>
+}
+// CHECK-LABEL: func.func @test_transpose_bmm_rrr_broadcast_to_gemm_rcr_permute
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: cat.gemm_rcr_permute
+// CHECK-NEXT: return
\ No newline at end of file
diff --git a/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir b/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir
index 9eec21e83..4dfa8f99a 100644
--- a/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir
+++ b/compiler/test/Conversion/ToByre/convertMemRefToByre.mlir
@@ -1,4 +1,4 @@
-// RUN: byteir-opt -convert-lmhlo-to-byre %s | FileCheck %s
+// RUN: byteir-opt -memref-to-byre --split-input-file %s | FileCheck %s
 
 module attributes {byre.container_module} {
 // CHECK: module attributes {byre.container_module}  {
@@ -8,4 +8,22 @@ module attributes {byre.container_module} {
   // CHECK: byre.copy(%arg0, %alloc) {callee = "cpu2gpu"} : memref<4xf32, "cpu">, memref<4xf32, "gpu">
     return
   }
-}
\ No newline at end of file
+}
+
+// -----
+
+module attributes {byre.container_module} {
+// CHECK: module attributes {byre.container_module}  {
+  func.func @forward(%arg0: memref<i64, "cuda"> {byre.argname = "A", byre.argtype = 1 : i32}, %arg1: memref<2xi64, "cuda"> {byre.argname = "Out", byre.argtype = 2 : i32}) attributes { byre.entry_point } {
+    %expand_shape = memref.expand_shape %arg0 [] : memref<i64, "cuda"> into memref<1xi64, "cuda">
+    // CHECK: byre.alias
+    %alloc = memref.alloc() : memref<2xi64, "cuda">
+    %subview = memref.subview %alloc[0] [1] [1] : memref<2xi64, "cuda"> to memref<1xi64, strided<[1]>, "cuda">
+    // CHECK: byre.alias
+    memref.copy %expand_shape, %subview : memref<1xi64, "cuda"> to memref<1xi64, strided<[1]>, "cuda">
+    // CHECK: byre.copy
+    memref.copy %alloc, %arg1 : memref<2xi64, "cuda"> to memref<2xi64, "cuda">
+    // CHECK: byre.copy
+    return
+  }
+}
diff --git a/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir b/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir
index 8759d25ef..f8e0c1992 100644
--- a/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir
+++ b/compiler/test/Dialect/Linalg/transform-op-fold-unit-extent-dims.mlir
@@ -18,6 +18,8 @@ func.func @tensor_collapse(%arg0 : tensor<12x1024x1024xf32>, %arg1 : tensor<1x10
 
 transform.sequence failures(propagate) {
 ^bb0(%arg0: !pdl.operation):
-  %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  %1 = transform.structured.fold_unit_extent_dims %0
+  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
+  transform.apply_patterns to %0 {
+    transform.apply_patterns.linalg.fold_unit_extent_dims_via_reshapes
+  } : !pdl.operation
 }
diff --git a/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir b/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir
index 1b13f2851..2ed4c0d77 100644
--- a/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir
+++ b/compiler/test/Dialect/Mhlo/transforms/ConvertOpToCustomCall.mlir
@@ -61,3 +61,22 @@ func.func @convert_rng_dynamic(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
 // CHECK-NEXT:  call @NextOffsetFunc
 // CHECK-NEXT:  mhlo.custom_call
 // CHECK-SAME:  @byteir.rng_uniform
+
+// -----
+
+func.func @flash_attn_fwd(%arg0: tensor<2x256x12x128xf16>, %arg1: tensor<2x256x12x128xf16>, %arg2: tensor<2x256x12x128xf16>) -> (tensor<2x256x12x128xf16>, tensor<2x12x256xf32>, tensor<2x12x256x256xf16>, tensor<2xi64>) {
+  %0:4 = mhlo.custom_call @byteir.flash_attn_fwd(%arg0, %arg1, %arg2) {backend_config = "", byteir_attrs = {casual = false, dropout_p = 1.000000e-01 : f64, return_softmax = false, softmax_scale = 1.000000e+00 : f64}} : (tensor<2x256x12x128xf16>, tensor<2x256x12x128xf16>, tensor<2x256x12x128xf16>) -> (tensor<2x256x12x128xf16>, tensor<2x12x256xf32>, tensor<2x12x256x256xf16>, tensor<2xi64>)
+  return %0#0, %0#1, %0#2, %0#3 : tensor<2x256x12x128xf16>, tensor<2x12x256xf32>, tensor<2x12x256x256xf16>, tensor<2xi64>
+}
+
+// CHECK-LABEL: func.func private @NextOffsetFunc() -> tensor<i64> attributes {byre_compute_name = "NextOffset", byre_force_compute_name}
+// CHECK-LABEL: func.func private @GetSeedFunc() -> tensor<i64> attributes {byre_compute_name = "GetSeed", byre_force_compute_name}
+// CHECK-LABEL: func.func @flash_attn_fwd
+// CHECK-NEXT:  call @GetSeedFunc
+// CHECK-NEXT:  call @NextOffsetFunc
+// CHECK-NEXT:  mhlo.reshape
+// CHECK-NEXT:  mhlo.reshape
+// CHECK-NEXT:  mhlo.concatenate
+// CHECK-NEXT:  mhlo.custom_call
+// CHECK-SAME:  @byteir.flash_attn_fwd
+// CHECK-SAME:  byteir_attrs = {casual = false, dropout_p = 1.000000e-01 : f64, return_softmax = false, softmax_scale = 1.000000e+00 : f64}
diff --git a/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir b/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir
index 3a46a008b..aa83789d5 100644
--- a/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir
+++ b/compiler/test/Dialect/Mhlo/transforms/hloMoveDown.mlir
@@ -34,6 +34,18 @@ func.func @transpose_move_down_binary_splat_const(%arg0 : tensor<31x20x32xf32>)
 // CHECK-NEXT: mhlo.transpose
 // CHECK-NEXT: return
 
+func.func @transpose_move_down_binary_dense_const(%arg0 : tensor<3x2xf32>) -> tensor<2x3xf32> {
+    %0 = mhlo.constant dense<[[1.0,2.0,3.0],[4.0,5.0,6.0]]> : tensor<2x3xf32>
+    %1 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<3x2xf32>) -> tensor<2x3xf32>
+    %2 = mhlo.add %1, %0 : tensor<2x3xf32>
+    return %2 : tensor<2x3xf32>
+}
+// CHECK-LABEL: func.func @transpose_move_down_binary_dense_const
+// CHECK-NEXT: mhlo.constant {{.*}} tensor<3x2xf32>
+// CHECK-NEXT: mhlo.add
+// CHECK-NEXT: mhlo.transpose
+// CHECK-NEXT: return
+
 func.func @transpose_move_down_unary_and_cancel(%arg0 : tensor<31x20x32xf32>) -> tensor<31x20x32xf32> {
     %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<31x20x32xf32>) -> tensor<20x31x32xf32>
     %1 = "mhlo.abs"(%0) : (tensor<20x31x32xf32>) -> tensor<20x31x32xf32>
@@ -76,19 +88,17 @@ func.func @transpose_move_down_two_unary(%arg0 : tensor<31x20x32xf32>) -> tensor
 // CHECK-NEXT: return
 
 // MULTIUSER-LABEL: func.func @transpose_move_down_two_unary
-// MULTIUSER-DAG{ABS}: mhlo.abs
-// MULTIUSER-NEXT{ABS}: mhlo.transpose
-// MULTIUSER-DAG{SINE}: mhlo.sine
-// MULTIUSER-NEXT{SINE}: mhlo.transpose
+// MULTIUSER-DAG: mhlo.abs
+// MULTIUSER-DAG: mhlo.sine
 // MULTIUSER: mhlo.add
+// MULTIUSER-NEXT: mhlo.transpose
 // MULTIUSER-NEXT: return
 
 // AllMULTIUSER-LABEL: func.func @transpose_move_down_two_unary
-// AllMULTIUSER-DAG{ABS}: mhlo.abs
-// AllMULTIUSER-NEXT{ABS}: mhlo.transpose
-// AllMULTIUSER-DAG{SINE}: mhlo.sine
-// AllMULTIUSER-NEXT{SINE}: mhlo.transpose
+// AllMULTIUSER-DAG: mhlo.abs
+// AllMULTIUSER-DAG: mhlo.sine
 // AllMULTIUSER: mhlo.add
+// AllMULTIUSER-NEXT: mhlo.transpose
 // AllMULTIUSER-NEXT: return
 
 func.func @transpose_move_down_1_unary_1_invalid(%arg0 : tensor<31x20x32xf32>, %arg1 : tensor<20x31x32xf32>)-> tensor<20x31x32xf32> {
diff --git a/compiler/test/Dialect/Tensor/canonicalizeExt.mlir b/compiler/test/Dialect/Tensor/canonicalizeExt.mlir
index 83b828455..b0042aa23 100644
--- a/compiler/test/Dialect/Tensor/canonicalizeExt.mlir
+++ b/compiler/test/Dialect/Tensor/canonicalizeExt.mlir
@@ -34,3 +34,14 @@ func.func @extract_slice_and_collapse_shape_no_fold(%arg0: tensor<19x1024x1xi32>
 //   CHECK: tensor.extract_slice
 //   CHECK: tensor.collapse_shape
 
+// -----
+
+func.func @fold_zero_rank_from_elements_insert_slice(%arg0: tensor<1024xf32>, %scalar : f32) -> tensor<1024xf32> {
+  %0 = tensor.from_elements %scalar : tensor<f32>
+  %1 = tensor.insert_slice %0 into %arg0[256] [1] [1] : tensor<f32> into tensor<1024xf32>
+  return %1 : tensor<1024xf32>
+}
+// CHECK-LABEL: fold_zero_rank_from_elements_insert_slice
+//   CHECK: tensor.insert
+//   CHECK-NOT: tensor.from_elements
+//   CHECK-NOT: tensor.insert_slice
\ No newline at end of file
diff --git a/compiler/test/Transforms/canonicalizeExt.mlir b/compiler/test/Transforms/canonicalizeExt.mlir
index abb11d500..a7dbb37cb 100644
--- a/compiler/test/Transforms/canonicalizeExt.mlir
+++ b/compiler/test/Transforms/canonicalizeExt.mlir
@@ -343,3 +343,40 @@ func.func @transpose_reshape_transpose(%arg0: tensor<2x32x128x256xf16>) -> (tens
 // CHECK-NEXT: mhlo.reshape
 // CHECK-NEXT: mhlo.reshape
 // CHECK-NEXT: return
+
+func.func @replace_gather_with_input_0() -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) {
+  %0 = mhlo.constant dense<1.000000e+00> : tensor<64x128xf16>
+  %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64>
+  %2 = "mhlo.gather"(%0, %1) {dimension_numbers = #mhlo.gather<offset_dims = [1], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 1>, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16>
+  %3 = mhlo.reshape %2 : (tensor<64x128xf16>) -> tensor<1x64x128xf16>
+  %4 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16>
+  return %3, %4 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16>
+}
+// CHECK-LABEL: @replace_gather_with_input_0
+// CHECK-NEXT: mhlo.constant
+// CHECK-NEXT: mhlo.constant
+// CHECK-NEXT: return
+
+func.func @replace_gather_with_input_1(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) {
+  %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64xi64>
+  %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather<offset_dims = [1], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 1>, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<64xi64>) -> tensor<64x128xf16>
+  %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16>
+  %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16>
+  return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16>
+}
+// CHECK-LABEL: @replace_gather_with_input_1
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: mhlo.broadcast_in_dim
+// CHECK-NEXT: return
+
+func.func @replace_gather_with_input_2(%arg0: tensor<64x128xf16>) -> (tensor<1x64x128xf16>, tensor<1x32x64x128xf16>) {
+  %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<128xi64>
+  %1 = "mhlo.gather"(%arg0, %0) {dimension_numbers = #mhlo.gather<offset_dims = [0], collapsed_slice_dims = [1], start_index_map = [1], index_vector_dim = 1>, indices_are_sorted = false, slice_sizes = dense<[64, 1]> : tensor<2xi64>} : (tensor<64x128xf16>, tensor<128xi64>) -> tensor<64x128xf16>
+  %2 = mhlo.reshape %1 : (tensor<64x128xf16>) -> tensor<1x64x128xf16>
+  %3 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<64x128xf16>) -> tensor<1x32x64x128xf16>
+  return %2, %3 : tensor<1x64x128xf16>, tensor<1x32x64x128xf16>
+}
+// CHECK-LABEL: @replace_gather_with_input_2
+// CHECK-NEXT: mhlo.reshape
+// CHECK-NEXT: mhlo.broadcast_in_dim
+// CHECK-NEXT: return
diff --git a/compiler/test/Transforms/memoryPlanning.mlir b/compiler/test/Transforms/memoryPlanning.mlir
index 1edca2483..bb9abd835 100644
--- a/compiler/test/Transforms/memoryPlanning.mlir
+++ b/compiler/test/Transforms/memoryPlanning.mlir
@@ -1,7 +1,7 @@
-// RUN: byteir-opt %s -memory-planning --canonicalize --cse | FileCheck %s
-// RUN: byteir-opt %s -memory-planning="alignment=64" --canonicalize --cse | byteir-stat --alloc-cnt | FileCheck %s --check-prefix CHECK-STAT
-// RUN: byteir-opt %s -memory-planning="alloca" --canonicalize --cse | FileCheck %s --check-prefix CHECK-ALLOCA
-// RUN: byteir-opt %s -memory-planning="alloca mem-space=2" --canonicalize --cse | FileCheck %s --check-prefix CHECK-SPACE
+// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning,canonicalize,cse))' | FileCheck %s
+// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning{alignment=64},canonicalize,cse))' | byteir-stat --alloc-cnt | FileCheck %s --check-prefix CHECK-STAT
+// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning{alloca},canonicalize,cse))' | FileCheck %s --check-prefix CHECK-ALLOCA
+// RUN: byteir-opt %s --pass-pipeline='builtin.module(func.func(memory-planning{alloca mem-space=2},canonicalize,cse))' | FileCheck %s --check-prefix CHECK-SPACE
 
 func.func @test_basic_reuse(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>) -> memref<256xf32> attributes {__placeholder__byre.entry_point} {
   %0 = memref.alloc() : memref<256xf32>
@@ -203,9 +203,9 @@ func.func @test_reuse_sub_chunk_i1(%arg0 : memref<512xi1>, %arg1 : memref<512xi1
 
 func.func @test_reuse_single_memory_space(%arg0 : memref<512xf32, 1>, %arg1 : memref<512xf32, 2>) {
   %0 = memref.alloc() : memref<512xf32, 1>
-  %1 = memref.alloc() : memref<512xf32, 2>
+  %1 = memref.alloca() : memref<512xf32, 2>
   %2 = memref.alloc() : memref<512xf32, 1>
-  %3 = memref.alloc() : memref<512xf32, 2>
+  %3 = memref.alloca() : memref<512xf32, 2>
   "lmhlo.add"(%arg0, %arg0, %0) : (memref<512xf32, 1>, memref<512xf32, 1>,  memref<512xf32, 1>) -> ()
   "lmhlo.add"(%arg1, %arg1, %1) : (memref<512xf32, 2>, memref<512xf32, 2>,  memref<512xf32, 2>) -> ()
   "lmhlo.add"(%0, %0, %arg0) : (memref<512xf32, 1>, memref<512xf32, 1>,  memref<512xf32, 1>) -> ()
diff --git a/compiler/tools/byteir-opt/CMakeLists.txt b/compiler/tools/byteir-opt/CMakeLists.txt
index 9c13a778e..f8b667c21 100644
--- a/compiler/tools/byteir-opt/CMakeLists.txt
+++ b/compiler/tools/byteir-opt/CMakeLists.txt
@@ -8,6 +8,7 @@ set(BYTEIR_LIBS
   MLIRCclTransformOps
   ByteIRAffinePasses
   ByteIRByrePasses
+  ByteIRGPUPasses
   ByteIRGPUPipelines
   ByteIRHostPipelines
   ByteIRLinalgPasses
diff --git a/compiler/tools/byteir-opt/byteir-opt.cpp b/compiler/tools/byteir-opt/byteir-opt.cpp
index 8cb60be72..ed3c15e51 100644
--- a/compiler/tools/byteir-opt/byteir-opt.cpp
+++ b/compiler/tools/byteir-opt/byteir-opt.cpp
@@ -21,6 +21,7 @@
 #include "byteir/Dialect/Ccl/IR/CclOps.h"
 #include "byteir/Dialect/Ccl/Passes.h"
 #include "byteir/Dialect/Ccl/TransformOps/CclTransformOps.h"
+#include "byteir/Dialect/GPU/Passes.h"
 #include "byteir/Dialect/Lace/LaceDialect.h"
 #include "byteir/Dialect/Linalg/IR/LinalgExtOps.h"
 #include "byteir/Dialect/Linalg/Passes.h"
@@ -30,6 +31,7 @@
 #include "byteir/Dialect/Shape/IR/ShapeExtOps.h"
 #include "byteir/Dialect/Shape/Passes.h"
 #include "byteir/Dialect/Tensor/IR/TilingInterfaceImpl.h"
+#include "byteir/Dialect/Tensor/Passes.h"
 #include "byteir/Dialect/Transform/IR/TransformExtOps.h"
 #include "byteir/Dialect/Transform/Passes.h"
 #include "byteir/Dialect/Vector/Transforms/Passes.h"
@@ -123,11 +125,13 @@ int main(int argc, char **argv) {
   registerByteIRAffinePasses();
   registerByteIRByrePasses();
   registerByteIRCclPasses();
+  registerByteIRGPUPasses();
   registerByteIRLinalgPasses();
   registerByteIRMemRefPasses();
   registerByteIRMhloPassesExt();
   registerByteIRSCFPasses();
   registerByteIRShapePasses();
+  registerByteIRTensorPasses();
   registerByteIRTransformPasses();
   registerByteIRVectorPasses();
 
diff --git a/external/patches/AITemplate/logging.patch b/external/patches/AITemplate/logging.patch
new file mode 100644
index 000000000..a5dec1b54
--- /dev/null
+++ b/external/patches/AITemplate/logging.patch
@@ -0,0 +1,17 @@
+diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp
+index 5548a97..920ed60 100644
+--- a/static/csrc/model_container.cpp
++++ b/static/csrc/model_container.cpp
+@@ -80,9 +80,9 @@ ModelContainer::ModelContainer(
+       useDebugLogging = true;
+     }
+   }
+-  LOG(INFO)
+-      << (useDebugLogging ? PrintDebugDeviceProperties(prop)
+-                          : PrintInfoDeviceProperties(prop));
++  //LOG(INFO)
++  //    << (useDebugLogging ? PrintDebugDeviceProperties(prop)
++  //                        : PrintInfoDeviceProperties(prop));
+ 
+   LOG(INFO) << "Init AITemplate Runtime with " << num_models << " concurrency";
+   models_.reserve(num_models);
diff --git a/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp b/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp
index dfda80cc2..a8918c43c 100644
--- a/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp
+++ b/frontends/onnx-frontend/onnx-frontend/src/Conversion/OFRewriteToCustomCall.cpp
@@ -188,6 +188,7 @@ Value createL2NormWithoutEps(PatternRewriter &rewriter, Location loc,
       mhlo::CustomCallSchedule::NONE, nullptr, nullptr,
       rewriter.getArrayAttr(llvm::ArrayRef<mlir::Attribute>{}));
   DictionaryAttrWrapper attrs(rewriter.getContext());
+  attrs.setAttr("epsilon", rewriter.getF64FloatAttr(0.0));
   attrs.setAttr("axis", rewriter.getI64ArrayAttr({axis}));
   customCallOp->setAttr(BYTEIR_ATTRS, getCleanAttr(attrs));
 
diff --git a/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir b/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir
index d0d986dc9..0e87cd743 100644
--- a/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir
+++ b/frontends/onnx-frontend/onnx-frontend/test/of_rewrite_to_custom_call.mlir
@@ -175,7 +175,7 @@ func.func @test_l2_norm_pat2(%1146: tensor<12x128xf32>) -> tensor<12x128xf32> {
   return %1148 : tensor<12x128xf32>
 // CHECK-LABEL:  @test_l2_norm_pat2
 // CHECK-SAME:   ([[PARAM_0_:%.+]]: tensor<12x128xf32>) -> tensor<12x128xf32> {
-// CHECK-NEXT:   [[VAR_0_:%.+]] = mhlo.custom_call @byteir.l2_norm(%arg0) {backend_config = "", byteir_attrs = {axis = [1]}} : (tensor<12x128xf32>) -> tensor<12x128xf32>
+// CHECK-NEXT:   [[VAR_0_:%.+]] = mhlo.custom_call @byteir.l2_norm(%arg0) {backend_config = "", byteir_attrs = {axis = [1], epsilon = 0.000000e+00 : f64}} : (tensor<12x128xf32>) -> tensor<12x128xf32>
 // CHECK-NEXT:   return [[VAR_0_]] : tensor<12x128xf32>
 }
 
diff --git a/frontends/torch-frontend/examples/demo/README.md b/frontends/torch-frontend/examples/demo/README.md
new file mode 100644
index 000000000..65ce80a6a
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/README.md
@@ -0,0 +1,16 @@
+# ByteIR GPU Compiler for LLM on Torch 2.0
+
+### Steps to run
+1. Build docker image with [Dockerfile](../../../../docker/Dockerfile).
+2. Download ByteIR release and unzip it.
+3. Install ByteIR components:
+    * python3 -m pip install -r ByteIR/requirements.txt
+    * python3 -m pip install ByteIR/*.whl
+4. Run training demo:
+    * python3 main.py \<model-name\> <--flash>
+    * **model-name:** ["gpt2", "bloom-560m", "llama", "opt-1.3b", "nanogpt"]
+    * **--flash:** optional flag that enables flash attention
+5. Run inference demo:
+    * python3 main.py \<model-name\> --infer <--flash>
+    * **model-name:** ["llama"]
+    * **--flash:** optional flag that enables flash attention
diff --git a/frontends/torch-frontend/examples/demo/backend.py b/frontends/torch-frontend/examples/demo/backend.py
new file mode 100644
index 000000000..2e67b84d4
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/backend.py
@@ -0,0 +1,195 @@
+import os
+import torch
+import functools
+
+from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode
+from torch.fx.passes.fake_tensor_prop import FakeTensorProp
+
+import brt
+import byteir
+
+from torch_frontend import compile
+from torch_frontend import list_decomposed_ops, preprocess_fx_graph, fx_replace_attn_pattern, replace_flash_attn, get_none_indices
+
+
+TRACE = False
+
+cnt = 0
+MODEL_NAME = ''
+FLASH = False
+
+
+from functorch.compile import aot_module
+from torch._decomp import get_decompositions
+
+from torch.cuda.memory import caching_allocator_alloc, caching_allocator_delete
+
+class ByteIRInferenceFunction:  # callable wrapper around a brt.Session loaded from `module_path`
+    def __init__(self, module_path):
+        self._session = brt.Session(alloc_func=caching_allocator_alloc,
+                                    free_func=caching_allocator_delete)
+        self._session.load(module_path)
+        self._req = self._session.new_request_context(  # uses the CUDA stream current at construction
+            torch.cuda.current_stream()._as_parameter_.value)
+
+    def __call__(self, *inputs):
+        device = inputs[0].device  # outputs are allocated on the first input's device
+        from brt.utils import brt_dtype_to_torch_dtype
+        results = [torch.empty(self._session.get_static_shape(offset),
+                               dtype=brt_dtype_to_torch_dtype(self._session.get_data_type(offset)),
+                               device=device) for offset in self._session.get_output_arg_offsets()]
+        # Bind each input's, then each output's, data pointer to its session argument offset.
+        for offset, input in zip(self._session.get_input_arg_offsets(), inputs):
+            self._req.bind_arg(offset, input.data_ptr())
+        for offset, output in zip(self._session.get_output_arg_offsets(), results):
+            self._req.bind_arg(offset, output.data_ptr())
+        self._req.finish_io_binding()
+        self._req.run()
+        self._req.sync()
+        return results
+
+class ByteIRFunction:
+    """Callable that runs a compiled ByteIR module and re-inserts `None` results at `none_indices`."""
+    def __init__(self, module_path, output_shapes, output_dtypes, none_indices):
+        self._session = brt.Session(alloc_func=caching_allocator_alloc,
+                                    free_func=caching_allocator_delete)
+        self._session.load(module_path)
+        self._output_shapes = output_shapes
+        self._output_dtypes = output_dtypes
+        self._req = self._session.new_request_context(
+            torch.cuda.current_stream()._as_parameter_.value)
+        self._none_indices = none_indices
+
+    def __call__(self, *inputs):
+        """Run the module on `inputs` and return a list of output tensors interleaved with `None`s."""
+        if TRACE:
+            for i, tensor in enumerate(inputs):
+                print("In ByteIRFunction, Inputs["+str(i)+"]", tensor)
+
+        device = inputs[0].device
+        rets = [torch.empty(shape, dtype=dtype, device=device)
+                for shape, dtype in zip(self._output_shapes, self._output_dtypes)]
+        for offset, arg in zip(self._session.get_input_arg_offsets(), inputs):
+            assert list(self._session.get_static_shape(offset)) == list(arg.shape)
+        for offset, ret in zip(self._session.get_output_arg_offsets(), rets):
+            assert list(self._session.get_static_shape(offset)) == list(ret.shape)
+
+        for offset, tensor in zip(self._session.get_input_arg_offsets(), inputs):
+            self._req.bind_arg(offset, tensor.data_ptr())
+        for offset, tensor in zip(self._session.get_output_arg_offsets(), rets):
+            self._req.bind_arg(offset, tensor.data_ptr())
+        self._req.finish_io_binding()
+        self._req.run()
+        self._req.sync()
+
+        if TRACE:
+            # Dump every output tensor before the None placeholders are merged back in.
+            for i, tensor in enumerate(rets):
+                print("In ByteIRFunction, Outputs["+str(i)+"]", tensor)
+
+        # Merge: positions listed in none_indices (consumed in order) get None, the rest take rets in order.
+        results = []
+        none_ptr = 0
+        ret_ptr = 0
+        for i in range(len(rets) + len(self._none_indices)):
+            if none_ptr < len(self._none_indices) and i == self._none_indices[none_ptr]:
+                results.append(None)
+                none_ptr += 1
+            else:
+                results.append(rets[ret_ptr])
+                ret_ptr += 1
+        return results
+
+def byteir_compile_fx_inner(graph: torch.fx.GraphModule, inputs, is_backward, ban_lst=None):
+    """Compile an FX graph via torch_frontend and byteir, returning a ByteIRFunction (`ban_lst` is unused)."""
+    category = 'backward' if is_backward else 'forward'
+
+    print("\n\n============")
+    print(f"{category} Part")
+    print("============\n\n")
+    none_indices = get_none_indices(graph)
+    fx_graph = preprocess_fx_graph(graph)
+
+    compile_type = 'mhlo'
+    backend_legal_ops = [
+        "aten._softmax",
+        "aten.softmax.int",
+        "aten.log_softmax.int",
+        "aten._log_softmax",
+        # "aten.native_layer_norm",
+        # "aten.layer_norm",
+        "aten.gelu",
+        "aten.argmax",
+        "aten.max.dim",
+        "aten.one_hot",
+        "aten.topk",
+        "byteir.flash_attn_fwd",
+        "byteir.flash_attn_bwd",
+    ]
+    with maybe_disable_fake_tensor_mode():
+        compiled_graph = compile(fx_graph, inputs, compile_type, backend_legal_ops=backend_legal_ops)
+
+    model_name = MODEL_NAME
+    global cnt
+    TEMP_FOLDER="./temp"
+    os.makedirs(TEMP_FOLDER, exist_ok=True)
+    os.makedirs(TEMP_FOLDER + f"/{model_name}_{category}", exist_ok=True)
+    mlir_file_name = f'{TEMP_FOLDER}/{model_name}_{category}_{cnt}.{compile_type}.mlir'
+    # NOTE(review): unlike mlir_file_name this path has no counter, so graphs of one category share it - confirm.
+    output_mlir_file_name = f'{TEMP_FOLDER}/{model_name}_{category}/{model_name}_{category}.rt.mlir'
+    cnt = cnt + 1
+    with open(mlir_file_name, "w+") as fout:
+        compiled_graph.operation.print(file=fout, large_elements_limit=None)
+
+    with maybe_disable_fake_tensor_mode():
+        byteir.compile(mlir_file_name, output_mlir_file_name, entry_func='forward', target='cuda_with_ait')
+
+    outputs = FakeTensorProp(graph).propagate(*inputs)
+    mhlo_ret_dtypes = [t.dtype for t in outputs]
+    mhlo_ret_shapes = [t.shape for t in outputs]
+    print(output_mlir_file_name)
+    return ByteIRFunction(output_mlir_file_name, mhlo_ret_shapes, mhlo_ret_dtypes, none_indices)
+
+
+from torch._inductor.virtualized import V
+from torch._dynamo.utils import detect_fake_mode
+from torch._dynamo.backends.common import aot_autograd
+from torch._inductor.fx_passes.joint_graph import joint_graph_passes
+
+
+def fuse_aware_byteir_compile_fx(model_: torch.fx.GraphModule, example_inputs_):
+    from partitioners import fuse_aware_min_cut_partition
+    # TODO: can add logging before/after the call to create_aot_dispatcher_function
+    # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
+    # once torchdynamo is merged into pytorch
+    fake_mode = detect_fake_mode(example_inputs_) or torch._subclasses.FakeTensorMode(
+        allow_non_fake_inputs=True
+    )
+    tracing_context = (
+        torch._guards.TracingContext.get() or torch._guards.TracingContext(fake_mode)
+    )
+    decompose_list = list_decomposed_ops()
+    decompositions = get_decompositions(decompose_list)
+
+    def partition_fn(graph, joint_inputs, **kwargs):
+        joint_graph_passes(graph)
+        return fuse_aware_min_cut_partition(
+            graph, joint_inputs, **kwargs, compiler="inductor"
+        )
+
+    if FLASH:
+        # preprocess flash attention
+        # replace attention pattern to scaled_dot_product_attention
+        model_ = fx_replace_attn_pattern(model_)
+        # replace scaled_dot_product_attention to byteir.flash_attn
+        model_ = replace_flash_attn(model_)
+
+    with V.set_fake_mode(fake_mode), torch._guards.tracing(tracing_context):
+        return aot_autograd(
+            fw_compiler=functools.partial(byteir_compile_fx_inner, is_backward=False),
+            bw_compiler=functools.partial(byteir_compile_fx_inner, is_backward=True),
+            inference_compiler=functools.partial(byteir_compile_fx_inner, is_backward=False),
+            decompositions=decompositions,
+            partition_fn=partition_fn,
+            keep_inference_input_mutations=True,
+        )(model_, example_inputs_)
diff --git a/frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py b/frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py
new file mode 100644
index 000000000..1629f39f5
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/byteir_fusible_pattern.py
@@ -0,0 +1,194 @@
+import torch
+import torch.fx as fx
+
+from compile_utils import get_aten_target
+from fx_match_utils import get_node_consumer, match_chain
+
+byteir_fusible_patterns = {}
+aten = torch.ops.aten
+
+def register_byteir_pattern(name):  # decorator factory: stores the decorated pattern under `name`
+    def register(pattern):
+        if name in byteir_fusible_patterns:
+            raise ValueError("Pattern " + name + " has already been registered.")
+        byteir_fusible_patterns[name] = pattern
+        return pattern  # returned unchanged so the decorated class stays usable
+    return register
+
+
+class ByteIRFusiblePattern:
+
+    @classmethod
+    def match(cls, node, required_fw_nodes) -> bool:
+        raise NotImplementedError
+
+    @classmethod
+    def get_pattern_recompute_nodes(cls, node, required_fw_nodes):
+        raise NotImplementedError
+
+@register_byteir_pattern("transpose_dot")
+class TransposeDotPattern(ByteIRFusiblePattern):
+
+    @classmethod
+    def match(cls, node, required_fw_nodes) -> bool:
+        post_fusible_ops = [aten.mm, aten.bmm]
+        if get_aten_target(node) in [aten.t, aten.transpose]:
+            can_fuse = all(get_aten_target(user) in post_fusible_ops for user in node.users)
+            all_fw_node = all(user in required_fw_nodes for user in node.users)
+            return (not all_fw_node) and can_fuse
+        return False
+
+    @classmethod
+    def get_pattern_recompute_nodes(cls, node, required_fw_nodes):
+        if cls.match(node, required_fw_nodes):
+            return [node]
+        return []
+
+
+@register_byteir_pattern("transpose_reshape_transpose_dot")
+class TransposeReshapeTransposeDotPattern(ByteIRFusiblePattern):
+
+    @classmethod
+    def match(cls, node, required_fw_nodes) -> bool:
+        post_fusible_ops = [aten.mm, aten.bmm, aten.transpose]
+        if get_aten_target(node) not in [aten.transpose]:
+            return False
+        if match_chain(node, target_chain=(aten.transpose, aten.expand, aten.clone, aten._unsafe_view)):
+            expand_node = get_node_consumer(node, 0)
+            clone_node = get_node_consumer(expand_node, 0)
+            view_node = get_node_consumer(clone_node, 0)
+            all_fw_node = all(user in required_fw_nodes for user in view_node.users)
+            can_fuse = all(get_aten_target(user) in post_fusible_ops for user in view_node.users)
+            return (not all_fw_node) and can_fuse
+        return False
+
+
+    @classmethod
+    def get_pattern_recompute_nodes(cls, node, required_fw_nodes):
+        if cls.match(node, required_fw_nodes):
+            expand_node = get_node_consumer(node, 0)
+            clone_node = get_node_consumer(expand_node, 0)
+            view_node = get_node_consumer(clone_node, 0)
+            recompute_nodes = [node, expand_node, clone_node, view_node]
+            for user in view_node.users:
+                if user not in required_fw_nodes:
+                    recompute_nodes.append(user)
+            return recompute_nodes
+        return []
+
+@register_byteir_pattern("transpose_transpose")
+class TransposeTransposePattern(ByteIRFusiblePattern):
+    # Matches a t/transpose node with a t/transpose user whose own users are not all in required_fw_nodes.
+    @classmethod
+    def match(cls, node, required_fw_nodes) -> bool:
+        if get_aten_target(node) in [aten.t, aten.transpose]:
+            for user in node.users:
+                if get_aten_target(user) in [aten.t, aten.transpose]:
+                    all_fw_node = all(n in required_fw_nodes for n in user.users)
+                    if not all_fw_node:
+                        return True
+        return False
+
+    # NOTE(review): match() accepts aten.transpose users too, but only aten.t users are collected below - confirm.
+    @classmethod
+    def get_pattern_recompute_nodes(cls, node, required_fw_nodes):
+        if cls.match(node, required_fw_nodes):
+            recompute_nodes = [node]
+            for user in node.users:
+                if get_aten_target(user) == aten.t:
+                    recompute_nodes.append(user)
+            return recompute_nodes
+        return []
+
+
+@register_byteir_pattern("full_bitwise_not_expand")
+class FullBitwiseNotExpandPattern(ByteIRFusiblePattern):
+    """Pattern for the chain `full -> bitwise_not -> expand`."""
+    @classmethod
+    def match(cls, node, required_fw_nodes) -> bool:
+        # `required_fw_nodes` is not consulted here; the decision depends on the chain shape only.
+        chain = (aten.full, aten.bitwise_not, aten.expand)
+        return match_chain(node, target_chain=chain)
+
+
+    @classmethod
+    def get_pattern_recompute_nodes(cls, node, required_fw_nodes):
+        # Return the three nodes of the matched chain, or an empty list when `node` does not match.
+        if not cls.match(node, required_fw_nodes):
+            return []
+        not_node = get_node_consumer(node, 0)
+        broadcast_node = get_node_consumer(not_node, 0)
+        return [node, not_node, broadcast_node]
+
+
+# Note: This pattern is temporary.
+# It only works around `full` ops with bool dtype not being supported in byteir.
+@register_byteir_pattern("copy_bitwise_not_expand")
+class CopyBitwiseNotExpandPattern(ByteIRFusiblePattern):
+
+    @classmethod
+    def match(cls, node, required_fw_nodes) -> bool:
+        """Return True when `node` heads the single-user chain
+        `_to_copy -> bitwise_not -> expand -> bitwise_or`.
+
+        `required_fw_nodes` is not used; it is kept for the shared pattern interface.
+        """
+        return match_chain(node, target_chain=(aten._to_copy, aten.bitwise_not, aten.expand, aten.bitwise_or))
+
+
+    @classmethod
+    def get_pattern_recompute_nodes(cls, node, required_fw_nodes):
+        """Return the four nodes of the matched chain, or an empty list when `node` does not match."""
+        if not cls.match(node, required_fw_nodes):
+            return []
+        bitwise_not = get_node_consumer(node, 0)
+        expand = get_node_consumer(bitwise_not, 0)
+        bitwise_or = get_node_consumer(expand, 0)
+        return [node, bitwise_not, expand, bitwise_or]
+
+
+def greedy_transpose_fusion(joint_graph, required_fw_nodes):
+    """Greedily collect transpose-like nodes that feed an mm/bmm outside `required_fw_nodes`.
+
+    Nodes are visited in reverse graph order. An mm/bmm that is not in `required_fw_nodes`
+    is tagged; clone/_to_copy/expand nodes with a tagged user inherit the tag and are
+    collected; t/transpose nodes with a tagged user are collected but stay untagged.
+    A node is appended once per tagged user, so the returned list may hold duplicates.
+    """
+    recompute_nodes = []
+    post_fuse_ops = [aten.bmm, aten.mm]
+    transparent_ops = [aten.clone, aten._to_copy, aten.expand]
+    transpose_ops = [aten.t, aten.transpose]
+
+    INIT_TAG, POST_FUSION_TAG = 0, 1
+    fusible_tag = {node: INIT_TAG for node in joint_graph.nodes}
+
+    for node in reversed(joint_graph.nodes):
+        if get_aten_target(node) in post_fuse_ops and node not in required_fw_nodes:
+            fusible_tag[node] = POST_FUSION_TAG
+
+        if get_aten_target(node) in transparent_ops:
+            for user in node.users:
+                if fusible_tag.get(user, INIT_TAG) >= POST_FUSION_TAG:
+                    fusible_tag[node] = POST_FUSION_TAG
+                    recompute_nodes.append(node)
+
+        if get_aten_target(node) in transpose_ops:
+            for user in node.users:
+                if fusible_tag.get(user, INIT_TAG) >= POST_FUSION_TAG:
+                    recompute_nodes.append(node)
+                    fusible_tag[node] = INIT_TAG  # transposes stay untagged: the tag is not propagated past them
+
+    return recompute_nodes
+
+
+def get_byteir_recompute_nodes(joint_graph, required_fw_nodes):
+    """Return the de-duplicated nodes picked by greedy transpose fusion and by every registered pattern."""
+    recompute_nodes = greedy_transpose_fusion(joint_graph, required_fw_nodes)
+    for pattern in byteir_fusible_patterns.values():
+        for node in joint_graph.nodes:
+            if node.op == 'output':
+                continue
+            recompute_nodes.extend(pattern.get_pattern_recompute_nodes(node, required_fw_nodes))
+    # set() drops duplicates; the order of the returned list follows set iteration.
+    return list(set(recompute_nodes))
diff --git a/frontends/torch-frontend/examples/demo/compile_utils.py b/frontends/torch-frontend/examples/demo/compile_utils.py
new file mode 100644
index 000000000..e08df059e
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/compile_utils.py
@@ -0,0 +1,92 @@
+
+import torch
+import torch.fx as fx
+from torch.utils._pytree import tree_flatten
+
+aten = torch.ops.aten
+
+
+def get_aten_target(node):
+    if hasattr(node.target, 'overloadpacket'):
+        return node.target.overloadpacket
+    return node.target
+
+
+rand_ops = [aten.dropout, aten._fused_dropout, aten._standard_gamma,
+            aten.bernoulli, aten.multinomial, aten.native_dropout,
+            aten.normal, aten.poisson, aten.binomial, aten.rrelu,
+            aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm]
+
+
+# Return a new torch.fx.graph.Graph built from the input graph with common subexpressions eliminated (CSE).
+def fx_graph_cse(fx_g: torch.fx.graph.Graph):
+    new_graph = fx.Graph()
+    env = {}  # map from node in the old graph to node in the new graph
+    hash_env = {}  # map from hash to a node in the new graph
+    token_map = {}  # map from hash to token
+    for n in fx_g.nodes:
+        # The placeholder, output, and get_attr nodes are copied to the new graph without change.
+        # Random operations are copied as well, so they are never CSE'd away.
+        if n.op == 'placeholder' or n.op == 'output' or n.op == 'get_attr' or get_aten_target(n) in rand_ops:
+            new_node = new_graph.node_copy(n, lambda x: env[x])
+            env[n] = new_node
+        else:  # n.op == 'call_function', should never see n.op == 'call_module' or 'call_method'
+            # substitute args and kwargs members to their mapping in env if exists
+            # specs can be used to reconstruct nested list/dictionaries
+            def substitute(arg_list):
+                arg_list, spec = tree_flatten(arg_list)
+                for i in range(len(arg_list)):
+                    v = arg_list[i]
+                    if isinstance(v, torch.fx.node.Node) and v in env:
+                        arg_list[i] = env[v]
+                    if isinstance(v, (torch.SymBool, torch.SymInt, torch.SymFloat)):
+                        arg_list[i] = v.node
+                return tuple(arg_list), spec
+            args, args_spec = substitute(n.args)
+            kwargs, kwargs_spec = substitute(n.kwargs)
+
+            # each token corresponds to a unique node
+            # nodes with the same token can be substituted
+            token = {"target": n.target, "args": args, "args_spec": args_spec,
+                     "kwargs": kwargs, "kwargs_spec": kwargs_spec}
+
+            # hash substituted args to a number, do not hash specs because specs are not hashable
+            hash_arg = hash((args, kwargs))
+            hash_val = (n.target, hash_arg)
+
+            # check if a node has a substitute and can be eliminated
+            hash_val_in_hash_env = hash_val in hash_env
+            if hash_val_in_hash_env and token_map[hash_val] == token:
+                env[n] = hash_env[hash_val]
+                continue
+
+            new_node = new_graph.node_copy(n, lambda x: env[x])
+            env[n] = new_node
+            if not hash_val_in_hash_env:  # on a hash collision the first node keeps the slot
+                hash_env[hash_val] = new_node
+                token_map[hash_val] = token
+
+    return new_graph
+
+
+def strip_overloads(gm):
+    """
+    Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+
+    Args:
+        gm(fx.GraphModule): The input Fx graph module to be modified
+    """
+    for node in gm.graph.nodes:
+        if isinstance(node.target, torch._ops.OpOverload):
+            node.target = node.target.overloadpacket
+    gm.recompile()
+
+
+def get_placeholders(graph):
+    return list(filter(lambda x: x.op == 'placeholder', graph.nodes))
+
+def get_outputs(graph):
+    for node in graph.nodes:
+        if node.op == 'output':
+            return tree_flatten(node.args[0])[0]
+    raise AssertionError("No output node found")
diff --git a/frontends/torch-frontend/examples/demo/config.py b/frontends/torch-frontend/examples/demo/config.py
new file mode 100644
index 000000000..811bce8cc
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/config.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Global flags for aot autograd
+"""
+import os
+import sys
+
+# Converts torch rng ops to their functional philox rng equivalents. Note that
+# we functionalize only CUDA rng ops today.
+functionalize_rng_ops = False
+
+# can be useful for debugging if we are incorrectly creating meta fake tensors
+fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", True)
+
+# Enables optional asserts in hotpath code to check for errors.  If
+# you are seeing weird accuracy problems, try turning this on.
+# This is currently off by default as it will harm tracing time,
+# but it is on by default for aot_eager.
+debug_assert = False
+
+debug_partitioner = os.environ.get("AOT_PARTITIONER_DEBUG", False)
+
+static_weight_shapes = True
+
+# Applies CSE to the graph before partitioning
+cse = True
+
+# Restricts the amount of computation AOTAutograd can do.
+max_dist_from_bw = 3
+
diff --git a/frontends/torch-frontend/examples/demo/fx_match_utils.py b/frontends/torch-frontend/examples/demo/fx_match_utils.py
new file mode 100644
index 000000000..d7a209c76
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/fx_match_utils.py
@@ -0,0 +1,40 @@
+import torch
+from compile_utils import get_aten_target
+
+aten = torch.ops.aten
+
+def is_used_by_specific_consumer(node, consumer_type=None):
+    """Return True if `node` has exactly one user and that user's aten target is in `consumer_type`.
+
+    `consumer_type` may be a single target or a list/tuple of targets; `None` accepts any node.
+    """
+    if consumer_type is None:
+        return True
+
+    all_users = list(node.users)
+    if len(all_users) != 1:
+        return False
+    allowed = consumer_type if isinstance(consumer_type, (list, tuple)) else [consumer_type]
+    return get_aten_target(all_users[0]) in allowed
+
+
+def get_node_consumer(node, index):
+    all_users = list(node.users)
+    return all_users[index]
+
+
+def match_chain(node, target_chain):
+    """Return True if the chain of nodes starting at `node` matches `target_chain`.
+
+    Each entry of `target_chain` is an aten target or a list/tuple of acceptable targets
+    (the last entry included). Every node but the last must have exactly one user.
+    """
+    is_last = len(target_chain) == 1
+    if not is_last and len(list(node.users)) != 1:
+        return False
+    specific_types = target_chain[0]
+    if not isinstance(specific_types, (list, tuple)):
+        specific_types = [specific_types]
+    if get_aten_target(node) not in specific_types:
+        return False
+    return is_last or match_chain(get_node_consumer(node, 0), target_chain[1:])
diff --git a/frontends/torch-frontend/examples/demo/main.py b/frontends/torch-frontend/examples/demo/main.py
new file mode 100644
index 000000000..c913ceb5c
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/main.py
@@ -0,0 +1,220 @@
+from torch import nn
+import torch
+import transformers
+import sys
+import os
+import functools
+import torch._dynamo
+import torch.nn.functional as F
+
+import transformers
+import argparse
+
+MODEL_LIST = ["gpt2", "bloom-560m", "llama", "opt-1.3b", "nanogpt"]
+
+class InferLLAMAModule(nn.Module):  # 4-layer LLaMA causal LM built from a config, for inference
+    def __init__(self):
+        super().__init__()
+        self.config = transformers.LlamaConfig(return_dict=False, num_hidden_layers=4)
+        self.model = transformers.LlamaForCausalLM(config=self.config)
+    def forward(self, x):
+        return self.model(x)[0]  # first element of the model output
+
+class InferOPTModule(nn.Module):  # OPT causal LM built from the facebook/opt-1.3b config
+    def __init__(self):
+        super().__init__()
+        self.config = transformers.AutoConfig.from_pretrained("facebook/opt-1.3b", return_dict=False)
+        self.config.tie_word_embeddings = False
+        self.model = transformers.OPTForCausalLM(config=self.config)
+    def forward(self, x):
+        return self.model(x)[0]  # first element of the model output
+
+class InferBLOOMModule(nn.Module):  # BLOOM causal LM built from the bigscience/bloom-560m config
+    def __init__(self):
+        super().__init__()
+        self.config = transformers.BloomConfig.from_pretrained('bigscience/bloom-560m', return_dict=False)
+        self.config.tie_word_embeddings = False
+        self.model = transformers.BloomForCausalLM(config=self.config)
+    def forward(self, x):
+        return self.model(x)[0]  # first element of the model output
+
+class InferGPT2Module(nn.Module):  # GPT-2 token classifier with one label per vocabulary entry
+    def __init__(self):
+        super().__init__()
+        self.config = transformers.GPT2Config.from_pretrained('gpt2', return_dict=False)
+        self.config.num_labels = self.config.vocab_size
+        self.model = transformers.GPT2ForTokenClassification(config=self.config)
+    def forward(self, x):
+        return self.model(x)[0]  # first element of the model output
+
+def make_model(model_name):
+    """Build the model for ``model_name`` from its config.
+
+    Raises AssertionError (also under ``python -O``) for an unsupported name.
+    """
+    if model_name == 'llama':
+        config = transformers.LlamaConfig(num_hidden_layers=4)
+        model = transformers.LlamaForCausalLM(config=config)
+    elif model_name == 'opt-1.3b':
+        config = transformers.AutoConfig.from_pretrained("facebook/opt-1.3b")
+        config.tie_word_embeddings = False
+        model = transformers.OPTForCausalLM(config=config)
+    elif model_name == 'bloom-560m':
+        config = transformers.BloomConfig.from_pretrained('bigscience/bloom-560m')
+        config.tie_word_embeddings = False
+        model = transformers.BloomForCausalLM(config=config)
+    elif model_name == 'gpt2':
+        config = transformers.GPT2Config.from_pretrained('gpt2')
+        config.num_labels = config.vocab_size
+        model = transformers.GPT2ForTokenClassification(config=config)
+    elif model_name == 'nanogpt':
+        from my_transformers.modeling_nanogpt import GPTConfig, GPT
+        # vocab_size/block_size/bias are fixed for GPT model checkpoints; dropout is disabled.
+        config = GPTConfig(n_layer=12, n_head=12, n_embd=768, vocab_size=50257,
+                           block_size=1024, bias=True, dropout=0.)
+        model = GPT(config)
+    else:
+        # An explicit raise survives `python -O`, which strips `assert False`.
+        raise AssertionError(f"unsupported model name: {model_name!r}")
+    return model
+
+def make_inference_model(model_name):
+    """Return the inference wrapper for ``model_name``, else fall back to ``make_model``."""
+    wrappers = {
+        'llama': InferLLAMAModule,
+        'opt-1.3b': InferOPTModule,
+        'bloom-560m': InferBLOOMModule,
+        'gpt2': InferGPT2Module,
+    }
+    if model_name in wrappers:
+        return wrappers[model_name]()
+    return make_model(model_name)
+
+def make_data(model, model_name, device):
+    """Create random ``(input, label)`` token-id tensors of shape ``(batch_size, 1024)``.
+
+    Ids are drawn with ``torch.randint`` from 0 up to ``model.config.vocab_size``;
+    the batch size is 16 for llama, 4 for opt-1.3b and 8 for every other model.
+    """
+    batch_size = {'llama': 16, 'opt-1.3b': 4}.get(model_name, 8)
+    shape = (batch_size, 1024)
+    vocab_size = model.config.vocab_size
+
+    # Inputs are drawn before labels, so a fixed seed reproduces both tensors.
+    token_ids = torch.randint(low=0, high=vocab_size, size=shape, device=device)
+    labels = torch.randint(low=0, high=vocab_size, size=shape, device=device)
+    return token_ids, labels
+
+def compute_loss(model, data, model_name):
+    """Return the loss of ``model`` on ``data``, an ``(input, label)`` pair.
+
+    nanogpt returns its own loss; other models' logits go through cross entropy.
+    """
+    tokens, targets = data
+    if model_name == 'nanogpt':
+        _, loss = model(tokens, targets)
+        return loss
+    return F.cross_entropy(model(tokens).logits.view(-1, model.config.vocab_size), targets.view(-1))
+
+
+def infer_model(args):
+    """Compile ``args.model_name`` for fp16 inference with ByteIR and run it once.
+
+    The model is traced with make_fx when ``args.flash`` is set and with
+    ``torch.jit.trace`` otherwise; inputs and outputs of both runs are printed.
+    """
+    device = torch.device('cuda:' + str(args.device_id))
+    model = make_inference_model(args.model_name)
+    model.eval()
+    model.to(device)
+
+    input, label = make_data(model, args.model_name, device)
+    trace_data = [input]
+    # Only nanogpt is traced with the label tensor as a second input.
+    if args.model_name == "nanogpt":
+        trace_data.append(label)
+
+    TEMP_FOLDER="./temp"
+    os.makedirs(TEMP_FOLDER, exist_ok=True)
+    os.makedirs(TEMP_FOLDER + f"/{args.model_name}_inference_f16", exist_ok=True)
+    jit_file_name = TEMP_FOLDER + f"/{args.model_name}_inference.f16.jit"  # not used below
+    mhlo_file_name = TEMP_FOLDER + f"/{args.model_name}_inference.f16.mhlo.mlir"
+    byre_file_name = TEMP_FOLDER + f"/{args.model_name}_inference_f16/{args.model_name}.rt.mlir"
+
+    with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16):
+        # The MHLO export and the byre compile are each skipped when their output file exists.
+        if not os.path.exists(mhlo_file_name):
+            if args.flash:
+                from torch.fx.experimental.proxy_tensor import make_fx
+                from torch_frontend import preprocess_fx_graph
+                module = make_fx(model)(*trace_data)
+                print("torch inputs:")
+                print(trace_data)
+                print("torch outputs:")
+                print(module(*trace_data))
+                module = preprocess_fx_graph(module)
+            else:
+                module = torch.jit.trace(model, trace_data, check_trace=False)
+                print("torch inputs:")
+                print(trace_data)
+                print("torch outputs:")
+                print(module(*trace_data))
+            import torch_frontend
+            mhlo_model = torch_frontend.compile(module, trace_data, "mhlo")
+            with open(mhlo_file_name, "w") as f:
+                print(mhlo_model.operation.get_asm(), file=f)
+            print("save mhlo to {}".format(mhlo_file_name))
+
+        if not os.path.exists(byre_file_name):
+            import byteir
+            print("begin byteir compile")
+            byteir.compile(mhlo_file_name, byre_file_name, entry_func='forward', target='cuda_with_ait', disable_byteir_cache=False, verbose=False)
+            print("byteir compile to {}".format(byre_file_name))
+
+        from backend import ByteIRInferenceFunction
+        runner = ByteIRInferenceFunction(byre_file_name)
+        print("byre inputs:")
+        print(trace_data)
+        print("byre outputs:")
+        print(runner(*trace_data))
+
+def train_model(args):
+    """Run one forward/backward training step of ``args.model_name`` through ByteIR.
+
+    The model is wrapped by ``torch.compile`` with the demo's fuse-aware backend,
+    fed random data under fp16 autocast, and the resulting loss is printed.
+    """
+    torch._dynamo.reset()
+    torch._dynamo.disallow_in_graph(F.cross_entropy)
+
+    name = args.model_name
+    target_device = torch.device('cuda:' + str(args.device_id))
+    model = make_model(name)
+    model.to(target_device)
+
+    import backend
+    backend.MODEL_NAME, backend.FLASH = name, args.flash
+    compiled = torch.compile(model, backend=backend.fuse_aware_byteir_compile_fx)
+    batch = make_data(compiled, name, target_device)
+    model.zero_grad(set_to_none=True)
+    with torch.cuda.amp.autocast(enabled=True, dtype=torch.float16):
+        loss = compute_loss(compiled, batch, name)
+        print("loss:", loss)
+        loss.backward()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # `choices` lets argparse reject an unknown model with a usage message; an
+    # `assert` on the parsed value would be stripped under `python -O`.
+    parser.add_argument("model_name", choices=MODEL_LIST)
+    parser.add_argument("--flash", action="store_true", help="use flash attention when possible")
+    parser.add_argument("--infer", action="store_true", help="infer mode")
+    parser.add_argument("--device_id", type=int, default=0)
+    args = parser.parse_args()
+
+    if args.infer:
+        infer_model(args)
+    else:
+        train_model(args)
+
diff --git a/frontends/torch-frontend/examples/demo/partitioners.py b/frontends/torch-frontend/examples/demo/partitioners.py
new file mode 100644
index 000000000..43bea47b9
--- /dev/null
+++ b/frontends/torch-frontend/examples/demo/partitioners.py
@@ -0,0 +1,940 @@
+from torch.fx.experimental.proxy_tensor import is_sym_node, py_sym_types
+from torch.fx.experimental.symbolic_shapes import (
+    hint_int, magic_methods, method_to_operator, free_symbols,
+    is_symbol_binding_fx_node, find_symbol_binding_fx_nodes
+)
+import torch
+import torch.fx as fx
+import operator
+import math
+import torch.utils._pytree as pytree
+import copy
+import os
+import itertools
+import sympy
+from collections import defaultdict
+from torch.fx.passes import graph_drawer
+from typing import Tuple
+from compile_utils import fx_graph_cse, get_aten_target
+import config
+import functools
+
+from byteir_fusible_pattern import get_byteir_recompute_nodes
+
+AOT_PARTITIONER_DEBUG = config.debug_partitioner
+
+
+def must_recompute(node):  # the node's "recompute" meta tag, or False when it has none
+    return node.meta["recompute"] if "recompute" in node.meta else False
+
+def has_recomputable_ops(fx_g):
+    """Return True if any node of ``fx_g.graph`` is tagged for recomputation.
+
+    A node counts when ``must_recompute`` is truthy for it.
+    """
+    return any(must_recompute(node) for node in fx_g.graph.nodes)
+
+def has_recomputable_rng_ops(fx_g):
+    """Return True if a recompute-tagged node's target has the ``nondeterministic_seeded`` tag."""
+    def seeded(target):
+        return hasattr(target, "tags") and torch.Tag.nondeterministic_seeded in target.tags
+    return any(must_recompute(n) and seeded(n.target) for n in fx_g.graph.nodes)
+
+def sym_node_size(node):
+    """Return 1 for a SymInt/SymBool ``val`` and 4 otherwise (asserted to be a SymFloat)."""
+    is_int_like = isinstance(node.meta["val"], (torch.SymInt, torch.SymBool))
+    assert is_int_like or isinstance(node.meta["val"], torch.SymFloat)
+    return 1 if is_int_like else 4
+
+class InvalidNodeBase:  # type of the module-level InvalidNode sentinel
+    def __repr__(self) -> str:
+        return "Invalid Node"
+
+
+InvalidNode = InvalidNodeBase()
+
+
+def _extract_graph_with_inputs_outputs(joint_graph, inputs, outputs):
+    """Extract from ``joint_graph`` the subgraph computing ``outputs`` from ``inputs``.
+
+    ``inputs`` may include non-placeholder nodes; each one becomes a placeholder
+    of the new graph. Nodes are copied in graph order, a call_function with an
+    invalid argument (one derived from a placeholder outside ``inputs``) is
+    marked invalid instead of copied, and dead code is eliminated at the end.
+
+    Raises RuntimeError if an output node was never visited; asserts that no
+    requested output is invalid.
+    """
+    new_graph = fx.Graph()
+    env = {}
+
+    # Add new placeholder nodes in the order specified by the inputs
+    for node in inputs:
+        new_node = new_graph.placeholder(node.name)
+        # Can't use node_copy here as we may be turning previous call_function into placeholders
+        new_node.meta = node.meta
+        env[node] = new_node
+
+    for node in joint_graph.nodes:
+        if node in env:
+            # Only requested inputs are already bound here; O(1), unlike `node in inputs`.
+            continue
+        if node.op == 'placeholder':
+            env[node] = InvalidNode
+        elif node.op == 'call_function':
+            flat_args = pytree.tree_flatten((node.args, node.kwargs))[0]
+            invalid = [isinstance(env[a], InvalidNodeBase) for a in flat_args if isinstance(a, fx.Node)]
+            if any(invalid):
+                env[node] = InvalidNode
+            else:
+                env[node] = new_graph.node_copy(node, lambda x: env[x])
+        elif node.op == 'get_attr':
+            env[node] = new_graph.node_copy(node, lambda x: env[x])
+        # Any other op (e.g. 'output') is not copied; the output is rebuilt below.
+    output_values = []
+    for x in outputs:
+        if not isinstance(x, fx.Node):
+            output_values.append(x)
+            continue
+        if x not in env:
+            raise RuntimeError(f"Node {x} couldn't be found in env")
+        assert not isinstance(env[x], InvalidNodeBase), f"Node {x} was invalid, but is output"
+        output_values.append(env[x])
+    new_graph.output(output_values)
+
+    new_graph.eliminate_dead_code()
+    new_graph.lint()
+    return new_graph
+
+
+def _is_primal(node):
+    """Return True for a placeholder that is neither a tangent nor a seed/offset input."""
+    if node.op != "placeholder":
+        return False
+    if "tangents" in node.target:
+        return False
+    return not (_is_bwd_seed_offset(node) or _is_fwd_seed_offset(node))
+
+def _is_tangent(node):  # a placeholder whose target contains "tangents"
+    return "tangents" in node.target if node.op == "placeholder" else False
+
+def _is_bwd_seed_offset(node):  # a placeholder whose target names a bwd seed or base offset
+    return node.op == "placeholder" and any(tag in node.target for tag in ("bwd_seed", "bwd_base_offset"))
+
+def _is_fwd_seed_offset(node):  # a placeholder whose target names a fwd seed or base offset
+    return node.op == "placeholder" and any(tag in node.target for tag in ("fwd_seed", "fwd_base_offset"))
+
+
+def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule, *, num_fwd_outputs):
+    """Flatten the joint graph's outputs and split them at ``num_fwd_outputs``."""
+    output_args = [n.args for n in joint_module.graph.nodes if n.op == 'output']
+    flat_outputs, _ = pytree.tree_flatten(output_args)
+    return flat_outputs[:num_fwd_outputs], flat_outputs[num_fwd_outputs:]
+
+
+def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values, saved_sym_nodes, *, num_fwd_outputs):
+    """Split ``joint_module`` into forward and backward GraphModules.
+
+    ``saved_values`` and ``saved_sym_nodes`` are updated in place: entries the
+    backward graph never uses are dropped and the sym nodes are reordered.
+    """
+    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
+    primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
+    tangent_inputs = list(filter(_is_tangent, joint_module.graph.nodes))
+    fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes))
+    bwd_seed_offset_inputs = list(filter(_is_bwd_seed_offset, joint_module.graph.nodes))
+
+    # Construct the forward module
+    # Keep symints separate from tensors, passed between fwd/bwd graphs, and in the right order.
+    fwd_graph = _extract_graph_with_inputs_outputs(
+        joint_module.graph,
+        primal_inputs + fwd_seed_offset_inputs,
+        fwd_outputs + saved_values + saved_sym_nodes
+    )
+    bwd_graph = _extract_graph_with_inputs_outputs(
+        joint_module.graph,
+        saved_sym_nodes + saved_values + tangent_inputs + bwd_seed_offset_inputs,
+        bwd_outputs
+    )
+
+    # Drop saved values whose placeholder has no users in the backward graph (matched by name).
+    for node in bwd_graph.nodes:
+        if node.op == 'placeholder' and not node.users:
+            for saved_value in saved_values:
+                if saved_value.name == node.name:
+                    saved_values.remove(saved_value)
+                    break
+
+            for saved_sym in saved_sym_nodes:
+                if saved_sym.name == node.name:
+                    saved_sym_nodes.remove(saved_sym)
+                    break
+
+    # With the saved values finalized, propagate every symbol referenced by the
+    # backward inputs. They are not used directly in the graph but are required
+    # for downstream sizevar assignment.
+    saved_symbols: Set[sympy.Symbol] = set()  # NOTE(review): `Set` is not among the visible imports
+    saved_sym_nodes_binding = []
+    saved_sym_nodes_derived = []
+
+    # Some symbols may already be bound in the directly saved_sym_nodes,
+    # keep track of them so we don't re-bind them
+    for node in saved_sym_nodes:
+        symbol = is_symbol_binding_fx_node(node)
+        if symbol:
+            saved_symbols.add(symbol)
+            saved_sym_nodes_binding.append(node)
+        else:
+            saved_sym_nodes_derived.append(node)
+
+    # Now go through all of the prospective backward inputs and track any
+    # other symbols we need to bind
+    symbol_bindings = find_symbol_binding_fx_nodes(joint_module.graph)
+    for node in itertools.chain(saved_sym_nodes_derived, saved_values, tangent_inputs):
+        if "val" not in node.meta:
+            continue
+        new_symbols = free_symbols(node.meta["val"]) - saved_symbols
+        # NB: Deterministic order please!
+        for s in sorted(new_symbols, key=lambda s: s.name):
+            # NB: Well formed graphs always bind the symbol, but ill-formed ones
+            # (e.g. from direct make_fx usage) may not, so don't choke in this case
+            if s not in symbol_bindings:
+                continue
+            saved_sym_nodes_binding.append(symbol_bindings[s])
+        saved_symbols |= new_symbols
+
+    # Update saved_sym_nodes in place, now ordered with all bindings at the front; callers
+    # can use it later to locate the saved sym nodes in the output of the fwd graph.
+    saved_sym_nodes.clear()
+    saved_sym_nodes.extend(saved_sym_nodes_binding + saved_sym_nodes_derived)
+
+    # Re-generate the fwd/bwd graphs from the final lists (costs compile time, likely negligible).
+    fwd_graph = _extract_graph_with_inputs_outputs(
+        joint_module.graph,
+        primal_inputs + fwd_seed_offset_inputs,
+        fwd_outputs + saved_values + saved_sym_nodes
+    )
+    bwd_graph = _extract_graph_with_inputs_outputs(
+        joint_module.graph,
+        saved_sym_nodes + saved_values + tangent_inputs + bwd_seed_offset_inputs,
+        bwd_outputs
+    )
+
+    fwd_module = fx.GraphModule(joint_module, fwd_graph)
+    bwd_module = fx.GraphModule(joint_module, bwd_graph)
+    return fwd_module, bwd_module
+
+
+def default_partition(
+    joint_module: fx.GraphModule, _joint_inputs, *, num_fwd_outputs
+) -> Tuple[fx.GraphModule, fx.GraphModule]:
+    """
+    Split :attr:`joint_module` into a forward and a backward graph module the
+    way the original ``.forward()`` / ``.backward()`` of the callable would run:
+    the forward graph holds the operators between the forward inputs and the
+    forward outputs, everything else goes to the backward graph, and the values
+    the backward needs from the forward become extra forward outputs.
+
+    If any node is tagged for recomputation the work is delegated to
+    ``min_cut_rematerialization_partition`` instead.
+
+    .. warning::
+        This API is experimental and likely to change.
+
+    Args:
+        joint_module(fx.GraphModule): The joint forward and backward graph. This
+            is the result of AOT Autograd tracing.
+        _joint_inputs: Only forwarded to the recompute-aware partitioner.
+        num_fwd_outputs: How many of the flattened joint outputs belong to the
+            forward pass.
+
+    Returns:
+        The generated forward and backward Fx graph modules.
+    """
+    if has_recomputable_ops(joint_module):
+        # NOTE(review): this name has no definition in the visible part of the file; confirm it exists.
+        return min_cut_rematerialization_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs)
+
+    graph_nodes = joint_module.graph.nodes
+    forward_inputs = list(filter(_is_primal, graph_nodes)) + list(filter(_is_fwd_seed_offset, graph_nodes))
+    fwd_outputs, _ = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
+    forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, forward_inputs, fwd_outputs)
+    forward_node_names = {n.name for n in forward_only_graph.nodes if n.op != 'output'}
+
+    saved_values = []
+    saved_sym_nodes = []
+    for node in graph_nodes:
+        if node.name not in forward_node_names:
+            continue
+        if is_sym_node(node):
+            # Symints must be kept separate from tensors so that PythonFunction only calls
+            # save_for_backward on tensors and stashes symints in autograd .ctx
+            saved_sym_nodes.append(node)
+            continue
+
+        has_tensor_meta = 'tensor_meta' in node.meta
+        if not has_tensor_meta and node.op == 'call_function':
+            # Since we can't save tuple of tensor values, we need to flatten out what we're saving
+            assert all(user.target == operator.getitem for user in node.users)
+            saved_values.extend(node.users)
+            continue
+
+        backward_usages = [user for user in node.users if user.name not in forward_node_names]
+        if has_tensor_meta and all(is_sym_node(user) for user in backward_usages):
+            # If we have a tensor in the forward, where only its sizes/strides are needed in the backward,
+            # and not the actual tensor data,
+            # then it will be a lot cheaper to save only the sizes/strides, and not the actual tensor.
+            #
+            # Note that saving the tensor could also cause compilation problems:
+            # If the user mutated an input in the forward and uses its sizes/strides in the backward,
+            # then we would be obligated to clone the input before saving it to appease autograd.
+            # (This is how we originally found this bug).
+            saved_sym_nodes.extend(backward_usages)
+        else:
+            saved_values.append(node)
+
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    saved_values = list(dict.fromkeys(saved_values))
+    saved_sym_nodes = list(dict.fromkeys(saved_sym_nodes))
+    return _extract_fwd_bwd_modules(joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs)
+
+
+def _prod(x):
+    """Return the product of the items of ``x``; the running value starts at 1,
+    which is also the result for an empty ``x``."""
+    # operator.imul applies `acc *= item`, i.e. in-place multiplication where supported.
+    return functools.reduce(operator.imul, x, 1)
+
+def _tensor_nbytes(numel, dtype):
+    """Return the number of bytes taken by ``numel`` elements of ``dtype``.
+
+    Raises:
+        NotImplementedError: if ``dtype`` has no entry in the size table; the
+            message names the dtype.
+    """
+    sizes = {
+        torch.int8: 1, torch.uint8: 1, torch.bool: 1,
+        torch.float16: 2, torch.bfloat16: 2, torch.int16: 2,
+        torch.float32: 4, torch.int32: 4,
+        torch.float64: 8, torch.int64: 8, torch.complex64: 8,
+        torch.complex128: 16,
+    }
+    if dtype not in sizes:
+        # One formatted message: a separate dtype argument makes the error render as a tuple.
+        raise NotImplementedError(f"Don't know the size of dtype {dtype}")
+
+    return numel * sizes[dtype]
+
+def _size_of(node: fx.Node) -> int:
+    """Return a size estimate for ``node``'s value, read from its metadata.
+
+    Tensors count in bytes (using hinted element counts), a SymInt counts 1,
+    any other symbolic value 999999, and a node without metadata 0.
+    """
+    if 'val' in node.meta:
+        val = node.meta['val']
+        if isinstance(val, py_sym_types):
+            return 1 if isinstance(val, torch.SymInt) else 999999
+        if isinstance(val, (list, tuple)):
+            return sum(_tensor_nbytes(hint_int(n.numel()), n.dtype) for n in val if isinstance(n, torch.Tensor))
+        if isinstance(val, torch.Tensor):
+            return _tensor_nbytes(hint_int(val.numel()), val.dtype)
+        raise RuntimeError(f"Unknown metadata type {type(val)}")
+
+    # Only needed since we don't always trace with fake tensors.
+    if 'tensor_meta' not in node.meta:
+        return 0
+    metadata = node.meta['tensor_meta']
+    # hint_int (already imported) turns each dim into a plain int; `to_size_hint`,
+    # the name otherwise called here, has no visible definition or import.
+    numel = _prod(map(hint_int, metadata.shape))
+    return _tensor_nbytes(numel, metadata.dtype)
+
+
+# Used for some investigative purposes
+def _count_ops(graph):
+    """Print ``(name, count)`` pairs for call_function targets, most frequent first."""
+    tally = {}
+    for node in graph.nodes:
+        if node.op == 'call_function':
+            tally[node.target.__name__] = tally.get(node.target.__name__, 0) + 1
+    print(sorted(tally.items(), key=lambda item: item[1], reverse=True))
+
+
+@functools.lru_cache(None)
+def pointwise_ops():
+    """Return the aten overload packets that have a pointwise-tagged overload.
+
+    The list is computed once and then served from the cache.
+    """
+    packets = []
+    for attr_name in dir(torch.ops.aten):
+        packet = getattr(torch.ops.aten, attr_name)
+        if not isinstance(packet, torch._ops.OpOverloadPacket):
+            continue
+        # currently aot autograd uses packet not overload
+        overloads = (getattr(packet, name) for name in packet.overloads())
+        if any(torch.Tag.pointwise in overload.tags for overload in overloads):
+            packets.append(packet)
+    return packets
+
+def get_depth(node, depth_map):
+    """Memoise in ``depth_map`` and return ``node``'s distance from the placeholders.
+
+    Placeholders have depth 0 and any other node is one deeper than its deepest
+    input. For the output node only its arguments are visited: nothing is
+    recorded for it and None is returned.
+    """
+    if node in depth_map:
+        return depth_map[node]
+
+    if node.op == "output":
+        for arg in node.args[0]:
+            if isinstance(arg, torch.fx.node.Node):
+                get_depth(arg, depth_map)
+        return None
+
+    if node.op == "placeholder":
+        depth = 0
+    else:
+        # factory ops like full, rand might not have any input args: default=0 gives depth 1
+        input_nodes = (a for a in node.all_input_nodes if isinstance(a, torch.fx.node.Node))
+        depth = 1 + max((get_depth(a, depth_map) for a in input_nodes), default=0)
+    depth_map[node] = depth
+    return depth
+
+
+def sort_depths(args, depth_map):  # distinct fx.Node args as (node, depth) pairs, deepest first
+    nodes = dict.fromkeys(arg for arg in args if isinstance(arg, torch.fx.node.Node))
+    return sorted(((n, depth_map[n]) for n in nodes), key=operator.itemgetter(1), reverse=True)
+
+
+def reordering_to_mimic_autograd_engine(gm):
+    """
+    Rebuild ``gm`` with its nodes reordered the way the autograd engine would
+    run them, and return the result as a new GraphModule.
+
+    The pass locates the first bwd node (the earliest user of a tangent
+    placeholder) and walks from there to the end of the graph. Each op in this
+    traversal is copied into a new graph right after the not-yet-copied inputs
+    it depends on, deepest input first, so only the subgraph that an op
+    actually needs is pulled in ahead of it.
+
+    Why is this pass required in the first place?
+
+    This is an artifact of how partitioners work today. A partitioner starts
+    from a joint graph, i.e. the fwd graph followed by the bwd graph. With
+    checkpointing, portions of the fwd graph stay at their original place in
+    the joint graph when the bwd graph is obtained, so the bwd graph ends up as
+    copies of recomputed fwd subgraphs followed by the original bwd graph. Run
+    naively, that has a bad memory footprint because the recomputed fwd values
+    stay live far longer than necessary. Reordering prioritises the ops of the
+    original bwd graph and realizes fwd ops only when they are needed.
+
+    Args:
+        gm: the graph module to reorder; at least one tangent placeholder must
+            have a user (asserted). Non-placeholder nodes before the first bwd
+            node are kept only if a later node depends on them.
+    """
+
+    new_graph = fx.Graph()
+    env = {}
+
+    # Placeholders are created first, in their original order.
+    for node in gm.graph.nodes:
+        if node.op != "placeholder":
+            continue
+        placeholder = new_graph.placeholder(node.name)
+        # Can't use node_copy here as we may be turning previous call_function into placeholders
+        placeholder.meta = node.meta
+        env[node] = placeholder
+
+    # Position of every node in the original graph.
+    order = {node: idx for idx, node in enumerate(gm.graph.nodes)}
+
+    # Populate depth for the nodes. Depth is the distance from the inputs.
+    depths = {}
+    output_node = [node for node in gm.graph.nodes if node.op == "output"][0]
+    get_depth(output_node, depths)
+
+    def insert_node_in_graph(node):
+        if node not in env:
+            # Bias traversal towards the nodes that have higher depth - prioritizes
+            # critical path first.
+            for arg, _ in sort_depths(node.all_input_nodes, depths):
+                env[arg] = insert_node_in_graph(arg)
+            env[node] = new_graph.node_copy(node, lambda x: env[x])
+        return env[node]
+
+    # Find first bwd node in the graph: the tangent user with the smallest position.
+    tangent_users = [
+        user
+        for tangent in filter(_is_tangent, gm.graph.nodes)
+        for user in tangent.users
+    ]
+    first_node_in_bwd = min(tangent_users, key=order.__getitem__, default=None)
+    assert first_node_in_bwd is not None
+
+    # Build the graph op-by-op by starting from the node all the way to the end
+    for node in list(gm.graph.nodes)[order[first_node_in_bwd]:]:
+        insert_node_in_graph(node)
+
+    # The output node is already built by the traversal.
+    new_gm = torch.fx.GraphModule(gm, new_graph)
+    return new_gm
+
+
+def functionalize_rng_ops(joint_module, fw_module, bw_module, num_sym_nodes):
+    """Make recomputed rng ops in the bwd graph yield the same values as in the fwd graph.
+
+    The fwd graph additionally outputs the rng state of every such op and the bwd
+    graph takes those states as inputs. Both modules are edited in place,
+    recompiled and returned as ``(fw_module, bw_module)``.
+    """
+
+    # There are 3 main steps to do this
+    # Step 1 - Construct a mapping of rng node between the fwd and its counterpart in bwd.
+    # Step 2 - Modify the fwd pass such that
+    #   1) Replace rand with run_and_save_rng_state wrapper
+    #   2) Replace the users of the original op with the output[1] of this op.
+    #   3) Collect all the rng_state - output[0] of each op, and make them
+    #   output nodes. Special care needs to be taken here because fwd outputs
+    #   has symints at the very end.
+    # Step 3 - Modify the bwd pass such that
+    #   1) Add the input nodes just before the tangents for the stashed rng states
+    #   2) Replace rand with run_with_rng_state wrappers
+    #   3) Use the stashed states as inputs to these ops
+
+    # Unique id to generate name
+    uid = itertools.count()
+
+    def get_rng_ops(gmod):
+        random_nodes = {}
+        for node in gmod.graph.nodes:
+            if (
+                node.op == "call_function"
+                and hasattr(node.target, "tags")
+                and torch.Tag.nondeterministic_seeded in node.target.tags
+            ):
+                random_nodes[node.name] = node
+        return random_nodes
+
+    # Step 1 - Construct a mapping of rng node between the fwd and its counterpart in bwd.
+    joint_graph_rng_ops = get_rng_ops(joint_module)
+    fw_graph_rng_ops = get_rng_ops(fw_module)
+    bw_graph_rng_ops = get_rng_ops(bw_module)
+    recomputable_rng_ops_map = dict()
+    for node in joint_module.graph.nodes:
+        if (
+            must_recompute(node)
+            and hasattr(node.target, "tags")
+            and torch.Tag.nondeterministic_seeded in node.target.tags
+        ):
+            base_node = joint_graph_rng_ops[node.name]
+            fw_node = fw_graph_rng_ops[node.name]
+            bw_node = bw_graph_rng_ops[node.name]
+            recomputable_rng_ops_map[base_node] = {"fwd": fw_node, "bwd": bw_node}
+
+    run_and_save_rng = torch._prims.rng_prims.run_and_save_rng_state
+    run_with_rng_state = torch._prims.rng_prims.run_with_rng_state
+
+    for node in bw_module.graph.nodes:
+        if node.op == "placeholder" and "tangent" in node.name:
+            bw_tangent_start_node = node  # NOTE(review): stays unbound if no placeholder name contains "tangent"
+            break
+
+    fw_rng_state_outputs = []
+    for base_node, node_pair in recomputable_rng_ops_map.items():
+        # Step 2 - Modify the fwd pass such that
+        fw_node = node_pair["fwd"]
+        bw_node = node_pair["bwd"]
+        fw_graph = fw_module.graph
+        with fw_graph.inserting_before(fw_node):
+            functional_fw_node = fw_graph.create_node(
+                "call_function",
+                run_and_save_rng,
+                args=(fw_node.target, *fw_node.args),
+                kwargs=fw_node.kwargs
+            )
+            state = fw_graph.create_node("call_function", operator.getitem, args=(functional_fw_node, 0), kwargs={})
+            rng_output = fw_graph.create_node("call_function", operator.getitem, args=(functional_fw_node, 1,), kwargs={})
+            fw_node.replace_all_uses_with(rng_output)
+            fw_graph.erase_node(fw_node)
+            fw_rng_state_outputs.append(state)
+
+        # Step 3 - Modify the bwd pass such that
+        bw_graph = bw_module.graph
+        with bw_graph.inserting_before(bw_tangent_start_node):
+            state_name = f"rng_state_output_{next(uid)}"
+            bw_rng_state_node = bw_graph.placeholder(state_name)
+            bw_rng_state_node.meta["val"] = torch.cuda.get_rng_state()
+
+        with bw_graph.inserting_before(bw_node):
+            rng_output = bw_graph.create_node(
+                "call_function",
+                run_with_rng_state,
+                args=(bw_rng_state_node, bw_node.target, *bw_node.args),
+                kwargs=bw_node.kwargs
+            )
+
+            bw_node.replace_all_uses_with(rng_output)
+            bw_graph.erase_node(bw_node)
+
+    # Add the rng states in the output of the fwd graph. AOT Autograd assumes
+    # that symints are at the end of forward graph outputs. So, insert the new
+    # rng states accordingly.
+    fw_output_node = [node for node in fw_module.graph.nodes if node.op == "output"][0]
+    fw_outputs = fw_output_node.args[0]
+    sym_node_start_idx = len(fw_outputs) - num_sym_nodes
+    outputs = fw_outputs[:sym_node_start_idx] + fw_rng_state_outputs + fw_outputs[sym_node_start_idx:]
+    fw_module.graph.output(outputs)
+    fw_module.graph.erase_node(fw_output_node)
+    fw_module.recompile()
+    bw_module.recompile()
+    return fw_module, bw_module
+
+
+def cleanup_recompute_tags(joint_module):
+    """
+    Clear the recompute tag of nodes that sit on a checkpoint-block boundary.
+
+    A must-recompute node that feeds a must-recompute user carrying a larger
+    ``recompute`` tag has its own tag reset to 0, so the value crossing the
+    boundary is no longer tagged as recomputable. Returns ``joint_module``.
+    """
+    for producer in filter(must_recompute, joint_module.graph.nodes):
+        for consumer in producer.users:
+            if must_recompute(consumer) and consumer.meta["recompute"] > producer.meta["recompute"]:
+                producer.meta["recompute"] = 0
+    return joint_module
+
+
+def fuse_aware_min_cut_partition(
+    joint_module: fx.GraphModule, _joint_inputs, compiler="inductor", recomputable_ops=None,
+    *, num_fwd_outputs
+) -> Tuple[fx.GraphModule, fx.GraphModule]:
+    """
+    Partitions the joint graph such that the backward recomputes the forward.
+    Recomputing helps in trading off memory bandwidth with computation.
+
+    To create the fwd and bwd graph, we copy the joint graph, manually set the
+    outputs to just original forward or backward outputs. And then we run the
+    resulting graphs through dead code elimination.
+
+    .. warning::
+        This API is experimental and likely to change.
+
+    Args:
+        joint_module(fx.GraphModule): The joint forward and backward graph. This
+            is the result of AOT Autograd tracing.
+        _joint_inputs: The inputs to the joint graph. Only forwarded to the default-partition fallback.
+        compiler: This option determines the default set of recomputable ops.
+            Currently, there are two options: ``nvfuser`` and ``inductor``.
+        recomputable_ops: This is an optional set of recomputable ops. If this
+            is not None, then this set of ops will be used instead of the
+            default set of ops.
+        num_fwd_outputs: The number of outputs from the forward graph.
+
+    Returns:
+        Returns the generated forward and backward Fx graph modules.
+
+    Raises:
+        RuntimeError: If ``networkx`` cannot be imported.
+    """
+    if AOT_PARTITIONER_DEBUG:
+        print("num_fwd_outputs : ", num_fwd_outputs)
+    try:
+        import networkx as nx
+    except ImportError as e:
+        raise RuntimeError("Need networkx installed to perform smart recomputation "
+                           "heuristics") from e
+
+    joint_module.graph.eliminate_dead_code()
+    joint_module.recompile()
+
+    fx_g = joint_module.graph
+
+    #  add the CSE pass
+    if config.cse:
+        cse_graph = fx_graph_cse(fx_g)
+        joint_module.graph = cse_graph
+    full_bw_graph = joint_module.graph
+
+    graph_has_recomputable_ops = has_recomputable_ops(joint_module)
+    graph_has_recomputable_rng_ops = has_recomputable_rng_ops(joint_module)
+    if graph_has_recomputable_ops:
+        joint_module = cleanup_recompute_tags(joint_module)
+
+    name_to_node = {node.name: node for node in joint_module.graph.nodes}
+
+    def classify_nodes(joint_module):
+        required_bw_nodes = set()
+        for node in joint_module.graph.nodes:
+            if node.op == 'placeholder' and "tangents" in node.target:
+                required_bw_nodes.add(node)
+            if node in required_bw_nodes:
+                for user in node.users:
+                    required_bw_nodes.add(user)
+
+        primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
+        fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes))
+        inputs = primal_inputs + fwd_seed_offset_inputs
+        fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
+        required_bw_nodes.update(o for o in bwd_outputs if o is not None)
+        forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, inputs, fwd_outputs)
+        required_fw_nodes = {name_to_node[node.name] for node in forward_only_graph.nodes
+                             if node.op != 'output'}
+        unclaimed_nodes = {node for node in joint_module.graph.nodes
+                           if node not in required_fw_nodes and node not in required_bw_nodes}
+        return fwd_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes
+
+    orig_fw_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes = classify_nodes(joint_module)
+
+    # networkx blows up on graphs with no required backward nodes
+    # Since there's nothing to partition anyway, and the default partitioner can "handle"
+    # this case, send our graph over to the default partitioner.
+    if not required_bw_nodes:
+        return default_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs)
+
+    for node in reversed(joint_module.graph.nodes):
+        if node not in required_fw_nodes:
+            node.dist_from_bw = 0
+        else:
+            node.dist_from_bw = int(1e9)
+            for user in node.users:
+                node.dist_from_bw = min(node.dist_from_bw, user.dist_from_bw + 1)
+
+    aten = torch.ops.aten
+    prims = torch.ops.prims
+
+    # compiler == "nvfuser" is the default set of recomputable ops
+    default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy]  # noqa: E501,B950
+    view_ops = [aten.squeeze, aten.unsqueeze, aten.alias]
+    if compiler == "inductor":
+        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.arange, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.cumsum]  # noqa: E501,B950
+        view_ops += [aten.view, aten.slice, aten.permute, aten.t, aten.transpose, prims.broadcast_in_dim, aten.expand, aten.as_strided]
+        # Natalia said that we should allow recomputing indexing :)
+        default_recomputable_ops += [aten.index]
+    default_recomputable_ops += view_ops
+
+    default_recomputable_ops += pointwise_ops()
+
+    default_recomputable_ops += [
+        aten.zeros_like,
+    ]
+
+    default_recomputable_ops += [
+        method_to_operator(m)
+        for m in magic_methods
+    ]
+
+    recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops)
+
+    random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like]
+    compute_intensive_ops = [aten.mm, aten.convolution, aten.convolution_backward, aten.bmm, aten.addmm, aten.upsample_bilinear2d, aten._softmax, aten._softmax_backward_data, aten.native_layer_norm, aten.native_layer_norm_backward, aten.native_batch_norm, aten.native_batch_norm_backward, aten._native_batch_norm_legit]  # noqa: E501,B950
+
+    unrecomputable_ops = random_ops + compute_intensive_ops
+
+    fusible_ops = recomputable_ops | set(random_ops)
+
+    # Nodes that match a fusible pattern in ByteIR should be recomputed.
+    byteir_recompute_nodes = get_byteir_recompute_nodes(full_bw_graph, required_fw_nodes)
+
+    if AOT_PARTITIONER_DEBUG:
+        joint_module_ops = {
+            str(node.target._overloadpacket)
+            for node in joint_module.graph.nodes
+            if node.op == "call_function" and hasattr(node.target, "_overloadpacket")
+        }
+        ops_ignored = joint_module_ops - {str(i) for i in recomputable_ops}
+        print("Ops banned from rematerialization: ", ops_ignored)
+        print()
+
+    AGGRESSIVE_RECOMPUTATION = False
+
+    def is_materialized_backwards(node):
+        cur_nodes = {node}
+        while cur_nodes:
+            cur = cur_nodes.pop()
+            for user in cur.users:
+                if user not in required_fw_nodes and not is_fusible(cur, user):
+                    return True
+                if user not in required_fw_nodes and get_aten_target(user) in view_ops:
+                    cur_nodes.add(user)
+
+        return False
+
+    def ban_recomputation(node):
+        if node in byteir_recompute_nodes:
+            return False
+        if "recompute" in node.meta:
+            return node.meta["recompute"] == 0
+        elif AGGRESSIVE_RECOMPUTATION:
+            return (node.op == 'call_function' and get_aten_target(node) in unrecomputable_ops)
+        else:
+            if node.op != 'call_function':
+                return False
+            if get_aten_target(node) not in recomputable_ops:
+                return True
+            if node.target == operator.getitem:
+                return False
+            if node.target in [aten.lift_fresh_copy.default, aten.lift_fresh.default]:
+                return False
+
+            # If a node *must* be materialized in the backwards pass, then we
+            # should never recompute it. This is a pretty subtle point.  In
+            # general, the assumption we make is that recomputing a node in the
+            # backwards pass is "free". However, if a node must be materialized
+            # in the backwards pass, then recomputing it is never free.
+            if is_materialized_backwards(node):
+                return True
+
+            # Arbitrary hack that sometimes seems to help things. The above
+            # modification appears to have made this heuristic a lot less critical
+            # for performance.
+            # TODO: Investigate why this hack helps.
+            # TODO: Investigate the interaction with compiler assisted
+            # activation checkpointing. Removing the heuristic improves both
+            # memory footprint and speedup.
+            if not graph_has_recomputable_ops:
+                if compiler == "inductor" and node.dist_from_bw > config.max_dist_from_bw:
+                    return True
+            # If the output of an op is 4x smaller (arbitrary choice),
+            # then we don't allow recomputation.
+            input_tensors_size = sum(_size_of(i) for i in node.args if isinstance(i, fx.Node))
+            output_size = _size_of(node)
+            return (output_size * 4 < input_tensors_size)
+
+    def is_fusible(a, b):
+        return get_aten_target(a) in fusible_ops and get_aten_target(b) in fusible_ops
+
+    def is_materialized(node):
+        if node.op == 'placeholder':
+            return True
+
+        return not all(is_fusible(node, user) for user in node.users)
+
+    def is_byteir_fusible(node):  # NOTE(review): not referenced anywhere in this function
+        if get_aten_target(node) in [aten.transpose, aten.t]:
+            return all(get_aten_target(user) in compute_intensive_ops for user in node.users)
+        return False
+
+    def get_node_weight(node) -> int:
+        mem_sz = _size_of(node)
+
+        # Heuristic to bias towards nodes closer to the backwards pass
+        # Complete guess about current value
+        mem_sz = int(mem_sz * (1.1 ** max(min(node.dist_from_bw, 100), 1)))
+
+        if is_materialized(node):
+            return mem_sz
+        else:
+            return mem_sz * 2
+
+    nx_graph = nx.DiGraph()
+
+    for node in full_bw_graph.nodes:
+        if node.op == 'output':
+            continue
+
+        if node in required_bw_nodes:
+            nx_graph.add_edge(node.name + "_in", "sink", capacity=math.inf)
+            continue
+
+        if _is_primal(node) or _is_fwd_seed_offset(node):
+            nx_graph.add_edge("source", node.name + "_in", capacity=math.inf)
+
+        # If a node can't be recomputed (too expensive or involves randomness),
+        # we prevent it from being recomputed by adding an inf edge to the source
+        # We only need to ban nodes in the fw pass, as those are the only ones that would be recomputed.
+        if ban_recomputation(node) and node in required_fw_nodes:
+            nx_graph.add_edge("source", node.name + "_in", capacity=math.inf)
+
+        # Checks if a node is actually a tuple. Can be simplified to just an isinstance check if we always use faketensors.
+        is_non_tensor_node = (('val' not in node.meta and 'tensor_meta' not in node.meta) or
+                              ('val' in node.meta and not isinstance(node.meta['val'], torch.Tensor)))
+
+        if is_sym_node(node):
+            weight = sym_node_size(node)
+        elif is_non_tensor_node:
+            weight = math.inf
+        else:
+            weight = get_node_weight(node)
+
+        # Creates the weights on the "node" edge
+        nx_graph.add_edge(node.name + "_in", node.name + "_out", capacity=weight)
+        for user in node.users:
+            nx_graph.add_edge(node.name + "_out", user.name + "_in", capacity=math.inf)
+
+    # Tie the ByteIR recompute nodes to the sink with an inf edge, like required_bw_nodes above.
+    for node in byteir_recompute_nodes:
+        nx_graph.add_edge(node.name + "_in", "sink", capacity=math.inf)
+
+    try:
+        _cut_value, partition = nx.minimum_cut(nx_graph, "source", "sink")
+    except Exception:
+        print('Failed to compute min-cut on following graph:')
+        print('\n'.join(nx.readwrite.edgelist.generate_edgelist(nx_graph)))
+        raise
+
+    reachable, non_reachable = partition
+    cutset = set()
+    for u, nbrs in ((n, nx_graph[n]) for n in reachable):
+        cutset.update((u, v) for v in nbrs if v in non_reachable)
+
+    cut_nodes = set()
+    for node_in, node_out in cutset:
+        assert node_in[:-3] == node_out[:-4]
+        node_name = node_in[:-3]
+        cut_nodes.add(node_name)
+
+    # To make this stuff deterministic
+    node_idx = {node: idx for idx, node in enumerate(joint_module.graph.nodes)}
+    saved_values = sorted((name_to_node[node] for node in cut_nodes), key=lambda x: node_idx[x])
+    # save_for_backward on tensors and stashes symints in autograd .ctx
+    saved_sym_nodes = [n for n in saved_values if is_sym_node(n)]
+    saved_values = [n for n in saved_values if not is_sym_node(n)]
+    # NB: saved_sym_nodes will be mutated to reflect the actual saved symbols
+    fw_module, bw_module = _extract_fwd_bwd_modules(
+        joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs)
+
+    if graph_has_recomputable_ops:
+        if graph_has_recomputable_rng_ops:
+            fw_module, bw_module = functionalize_rng_ops(
+                joint_module, fw_module, bw_module, len(saved_sym_nodes)
+            )
+        bw_module = reordering_to_mimic_autograd_engine(bw_module)
+
+    if AOT_PARTITIONER_DEBUG:
+        print("Theoretical Activations Stored: ", sum(_size_of(i) for i in saved_values) / 1e9)
+        fw_module_nodes = {node.name for node in fw_module.graph.nodes if node.op == 'call_function'}
+        bw_module_nodes = {node.name for node in bw_module.graph.nodes if node.op == 'call_function'}
+        remat_nodes = fw_module_nodes & bw_module_nodes
+
+        counts = defaultdict(int)
+        for node in fw_module.graph.nodes:
+            if node.name in remat_nodes and hasattr(node.target, '_overloadpacket'):
+                counts[str(node.target._overloadpacket)] += 1
+        print(f"# remat/fw/bw: {len(remat_nodes)}/{len(fw_module_nodes)}/{len(bw_module_nodes)}")
+        print("Count of Ops Rematerialized: ", sorted(counts.items(), key=lambda x: x[1], reverse=True))
+    return fw_module, bw_module
+
+
+def draw_graph(traced: torch.fx.GraphModule, fname: str, figname: str = "fx_graph", clear_meta=True):
+    """Write ``traced`` to ``fname`` through FxGraphDrawer; a missing extension becomes ``.svg``."""
+    if clear_meta:
+        # Swap in a GraphModule built on a deep-copied graph before wiping meta,
+        # so the graph object handed in by the caller keeps its node metadata.
+        traced = fx.GraphModule(traced, copy.deepcopy(traced.graph))
+        for node in traced.graph.nodes:
+            node.meta = {}
+    base, ext = os.path.splitext(fname)
+    ext = ext or ".svg"
+    print(f"Writing FX graph to file: {base}{ext}")
+    dot = graph_drawer.FxGraphDrawer(traced, figname).get_main_dot_graph()
+    getattr(dot, "write_" + ext.lstrip("."))(f"{base}{ext}")
+
+
+def draw_joint_graph(graph, joint_inputs, file_name="full_graph.png"):
+    draw_graph(graph, file_name)  # render the joint graph to file_name before partitioning it
+    return default_partition(graph, joint_inputs)  # NOTE(review): the other call site in this file passes num_fwd_outputs=...; confirm it is optional
diff --git a/frontends/torch-frontend/third_party/patches/einsum.patch b/frontends/torch-frontend/third_party/patches/einsum.patch
index 3b32cd0f5..4f7b913f9 100644
--- a/frontends/torch-frontend/third_party/patches/einsum.patch
+++ b/frontends/torch-frontend/third_party/patches/einsum.patch
@@ -1,8 +1,8 @@
 diff --git a/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td b/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td
-index 09147dc8..4b69e9cd 100644
+index efdb89fa..c87de79e 100644
 --- a/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td
 +++ b/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td
-@@ -7801,6 +7801,31 @@ def Torch_AtenOneHotOp : Torch_Op<"aten.one_hot", [
+@@ -4834,6 +4834,31 @@ def Torch_AtenAddmmOp : Torch_Op<"aten.addmm", [
    }];
  }
  
@@ -31,27 +31,28 @@ index 09147dc8..4b69e9cd 100644
 +  }];
 +}
 +
- def Torch_AtenBucketizeTensorOp : Torch_Op<"aten.bucketize.Tensor", [
+ def Torch_AtenMatmulOp : Torch_Op<"aten.matmul", [
      AllowsTypeRefinement,
      HasValueSemantics,
 diff --git a/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp b/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp
-index 4dafed1d..a3c908bf 100644
+index 558e31c6..fefc337e 100644
 --- a/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp
 +++ b/lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp
-@@ -5061,6 +5061,311 @@ public:
+@@ -5022,6 +5022,460 @@ public:
  };
  } // namespace
  
 +namespace {
-+// Decompose AtenEinsumOp to AtenMmOp or AtenBmmOp
-+// Step 1: split input equation to input/result tokens and find batchingDims and
-+// contractingDims for future use
-+// Step 2: transpose the input tensors to [batchingDims[0,1,2],
-+// otherDims[0,1,2], contractingDims[0,1,2]]
-+// Step 3: reshape the input tensors, the final shape should
-+// be[batchingDims, otherDims, contractingDims]
-+// Step 4: use AtenMatmulOp to get the result, loop util we get the final
-+// result
++// Decompose AtenEinsumOp to AtenMatmulOp, and supports possible reduce
++// operation and permute operation. Currently, this pass doesn't support
++// Hadamard product. The basic idea is that: 
++//  Step 1: split the string equation to input/result tokens and find 
++//    batchingDims, contractingDims, otherDims and reduceDims. 
++//  Step 2: permute and reshape input tensors suitable 
++//    for matmul operations. 
++//  Step 3: use AtenMatmulOp to get the result. 
++//  Step 4: iteratively execute step 2 & 3 until we get the final result. 
++//  Step 5: perform remaining permute and reduce operations. 
 +// notice: support static shape only
 +
 +static bool parseEquation(const std::string &equation,
@@ -85,95 +86,358 @@ index 4dafed1d..a3c908bf 100644
 +  return true;
 +}
 +
-+// Prepare Tensor for Matmul Operations, we will transpose the input tensor
-+// to make it in order as [batchingDims, otherDims, contractingDims]
-+// example: bcwd,bcdh->bcwh
-+// Step1 : [b,c,h,d]
-+// Step2 : [b*c,h,d]
-+// Step3 : [e(=b*c), h, d]
-+static Value prepareTensorForMatmulOperations(
-+    PatternRewriter &rewriter, Operation *op, Value inputTensor,
-+    const SmallVector<Value> &shape, const SmallVector<int64_t> &contractingDims,
-+    const SmallVector<int64_t> &batchingDims, SmallVector<Value> &finalShape,
-+    const SmallVector<char> &tokens) {
-+  SmallVector<int64_t> otherDims;
-+  Value middleDimProduct =
-+      rewriter.create<ConstantIntOp>(op->getLoc(), rewriter.getI64IntegerAttr(1));
-+  for (size_t i = 0; i < shape.size(); ++i) {
-+    if (std::find(batchingDims.begin(), batchingDims.end(), i) ==
-+            batchingDims.end() &&
-+        std::find(contractingDims.begin(), contractingDims.end(), i) ==
-+            contractingDims.end()) {
-+      middleDimProduct =
-+          rewriter.create<AtenMulIntOp>(op->getLoc(), middleDimProduct, shape[i]);
-+      otherDims.push_back(i);
++// classify every dim token into different categories. Note that although we
++// parse out reduce dims, we delay their execution until
++// `performLastPermuteAndReduce`.
++static void parseDimTokens(
++    SmallVector<char> &lhsTokens, SmallVector<char> &rhsTokens,
++    SmallVector<char> &finalResultTokens, SmallVector<char> &contractingDims,
++    SmallVector<char> &lhsReduceDims, SmallVector<char> &rhsReduceDims,
++    SmallVector<char> &batchingDims, SmallVector<char> &lhsOtherDims,
++    SmallVector<char> &rhsOtherDims) {
++  llvm::SmallDenseSet<char> lhsTokenSet(lhsTokens.begin(), lhsTokens.end());
++  llvm::SmallDenseSet<char> rhsTokenSet(rhsTokens.begin(), rhsTokens.end());
++  llvm::SmallDenseSet<char> finalResultTokenSet(finalResultTokens.begin(),
++                                                finalResultTokens.end());
++
++  for (size_t i = 0; i < lhsTokens.size(); ++i) {
++    bool rhsContains = rhsTokenSet.contains(lhsTokens[i]);
++    bool finalResultConatins = finalResultTokenSet.contains(lhsTokens[i]);
++    // batching dim
++    if (rhsContains && finalResultConatins) {
++      batchingDims.push_back(lhsTokens[i]);
++      // reduce dim of lhs
++    } else if (!rhsContains && !finalResultConatins) {
++      lhsReduceDims.push_back(lhsTokens[i]);
++      // other dim of lhs
++    } else if (finalResultConatins) {
++      lhsOtherDims.push_back(lhsTokens[i]);
++      // contracting dim of lhs
++    } else if (rhsContains) {
++      contractingDims.push_back(lhsTokens[i]);
 +    }
 +  }
-+  int64_t otherDimsSize = otherDims.size();
-+  if (!batchingDims.empty()) {
-+    int64_t usedOtherDim = 0;
-+    Value batchingDimProduct =
-+        rewriter.create<ConstantIntOp>(op->getLoc(), rewriter.getI64IntegerAttr(1));
-+    int64_t batchingDimsRank = batchingDims.size();
-+    for (int64_t i = 0; i < batchingDimsRank; ++i) {
-+      batchingDimProduct =
-+          rewriter.create<AtenMulIntOp>(op->getLoc(), batchingDimProduct,
-+                                        shape[batchingDims[i]]);
-+      if (batchingDims[i] != i) {
-+        Value batchingDim =
-+            rewriter.create<ConstantIntOp>(op->getLoc(),
-+                                           rewriter.getI64IntegerAttr(
-+                                               batchingDims[i]));
-+        Value indexDim = rewriter.create<ConstantIntOp>(
-+            op->getLoc(), rewriter.getI64IntegerAttr(otherDims[usedOtherDim]));
-+        inputTensor = rewriter.create<AtenTransposeIntOp>(
-+            op->getLoc(), op->getResultTypes(), inputTensor, batchingDim, indexDim);
-+        usedOtherDim += 1;
-+      }
++
++  for (size_t i = 0; i < rhsTokens.size(); ++i) {
++    bool lhsContains = lhsTokenSet.contains(rhsTokens[i]);
++    bool finalResultConatins = finalResultTokenSet.contains(rhsTokens[i]);
++    // batching dim
++    if (lhsContains && finalResultConatins) {
++      // reduce dim of rhs
++    } else if (!lhsContains && !finalResultConatins) {
++      rhsReduceDims.push_back(rhsTokens[i]);
++      // other dim of rhs
++    } else if (finalResultConatins) {
++      rhsOtherDims.push_back(rhsTokens[i]);
++      // contracting dim of rhs
++    } else if (lhsContains) {
++    }
++  }
++}
++
++static void generateIdealReusltDimTokens(SmallVector<char> &batchingDims,
++                                         SmallVector<char> &lhsOtherDims,
++                                         SmallVector<char> &rhsOtherDims,
++                                         SmallVector<char> &lhsReduceDims,
++                                         SmallVector<char> &rhsReduceDims,
++                                         SmallVector<char> &resultTokens) {
++  // generate ideal result dims, i.e.,
++  // [*batchingDims, *lhsOtherDims, *lhsReduceDims, *rhsOtherDims,
++  // *rhsReduceDims]
++  resultTokens.insert(resultTokens.end(), batchingDims.begin(),
++                      batchingDims.end());
++  resultTokens.insert(resultTokens.end(), lhsOtherDims.begin(),
++                      lhsOtherDims.end());
++  resultTokens.insert(resultTokens.end(), lhsReduceDims.begin(),
++                      lhsReduceDims.end());
++  resultTokens.insert(resultTokens.end(), rhsOtherDims.begin(),
++                      rhsOtherDims.end());
++  resultTokens.insert(resultTokens.end(), rhsReduceDims.begin(),
++                      rhsReduceDims.end());
++}
++
++static Value permuteTensorForMatmul(PatternRewriter &rewriter, Location loc,
++                                    Value input, SmallVector<char> &dimTokens,
++                                    SmallVector<char> &batchingDims,
++                                    SmallVector<char> &contractingDims,
++                                    SmallVector<char> &otherDims,
++                                    SmallVector<char> &reduceDims, bool isLhs) {
++  auto inputType = input.getType().cast<BaseTensorType>();
++  llvm::SmallDenseMap<char, int64_t> dimTokenMap;
++  for (size_t idx = 0; idx < dimTokens.size(); ++idx) {
++    dimTokenMap[dimTokens[idx]] = idx;
++  }
++
++  SmallVector<Value> permuteVec;
++  auto appendDims = [&](SmallVector<char> dimTokens) {
++    for (auto d : dimTokens) {
++      permuteVec.push_back(rewriter.create<Torch::ConstantIntOp>(
++          loc, rewriter.getI64IntegerAttr(dimTokenMap[d])));
++    }
++  };
++
++  appendDims(batchingDims);
++  if (!isLhs)
++    appendDims(contractingDims);
++  appendDims(otherDims);
++  appendDims(reduceDims);
++  if (isLhs)
++    appendDims(contractingDims);
++
++  Value dstDims = rewriter.create<Torch::PrimListConstructOp>(
++      loc, Torch::ListType::get(Torch::IntType::get(rewriter.getContext())),
++      permuteVec);
++  auto outType = inputType.getWithSizesAndDtype(std::nullopt,
++                                                inputType.getOptionalDtype());
++  return rewriter.create<Torch::AtenPermuteOp>(loc, outType, input, dstDims);
++}
++
++// [*batchingDims, *lhsOtherDims, *lhsReduceDims, *lhsContractingDims] =>
++// [batchingDimsProd, lhsOtherDimsProd, lhsContractingDimsProd]
++static Value collapseDimForMatmul(PatternRewriter &rewriter, Location loc,
++                                  Value input, int64_t batchDimsLength,
++                                  int64_t contractingDimsLength,
++                                  int64_t otherDimsLength,
++                                  int64_t reduceDimsLength, bool isLhs) {
++  auto inputType = input.getType().cast<BaseTensorType>();
++  auto inputRank = batchDimsLength + contractingDimsLength + otherDimsLength +
++                   reduceDimsLength;
++  SmallVector<Value> inputShapeTensor;
++  for (auto i = 0; i < inputRank; ++i) {
++    inputShapeTensor.emplace_back(rewriter.create<AtenSizeIntOp>(
++        loc, input,
++        rewriter.create<Torch::ConstantIntOp>(loc,
++                                              rewriter.getI64IntegerAttr(i))));
++  }
++
++  SmallVector<Value> outShapeTensor;
++  Value constOne =
++      rewriter.create<Torch::ConstantIntOp>(loc, rewriter.getI64IntegerAttr(1));
++  auto dimOffset = 0;
++
++  auto appendDims = [&](int64_t dimLength) {
++    Value prod = constOne;
++    for (auto i = 0; i < dimLength; ++i) {
++      prod = rewriter.create<AtenMulIntOp>(loc, prod,
++                                           inputShapeTensor[i + dimOffset]);
 +    }
-+    finalShape.push_back(batchingDimProduct);
++    outShapeTensor.emplace_back(prod);
++    dimOffset += dimLength;
++  };
++
++  appendDims(batchDimsLength);
++  if (!isLhs)
++    appendDims(contractingDimsLength);
++  appendDims(otherDimsLength + reduceDimsLength);
++  if (isLhs)
++    appendDims(contractingDimsLength);
++
++  auto outShapeValue = rewriter.create<Torch::PrimListConstructOp>(
++      loc, Torch::ListType::get(Torch::IntType::get(input.getContext())),
++      outShapeTensor);
++
++  auto outType = inputType.getWithSizesAndDtype(std::nullopt,
++                                                inputType.getOptionalDtype());
++  return rewriter.create<Torch::AtenReshapeOp>(loc, outType, input,
++                                               outShapeValue);
++}
++
++static LogicalResult performMatmul(PatternRewriter &rewriter, Location loc,
++                                   Value lhs, SmallVector<char> &lhsTokens,
++                                   Value rhs, SmallVector<char> &rhsTokens,
++                                   Value &result,
++                                   SmallVector<char> &resultTokens,
++                                   SmallVector<char> &finalResultTokens) {
++  auto lhsType = lhs.getType().cast<BaseTensorType>();
++  auto rhsType = rhs.getType().cast<BaseTensorType>();
++  Type promotedDType;
++
++  // promote dtype
++  if (lhsType.hasDtype() && rhsType.hasDtype()) {
++    auto lhsDtype = Torch::getScalarTypeForType(lhsType.getOptionalDtype());
++    auto rhsDtype = Torch::getScalarTypeForType(rhsType.getOptionalDtype());
++    auto promotedDTypeInt =
++        torch_upstream::promote_skip_undefined(lhsDtype, rhsDtype);
++    auto promotedDTypeIntValue = rewriter.create<Torch::ConstantIntOp>(
++        loc, rewriter.getI64IntegerAttr((int)promotedDTypeInt));
++    auto promotedDTypeInfo =
++        getTypeForScalarType(rewriter.getContext(), promotedDTypeInt,
++                            mlir::IntegerType::SignednessSemantics::Signed);
++    if (failed(promotedDTypeInfo))
++      rewriter.notifyMatchFailure(loc, "Failed to get type for promoted dtype");
++    promotedDType = *promotedDTypeInfo;
++
++    auto falseValue =
++        rewriter.create<Torch::ConstantBoolOp>(loc, rewriter.getBoolAttr(false));
++    auto noneValue = rewriter.create<Torch::ConstantNoneOp>(loc);
++    lhs = rewriter.create<Torch::AtenToDtypeOp>(
++        loc,
++        lhsType.getWithSizesAndDtype(lhsType.getOptionalSizes(), promotedDType),
++        lhs, promotedDTypeIntValue, falseValue, falseValue, noneValue);
++    rhs = rewriter.create<Torch::AtenToDtypeOp>(
++        loc,
++        rhsType.getWithSizesAndDtype(rhsType.getOptionalSizes(), promotedDType),
++        rhs, promotedDTypeIntValue, falseValue, falseValue, noneValue);
++  } else {
++    promotedDType = lhsType.hasDtype() ? lhsType.getOptionalDtype() : rhsType.getOptionalDtype();
++  }
++  
++  llvm::SmallDenseMap<char, Value> lhsDimShapeMap;
++  for (size_t idx = 0; idx < lhsTokens.size(); ++idx) {
++    char d = lhsTokens[idx];
++    lhsDimShapeMap[d] = rewriter.create<AtenSizeIntOp>(
++        loc, lhs,
++        rewriter.create<Torch::ConstantIntOp>(loc,
++                                              rewriter.getI64IntegerAttr(idx)));
 +  }
-+  finalShape.push_back(middleDimProduct);
-+  if (!contractingDims.empty()) {
-+    int64_t usedOtherDim = 1;
-+    int64_t rank = tokens.size();
-+    Value contractingDimProduct =
-+        rewriter.create<ConstantIntOp>(op->getLoc(), rewriter.getI64IntegerAttr(1));
-+    int64_t contractingDimsRank = contractingDims.size();
-+    for (int64_t i = contractingDimsRank - 1; i > -1; --i) {
-+      contractingDimProduct =
-+          rewriter.create<AtenMulIntOp>(op->getLoc(), contractingDimProduct,
-+                                        shape[contractingDims[i]]);
-+      if (contractingDims[i] != rank - contractingDimsRank + i) {
-+        Value contractingDim =
-+            rewriter.create<ConstantIntOp>(op->getLoc(),
-+                                           rewriter.getI64IntegerAttr(
-+                                               contractingDims[i]));
-+        Value indexDim = rewriter.create<ConstantIntOp>(
-+            op->getLoc(), rewriter.getI64IntegerAttr(
-+                     otherDims[otherDimsSize - usedOtherDim]));
-+        inputTensor = rewriter.create<AtenTransposeIntOp>(
-+            op->getLoc(), op->getResultTypes(), inputTensor, contractingDim, indexDim);
-+        usedOtherDim += 1;
++  llvm::SmallDenseMap<char, Value> rhsDimShapeMap;
++  for (size_t idx = 0; idx < rhsTokens.size(); ++idx) {
++    char d = rhsTokens[idx];
++    rhsDimShapeMap[d] = rewriter.create<AtenSizeIntOp>(
++        loc, rhs,
++        rewriter.create<Torch::ConstantIntOp>(loc,
++                                              rewriter.getI64IntegerAttr(idx)));
++  }
++
++  // parse batch, contracting, other, reduce dims of lhs and rhs
++  SmallVector<char> contractingDims;
++  SmallVector<char> lhsReduceDims;
++  SmallVector<char> rhsReduceDims;
++  SmallVector<char> lhsOtherDims;
++  SmallVector<char> rhsOtherDims;
++  SmallVector<char> batchingDims;
++  parseDimTokens(lhsTokens, rhsTokens, finalResultTokens, contractingDims,
++                 lhsReduceDims, rhsReduceDims, batchingDims, lhsOtherDims,
++                 rhsOtherDims);
++
++  llvm::SmallDenseMap<char, Value> outDimShapeMap;
++  auto generateOutDimShapeMap = [&](SmallVector<char> &dims) {
++    for (auto d : dims) {
++      bool lhsContains = lhsDimShapeMap.count(d) > 0;
++      bool rhsContains = rhsDimShapeMap.count(d) > 0;
++      if (lhsContains && rhsContains) {
++        outDimShapeMap[d] = rewriter.create<Torch::PrimMaxIntOp>(
++            loc, lhsDimShapeMap[d], rhsDimShapeMap[d]);
++      } else if (lhsContains) {
++        outDimShapeMap[d] = lhsDimShapeMap[d];
++      } else if (rhsContains) {
++        outDimShapeMap[d] = rhsDimShapeMap[d];
 +      }
 +    }
-+    finalShape.push_back(contractingDimProduct);
++  };
++
++  generateOutDimShapeMap(contractingDims);
++  generateOutDimShapeMap(batchingDims);
++  generateOutDimShapeMap(lhsReduceDims);
++  generateOutDimShapeMap(rhsReduceDims);
++  generateOutDimShapeMap(lhsOtherDims);
++  generateOutDimShapeMap(rhsOtherDims);
++
++  if (contractingDims.size() == 0 && lhsOtherDims.size() == 0 &&
++      rhsOtherDims.size() == 0) {
++    return rewriter.notifyMatchFailure(
++        loc, "Hadamard product is currently not supported");
++  }
++
++  // shape: [*batchingDims, *lhsOtherDims, *lhsReduceDims, *lhsContractingDims]
++  lhs = permuteTensorForMatmul(rewriter, loc, lhs, lhsTokens, batchingDims,
++                               contractingDims, lhsOtherDims, lhsReduceDims,
++                               true);
++  // shape: [*batchingDims, *rhsContractingDims, *rhsOtherDims, *rhsReduceDims]
++  rhs = permuteTensorForMatmul(rewriter, loc, rhs, rhsTokens, batchingDims,
++                               contractingDims, rhsOtherDims, rhsReduceDims,
++                               false);
++  // shape: [batchingDimsProd, lhsOtherDimsProd, lhsContractingDimsProd]
++  lhs = collapseDimForMatmul(rewriter, loc, lhs, batchingDims.size(),
++                             contractingDims.size(), lhsOtherDims.size(),
++                             lhsReduceDims.size(), true);
++  // shape: [batchingDimsProd, rhsContractingDimsProd, rhsOtherDimsProd]
++  rhs = collapseDimForMatmul(rewriter, loc, rhs, batchingDims.size(),
++                             contractingDims.size(), rhsOtherDims.size(),
++                             rhsReduceDims.size(), false);
++
++  // perform matmul
++  auto outType =
++    lhsType.getWithSizesAndDtype(std::nullopt, promotedDType);
++  result = rewriter.create<Torch::AtenMatmulOp>(loc, outType, lhs, rhs);
++
++  // generate ideal result dims.
++  generateIdealReusltDimTokens(batchingDims, lhsOtherDims, rhsOtherDims,
++                               lhsReduceDims, rhsReduceDims, resultTokens);
++
++  // reshape matmul result to ideal shape:
++  // [batchingDimsProd, lhsOtherDimsProd, rhsOtherDimsProd] =>
++  // [*batchingDims, *lhsOtherDims, *lhsReduceDims, *rhsOtherDims,
++  // *rhsReduceDims]
++  SmallVector<Value> outShapeTensors;
++  for (char d : resultTokens) {
++    outShapeTensors.emplace_back(outDimShapeMap[d]);
 +  }
-+  return inputTensor;
++
++  auto outResultShape = rewriter.create<Torch::PrimListConstructOp>(
++      loc, Torch::ListType::get(Torch::IntType::get(lhs.getContext())),
++      outShapeTensors);
++  result = rewriter.create<Torch::AtenReshapeOp>(
++      loc,
++      lhsType.getWithSizesAndDtype(std::nullopt, promotedDType),
++      result, outResultShape);
++  return success();
 +}
 +
-+static Value createReshapedTensor(PatternRewriter &rewriter, Location loc,
-+                                  Operation* op, Type tensorType, Value tensor,
-+                                  SmallVector<Value> &shape) {
-+  auto listType = Torch::ListType::get(Torch::IntType::get(op->getContext()));
-+  Value reshapedDims =
-+      rewriter.create<PrimListConstructOp>(loc, listType, shape);
-+  return rewriter.create<AtenReshapeOp>(loc, tensorType, tensor, reshapedDims);
++static Value performLastReduceAndPermute(PatternRewriter &rewriter,
++                                         Location loc, Type outType, Value input,
++                                         SmallVector<char> &inputTokens,
++                                         SmallVector<char> &outTokens) {
++  auto inputType = input.getType().cast<BaseTensorType>();
++
++  llvm::SmallDenseSet<char> outTokenSet(outTokens.begin(), outTokens.end());
++  SmallVector<int64_t> sumDims;
++  llvm::SmallDenseMap<char, int64_t> inputDimToIdx;
++  int64_t idx = 0;
++  for (size_t i = 0; i < inputTokens.size(); ++i) {
++    char d = inputTokens[i];
++    if (!outTokenSet.contains(d)) {
++      sumDims.emplace_back(i);
++    } else {
++      inputDimToIdx[d] = idx++;
++    }
++  }
++
++  if (sumDims.size() > 0) {
++    SmallVector<Value> sumDimsTensor;
++    for (auto d : sumDims) {
++      sumDimsTensor.emplace_back(rewriter.create<Torch::ConstantIntOp>(
++          loc, rewriter.getI64IntegerAttr(d)));
++    }
++    auto sumDimsListValue = rewriter.create<Torch::PrimListConstructOp>(
++        loc, Torch::ListType::get(Torch::IntType::get(rewriter.getContext())),
++        sumDimsTensor);
++    auto falseValue = rewriter.create<Torch::ConstantBoolOp>(
++        loc, rewriter.getBoolAttr(false));
++    auto noneValue = rewriter.create<Torch::ConstantNoneOp>(loc);
++    input = rewriter.create<Torch::AtenSumDimIntListOp>(
++        loc,
++        inputType.getWithSizesAndDtype(std::nullopt,
++                                       inputType.getOptionalDtype()),
++        input, sumDimsListValue, falseValue, noneValue);
++  }
++
++  SmallVector<Value> permuteDimsTensor;
++  for (auto d : outTokens) {
++    permuteDimsTensor.emplace_back(rewriter.create<Torch::ConstantIntOp>(
++        loc, rewriter.getI64IntegerAttr(inputDimToIdx[d])));
++  }
++  auto permuteDimsListValue = rewriter.create<Torch::PrimListConstructOp>(
++      loc, Torch::ListType::get(Torch::IntType::get(input.getContext())),
++      permuteDimsTensor);
++  auto out = rewriter.create<Torch::AtenPermuteOp>(loc, outType, input,
++                                                   permuteDimsListValue);
++  return out;
 +}
 +
 +
 +class DecomposeAtenEinsumOp : public OpRewritePattern<AtenEinsumOp> {
-+ public:
++public:
 +  using OpRewritePattern::OpRewritePattern;
 +  LogicalResult matchAndRewrite(AtenEinsumOp op,
 +                                PatternRewriter &rewriter) const override {
@@ -185,162 +449,47 @@ index 4dafed1d..a3c908bf 100644
 +    SmallVector<char> resultTokens;
 +    SmallVector<SmallVector<char>> inputTokens;
 +    if (!parseEquation(equation, inputTokens, resultTokens)) {
-+      return rewriter.notifyMatchFailure(op, "Unexpected character in equations encountered");
++      return rewriter.notifyMatchFailure(
++          op, "Unexpected character in equations encountered");
 +    }
 +
 +    SmallVector<Value> inputTensors;
-+    SmallVector<SmallVector<Value>> inputShapes;
 +    if (!getListConstructElements(op.getTensors(), inputTensors)) {
 +      return rewriter.notifyMatchFailure(
 +          op, "input should comes from a PrimListConstructOp");
 +    }
 +
-+    for (size_t i = 0; i < inputTensors.size(); i++) {
-+      BaseTensorType tensorType =
-+          inputTensors[i].getType().cast<BaseTensorType>();
-+      if (!tensorType.hasSizes()) {
-+        return rewriter.notifyMatchFailure(
-+            op, "unimplemented: input tensor must have known sizes");
-+      }
-+      ArrayRef<int64_t> inputShape = tensorType.getSizes();
-+      SmallVector<Value> inputValueShape;
-+      for (unsigned j = 0; j < inputShape.size(); j++) {
-+        inputValueShape.push_back(rewriter.create<AtenSizeIntOp>(
-+                                        loc, inputTensors[i],
-+                                        rewriter.create<Torch::ConstantIntOp>(
-+                                            loc, rewriter.getI64IntegerAttr(j))));
-+      }
-+      inputShapes.push_back(inputValueShape);
-+    }
-+
-+    auto collectOperandDims = [resultTokens](
-+                                  const SmallVector<Value> operandShape,
-+                                  const SmallVector<char> operandTokens,
-+                                  const SmallVector<char> others,
-+                                  SmallVectorImpl<int64_t> &contractingDims,
-+                                  SmallVectorImpl<int64_t> &batchingDims,
-+                                  SmallVector<char> &dotResultTokens,
-+                                  SmallVector<Value> &dotResultShape) {
-+      llvm::SmallDenseSet<char> othersSet(others.begin(), others.end());
-+      llvm::SmallDenseSet<char> resultTokensSet(resultTokens.begin(),
-+                                                resultTokens.end());
-+      for (const auto &en : llvm::enumerate(operandTokens)) {
-+        bool isResultToken = resultTokensSet.contains(en.value());
-+        bool isOtherToken = othersSet.contains(en.value());
-+        if (!isResultToken && isOtherToken) {
-+          contractingDims.push_back(en.index());
-+        } else if (isOtherToken) {
-+          batchingDims.push_back(en.index());
-+        } else {
-+          dotResultTokens.push_back(en.value());
-+          dotResultShape.push_back(operandShape[en.index()]);
-+        }
-+      }
++    auto allTensorHasSizes = [](Value tensor) {
++      auto type = tensor.getType().dyn_cast<BaseTensorType>();
++      if (!type || !type.hasSizes())
++        return false;
++      return true;
 +    };
 +
-+    Value constZero =
-+        rewriter.create<ConstantIntOp>(loc, rewriter.getI64IntegerAttr(0));
-+    Value constOne =
-+        rewriter.create<ConstantIntOp>(loc, rewriter.getI64IntegerAttr(1));
-+    Value constTwo =
-+        rewriter.create<ConstantIntOp>(loc, rewriter.getI64IntegerAttr(2));
-+    if (inputTensors.size() == 1) {
-+      return rewriter.notifyMatchFailure(
-+            op, "unimplemented: single input tensor is not supported");
++    if (!llvm::all_of(inputTensors, allTensorHasSizes)) {
++      return rewriter.notifyMatchFailure(op,
++                                         "all input tensors should have sizes");
 +    }
-+    while (inputTensors.size() > 1) {
-+      SmallVector<int64_t> lhsContractingDims, lhsBatchingDims,
-+          rhsContractingDims, rhsBatchingDims;
-+      SmallVector<char> dotResultTokens;
-+      SmallVector<Value> dotResultShape;
-+      SmallVector<Value> lhsShape = inputShapes[0];
-+      SmallVector<Value> rhsShape = inputShapes[1];
-+      SmallVector<char> lhsTokens = inputTokens[0];
-+      SmallVector<char> rhsTokens = inputTokens[1];
-+      Value lhsTensor = inputTensors[0];
-+      Value rhsTensor = inputTensors[1];
-+      // Step 1: split input equation to input/result tokens
-+      collectOperandDims(lhsShape, lhsTokens, rhsTokens, lhsContractingDims,
-+                         lhsBatchingDims, dotResultTokens, dotResultShape);
-+      collectOperandDims(rhsShape, rhsTokens, lhsTokens, rhsContractingDims,
-+                         rhsBatchingDims, dotResultTokens, dotResultShape);
-+      // Prepend batch tokens.
-+      for (const auto &it : llvm::enumerate(lhsBatchingDims)) {
-+        char batchingToken = lhsTokens[it.value()];
-+        Value batchingShapeDim = lhsShape[it.value()];
-+        dotResultTokens.insert(dotResultTokens.begin() + it.index(),
-+                               batchingToken);
-+        dotResultShape.insert(dotResultShape.begin() + it.index(),
-+                              batchingShapeDim);
-+      }
-+      // Lowering to dot_general does not support a mismatch between the number
-+      // of result dims and the number of non-contracting dims.
-+
-+      SmallVector<Value> lhsFinalShape, rhsFinalShape;
-+      SmallVector<Value> finalShape = dotResultShape;
-+      // Step 2: transpose the input tensors to [batchingDims[0,1,2],
-+      // otherDims[0,1,2], contractingDims[0,1,2]]
-+      lhsTensor = prepareTensorForMatmulOperations(rewriter, op, lhsTensor, lhsShape,
-+                               lhsContractingDims, lhsBatchingDims,
-+                               lhsFinalShape, lhsTokens);
-+      rhsTensor = prepareTensorForMatmulOperations(rewriter, op, rhsTensor, rhsShape,
-+                               rhsContractingDims, rhsBatchingDims,
-+                               rhsFinalShape, rhsTokens);
-+
-+      // Step 3: reshape the input tensors, the final shape should
-+      // be[batchingDims, otherDims, contractingDims]
-+      auto listType = Torch::ListType::get(Torch::IntType::get(op->getContext()));
-+      Value lhsReshapedDims =
-+          rewriter.create<PrimListConstructOp>(loc, listType, lhsFinalShape);
-+      Value lhs = rewriter.create<AtenReshapeOp>(loc, op.getType(), lhsTensor, lhsReshapedDims);
-+      Value rhsReshapedDims =
-+          rewriter.create<PrimListConstructOp>(loc, listType, rhsFinalShape);
-+      Value rhs = rewriter.create<AtenReshapeOp>(loc, op.getType(), rhsTensor, rhsReshapedDims);
-+      Value result;
-+
-+      // Step 4: use AtenMatmulOp to get the result, loop util we
-+      // get the final result
-+      if (!rhsContractingDims.empty() && !rhsBatchingDims.empty()){
-+        rhs = rewriter.create<AtenTransposeIntOp>(loc, op.getType(), rhs, constOne, constTwo);
-+      } else if (!rhsContractingDims.empty()){
-+        rhs = rewriter.create<AtenTransposeIntOp>(loc, op.getType(), rhs, constZero, constOne);
-+      }
-+      result = rewriter.create<AtenMatmulOp>(loc, op.getType(), lhs, rhs);
-+      result = createReshapedTensor(rewriter, loc, op, op.getType(), result, finalShape);
-+
-+      inputTensors.erase(inputTensors.begin(), inputTensors.begin() + 2);
-+      inputTokens.erase(inputTokens.begin(), inputTokens.begin() + 2);
-+      inputShapes.erase(inputShapes.begin(), inputShapes.begin() + 2);
-+      inputTensors.push_back(result);
-+      inputTokens.push_back(dotResultTokens);
-+      inputShapes.push_back(dotResultShape);
-+      if (inputTokens.size() == 1) {
-+        // Lowering to dot_general does not support a mismatch between the number
-+        // of result dims and the number of non-contracting dims.
-+        if (dotResultTokens.size() != resultTokens.size()) {
-+          return rewriter.notifyMatchFailure(op,
-+                                            "rank reducing einsum not supported");
-+        }
-+        int64_t resultSize = 0;
-+        for (char resultToken : resultTokens) {
-+          auto *foundIt = std::find(dotResultTokens.begin(), dotResultTokens.end(),
-+                                    resultToken);
-+          if (foundIt == dotResultTokens.end()) {
-+            return rewriter.notifyMatchFailure(
-+                op, "result token not found in operands");
-+          }
-+          auto resultIndex = std::distance(dotResultTokens.begin(), foundIt);
-+          if (resultIndex > resultSize) {
-+            Value first = rewriter.create<Torch::ConstantIntOp>(loc, rewriter.getI64IntegerAttr(resultSize));
-+            Value second = rewriter.create<Torch::ConstantIntOp>(loc, rewriter.getI64IntegerAttr(resultIndex));
-+            result = rewriter.create<AtenTransposeIntOp>(loc, op.getType(), result, first, second);
-+          }
-+          resultSize += 1;
-+        } 
-+        // The dot_general is already in an appropriate result order.
-+        rewriter.replaceOp(op, ValueRange{result});
++
++    SmallVector<char> lhsTokens = inputTokens[0];
++    Value lhs = inputTensors[0];
++    Value result;
++
++    for (size_t i = 1; i < inputTensors.size(); ++i) {
++      auto rhs = inputTensors[i];
++      auto rhsTokens = inputTokens[i];
++      SmallVector<char> outTokens;
++      if (failed(performMatmul(rewriter, loc, lhs, lhsTokens, rhs, rhsTokens,
++                               result, outTokens, resultTokens))) {
++        return failure();
 +      }
++      lhs = result;
++      lhsTokens = outTokens;
 +    }
++
++    result = performLastReduceAndPermute(rewriter, loc, op.getType(), lhs, lhsTokens,
++                                         resultTokens);
++    rewriter.replaceOp(op, result);
 +    return success();
 +  }
 +};
@@ -348,16 +497,16 @@ index 4dafed1d..a3c908bf 100644
 +
 +
  namespace {
- class DecomposeComplexOpsPass
-     : public DecomposeComplexOpsBase<DecomposeComplexOpsPass> {
-@@ -5164,6 +5469,7 @@ public:
-     addPatternIfTargetOpIsIllegal<DecomposeAtenRandLikeOp>(patterns);
-     addPatternIfTargetOpIsIllegal<DecomposeAtenHardsigmoidOp>(patterns);
-     addPatternIfTargetOpIsIllegal<DecomposeAtenRelu6Op>(patterns);
+ // Unconditionally decompose `aten.tile` into `aten.repeat`.
+ class DecomposeAtenTileOp : public OpRewritePattern<AtenTileOp> {
+@@ -5221,6 +5675,7 @@ public:
+     addPatternIfTargetOpIsIllegal<DecomposeAtenVarMeanOp>(patterns);
+     addPatternIfTargetOpIsIllegal<DecomposeAtenEluOp>(patterns);
+     addPatternIfTargetOpIsIllegal<DecomposeAtenLeakyReluOp>(patterns);
 +    addPatternIfTargetOpIsIllegal<DecomposeAtenEinsumOp>(patterns);
-     addPatternIfTargetOpIsIllegal<DecomposeAtenHardswishOp>(patterns);
-     addPatternIfTargetOpIsIllegal<DecomposeAtenSoftplusOp>(patterns);
-     addPatternIfTargetOpIsIllegal<DecomposeAtenSiluOp>(patterns);
+     addPatternIfTargetOpIsIllegal<DecomposeAtenLeakyReluBackwardOp>(patterns);
+     addPatternIfTargetOpIsIllegal<DecomposeAtenNewEmptyStridedOp>(patterns);
+     addPatternIfTargetOpIsIllegal<DecomposeAtenBucketizeTensorOp>(patterns);
 diff --git a/lib/Dialect/Torch/Transforms/LowerToBackendContract.cpp b/lib/Dialect/Torch/Transforms/LowerToBackendContract.cpp
 index 76119828..179440c6 100644
 --- a/lib/Dialect/Torch/Transforms/LowerToBackendContract.cpp
diff --git a/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp b/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp
index a96b17087..3760e78c7 100644
--- a/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp
+++ b/frontends/torch-frontend/torch-frontend/lib/Conversion/ConvertTorchToCustomCall.cpp
@@ -970,6 +970,10 @@ class ConvertFlashAttnFwdOp : public OpConversionPattern<OperatorOp> {
     Type softmaxLseTy = op.getResult(5).getType();
     Type softmaxTy = op.getResult(6).getType();
     Type rngTy = op.getResult(7).getType();
+    // Do not need softmax return if there's no use
+    if (op.getResult(6).use_empty())
+      returnSoftmax = false;
+
     SmallVector<Type> resultTypes;
     if (failed(getTypeConverter()->convertTypes(
             {outputPadTy, softmaxLseTy, softmaxTy, rngTy}, resultTypes))) {
diff --git a/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py b/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py
index 99bf2fadd..ad64e0508 100644
--- a/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py
+++ b/frontends/torch-frontend/torch-frontend/python/test/test_attn_rewrite.py
@@ -183,3 +183,21 @@ def test_flash_attn_opt_pattern():
 
     torch.testing.assert_close(golden_loss, flash_loss, atol=1e-4, rtol=1e-6)
     torch.testing.assert_close(golden_logits, flash_logits, atol=3e-3, rtol=1e-6)
+
+
+def test_flash_attn_llama_inference_pattern():
+    config = transformers.LlamaConfig(num_hidden_layers=4)
+    model = transformers.LlamaForCausalLM(config=config).to("cuda")
+    model.eval()
+
+    input, label = make_data(model, "cuda")
+    trace_data = [input]
+
+    from torch.fx.experimental.proxy_tensor import make_fx
+    from torch_frontend import preprocess_fx_graph
+    # module = torch.jit.trace(model, trace_data, check_trace=False)
+    with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16):
+        fx_g = make_fx(model)(*trace_data)
+        fx_g = preprocess_fx_graph(fx_g)
+        all_formatted = "\n".join([n.format_node() for n in fx_g.graph.nodes])
+        FileCheck().check("call_function").check("torch.ops.byteir.flash_attn_fwd").run(all_formatted)
diff --git a/frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py b/frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py
new file mode 100644
index 000000000..d6143c3a1
--- /dev/null
+++ b/frontends/torch-frontend/torch-frontend/python/test/test_fx_utils.py
@@ -0,0 +1,18 @@
+import torch
+import torch.fx as fx
+import torch_frontend
+from torch_frontend.fx_utils import _replace_aten_full_arugment
+
+class FullModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+    
+    def forward(self, x):
+        y = torch.ops.aten.full(x.shape, True, dtype=torch.bool)
+        return y
+
+
+def test_full_bool_pattern():
+    fx_g = fx.symbolic_trace(FullModule())
+    fx_g = _replace_aten_full_arugment(fx_g)
+    module = torch.jit.script(fx_g)
diff --git a/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py b/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py
index 7c6fda4a8..86d885215 100644
--- a/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py
+++ b/frontends/torch-frontend/torch-frontend/python/torch_frontend/__init__.py
@@ -25,8 +25,7 @@
 del importlib
 del _torch_frontend_registry
 
-from .ts_utils import register_decomposition_in_torchscript
-from .fx_utils import list_decomposed_ops, preprocess_fx_graph
+from .fx_utils import list_decomposed_ops, preprocess_fx_graph, get_none_indices
 from .convert_to_mhlo import convert_to_mhlo_via_torch_mlir, compile
 from .flash_attn_op import replace_flash_attn
 from .fx_rewrite import fx_replace_attn_pattern
diff --git a/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py b/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py
index eb075a648..20d95e2b5 100644
--- a/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py
+++ b/frontends/torch-frontend/torch-frontend/python/torch_frontend/fx_utils.py
@@ -76,6 +76,34 @@ def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
     return removed_indexes
 
 
+# note: torch.jit.script doesn't support  torch.ops.aten.full([2, 1, 1, 128], True, dtype = torch.bool), replace it with  torch.ops.aten.full([2, 1, 1, 128], 1, dtype = torch.bool)
+def _replace_aten_full_arugment(fx_g: torch.fx.GraphModule) -> torch.fx.GraphModule :
+    def get_aten_target(node):
+        if hasattr(node.target, 'overloadpacket'):
+            return node.target.overloadpacket
+        return node.target
+
+    nodes = []
+    for node in fx_g.graph.nodes:
+        if get_aten_target(node) == torch.ops.aten.full:
+            if node.args[1] == True or node.args[1] == False:
+                nodes.append(node)
+    for node in nodes:
+        if node.args[1] == True:
+            with fx_g.graph.inserting_after(node):
+                new_node = fx_g.graph.call_function(torch.ops.aten.full, args=(node.args[0], 1), kwargs=node.kwargs)
+                node.replace_all_uses_with(new_node)
+                fx_g.graph.erase_node(node)
+        if node.args[1] == False:
+            with fx_g.graph.inserting_after(node):
+                new_node = fx_g.graph.call_function(torch.ops.aten.full, args=(node.args[0], 0), kwargs=node.kwargs)
+                node.replace_all_uses_with(new_node)
+                fx_g.graph.erase_node(node)
+    fx_g.graph.lint()
+    fx_g.recompile()
+    return fx_g
+
+
 def threshold_backward_pattern(grad_output, inp, threshold):
     return torch.ops.aten.threshold_backward(grad_output, inp, threshold)
 
@@ -96,6 +124,61 @@ def unsafe_index_put_pattern(self, indices, values, accumulate):
 def unsafe_index_put_replacement(self, indices, values, accumulate):
     return  torch.ops.aten.index_put_.hacked_twin(self, indices, values, accumulate)
 
+# LLaMA aten attention op pattern
+def LLaMAAttnPattern(query, key, value, attn_mask, min_val, inv_scale, batch, num_head, fused_batch, seq_len, head_dim):
+    transpose_3 = torch.ops.aten.transpose.int(key, 2, 3)
+    expand_2 = torch.ops.aten.expand.default(query, [batch, num_head, seq_len, head_dim])
+    clone = torch.ops.aten.clone.default(expand_2, memory_format = torch.contiguous_format)
+    _unsafe_view_3 = torch.ops.aten._unsafe_view.default(clone, [fused_batch, seq_len, head_dim])
+    expand_3 = torch.ops.aten.expand.default(transpose_3, [batch, num_head, head_dim, seq_len])
+    clone_1 = torch.ops.aten.clone.default(expand_3, memory_format = torch.contiguous_format)
+    _unsafe_view_4 = torch.ops.aten._unsafe_view.default(clone_1, [fused_batch, head_dim, seq_len])
+    bmm = torch.ops.aten.bmm.default(_unsafe_view_3, _unsafe_view_4)
+    _unsafe_view_5 = torch.ops.aten._unsafe_view.default(bmm, [batch, num_head, seq_len, seq_len])
+    div = torch.ops.aten.div.Tensor(_unsafe_view_5, inv_scale)
+    add_5 = torch.ops.aten.add.Tensor(div, attn_mask)
+    maximum = torch.ops.aten.maximum.default(add_5, min_val)
+    _softmax = torch.ops.aten._softmax.default(maximum, -1, False)
+    _to_copy_10 = torch.ops.aten._to_copy.default(_softmax, dtype = torch.float16)
+    expand_4 = torch.ops.aten.expand.default(_to_copy_10, [batch, num_head, seq_len, seq_len])
+    view_8 = torch.ops.aten.view.default(expand_4, [fused_batch, seq_len, seq_len]);  expand_4 = None
+    expand_5 = torch.ops.aten.expand.default(value, [batch, num_head, seq_len, head_dim])
+    clone_2 = torch.ops.aten.clone.default(expand_5, memory_format = torch.contiguous_format)
+    _unsafe_view_6 = torch.ops.aten._unsafe_view.default(clone_2, [fused_batch, seq_len, head_dim])
+    bmm_1 = torch.ops.aten.bmm.default(view_8, _unsafe_view_6)
+    _unsafe_view_5 = torch.ops.aten._unsafe_view.default(bmm_1, [batch, num_head, seq_len, head_dim])
+    return _softmax, _unsafe_view_5
+
+
+def LLaMAAttnReplacement(query, key, value, attn_mask, min_val, inv_scale, batch, num_head, fused_batch, seq_len, head_dim):
+    out, q_pad, k_pad, v_pad, out_pad, softmax_lse, S_dmask, rng_state = torch.ops.byteir.flash_attn_fwd(
+        query,
+        key,
+        value,
+        0.0,
+        1.0/inv_scale,
+        True,
+        True
+    )
+    return S_dmask, out
+
+
+def get_none_indices(fx_g: torch.fx.GraphModule) -> List[int]:
+    none_indices = []
+    for node in fx_g.graph.nodes:
+        if node.op == "output":
+            assert len(node.args) == 1, "Output node must have a single argument"
+            node_arg = node.args[0]
+            if isinstance(node_arg, (list, tuple)):
+                node_arg = list(node_arg)
+                node_args_len = len(node_arg)
+                for i in range(node_args_len):
+                    if node_arg[i] is None:
+                        none_indices.append(i)
+                break
+    return none_indices
+
+
 def list_decomposed_ops():
     return [
         torch.ops.aten._native_batch_norm_legit_functional,
@@ -108,15 +191,18 @@ def list_decomposed_ops():
         torch.ops.aten.tril
     ]
 
+
 def preprocess_fx_graph(fx_graph: torch.fx.GraphModule):
     if _returns_nothing(fx_graph):
         return fx_graph
 
     torch.fx.replace_pattern(fx_graph, squeeze_dims_pattern, squeeze_dims_replacement)
     torch.fx.replace_pattern(fx_graph, unsafe_index_put_pattern, unsafe_index_put_replacement)
+    torch.fx.replace_pattern(fx_graph, LLaMAAttnPattern, LLaMAAttnReplacement)
     was_unwrapped = _unwrap_single_tuple_return(fx_graph)
     was_list_replaced = _list_return_to_tuple_return(fx_graph)
     removed_none_indexes = _remove_nones(fx_graph)
     strip_overloads(fx_graph)
     torch.fx.replace_pattern(fx_graph, threshold_backward_pattern, threshold_backward_replacement)
+    fx_graph = _replace_aten_full_arugment(fx_graph)
     return fx_graph
diff --git a/runtime/include/brt/core/framework/op_accessor.h b/runtime/include/brt/core/framework/op_accessor.h
index 8e3bb0c5d..352a63228 100644
--- a/runtime/include/brt/core/framework/op_accessor.h
+++ b/runtime/include/brt/core/framework/op_accessor.h
@@ -67,6 +67,9 @@ class OpAccessor {
 
   template <typename T> T GetAttrAsSplatValue(const std::string &name) const;
 
+  template <typename T>
+  std::vector<T> GetAttrAsVector(const std::string &name) const;
+
   std::string GetUID() const;
 
   static int64_t GetNumElementsOfShape(const Shape &shape);
diff --git a/runtime/lib/backends/cuda/providers/default/ait/ait.cc b/runtime/lib/backends/cuda/providers/default/ait/ait.cc
index e0e9e56e8..728a9aa70 100644
--- a/runtime/lib/backends/cuda/providers/default/ait/ait.cc
+++ b/runtime/lib/backends/cuda/providers/default/ait/ait.cc
@@ -370,7 +370,8 @@ AITOpKernel::AITOpKernel(const OpKernelInfo &info)
   std::string lib_path = brt::ir::GetParentPath(ir_path);
   lib_path += accessor.GetAttrAsString(std::string("ait_lib_file"));
   aitLibHdl = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
-  BRT_ENFORCE(aitLibHdl != nullptr, "AIT lib .so load failed");
+  std::string msg = std::string("AIT lib ") + lib_path + " load failed";
+  BRT_ENFORCE(aitLibHdl != nullptr, msg);
   std::string space = accessor.GetAttrAsString("device");
   IAllocator *alloc = info_.GetAllocator(space);
   workspaceSizeInBytes =
diff --git a/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc b/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc
index 3193bf51d..39c77d5a3 100644
--- a/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc
+++ b/runtime/lib/backends/cuda/providers/default/codegen/ptx.cc
@@ -37,8 +37,12 @@ using namespace mlir;
 
 #define FILE_NAME_ATTR "device_file_name"
 #define KERNEL_NAME_ATTR "kernel_name"
-#define GRID_SIZE_ATTR "GridSize.x"
-#define BLOCK_SIZE_ATTR "BlockSize.x"
+#define GRID_SIZE_X_ATTR "GridSize.x"
+#define GRID_SIZE_Y_ATTR "GridSize.y"
+#define GRID_SIZE_Z_ATTR "GridSize.z"
+#define BLOCK_SIZE_X_ATTR "BlockSize.x"
+#define BLOCK_SIZE_Y_ATTR "BlockSize.y"
+#define BLOCK_SIZE_Z_ATTR "BlockSize.z"
 #define ARG_RANKS_ATTR "arg_ranks"
 #define CALL_CONVENTION_ATTR "call_convention"
 
@@ -119,29 +123,57 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
     impl_->call_convention = "all";
   // static assignment for config
   // TODO extend to support dynamic
-  if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_ATTR)) {
+  if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)) {
     BRT_THROW_EX(std::runtime_error, "no GridSize.x attr");
   }
 
-  if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_ATTR)) {
+  if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)) {
     BRT_THROW_EX(std::runtime_error, "no BlockSize.x attr");
   }
 
-  if (!info.GetOperation()->hasAttrOfType<ArrayAttr>(ARG_RANKS_ATTR)) {
-    BRT_THROW_EX(std::runtime_error, "no arg_ranks attr");
+  int gx = static_cast<int>(info.GetOperation()
+                                ->getAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)
+                                .getInt()),
+      gy = 1, gz = 1;
+  if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)) {
+    gy = static_cast<int>(info.GetOperation()
+                              ->getAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)
+                              .getInt());
+  }
+  if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)) {
+    gz = static_cast<int>(info.GetOperation()
+                              ->getAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)
+                              .getInt());
   }
 
-  int gx = static_cast<int>(
-      info.GetOperation()->getAttrOfType<IntegerAttr>(GRID_SIZE_ATTR).getInt());
   int bx = static_cast<int>(info.GetOperation()
-                                ->getAttrOfType<IntegerAttr>(BLOCK_SIZE_ATTR)
-                                .getInt());
-  std::vector<int> ranks = GetIntArrayAttr(
-      info.GetOperation()->getAttrOfType<ArrayAttr>(ARG_RANKS_ATTR));
+                                ->getAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)
+                                .getInt()),
+      by = 1, bz = 1;
+  if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)) {
+    by = static_cast<int>(info.GetOperation()
+                              ->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)
+                              .getInt());
+  }
+  if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)) {
+    bz = static_cast<int>(info.GetOperation()
+                              ->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)
+                              .getInt());
+  }
+
+  std::vector<int> ranks;
+  if (info.GetOperation()->hasAttrOfType<ArrayAttr>(ARG_RANKS_ATTR)) {
+    ranks = GetIntArrayAttr(
+        info.GetOperation()->getAttrOfType<ArrayAttr>(ARG_RANKS_ATTR));
+  } else {
+    for (unsigned int i = 0; i < GetOpArgNum(info_); ++i) {
+      ranks.push_back(GetRankFromOpArgIndex(info_, i));
+    }
+  }
 
   auto num_arg = GetOpArgNum(info_);
-  impl_->grid = dim3(gx, 1, 1);
-  impl_->block = dim3(bx, 1, 1);
+  impl_->grid = dim3(gx, gy, gz);
+  impl_->block = dim3(bx, by, bz);
   impl_->shared_size = 0;
   impl_->arg_reserve_size = 3; // initial 3 for grid/block/shared_size
 
diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc
index 7dfb2baf3..4f9da6ae5 100644
--- a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc
+++ b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_bwd.cc
@@ -85,10 +85,10 @@ common::Status FlashAttnBwdOpKernel::RunImpl(const ExecutionContext &ctx) {
   }
 
   // dropout check
-  bool is_dropout = p_dropout > 0.0;
-  if (is_dropout) {
-    return InvalidArgs("currently, we only support p_dropout == 0");
-  }
+  // bool is_dropout = p_dropout > 0.0;
+  // if (is_dropout) {
+  //   return InvalidArgs("currently, we only support p_dropout == 0");
+  // }
 
   // type check
   const auto dout_type = accessor.GetArgDTypeEnum(0);
@@ -288,6 +288,7 @@ common::Status FlashAttnBwdOpKernel::RunImpl(const ExecutionContext &ctx) {
       /* seqlen_k */ seqlen_k,
       /* seqlen_q_rounded */ seqlen_q_rounded,
       /* seqlen_k_rounded */ seqlen_k_rounded,
+      /* p_dropout */ p_dropout,
       /* is_causal */ is_causal,
       /* stream */ stream);
 
diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc
index 1702623e4..5a588ac82 100644
--- a/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc
+++ b/runtime/lib/backends/cuda/providers/default/flash_attn/flash_attn_fwd.cc
@@ -50,10 +50,16 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) {
   void *q_ptr = accessor.GetArgAsyncValueRef(0);
   void *k_ptr = accessor.GetArgAsyncValueRef(1);
   void *v_ptr = accessor.GetArgAsyncValueRef(2);
-  void *o_ptr = accessor.GetArgAsyncValueRef(3);
-  void *softmax_lse_ptr = accessor.GetArgAsyncValueRef(4);
-  void *softmax_ptr = accessor.GetArgAsyncValueRef(5);
-  void *rng_state_ptr = accessor.GetArgAsyncValueRef(6); // TODO : handle rng
+  void *rng_state_ptr = accessor.GetArgAsyncValueRef(3);
+  void *o_ptr = accessor.GetArgAsyncValueRef(4);
+  void *softmax_lse_ptr = accessor.GetArgAsyncValueRef(5);
+  void *softmax_ptr = accessor.GetArgAsyncValueRef(6);
+
+  // check rng_state
+  // uint64_t *h_rng_state = new uint64_t[2];
+  // cudaMemcpy(h_rng_state, rng_state_ptr, 2 * sizeof(uint64_t),
+  // cudaMemcpyDeviceToHost); std::cout << h_rng_state[0] << "," <<
+  // h_rng_state[1] << std::endl; cudaDeviceSynchronize();
 
   // attr
   const bool is_causal = accessor.GetAttrAsBool("causal");
@@ -66,7 +72,7 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) {
   const auto q_shape = accessor.GetArgShape(0);
   const auto k_shape = accessor.GetArgShape(1);
   const auto v_shape = accessor.GetArgShape(2);
-  const auto o_shape = accessor.GetArgShape(3);
+  const auto o_shape = accessor.GetArgShape(4);
   int64_t o_rank = o_shape.size();
   int64_t q_rank = q_shape.size();
   int64_t k_rank = k_shape.size();
@@ -115,7 +121,7 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) {
   DTypeEnum q_dtype = accessor.GetArgDTypeEnum(0);
   DTypeEnum k_dtype = accessor.GetArgDTypeEnum(1);
   DTypeEnum v_dtype = accessor.GetArgDTypeEnum(2);
-  DTypeEnum o_dtype = accessor.GetArgDTypeEnum(3);
+  DTypeEnum o_dtype = accessor.GetArgDTypeEnum(4);
   if (o_dtype != q_dtype || q_dtype != k_dtype || k_dtype != v_dtype) {
     return InvalidArgs(
         "query, key, value, and output must have the same dtype");
@@ -194,6 +200,7 @@ common::Status FlashAttnFwdOpKernel::RunImpl(const ExecutionContext &ctx) {
                   /* seqlen_k */ seqlen_k,
                   /* seqlen_q_rounded */ seqlen_q_rounded,
                   /* seqlen_k_rounded */ seqlen_k_rounded,
+                  /* p_dropout */ p_dropout,
                   /* is_causal */ is_causal,
                   /* stream */ stream);
 
diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu
index e2981b63a..dce323437 100644
--- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu
+++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.cu
@@ -142,7 +142,7 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
              uint32_t seqlen_q, uint32_t seqlen_k, uint32_t seqlen_q_rounded,
              uint32_t seqlen_k_rounded,
 
-             int is_causal, cudaStream_t stream) {
+             float p_dropout, int is_causal, cudaStream_t stream) {
   Flash_fwd_params params;
   // Reset the parameters
   memset(&params, 0, sizeof(params));
@@ -187,7 +187,7 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
   params.scale_softmax = softmax_scale;
   params.scale_softmax_log2 = softmax_scale * M_LOG2E;
 
-  params.p_dropout = 1.; // probability to keep
+  params.p_dropout = 1.f - p_dropout; // probability to keep
   params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
   params.rp_dropout = 1.f / params.p_dropout;
   params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
@@ -195,6 +195,8 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
   params.cu_seqlens_q = cu_seqlens_q_ptr;
   params.cu_seqlens_k = cu_seqlens_k_ptr;
   params.p_ptr = softmax_ptr; // used for `return_softmax`.
+  params.rng_state = static_cast<uint64_t *>(rng_state_ptr);
+
   // print_Flash_fwd_params(params);
 
   FP16_SWITCH(!params.is_bf16, [&] {
@@ -225,7 +227,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
                  uint32_t seqlen_q, uint32_t seqlen_k,
                  uint32_t seqlen_q_rounded, uint32_t seqlen_k_rounded,
 
-                 int is_causal, cudaStream_t stream) {
+                 float p_dropout, int is_causal, cudaStream_t stream) {
   Flash_bwd_params params;
   // Reset the parameters
   memset(&params, 0, sizeof(params));
@@ -293,7 +295,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
   params.scale_softmax = softmax_scale;
   params.scale_softmax_log2 = softmax_scale * M_LOG2E;
 
-  params.p_dropout = 1.; // probability to keep
+  params.p_dropout = 1.f - p_dropout; // probability to keep
   params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
   params.rp_dropout = 1.f / params.p_dropout;
   params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
@@ -302,6 +304,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
   params.cu_seqlens_k = cu_seqlens_k_ptr;
   params.p_ptr = nullptr; // used for `return_softmax`, no use in bwd
   params.dsoftmax_sum = dsoftmax_sum_ptr;
+  params.rng_state = static_cast<uint64_t *>(rng_state_ptr);
 
   // print_Flash_bwd_params(params);
 
diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h
index 205ee88cd..5c6e6f70c 100644
--- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h
+++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_api.h
@@ -29,7 +29,7 @@ void run_mha(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
              uint32_t seqlen_q, uint32_t seqlen_k, uint32_t seqlen_q_rounded,
              uint32_t seqlen_k_rounded,
 
-             int is_causal, cudaStream_t stream);
+             float p_dropout, int is_causal, cudaStream_t stream);
 
 void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
                  void *dout_ptr, void *dq_ptr, void *dk_ptr, void *dv_ptr,
@@ -53,7 +53,7 @@ void run_mha_bwd(void *q_ptr, void *k_ptr, void *v_ptr, void *o_ptr,
                  uint32_t seqlen_q, uint32_t seqlen_k,
                  uint32_t seqlen_q_rounded, uint32_t seqlen_k_rounded,
 
-                 int is_causal, cudaStream_t stream);
+                 float p_dropout, int is_causal, cudaStream_t stream);
 
 } // namespace kernel
 } // namespace cuda
diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h
index b7f9d95a4..91bac6590 100644
--- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h
+++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_bwd_kernel.h
@@ -957,8 +957,18 @@ compute_dq_dk_dv_1colblock(const Params &params, const int bidb, const int bidh,
   // auto offset = params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx %
   // 32;
 
-  unsigned long long seed = 0;
-  unsigned long long offset = 0;
+  // deprecated: no rng support.
+  // unsigned long long seed = 0;
+  // unsigned long long offset = 0;
+
+  unsigned long long seed = params.rng_state[0];
+  unsigned long long offset =
+      params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % 32;
+
+  // if (block_id == 0 && tidx == 0) {
+  //   printf("seed:%lu\n",seed);
+  //   printf("offset:%lu\n",offset);
+  // }
 
   clear(acc_dv);
   clear(acc_dk);
@@ -1693,8 +1703,18 @@ compute_dq_dk_dv_1rowblock(const Params &params, const int bidb, const int bidh,
   // auto offset = params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx %
   // 32;
 
-  unsigned long long seed = 0;
-  unsigned long long offset = 0;
+  // deprecated: no rng support.
+  // unsigned long long seed = 0;
+  // unsigned long long offset = 0;
+
+  unsigned long long seed = params.rng_state[0];
+  unsigned long long offset =
+      params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % 32;
+
+  // if (block_id == 0 && tidx == 0) {
+  //   printf("seed:%lu\n",seed);
+  //   printf("offset:%lu\n",offset);
+  // }
 
   clear(acc_dq);
 
diff --git a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h
index e024129f4..d89242b36 100644
--- a/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h
+++ b/runtime/lib/backends/cuda/providers/default/flash_attn/kernels/flash_fwd_kernel.h
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <cute/algorithm/copy.hpp>
 #include <cute/algorithm/gemm.hpp>
+#include <iostream>
 
 #include <cutlass/array.h>
 #include <cutlass/cutlass.h>
@@ -376,8 +377,18 @@ inline __device__ void compute_attn_1rowblock(const Params &params,
   // unsigned long long offset = std::get<1>(seeds) + (bidb * params.h + bidh) *
   // 32 + tidx % 32;
 
-  unsigned long long seed = 0;
-  unsigned long long offset = 0;
+  // deprecated: no rng support.
+  // unsigned long long seed = 0;
+  // unsigned long long offset = 0;
+
+  unsigned long long seed = params.rng_state[0];
+  unsigned long long offset =
+      params.rng_state[1] + (bidb * params.h + bidh) * 32 + tidx % 32;
+
+  // if (block_id == 0 && tidx == 0) {
+  //   printf("seed:%lu\n",seed);
+  //   printf("offset:%lu\n",offset);
+  // }
 
   // Save seed and offset for backward.
   // if (block_id == 0 && tidx == 0) {
diff --git a/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc b/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc
index 582bc8ea0..e04e7a751 100644
--- a/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc
+++ b/runtime/lib/backends/cuda/providers/default/tensor_generate/fill.cc
@@ -40,14 +40,20 @@ common::Status FillOpKernel::RunImpl(const ExecutionContext &ctx) {
       static_cast<CUDAWorkQueue *>(ctx.work_queue)->GetComputeStream();
   void *device_p = accessor.GetArgAsyncValueRef(0);
   size_t length = accessor.GetNumElementsOfShape(accessor.GetArgShape(0));
+
   // TODO: common helper for dtype dispatch
 #define CASE(dtype, ctype, mlir_type)                                          \
-  case DTypeEnum::dtype:                                                       \
-    kernel::Fill(                                                              \
-        stream, static_cast<ctype *>(device_p),                                \
-        static_cast<ctype>(accessor.GetAttrAsSplatValue<mlir_type>("value")),  \
-        length);                                                               \
-    return common::Status::OK()
+  case DTypeEnum::dtype: {                                                     \
+    if (accessor.HasAttrOfSplatValue<mlir_type>("value")) {                    \
+      kernel::Fill(stream, static_cast<ctype *>(device_p),                     \
+                   static_cast<ctype>(                                         \
+                       accessor.GetAttrAsSplatValue<mlir_type>("value")),      \
+                   length);                                                    \
+      return common::Status::OK();                                             \
+    }                                                                          \
+    break;                                                                     \
+  }
+
   switch (dtype) {
     CASE(Float32, float, float);
     CASE(Int64, int64_t, int64_t);
@@ -55,11 +61,30 @@ common::Status FillOpKernel::RunImpl(const ExecutionContext &ctx) {
     CASE(Float16, __half, float);
 #undef CASE
   default:
-    return common::Status(common::StatusCategory::BRT,
-                          common::StatusCode::NOT_IMPLEMENTED,
-                          "not supported dtype");
+    break;
   };
-  return common::Status::OK();
+
+#define CASE(dtype, ctype)                                                     \
+  case DTypeEnum::dtype: {                                                     \
+    std::vector<ctype> value = accessor.GetAttrAsVector<ctype>("value");       \
+    cudaMemcpyAsync(device_p, value.data(), value.size() * sizeof(ctype),      \
+                    cudaMemcpyHostToDevice, stream);                           \
+    return common::Status::OK();                                               \
+  }
+
+  switch (dtype) {
+    CASE(Float32, float);
+    CASE(Int64, int64_t);
+    CASE(Float64, double);
+    CASE(Float16, half_float::half);
+#undef CASE
+  default:
+    break;
+  };
+
+  return common::Status(common::StatusCategory::BRT,
+                        common::StatusCode::NOT_IMPLEMENTED,
+                        "not supported FillOp");
 }
 
 common::Status FillOpKernel::ProloguePerFrame(const ExecutionContext &) {
diff --git a/runtime/lib/core/framework/op_accessor.cc b/runtime/lib/core/framework/op_accessor.cc
index 18176432e..975f0982e 100644
--- a/runtime/lib/core/framework/op_accessor.cc
+++ b/runtime/lib/core/framework/op_accessor.cc
@@ -171,6 +171,31 @@ T OpAccessor::GetAttrAsSplatValue(const std::string &name) const {
   BRT_THROW("Attribute " + name + " is not set");
 }
 
+// GetAttrAsVector copies every element of the dense int/FP elements attribute
+// `name` into a std::vector<T>, casting each one; otherwise it BRT_THROWs.
+// Iterating could be avoided with getRawData(), but i1 may need special care.
+template <typename T>
+std::vector<T> OpAccessor::GetAttrAsVector(const std::string &name) const {
+  std::vector<T> results;
+  if (auto attr =
+          info_.GetOperation()->getAttrOfType<DenseIntElementsAttr>(name)) {
+    results.reserve(attr.size());
+    for (APInt &&i : attr) {
+      results.push_back(static_cast<T>(i.getSExtValue()));
+    }
+    return results;
+  } else if (auto attr =
+                 info_.GetOperation()->getAttrOfType<DenseFPElementsAttr>(
+                     name)) {
+    results.reserve(attr.size());
+    for (APFloat &&i : attr) {
+      results.push_back(static_cast<T>(i.convertToDouble()));
+    }
+    return results;
+  }
+  BRT_THROW("Attribute " + name + " is not supported to get as vector");
+}
+
 std::string OpAccessor::GetUID() const {
   auto byre_op = llvm::cast<byre::ByreOp>(info_.GetOperation());
   return ByREHandle::GetOpUID(byre_op);
@@ -211,6 +236,18 @@ INST_ATTR_METH(double)
 INST_ATTR_METH(StringView)
 #undef INST_ATTR_METH
 
+#define INST_DENSE_ATTR_METH(T)                                                \
+  template std::vector<T> OpAccessor::GetAttrAsVector<T>(const std::string &)  \
+      const;
+INST_DENSE_ATTR_METH(float)
+INST_DENSE_ATTR_METH(int32_t)
+INST_DENSE_ATTR_METH(int64_t)
+INST_DENSE_ATTR_METH(uint8_t)
+INST_DENSE_ATTR_METH(uint32_t)
+INST_DENSE_ATTR_METH(double)
+INST_DENSE_ATTR_METH(half_float::half)
+#undef INST_DENSE_ATTR_METH
+
 #define INST_SCALAR_METH(T)                                                    \
   template T OpAccessor::GetArgScalar<T>(size_t);                              \
   template common::Status OpAccessor::SetResultScalar(size_t result_idx,       \
diff --git a/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc b/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc
index 6c870d93b..9f8c30335 100644
--- a/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc
+++ b/runtime/test/backends/cuda/providers/default/kernel/fill_test.cc
@@ -32,8 +32,6 @@ using namespace brt::cuda;
 using namespace brt::test;
 
 TEST(CUDATestFillOp, Basic) {
-  constexpr size_t length = 512 * 128;
-
   Session session;
   auto status_allocator = CUDAAllocatorFactory(&session);
   BRT_TEST_CHECK_STATUS(status_allocator);
@@ -54,8 +52,16 @@ TEST(CUDATestFillOp, Basic) {
   auto status_sync = request->Sync();
   BRT_TEST_CHECK_STATUS(status_sync);
 
+  size_t length = 512 * 128;
   CheckCUDAValues<float>(static_cast<float *>(request->GetArg(0)), length, 0.f);
   CheckCUDAValues<float>(static_cast<float *>(request->GetArg(1)), length, 1.f);
   CheckCUDAValues<__half>(static_cast<__half *>(request->GetArg(2)), length,
                           static_cast<__half>(1.f));
+  length = 3;
+  std::vector<half_float::half> results = {static_cast<half_float::half>(1.f),
+                                           static_cast<half_float::half>(2.f),
+                                           static_cast<half_float::half>(3.f)};
+  EXPECT_TRUE(CheckCUDAValuesWithCPUValues(
+      static_cast<__half *>(request->GetArg(3)),
+      reinterpret_cast<__half *>(results.data()), length));
 }
diff --git a/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc b/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc
index f5a2d9a3a..38a0bc87b 100644
--- a/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc
+++ b/runtime/test/backends/cuda/providers/default/kernel/flash_attn_fwd_test.cc
@@ -51,6 +51,7 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) {
   size_t head_dims = 32;
   size_t input_len = b * seq_len * num_heads * head_dims;
   size_t softmax_len = b * seq_len * num_heads;
+  // size_t rng_state_len = 2;
 
   Session session;
   auto status_allocator = CUDAAllocatorFactory(&session);
@@ -71,12 +72,22 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) {
   __half *d_v;
   float *d_softmax_lse;
 
+  // rng_state
+  // uint64_t *d_rng_state;
+  // uint64_t h_rng_state[2];
+  // h_rng_state[0] = 0UL;
+  // h_rng_state[1] = 3000UL;
+
   cudaMalloc(&d_o, input_len * sizeof(__half));
   cudaMalloc(&d_q, input_len * sizeof(__half));
   cudaMalloc(&d_k, input_len * sizeof(__half));
   cudaMalloc(&d_v, input_len * sizeof(__half));
   cudaMalloc(&d_softmax_lse, softmax_len * sizeof(float));
 
+  // cudaMalloc(&d_rng_state, rng_state_len * sizeof(uint64_t));
+  // cudaMemcpy(d_rng_state, h_rng_state, rng_state_len * sizeof(uint64_t),
+  // cudaMemcpyHostToDevice);
+
   ReadCUDAFloatValues(d_q, input_len, input_q_file);
   ReadCUDAFloatValues(d_k, input_len, input_k_file);
   ReadCUDAFloatValues(d_v, input_len, input_v_file);
@@ -96,6 +107,7 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) {
   request->BindArg(2, d_v);
   request->BindArg(3, d_o);
   request->BindArg(4, d_softmax_lse);
+  // request->BindArg(6, d_rng_state);
 
   request->FinishIOBinding();
 
@@ -104,7 +116,7 @@ TEST(SM80CUDATestFlashAttnFwd, Basic) {
   auto status_sync = request->Sync();
   BRT_TEST_CHECK_STATUS(status_sync);
 
-  // PrintCUDAValues(d_o, input_len, input_len);
+  PrintCUDAValues(d_o, input_len, input_len);
 
   CheckCUDABuffer<__half>(
       (__half *)d_o, /* size */ input_len, [&](__half *h_ptr) {
diff --git a/runtime/test/include/brt/test/common/cuda/util.h b/runtime/test/include/brt/test/common/cuda/util.h
index 6e63144e8..3def1a9af 100644
--- a/runtime/test/include/brt/test/common/cuda/util.h
+++ b/runtime/test/include/brt/test/common/cuda/util.h
@@ -146,6 +146,19 @@ template <typename T>
   return passed;
 }
 
+template <typename T>
+[[nodiscard]] bool CheckCUDAValuesWithCPUValues(T *first, T *second,
+                                                size_t size,
+                                                size_t print_count = 10) {
+  cudaDeviceSynchronize();
+  T *h_first = (T *)malloc(size * sizeof(T));
+  cudaMemcpy(h_first, first, size * sizeof(T), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  bool passed = CheckCPUValues<T>(h_first, second, size, print_count);
+  free(h_first);
+  return passed;
+}
+
 // print floating point values
 template <typename T,
           std::enable_if_t<std::is_floating_point<T>::value, int> = 0>
diff --git a/runtime/test/test_files/fill_cuda.mlir b/runtime/test/test_files/fill_cuda.mlir
index 5c25bf916..81b78c438 100644
--- a/runtime/test/test_files/fill_cuda.mlir
+++ b/runtime/test/test_files/fill_cuda.mlir
@@ -1,10 +1,12 @@
 module attributes {byre.container_module} {
   func.func @test_fill(%arg0 : memref<512x128xf32, "cuda"> {byre.argname = "Fill0", byre.argtype = 2: i32},
                  %arg1 : memref<512x128xf32, "cuda"> {byre.argname = "Fill1", byre.argtype = 2: i32},
-                 %arg2 : memref<512x128xf16, "cuda"> {byre.argname = "Fill1FP16", byre.argtype = 2: i32}) attributes {byre.entry_point} {
+                 %arg2 : memref<512x128xf16, "cuda"> {byre.argname = "Fill1FP16", byre.argtype = 2: i32},
+                 %arg3 : memref<3xf16, "cuda"> {byre.argname = "FillNonSplat", byre.argtype = 2: i32}) attributes {byre.entry_point} {
     byre.compute @FillOp(%arg0) {value = dense<0.000000e+00> : tensor<512x128xf32>} : memref<512x128xf32, "cuda">
     byre.compute @FillOp(%arg1) {value = dense<1.000000e+00> : tensor<512x128xf32>} : memref<512x128xf32, "cuda">
     byre.compute @FillOp(%arg2) {value = dense<1.000000e+00> : tensor<512x128xf16>} : memref<512x128xf16, "cuda">
+    byre.compute @FillOp(%arg3) {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<3xf16>} : memref<3xf16, "cuda">
     return
   }
 }
\ No newline at end of file
diff --git a/runtime/test/test_files/flash_attn_fwd.mlir b/runtime/test/test_files/flash_attn_fwd.mlir
index eab8e694a..358b551f9 100644
--- a/runtime/test/test_files/flash_attn_fwd.mlir
+++ b/runtime/test/test_files/flash_attn_fwd.mlir
@@ -6,7 +6,7 @@ module attributes {byre.container_module} {
                  %arg4 : memref<1x3x128xf32, "cuda"> {byre.argname = "SoftmaxLse", byre.argtype = 2: i32},
                  %arg5 : memref<1x3x128x128xf32, "cuda"> {byre.argname = "SoftmaxPtr", byre.argtype = 2: i32},
                  %arg6 : memref<2xi64, "cuda"> {byre.argname = "RngState", byre.argtype = 2: i32}) attributes {byre.entry_point} {
-    byre.compute @byteir.flash_attn_fwd(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6) {causal = true, dropout_p = 0.000000e+00 : f32, return_softmax = false, softmax_scale = 0.500000e+00 : f32} : memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x3x128xf32, "cuda">, memref<1x3x128x128xf32, "cuda">, memref<2xi64, "cuda">
+    byre.compute @byteir.flash_attn_fwd(%arg0, %arg1, %arg2, %arg6, %arg3, %arg4, %arg5) {causal = true, dropout_p = 0.000000e+00 : f32, return_softmax = false, softmax_scale = 0.500000e+00 : f32} : memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<2xi64, "cuda">, memref<1x128x3x32xf16, "cuda">, memref<1x3x128xf32, "cuda">, memref<1x3x128x128xf32, "cuda">
     return
   }
-}
\ No newline at end of file
+}
diff --git a/tests/numerical_test/execute.py b/tests/numerical_test/execute.py
index 46e67351a..c265fd1d7 100644
--- a/tests/numerical_test/execute.py
+++ b/tests/numerical_test/execute.py
@@ -91,6 +91,7 @@ def compile_and_run_mlir(mhlo_file, target):
         interp = Interpreter.load_from_file(mhlo_file)
         np_inputs = generate_np_inputs(interp)
         func_name = get_entry_func_name(interp)
+        unique_name = os.path.basename(mhlo_file).split('.')[0]
 
         # run golden
         golden_outputs = interp.call_function(func_name, np_inputs)
@@ -98,8 +99,8 @@ def compile_and_run_mlir(mhlo_file, target):
         # byteir compile
         TEMP_FOLDER = "./local_test"
         os.makedirs(TEMP_FOLDER, exist_ok=True)
-        os.makedirs(TEMP_FOLDER + f"/{func_name}", exist_ok=True)
-        output_mlir_file_name = f'{TEMP_FOLDER}/{func_name}/{func_name}.rt.mlir'
+        os.makedirs(TEMP_FOLDER + f"/{unique_name}", exist_ok=True)
+        output_mlir_file_name = f'{TEMP_FOLDER}/{unique_name}/{unique_name}.rt.mlir'
         byteir.compile(mhlo_file, output_mlir_file_name,
                        entry_func=func_name, target=target)
     except Exception as e:
diff --git a/tests/numerical_test/main.py b/tests/numerical_test/main.py
index ae2c416e4..ed46a05cd 100644
--- a/tests/numerical_test/main.py
+++ b/tests/numerical_test/main.py
@@ -31,7 +31,7 @@
 parser.add_argument("--target", type=str, default="cuda_with_ait",
                     choices=["ait", "cuda", "cuda_with_ait_aggressive"], help="target device name")
 parser.add_argument("-c", "--config", default="all",
-                    choices=["all", "mlir", "torch"], help="test sets to run.")
+                    choices=["all", "mlir", "torch", "dynamo"], help="test sets to run.")
 args = parser.parse_args()
 
 EXCLUDE_MLIR_TESTS = []
@@ -40,10 +40,11 @@
 
 SM80_PLUS_TESTS = [
     "dot_f32.mlir",
+    "bmm_rrr_permute_f16.mlir",
     "bmm_rrr_permute_f32.mlir",
     "MatmulF32Module_basic",
     "BatchMatmulAddF32Module_basic",
-    "BatchMatmulF32Module",
+    "BatchMatmulF32Module_basic",
 ]
 
 
@@ -115,13 +116,18 @@ def main():
     if args.config == 'all':
         results = run_mlir_test(arch)
         results = results + run_torch_test(arch)
+        # TODO(zzk): disable flash attn test for now
+        # run_torch_dynamo_tests(arch)
     elif args.config == 'mlir':
         results = run_mlir_test(arch)
     elif args.config == 'torch':
         results = run_torch_test(arch)
+    elif args.config == 'dynamo':
+        # TODO(zzk): use test infra for dynamo tests
+        # TODO(zzk): disable flash attn test for now
+        # run_torch_dynamo_tests(arch)
+        pass
     failed = report_results(results)
-    # TODO(zzk): disable flash attn test for now
-    # run_torch_dynamo_tests(arch)
     sys.exit(1 if failed else 0)
 
 
diff --git a/tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir b/tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir
new file mode 100644
index 000000000..3c0cb7a86
--- /dev/null
+++ b/tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir
@@ -0,0 +1,6 @@
+func.func @bmm_rrr_permute(%arg0: tensor<32x64x64xf16>, %arg1: tensor<32x64x128xf16>) -> tensor<1x64x32x128xf16> {
+    %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot<lhs_batching_dimensions = [0], rhs_batching_dimensions = [0], lhs_contracting_dimensions = [2], rhs_contracting_dimensions = [1]>} : (tensor<32x64x64xf16>, tensor<32x64x128xf16>) -> tensor<32x64x128xf16>
+    %1 = mhlo.reshape %0 : (tensor<32x64x128xf16>) -> tensor<1x32x64x128xf16>
+    %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x32x64x128xf16>) -> tensor<1x64x32x128xf16>
+    return %2 : tensor<1x64x32x128xf16>
+}
\ No newline at end of file
diff --git a/tests/numerical_test/mlir_tests/ops/concat2.mlir b/tests/numerical_test/mlir_tests/ops/concat2.mlir
new file mode 100644
index 000000000..fe7d7779a
--- /dev/null
+++ b/tests/numerical_test/mlir_tests/ops/concat2.mlir
@@ -0,0 +1,6 @@
+func.func @concat2(%arg0: tensor<i64>, %arg1: tensor<i64>) -> (tensor<2xi64>) {
+  %0 = mhlo.reshape %arg0 : (tensor<i64>) -> tensor<1xi64>
+  %1 = mhlo.reshape %arg1 : (tensor<i64>) -> tensor<1xi64>
+  %2 = "mhlo.concatenate"(%0, %1) {dimension = 0 : i64} : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64>
+  return %2 : tensor<2xi64>
+}
diff --git a/tests/numerical_test/torch_dynamo_e2e_testing/backend.py b/tests/numerical_test/torch_dynamo_e2e_testing/backend.py
index 0a7ed28fc..a76663efd 100644
--- a/tests/numerical_test/torch_dynamo_e2e_testing/backend.py
+++ b/tests/numerical_test/torch_dynamo_e2e_testing/backend.py
@@ -18,7 +18,7 @@
 import byteir
 
 from torch_frontend import compile
-from torch_frontend import list_decomposed_ops, preprocess_fx_graph, fx_replace_attn_pattern, replace_flash_attn 
+from torch_frontend import list_decomposed_ops, preprocess_fx_graph, fx_replace_attn_pattern, replace_flash_attn, get_none_indices
 
 from functorch.compile import aot_module
 from torch._decomp import get_decompositions
@@ -67,22 +67,6 @@ def __call__(self, *inputs):
                 ret_ptr += 1
         return results
 
-def get_none_indices(fx_g: torch.fx.GraphModule) -> List[int]:
-    none_indices = []
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert len(node.args) == 1, "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, (list, tuple)):
-                node_arg = list(node_arg)
-                node_args_len = len(node_arg)
-                for i in range(node_args_len):
-                    if node_arg[i] is None:
-                        none_indices.append(i)
-                break
-
-    return none_indices
-
 def byteir_compile_fx_inner(graph: torch.fx.GraphModule, inputs, is_backward, ban_lst=[]):
     category = 'backward' if is_backward else 'forward'