diff --git a/include/tpu_mlir/Dialect/Top/IR/TopOps.td b/include/tpu_mlir/Dialect/Top/IR/TopOps.td
index 4e086f696..120e16991 100755
--- a/include/tpu_mlir/Dialect/Top/IR/TopOps.td
+++ b/include/tpu_mlir/Dialect/Top/IR/TopOps.td
@@ -729,7 +729,6 @@ def Top_AttentionOp: Top_Op<"Attention"> {
   );
 
   let results = (outs AnyTensor:$output);
-  let hasCanonicalizer = 1;
 }
 
 def Top_PadOp:Top_Op<"Pad"> {
diff --git a/lib/Dialect/Top/Canonicalize/Attention.cpp b/lib/Dialect/Top/Canonicalize/Attention.cpp
deleted file mode 100644
index a6f84896d..000000000
--- a/lib/Dialect/Top/Canonicalize/Attention.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Copyright (C) 2022 Sophgo Technologies Inc.  All rights reserved.
-//
-// TPU-MLIR is licensed under the 2-Clause BSD License except for the
-// third-party components.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "tpu_mlir/Dialect/Top/IR/TopOps.h"
-#include "tpu_mlir/Support/Module.h"
-
-
-using namespace tpu_mlir::top;
-
-
-struct TopFuseAttention : public OpRewritePattern<AttentionOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(AttentionOp op,
-                                PatternRewriter &rewriter) const override {
-
-    return failure();
-  }
-};
-
-void AttentionOp::getCanonicalizationPatterns(RewritePatternSet &results,
-                                              MLIRContext *context) {
-  results.insert<TopFuseAttention>(context);
-}
diff --git a/lib/Dialect/Top/Transforms/ChipOptimize/OptimizeBM1684X.cpp b/lib/Dialect/Top/Transforms/ChipOptimize/OptimizeBM1684X.cpp
index eb281268d..9e05a6d00 100644
--- a/lib/Dialect/Top/Transforms/ChipOptimize/OptimizeBM1684X.cpp
+++ b/lib/Dialect/Top/Transforms/ChipOptimize/OptimizeBM1684X.cpp
@@ -102,8 +102,9 @@ class ConvertMatMul2Attention : public OpRewritePattern<top::MatMulOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(top::MatMulOp op,
                                 PatternRewriter &rewriter) const override {
+    // Pattern disabled: sd_decoder_pt errors on bm1684x/bm1686; code below is unreachable.
+    return failure();
     auto filter = op.getRight();
-    // return failure();
     if (module::isWeight(filter) == false) {
       return failure();
     }
diff --git a/lib/Dialect/Tpu/Interfaces/BM1684X/Load.cpp b/lib/Dialect/Tpu/Interfaces/BM1684X/Load.cpp
index 34b03b419..89f24a832 100644
--- a/lib/Dialect/Tpu/Interfaces/BM1684X/Load.cpp
+++ b/lib/Dialect/Tpu/Interfaces/BM1684X/Load.cpp
@@ -73,8 +73,8 @@ void tpu::LoadOp::codegen_local_bm1684x(int64_t n_step, int64_t c_step,
   gdma_format = BM168x::getGdmaFormat(data_type);
   auto fmt_bytes = BM168x::getFmtBytes(data_type);
   auto g_addr = module::getAddress(getInput());
-  int64_t dhw = D * H * W;
-  int64_t eu_num = BM168x::eu_num(fmt_bytes);
+  // int64_t dhw = D * H * W;
+  // int64_t eu_num = BM168x::eu_num(fmt_bytes);
   int64_t use_3ic = getUse_3icOptimize();
   if (use_3ic < 4 && use_3ic > 0) {
     auto g_stride = BM168x::getGlobalStride(N, C, H, W);
@@ -103,7 +103,9 @@ void tpu::LoadOp::codegen_local_bm1684x(int64_t n_step, int64_t c_step,
           s_stride.N, s_stride.H, gdma_format, true, GDMA_VALUE_DIR_S2L,
           pid_node);
     }
-  } else if (dhw <= eu_num && (C & 0xff) == 0 && data_type == DTYPE_INT8 &&
+  }
+#if 0
+  else if (dhw <= eu_num && (C & 0xff) == 0 && data_type == DTYPE_INT8 &&
              real_dslice == D && real_hslice == H && real_wslice == W &&
              real_cslice == C && N == 1) {
     // optimize coeff load shape
@@ -125,7 +127,9 @@ void tpu::LoadOp::codegen_local_bm1684x(int64_t n_step, int64_t c_step,
         N, C, H, W, nstride, cstride, hstride, wstride,
         dst_nstride, dst_cstride, dst_hstride, dst_wstride,
         gdma_format, GDMA_VALUE_DIR_S2L, 0, pid_node);
-  } else {
+  }
+#endif
+  else {
     int64_t c_num_local = ceiling_func(real_cslice, Arch::NPU_NUM);
     int64_t c_stride = gi.eu_align ? align_up(real_hslice * real_wslice,
                                               Arch::eu_num(fmt_bytes))
diff --git a/python/test/test_torch.py b/python/test/test_torch.py
index 74a66d484..d8b033133 100755
--- a/python/test/test_torch.py
+++ b/python/test/test_torch.py
@@ -47,7 +47,7 @@ def __init__(self,
             "Addmm":            (self.test_Addmm,             Y, Y, Y),
             "Arange":           (self.test_Arange,            Y, Y, Y),
             "Attention":        (self.test_Attention,         Y, Y, Y),
-            "AttentionNew":     (self.test_AttentionNew,      Y, N, N),
+            "AttentionNew":     (self.test_AttentionNew,      N, N, N),
             "AvgPool1d":        (self.test_AvgPool1d,         Y, Y, Y),
             "AvgPool2d":        (self.test_AvgPool2d,         Y, Y, Y),
             "AvgPool3d":        (self.test_AvgPool3d,         Y, Y, Y),
diff --git a/python/transform/TFLiteConverter.py b/python/transform/TFLiteConverter.py
index 8c1264ac9..87e939c57 100644
--- a/python/transform/TFLiteConverter.py
+++ b/python/transform/TFLiteConverter.py
@@ -240,6 +240,8 @@ def __init__(self,
         else:
             self.output_names = output_names
         self.input_shapes = [x.shape for x in self.graph.inputs]
+        for x in self.graph.inputs:
+            self.addShape(x.name, x.shape)
         self.output_shapes = []
         self.outputs = []
         for op in self.graph.operators:
@@ -248,6 +250,7 @@ def __init__(self,
                     self.outputs.append(out)
                     self.__nhwc2nchw(out)
                     self.output_shapes.append(out.shape)
+                    self.addShape(out.name, out.shape)
 
         self.mlir = MLIRImporter(
             self.input_shapes,
diff --git a/third_party/nntoolchain/lib/libbackend_1686.so b/third_party/nntoolchain/lib/libbackend_1686.so
index f9a54f4c4..4135e7242 100755
Binary files a/third_party/nntoolchain/lib/libbackend_1686.so and b/third_party/nntoolchain/lib/libbackend_1686.so differ