Skip to content

Commit

Permalink
update export json template (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
feifei-111 authored Feb 20, 2024
1 parent a3b6d3e commit 6d6ae08
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 14 deletions.
12 changes: 8 additions & 4 deletions padiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@

from .report.hooks import info_hook
from .datas import global_json_laoder as jsons
from . import cinn_diff

# cinn_diff is an optional feature with heavy/optional dependencies; if it
# cannot be imported, degrade gracefully instead of failing all of padiff.
try:
    from . import cinn_diff
except Exception:  # narrowed from bare `except:` so SystemExit/KeyboardInterrupt still propagate
    pass


def module_filter(name):
Expand Down Expand Up @@ -134,7 +138,7 @@ def __init__(self, method):
self.__api__ = True

def forward(self, *args, **kwargs):
return self._method(tensor_obj, *args, **kwargs)
return self._method(*args, **kwargs)

def __str__(self):
return self.__name__
Expand All @@ -152,7 +156,7 @@ def __init__(self, method):
self.__api__ = True

def forward(self, *args, **kwargs):
return self._method(tensor_obj, *args, **kwargs)
return self._method(*args, **kwargs)

def __str__(self):
return self.__name__
Expand All @@ -163,7 +167,7 @@ def __str__(self):
else:
raise RuntimeError("Required module_type is in [paddle, torch], but received {}".format(method_fullname))

out = layer(*args, **kwargs)
out = layer(tensor_obj, *args, **kwargs)

handle.remove()

Expand Down
6 changes: 3 additions & 3 deletions padiff/checker/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def __call__(self, file_list_0, file_list_1, cfg):
assert len(file_list_0) == len(
file_list_1
), f"number of tensors for compare is not equal, {len(file_list_0)} vs {len(file_list_1)}"
for path_0, path_1 in zip(file_list_0, file_list_1):
tensor_0 = load_numpy(path_0)
tensor_1 = load_numpy(path_1)
for info_0, info_1 in zip(file_list_0, file_list_1):
tensor_0 = load_numpy(info_0["path"])
tensor_1 = load_numpy(info_1["path"])
if tensor_0.size == 0 or tensor_1.size == 0:
if tensor_0.size != tensor_1.size:
raise RuntimeError("size of tensors is not equal")
Expand Down
24 changes: 21 additions & 3 deletions padiff/dump_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
import json
import os, sys
import numpy
from .utils import Counter, reset_dir
import paddle
from .utils import Counter, frames_to_string, reset_dir


dump_root_path = os.path.join(sys.path[0], "padiff_dump")
Expand Down Expand Up @@ -79,14 +80,31 @@ def dump_report_node(wrap_node, tensor_dumper):
"net_id": wrap_node.fwd_report.net_id,
},
"children": [],
"stack": frames_to_string(wrap_node.fwd_report.frames),
}
for tensor in wrap_node.fwd_report.tensors_for_compare():
file_name = tensor_dumper(tensor.detach().numpy())
node_info["fwd_outputs"].append(file_name)
node_info["fwd_outputs"].append(
{
"path": file_name,
"shape": str(tensor.shape),
"dtype": str(tensor.dtype),
"place": str(tensor.place) if isinstance(tensor, paddle.Tensor) else str(tensor.device),
"layout": str(tensor.layout),
}
)

for tensor in wrap_node.bwd_report.tensors_for_compare():
file_name = tensor_dumper(tensor.detach().numpy())
node_info["bwd_grads"].append(file_name)
node_info["bwd_grads"].append(
{
"path": file_name,
"shape": str(tensor.shape),
"dtype": str(tensor.dtype),
"place": str(tensor.place) if isinstance(tensor, paddle.Tensor) else str(tensor.device),
"layout": str(tensor.layout),
}
)

for child in wrap_node.children:
child_info = dump_report_node(child, tensor_dumper)
Expand Down
7 changes: 5 additions & 2 deletions padiff/report/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
map_structure_and_replace_key,
flatten,
for_each_grad_tensor,
extract_frame_summary,
)
import json
import numpy
Expand Down Expand Up @@ -114,10 +115,12 @@ def info_hook(model, input, output, net_id):
else:
_model = model

_, frames = extract_frame_summary()

new_in = clone_tensors(input)
new_out = clone_tensors(output)
fwd_item = report.put_item("forward", new_in, new_out, _model, net_id)
bwd_item = report.put_item("backward", new_in, new_out, _model, net_id)
fwd_item = report.put_item("forward", new_in, new_out, _model, net_id, frames)
bwd_item = report.put_item("backward", new_in, new_out, _model, net_id, frames)
bwd_item.set_forward(fwd_item)

report.stack.push_api(_model, fwd_item, bwd_item)
Expand Down
6 changes: 4 additions & 2 deletions padiff/report/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, marker):
self.marker = marker
self.stack = LayerStack()

def put_item(self, type_, input_, output, net, net_id):
def put_item(self, type_, input_, output, net, net_id, frames):
step = self.counter.get_id()
self.items.append(
ReportItem(
Expand All @@ -35,6 +35,7 @@ def put_item(self, type_, input_, output, net, net_id):
output=output,
net=net,
net_id=net_id, # traversal order of sublayers
frames=frames,
)
)
return self.items[-1]
Expand All @@ -49,7 +50,7 @@ def __str__(self):


class ReportItem:
def __init__(self, type_, step, input_, output, net, net_id):
def __init__(self, type_, step, input_, output, net, net_id, frames):
assert type_ in [
"forward",
"backward",
Expand All @@ -65,6 +66,7 @@ def __init__(self, type_, step, input_, output, net, net_id):
self.fwd_item = None # bound to another reportitem, if self.type is "backward"
self.bwd_item = None # bound to another reportitem, if self.type is "forward"
self.input_grads = self._gen_input_grads()
self.frames = frames

def set_forward(self, fwd):
assert self.type == "backward", "can't set forward for non-backward item."
Expand Down
54 changes: 54 additions & 0 deletions padiff/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,57 @@ def get_id(self):
ret = self.id
self.id += 1
return ret


"""
tools for recording frame stack
"""


import os.path as osp
import traceback


def _is_system_package(filename):
    """Return True if *filename* lives inside a system/framework package.

    Used while walking the call stack to skip frames from python's own lib
    directories, /usr/local, paddle, torch and padiff itself, so that only
    user-code frames are reported.
    """
    excluded_roots = (
        "lib/python",
        "/usr/local",
        osp.dirname(paddle.__file__),
        osp.dirname(torch.__file__),
        osp.dirname(__file__),  # exclude padiff
    )
    return any(root in filename for root in excluded_roots)


def extract_frame_summary():
    """Extract the current call stack via the traceback module.

    Gathers the call information so it can be stored in a ReportItem and
    help locate errors later. Each frame summary exposes:
        line: source text of the code line
        lineno: line number within the file
        filename: file name of the frame
        name: the function name

    Returns:
        tuple: ``(last_user_fs, frame_summarys)`` — the innermost frame that
        is NOT inside a system/framework package (see ``_is_system_package``),
        and the full stack summary.

    Raises:
        AssertionError: if every frame belongs to a system package.
    """
    frame_summarys = traceback.StackSummary.extract(traceback.walk_stack(None))
    last_user_fs = None
    for fs in frame_summarys:
        # keep the first (innermost) frame that comes from user code
        if not _is_system_package(fs.filename):
            last_user_fs = fs
            break
    # fixed typo in the message ("happend" -> "happened")
    assert last_user_fs is not None, "Error happened, can't return None."
    return last_user_fs, frame_summarys


def frames_to_string(frames, indent=0):
    """Render frame summaries as a readable multi-line stack string.

    Args:
        frames: iterable of frame summaries (objects exposing ``filename``,
            ``lineno``, ``name`` and ``line`` attributes).
        indent: number of leading spaces before each "File ..." line; the
            source line beneath it is indented twice as much.

    Returns:
        str: the formatted stack, one "File ...\\n<source line>" entry per
        frame, joined by newlines ("" for an empty iterable).
    """
    pad = " " * indent  # separate name: don't rebind/shadow the `indent` parameter
    return "\n".join(
        "{}File {}: {} {}\n{}{}{}".format(pad, f.filename, f.lineno, f.name, pad, pad, f.line)
        for f in frames
    )
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ pytest-cov
regex
pytest-xdist
torchvision
graphviz

0 comments on commit 6d6ae08

Please sign in to comment.