diff --git a/padiff/__init__.py b/padiff/__init__.py index 4ead7bd..d86c187 100644 --- a/padiff/__init__.py +++ b/padiff/__init__.py @@ -27,7 +27,11 @@ from .report.hooks import info_hook from .datas import global_json_laoder as jsons -from . import cinn_diff + +try: + from . import cinn_diff +except: + pass def module_filter(name): @@ -134,7 +138,7 @@ def __init__(self, method): self.__api__ = True def forward(self, *args, **kwargs): - return self._method(tensor_obj, *args, **kwargs) + return self._method(*args, **kwargs) def __str__(self): return self.__name__ @@ -152,7 +156,7 @@ def __init__(self, method): self.__api__ = True def forward(self, *args, **kwargs): - return self._method(tensor_obj, *args, **kwargs) + return self._method(*args, **kwargs) def __str__(self): return self.__name__ @@ -163,7 +167,7 @@ def __str__(self): else: raise RuntimeError("Required module_type is in [paddle, torch], but received {}".format(method_fullname)) - out = layer(*args, **kwargs) + out = layer(tensor_obj, *args, **kwargs) handle.remove() diff --git a/padiff/checker/actions.py b/padiff/checker/actions.py index 5b4eb4d..c38bda2 100644 --- a/padiff/checker/actions.py +++ b/padiff/checker/actions.py @@ -67,9 +67,9 @@ def __call__(self, file_list_0, file_list_1, cfg): assert len(file_list_0) == len( file_list_1 ), f"number of tensors for compare is not equal, {len(file_list_0)} vs {len(file_list_1)}" - for path_0, path_1 in zip(file_list_0, file_list_1): - tensor_0 = load_numpy(path_0) - tensor_1 = load_numpy(path_1) + for info_0, info_1 in zip(file_list_0, file_list_1): + tensor_0 = load_numpy(info_0["path"]) + tensor_1 = load_numpy(info_1["path"]) if tensor_0.size == 0 or tensor_1.size == 0: if tensor_0.size != tensor_1.size: raise RuntimeError("size of tensors is not equal") diff --git a/padiff/dump_tools.py b/padiff/dump_tools.py index ba22757..192bcb5 100644 --- a/padiff/dump_tools.py +++ b/padiff/dump_tools.py @@ -15,7 +15,8 @@ import json import os, sys import numpy -from 
.utils import Counter, reset_dir +import paddle +from .utils import Counter, frames_to_string, reset_dir dump_root_path = os.path.join(sys.path[0], "padiff_dump") @@ -79,14 +80,31 @@ def dump_report_node(wrap_node, tensor_dumper): "net_id": wrap_node.fwd_report.net_id, }, "children": [], + "stack": frames_to_string(wrap_node.fwd_report.frames), } for tensor in wrap_node.fwd_report.tensors_for_compare(): file_name = tensor_dumper(tensor.detach().numpy()) - node_info["fwd_outputs"].append(file_name) + node_info["fwd_outputs"].append( + { + "path": file_name, + "shape": str(tensor.shape), + "dtype": str(tensor.dtype), + "place": str(tensor.place) if isinstance(tensor, paddle.Tensor) else str(tensor.device), + "layout": str(tensor.layout), + } + ) for tensor in wrap_node.bwd_report.tensors_for_compare(): file_name = tensor_dumper(tensor.detach().numpy()) - node_info["bwd_grads"].append(file_name) + node_info["bwd_grads"].append( + { + "path": file_name, + "shape": str(tensor.shape), + "dtype": str(tensor.dtype), + "place": str(tensor.place) if isinstance(tensor, paddle.Tensor) else str(tensor.device), + "layout": str(tensor.layout), + } + ) for child in wrap_node.children: child_info = dump_report_node(child, tensor_dumper) diff --git a/padiff/report/hooks.py b/padiff/report/hooks.py index 23d3711..d8d2367 100644 --- a/padiff/report/hooks.py +++ b/padiff/report/hooks.py @@ -20,6 +20,7 @@ map_structure_and_replace_key, flatten, for_each_grad_tensor, + extract_frame_summary, ) import json import numpy @@ -114,10 +115,12 @@ def info_hook(model, input, output, net_id): else: _model = model + _, frames = extract_frame_summary() + new_in = clone_tensors(input) new_out = clone_tensors(output) - fwd_item = report.put_item("forward", new_in, new_out, _model, net_id) - bwd_item = report.put_item("backward", new_in, new_out, _model, net_id) + fwd_item = report.put_item("forward", new_in, new_out, _model, net_id, frames) + bwd_item = report.put_item("backward", new_in, new_out, 
_model, net_id, frames) bwd_item.set_forward(fwd_item) report.stack.push_api(_model, fwd_item, bwd_item) diff --git a/padiff/report/report.py b/padiff/report/report.py index 03bca17..aa96a4d 100644 --- a/padiff/report/report.py +++ b/padiff/report/report.py @@ -25,7 +25,7 @@ def __init__(self, marker): self.marker = marker self.stack = LayerStack() - def put_item(self, type_, input_, output, net, net_id): + def put_item(self, type_, input_, output, net, net_id, frames): step = self.counter.get_id() self.items.append( ReportItem( @@ -35,6 +35,7 @@ def put_item(self, type_, input_, output, net, net_id): output=output, net=net, net_id=net_id, # traversal order of sublayers + frames=frames, ) ) return self.items[-1] @@ -49,7 +50,7 @@ def __str__(self): class ReportItem: - def __init__(self, type_, step, input_, output, net, net_id): + def __init__(self, type_, step, input_, output, net, net_id, frames): assert type_ in [ "forward", "backward", @@ -65,6 +66,7 @@ def __init__(self, type_, step, input_, output, net, net_id): self.fwd_item = None # bound to another reportitem, if self.type is "backward" self.bwd_item = None # bound to another reportitem, if self.type is "forward" self.input_grads = self._gen_input_grads() + self.frames = frames def set_forward(self, fwd): assert self.type == "backward", "can't set forward for non-backward item." 
diff --git a/padiff/utils.py b/padiff/utils.py
index 1e31df0..eb2d77f 100644
--- a/padiff/utils.py
+++ b/padiff/utils.py
@@ -178,3 +178,70 @@ def get_id(self):
         ret = self.id
         self.id += 1
         return ret
+
+
+"""
+    tools for recording frame stack
+"""
+
+
+import os.path as osp
+import traceback
+
+
+def _is_system_package(filename):
+    """Return True if ``filename`` looks like framework, interpreter or padiff code.
+
+    The check is a plain substring match of each excluded location against
+    ``filename``, not a path-prefix comparison.
+    """
+    exclude = (
+        "lib/python",
+        "/usr/local",
+        osp.dirname(paddle.__file__),
+        osp.dirname(torch.__file__),
+        osp.dirname(__file__),  # exclude padiff
+    )
+    return any(pattern in filename for pattern in exclude)
+
+
+def extract_frame_summary():
+    """
+    Extract the current call stack with the traceback module.
+    The call information is gathered so it can be put into a ReportItem to help locate an error.
+
+    frame_summary:
+        line: line of the code
+        lineno: line number of the file
+        filename: file name of the stack
+        name: the function name.
+
+    Returns:
+        (last_user_fs, frame_summarys): the first frame, walking outwards from the
+        current one, whose file is not a system package, and the whole stack summary.
+
+    Raises:
+        AssertionError: if every frame on the stack belongs to a system package.
+    """
+    frame_summarys = traceback.StackSummary.extract(traceback.walk_stack(None))
+    last_user_fs = next((fs for fs in frame_summarys if not _is_system_package(fs.filename)), None)
+    if last_user_fs is None:
+        # Raised explicitly (not via assert) so the check survives python -O.
+        raise AssertionError("Error happened, no user frame found in the call stack.")
+    return last_user_fs, frame_summarys
+
+
+def frames_to_string(frames, indent=0):
+    """Render frame summaries as text: a "File ..." header and the source line per frame.
+
+    Args:
+        frames: iterable of objects exposing filename, lineno, name and line.
+        indent: spaces before each header line; the source line gets twice as many.
+
+    Returns:
+        The per-frame entries joined by newlines (an empty string for no frames).
+    """
+    pad = " " * indent
+    return "\n".join(
+        "{}File {}: {} {}\n{}{}{}".format(pad, f.filename, f.lineno, f.name, pad, pad, f.line) for f in frames
+    )
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 80c80ea..d2a188c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,3 +7,4 @@ pytest-cov
 regex
 pytest-xdist
 torchvision
+graphviz