diff --git a/bin/unidiff b/bin/unidiff index c5cb929..3d370e8 100755 --- a/bin/unidiff +++ b/bin/unidiff @@ -45,7 +45,7 @@ if __name__ == '__main__': if PY2: diff_file = codecs.getreader(encoding)(diff_file) - patch = PatchSet(diff_file) + patch = PatchSet(diff_file, metadata_only=(not args.show_diff)) if args.show_diff: print(patch) diff --git a/tests/test_parser.py b/tests/test_parser.py index 9a438f8..841c1c1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -114,10 +114,10 @@ def test_print_hunks_without_gaps(self): self.assertEqual(lines[12], '@@ -5,16 +11,10 @@') self.assertEqual(lines[31], '@@ -22,3 +22,7 @@') - def test_parse_sample(self): + def _test_parse_sample(self, metadata_only): """Parse sample file.""" with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: - res = PatchSet(diff_file) + res = PatchSet(diff_file, metadata_only=metadata_only) # three file in the patch self.assertEqual(len(res), 3) @@ -164,6 +164,12 @@ def test_parse_sample(self): self.assertEqual(res.added, 21) self.assertEqual(res.removed, 17) + def test_parse_sample_full(self): + self._test_parse_sample(metadata_only=False) + + def test_parse_sample_metadata_only(self): + self._test_parse_sample(metadata_only=True) + def test_patchset_compare(self): with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: ps1 = PatchSet(diff_file) @@ -316,6 +322,38 @@ def test_diff_lines_linenos(self): self.assertEqual(source_line_nos, expected_source_line_nos) self.assertEqual(diff_line_nos, expected_diff_line_nos) + def test_diff_hunk_positions(self): + with open(self.sample_file, 'rb') as diff_file: + res = PatchSet(diff_file, encoding='utf-8') + self.do_test_diff_hunk_positions(res) + + def test_diff_metadata_only(self): + with open(self.sample_file, 'rb') as diff_file: + res = PatchSet(diff_file, encoding='utf-8', metadata_only=True) + self.do_test_diff_hunk_positions(res) + + def do_test_diff_hunk_positions(self, res): + hunk_positions = [] + for 
diff_file in res: + for hunk in diff_file: + hunk_positions.append((hunk.source_start, hunk.target_start, + hunk.source_length, hunk.target_length)) + + expected_hunk_positions = [ + # File: 1, Hunk: 1 + (1, 1, 3, 9), + # File: 1, Hunk: 2 + (5, 11, 16, 10), + # File: 1, Hunk: 3 + (22, 22, 3, 7), + # File: 2, Hunk: 1 + (0, 1, 0, 9), + # File: 3, Hunk: 1 + (1, 0, 9, 0) + ] + + self.assertEqual(hunk_positions, expected_hunk_positions) + class TestVCSSamples(unittest.TestCase): """Tests for real examples from VCS.""" diff --git a/unidiff/patch.py b/unidiff/patch.py index 507ca92..0ce3dd8 100644 --- a/unidiff/patch.py +++ b/unidiff/patch.py @@ -141,6 +141,8 @@ def __init__(self, src_start=0, src_len=0, tgt_start=0, tgt_len=0, self.target_start = int(tgt_start) self.target_length = int(tgt_len) self.section_header = section_header + self._added = None + self._removed = None def __repr__(self): value = "<Hunk: @@ %d,%d %d,%d @@ %s>" % (self.source_start, @@ -168,10 +170,18 @@ def append(self, line): @property def added(self): + if self._added is not None: + return self._added + # re-calculate each time to allow for hunk modifications + # (which should mean metadata_only switch wasn't used) return sum(1 for line in self if line.is_added) @property def removed(self): + if self._removed is not None: + return self._removed + # re-calculate each time to allow for hunk modifications + # (which should mean metadata_only switch wasn't used) return sum(1 for line in self if line.is_removed) def is_valid(self): @@ -229,7 +239,7 @@ def __str__(self): hunks = ''.join(unicode(hunk) for hunk in self) return info + source + target + hunks - def _parse_hunk(self, header, diff, encoding): + def _parse_hunk(self, header, diff, encoding, metadata_only): """Parse hunk details.""" header_info = RE_HUNK_HEADER.match(header) hunk_info = header_info.groups() @@ -239,39 +249,69 @@ def _parse_hunk(self, header, diff, encoding): target_line_no = hunk.target_start expected_source_end = source_line_no + hunk.source_length
expected_target_end = target_line_no + hunk.target_length + added = 0 + removed = 0 for diff_line_no, line in diff: if encoding is not None: line = line.decode(encoding) - valid_line = RE_HUNK_BODY_LINE.match(line) - if not valid_line: - valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line) - - if not valid_line: - raise UnidiffParseError('Hunk diff line expected: %s' % line) - - line_type = valid_line.group('line_type') - if line_type == LINE_TYPE_EMPTY: - line_type = LINE_TYPE_CONTEXT - value = valid_line.group('value') - original_line = Line(value, line_type=line_type) - if line_type == LINE_TYPE_ADDED: - original_line.target_line_no = target_line_no - target_line_no += 1 - elif line_type == LINE_TYPE_REMOVED: - original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_CONTEXT: - original_line.target_line_no = target_line_no - target_line_no += 1 - original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_NO_NEWLINE: - pass - else: + if metadata_only: + # quick line type detection, no regex required + line_type = line[0] if line else LINE_TYPE_CONTEXT + if line_type not in (LINE_TYPE_ADDED, + LINE_TYPE_REMOVED, + LINE_TYPE_CONTEXT, + LINE_TYPE_NO_NEWLINE): + raise UnidiffParseError( + 'Hunk diff line expected: %s' % line) + + if line_type == LINE_TYPE_ADDED: + target_line_no += 1 + added += 1 + elif line_type == LINE_TYPE_REMOVED: + source_line_no += 1 + removed += 1 + elif line_type == LINE_TYPE_CONTEXT: + target_line_no += 1 + source_line_no += 1 + + # no file content tracking original_line = None + else: + # parse diff line content + valid_line = RE_HUNK_BODY_LINE.match(line) + if not valid_line: + valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line) + + if not valid_line: + raise UnidiffParseError( + 'Hunk diff line expected: %s' % line) + + line_type = valid_line.group('line_type') + if line_type == LINE_TYPE_EMPTY: + line_type = LINE_TYPE_CONTEXT + + value = valid_line.group('value') + 
original_line = Line(value, line_type=line_type) + + if line_type == LINE_TYPE_ADDED: + original_line.target_line_no = target_line_no + target_line_no += 1 + elif line_type == LINE_TYPE_REMOVED: + original_line.source_line_no = source_line_no + source_line_no += 1 + elif line_type == LINE_TYPE_CONTEXT: + original_line.target_line_no = target_line_no + original_line.source_line_no = source_line_no + target_line_no += 1 + source_line_no += 1 + elif line_type == LINE_TYPE_NO_NEWLINE: + pass + else: + original_line = None + # stop parsing if we got past expected number of lines if (source_line_no > expected_source_end or target_line_no > expected_target_end): @@ -291,6 +331,11 @@ def _parse_hunk(self, header, diff, encoding): target_line_no < expected_target_end): raise UnidiffParseError('Hunk is shorter than expected') + if metadata_only: + # HACK: set fixed calculated values when metadata_only is enabled + hunk._added = added + hunk._removed = removed + self.append(hunk) def _add_no_newline_marker_to_last_hunk(self): @@ -360,7 +405,7 @@ def is_modified_file(self): class PatchSet(list): """A list of PatchedFiles.""" - def __init__(self, f, encoding=None): + def __init__(self, f, encoding=None, metadata_only=False): super(PatchSet, self).__init__() # convert string inputs to StringIO objects @@ -370,7 +415,10 @@ def __init__(self, f, encoding=None): # make sure we pass an iterator object to parse data = iter(f) # if encoding is None, assume we are reading unicode data - self._parse(data, encoding=encoding) + # when metadata_only is True, only perform a minimal metadata parsing + # (ie. 
hunks without content) which is around 2.5-6 times faster; + # it will still validate the diff metadata consistency and get counts + self._parse(data, encoding=encoding, metadata_only=metadata_only) def __repr__(self): return make_str('<PatchSet: %s>') % super(PatchSet, self).__repr__() @@ -378,7 +426,7 @@ def __repr__(self): def __str__(self): return ''.join(unicode(patched_file) for patched_file in self) - def _parse(self, diff, encoding): + def _parse(self, diff, encoding, metadata_only): current_file = None patch_info = None @@ -449,7 +497,7 @@ def _parse(self, diff, encoding): if is_hunk_header: if current_file is None: raise UnidiffParseError('Unexpected hunk found: %s' % line) - current_file._parse_hunk(line, diff, encoding) + current_file._parse_hunk(line, diff, encoding, metadata_only) continue # check for no newline marker