From 37480a915e5ee73da3a74414e6b79e2747f3d81b Mon Sep 17 00:00:00 2001 From: Povilas Kanapickas Date: Sat, 4 Apr 2020 21:07:01 +0300 Subject: [PATCH 1/3] Add an option to parse only hunk positions --- tests/test_parser.py | 32 +++++++++++++++++ unidiff/patch.py | 82 ++++++++++++++++++++++++++++---------------- 2 files changed, 84 insertions(+), 30 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 9a438f8..8cebfe7 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -316,6 +316,38 @@ def test_diff_lines_linenos(self): self.assertEqual(source_line_nos, expected_source_line_nos) self.assertEqual(diff_line_nos, expected_diff_line_nos) + def test_diff_hunk_positions(self): + with open(self.sample_file, 'rb') as diff_file: + res = PatchSet(diff_file, encoding='utf-8') + self.do_test_diff_hunk_positions(res) + + def test_diff_hunk_positions_only_hunk_positions(self): + with open(self.sample_file, 'rb') as diff_file: + res = PatchSet(diff_file, encoding='utf-8', only_hunk_positions=True) + self.do_test_diff_hunk_positions(res) + + def do_test_diff_hunk_positions(self, res): + hunk_positions = [] + for diff_file in res: + for hunk in diff_file: + hunk_positions.append((hunk.source_start, hunk.target_start, + hunk.source_length, hunk.target_length)) + + expected_hunk_positions = [ + # File: 1, Hunk: 1 + (1, 1, 3, 9), + # File: 1, Hunk: 2 + (5, 11, 16, 10), + # File: 1, Hunk: 3 + (22, 22, 3, 7), + # File: 2, Hunk: 1 + (0, 1, 0, 9), + # File: 3, Hunk: 1 + (1, 0, 9, 0) + ] + + self.assertEqual(hunk_positions, expected_hunk_positions) + class TestVCSSamples(unittest.TestCase): """Tests for real examples from VCS.""" diff --git a/unidiff/patch.py b/unidiff/patch.py index 3d71581..d07bee1 100644 --- a/unidiff/patch.py +++ b/unidiff/patch.py @@ -229,7 +229,7 @@ def __str__(self): hunks = ''.join(unicode(hunk) for hunk in self) return info + source + target + hunks - def _parse_hunk(self, header, diff, encoding): + def _parse_hunk(self, header, diff, encoding, only_hunk_positions): """Parse hunk details.""" header_info = RE_HUNK_HEADER.match(header) hunk_info = header_info.groups() @@ -244,33 +244,53 @@ def _parse_hunk(self, header, diff, encoding): if encoding is not None: line = line.decode(encoding) - valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line) - if not valid_line: - valid_line = RE_HUNK_BODY_LINE.match(line) + if only_hunk_positions: + if not line: + line_type = LINE_TYPE_CONTEXT + else: + line_type = line[0] + + if line_type == LINE_TYPE_ADDED: + target_line_no += 1 + elif line_type == LINE_TYPE_REMOVED: + source_line_no += 1 + elif line_type == LINE_TYPE_CONTEXT: + target_line_no += 1 + source_line_no += 1 + elif line_type == LINE_TYPE_NO_NEWLINE: + pass + else: + raise UnidiffParseError('Hunk diff line expected: %s' % line) - if not valid_line: - raise UnidiffParseError('Hunk diff line expected: %s' % line) - - line_type = valid_line.group('line_type') - if line_type == LINE_TYPE_EMPTY: - line_type = LINE_TYPE_CONTEXT - value = valid_line.group('value') - original_line = Line(value, line_type=line_type) - if line_type == LINE_TYPE_ADDED: - original_line.target_line_no = target_line_no - target_line_no += 1 - elif line_type == LINE_TYPE_REMOVED: - original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_CONTEXT: - original_line.target_line_no = target_line_no - target_line_no += 1 - original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_NO_NEWLINE: - pass - else: original_line = None + else: + valid_line = RE_HUNK_BODY_LINE.match(line) + if not valid_line: + valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line) + + if not valid_line: + raise UnidiffParseError('Hunk diff line expected: %s' % line) + + line_type = valid_line.group('line_type') + if line_type == LINE_TYPE_EMPTY: + line_type = LINE_TYPE_CONTEXT + value = valid_line.group('value') + original_line = Line(value, line_type=line_type) + if line_type == LINE_TYPE_ADDED: + original_line.target_line_no = target_line_no + target_line_no += 1 + elif line_type == LINE_TYPE_REMOVED: + original_line.source_line_no = source_line_no + source_line_no += 1 + elif line_type == LINE_TYPE_CONTEXT: + original_line.target_line_no = target_line_no + target_line_no += 1 + original_line.source_line_no = source_line_no + source_line_no += 1 + elif line_type == LINE_TYPE_NO_NEWLINE: + pass + else: + original_line = None # stop parsing if we got past expected number of lines if (source_line_no > expected_source_end or @@ -360,7 +380,7 @@ def is_modified_file(self): class PatchSet(list): """A list of PatchedFiles.""" - def __init__(self, f, encoding=None): + def __init__(self, f, encoding=None, only_hunk_positions=False): super(PatchSet, self).__init__() # convert string inputs to StringIO objects @@ -370,7 +390,9 @@ def __init__(self, f, encoding=None): # make sure we pass an iterator object to parse data = iter(f) # if encoding is None, assume we are reading unicode data - self._parse(data, encoding=encoding) + # if only_hunk_positions is True, we perform only minimal parsing of lines within hunks. + # This is around 2.5-6 times faster than full parsing depending on Python version. + self._parse(data, encoding=encoding, only_hunk_positions=only_hunk_positions) def __repr__(self): return make_str('') % super(PatchSet, self).__repr__() @@ -378,7 +400,7 @@ def __repr__(self): def __str__(self): return ''.join(unicode(patched_file) for patched_file in self) - def _parse(self, diff, encoding): + def _parse(self, diff, encoding, only_hunk_positions): current_file = None patch_info = None @@ -449,7 +471,7 @@ def _parse(self, diff, encoding): if is_hunk_header: if current_file is None: raise UnidiffParseError('Unexpected hunk found: %s' % line) - current_file._parse_hunk(line, diff, encoding) + current_file._parse_hunk(line, diff, encoding, only_hunk_positions) continue # check for no newline marker From e9c8aa1fa3b9cc48f7e6dcd431e7fde9b23ff3b6 Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Tue, 7 Apr 2020 19:49:32 -0300 Subject: [PATCH 2/3] Refactoring (and renaming as) metadata_only option. --- tests/test_parser.py | 4 +-- unidiff/patch.py | 74 ++++++++++++++++++++++---------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 8cebfe7..964f15a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -321,9 +321,9 @@ def test_diff_hunk_positions(self): res = PatchSet(diff_file, encoding='utf-8') self.do_test_diff_hunk_positions(res) - def test_diff_hunk_positions_only_hunk_positions(self): + def test_diff_metadata_only(self): with open(self.sample_file, 'rb') as diff_file: - res = PatchSet(diff_file, encoding='utf-8', only_hunk_positions=True) + res = PatchSet(diff_file, encoding='utf-8', metadata_only=True) self.do_test_diff_hunk_positions(res) def do_test_diff_hunk_positions(self, res): diff --git a/unidiff/patch.py b/unidiff/patch.py index d07bee1..0369b78 100644 --- a/unidiff/patch.py +++ b/unidiff/patch.py @@ -229,7 +229,7 @@ def __str__(self): hunks = ''.join(unicode(hunk) for hunk in self) return info + source + target + hunks - def _parse_hunk(self, header, diff, encoding, only_hunk_positions): + def _parse_hunk(self, header, diff, encoding, metadata_only): """Parse hunk details.""" header_info = RE_HUNK_HEADER.match(header) hunk_info = header_info.groups() @@ -244,53 +244,52 @@ def _parse_hunk(self, header, diff, encoding, only_hunk_positions): if encoding is not None: line = line.decode(encoding) - if only_hunk_positions: - if not line: - line_type = LINE_TYPE_CONTEXT - else: - line_type = line[0] - - if line_type == LINE_TYPE_ADDED: - target_line_no += 1 - elif line_type == LINE_TYPE_REMOVED: - source_line_no += 1 - elif line_type == LINE_TYPE_CONTEXT: - target_line_no += 1 - source_line_no += 1 - elif line_type == LINE_TYPE_NO_NEWLINE: - pass - else: - raise UnidiffParseError('Hunk diff line expected: %s' % line) - + if metadata_only: + # quick line type detection, no regex required + line_type = line[0] if line else LINE_TYPE_CONTEXT + if line_type not in (LINE_TYPE_ADDED, + LINE_TYPE_REMOVED, + LINE_TYPE_CONTEXT, + LINE_TYPE_NO_NEWLINE): + raise UnidiffParseError( + 'Hunk diff line expected: %s' % line) + # no file contents tracking either original_line = None else: + # parse diff line content valid_line = RE_HUNK_BODY_LINE.match(line) if not valid_line: valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line) if not valid_line: - raise UnidiffParseError('Hunk diff line expected: %s' % line) + raise UnidiffParseError( + 'Hunk diff line expected: %s' % line) line_type = valid_line.group('line_type') if line_type == LINE_TYPE_EMPTY: line_type = LINE_TYPE_CONTEXT + value = valid_line.group('value') original_line = Line(value, line_type=line_type) - if line_type == LINE_TYPE_ADDED: + + if line_type == LINE_TYPE_ADDED: + if original_line is not None: original_line.target_line_no = target_line_no - target_line_no += 1 - elif line_type == LINE_TYPE_REMOVED: + target_line_no += 1 + elif line_type == LINE_TYPE_REMOVED: + if original_line is not None: original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_CONTEXT: + source_line_no += 1 + elif line_type == LINE_TYPE_CONTEXT: + if original_line is not None: original_line.target_line_no = target_line_no - target_line_no += 1 original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_NO_NEWLINE: - pass - else: - original_line = None + target_line_no += 1 + source_line_no += 1 + elif line_type == LINE_TYPE_NO_NEWLINE: + pass + else: + original_line = None # stop parsing if we got past expected number of lines if (source_line_no > expected_source_end or @@ -380,7 +379,7 @@ def is_modified_file(self): class PatchSet(list): """A list of PatchedFiles.""" - def __init__(self, f, encoding=None, only_hunk_positions=False): + def __init__(self, f, encoding=None, metadata_only=False): super(PatchSet, self).__init__() # convert string inputs to StringIO objects @@ -390,9 +389,10 @@ def __init__(self, f, encoding=None, only_hunk_positions=False): # make sure we pass an iterator object to parse data = iter(f) # if encoding is None, assume we are reading unicode data - # if only_hunk_positions is True, we perform only minimal parsing of lines within hunks. - # This is around 2.5-6 times faster than full parsing depending on Python version. - self._parse(data, encoding=encoding, only_hunk_positions=only_hunk_positions) + # when metadata_only is True, only perform a minimal metadata parsing + # (ie. hunks without content) which is around 2.5-6 times faster; + # it will still validate the diff metadata consistency + self._parse(data, encoding=encoding, metadata_only=metadata_only) def __repr__(self): return make_str('') % super(PatchSet, self).__repr__() @@ -400,7 +400,7 @@ def __repr__(self): def __str__(self): return ''.join(unicode(patched_file) for patched_file in self) - def _parse(self, diff, encoding, only_hunk_positions): + def _parse(self, diff, encoding, metadata_only): current_file = None patch_info = None @@ -471,7 +471,7 @@ def _parse(self, diff, encoding, only_hunk_positions): if is_hunk_header: if current_file is None: raise UnidiffParseError('Unexpected hunk found: %s' % line) - current_file._parse_hunk(line, diff, encoding, only_hunk_positions) + current_file._parse_hunk(line, diff, encoding, metadata_only) continue # check for no newline marker From 3273435e206ff776eaa8501952f76112e34068e6 Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Thu, 9 Apr 2020 19:32:27 -0300 Subject: [PATCH 3/3] Updated metadata_only to still get diff added/removed counts. --- bin/unidiff | 2 +- tests/test_parser.py | 10 ++++++-- unidiff/patch.py | 58 ++++++++++++++++++++++++++++++++------------ 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/bin/unidiff b/bin/unidiff index c5cb929..3d370e8 100755 --- a/bin/unidiff +++ b/bin/unidiff @@ -45,7 +45,7 @@ if __name__ == '__main__': if PY2: diff_file = codecs.getreader(encoding)(diff_file) - patch = PatchSet(diff_file) + patch = PatchSet(diff_file, metadata_only=(not args.show_diff)) if args.show_diff: print(patch) diff --git a/tests/test_parser.py b/tests/test_parser.py index 964f15a..841c1c1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -114,10 +114,10 @@ def test_print_hunks_without_gaps(self): self.assertEqual(lines[12], '@@ -5,16 +11,10 @@') self.assertEqual(lines[31], '@@ -22,3 +22,7 @@') - def test_parse_sample(self): + def _test_parse_sample(self, metadata_only): """Parse sample file.""" with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: - res = PatchSet(diff_file) + res = PatchSet(diff_file, metadata_only=metadata_only) # three file in the patch self.assertEqual(len(res), 3) @@ -164,6 +164,12 @@ def test_parse_sample(self): self.assertEqual(res.added, 21) self.assertEqual(res.removed, 17) + def test_parse_sample_full(self): + self._test_parse_sample(metadata_only=False) + + def test_parse_sample_metadata_only(self): + self._test_parse_sample(metadata_only=True) + def test_patchset_compare(self): with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: ps1 = PatchSet(diff_file) diff --git a/unidiff/patch.py b/unidiff/patch.py index 0369b78..0ce3dd8 100644 --- a/unidiff/patch.py +++ b/unidiff/patch.py @@ -141,6 +141,8 @@ def __init__(self, src_start=0, src_len=0, tgt_start=0, tgt_len=0, self.target_start = int(tgt_start) self.target_length = int(tgt_len) self.section_header = section_header + self._added = None + self._removed = None def __repr__(self): value = "" % (self.source_start, @@ -168,10 +170,18 @@ def append(self, line): @property def added(self): + if self._added is not None: + return self._added + # re-calculate each time to allow for hunk modifications + # (which should mean metadata_only switch wasn't used) return sum(1 for line in self if line.is_added) @property def removed(self): + if self._removed is not None: + return self._removed + # re-calculate each time to allow for hunk modifications + # (which should mean metadata_only switch wasn't used) return sum(1 for line in self if line.is_removed) def is_valid(self): @@ -239,6 +249,8 @@ def _parse_hunk(self, header, diff, encoding, metadata_only): target_line_no = hunk.target_start expected_source_end = source_line_no + hunk.source_length expected_target_end = target_line_no + hunk.target_length + added = 0 + removed = 0 for diff_line_no, line in diff: if encoding is not None: @@ -253,8 +265,20 @@ def _parse_hunk(self, header, diff, encoding, metadata_only): LINE_TYPE_NO_NEWLINE): raise UnidiffParseError( 'Hunk diff line expected: %s' % line) - # no file contents tracking either + + if line_type == LINE_TYPE_ADDED: + target_line_no += 1 + added += 1 + elif line_type == LINE_TYPE_REMOVED: + source_line_no += 1 + removed += 1 + elif line_type == LINE_TYPE_CONTEXT: + target_line_no += 1 + source_line_no += 1 + + # no file content tracking original_line = None + else: # parse diff line content valid_line = RE_HUNK_BODY_LINE.match(line) @@ -272,24 +296,21 @@ def _parse_hunk(self, header, diff, encoding, metadata_only): value = valid_line.group('value') original_line = Line(value, line_type=line_type) - if line_type == LINE_TYPE_ADDED: - if original_line is not None: + if line_type == LINE_TYPE_ADDED: original_line.target_line_no = target_line_no - target_line_no += 1 - elif line_type == LINE_TYPE_REMOVED: - if original_line is not None: + target_line_no += 1 + elif line_type == LINE_TYPE_REMOVED: original_line.source_line_no = source_line_no - source_line_no += 1 - elif line_type == LINE_TYPE_CONTEXT: - if original_line is not None: + source_line_no += 1 + elif line_type == LINE_TYPE_CONTEXT: original_line.target_line_no = target_line_no original_line.source_line_no = source_line_no - target_line_no += 1 - source_line_no += 1 - elif line_type == LINE_TYPE_NO_NEWLINE: - pass - else: - original_line = None + target_line_no += 1 + source_line_no += 1 + elif line_type == LINE_TYPE_NO_NEWLINE: + pass + else: + original_line = None # stop parsing if we got past expected number of lines if (source_line_no > expected_source_end or @@ -310,6 +331,11 @@ def _parse_hunk(self, header, diff, encoding, metadata_only): target_line_no < expected_target_end): raise UnidiffParseError('Hunk is shorter than expected') + if metadata_only: + # HACK: set fixed calculated values when metadata_only is enabled + hunk._added = added + hunk._removed = removed + self.append(hunk) def _add_no_newline_marker_to_last_hunk(self): @@ -391,7 +417,7 @@ def __init__(self, f, encoding=None, metadata_only=False): # if encoding is None, assume we are reading unicode data # when metadata_only is True, only perform a minimal metadata parsing # (ie. hunks without content) which is around 2.5-6 times faster; - # it will still validate the diff metadata consistency + # it will still validate the diff metadata consistency and get counts self._parse(data, encoding=encoding, metadata_only=metadata_only) def __repr__(self):