From 37480a915e5ee73da3a74414e6b79e2747f3d81b Mon Sep 17 00:00:00 2001
From: Povilas Kanapickas <povilas@radix.lt>
Date: Sat, 4 Apr 2020 21:07:01 +0300
Subject: [PATCH 1/3] Add an option to parse only hunk positions

---
 tests/test_parser.py | 32 +++++++++++++++++
 unidiff/patch.py     | 82 ++++++++++++++++++++++++++++----------------
 2 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/tests/test_parser.py b/tests/test_parser.py
index 9a438f8..8cebfe7 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -316,6 +316,38 @@ def test_diff_lines_linenos(self):
         self.assertEqual(source_line_nos, expected_source_line_nos)
         self.assertEqual(diff_line_nos, expected_diff_line_nos)
 
+    def test_diff_hunk_positions(self):
+        with open(self.sample_file, 'rb') as diff_file:
+            res = PatchSet(diff_file, encoding='utf-8')
+        self.do_test_diff_hunk_positions(res)
+
+    def test_diff_hunk_positions_only_hunk_positions(self):
+        with open(self.sample_file, 'rb') as diff_file:
+            res = PatchSet(diff_file, encoding='utf-8', only_hunk_positions=True)
+        self.do_test_diff_hunk_positions(res)
+
+    def do_test_diff_hunk_positions(self, res):
+        hunk_positions = []
+        for diff_file in res:
+            for hunk in diff_file:
+                hunk_positions.append((hunk.source_start, hunk.target_start,
+                                       hunk.source_length, hunk.target_length))
+
+        expected_hunk_positions = [
+            # File: 1, Hunk: 1
+            (1, 1, 3, 9),
+            # File: 1, Hunk: 2
+            (5, 11, 16, 10),
+            # File: 1, Hunk: 3
+            (22, 22, 3, 7),
+            # File: 2, Hunk: 1
+            (0, 1, 0, 9),
+            # File: 3, Hunk: 1
+            (1, 0, 9, 0)
+        ]
+
+        self.assertEqual(hunk_positions, expected_hunk_positions)
+
 
 class TestVCSSamples(unittest.TestCase):
     """Tests for real examples from VCS."""
diff --git a/unidiff/patch.py b/unidiff/patch.py
index 3d71581..d07bee1 100644
--- a/unidiff/patch.py
+++ b/unidiff/patch.py
@@ -229,7 +229,7 @@ def __str__(self):
         hunks = ''.join(unicode(hunk) for hunk in self)
         return info + source + target + hunks
 
-    def _parse_hunk(self, header, diff, encoding):
+    def _parse_hunk(self, header, diff, encoding, only_hunk_positions):
         """Parse hunk details."""
         header_info = RE_HUNK_HEADER.match(header)
         hunk_info = header_info.groups()
@@ -244,33 +244,53 @@ def _parse_hunk(self, header, diff, encoding):
             if encoding is not None:
                 line = line.decode(encoding)
 
-            valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)
-            if not valid_line:
-                valid_line = RE_HUNK_BODY_LINE.match(line)
+            if only_hunk_positions:
+                if not line:
+                    line_type = LINE_TYPE_CONTEXT
+                else:
+                    line_type = line[0]
+
+                if line_type == LINE_TYPE_ADDED:
+                    target_line_no += 1
+                elif line_type == LINE_TYPE_REMOVED:
+                    source_line_no += 1
+                elif line_type == LINE_TYPE_CONTEXT:
+                    target_line_no += 1
+                    source_line_no += 1
+                elif line_type == LINE_TYPE_NO_NEWLINE:
+                    pass
+                else:
+                    raise UnidiffParseError('Hunk diff line expected: %s' % line)
 
-            if not valid_line:
-                raise UnidiffParseError('Hunk diff line expected: %s' % line)
-
-            line_type = valid_line.group('line_type')
-            if line_type == LINE_TYPE_EMPTY:
-                line_type = LINE_TYPE_CONTEXT
-            value = valid_line.group('value')
-            original_line = Line(value, line_type=line_type)
-            if line_type == LINE_TYPE_ADDED:
-                original_line.target_line_no = target_line_no
-                target_line_no += 1
-            elif line_type == LINE_TYPE_REMOVED:
-                original_line.source_line_no = source_line_no
-                source_line_no += 1
-            elif line_type == LINE_TYPE_CONTEXT:
-                original_line.target_line_no = target_line_no
-                target_line_no += 1
-                original_line.source_line_no = source_line_no
-                source_line_no += 1
-            elif line_type == LINE_TYPE_NO_NEWLINE:
-                pass
-            else:
                 original_line = None
+            else:
+                valid_line = RE_HUNK_BODY_LINE.match(line)
+                if not valid_line:
+                    valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)
+
+                if not valid_line:
+                    raise UnidiffParseError('Hunk diff line expected: %s' % line)
+
+                line_type = valid_line.group('line_type')
+                if line_type == LINE_TYPE_EMPTY:
+                    line_type = LINE_TYPE_CONTEXT
+                value = valid_line.group('value')
+                original_line = Line(value, line_type=line_type)
+                if line_type == LINE_TYPE_ADDED:
+                    original_line.target_line_no = target_line_no
+                    target_line_no += 1
+                elif line_type == LINE_TYPE_REMOVED:
+                    original_line.source_line_no = source_line_no
+                    source_line_no += 1
+                elif line_type == LINE_TYPE_CONTEXT:
+                    original_line.target_line_no = target_line_no
+                    target_line_no += 1
+                    original_line.source_line_no = source_line_no
+                    source_line_no += 1
+                elif line_type == LINE_TYPE_NO_NEWLINE:
+                    pass
+                else:
+                    original_line = None
 
             # stop parsing if we got past expected number of lines
             if (source_line_no > expected_source_end or
@@ -360,7 +380,7 @@ def is_modified_file(self):
 class PatchSet(list):
     """A list of PatchedFiles."""
 
-    def __init__(self, f, encoding=None):
+    def __init__(self, f, encoding=None, only_hunk_positions=False):
         super(PatchSet, self).__init__()
 
         # convert string inputs to StringIO objects
@@ -370,7 +390,9 @@ def __init__(self, f, encoding=None):
         # make sure we pass an iterator object to parse
         data = iter(f)
         # if encoding is None, assume we are reading unicode data
-        self._parse(data, encoding=encoding)
+        # if only_hunk_positions is True, we perform only minimal parsing of lines within hunks.
+        # This is around 2.5-6 times faster than full parsing depending on Python version.
+        self._parse(data, encoding=encoding, only_hunk_positions=only_hunk_positions)
 
     def __repr__(self):
         return make_str('<PatchSet: %s>') % super(PatchSet, self).__repr__()
@@ -378,7 +400,7 @@ def __repr__(self):
     def __str__(self):
         return ''.join(unicode(patched_file) for patched_file in self)
 
-    def _parse(self, diff, encoding):
+    def _parse(self, diff, encoding, only_hunk_positions):
         current_file = None
         patch_info = None
 
@@ -449,7 +471,7 @@ def _parse(self, diff, encoding):
             if is_hunk_header:
                 if current_file is None:
                     raise UnidiffParseError('Unexpected hunk found: %s' % line)
-                current_file._parse_hunk(line, diff, encoding)
+                current_file._parse_hunk(line, diff, encoding, only_hunk_positions)
                 continue
 
             # check for no newline marker

From e9c8aa1fa3b9cc48f7e6dcd431e7fde9b23ff3b6 Mon Sep 17 00:00:00 2001
From: Matias Bordese <mbordese@gmail.com>
Date: Tue, 7 Apr 2020 19:49:32 -0300
Subject: [PATCH 2/3] Refactoring (and renaming as) metadata_only option.

---
 tests/test_parser.py |  4 +--
 unidiff/patch.py     | 74 ++++++++++++++++++++++----------------------
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/tests/test_parser.py b/tests/test_parser.py
index 8cebfe7..964f15a 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -321,9 +321,9 @@ def test_diff_hunk_positions(self):
             res = PatchSet(diff_file, encoding='utf-8')
         self.do_test_diff_hunk_positions(res)
 
-    def test_diff_hunk_positions_only_hunk_positions(self):
+    def test_diff_metadata_only(self):
         with open(self.sample_file, 'rb') as diff_file:
-            res = PatchSet(diff_file, encoding='utf-8', only_hunk_positions=True)
+            res = PatchSet(diff_file, encoding='utf-8', metadata_only=True)
         self.do_test_diff_hunk_positions(res)
 
     def do_test_diff_hunk_positions(self, res):
diff --git a/unidiff/patch.py b/unidiff/patch.py
index d07bee1..0369b78 100644
--- a/unidiff/patch.py
+++ b/unidiff/patch.py
@@ -229,7 +229,7 @@ def __str__(self):
         hunks = ''.join(unicode(hunk) for hunk in self)
         return info + source + target + hunks
 
-    def _parse_hunk(self, header, diff, encoding, only_hunk_positions):
+    def _parse_hunk(self, header, diff, encoding, metadata_only):
         """Parse hunk details."""
         header_info = RE_HUNK_HEADER.match(header)
         hunk_info = header_info.groups()
@@ -244,53 +244,52 @@ def _parse_hunk(self, header, diff, encoding, only_hunk_positions):
             if encoding is not None:
                 line = line.decode(encoding)
 
-            if only_hunk_positions:
-                if not line:
-                    line_type = LINE_TYPE_CONTEXT
-                else:
-                    line_type = line[0]
-
-                if line_type == LINE_TYPE_ADDED:
-                    target_line_no += 1
-                elif line_type == LINE_TYPE_REMOVED:
-                    source_line_no += 1
-                elif line_type == LINE_TYPE_CONTEXT:
-                    target_line_no += 1
-                    source_line_no += 1
-                elif line_type == LINE_TYPE_NO_NEWLINE:
-                    pass
-                else:
-                    raise UnidiffParseError('Hunk diff line expected: %s' % line)
-
+            if metadata_only:
+                # quick line type detection, no regex required
+                line_type = line[0] if line else LINE_TYPE_CONTEXT
+                if line_type not in (LINE_TYPE_ADDED,
+                                     LINE_TYPE_REMOVED,
+                                     LINE_TYPE_CONTEXT,
+                                     LINE_TYPE_NO_NEWLINE):
+                    raise UnidiffParseError(
+                        'Hunk diff line expected: %s' % line)
+                # no file contents tracking either
                 original_line = None
             else:
+                # parse diff line content
                 valid_line = RE_HUNK_BODY_LINE.match(line)
                 if not valid_line:
                     valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)
 
                 if not valid_line:
-                    raise UnidiffParseError('Hunk diff line expected: %s' % line)
+                    raise UnidiffParseError(
+                        'Hunk diff line expected: %s' % line)
 
                 line_type = valid_line.group('line_type')
                 if line_type == LINE_TYPE_EMPTY:
                     line_type = LINE_TYPE_CONTEXT
+
                 value = valid_line.group('value')
                 original_line = Line(value, line_type=line_type)
-                if line_type == LINE_TYPE_ADDED:
+
+            if line_type == LINE_TYPE_ADDED:
+                if original_line is not None:
                     original_line.target_line_no = target_line_no
-                    target_line_no += 1
-                elif line_type == LINE_TYPE_REMOVED:
+                target_line_no += 1
+            elif line_type == LINE_TYPE_REMOVED:
+                if original_line is not None:
                     original_line.source_line_no = source_line_no
-                    source_line_no += 1
-                elif line_type == LINE_TYPE_CONTEXT:
+                source_line_no += 1
+            elif line_type == LINE_TYPE_CONTEXT:
+                if original_line is not None:
                     original_line.target_line_no = target_line_no
-                    target_line_no += 1
                     original_line.source_line_no = source_line_no
-                    source_line_no += 1
-                elif line_type == LINE_TYPE_NO_NEWLINE:
-                    pass
-                else:
-                    original_line = None
+                target_line_no += 1
+                source_line_no += 1
+            elif line_type == LINE_TYPE_NO_NEWLINE:
+                pass
+            else:
+                original_line = None
 
             # stop parsing if we got past expected number of lines
             if (source_line_no > expected_source_end or
@@ -380,7 +379,7 @@ def is_modified_file(self):
 class PatchSet(list):
     """A list of PatchedFiles."""
 
-    def __init__(self, f, encoding=None, only_hunk_positions=False):
+    def __init__(self, f, encoding=None, metadata_only=False):
         super(PatchSet, self).__init__()
 
         # convert string inputs to StringIO objects
@@ -390,9 +389,10 @@ def __init__(self, f, encoding=None, only_hunk_positions=False):
         # make sure we pass an iterator object to parse
         data = iter(f)
         # if encoding is None, assume we are reading unicode data
-        # if only_hunk_positions is True, we perform only minimal parsing of lines within hunks.
-        # This is around 2.5-6 times faster than full parsing depending on Python version.
-        self._parse(data, encoding=encoding, only_hunk_positions=only_hunk_positions)
+        # when metadata_only is True, only perform a minimal metadata parsing
+        # (ie. hunks without content) which is around 2.5-6 times faster;
+        # it will still validate the diff metadata consistency
+        self._parse(data, encoding=encoding, metadata_only=metadata_only)
 
     def __repr__(self):
         return make_str('<PatchSet: %s>') % super(PatchSet, self).__repr__()
@@ -400,7 +400,7 @@ def __repr__(self):
     def __str__(self):
         return ''.join(unicode(patched_file) for patched_file in self)
 
-    def _parse(self, diff, encoding, only_hunk_positions):
+    def _parse(self, diff, encoding, metadata_only):
         current_file = None
         patch_info = None
 
@@ -471,7 +471,7 @@ def _parse(self, diff, encoding, only_hunk_positions):
             if is_hunk_header:
                 if current_file is None:
                     raise UnidiffParseError('Unexpected hunk found: %s' % line)
-                current_file._parse_hunk(line, diff, encoding, only_hunk_positions)
+                current_file._parse_hunk(line, diff, encoding, metadata_only)
                 continue
 
             # check for no newline marker

From 3273435e206ff776eaa8501952f76112e34068e6 Mon Sep 17 00:00:00 2001
From: Matias Bordese <mbordese@gmail.com>
Date: Thu, 9 Apr 2020 19:32:27 -0300
Subject: [PATCH 3/3] Updated metadata_only to still get diff added/removed
 counts.

---
 bin/unidiff          |  2 +-
 tests/test_parser.py | 10 ++++++--
 unidiff/patch.py     | 58 ++++++++++++++++++++++++++++++++------------
 3 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/bin/unidiff b/bin/unidiff
index c5cb929..3d370e8 100755
--- a/bin/unidiff
+++ b/bin/unidiff
@@ -45,7 +45,7 @@ if __name__ == '__main__':
     if PY2:
         diff_file = codecs.getreader(encoding)(diff_file)
 
-    patch = PatchSet(diff_file)
+    patch = PatchSet(diff_file, metadata_only=(not args.show_diff))
 
     if args.show_diff:
         print(patch)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 964f15a..841c1c1 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -114,10 +114,10 @@ def test_print_hunks_without_gaps(self):
         self.assertEqual(lines[12], '@@ -5,16 +11,10 @@')
         self.assertEqual(lines[31], '@@ -22,3 +22,7 @@')
 
-    def test_parse_sample(self):
+    def _test_parse_sample(self, metadata_only):
         """Parse sample file."""
         with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
-            res = PatchSet(diff_file)
+            res = PatchSet(diff_file, metadata_only=metadata_only)
 
         # three file in the patch
         self.assertEqual(len(res), 3)
@@ -164,6 +164,12 @@ def test_parse_sample(self):
         self.assertEqual(res.added, 21)
         self.assertEqual(res.removed, 17)
 
+    def test_parse_sample_full(self):
+        self._test_parse_sample(metadata_only=False)
+
+    def test_parse_sample_metadata_only(self):
+        self._test_parse_sample(metadata_only=True)
+
     def test_patchset_compare(self):
         with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
             ps1 = PatchSet(diff_file)
diff --git a/unidiff/patch.py b/unidiff/patch.py
index 0369b78..0ce3dd8 100644
--- a/unidiff/patch.py
+++ b/unidiff/patch.py
@@ -141,6 +141,8 @@ def __init__(self, src_start=0, src_len=0, tgt_start=0, tgt_len=0,
         self.target_start = int(tgt_start)
         self.target_length = int(tgt_len)
         self.section_header = section_header
+        self._added = None
+        self._removed = None
 
     def __repr__(self):
         value = "<Hunk: @@ %d,%d %d,%d @@ %s>" % (self.source_start,
@@ -168,10 +170,18 @@ def append(self, line):
 
     @property
     def added(self):
+        if self._added is not None:
+            return self._added
+        # no cached count (hunk not parsed with metadata_only): count the
+        # lines on every access so later hunk modifications are reflected
         return sum(1 for line in self if line.is_added)
 
     @property
     def removed(self):
+        if self._removed is not None:
+            return self._removed
+        # no cached count (hunk not parsed with metadata_only): count the
+        # lines on every access so later hunk modifications are reflected
         return sum(1 for line in self if line.is_removed)
 
     def is_valid(self):
@@ -239,6 +249,8 @@ def _parse_hunk(self, header, diff, encoding, metadata_only):
         target_line_no = hunk.target_start
         expected_source_end = source_line_no + hunk.source_length
         expected_target_end = target_line_no + hunk.target_length
+        added = 0
+        removed = 0
 
         for diff_line_no, line in diff:
             if encoding is not None:
@@ -253,8 +265,20 @@ def _parse_hunk(self, header, diff, encoding, metadata_only):
                                      LINE_TYPE_NO_NEWLINE):
                     raise UnidiffParseError(
                         'Hunk diff line expected: %s' % line)
-                # no file contents tracking either
+
+                if line_type == LINE_TYPE_ADDED:
+                    target_line_no += 1
+                    added += 1
+                elif line_type == LINE_TYPE_REMOVED:
+                    source_line_no += 1
+                    removed += 1
+                elif line_type == LINE_TYPE_CONTEXT:
+                    target_line_no += 1
+                    source_line_no += 1
+
+                # no file content tracking
                 original_line = None
+
             else:
                 # parse diff line content
                 valid_line = RE_HUNK_BODY_LINE.match(line)
@@ -272,24 +296,21 @@ def _parse_hunk(self, header, diff, encoding, metadata_only):
                 value = valid_line.group('value')
                 original_line = Line(value, line_type=line_type)
 
-            if line_type == LINE_TYPE_ADDED:
-                if original_line is not None:
+                if line_type == LINE_TYPE_ADDED:
                     original_line.target_line_no = target_line_no
-                target_line_no += 1
-            elif line_type == LINE_TYPE_REMOVED:
-                if original_line is not None:
+                    target_line_no += 1
+                elif line_type == LINE_TYPE_REMOVED:
                     original_line.source_line_no = source_line_no
-                source_line_no += 1
-            elif line_type == LINE_TYPE_CONTEXT:
-                if original_line is not None:
+                    source_line_no += 1
+                elif line_type == LINE_TYPE_CONTEXT:
                     original_line.target_line_no = target_line_no
                     original_line.source_line_no = source_line_no
-                target_line_no += 1
-                source_line_no += 1
-            elif line_type == LINE_TYPE_NO_NEWLINE:
-                pass
-            else:
-                original_line = None
+                    target_line_no += 1
+                    source_line_no += 1
+                elif line_type == LINE_TYPE_NO_NEWLINE:
+                    pass
+                else:
+                    original_line = None
 
             # stop parsing if we got past expected number of lines
             if (source_line_no > expected_source_end or
@@ -310,6 +331,11 @@ def _parse_hunk(self, header, diff, encoding, metadata_only):
                 target_line_no < expected_target_end):
             raise UnidiffParseError('Hunk is shorter than expected')
 
+        if metadata_only:
+            # HACK: metadata_only tracks no lines, so cache the parsed counts
+            hunk._added = added
+            hunk._removed = removed
+
         self.append(hunk)
 
     def _add_no_newline_marker_to_last_hunk(self):
@@ -391,7 +417,7 @@ def __init__(self, f, encoding=None, metadata_only=False):
         # if encoding is None, assume we are reading unicode data
         # when metadata_only is True, only perform a minimal metadata parsing
         # (ie. hunks without content) which is around 2.5-6 times faster;
-        # it will still validate the diff metadata consistency
+        # it still checks metadata consistency and counts added/removed lines
         self._parse(data, encoding=encoding, metadata_only=metadata_only)
 
     def __repr__(self):