From b362e5fccee3ed0eb33ec461319952c27640368b Mon Sep 17 00:00:00 2001
From: Volo Zyko <volo.zyko@gmail.com>
Date: Sun, 30 Jul 2017 15:23:38 +0300
Subject: [PATCH 1/2] Implement parsing of lines with extended patch info

Besides that, make parsing a bit stricter, fix the conversion to string
in a number of places so that the original and resulting diffs are now
identical, and add more tests.
---
 tests/samples/git.diff     |  2 +-
 tests/samples/sample5.diff | 29 +++++++++++++
 tests/samples/sample6.diff | 38 +++++++++++++++++
 tests/samples/sample7.diff | 29 +++++++++++++
 tests/test_parser.py       | 48 +++++++++++++++++----
 unidiff/constants.py       |  6 ++-
 unidiff/patch.py           | 85 ++++++++++++++++++++++++++++++++------
 7 files changed, 213 insertions(+), 24 deletions(-)
 create mode 100644 tests/samples/sample5.diff
 create mode 100644 tests/samples/sample6.diff
 create mode 100644 tests/samples/sample7.diff

diff --git a/tests/samples/git.diff b/tests/samples/git.diff
index 36c9714..3cfa303 100644
--- a/tests/samples/git.diff
+++ b/tests/samples/git.diff
@@ -20,7 +20,7 @@ index c7921f5..8946660 100644
 +This is now updated.
 +
 +This is a new line.
-
+ 
  This will stay.
 \ No newline at end of file
 diff --git a/removed_file b/removed_file
diff --git a/tests/samples/sample5.diff b/tests/samples/sample5.diff
new file mode 100644
index 0000000..a9e2f63
--- /dev/null
+++ b/tests/samples/sample5.diff
@@ -0,0 +1,29 @@
+=== modified file 'modified_file1'
+--- modified_file1	2013-10-13 23:53:13 +0000
++++ modified_file1	2013-10-13 23:53:26 +0000
+@@ -1,5 +1,7 @@
+ This is the original content.
+ 
+-This should be updated.
++This is now updated.
++
++This is a new line.
+
+ This will stay.
+\ No newline at end of file
+
+=== modified file 'modified_file2'
+--- modified_file2	2013-10-13 23:53:13 +0000
++++ modified_file2	2013-10-13 23:53:26 +0000
+@@ -1,5 +1,7 @@
+ This is the original content.
+ 
+-This should be updated.
++This is now updated.
++
++This is a new line.
+
+ This will stay.
+\ No newline at end of file
+
+
diff --git a/tests/samples/sample6.diff b/tests/samples/sample6.diff
new file mode 100644
index 0000000..bb244f4
--- /dev/null
+++ b/tests/samples/sample6.diff
@@ -0,0 +1,38 @@
+--- /path/to/original	''timestamp''
++++ /path/to/new	''timestamp''
+@@ -1,3 +1,9 @@
++This is an important
++notice! It should
++therefore be located at
++the beginning of this
++document!
++
+ This part of the
+ document has stayed the
+ same from version to
+@@ -5,16 +11,13 @@
+ be shown if it doesn't
+ change.  Otherwise, that
+ would not be helping to
+-compress the size of the
+-changes.
+-
+-This paragraph contains
+-text that is outdated.
+-It will be deleted in the
+-near future.
++compress anything.
+ 
+ It is important to spell
+-check this dokument. On
++check this document. On
+ the other hand, a
+ misspelled word isn't
+ the end of the world.
+ this paragraph needs to
+ be changed. Things can
+ be added after it.
++
++This paragraph contains
++important new additions
++to this document.
diff --git a/tests/samples/sample7.diff b/tests/samples/sample7.diff
new file mode 100644
index 0000000..94f8340
--- /dev/null
+++ b/tests/samples/sample7.diff
@@ -0,0 +1,29 @@
+--- /path/to/original	''timestamp''
++++ /path/to/new	''timestamp''
+@@ -1,3 +1,9 @@
++This is an important
++notice! It should
++therefore be located at
++the beginning of this
++document!
++
+ This part of the
+ document has stayed the
+ same from version to
+@@ -5,16 +11,13 @@
+ be shown if it doesn't
+ change.  Otherwise, that
+ would not be helping to
+-compress the size of the
+-changes.
+-
+-This paragraph contains
+-text that is outdated.
++compress anything.
+ 
+ It is important to spell
+-check this dokument. On
++check this document. On
+ the other hand, a
+ misspelled word isn't
+ the end of the world.
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 453b7ab..b210587 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -92,12 +92,27 @@ def test_preserve_dos_line_endings(self):
         added_unicode_line = res.added_files[0][0][1]
         self.assertEqual(added_unicode_line.value, 'holá mundo!\r\n')
 
+    def test_preserve_dos_line_endings_empty_line_type(self):
+        utf8_file = os.path.join(self.samples_dir, 'samples/sample5.diff')
+        with open(utf8_file, 'rb') as diff_file:
+            res = PatchSet(diff_file, encoding='utf-8')
+
+        # 2 files updated by diff
+        self.assertEqual(len(res), 2)
+        modified_unicode_line = res.modified_files[0][0][6]
+        self.assertEqual(modified_unicode_line.value, '\r\n')
+        self.assertEqual(modified_unicode_line.line_type, ' ')
+
+        modified_unicode_line = res.modified_files[1][0][6]
+        self.assertEqual(modified_unicode_line.value, '\n')
+        self.assertEqual(modified_unicode_line.line_type, ' ')
+
     def test_print_hunks_without_gaps(self):
         with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
             res = PatchSet(diff_file)
         lines = unicode(res).splitlines()
-        self.assertEqual(lines[12], '@@ -5,16 +11,10 @@ ')
-        self.assertEqual(lines[31], '@@ -22,3 +22,7 @@ ')
+        self.assertEqual(lines[12], '@@ -5,16 +11,10 @@')
+        self.assertEqual(lines[31], '@@ -22,3 +22,7 @@')
 
     def test_parse_sample(self):
         """Parse sample file."""
@@ -181,20 +196,32 @@ def test_patchset_from_bytes_string(self):
         self.assertEqual(ps1, ps2)
 
     def test_patchset_string_input(self):
-            with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
-                diff_data = diff_file.read()
-                ps1 = PatchSet(diff_data)
+        with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
+            diff_data = diff_file.read()
+            ps1 = PatchSet(diff_data)
 
-            with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
-                ps2 = PatchSet(diff_file)
+        with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
+            ps2 = PatchSet(diff_file)
 
-            self.assertEqual(ps1, ps2)
+        self.assertEqual(ps1, ps2)
 
     def test_parse_malformed_diff(self):
         """Parse malformed file."""
         with open(self.sample_bad_file) as diff_file:
             self.assertRaises(UnidiffParseError, PatchSet, diff_file)
 
+    def test_parse_malformed_diff_longer_than_expected(self):
+        """Parse malformed file with non-terminated hunk."""
+        utf8_file = os.path.join(self.samples_dir, 'samples/sample6.diff')
+        with open(utf8_file, 'rb') as diff_file:
+            self.assertRaises(UnidiffParseError, PatchSet, diff_file)
+
+    def test_parse_malformed_diff_shorter_than_expected(self):
+        """Parse malformed file with non-terminated hunk."""
+        utf8_file = os.path.join(self.samples_dir, 'samples/sample7.diff')
+        with open(utf8_file, 'rb') as diff_file:
+            self.assertRaises(UnidiffParseError, PatchSet, diff_file)
+
     def test_diff_lines_linenos(self):
         with open(self.sample_file, 'rb') as diff_file:
             res = PatchSet(diff_file, encoding='utf-8')
@@ -295,3 +322,8 @@ def test_samples(self):
 
             self.assertEqual(res.added, 7)
             self.assertEqual(res.removed, 4)
+
+            # check that original diffs and those produced
+            # by unidiff are the same
+            with codecs.open(file_path, 'r', encoding='utf-8') as diff_file:
+                self.assertEqual(diff_file.read(), str(res))
diff --git a/unidiff/constants.py b/unidiff/constants.py
index 2ed5d6d..d5d7a05 100644
--- a/unidiff/constants.py
+++ b/unidiff/constants.py
@@ -44,7 +44,9 @@
 # -  deleted line
 # \  No newline case
 RE_HUNK_BODY_LINE = re.compile(
-    r'^(?P<line_type>[- \n\+\\])(?P<value>.*)', re.DOTALL)
+    r'^(?P<line_type>[- \+\\])(?P<value>.*)', re.DOTALL)
+RE_HUNK_EMPTY_BODY_LINE = re.compile(
+    r'^(?P<line_type>[- \+\\]?)(?P<value>[\r\n]{1,2})', re.DOTALL)
 
 RE_NO_NEWLINE_MARKER = re.compile(r'^\\ No newline at end of file')
 
@@ -53,6 +55,6 @@
 LINE_TYPE_ADDED = '+'
 LINE_TYPE_REMOVED = '-'
 LINE_TYPE_CONTEXT = ' '
-LINE_TYPE_EMPTY = '\n'
+LINE_TYPE_EMPTY = ''
 LINE_TYPE_NO_NEWLINE = '\\'
 LINE_VALUE_NO_NEWLINE = ' No newline at end of file'
diff --git a/unidiff/patch.py b/unidiff/patch.py
index 70f0284..45cd6fb 100644
--- a/unidiff/patch.py
+++ b/unidiff/patch.py
@@ -38,6 +38,7 @@
     LINE_TYPE_NO_NEWLINE,
     LINE_VALUE_NO_NEWLINE,
     RE_HUNK_BODY_LINE,
+    RE_HUNK_EMPTY_BODY_LINE,
     RE_HUNK_HEADER,
     RE_SOURCE_FILENAME,
     RE_TARGET_FILENAME,
@@ -63,7 +64,7 @@ def implements_to_string(cls):
     implements_to_string = lambda x: x
     unicode = str
     basestring = str
-    
+
 
 @implements_to_string
 class Line(object):
@@ -104,6 +105,19 @@ def is_context(self):
         return self.line_type == LINE_TYPE_CONTEXT
 
 
+@implements_to_string
+class PatchInfo(list):
+    """Lines with extended patch info. Format of this info is not documented
+       and it very much depends on patch producer."""
+
+    def __repr__(self):
+        value = "<PatchInfo: %s>" % self[0].strip()
+        return make_str(value)
+
+    def __str__(self):
+        return ''.join(unicode(line) for line in self)
+
+
 @implements_to_string
 class Hunk(list):
     """Each of the modified blocks of a file."""
@@ -133,9 +147,11 @@ def __repr__(self):
         return make_str(value)
 
     def __str__(self):
-        head = "@@ -%d,%d +%d,%d @@ %s\n" % (
+        # section header is optional and thus we output it only if it's present
+        head = "@@ -%d,%d +%d,%d @@%s\n" % (
             self.source_start, self.source_length,
-            self.target_start, self.target_length, self.section_header)
+            self.target_start, self.target_length,
+            ' ' + self.section_header if self.section_header else '')
         content = ''.join(unicode(line) for line in self)
         return head + content
 
@@ -170,9 +186,10 @@ def target_lines(self):
 class PatchedFile(list):
     """Patch updated file, it is a list of Hunks."""
 
-    def __init__(self, source='', target='',
+    def __init__(self, patch_info=None, source='', target='',
                  source_timestamp=None, target_timestamp=None):
         super(PatchedFile, self).__init__()
+        self.patch_info = patch_info
         self.source_file = source
         self.source_timestamp = source_timestamp
         self.target_file = target
@@ -182,10 +199,16 @@ def __repr__(self):
         return make_str("<PatchedFile: %s>") % make_str(self.path)
 
     def __str__(self):
-        source = "--- %s\n" % self.source_file
-        target = "+++ %s\n" % self.target_file
+        # patch info is optional
+        info = '' if self.patch_info is None else str(self.patch_info)
+        source = "--- %s%s\n" % (
+            self.source_file,
+            '\t' + self.source_timestamp if self.source_timestamp else '')
+        target = "+++ %s%s\n" % (
+            self.target_file,
+            '\t' + self.target_timestamp if self.target_timestamp else '')
         hunks = ''.join(unicode(hunk) for hunk in self)
-        return source + target + hunks
+        return info + source + target + hunks
 
     def _parse_hunk(self, header, diff, encoding):
         """Parse hunk details."""
@@ -201,7 +224,11 @@ def _parse_hunk(self, header, diff, encoding):
         for diff_line_no, line in diff:
             if encoding is not None:
                 line = line.decode(encoding)
-            valid_line = RE_HUNK_BODY_LINE.match(line)
+
+            valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)
+            if not valid_line:
+                valid_line = RE_HUNK_BODY_LINE.match(line)
+
             if not valid_line:
                 raise UnidiffParseError('Hunk diff line expected: %s' % line)
 
@@ -226,6 +253,11 @@ def _parse_hunk(self, header, diff, encoding):
             else:
                 original_line = None
 
+            # stop parsing if we got past expected number of lines
+            if (source_line_no > expected_source_end or
+                    target_line_no > expected_target_end):
+                raise UnidiffParseError('Hunk is longer than expected')
+
             if original_line:
                 original_line.diff_line_no = diff_line_no
                 hunk.append(original_line)
@@ -235,6 +267,11 @@ def _parse_hunk(self, header, diff, encoding):
                     target_line_no == expected_target_end):
                 break
 
+        # report an error if we haven't got expected number of lines
+        if (source_line_no < expected_source_end or
+                target_line_no < expected_target_end):
+            raise UnidiffParseError('Hunk is shorter than expected')
+
         self.append(hunk)
 
     def _add_no_newline_marker_to_last_hunk(self):
@@ -245,6 +282,12 @@ def _add_no_newline_marker_to_last_hunk(self):
         last_hunk.append(
             Line(LINE_VALUE_NO_NEWLINE + '\n', line_type=LINE_TYPE_NO_NEWLINE))
 
+    def _append_trailing_empty_line(self):
+        if not self:
+            raise UnidiffParseError('Unexpected trailing newline character')
+        last_hunk = self[-1]
+        last_hunk.append(Line('\n', line_type=LINE_TYPE_EMPTY))
+
     @property
     def path(self):
         """Return the file path abstracted from VCS."""
@@ -252,10 +295,10 @@ def path(self):
                 self.target_file.startswith('b/')):
             filepath = self.source_file[2:]
         elif (self.source_file.startswith('a/') and
-                self.target_file == '/dev/null'):
+              self.target_file == '/dev/null'):
             filepath = self.source_file[2:]
         elif (self.target_file.startswith('b/') and
-                self.source_file == '/dev/null'):
+              self.source_file == '/dev/null'):
             filepath = self.target_file[2:]
         else:
             filepath = self.source_file
@@ -295,7 +338,7 @@ class PatchSet(list):
 
     def __init__(self, f, encoding=None):
         super(PatchSet, self).__init__()
-        
+
         # convert string inputs to StringIO objects
         if isinstance(f, basestring):
             f = self._convert_string(f, encoding)
@@ -309,15 +352,17 @@ def __repr__(self):
         return make_str('<PatchSet: %s>') % super(PatchSet, self).__repr__()
 
     def __str__(self):
-        return '\n'.join(unicode(patched_file) for patched_file in self)
+        return ''.join(unicode(patched_file) for patched_file in self)
 
     def _parse(self, diff, encoding):
         current_file = None
+        patch_info = None
 
         diff = enumerate(diff, 1)
         for unused_diff_line_no, line in diff:
             if encoding is not None:
                 line = line.decode(encoding)
+
             # check for source file header
             is_source_filename = RE_SOURCE_FILENAME.match(line)
             if is_source_filename:
@@ -335,9 +380,10 @@ def _parse(self, diff, encoding):
                 target_file = is_target_filename.group('filename')
                 target_timestamp = is_target_filename.group('timestamp')
                 # add current file to PatchSet
-                current_file = PatchedFile(source_file, target_file,
+                current_file = PatchedFile(patch_info, source_file, target_file,
                                            source_timestamp, target_timestamp)
                 self.append(current_file)
+                patch_info = None
                 continue
 
             # check for hunk header
@@ -346,6 +392,7 @@ def _parse(self, diff, encoding):
                 if current_file is None:
                     raise UnidiffParseError('Unexpected hunk found: %s' % line)
                 current_file._parse_hunk(line, diff, encoding)
+                continue
 
             # check for no newline marker
             is_no_newline = RE_NO_NEWLINE_MARKER.match(line)
@@ -353,6 +400,18 @@ def _parse(self, diff, encoding):
                 if current_file is None:
                     raise UnidiffParseError('Unexpected marker: %s' % line)
                 current_file._add_no_newline_marker_to_last_hunk()
+                continue
+
+            # sometimes hunks can be followed by empty lines
+            if line == '\n' and current_file is not None:
+                current_file._append_trailing_empty_line()
+                continue
+
+            # if nothing has matched above then this line is a patch info
+            if patch_info is None:
+                current_file = None
+                patch_info = PatchInfo()
+            patch_info.append(line)
 
     @classmethod
     def from_filename(cls, filename, encoding=DEFAULT_ENCODING, errors=None):

From 83bcb369cb5904d22432a138a61f3fbf91e72cc6 Mon Sep 17 00:00:00 2001
From: Volo Zyko <volo.zyko@gmail.com>
Date: Sun, 30 Jul 2017 16:31:54 +0300
Subject: [PATCH 2/2] Fix tests on python3

---
 tests/test_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_parser.py b/tests/test_parser.py
index b210587..cb49870 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -213,13 +213,13 @@ def test_parse_malformed_diff(self):
     def test_parse_malformed_diff_longer_than_expected(self):
         """Parse malformed file with non-terminated hunk."""
         utf8_file = os.path.join(self.samples_dir, 'samples/sample6.diff')
-        with open(utf8_file, 'rb') as diff_file:
+        with open(utf8_file, 'r') as diff_file:
             self.assertRaises(UnidiffParseError, PatchSet, diff_file)
 
     def test_parse_malformed_diff_shorter_than_expected(self):
         """Parse malformed file with non-terminated hunk."""
         utf8_file = os.path.join(self.samples_dir, 'samples/sample7.diff')
-        with open(utf8_file, 'rb') as diff_file:
+        with open(utf8_file, 'r') as diff_file:
             self.assertRaises(UnidiffParseError, PatchSet, diff_file)
 
     def test_diff_lines_linenos(self):