From b362e5fccee3ed0eb33ec461319952c27640368b Mon Sep 17 00:00:00 2001 From: Volo Zyko Date: Sun, 30 Jul 2017 15:23:38 +0300 Subject: [PATCH 1/2] Implement parsing of lines with extended patch info Besides that, make parsing a bit stricter, fix code for conversion to string in a number of places so that original and resulting diffs are the same now, add more tests. --- tests/samples/git.diff | 2 +- tests/samples/sample5.diff | 29 +++++++++++++ tests/samples/sample6.diff | 38 +++++++++++++++++ tests/samples/sample7.diff | 29 +++++++++++++ tests/test_parser.py | 48 +++++++++++++++++---- unidiff/constants.py | 6 ++- unidiff/patch.py | 85 ++++++++++++++++++++++++++++++++------ 7 files changed, 213 insertions(+), 24 deletions(-) create mode 100644 tests/samples/sample5.diff create mode 100644 tests/samples/sample6.diff create mode 100644 tests/samples/sample7.diff diff --git a/tests/samples/git.diff b/tests/samples/git.diff index 36c9714..3cfa303 100644 --- a/tests/samples/git.diff +++ b/tests/samples/git.diff @@ -20,7 +20,7 @@ index c7921f5..8946660 100644 +This is now updated. + +This is a new line. - + This will stay. \ No newline at end of file diff --git a/removed_file b/removed_file diff --git a/tests/samples/sample5.diff b/tests/samples/sample5.diff new file mode 100644 index 0000000..a9e2f63 --- /dev/null +++ b/tests/samples/sample5.diff @@ -0,0 +1,29 @@ +=== modified file 'modified_file1' +--- modified_file1 2013-10-13 23:53:13 +0000 ++++ modified_file1 2013-10-13 23:53:26 +0000 +@@ -1,5 +1,7 @@ + This is the original content. + +-This should be updated. ++This is now updated. ++ ++This is a new line. + + This will stay. +\ No newline at end of file + +=== modified file 'modified_file2' +--- modified_file2 2013-10-13 23:53:13 +0000 ++++ modified_file2 2013-10-13 23:53:26 +0000 +@@ -1,5 +1,7 @@ + This is the original content. + +-This should be updated. ++This is now updated. ++ ++This is a new line. + + This will stay.
+\ No newline at end of file + + diff --git a/tests/samples/sample6.diff b/tests/samples/sample6.diff new file mode 100644 index 0000000..bb244f4 --- /dev/null +++ b/tests/samples/sample6.diff @@ -0,0 +1,38 @@ +--- /path/to/original ''timestamp'' ++++ /path/to/new ''timestamp'' +@@ -1,3 +1,9 @@ ++This is an important ++notice! It should ++therefore be located at ++the beginning of this ++document! ++ + This part of the + document has stayed the + same from version to +@@ -5,16 +11,13 @@ + be shown if it doesn't + change. Otherwise, that + would not be helping to +-compress the size of the +-changes. +- +-This paragraph contains +-text that is outdated. +-It will be deleted in the +-near future. ++compress anything. + + It is important to spell +-check this dokument. On ++check this document. On + the other hand, a + misspelled word isn't + the end of the world. + this paragraph needs to + be changed. Things can + be added after it. ++ ++This paragraph contains ++important new additions ++to this document. diff --git a/tests/samples/sample7.diff b/tests/samples/sample7.diff new file mode 100644 index 0000000..94f8340 --- /dev/null +++ b/tests/samples/sample7.diff @@ -0,0 +1,29 @@ +--- /path/to/original ''timestamp'' ++++ /path/to/new ''timestamp'' +@@ -1,3 +1,9 @@ ++This is an important ++notice! It should ++therefore be located at ++the beginning of this ++document! ++ + This part of the + document has stayed the + same from version to +@@ -5,16 +11,13 @@ + be shown if it doesn't + change. Otherwise, that + would not be helping to +-compress the size of the +-changes. +- +-This paragraph contains +-text that is outdated. ++compress anything. + + It is important to spell +-check this dokument. On ++check this document. On + the other hand, a + misspelled word isn't + the end of the world. 
diff --git a/tests/test_parser.py b/tests/test_parser.py index 453b7ab..b210587 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -92,12 +92,27 @@ def test_preserve_dos_line_endings(self): added_unicode_line = res.added_files[0][0][1] self.assertEqual(added_unicode_line.value, 'holá mundo!\r\n') + def test_preserve_dos_line_endings_empty_line_type(self): + utf8_file = os.path.join(self.samples_dir, 'samples/sample5.diff') + with open(utf8_file, 'rb') as diff_file: + res = PatchSet(diff_file, encoding='utf-8') + + # 2 files updated by diff + self.assertEqual(len(res), 2) + modified_unicode_line = res.modified_files[0][0][6] + self.assertEqual(modified_unicode_line.value, '\r\n') + self.assertEqual(modified_unicode_line.line_type, ' ') + + modified_unicode_line = res.modified_files[1][0][6] + self.assertEqual(modified_unicode_line.value, '\n') + self.assertEqual(modified_unicode_line.line_type, ' ') + def test_print_hunks_without_gaps(self): with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: res = PatchSet(diff_file) lines = unicode(res).splitlines() - self.assertEqual(lines[12], '@@ -5,16 +11,10 @@ ') - self.assertEqual(lines[31], '@@ -22,3 +22,7 @@ ') + self.assertEqual(lines[12], '@@ -5,16 +11,10 @@') + self.assertEqual(lines[31], '@@ -22,3 +22,7 @@') def test_parse_sample(self): """Parse sample file.""" @@ -181,20 +196,32 @@ def test_patchset_from_bytes_string(self): self.assertEqual(ps1, ps2) def test_patchset_string_input(self): - with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: - diff_data = diff_file.read() - ps1 = PatchSet(diff_data) + with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: + diff_data = diff_file.read() + ps1 = PatchSet(diff_data) - with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: - ps2 = PatchSet(diff_file) + with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file: + ps2 = PatchSet(diff_file) - self.assertEqual(ps1, ps2) + 
self.assertEqual(ps1, ps2) def test_parse_malformed_diff(self): """Parse malformed file.""" with open(self.sample_bad_file) as diff_file: self.assertRaises(UnidiffParseError, PatchSet, diff_file) + def test_parse_malformed_diff_longer_than_expected(self): + """Parse malformed file with non-terminated hunk.""" + utf8_file = os.path.join(self.samples_dir, 'samples/sample6.diff') + with open(utf8_file, 'rb') as diff_file: + self.assertRaises(UnidiffParseError, PatchSet, diff_file) + + def test_parse_malformed_diff_shorter_than_expected(self): + """Parse malformed file with non-terminated hunk.""" + utf8_file = os.path.join(self.samples_dir, 'samples/sample7.diff') + with open(utf8_file, 'rb') as diff_file: + self.assertRaises(UnidiffParseError, PatchSet, diff_file) + def test_diff_lines_linenos(self): with open(self.sample_file, 'rb') as diff_file: res = PatchSet(diff_file, encoding='utf-8') @@ -295,3 +322,8 @@ def test_samples(self): self.assertEqual(res.added, 7) self.assertEqual(res.removed, 4) + + # check that original diffs and those produced + # by unidiff are the same + with codecs.open(file_path, 'r', encoding='utf-8') as diff_file: + self.assertEqual(diff_file.read(), str(res)) diff --git a/unidiff/constants.py b/unidiff/constants.py index 2ed5d6d..d5d7a05 100644 --- a/unidiff/constants.py +++ b/unidiff/constants.py @@ -44,7 +44,9 @@ # - deleted line # \ No newline case RE_HUNK_BODY_LINE = re.compile( - r'^(?P<line_type>[- \n\+\\])(?P<value>.*)', re.DOTALL) + r'^(?P<line_type>[- \+\\])(?P<value>.*)', re.DOTALL) +RE_HUNK_EMPTY_BODY_LINE = re.compile( + r'^(?P<line_type>[- \+\\]?)(?P<value>[\r\n]{1,2})', re.DOTALL) RE_NO_NEWLINE_MARKER = re.compile(r'^\\ No newline at end of file') @@ -53,6 +55,6 @@ LINE_TYPE_ADDED = '+' LINE_TYPE_REMOVED = '-' LINE_TYPE_CONTEXT = ' ' -LINE_TYPE_EMPTY = '\n' +LINE_TYPE_EMPTY = '' LINE_TYPE_NO_NEWLINE = '\\' LINE_VALUE_NO_NEWLINE = ' No newline at end of file' diff --git a/unidiff/patch.py b/unidiff/patch.py index 70f0284..45cd6fb 100644 --- a/unidiff/patch.py +++
b/unidiff/patch.py @@ -38,6 +38,7 @@ LINE_TYPE_NO_NEWLINE, LINE_VALUE_NO_NEWLINE, RE_HUNK_BODY_LINE, + RE_HUNK_EMPTY_BODY_LINE, RE_HUNK_HEADER, RE_SOURCE_FILENAME, RE_TARGET_FILENAME, @@ -63,7 +64,7 @@ def implements_to_string(cls): implements_to_string = lambda x: x unicode = str basestring = str - + @implements_to_string class Line(object): @@ -104,6 +105,19 @@ def is_context(self): return self.line_type == LINE_TYPE_CONTEXT +@implements_to_string +class PatchInfo(list): + """Lines with extended patch info. Format of this info is not documented + and it very much depends on patch producer.""" + + def __repr__(self): + value = "<PatchInfo: %s>" % self[0].strip() + return make_str(value) + + def __str__(self): + return ''.join(unicode(line) for line in self) + + @implements_to_string class Hunk(list): """Each of the modified blocks of a file.""" @@ -133,9 +147,11 @@ def __repr__(self): return make_str(value) def __str__(self): - head = "@@ -%d,%d +%d,%d @@ %s\n" % ( + # section header is optional and thus we output it only if it's present + head = "@@ -%d,%d +%d,%d @@%s\n" % ( self.source_start, self.source_length, - self.target_start, self.target_length, self.section_header) + self.target_start, self.target_length, + ' ' + self.section_header if self.section_header else '') content = ''.join(unicode(line) for line in self) return head + content @@ -170,9 +186,10 @@ def target_lines(self): class PatchedFile(list): """Patch updated file, it is a list of Hunks.""" - def __init__(self, source='', target='', + def __init__(self, patch_info=None, source='', target='', source_timestamp=None, target_timestamp=None): super(PatchedFile, self).__init__() + self.patch_info = patch_info self.source_file = source self.source_timestamp = source_timestamp self.target_file = target @@ -182,10 +199,16 @@ def __repr__(self): return make_str("<PatchedFile: %s>") % make_str(self.path) def __str__(self): - source = "--- %s\n" % self.source_file - target = "+++ %s\n" % self.target_file + # patch info is optional +
info = '' if self.patch_info is None else str(self.patch_info) + source = "--- %s%s\n" % ( + self.source_file, + '\t' + self.source_timestamp if self.source_timestamp else '') + target = "+++ %s%s\n" % ( + self.target_file, + '\t' + self.target_timestamp if self.target_timestamp else '') hunks = ''.join(unicode(hunk) for hunk in self) - return source + target + hunks + return info + source + target + hunks def _parse_hunk(self, header, diff, encoding): """Parse hunk details.""" @@ -201,7 +224,11 @@ def _parse_hunk(self, header, diff, encoding): for diff_line_no, line in diff: if encoding is not None: line = line.decode(encoding) - valid_line = RE_HUNK_BODY_LINE.match(line) + + valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line) + if not valid_line: + valid_line = RE_HUNK_BODY_LINE.match(line) + if not valid_line: raise UnidiffParseError('Hunk diff line expected: %s' % line) @@ -226,6 +253,11 @@ def _parse_hunk(self, header, diff, encoding): else: original_line = None + # stop parsing if we got past expected number of lines + if (source_line_no > expected_source_end or + target_line_no > expected_target_end): + raise UnidiffParseError('Hunk is longer than expected') + if original_line: original_line.diff_line_no = diff_line_no hunk.append(original_line) @@ -235,6 +267,11 @@ def _parse_hunk(self, header, diff, encoding): target_line_no == expected_target_end): break + # report an error if we haven't got expected number of lines + if (source_line_no < expected_source_end or + target_line_no < expected_target_end): + raise UnidiffParseError('Hunk is shorter than expected') + self.append(hunk) def _add_no_newline_marker_to_last_hunk(self): @@ -245,6 +282,12 @@ def _add_no_newline_marker_to_last_hunk(self): last_hunk.append( Line(LINE_VALUE_NO_NEWLINE + '\n', line_type=LINE_TYPE_NO_NEWLINE)) + def _append_trailing_empty_line(self): + if not self: + raise UnidiffParseError('Unexpected trailing newline character') + last_hunk = self[-1] + last_hunk.append(Line('\n', 
line_type=LINE_TYPE_EMPTY)) + @property def path(self): """Return the file path abstracted from VCS.""" @@ -252,10 +295,10 @@ def path(self): self.target_file.startswith('b/')): filepath = self.source_file[2:] elif (self.source_file.startswith('a/') and - self.target_file == '/dev/null'): + self.target_file == '/dev/null'): filepath = self.source_file[2:] elif (self.target_file.startswith('b/') and - self.source_file == '/dev/null'): + self.source_file == '/dev/null'): filepath = self.target_file[2:] else: filepath = self.source_file @@ -295,7 +338,7 @@ class PatchSet(list): def __init__(self, f, encoding=None): super(PatchSet, self).__init__() - + # convert string inputs to StringIO objects if isinstance(f, basestring): f = self._convert_string(f, encoding) @@ -309,15 +352,17 @@ def __repr__(self): return make_str('<PatchSet: %s>') % super(PatchSet, self).__repr__() def __str__(self): - return '\n'.join(unicode(patched_file) for patched_file in self) + return ''.join(unicode(patched_file) for patched_file in self) def _parse(self, diff, encoding): current_file = None + patch_info = None diff = enumerate(diff, 1) for unused_diff_line_no, line in diff: if encoding is not None: line = line.decode(encoding) + # check for source file header is_source_filename = RE_SOURCE_FILENAME.match(line) if is_source_filename: @@ -335,9 +380,10 @@ def _parse(self, diff, encoding): target_file = is_target_filename.group('filename') target_timestamp = is_target_filename.group('timestamp') # add current file to PatchSet - current_file = PatchedFile(source_file, target_file, + current_file = PatchedFile(patch_info, source_file, target_file, source_timestamp, target_timestamp) self.append(current_file) + patch_info = None continue # check for hunk header @@ -346,6 +392,7 @@ def _parse(self, diff, encoding): if current_file is None: raise UnidiffParseError('Unexpected hunk found: %s' % line) current_file._parse_hunk(line, diff, encoding) + continue # check for no newline marker is_no_newline =
RE_NO_NEWLINE_MARKER.match(line) @@ -353,6 +400,18 @@ def _parse(self, diff, encoding): if current_file is None: raise UnidiffParseError('Unexpected marker: %s' % line) current_file._add_no_newline_marker_to_last_hunk() + continue + + # sometimes hunks can be followed by empty lines + if line == '\n' and current_file is not None: + current_file._append_trailing_empty_line() + continue + + # if nothing has matched above then this line is a patch info + if patch_info is None: + current_file = None + patch_info = PatchInfo() + patch_info.append(line) @classmethod def from_filename(cls, filename, encoding=DEFAULT_ENCODING, errors=None): From 83bcb369cb5904d22432a138a61f3fbf91e72cc6 Mon Sep 17 00:00:00 2001 From: Volo Zyko Date: Sun, 30 Jul 2017 16:31:54 +0300 Subject: [PATCH 2/2] Fix tests on python3 --- tests/test_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index b210587..cb49870 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -213,13 +213,13 @@ def test_parse_malformed_diff(self): def test_parse_malformed_diff_longer_than_expected(self): """Parse malformed file with non-terminated hunk.""" utf8_file = os.path.join(self.samples_dir, 'samples/sample6.diff') - with open(utf8_file, 'rb') as diff_file: + with open(utf8_file, 'r') as diff_file: self.assertRaises(UnidiffParseError, PatchSet, diff_file) def test_parse_malformed_diff_shorter_than_expected(self): """Parse malformed file with non-terminated hunk.""" utf8_file = os.path.join(self.samples_dir, 'samples/sample7.diff') - with open(utf8_file, 'rb') as diff_file: + with open(utf8_file, 'r') as diff_file: self.assertRaises(UnidiffParseError, PatchSet, diff_file) def test_diff_lines_linenos(self):