Skip to content

Commit

Permalink
Merge pull request #70 from matiasb/p12tic-only-hunk-positions
Browse files Browse the repository at this point in the history
Added option to only parse diff metadata.
  • Loading branch information
matiasb authored Apr 22, 2020
2 parents 9532113 + 3273435 commit d611fe6
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 34 deletions.
2 changes: 1 addition & 1 deletion bin/unidiff
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ if __name__ == '__main__':
if PY2:
diff_file = codecs.getreader(encoding)(diff_file)

patch = PatchSet(diff_file)
patch = PatchSet(diff_file, metadata_only=(not args.show_diff))

if args.show_diff:
print(patch)
Expand Down
42 changes: 40 additions & 2 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,10 @@ def test_print_hunks_without_gaps(self):
self.assertEqual(lines[12], '@@ -5,16 +11,10 @@')
self.assertEqual(lines[31], '@@ -22,3 +22,7 @@')

def test_parse_sample(self):
def _test_parse_sample(self, metadata_only):
"""Parse sample file."""
with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
res = PatchSet(diff_file)
res = PatchSet(diff_file, metadata_only=metadata_only)

# three files in the patch
self.assertEqual(len(res), 3)
Expand Down Expand Up @@ -164,6 +164,12 @@ def test_parse_sample(self):
self.assertEqual(res.added, 21)
self.assertEqual(res.removed, 17)

def test_parse_sample_full(self):
    """Run the shared sample-file checks with full (content) parsing."""
    metadata_only = False
    self._test_parse_sample(metadata_only=metadata_only)

def test_parse_sample_metadata_only(self):
    """Run the shared sample-file checks with metadata-only parsing."""
    metadata_only = True
    self._test_parse_sample(metadata_only=metadata_only)

def test_patchset_compare(self):
with codecs.open(self.sample_file, 'r', encoding='utf-8') as diff_file:
ps1 = PatchSet(diff_file)
Expand Down Expand Up @@ -316,6 +322,38 @@ def test_diff_lines_linenos(self):
self.assertEqual(source_line_nos, expected_source_line_nos)
self.assertEqual(diff_line_nos, expected_diff_line_nos)

def test_diff_hunk_positions(self):
    """Hunk positions are correct when the sample diff is fully parsed.

    The sample file is opened in binary mode, so the encoding is passed
    explicitly to ``PatchSet``.
    """
    with open(self.sample_file, 'rb') as raw_diff:
        parsed = PatchSet(raw_diff, encoding='utf-8')
    self.do_test_diff_hunk_positions(parsed)

def test_diff_metadata_only(self):
    """Hunk positions are unchanged when parsing with ``metadata_only``.

    The sample file is opened in binary mode, so the encoding is passed
    explicitly to ``PatchSet``.
    """
    with open(self.sample_file, 'rb') as raw_diff:
        parsed = PatchSet(raw_diff, encoding='utf-8', metadata_only=True)
    self.do_test_diff_hunk_positions(parsed)

def do_test_diff_hunk_positions(self, res):
    """Assert that the hunks in ``res`` have the expected positions.

    ``res`` is iterated as a sequence of files, each of which is iterated
    as a sequence of hunks. For every hunk the tuple
    ``(source_start, target_start, source_length, target_length)`` is
    collected, and the resulting list is compared with the expected one.
    """
    actual = [
        (hunk.source_start, hunk.target_start,
         hunk.source_length, hunk.target_length)
        for patched_file in res
        for hunk in patched_file
    ]

    expected = [
        (1, 1, 3, 9),      # file 1, hunk 1
        (5, 11, 16, 10),   # file 1, hunk 2
        (22, 22, 3, 7),    # file 1, hunk 3
        (0, 1, 0, 9),      # file 2, hunk 1
        (1, 0, 9, 0),      # file 3, hunk 1
    ]

    self.assertEqual(actual, expected)


class TestVCSSamples(unittest.TestCase):
"""Tests for real examples from VCS."""
Expand Down
110 changes: 79 additions & 31 deletions unidiff/patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def __init__(self, src_start=0, src_len=0, tgt_start=0, tgt_len=0,
self.target_start = int(tgt_start)
self.target_length = int(tgt_len)
self.section_header = section_header
self._added = None
self._removed = None

def __repr__(self):
value = "<Hunk: @@ %d,%d %d,%d @@ %s>" % (self.source_start,
Expand Down Expand Up @@ -168,10 +170,18 @@ def append(self, line):

@property
def added(self):
    """Number of added lines in this hunk.

    When ``_added`` holds a value it is returned as-is; otherwise the
    lines currently in the hunk are counted.
    """
    stored = self._added
    if stored is None:
        # no stored count: recount on every access so that later changes
        # to the hunk's lines are reflected
        count = 0
        for line in self:
            if line.is_added:
                count += 1
        return count
    return stored

@property
def removed(self):
    """Number of removed lines in this hunk.

    When ``_removed`` holds a value it is returned as-is; otherwise the
    lines currently in the hunk are counted.
    """
    stored = self._removed
    if stored is None:
        # no stored count: recount on every access so that later changes
        # to the hunk's lines are reflected
        count = 0
        for line in self:
            if line.is_removed:
                count += 1
        return count
    return stored

def is_valid(self):
Expand Down Expand Up @@ -229,7 +239,7 @@ def __str__(self):
hunks = ''.join(unicode(hunk) for hunk in self)
return info + source + target + hunks

def _parse_hunk(self, header, diff, encoding):
def _parse_hunk(self, header, diff, encoding, metadata_only):
"""Parse hunk details."""
header_info = RE_HUNK_HEADER.match(header)
hunk_info = header_info.groups()
Expand All @@ -239,39 +249,69 @@ def _parse_hunk(self, header, diff, encoding):
target_line_no = hunk.target_start
expected_source_end = source_line_no + hunk.source_length
expected_target_end = target_line_no + hunk.target_length
added = 0
removed = 0

for diff_line_no, line in diff:
if encoding is not None:
line = line.decode(encoding)

valid_line = RE_HUNK_BODY_LINE.match(line)
if not valid_line:
valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)

if not valid_line:
raise UnidiffParseError('Hunk diff line expected: %s' % line)

line_type = valid_line.group('line_type')
if line_type == LINE_TYPE_EMPTY:
line_type = LINE_TYPE_CONTEXT
value = valid_line.group('value')
original_line = Line(value, line_type=line_type)
if line_type == LINE_TYPE_ADDED:
original_line.target_line_no = target_line_no
target_line_no += 1
elif line_type == LINE_TYPE_REMOVED:
original_line.source_line_no = source_line_no
source_line_no += 1
elif line_type == LINE_TYPE_CONTEXT:
original_line.target_line_no = target_line_no
target_line_no += 1
original_line.source_line_no = source_line_no
source_line_no += 1
elif line_type == LINE_TYPE_NO_NEWLINE:
pass
else:
if metadata_only:
# quick line type detection, no regex required
line_type = line[0] if line else LINE_TYPE_CONTEXT
if line_type not in (LINE_TYPE_ADDED,
LINE_TYPE_REMOVED,
LINE_TYPE_CONTEXT,
LINE_TYPE_NO_NEWLINE):
raise UnidiffParseError(
'Hunk diff line expected: %s' % line)

if line_type == LINE_TYPE_ADDED:
target_line_no += 1
added += 1
elif line_type == LINE_TYPE_REMOVED:
source_line_no += 1
removed += 1
elif line_type == LINE_TYPE_CONTEXT:
target_line_no += 1
source_line_no += 1

# no file content tracking
original_line = None

else:
# parse diff line content
valid_line = RE_HUNK_BODY_LINE.match(line)
if not valid_line:
valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)

if not valid_line:
raise UnidiffParseError(
'Hunk diff line expected: %s' % line)

line_type = valid_line.group('line_type')
if line_type == LINE_TYPE_EMPTY:
line_type = LINE_TYPE_CONTEXT

value = valid_line.group('value')
original_line = Line(value, line_type=line_type)

if line_type == LINE_TYPE_ADDED:
original_line.target_line_no = target_line_no
target_line_no += 1
elif line_type == LINE_TYPE_REMOVED:
original_line.source_line_no = source_line_no
source_line_no += 1
elif line_type == LINE_TYPE_CONTEXT:
original_line.target_line_no = target_line_no
original_line.source_line_no = source_line_no
target_line_no += 1
source_line_no += 1
elif line_type == LINE_TYPE_NO_NEWLINE:
pass
else:
original_line = None

# stop parsing if we got past expected number of lines
if (source_line_no > expected_source_end or
target_line_no > expected_target_end):
Expand All @@ -291,6 +331,11 @@ def _parse_hunk(self, header, diff, encoding):
target_line_no < expected_target_end):
raise UnidiffParseError('Hunk is shorter than expected')

if metadata_only:
# HACK: set fixed calculated values when metadata_only is enabled
hunk._added = added
hunk._removed = removed

self.append(hunk)

def _add_no_newline_marker_to_last_hunk(self):
Expand Down Expand Up @@ -360,7 +405,7 @@ def is_modified_file(self):
class PatchSet(list):
"""A list of PatchedFiles."""

def __init__(self, f, encoding=None):
def __init__(self, f, encoding=None, metadata_only=False):
super(PatchSet, self).__init__()

# convert string inputs to StringIO objects
Expand All @@ -370,15 +415,18 @@ def __init__(self, f, encoding=None):
# make sure we pass an iterator object to parse
data = iter(f)
# if encoding is None, assume we are reading unicode data
self._parse(data, encoding=encoding)
# when metadata_only is True, only perform a minimal metadata parsing
# (ie. hunks without content) which is around 2.5-6 times faster;
# it will still validate the diff metadata consistency and get counts
self._parse(data, encoding=encoding, metadata_only=metadata_only)

def __repr__(self):
    """Return the list representation wrapped as ``<PatchSet: ...>``."""
    template = make_str('<PatchSet: %s>')
    files_repr = super(PatchSet, self).__repr__()
    return template % files_repr

def __str__(self):
    """Return the text of every contained file, concatenated in order."""
    rendered = [unicode(entry) for entry in self]
    return ''.join(rendered)

def _parse(self, diff, encoding):
def _parse(self, diff, encoding, metadata_only):
current_file = None
patch_info = None

Expand Down Expand Up @@ -449,7 +497,7 @@ def _parse(self, diff, encoding):
if is_hunk_header:
if current_file is None:
raise UnidiffParseError('Unexpected hunk found: %s' % line)
current_file._parse_hunk(line, diff, encoding)
current_file._parse_hunk(line, diff, encoding, metadata_only)
continue

# check for no newline marker
Expand Down

0 comments on commit d611fe6

Please sign in to comment.