Skip to content

Commit

Permalink
Replace int with Py_ssize_t anywhere dealing with memory offsets
Browse files Browse the repository at this point in the history
Enables matches where pos= and endpos= exceed 2**31 (2 GiB), and fixes the
crash on OverflowError when such offsets are passed
  • Loading branch information
occasionallydavid authored and tyteen4a03 committed Mar 3, 2024
1 parent 4531f6e commit 6b0379f
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 36 deletions.
6 changes: 3 additions & 3 deletions src/includes.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ cdef extern from "re2/stringpiece.h" namespace "re2":
StringPiece(const char *)
StringPiece(const char *, int)
const char * data()
int copy(char * buf, size_t n, size_t pos)
int length()
size_t copy(char * buf, size_t n, size_t pos)
size_t length()


cdef extern from "re2/re2.h" namespace "re2":
Expand Down Expand Up @@ -77,7 +77,7 @@ cdef extern from "re2/re2.h" namespace "re2":
cdef cppclass RE2:
RE2(const StringPiece pattern, Options option) nogil
RE2(const StringPiece pattern) nogil
int Match(const StringPiece text, int startpos, int endpos,
int Match(const StringPiece text, Py_ssize_t startpos, Py_ssize_t endpos,
Anchor anchor, StringPiece * match, int nmatch) nogil
int Replace(cpp_string *str, const RE2 pattern,
const StringPiece rewrite) nogil
Expand Down
14 changes: 7 additions & 7 deletions src/match.pxi
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
cdef class Match:
cdef readonly Pattern re
cdef readonly object string
cdef readonly int pos
cdef readonly int endpos
cdef readonly Py_ssize_t pos
cdef readonly Py_ssize_t endpos
cdef readonly tuple regs

cdef StringPiece * matches
Expand Down Expand Up @@ -244,8 +244,8 @@ cdef class Match:
% (group, list(self.re.groupindex)))
return self.regs[self.re.groupindex[group]]

cdef _make_spans(self, char * cstring, int size, int * cpos, int * upos):
cdef int start, end
cdef _make_spans(self, char * cstring, Py_ssize_t size, Py_ssize_t *cpos, Py_ssize_t* upos):
cdef Py_ssize_t start, end
cdef StringPiece * piece

spans = []
Expand All @@ -266,9 +266,9 @@ cdef class Match:
self.regs = tuple(spans)

cdef list _convert_spans(self, spans,
char * cstring, int size, int * cpos, int * upos):
cdef map[int, int] positions
cdef int x, y
char * cstring, Py_ssize_t size, Py_ssize_t * cpos, Py_ssize_t * upos):
cdef map[Py_ssize_t, Py_ssize_t] positions
cdef Py_ssize_t x, y
for x, y in spans:
positions[x] = x
positions[y] = y
Expand Down
38 changes: 19 additions & 19 deletions src/pattern.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@ cdef class Pattern:
cdef bint encoded # True if this was originally a Unicode pattern
cdef RE2 * re_pattern

def search(self, object string, int pos=0, int endpos=-1):
def search(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
"""Scan through string looking for a match, and return a corresponding
Match instance. Return None if no position in the string matches."""
return self._search(string, pos, endpos, UNANCHORED)

def match(self, object string, int pos=0, int endpos=-1):
def match(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
"""Matches zero or more characters at the beginning of the string."""
return self._search(string, pos, endpos, ANCHOR_START)

def fullmatch(self, object string, int pos=0, int endpos=-1):
def fullmatch(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
""""fullmatch(string[, pos[, endpos]]) --> Match object or None."
Matches the entire string."""
return self._search(string, pos, endpos, ANCHOR_BOTH)

cdef _search(self, object string, int pos, int endpos,
cdef _search(self, object string, Py_ssize_t pos, Py_ssize_t endpos,
re2_Anchor anchoring):
"""Scan through string looking for a match, and return a corresponding
Match instance. Return None if no position in the string matches."""
Expand All @@ -34,7 +34,7 @@ cdef class Pattern:
cdef int encoded = 0
cdef StringPiece * sp
cdef Match m = Match(self, self.groups + 1)
cdef int cpos = 0, upos = pos
cdef Py_ssize_t cpos = 0, upos = pos

if 0 <= endpos <= pos:
return None
Expand Down Expand Up @@ -78,7 +78,7 @@ cdef class Pattern:
release_cstring(&buf)
return m

def contains(self, object string, int pos=0, int endpos=-1):
def contains(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
""""contains(string[, pos[, endpos]]) --> bool."
Scan through string looking for a match, and return True or False."""
Expand Down Expand Up @@ -117,7 +117,7 @@ cdef class Pattern:
release_cstring(&buf)
return retval != 0

def count(self, object string, int pos=0, int endpos=-1):
def count(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
"""Return number of non-overlapping matches of pattern in string."""
cdef char * cstring
cdef Py_ssize_t size
Expand Down Expand Up @@ -166,7 +166,7 @@ cdef class Pattern:
release_cstring(&buf)
return result

def findall(self, object string, int pos=0, int endpos=-1):
def findall(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
"""Return all non-overlapping matches of pattern in string as a list
of strings."""
cdef char * cstring
Expand Down Expand Up @@ -232,22 +232,22 @@ cdef class Pattern:
release_cstring(&buf)
return resultlist

def finditer(self, object string, int pos=0, int endpos=-1):
def finditer(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
"""Yield all non-overlapping matches of pattern in string as Match
objects."""
result = iter(self._finditer(string, pos, endpos))
next(result) # dummy value to raise error before start of generator
return result

def _finditer(self, object string, int pos=0, int endpos=-1):
def _finditer(self, object string, Py_ssize_t pos=0, Py_ssize_t endpos=-1):
cdef char * cstring
cdef Py_ssize_t size
cdef Py_buffer buf
cdef int retval
cdef StringPiece * sp = NULL
cdef Match m
cdef int encoded = 0
cdef int cpos = 0, upos = pos
cdef Py_ssize_t cpos = 0, upos = pos

bytestr = unicode_to_bytes(string, &encoded, self.encoded)
if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
Expand Down Expand Up @@ -303,7 +303,7 @@ cdef class Pattern:
cdef char * cstring
cdef Py_ssize_t size
cdef int retval
cdef int pos = 0
cdef Py_ssize_t pos = 0
cdef int lookahead = 0
cdef int num_split = 0
cdef StringPiece * sp
Expand Down Expand Up @@ -459,14 +459,14 @@ cdef class Pattern:
cdef Py_ssize_t size
cdef Py_buffer buf
cdef int retval
cdef int prevendpos = -1
cdef int endpos = 0
cdef int pos = 0
cdef Py_ssize_t prevendpos = -1
cdef Py_ssize_t endpos = 0
cdef Py_ssize_t pos = 0
cdef int encoded = 0
cdef StringPiece * sp
cdef Match m
cdef bytearray result = bytearray()
cdef int cpos = 0, upos = 0
cdef Py_ssize_t cpos = 0, upos = 0

if count < 0:
count = 0
Expand Down Expand Up @@ -525,9 +525,9 @@ cdef class Pattern:
cdef Py_ssize_t size
cdef Py_buffer buf
cdef int retval
cdef int prevendpos = -1
cdef int endpos = 0
cdef int pos = 0
cdef Py_ssize_t prevendpos = -1
cdef Py_ssize_t endpos = 0
cdef Py_ssize_t pos = 0
cdef int encoded = 0
cdef StringPiece * sp
cdef Match m
Expand Down
14 changes: 7 additions & 7 deletions src/re2.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ cdef inline unicode cpp_to_unicode(cpp_string input):
input.data(), input.length(), 'strict')


cdef inline unicode char_to_unicode(const char * input, int length):
cdef inline unicode char_to_unicode(const char * input, Py_ssize_t length):
"""Convert a C string to a unicode string."""
return cpython.unicode.PyUnicode_DecodeUTF8(input, length, 'strict')

Expand Down Expand Up @@ -368,13 +368,13 @@ cdef inline void release_cstring(Py_buffer *buf):
PyBuffer_Release(buf)


cdef utf8indices(char * cstring, int size, int *pos, int *endpos):
cdef utf8indices(char * cstring, Py_ssize_t size, Py_ssize_t *pos, Py_ssize_t *endpos):
"""Convert unicode indices ``pos`` and ``endpos`` to UTF-8 indices.
If the indices are out of range, leave them unchanged."""
cdef unsigned char * data = <unsigned char *>cstring
cdef int newpos = pos[0], newendpos = -1
cdef int cpos = 0, upos = 0
cdef Py_ssize_t newpos = pos[0], newendpos = -1
cdef Py_ssize_t cpos = 0, upos = 0
while cpos < size:
if data[cpos] < 0x80:
cpos += 1
Expand Down Expand Up @@ -405,11 +405,11 @@ cdef utf8indices(char * cstring, int size, int *pos, int *endpos):
endpos[0] = newendpos


cdef void unicodeindices(map[int, int] &positions,
char * cstring, int size, int * cpos, int * upos):
cdef void unicodeindices(map[Py_ssize_t, Py_ssize_t] &positions,
char * cstring, Py_ssize_t size, Py_ssize_t * cpos, Py_ssize_t * upos):
"""Convert UTF-8 byte indices to unicode indices."""
cdef unsigned char * s = <unsigned char *>cstring
cdef map[int, int].iterator it = positions.begin()
cdef map[Py_ssize_t, Py_ssize_t].iterator it = positions.begin()

if dereference(it).first == -1:
dereference(it).second = -1
Expand Down

0 comments on commit 6b0379f

Please sign in to comment.