Skip to content

Commit

Permalink
tmp
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Oct 9, 2024
1 parent 5e9eaef commit f2268a4
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 96 deletions.
181 changes: 110 additions & 71 deletions lib/prism/parse_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def initialize(source, start_line = 1, offsets = [])
@source = source
@start_line = start_line # set after parsing is done
@offsets = offsets # set after parsing is done
@code_units_caches = {}
end

# Returns the encoding of the source code, which is set by parameters to the
Expand Down Expand Up @@ -105,81 +104,26 @@ def character_column(byte_offset)
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
def code_units_offset(byte_offset, encoding)
(@code_units_caches[encoding] ||= CodeUnitsCache.new(source, encoding))[byte_offset]
end

# A cache that can be used to quickly compute code unit offsets from byte
# offsets. It purposefully provides only a single #[] method to access the
# cache in order to minimize surface area.
#
# Note that there are some known issues here that may or may not be
# addressed in the future:
#
# * The first is that there are issues when the cache computes values that
# are not on character boundaries. This can result in subsequent
# computations being off by one or more code units.
# * The second is that this cache is currently unbounded. In theory we could
# introduce some kind of LRU cache to limit the number of entries, but
# this has not yet been implemented.
#
class CodeUnitsCache
class UTF16Counter # :nodoc:
def initialize(source, encoding)
@source = source
@encoding = encoding
end

def count(byte_offset, byte_length)
@source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
end
end

class LengthCounter # :nodoc:
def initialize(source, encoding)
@source = source
@encoding = encoding
end

def count(byte_offset, byte_length)
@source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
end
end

private_constant :UTF16Counter, :LengthCounter

# Initialize a new cache with the given source and encoding.
def initialize(source, encoding)
@source = source
@counter =
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
UTF16Counter.new(source, encoding)
else
LengthCounter.new(source, encoding)
end

@cache = {}
@offsets = []
end
# We purposefully replace invalid and undefined characters with replacement
# characters in this conversion. This happens for two reasons. First, it's
# possible that the given byte offset will not occur on a character
# boundary. Second, it's possible that the source code will contain a
# character that has no equivalent in the given encoding.
def code_units_offset(byte_offset, encoding)
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)

# Retrieve the code units offset from the given byte offset.
def [](byte_offset)
@cache[byte_offset] ||=
if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
@offsets << byte_offset
@counter.count(0, byte_offset)
elsif index == 0
@offsets.unshift(byte_offset)
@counter.count(0, byte_offset)
else
@offsets.insert(index, byte_offset)
offset = @offsets[index - 1]
@cache[offset] + @counter.count(offset, byte_offset - offset)
end
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
byteslice.bytesize / 2
else
byteslice.length
end
end

private_constant :CodeUnitsCache
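# Builds a new CodeUnitsCache for this source and the given encoding. The
# returned object maps byte offsets to code unit offsets through #[].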
def code_units_cache(encoding)
  # A fresh cache is constructed on every call; nothing is memoized here.
  text = source
  CodeUnitsCache.new(text, encoding)
end

# Returns the column number in code units for the given encoding for the
# given byte offset.
Expand Down Expand Up @@ -239,6 +183,11 @@ def code_units_offset(byte_offset, encoding)
byte_offset
end

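# Returns a lambda that hands back the given byte offset unchanged. Because a
# lambda responds to #[], it can be used wherever a code units cache is
# expected.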
def code_units_cache(encoding)
  # The encoding is accepted but not consulted: the offset is passed through
  # as-is.
  lambda { |byte_offset| byte_offset }
end

# Specialized version of `code_units_column` that does not depend on
# `code_units_offset`, which is a more expensive operation. This is
# essentially the same as `Prism::Source#column`.
Expand All @@ -247,6 +196,76 @@ def code_units_column(byte_offset, encoding)
end
end

# A cache that can be used to quickly compute code unit offsets from byte
# offsets. It purposefully provides only a single #[] method to access the
# cache in order to minimize surface area.
#
# Note that there are some known issues here that may or may not be addressed
# in the future:
#
# * The first is that there are issues when the cache computes values that are
# not on character boundaries. This can result in subsequent computations
# being off by one or more code units.
# * The second is that this cache is currently unbounded. In theory we could
# introduce some kind of LRU cache to limit the number of entries, but this
# has not yet been implemented.
#
class CodeUnitsCache
  # Counts code units for UTF-16 encodings: the requested byte range is
  # transcoded and the resulting byte size is halved.
  class UTF16Counter # :nodoc:
    def initialize(source, encoding)
      @source = source
      @encoding = encoding
    end

    def count(byte_offset, byte_length)
      slice = @source.byteslice(byte_offset, byte_length)
      transcoded = slice.encode(@encoding, invalid: :replace, undef: :replace)
      transcoded.bytesize / 2
    end
  end

  # Counts code units for every other encoding: the requested byte range is
  # transcoded and its character length is used.
  class LengthCounter # :nodoc:
    def initialize(source, encoding)
      @source = source
      @encoding = encoding
    end

    def count(byte_offset, byte_length)
      slice = @source.byteslice(byte_offset, byte_length)
      transcoded = slice.encode(@encoding, invalid: :replace, undef: :replace)
      transcoded.length
    end
  end

  private_constant :UTF16Counter, :LengthCounter

  # Set up an empty cache over the given source for the given encoding.
  def initialize(source, encoding)
    @source = source

    utf16 = encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
    counter_class = utf16 ? UTF16Counter : LengthCounter
    @counter = counter_class.new(source, encoding)

    # @cache maps byte offset => code units offset; @offsets holds the byte
    # offsets seen so far in ascending order so they can be binary searched.
    @cache = {}
    @offsets = []
  end

  # Look up (computing and remembering on first use) the code units offset
  # that corresponds to the given byte offset.
  def [](byte_offset)
    @cache[byte_offset] ||= begin
      # Index of the first known offset strictly greater than the request.
      successor = @offsets.bsearch_index { |known| known > byte_offset }

      case successor
      when nil
        # No larger offset is known: append and count from the beginning.
        @offsets.push(byte_offset)
        @counter.count(0, byte_offset)
      when 0
        # Smaller than every known offset: prepend and count from the
        # beginning.
        @offsets.unshift(byte_offset)
        @counter.count(0, byte_offset)
      else
        # Falls between known offsets: count only the bytes since the nearest
        # known offset at or below it and add that offset's cached value.
        @offsets.insert(successor, byte_offset)
        anchor = @offsets[successor - 1]
        @cache[anchor] + @counter.count(anchor, byte_offset - anchor)
      end
    end
  end
end

# This represents a location in the source.
class Location
# A Source object that is used to determine more information from the given
Expand Down Expand Up @@ -348,6 +367,11 @@ def start_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(start_offset, encoding)
end

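# The code units offset at which this location starts, looked up through the
# given code units cache (any object whose #[] maps a byte offset to a code
# units offset).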
def cache_start_code_units_offset(code_units_cache)
  # The cache is keyed by byte offset, so pass the starting byte offset.
  byte_offset = start_offset
  code_units_cache[byte_offset]
end

# The byte offset from the beginning of the source where this location ends.
def end_offset
start_offset + length
Expand All @@ -364,6 +388,11 @@ def end_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(end_offset, encoding)
end

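# The code units offset at which this location ends, looked up through the
# given code units cache (any object whose #[] maps a byte offset to a code
# units offset).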
def cache_end_code_units_offset(code_units_cache)
  # The cache is keyed by byte offset, so pass the ending byte offset.
  byte_offset = end_offset
  code_units_cache[byte_offset]
end

# The line number where this location starts.
def start_line
source.line(start_offset)
Expand Down Expand Up @@ -398,6 +427,11 @@ def start_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(start_offset, encoding)
end

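# The column in code units at which this location starts, computed with the
# given code units cache as the difference between the start offset and the
# start of its line.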
def cache_start_code_units_column(code_units_cache)
  # Look up the location itself first, then the beginning of its line; the
  # column is the distance between the two in code units.
  position = code_units_cache[start_offset]
  line_begin = code_units_cache[source.line_start(start_offset)]
  position - line_begin
end

# The column number in bytes where this location ends from the start of the
# line.
def end_column
Expand All @@ -416,6 +450,11 @@ def end_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(end_offset, encoding)
end

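# The column in code units at which this location ends, computed with the
# given code units cache as the difference between the end offset and the
# start of its line.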
def cache_end_code_units_column(code_units_cache)
  # Look up the end of the location first, then the beginning of its line;
  # the column is the distance between the two in code units.
  position = code_units_cache[end_offset]
  line_begin = code_units_cache[source.line_start(end_offset)]
  position - line_begin
end

# Implement the hash pattern matching interface for Location.
def deconstruct_keys(keys)
{ start_offset: start_offset, end_offset: end_offset }
Expand Down
51 changes: 26 additions & 25 deletions test/prism/ruby/location_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,45 +61,46 @@ def test_character_offsets
end

def test_code_units
program = Prism.parse("😀 + 😀\n😍 ||= 😍").value
result = Prism.parse("😀 + 😀\n😍 ||= 😍")
program = result.value

# first 😀
location = program.statements.body.first.receiver.location

assert_equal 0, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 0, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_offset(Encoding::UTF_32LE)
assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 1, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 2, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_offset(Encoding::UTF_32LE)
assert_equal 1, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 2, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 1, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 0, location.start_code_units_column(Encoding::UTF_8)
assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE)
assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
assert_equal 1, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 2, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 1, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

# second 😀
location = program.statements.body.first.arguments.arguments.first.location

assert_equal 4, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 5, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 4, location.start_code_units_offset(Encoding::UTF_32LE)
assert_equal 4, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 5, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 4, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 5, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 7, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 5, location.end_code_units_offset(Encoding::UTF_32LE)
assert_equal 5, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 7, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 5, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 4, location.start_code_units_column(Encoding::UTF_8)
assert_equal 5, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 4, location.start_code_units_column(Encoding::UTF_32LE)
assert_equal 4, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 5, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 4, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 5, location.end_code_units_column(Encoding::UTF_8)
assert_equal 7, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 5, location.end_code_units_column(Encoding::UTF_32LE)
assert_equal 5, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 7, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 5, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

# first 😍
location = program.statements.body.last.name_loc
Expand Down

0 comments on commit f2268a4

Please sign in to comment.