Skip to content

Commit

Permalink
tmp
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Oct 9, 2024
1 parent 5e9eaef commit c59b804
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 29 deletions.
42 changes: 38 additions & 4 deletions lib/prism/parse_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def initialize(source, start_line = 1, offsets = [])
@source = source
@start_line = start_line # set after parsing is done
@offsets = offsets # set after parsing is done
@code_units_caches = {}
end

# Returns the encoding of the source code, which is set by parameters to the
Expand Down Expand Up @@ -105,8 +104,25 @@ def character_column(byte_offset)
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
#
# We purposefully replace invalid and undefined characters with replacement
# characters in this conversion. This happens for two reasons. First, it's
# possible that the given byte offset will not occur on a character
# boundary. Second, it's possible that the source code will contain a
# character that has no equivalent in the given encoding.
def code_units_offset(byte_offset, encoding)
  # Only the bytes that precede the offset matter. String#byteslice returns
  # nil when the requested length is negative, so fail loudly here instead of
  # calling #encode on nil.
  prefix = source.byteslice(0, byte_offset)
  raise "invalid byte offset: #{byte_offset}" unless prefix

  # Replace rather than raise while transcoding: the slice may end in the
  # middle of a character, and the source may contain characters that the
  # target encoding cannot represent.
  transcoded = prefix.encode(encoding, invalid: :replace, undef: :replace)

  if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
    # UTF-16 code units are two bytes wide, so a character encoded as a
    # surrogate pair counts as two units.
    transcoded.bytesize / 2
  else
    # For every other encoding the number of characters is used.
    transcoded.length
  end
end

# Builds a new CodeUnitsCache over this source for the given encoding. The
# returned cache is indexed by byte offset (cache[byte_offset]) and is meant
# for computing code unit offsets from byte offsets.
#
# Nothing is memoized here: every call constructs a fresh cache, so callers
# that perform many lookups should hold on to the returned object and reuse
# it.
def code_units_cache(encoding)
CodeUnitsCache.new(source, encoding)
end

# A cache that can be used to quickly compute code unit offsets from byte
Expand Down Expand Up @@ -179,8 +195,6 @@ def [](byte_offset)
end
end

private_constant :CodeUnitsCache

# Returns the column number in code units for the given encoding for the
# given byte offset.
def code_units_column(byte_offset, encoding)
Expand Down Expand Up @@ -348,6 +362,11 @@ def start_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(start_offset, encoding)
end

# The offset in code units from the beginning of the source where this
# location starts, looked up in the given cache. The cache is indexed by byte
# offset, e.g. the object returned by the source's code_units_cache method.
def cache_start_code_units_offset(code_units_cache)
  byte_offset = start_offset
  code_units_cache[byte_offset]
end

# The byte offset from the beginning of the source where this location ends.
def end_offset
start_offset + length
Expand All @@ -364,6 +383,11 @@ def end_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(end_offset, encoding)
end

# The offset in code units from the beginning of the source where this
# location ends, looked up in the given cache. The cache is indexed by byte
# offset, e.g. the object returned by the source's code_units_cache method.
def cache_end_code_units_offset(code_units_cache)
  byte_offset = end_offset
  code_units_cache[byte_offset]
end

# The line number where this location starts.
def start_line
source.line(start_offset)
Expand Down Expand Up @@ -398,6 +422,11 @@ def start_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(start_offset, encoding)
end

# The column in code units where this location starts, computed with the
# given cache: the cached code unit offset of the start of this location
# minus the cached code unit offset of source.line_start(start_offset).
def cache_start_code_units_column(code_units_cache)
  units_at_start = code_units_cache[start_offset]
  units_at_line_start = code_units_cache[source.line_start(start_offset)]

  units_at_start - units_at_line_start
end

# The column number in bytes where this location ends from the start of the
# line.
def end_column
Expand All @@ -416,6 +445,11 @@ def end_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(end_offset, encoding)
end

# The column in code units where this location ends, computed with the given
# cache: the cached code unit offset of the end of this location minus the
# cached code unit offset of source.line_start(end_offset).
def cache_end_code_units_column(code_units_cache)
  units_at_end = code_units_cache[end_offset]
  units_at_line_start = code_units_cache[source.line_start(end_offset)]

  units_at_end - units_at_line_start
end

# Implement the hash pattern matching interface for Location.
def deconstruct_keys(keys)
{ start_offset: start_offset, end_offset: end_offset }
Expand Down
51 changes: 26 additions & 25 deletions test/prism/ruby/location_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,45 +61,46 @@ def test_character_offsets
end

def test_code_units
program = Prism.parse("😀 + 😀\n😍 ||= 😍").value
result = Prism.parse("😀 + 😀\n😍 ||= 😍")
program = result.value

# first 😀
location = program.statements.body.first.receiver.location

assert_equal 0, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 0, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_offset(Encoding::UTF_32LE)
assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 1, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 2, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_offset(Encoding::UTF_32LE)
assert_equal 1, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 2, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 1, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 0, location.start_code_units_column(Encoding::UTF_8)
assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE)
assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
assert_equal 1, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 2, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 1, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

# second 😀
location = program.statements.body.first.arguments.arguments.first.location

assert_equal 4, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 5, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 4, location.start_code_units_offset(Encoding::UTF_32LE)
assert_equal 4, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 5, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 4, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 5, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 7, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 5, location.end_code_units_offset(Encoding::UTF_32LE)
assert_equal 5, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 7, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 5, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 4, location.start_code_units_column(Encoding::UTF_8)
assert_equal 5, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 4, location.start_code_units_column(Encoding::UTF_32LE)
assert_equal 4, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 5, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 4, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

assert_equal 5, location.end_code_units_column(Encoding::UTF_8)
assert_equal 7, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 5, location.end_code_units_column(Encoding::UTF_32LE)
assert_equal 5, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_8))
assert_equal 7, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE))
assert_equal 5, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE))

# first 😍
location = program.statements.body.last.name_loc
Expand Down

0 comments on commit c59b804

Please sign in to comment.