Prism::Source::CodeUnitsCache

Calculating code unit offsets for a source can be very expensive, especially when the source is large. This commit introduces a new class that wraps the source and the desired encoding in a cache that reuses previously computed offsets, which performs considerably better. There are still two known problems with this approach: byte offsets that do not fall on character boundaries can produce incorrect counts, and the cache is unbounded. Both may be addressed in subsequent commits.
ruby · Oct 9, 2024 · 5e9eaef · 5e9eaef
1 parent d6e9b8d
commit 5e9eaef
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 11 deletions.
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
@@ -46,6 +46,7 @@ def initialize(source, start_line = 1, offsets = [])
       @source = source
       @start_line = start_line # set after parsing is done
       @offsets = offsets # set after parsing is done
+      @code_units_caches = {}
     end
 
     # Returns the encoding of the source code, which is set by parameters to the
@@ -104,22 +105,82 @@ def character_column(byte_offset)
     # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
     # concept of code units that differs from the number of characters in other
     # encodings, it is not captured here.
-    #
-    # We purposefully replace invalid and undefined characters with replacement
-    # characters in this conversion. This happens for two reasons. First, it's
-    # possible that the given byte offset will not occur on a character
-    # boundary. Second, it's possible that the source code will contain a
-    # character that has no equivalent in the given encoding.
     def code_units_offset(byte_offset, encoding)
-      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
+      (@code_units_caches[encoding] ||= CodeUnitsCache.new(source, encoding))[byte_offset]
+    end
 
-      if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
-        byteslice.bytesize / 2
-      else
-        byteslice.length
+    # A cache that can be used to quickly compute code unit offsets from byte
+    # offsets. It purposefully provides only a single #[] method to access the
+    # cache in order to minimize surface area.
+    #
+    # Note that there are some known issues here that may or may not be
+    # addressed in the future:
+    #
+    # * The first is that results can be wrong when a requested byte offset is
+    #   not on a character boundary. Later lookups build on cached values, so
+    #   subsequent computations can be off by one or more code units.
+    # * The second is that this cache is currently unbounded. In theory we could
+    #   introduce some kind of LRU cache to limit the number of entries, but
+    #   this has not yet been implemented.
+    #
+    class CodeUnitsCache
+      class UTF16Counter # :nodoc:
+        def initialize(source, encoding)
+          @source = source
+          @encoding = encoding
+        end
+
+        def count(byte_offset, byte_length) # bytesize / 2 because a UTF-16 code unit is two bytes
+          @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
+        end
+      end
+
+      class LengthCounter # :nodoc:
+        def initialize(source, encoding)
+          @source = source
+          @encoding = encoding
+        end
+
+        def count(byte_offset, byte_length) # character count of the byte range after transcoding
+          @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
+        end
+      end
+
+      private_constant :UTF16Counter, :LengthCounter
+
+      # Initialize a new cache with the given source and encoding.
+      def initialize(source, encoding)
+        @source = source
+        @counter =
+          if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
+            UTF16Counter.new(source, encoding)
+          else
+            LengthCounter.new(source, encoding)
+          end
+
+        @cache = {} # byte offset => code units offset
+        @offsets = [] # byte offsets requested so far, kept in ascending order
+      end
+
+      # Retrieve (and memoize) the code units offset for the given byte offset.
+      def [](byte_offset)
+        @cache[byte_offset] ||=
+          if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
+            @offsets << byte_offset # nothing larger is recorded: append keeps the order
+            @counter.count(0, byte_offset) # counted from the start of the source
+          elsif index == 0
+            @offsets.unshift(byte_offset) # smaller than every recorded offset: prepend
+            @counter.count(0, byte_offset)
+          else
+            @offsets.insert(index, byte_offset) # insert in place to keep the order
+            offset = @offsets[index - 1] # closest recorded offset below byte_offset
+            @cache[offset] + @counter.count(offset, byte_offset - offset) # only the gap is counted
+          end
       end
     end
 
+    private_constant :CodeUnitsCache
+
     # Returns the column number in code units for the given encoding for the
     # given byte offset.
     def code_units_column(byte_offset, encoding)

diff --git a/sig/prism/_private/parse_result.rbs b/sig/prism/_private/parse_result.rbs
@@ -1,5 +1,20 @@
 module Prism
   class Source
+    class CodeUnitsCache
+      class UTF16Counter # counter selected for UTF-16LE and UTF-16BE
+        def initialize: (String source, Encoding encoding) -> void
+        def count: (Integer byte_offset, Integer byte_length) -> Integer # code units in the byte range
+      end
+
+      class LengthCounter # counter selected for every other encoding
+        def initialize: (String source, Encoding encoding) -> void
+        def count: (Integer byte_offset, Integer byte_length) -> Integer # code units in the byte range
+      end
+
+      def initialize: (String source, Encoding encoding) -> void
+      def []: (Integer byte_offset) -> Integer # byte offset in, code units offset out
+    end
+
     private
 
     def find_line: (Integer) -> Integer