Commit

Merge pull request #137 from WorksApplications/feature/lazy-sudachi-analysis

Use `lazyTokenizeSentences`
mh-northlander authored Jul 2, 2024
2 parents 8ec03b2 + a84f94a commit f9768ad
Showing 3 changed files with 11 additions and 75 deletions.
spi/build.gradle (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@ version = properties["pluginVersion"]
 description = "Plugin interface for Sudachi search engine integrations (ElasticSearch and OpenSearch)"

 dependencies {
-    api('com.worksap.nlp:sudachi:0.7.3')
+    api('com.worksap.nlp:sudachi:0.7.4')
 }

 spotless {
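This one-line bump is what enables the rest of the commit: the dependency moves from Sudachi 0.7.3 to 0.7.4, which is where `lazyTokenizeSentences` comes from. Below is a minimal sketch of the eager-versus-lazy difference, assuming a dictionary resolvable from default settings and the iterator-of-sentences shape implied by the diffs that follow:

```kotlin
import com.worksap.nlp.sudachi.DictionaryFactory
import com.worksap.nlp.sudachi.Tokenizer
import java.io.StringReader

fun main() {
  // Assumes sudachi.json and a system dictionary are resolvable from the
  // default settings; a real setup must configure these explicitly.
  val dictionary = DictionaryFactory().create()
  val tokenizer = dictionary.create()
  val input = StringReader("すもももももももものうち。隣の客はよく柿食う客だ。")

  // Before 0.7.4 (eager): tokenizeSentences consumes the whole Reader up
  // front, which is why callers had to chunk huge inputs themselves:
  //   val sentences = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, input).iterator()

  // With 0.7.4 (lazy): the iterator pulls sentences from the Reader on
  // demand, keeping memory use bounded regardless of input size.
  val sentences = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, input)
  for (sentence in sentences) {
    sentence.forEach { m -> println("${m.surface()}\t${m.partOfSpeech()}") }
  }
}
```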
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Works Applications Co., Ltd.
+ * Copyright (c) 2022-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ class NonCachedAnalysis(tokenizer: Tokenizer, input: Reader, splitMode: SplitMode
     override fun next() = throw IllegalStateException()
   }

-  private val sentenceIterator = tokenizer.tokenizeSentences(splitMode, input).iterator()
+  private val sentenceIterator = tokenizer.lazyTokenizeSentences(splitMode, input)
   private var morphemeIterator: Iterator<Morpheme> = EmptyIterator
   private var currentLength = 0

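Because `lazyTokenizeSentences` already returns an iterator of sentences, `NonCachedAnalysis` no longer needs `.iterator()` and only has to flatten the stream into individual morphemes, which is what its `sentenceIterator`/`morphemeIterator` pair does. A minimal stand-alone sketch of that flattening pattern (a hypothetical helper class, not the project's actual implementation):

```kotlin
import com.worksap.nlp.sudachi.Morpheme

// Flattens an iterator of sentences (each a List<Morpheme>) into a single
// stream of morphemes, pulling the next sentence only when needed.
class FlatMorphemeIterator(
    private val sentences: Iterator<List<Morpheme>>,
) : Iterator<Morpheme> {
  private var current: Iterator<Morpheme> = emptyList<Morpheme>().iterator()

  override fun hasNext(): Boolean {
    // Advance past empty sentences until a morpheme is available.
    while (!current.hasNext() && sentences.hasNext()) {
      current = sentences.next().iterator()
    }
    return current.hasNext()
  }

  override fun next(): Morpheme {
    if (!hasNext()) throw NoSuchElementException()
    return current.next()
  }
}
```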
@@ -19,20 +19,13 @@ package com.worksap.nlp.lucene.sudachi.ja
 import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute
 import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute
 import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory
-import com.worksap.nlp.sudachi.IOTools
-import java.io.StringReader
-import java.nio.CharBuffer
 import org.apache.lucene.analysis.Tokenizer
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute
 import org.apache.lucene.util.AttributeFactory

-private val MAX_CHUNK_SIZE = 1 * 1024 * 1024
-private val INITIAL_CHUNK_SIZE = 32 * 1024
-private val CHUNK_GROW_SCALE = 8
-
 class SudachiTokenizer(
     private val tokenizer: CachingTokenizer,
     private val discardPunctuation: Boolean,
@@ -48,90 +41,33 @@ class SudachiTokenizer(
     addAttribute<SudachiAttribute> { it.dictionary = tokenizer.dictionary }
   }

-  // To cope with huge text, input data split into chunks for tokenize.
-  // Initial chunk size is INITIAL_CHUNK_SIZE.
-  // But it grows, if input data is large (up to MAX_CHUNK_SIZE).
-  // TODO: Should split with meaningful delimitations instead of fixed size.
-  private var chunk: CharBuffer = CharBuffer.allocate(INITIAL_CHUNK_SIZE)
   private var iterator: MorphemeIterator = MorphemeIterator.EMPTY
-  private var offset = 0 // pos from the beginning to current chunk.
-  private var endOffset = 0

   override fun reset() {
     super.reset()
-    iterator = MorphemeIterator.EMPTY
-    offset = 0
-    endOffset = 0
-  }
-
-  private fun growChunk() {
-    val newChunkSize = kotlin.math.min(chunk.capacity() * CHUNK_GROW_SCALE, MAX_CHUNK_SIZE)
-    val newChunk = CharBuffer.allocate(newChunkSize)
-    chunk.flip()
-    newChunk.put(chunk)
-
-    chunk = newChunk
-  }
-
-  private fun read(): Boolean {
-    chunk.clear()
-
-    while (true) {
-      val nread = IOTools.readAsMuchAsCan(input, chunk)
-      if (nread < 0) {
-        return chunk.position() > 0
-      }
-
-      // check: chunk reads all data from Reader. No remaining data in Reader.
-      if (chunk.hasRemaining()) {
-        return true
-      }
-
-      // check: chunk is already max size
-      if (chunk.capacity() == MAX_CHUNK_SIZE) {
-        return true
-      }
-
-      growChunk()
+    var iter = tokenizer.tokenize(input)
+    if (discardPunctuation) {
+      iter = NonPunctuationMorphemes(iter)
     }
+    iterator = iter
   }

   override fun incrementToken(): Boolean {
     clearAttributes()

-    var m = iterator.next()
-    if (m == null) {
-      if (!read()) {
-        return false
-      }
-      chunk.flip()
-
-      var iter = tokenizer.tokenize(StringReader(chunk.toString()))
-      if (discardPunctuation) {
-        iter = NonPunctuationMorphemes(iter)
-      }
-      iterator = iter
-      offset = endOffset
-
-      m = iterator.next() ?: return false
-    }
+    var m = iterator.next() ?: return false

     morphemeAtt.setMorpheme(m)
     posLenAtt.positionLength = 1
     posIncAtt.positionIncrement = 1
-    val baseOffset = iterator.baseOffset // offset in this chunk
-    offsetAtt.setOffset(
-        correctOffset(offset + baseOffset + m.begin()),
-        correctOffset(offset + baseOffset + m.end()))
-    endOffset = offset + baseOffset + m.end()

+    val baseOffset = iterator.baseOffset
+    offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
     termAtt.setEmpty().append(m.surface())
     return true
   }

   override fun end() {
     super.end()
-    val lastOffset = correctOffset(offset + iterator.baseOffset)
+    val lastOffset = correctOffset(iterator.baseOffset)
     offsetAtt.setOffset(lastOffset, lastOffset)
     iterator = MorphemeIterator.EMPTY
   }
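With the chunking machinery deleted, buffering responsibility moves into Sudachi's lazy sentence iterator, and the Lucene side shrinks to the plain `TokenStream` contract with a single running `baseOffset`. A sketch of how a consumer drives the tokenizer through that contract (standard Lucene API; the surrounding analyzer wiring is assumed):

```kotlin
import java.io.StringReader
import org.apache.lucene.analysis.Tokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute

// Walks any Tokenizer (e.g. SudachiTokenizer) through the standard
// reset / incrementToken / end / close life cycle, printing each token.
fun printTokens(tokenizer: Tokenizer, text: String) {
  tokenizer.setReader(StringReader(text))
  val term = tokenizer.getAttribute(CharTermAttribute::class.java)
  val offsets = tokenizer.getAttribute(OffsetAttribute::class.java)

  tokenizer.reset() // after this commit, tokenization is prepared here, lazily
  while (tokenizer.incrementToken()) {
    println("$term [${offsets.startOffset()}, ${offsets.endOffset()})")
  }
  tokenizer.end() // the final offset now comes straight from iterator.baseOffset
  tokenizer.close()
}
```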