From d0a94b75d3041b1e52d0c92bb909427b41115b97 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 26 Jun 2024 10:40:49 +0900 Subject: [PATCH 1/3] use lazy analysis --- .../com/worksap/nlp/lucene/sudachi/ja/MorphemeIterator.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeIterator.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeIterator.kt index 3e0e438e..cfe4461c 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeIterator.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeIterator.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Works Applications Co., Ltd. + * Copyright (c) 2022-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,7 @@ class NonCachedAnalysis(tokenizer: Tokenizer, input: Reader, splitMode: SplitMod override fun next() = throw IllegalStateException() } - private val sentenceIterator = tokenizer.tokenizeSentences(splitMode, input).iterator() + private val sentenceIterator = tokenizer.lazyTokenizeSentences(splitMode, input) private var morphemeIterator: Iterator = EmptyIterator private var currentLength = 0 From 6e30395213d9aed4e9bb4a48048cdab72e772abf Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 26 Jun 2024 10:42:00 +0900 Subject: [PATCH 2/3] revert input buffering --- .../nlp/lucene/sudachi/ja/SudachiTokenizer.kt | 80 ++----------------- 1 file changed, 8 insertions(+), 72 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt index 3422e435..6bfbb0eb 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt @@ -19,9 +19,6 @@ package com.worksap.nlp.lucene.sudachi.ja import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory -import com.worksap.nlp.sudachi.IOTools -import java.io.StringReader -import java.nio.CharBuffer import org.apache.lucene.analysis.Tokenizer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.analysis.tokenattributes.OffsetAttribute @@ -29,10 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute import org.apache.lucene.util.AttributeFactory -private val MAX_CHUNK_SIZE = 1 * 1024 * 1024 -private val INITIAL_CHUNK_SIZE = 32 * 1024 -private val CHUNK_GROW_SCALE = 8 - class SudachiTokenizer( private val tokenizer: CachingTokenizer, private val discardPunctuation: Boolean, @@ -48,90 +41,33 @@ class SudachiTokenizer( addAttribute { it.dictionary = tokenizer.dictionary } } - // To cope with huge text, input data split into chunks for tokenize. - // Initial chunk size is INITIAL_CHUNK_SIZE. - // But it grows, if input data is large (up to MAX_CHUNK_SIZE). - // TODO: Should split with meaningful delimitations instead of fixed size. - private var chunk: CharBuffer = CharBuffer.allocate(INITIAL_CHUNK_SIZE) private var iterator: MorphemeIterator = MorphemeIterator.EMPTY - private var offset = 0 // pos from the beginning to current chunk. - private var endOffset = 0 override fun reset() { super.reset() - iterator = MorphemeIterator.EMPTY - offset = 0 - endOffset = 0 - } - - private fun growChunk() { - val newChunkSize = kotlin.math.min(chunk.capacity() * CHUNK_GROW_SCALE, MAX_CHUNK_SIZE) - val newChunk = CharBuffer.allocate(newChunkSize) - chunk.flip() - newChunk.put(chunk) - - chunk = newChunk - } - - private fun read(): Boolean { - chunk.clear() - - while (true) { - val nread = IOTools.readAsMuchAsCan(input, chunk) - if (nread < 0) { - return chunk.position() > 0 - } - - // check: chunk reads all data from Reader. No remaining data in Reader. - if (chunk.hasRemaining()) { - return true - } - - // check: chunk is already max size - if (chunk.capacity() == MAX_CHUNK_SIZE) { - return true - } - - growChunk() + var iter = tokenizer.tokenize(input) + if (discardPunctuation) { + iter = NonPunctuationMorphemes(iter) } + iterator = iter } override fun incrementToken(): Boolean { clearAttributes() - - var m = iterator.next() - if (m == null) { - if (!read()) { - return false - } - chunk.flip() - - var iter = tokenizer.tokenize(StringReader(chunk.toString())) - if (discardPunctuation) { - iter = NonPunctuationMorphemes(iter) - } - iterator = iter - offset = endOffset - - m = iterator.next() ?: return false - } + var m = iterator.next() ?: return false morphemeAtt.setMorpheme(m) posLenAtt.positionLength = 1 posIncAtt.positionIncrement = 1 - val baseOffset = iterator.baseOffset // offset in this chunk - offsetAtt.setOffset( - correctOffset(offset + baseOffset + m.begin()), - correctOffset(offset + baseOffset + m.end())) - endOffset = offset + baseOffset + m.end() - + val baseOffset = iterator.baseOffset + offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end())) termAtt.setEmpty().append(m.surface()) return true } override fun end() { super.end() - val lastOffset = correctOffset(offset + iterator.baseOffset) + val lastOffset = correctOffset(iterator.baseOffset) offsetAtt.setOffset(lastOffset, lastOffset) iterator = MorphemeIterator.EMPTY } From a84f94a203468d90dc6a25f7b1a002d626251b79 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 2 Jul 2024 16:32:20 +0900 Subject: [PATCH 3/3] update depending sudachi version (-> v0.7.4) --- spi/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spi/build.gradle b/spi/build.gradle index 81f8ab5c..8c2620cd 100644 --- a/spi/build.gradle +++ b/spi/build.gradle @@ -12,7 +12,7 @@ version = properties["pluginVersion"] description = "Plugin interface for Sudachi search engine integrations (ElasticSearch and OpenSearch)" dependencies { - api('com.worksap.nlp:sudachi:0.7.3') + api('com.worksap.nlp:sudachi:0.7.4') } spotless {