Commit

Merge pull request #137 from WorksApplications/feature/lazy-sudachi-analysis

Use `lazyTokenizeSentences`
mh-northlander authored Jul 2, 2024
2 parents 8ec03b2 + a84f94a commit f9768ad
Showing 3 changed files with 11 additions and 75 deletions.
spi/build.gradle (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@ version = properties["pluginVersion"]
 description = "Plugin interface for Sudachi search engine integrations (ElasticSearch and OpenSearch)"

 dependencies {
-    api('com.worksap.nlp:sudachi:0.7.3')
+    api('com.worksap.nlp:sudachi:0.7.4')
 }

 spotless {
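This one-line bump is what enables the rest of the commit: the dependency moves from Sudachi 0.7.3 to 0.7.4, which is where `lazyTokenizeSentences` comes from. Below is a minimal sketch of the eager-versus-lazy difference, assuming a dictionary resolvable from default settings and the iterator-of-sentences shape implied by the diffs that follow:

```kotlin
import com.worksap.nlp.sudachi.DictionaryFactory
import com.worksap.nlp.sudachi.Tokenizer
import java.io.StringReader

fun main() {
  // Assumes sudachi.json and a system dictionary are resolvable from the
  // default settings; a real setup must configure these explicitly.
  val dictionary = DictionaryFactory().create()
  val tokenizer = dictionary.create()
  val input = StringReader("すもももももももものうち。隣の客はよく柿食う客だ。")

  // Before 0.7.4 (eager): tokenizeSentences consumes the whole Reader up
  // front, which is why callers had to chunk huge inputs themselves:
  //   val sentences = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, input).iterator()

  // With 0.7.4 (lazy): the iterator pulls sentences from the Reader on
  // demand, keeping memory use bounded regardless of input size.
  val sentences = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, input)
  for (sentence in sentences) {
    sentence.forEach { m -> println("${m.surface()}\t${m.partOfSpeech()}") }
  }
}
```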
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Works Applications Co., Ltd.
+ * Copyright (c) 2022-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ class NonCachedAnalysis(tokenizer: Tokenizer, input: Reader, splitMode: SplitMode
     override fun next() = throw IllegalStateException()
   }

-  private val sentenceIterator = tokenizer.tokenizeSentences(splitMode, input).iterator()
+  private val sentenceIterator = tokenizer.lazyTokenizeSentences(splitMode, input)
   private var morphemeIterator: Iterator<Morpheme> = EmptyIterator
   private var currentLength = 0

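Because `lazyTokenizeSentences` already returns an iterator of sentences, `NonCachedAnalysis` no longer needs `.iterator()` and only has to flatten the stream into individual morphemes, which is what its `sentenceIterator`/`morphemeIterator` pair does. A minimal stand-alone sketch of that flattening pattern (a hypothetical helper class, not the project's actual implementation):

```kotlin
import com.worksap.nlp.sudachi.Morpheme

// Flattens an iterator of sentences (each a List<Morpheme>) into a single
// stream of morphemes, pulling the next sentence only when needed.
class FlatMorphemeIterator(
    private val sentences: Iterator<List<Morpheme>>,
) : Iterator<Morpheme> {
  private var current: Iterator<Morpheme> = emptyList<Morpheme>().iterator()

  override fun hasNext(): Boolean {
    // Advance past empty sentences until a morpheme is available.
    while (!current.hasNext() && sentences.hasNext()) {
      current = sentences.next().iterator()
    }
    return current.hasNext()
  }

  override fun next(): Morpheme {
    if (!hasNext()) throw NoSuchElementException()
    return current.next()
  }
}
```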
@@ -19,20 +19,13 @@ package com.worksap.nlp.lucene.sudachi.ja
 import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute
 import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute
 import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory
-import com.worksap.nlp.sudachi.IOTools
-import java.io.StringReader
-import java.nio.CharBuffer
 import org.apache.lucene.analysis.Tokenizer
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute
 import org.apache.lucene.util.AttributeFactory

-private val MAX_CHUNK_SIZE = 1 * 1024 * 1024
-private val INITIAL_CHUNK_SIZE = 32 * 1024
-private val CHUNK_GROW_SCALE = 8
-
 class SudachiTokenizer(
     private val tokenizer: CachingTokenizer,
     private val discardPunctuation: Boolean,
@@ -48,90 +41,33 @@ class SudachiTokenizer(
     addAttribute<SudachiAttribute> { it.dictionary = tokenizer.dictionary }
   }

-  // To cope with huge text, input data split into chunks for tokenize.
-  // Initial chunk size is INITIAL_CHUNK_SIZE.
-  // But it grows, if input data is large (up to MAX_CHUNK_SIZE).
-  // TODO: Should split with meaningful delimitations instead of fixed size.
-  private var chunk: CharBuffer = CharBuffer.allocate(INITIAL_CHUNK_SIZE)
   private var iterator: MorphemeIterator = MorphemeIterator.EMPTY
-  private var offset = 0 // pos from the beginning to current chunk.
-  private var endOffset = 0

   override fun reset() {
     super.reset()
-    iterator = MorphemeIterator.EMPTY
-    offset = 0
-    endOffset = 0
-  }
-
-  private fun growChunk() {
-    val newChunkSize = kotlin.math.min(chunk.capacity() * CHUNK_GROW_SCALE, MAX_CHUNK_SIZE)
-    val newChunk = CharBuffer.allocate(newChunkSize)
-    chunk.flip()
-    newChunk.put(chunk)
-
-    chunk = newChunk
-  }
-
-  private fun read(): Boolean {
-    chunk.clear()
-
-    while (true) {
-      val nread = IOTools.readAsMuchAsCan(input, chunk)
-      if (nread < 0) {
-        return chunk.position() > 0
-      }
-
-      // check: chunk reads all data from Reader. No remaining data in Reader.
-      if (chunk.hasRemaining()) {
-        return true
-      }
-
-      // check: chunk is already max size
-      if (chunk.capacity() == MAX_CHUNK_SIZE) {
-        return true
-      }
-
-      growChunk()
+    var iter = tokenizer.tokenize(input)
+    if (discardPunctuation) {
+      iter = NonPunctuationMorphemes(iter)
     }
+    iterator = iter
   }

   override fun incrementToken(): Boolean {
     clearAttributes()

-    var m = iterator.next()
-    if (m == null) {
-      if (!read()) {
-        return false
-      }
-      chunk.flip()
-
-      var iter = tokenizer.tokenize(StringReader(chunk.toString()))
-      if (discardPunctuation) {
-        iter = NonPunctuationMorphemes(iter)
-      }
-      iterator = iter
-      offset = endOffset
-
-      m = iterator.next() ?: return false
-    }
+    var m = iterator.next() ?: return false

     morphemeAtt.setMorpheme(m)
     posLenAtt.positionLength = 1
     posIncAtt.positionIncrement = 1
-    val baseOffset = iterator.baseOffset // offset in this chunk
-    offsetAtt.setOffset(
-        correctOffset(offset + baseOffset + m.begin()),
-        correctOffset(offset + baseOffset + m.end()))
-    endOffset = offset + baseOffset + m.end()

+    val baseOffset = iterator.baseOffset
+    offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
     termAtt.setEmpty().append(m.surface())
     return true
   }

   override fun end() {
     super.end()
-    val lastOffset = correctOffset(offset + iterator.baseOffset)
+    val lastOffset = correctOffset(iterator.baseOffset)
     offsetAtt.setOffset(lastOffset, lastOffset)
     iterator = MorphemeIterator.EMPTY
   }
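With the chunking machinery deleted, buffering responsibility moves into Sudachi's lazy sentence iterator, and the Lucene side shrinks to the plain `TokenStream` contract with a single running `baseOffset`. A sketch of how a consumer drives the tokenizer through that contract (standard Lucene API; the surrounding analyzer wiring is assumed):

```kotlin
import java.io.StringReader
import org.apache.lucene.analysis.Tokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute

// Walks any Tokenizer (e.g. SudachiTokenizer) through the standard
// reset / incrementToken / end / close life cycle, printing each token.
fun printTokens(tokenizer: Tokenizer, text: String) {
  tokenizer.setReader(StringReader(text))
  val term = tokenizer.getAttribute(CharTermAttribute::class.java)
  val offsets = tokenizer.getAttribute(OffsetAttribute::class.java)

  tokenizer.reset() // after this commit, tokenization is prepared here, lazily
  while (tokenizer.incrementToken()) {
    println("$term [${offsets.startOffset()}, ${offsets.endOffset()})")
  }
  tokenizer.end() // the final offset now comes straight from iterator.baseOffset
  tokenizer.close()
}
```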