
Commit 36cedbf

Chunk starts with a small size (32K) and grows when necessary, up to a maximum of 1M.
kenmasumitsu committed Jun 14, 2024
1 parent e6b9676 commit 36cedbf
Showing 1 changed file with 46 additions and 9 deletions.
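
In outline, the commit starts each chunk at 32 KiB and multiplies its capacity by 8 until it reaches the 1 MiB cap, giving a schedule of 32K -> 256K -> 1M (the next step, 2M, is clamped). A minimal standalone sketch of that arithmetic, mirroring the constants added below (illustrative only, not part of the commit):

// Sketch: the chunk-capacity schedule implied by the new constants.
fun main() {
    val maxChunkSize = 1 * 1024 * 1024
    val initialChunkSize = 32 * 1024
    val growScale = 8

    var size = initialChunkSize
    val schedule = mutableListOf(size)
    while (size < maxChunkSize) {
        size = minOf(size * growScale, maxChunkSize)
        schedule.add(size)
    }
    println(schedule.joinToString(" -> ") { "${it / 1024}K" }) // 32K -> 256K -> 1024K
}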
@@ -29,6 +29,10 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute
import org.apache.lucene.util.AttributeFactory

private val MAX_CHUNK_SIZE = 1 * 1024 * 1024
private val INITIAL_CHUNK_SIZE = 32 * 1024
private val CHUNK_GROW_SCALE = 8

class SudachiTokenizer(
private val tokenizer: CachingTokenizer,
private val discardPunctuation: Boolean,
@@ -44,32 +48,65 @@ class SudachiTokenizer(
addAttribute<SudachiAttribute> { it.dictionary = tokenizer.dictionary }
}

// To cope with huge text, the input is split into chunks for tokenization.
// The initial chunk size is INITIAL_CHUNK_SIZE, but it grows (up to
// MAX_CHUNK_SIZE) when the input is large.
// TODO: Split at meaningful delimiters instead of a fixed size.
private var chunk: CharBuffer = CharBuffer.allocate(INITIAL_CHUNK_SIZE)
private var iterator: MorphemeIterator = MorphemeIterator.EMPTY
private var offset = 0
private var offset = 0 // offset of the current chunk from the beginning of the input.
private var endOffset = 0

override fun reset() {
super.reset()
iterator = MorphemeIterator.EMPTY
offset = 0 // pos from the beginning to current chunk.
offset = 0
endOffset = 0
}

private fun growChunk() {
val newChunkSize = kotlin.math.min(chunk.capacity() * CHUNK_GROW_SCALE, MAX_CHUNK_SIZE)
val newChunk = CharBuffer.allocate(newChunkSize)
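// flip() switches chunk from writing to reading (limit = position, position = 0),
// so put() below copies exactly the characters read so far into the larger buffer
// and leaves newChunk positioned to receive further reads after them.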
chunk.flip()
newChunk.put(chunk)

chunk = newChunk
}

private fun read(): Boolean {
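// clear() resets position to 0 and limit to capacity; the capacity itself
// (and hence any earlier growth) is kept, so each call reuses the grown buffer.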
chunk.clear()

while (true) {
val nread = IOTools.readAsMuchAsCan(input, chunk)
if (nread < 0) {
return chunk.position() > 0
}

// check: the chunk was not filled, so the Reader has no data remaining.
if (chunk.hasRemaining()) {
return true
}

// check: the chunk is already at its maximum size
if (chunk.capacity() == MAX_CHUNK_SIZE) {
return true
}

growChunk()
}
}

override fun incrementToken(): Boolean {
clearAttributes()

var m = iterator.next()
if (m == null) {
// To cope with huge text, it split into chunks (1 MB) for tokenize.
// TODO: Should split with meaningful delimitations instead of fixed size (1 MB).
val buffer = CharBuffer.allocate(1 * 1024 * 1024)
val nread = IOTools.readAsMuchAsCan(input, buffer)
if (nread < 0) {
if (!read()) {
return false
}
buffer.flip()
chunk.flip()
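// after the flip, chunk.toString() below yields exactly the characters read into the chunk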

var iter = tokenizer.tokenize(StringReader(buffer.toString()))
var iter = tokenizer.tokenize(StringReader(chunk.toString()))
if (discardPunctuation) {
iter = NonPunctuationMorphemes(iter)
}

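For context, a consumer pulls tokens from this Tokenizer through Lucene's standard TokenStream contract. A minimal sketch, assuming an already-constructed SudachiTokenizer; the helper below is illustrative, not part of the commit:

import java.io.StringReader
import org.apache.lucene.analysis.Tokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute

// Drives any Tokenizer (e.g. a SudachiTokenizer) over the given text and prints the terms.
fun printTokens(tokenizer: Tokenizer, text: String) {
    tokenizer.setReader(StringReader(text))
    val termAtt = tokenizer.addAttribute(CharTermAttribute::class.java)
    tokenizer.reset() // resets offset/endOffset and the morpheme iterator
    while (tokenizer.incrementToken()) { // reads and tokenizes chunk by chunk
        println(termAtt.toString())
    }
    tokenizer.end()
    tokenizer.close()
}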