Skip to content

Commit

Permalink
run ./gradlew spotlessApply
Browse files: browse the repository at this point in the history
  • Loading branch information
kenmasumitsu committed Jun 12, 2024
1 parent a5665e1 commit 558da29
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ package com.worksap.nlp.lucene.sudachi.ja
import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute
import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute
import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory
import java.io.StringReader
import java.nio.CharBuffer
import org.apache.lucene.analysis.Tokenizer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute
import org.apache.lucene.util.AttributeFactory
import java.io.StringReader
import java.nio.CharBuffer

class SudachiTokenizer(
private val tokenizer: CachingTokenizer,
Expand Down Expand Up @@ -57,7 +57,7 @@ class SudachiTokenizer(
if (m == null) {
// Create 1MB chunk
// TODO: Should split the input at meaningful delimiters.
val buffer = CharBuffer.allocate(1*1024*1024)
val buffer = CharBuffer.allocate(1 * 1024 * 1024)
val nread = input.read(buffer)
if (nread < 0) {
return false
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2023 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,7 @@ import com.worksap.nlp.sudachi.Config
import com.worksap.nlp.sudachi.PathAnchor
import com.worksap.nlp.sudachi.Tokenizer.SplitMode
import com.worksap.nlp.test.TestDictionary
import java.io.StringReader
import org.apache.lucene.analysis.charfilter.MappingCharFilter
import org.apache.lucene.analysis.charfilter.NormalizeCharMap
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
Expand All @@ -33,8 +34,6 @@ import org.junit.Assert
import org.junit.Before
import org.junit.Rule
import org.junit.Test
import java.io.StringReader


// Test of character segmentation using incrementToken(tokenizer)
open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
Expand Down Expand Up @@ -289,23 +288,22 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
assertNotEquals(tokenizerA.hashCode().toLong(), tokenizerB.hashCode().toLong())
}


@Test
fun hugeCharactersByDefaultMode() {
val tokenizer = makeTokenizer(SplitMode.C)
//tokenizer.setReader(StringReader("東京都に行った。"))

val charLength = 10*1024*1024
val charLength = 10 * 1024 * 1024
tokenizer.setReader(StringReader("".repeat(charLength)))

val charTermAttribute = tokenizer.addAttribute(
CharTermAttribute::class.java,
)
val charTermAttribute =
tokenizer.addAttribute(
CharTermAttribute::class.java,
)
tokenizer.reset()

var totalLength = 0
while(tokenizer.incrementToken()) {
//println(charTermAttribute.toString())
while (tokenizer.incrementToken()) {
// println(charTermAttribute.toString())
totalLength += charTermAttribute.length
}

Expand Down

0 comments on commit 558da29

Please sign in to comment.