From 558da29a5353d46e1f2a1ab81eef1116cc9a550f Mon Sep 17 00:00:00 2001 From: Ken Masumitsu Date: Wed, 12 Jun 2024 10:20:54 +0900 Subject: [PATCH] run ./gradlew spotlessApply --- .../nlp/lucene/sudachi/ja/SudachiTokenizer.kt | 6 +++--- .../lucene/sudachi/ja/TestSudachiTokenizer.kt | 20 +++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt index 7ae1430..cbc39c4 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt @@ -19,14 +19,14 @@ package com.worksap.nlp.lucene.sudachi.ja import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory +import java.io.StringReader +import java.nio.CharBuffer import org.apache.lucene.analysis.Tokenizer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute import org.apache.lucene.analysis.tokenattributes.OffsetAttribute import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute import org.apache.lucene.util.AttributeFactory -import java.io.StringReader -import java.nio.CharBuffer class SudachiTokenizer( private val tokenizer: CachingTokenizer, @@ -57,7 +57,7 @@ class SudachiTokenizer( if (m == null) { // Create 1MB chunk // TODO: Should split with meaningful delimitations. - val buffer = CharBuffer.allocate(1*1024*1024) + val buffer = CharBuffer.allocate(1 * 1024 * 1024) val nread = input.read(buffer) if (nread < 0) { return false diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.kt index d069e8d..1fd6fd4 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.kt +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Works Applications Co., Ltd. + * Copyright (c) 2017-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ import com.worksap.nlp.sudachi.Config import com.worksap.nlp.sudachi.PathAnchor import com.worksap.nlp.sudachi.Tokenizer.SplitMode import com.worksap.nlp.test.TestDictionary +import java.io.StringReader import org.apache.lucene.analysis.charfilter.MappingCharFilter import org.apache.lucene.analysis.charfilter.NormalizeCharMap import org.apache.lucene.analysis.tokenattributes.CharTermAttribute @@ -33,8 +34,6 @@ import org.junit.Assert import org.junit.Before import org.junit.Rule import org.junit.Test -import java.io.StringReader - // Test of character segmentation using incrementToken(tokenizer) open class TestSudachiTokenizer : BaseTokenStreamTestCase() { @@ -289,23 +288,22 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() { assertNotEquals(tokenizerA.hashCode().toLong(), tokenizerB.hashCode().toLong()) } - @Test fun hugeCharactersByDefaultMode() { val tokenizer = makeTokenizer(SplitMode.C) - //tokenizer.setReader(StringReader("東京都に行った。")) - val charLength = 10*1024*1024 + val charLength = 10 * 1024 * 1024 tokenizer.setReader(StringReader("あ".repeat(charLength))) - val charTermAttribute = tokenizer.addAttribute( - CharTermAttribute::class.java, - ) + val charTermAttribute = + tokenizer.addAttribute( + CharTermAttribute::class.java, + ) tokenizer.reset() var totalLength = 0 - while(tokenizer.incrementToken()) { - //println(charTermAttribute.toString()) + while (tokenizer.incrementToken()) { + // println(charTermAttribute.toString()) totalLength += charTermAttribute.length }