diff --git a/README.md b/README.md index 7ada288..2fd6a2a 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,7 @@ With the `stoptags`, you can filter out the result in any of these forward match "position": 0 }, { - "token": "美味しい", + "token": "おいしい", "start_offset": 3, "end_offset": 7, "type": "word", diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt index 796e6c2..56bfa0c 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Works Applications Co., Ltd. + * Copyright (c) 2022-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,10 +33,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute * [org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter] or a custom [TokenFilter] that * sets the [KeywordAttribute] before this [TokenStream]. * - * Values of [MorphemeAttribute] are used to produce the + * Values of [MorphemeAttribute] are used to produce the term. */ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) { - @JvmField protected val morpheme = existingAttribute() + @JvmField protected val morphemeAtt = existingAttribute() @JvmField protected val keywordAtt = addAttribute() @JvmField protected val termAtt = addAttribute() @JvmField @@ -52,25 +52,24 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) { if (!input.incrementToken()) { return false } - val m = morpheme.morpheme ?: return true - var needToSet = consumer.shouldConsume(this) + val m = morphemeAtt.getMorpheme() ?: return true + var term: CharSequence? = null if (!keywordAtt.isKeyword) { - val term = value(m) - if (term != null) { - termAtt.setEmpty().append(term) - needToSet = false - } + term = value(m) } - if (needToSet) { - termAtt.setEmpty().append(m.surface()) + if (term == null) { + term = m.surface() } + termAtt.setEmpty().append(term) + return true } override fun reset() { super.reset() if (!consumer.shouldConsume(this)) { - logger.warn("an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains") + logger.warn( + "an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains") } } diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiPartOfSpeechStopFilter.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiPartOfSpeechStopFilter.kt index f9d8631..a5fa290 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiPartOfSpeechStopFilter.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiPartOfSpeechStopFilter.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Works Applications Co., Ltd. + * Copyright (c) 2017-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ class SudachiPartOfSpeechStopFilter( input: TokenStream?, private val matcher: ReloadAware ) : FilteringTokenFilter(input) { - private val morpheme = addAttribute() + private val morphemeAtt = existingAttribute() override fun reset() { super.reset() @@ -34,6 +34,7 @@ class SudachiPartOfSpeechStopFilter( } override fun accept(): Boolean { - return !matcher.get().test(morpheme.morpheme) + val m = morphemeAtt.getMorpheme() ?: return true + return !matcher.get().test(m) } } diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java index 9f2f0a1..ac668d9 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023 Works Applications Co., Ltd. + * Copyright (c) 2020-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -125,9 +125,7 @@ public final boolean incrementToken() throws IOException { if (m == null) { return true; } - if (consumerAttribute.shouldConsume(this)) { - termAtt.append(m.surface()); - } + termAtt.setEmpty().append(m.surface()); if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) { oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length()); posLengthAtt.setPositionLength(length); @@ -158,9 +156,7 @@ private void setAUnitAttribute(Morpheme morpheme) { offsetAtt.setOffset(aUnitOffset, aUnitOffset + length); aUnitOffset += length; morphemeAtt.setMorpheme(morpheme); - if (consumerAttribute.shouldConsume(this)) { - termAtt.append(morpheme.surface()); - } + termAtt.setEmpty().append(morpheme.surface()); } private void setOOVAttribute() { @@ -172,7 +168,7 @@ private void setOOVAttribute() { posIncAtt.setPositionIncrement(1); } char c = oovChars.next(); - termAtt.append(c); + termAtt.setEmpty().append(c); if (Character.isSurrogate(c) && oovChars.hasNext()) { termAtt.append(oovChars.next()); offsetAtt.setOffset(offset, offset + 2); diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt index 530dec1..ef209ca 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Works Applications Co., Ltd. + * Copyright (c) 2022-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,12 @@ class SudachiTokenizer( override fun incrementToken(): Boolean { clearAttributes() val m = iterator.next() ?: return false - morphemeAtt.morpheme = m + morphemeAtt.setMorpheme(m) posLenAtt.positionLength = 1 posIncAtt.positionIncrement = 1 val baseOffset = iterator.baseOffset offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end())) - if (consumer.shouldConsume(this)) { - termAtt.append(m.surface()) - } + termAtt.setEmpty().append(m.surface()) return true } diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt index fd63e2f..9b494c1 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImpl.kt @@ -56,7 +56,7 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute { } override fun copyTo(target: AttributeImpl?) { - (target as? MorphemeAttributeImpl)?.let { it.morpheme = target.morpheme } + (target as? MorphemeAttributeImpl)?.let { it.setMorpheme(getMorpheme()) } } override fun getMorpheme(): Morpheme? { diff --git a/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt b/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt index 17cbd5c..abf5e1e 100644 --- a/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt +++ b/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Works Applications Co., Ltd. + * Copyright (c) 2023-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ class CustomAnalyzerTest : SearchEngineTestBase { val settings = """ { - "index.analysis": { + "index.analysis": { "analyzer": { "sudachi_basic": { "type": "custom", diff --git a/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomMultiFilterAnalyzerTest.kt b/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomMultiFilterAnalyzerTest.kt new file mode 100644 index 0000000..0df7e24 --- /dev/null +++ b/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomMultiFilterAnalyzerTest.kt @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.elasticsearch.sudachi.index + +import org.junit.Rule +import org.junit.Test + +class CustomMultiFilterAnalyzerTest : SearchEngineTestBase { + @JvmField @Rule var engine = SearchEngineEnv() + + @Test + fun baseform_readingform() { + val settings = + """ + { + "index.analysis": { + "analyzer": { + "sudachi_test": { + "type": "custom", + "tokenizer": "sudachi_tokenizer", + "filter": ["sudachi_baseform", "sudachi_readingform"] + } + }, + "tokenizer": { + "sudachi_tokenizer": { + "type": "sudachi_tokenizer", + "split_mode": "C" + } + } + } + } + """.jsonSettings() + val analyzers = engine.indexAnalyzers(settings) + val analyzer = analyzers.get("sudachi_test") + analyzer.assertTerms("東京に行った", "トウキョウ", "ニ", "イッ", "タ") + } + + @Test + fun stopward_baseform() { + val settings = + """ + { + "index.analysis": { + "analyzer": { + "sudachi_test": { + "type": "custom", + "tokenizer": "sudachi_tokenizer", + "filter": ["stop", "sudachi_baseform"] + } + }, + "tokenizer": { + "sudachi_tokenizer": { + "type": "sudachi_tokenizer", + "split_mode": "C" + } + }, + "filter": { + "stop": { + "type": "sudachi_ja_stop", + "stopwords": ["に", "行く"] + } + } + } + } + """.jsonSettings() + val analyzers = engine.indexAnalyzers(settings) + val analyzer = analyzers.get("sudachi_test") + analyzer.assertTerms("東京に行った", "東京", "行く", "た") + } + + @Test + fun baseform_stopward() { + val settings = + """ + { + "index.analysis": { + "analyzer": { + "sudachi_test": { + "type": "custom", + "tokenizer": "sudachi_tokenizer", + "filter": ["sudachi_baseform", "stop"] + } + }, + "tokenizer": { + "sudachi_tokenizer": { + "type": "sudachi_tokenizer", + "split_mode": "C" + } + }, + "filter": { + "stop": { + "type": "sudachi_ja_stop", + "stopwords": ["に", "行く"] + } + } + } + } + """.jsonSettings() + val analyzers = engine.indexAnalyzers(settings) + val analyzer = analyzers.get("sudachi_test") + analyzer.assertTerms("東京に行った", "東京", "た") + } + + @Test + fun split_baseform() { + val settings = + """ + { + "index.analysis": { + "analyzer": { + "sudachi_test": { + "type": "custom", + "tokenizer": "sudachi_tokenizer", + "filter": ["split_extended", "sudachi_baseform"] + } + }, + "tokenizer": { + "sudachi_tokenizer": { + "type": "sudachi_tokenizer", + "split_mode": "C" + } + }, + "filter": { + "split_extended": { + "type": "sudachi_split", + "mode": "extended" + } + } + } + } + """.jsonSettings() + val analyzers = engine.indexAnalyzers(settings) + val analyzer = analyzers.get("sudachi_test") + analyzer.assertTerms("アマゾンに行った", "アマゾン", "ア", "マ", "ゾ", "ン", "に", "行く", "た") + } + + @Test + fun split_pos() { + val settings = + """ + { + "index.analysis": { + "analyzer": { + "sudachi_test": { + "type": "custom", + "tokenizer": "sudachi_tokenizer", + "filter": ["split_extended", "pos"] + } + }, + "tokenizer": { + "sudachi_tokenizer": { + "type": "sudachi_tokenizer", + "split_mode": "C" + } + }, + "filter": { + "split_extended": { + "type": "sudachi_split", + "mode": "extended" + }, + "pos": { + "type": "sudachi_part_of_speech", + "stoptags": [ + "助詞", + "助動詞", + "補助記号,句点", + "補助記号,読点" + ] + } + } + } + } + """.jsonSettings() + val analyzers = engine.indexAnalyzers(settings) + val analyzer = analyzers.get("sudachi_test") + analyzer.assertTerms("アマゾンに行った", "アマゾン", "ア", "マ", "ゾ", "ン", "行っ") + } +} diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilterTest.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilterTest.kt new file mode 100644 index 0000000..1283e8d --- /dev/null +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilterTest.kt @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.worksap.nlp.lucene.sudachi.ja + +import com.worksap.nlp.lucene.sudachi.aliases.BaseTokenStreamTestCase +import com.worksap.nlp.sudachi.Morpheme +import com.worksap.nlp.test.InMemoryDictionary +import org.apache.lucene.analysis.TokenStream +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory +import org.junit.Test + +class TestMorphemeFieldFilter : BaseTokenStreamTestCase() { + private val dic = InMemoryDictionary() + + @Test + fun defaultValueFun() { + var tokenStream: TokenStream = dic.tokenizer("東京都に行った。") + tokenStream = SurfaceFilter(tokenStream) + assertTokenStreamContents(tokenStream, arrayOf("東京都", "に", "行っ", "た")) + } + + @Test + fun nullValueFun() { + var tokenStream: TokenStream = dic.tokenizer("東京都に行った。") + tokenStream = NullFilter(tokenStream) + assertTokenStreamContents(tokenStream, arrayOf("東京都", "に", "行っ", "た")) + } + + @Test + fun withKeyword() { + val kwFactory = KeywordMarkerFilterFactory(mutableMapOf("pattern" to "東京都")) + var tokenStream: TokenStream = dic.tokenizer("東京都に行った。") + tokenStream = SurfaceFilter(kwFactory.create(tokenStream)) + assertTokenStreamContents(tokenStream, arrayOf("東京都", "に", "行っ", "た")) + } +} + +class SurfaceFilter(input: TokenStream) : MorphemeFieldFilter(input) {} + +class NullFilter(input: TokenStream) : MorphemeFieldFilter(input) { + override fun value(m: Morpheme): String? = null +} diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt index 1247f9c..a630daf 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt @@ -65,6 +65,21 @@ class MorphemeAttributeImplTest { assertNull(morphemeAtt.getMorpheme()) } + @Test + fun copyTo() { + var morphemeAtt1 = MorphemeAttributeImpl() + var morphemeAtt2 = MorphemeAttributeImpl() + val morpheme = getFirstMorpheme("東京都")!! + + morphemeAtt1.setMorpheme(morpheme) + morphemeAtt1.copyTo(morphemeAtt2) + assertEquals(morpheme, morphemeAtt2.getMorpheme()) + + morphemeAtt1.setMorpheme(null) + morphemeAtt1.copyTo(morphemeAtt2) + assertNull(morphemeAtt2.getMorpheme()) + } + @Test fun toXContent() { var morphemeAtt = MorphemeAttributeImpl()