From 3b2163cc47585c38e22ba00c551cda5cb62e6e40 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 30 May 2024 10:33:25 +0900 Subject: [PATCH 1/2] rm MorphemeConsumerAttribute --- .../attributes/MorphemeConsumerAttribute.java | 54 ------------------- .../lucene/sudachi/ja/MorphemeFieldFilter.kt | 16 ------ .../lucene/sudachi/ja/SudachiSplitFilter.java | 3 -- .../nlp/lucene/sudachi/ja/SudachiTokenizer.kt | 2 - .../MorphemeConsumerAttributeImpl.kt | 48 ----------------- .../ja/attributes/SudachiAttributeFactory.kt | 3 +- 6 files changed, 1 insertion(+), 125 deletions(-) delete mode 100644 spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java delete mode 100644 src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt diff --git a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java b/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java deleted file mode 100644 index d7f1a3b9..00000000 --- a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2023 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.lucene.sudachi.ja.attributes; - -import org.apache.lucene.util.Attribute; - -/** - * This attribute tells Sudachi-based TokenStreams not to produce anything into - * {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute} if it is - * not the current consumer.
- * This is performance optimisation and will not change correctness if resetting - * {@code CharTermAttribute} before writing into it. - */ -public interface MorphemeConsumerAttribute extends Attribute { - /** - * Check whether the object should consume the token stream. - * - * @param consumer - * object that will try to consume the token stream - * @return true if the object is current consumer - */ - default boolean shouldConsume(Object consumer) { - return consumer == getCurrentConsumer(); - } - - /** - * Get the current consumer - * - * @return instance that is current consumer - */ - Object getCurrentConsumer(); - - /** - * Set the current consumer for the token stream - * - * @param consumer - * new consumer instance - */ - void setCurrentConsumer(Object consumer); -} diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt index 56bfa0cc..6ff4825b 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt @@ -17,9 +17,7 @@ package com.worksap.nlp.lucene.sudachi.ja import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute -import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeConsumerAttribute import com.worksap.nlp.sudachi.Morpheme -import org.apache.logging.log4j.LogManager import org.apache.lucene.analysis.TokenFilter import org.apache.lucene.analysis.TokenStream import org.apache.lucene.analysis.tokenattributes.CharTermAttribute @@ -39,8 +37,6 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) { @JvmField protected val morphemeAtt = existingAttribute() @JvmField protected val keywordAtt = addAttribute() @JvmField protected val termAtt = addAttribute() - @JvmField - protected val consumer = addAttribute { it.currentConsumer = this } /** * Override this method to customize returned value. 
This method will not be called if @@ -64,16 +60,4 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) { return true } - - override fun reset() { - super.reset() - if (!consumer.shouldConsume(this)) { - logger.warn( - "an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains") - } - } - - companion object { - private val logger = LogManager.getLogger(MorphemeFieldFilter::class.java) - } } diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java index ac668d95..361fd66e 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java @@ -86,7 +86,6 @@ public int offset() { private final PositionIncrementAttribute posIncAtt; private final PositionLengthAttribute posLengthAtt; private final MorphemeAttribute morphemeAtt; - private final MorphemeConsumerAttribute consumerAttribute; private ListIterator aUnitIterator; private final OovChars oovChars = new OovChars(); @@ -102,8 +101,6 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli posIncAtt = addAttribute(PositionIncrementAttribute.class); posLengthAtt = addAttribute(PositionLengthAttribute.class); morphemeAtt = addAttribute(MorphemeAttribute.class); - consumerAttribute = addAttribute(MorphemeConsumerAttribute.class); - consumerAttribute.setCurrentConsumer(this); } @Override diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt index ef209caa..4bd8a44e 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt @@ -17,7 +17,6 @@ package com.worksap.nlp.lucene.sudachi.ja import 
com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute -import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeConsumerAttribute import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory import org.apache.lucene.analysis.Tokenizer @@ -37,7 +36,6 @@ class SudachiTokenizer( private val offsetAtt = addAttribute() private val posIncAtt = addAttribute() private val posLenAtt = addAttribute() - private val consumer = addAttribute { it.currentConsumer = this } init { addAttribute { it.dictionary = tokenizer.dictionary } diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt deleted file mode 100644 index 03ca4d99..00000000 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2022-2023 Works Applications Co., Ltd. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.worksap.nlp.lucene.sudachi.ja.attributes - -import com.worksap.nlp.lucene.sudachi.ja.reflect -import org.apache.lucene.util.AttributeImpl -import org.apache.lucene.util.AttributeReflector - -/** - * Sudachi-based TokenStream chain uses to communicate which component produces - * [org.apache.lucene.analysis.tokenattributes.CharTermAttribute] - * - * This is not a token-based attribute, so impl's clear/copyTo do nothing - */ -class MorphemeConsumerAttributeImpl : AttributeImpl(), MorphemeConsumerAttribute { - private var instance: Any = Companion - // does nothing - override fun clear() {} - - override fun reflectWith(reflector: AttributeReflector) { - reflector.reflect("instance", instance.javaClass.name) - } - - override fun copyTo(target: AttributeImpl?) {} - - override fun getCurrentConsumer(): Any = instance - - override fun setCurrentConsumer(consumer: Any?) { - instance = consumer!! - } - - // need something to use as initial value of [instance] variable - private companion object -} diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt index 0ae1c19e..f21a2263 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Works Applications Co., Ltd. + * Copyright (c) 2023-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,7 +25,6 @@ class SudachiAttributeFactory(private val parent: AttributeFactory) : AttributeF override fun createAttributeInstance(attClass: Class?): AttributeImpl { return when (attClass) { MorphemeAttribute::class.java -> MorphemeAttributeImpl() - MorphemeConsumerAttribute::class.java -> MorphemeConsumerAttributeImpl() SudachiAttribute::class.java -> SudachiAttributeImpl() else -> parent.createAttributeInstance(attClass) } From 7e67b2c94bc0067565ded4c5dfd7ed84f2191670 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Thu, 30 May 2024 14:10:45 +0900 Subject: [PATCH 2/2] add note about overriding behavior of filters --- README.md | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 94194b5f..3986b007 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ If you want to update Sudachi that is included in a plugin you have installed, d # Analyzer -An analyzer named "sudachi" is provided. +An analyzer `sudachi` is provided. This is equivalent to the following custom analyzer. ```json @@ -92,6 +92,8 @@ See following sections for the detail of the tokenizer and each filters. # Tokenizer +The `sudachi_tokenizer` tokenizer tokenizes input texts using Sudachi. + - split_mode: Select splitting mode of Sudachi. (A, B, C) (string, default: C) - C: Extracts named entities - Ex) 選挙管理委員会 @@ -168,7 +170,7 @@ dictionary settings ## sudachi\_split -This filter works like `mode` of kuromoji. +The `sudachi_split` token filter works like `mode` of kuromoji. - mode - "search": Additional segmentation useful for search. (Use C and A mode) @@ -258,7 +260,7 @@ Which responds with: ## sudachi\_part\_of\_speech -The sudachi\_part\_of\_speech token filter removes tokens that match a set of part-of-speech tags. It accepts the following setting: +The `sudachi_part_of_speech` token filter removes tokens that match a set of part-of-speech tags. 
It accepts the following setting: The `stopatgs` is an array of part-of-speech and/or inflection tags that should be removed. It defaults to the stoptags.txt file embedded in the lucene-analysis-sudachi.jar. @@ -348,7 +350,7 @@ Which responds with: ## sudachi\_ja\_stop -The sudachi\_ja\_stop token filter filters out Japanese stopwords (_japanese_), and any other custom stopwords specified by the user. This filter only supports the predefined _japanese_ stopwords list. If you want to use a different predefined list, then use the stop token filter instead. +The `sudachi_ja_stop` token filter filters out Japanese stopwords (_japanese_), and any other custom stopwords specified by the user. This filter only supports the predefined _japanese_ stopwords list. If you want to use a different predefined list, then use the stop token filter instead. ### PUT sudachi_sample @@ -426,7 +428,9 @@ Which responds with: ## sudachi\_baseform -The sudachi\_baseform token filter replaces terms with their SudachiBaseFormAttribute. This acts as a lemmatizer for verbs and adjectives. +The `sudachi_baseform` token filter replaces terms with their Sudachi dictionary form. This acts as a lemmatizer for verbs and adjectives. + +The terms produced by this filter will be overridden by `sudachi_split`, `sudachi_normalizedform` or `sudachi_readingform` token filters. ### PUT sudachi_sample ```json @@ -479,9 +483,10 @@ Which responds with: ## sudachi\_normalizedform -The sudachi\_normalizedform token filter replaces terms with their SudachiNormalizedFormAttribute. This acts as a normalizer for spelling variants. +The `sudachi_normalizedform` token filter replaces terms with their Sudachi normalized form. This acts as a normalizer for spelling variants. +This filter lemmatizes verbs and adjectives too. You don't need to use the `sudachi_baseform` filter with this filter. -This filter lemmatizes verbs and adjectives too. You don't need to use sudachi\_baseform filter with this filter. 
+The terms produced by this filter will be overridden by `sudachi_split`, `sudachi_baseform` or `sudachi_readingform` token filters. ### PUT sudachi_sample @@ -535,14 +540,14 @@ Which responds with: ## sudachi\_readingform -Convert to katakana or romaji reading. -The sudachi\_readingform token filter replaces the token with its reading form in either katakana or romaji. It accepts the following setting: +The `sudachi_readingform` token filter replaces terms with their reading form in either katakana or romaji. -### use_romaji +The terms produced by this filter will be overridden by `sudachi_split`, `sudachi_baseform` or `sudachi_normalizedform` token filters. -Whether romaji reading form should be output instead of katakana. Defaults to false. +It accepts the following setting: -When using the pre-defined sudachi_readingform filter, use_romaji is set to true. The default when defining a custom sudachi_readingform, however, is false. The only reason to use the custom form is if you need the katakana reading form: +- use_romaji + - Whether romaji reading form should be output instead of katakana. Defaults to false. ### PUT sudachi_sample