From 3b2163cc47585c38e22ba00c551cda5cb62e6e40 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Thu, 30 May 2024 10:33:25 +0900
Subject: [PATCH 1/2] rm MorphemeConsumerAttribute

---
 .../attributes/MorphemeConsumerAttribute.java | 54 -------------------
 .../lucene/sudachi/ja/MorphemeFieldFilter.kt  | 16 ------
 .../lucene/sudachi/ja/SudachiSplitFilter.java |  3 --
 .../nlp/lucene/sudachi/ja/SudachiTokenizer.kt |  2 -
 .../MorphemeConsumerAttributeImpl.kt          | 48 -----------------
 .../ja/attributes/SudachiAttributeFactory.kt  |  3 +-
 6 files changed, 1 insertion(+), 125 deletions(-)
 delete mode 100644 spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java
 delete mode 100644 src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt
diff --git a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java b/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java
deleted file mode 100644
index d7f1a3b9..00000000
--- a/spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttribute.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2023 Works Applications Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.worksap.nlp.lucene.sudachi.ja.attributes;
-
-import org.apache.lucene.util.Attribute;
-
-/**
- * This attribute tells Sudachi-based TokenStreams not to produce anything into
- * {@link org.apache.lucene.analysis.tokenattributes.CharTermAttribute} if it is
- * not the current consumer. <br>
- * This is performance optimisation and will not change correctness if resetting
- * {@code CharTermAttribute} before writing into it.
- */
-public interface MorphemeConsumerAttribute extends Attribute {
-    /**
-     * Check whether the object should consume the token stream.
-     * 
-     * @param consumer
-     *            object that will try to consume the token stream
-     * @return true if the object is current consumer
-     */
-    default boolean shouldConsume(Object consumer) {
-        return consumer == getCurrentConsumer();
-    }
-
-    /**
-     * Get the current consumer
-     * 
-     * @return instance that is current consumer
-     */
-    Object getCurrentConsumer();
-
-    /**
-     * Set the current consumer for the token stream
-     * 
-     * @param consumer
-     *            new consumer instance
-     */
-    void setCurrentConsumer(Object consumer);
-}
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt
index 56bfa0cc..6ff4825b 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/MorphemeFieldFilter.kt
@@ -17,9 +17,7 @@
 package com.worksap.nlp.lucene.sudachi.ja
 
 import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute
-import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeConsumerAttribute
 import com.worksap.nlp.sudachi.Morpheme
-import org.apache.logging.log4j.LogManager
 import org.apache.lucene.analysis.TokenFilter
 import org.apache.lucene.analysis.TokenStream
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
@@ -39,8 +37,6 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) {
   @JvmField protected val morphemeAtt = existingAttribute<MorphemeAttribute>()
   @JvmField protected val keywordAtt = addAttribute<KeywordAttribute>()
   @JvmField protected val termAtt = addAttribute<CharTermAttribute>()
-  @JvmField
-  protected val consumer = addAttribute<MorphemeConsumerAttribute> { it.currentConsumer = this }
 
   /**
    * Override this method to customize returned value. This method will not be called if
@@ -64,16 +60,4 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) {
 
     return true
   }
-
-  override fun reset() {
-    super.reset()
-    if (!consumer.shouldConsume(this)) {
-      logger.warn(
-          "an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains")
-    }
-  }
-
-  companion object {
-    private val logger = LogManager.getLogger(MorphemeFieldFilter::class.java)
-  }
 }
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
index ac668d95..361fd66e 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -86,7 +86,6 @@ public int offset() {
     private final PositionIncrementAttribute posIncAtt;
     private final PositionLengthAttribute posLengthAtt;
     private final MorphemeAttribute morphemeAtt;
-    private final MorphemeConsumerAttribute consumerAttribute;
     private ListIterator<Morpheme> aUnitIterator;
     private final OovChars oovChars = new OovChars();
 
@@ -102,8 +101,6 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli
         posIncAtt = addAttribute(PositionIncrementAttribute.class);
         posLengthAtt = addAttribute(PositionLengthAttribute.class);
         morphemeAtt = addAttribute(MorphemeAttribute.class);
-        consumerAttribute = addAttribute(MorphemeConsumerAttribute.class);
-        consumerAttribute.setCurrentConsumer(this);
     }
 
     @Override
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
index ef209caa..4bd8a44e 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt
@@ -17,7 +17,6 @@
 package com.worksap.nlp.lucene.sudachi.ja
 
 import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeAttribute
-import com.worksap.nlp.lucene.sudachi.ja.attributes.MorphemeConsumerAttribute
 import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttribute
 import com.worksap.nlp.lucene.sudachi.ja.attributes.SudachiAttributeFactory
 import org.apache.lucene.analysis.Tokenizer
@@ -37,7 +36,6 @@ class SudachiTokenizer(
   private val offsetAtt = addAttribute<OffsetAttribute>()
   private val posIncAtt = addAttribute<PositionIncrementAttribute>()
   private val posLenAtt = addAttribute<PositionLengthAttribute>()
-  private val consumer = addAttribute<MorphemeConsumerAttribute> { it.currentConsumer = this }
 
   init {
     addAttribute<SudachiAttribute> { it.dictionary = tokenizer.dictionary }
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt
deleted file mode 100644
index 03ca4d99..00000000
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeConsumerAttributeImpl.kt
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Works Applications Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.worksap.nlp.lucene.sudachi.ja.attributes
-
-import com.worksap.nlp.lucene.sudachi.ja.reflect
-import org.apache.lucene.util.AttributeImpl
-import org.apache.lucene.util.AttributeReflector
-
-/**
- * Sudachi-based TokenStream chain uses to communicate which component produces
- * [org.apache.lucene.analysis.tokenattributes.CharTermAttribute]
- *
- * This is not a token-based attribute, so impl's clear/copyTo do nothing
- */
-class MorphemeConsumerAttributeImpl : AttributeImpl(), MorphemeConsumerAttribute {
-  private var instance: Any = Companion
-  // does nothing
-  override fun clear() {}
-
-  override fun reflectWith(reflector: AttributeReflector) {
-    reflector.reflect<MorphemeConsumerAttribute>("instance", instance.javaClass.name)
-  }
-
-  override fun copyTo(target: AttributeImpl?) {}
-
-  override fun getCurrentConsumer(): Any = instance
-
-  override fun setCurrentConsumer(consumer: Any?) {
-    instance = consumer!!
-  }
-
-  // need something to use as initial value of [instance] variable
-  private companion object
-}
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt
index 0ae1c19e..f21a2263 100644
--- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt
+++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/SudachiAttributeFactory.kt
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,6 @@ class SudachiAttributeFactory(private val parent: AttributeFactory) : AttributeF
   override fun createAttributeInstance(attClass: Class<out Attribute>?): AttributeImpl {
     return when (attClass) {
       MorphemeAttribute::class.java -> MorphemeAttributeImpl()
-      MorphemeConsumerAttribute::class.java -> MorphemeConsumerAttributeImpl()
       SudachiAttribute::class.java -> SudachiAttributeImpl()
       else -> parent.createAttributeInstance(attClass)
     }

From 7e67b2c94bc0067565ded4c5dfd7ed84f2191670 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Thu, 30 May 2024 14:10:45 +0900
Subject: [PATCH 2/2] add note about overriding behavior of filters

---
 README.md | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 94194b5f..3986b007 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ If you want to update Sudachi that is included in a plugin you have installed, d
 
 # Analyzer
 
-An analyzer named "sudachi" is provided.
+An analyzer `sudachi` is provided.
 This is equivalent to the following custom analyzer.
 
 ```json
@@ -92,6 +92,8 @@ See following sections for the detail of the tokenizer and each filters.
 
 # Tokenizer
 
+The `sudachi_tokenizer` tokenizer tokenizes input text using Sudachi.
+
 - split_mode: Select splitting mode of Sudachi. (A, B, C) (string, default: C)
   - C: Extracts named entities
       - Ex) 選挙管理委員会
@@ -168,7 +170,7 @@ dictionary settings
 
 ## sudachi\_split
 
-This filter works like `mode` of kuromoji.
+The `sudachi_split` token filter works like `mode` of kuromoji.
 
 - mode
   - "search": Additional segmentation useful for search. (Use C and A mode)
@@ -258,7 +260,7 @@ Which responds with:
 
 ## sudachi\_part\_of\_speech
 
-The sudachi\_part\_of\_speech token filter removes tokens that match a set of part-of-speech tags. It accepts the following setting:
+The `sudachi_part_of_speech` token filter removes tokens that match a set of part-of-speech tags. It accepts the following setting:
 
 The `stopatgs` is an array of part-of-speech and/or inflection tags that should be removed. It defaults to the stoptags.txt file embedded in the lucene-analysis-sudachi.jar.
 
@@ -348,7 +350,7 @@ Which responds with:
 
 ## sudachi\_ja\_stop
 
-The sudachi\_ja\_stop token filter filters out Japanese stopwords (_japanese_), and any other custom stopwords specified by the user. This filter only supports the predefined _japanese_ stopwords list. If you want to use a different predefined list, then use the stop token filter instead.
+The `sudachi_ja_stop` token filter filters out Japanese stopwords (_japanese_), and any other custom stopwords specified by the user. This filter only supports the predefined _japanese_ stopwords list. If you want to use a different predefined list, then use the stop token filter instead.
 
 ### PUT sudachi_sample
 
@@ -426,7 +428,9 @@ Which responds with:
 
 ## sudachi\_baseform
 
-The sudachi\_baseform token filter replaces terms with their SudachiBaseFormAttribute. This acts as a lemmatizer for verbs and adjectives.
+The `sudachi_baseform` token filter replaces terms with their Sudachi dictionary form. This acts as a lemmatizer for verbs and adjectives.
+
+The terms produced by this filter will be overridden by a `sudachi_split`, `sudachi_normalizedform` or `sudachi_readingform` token filter placed after it in the filter chain.
 
 ### PUT sudachi_sample
 ```json
@@ -479,9 +483,10 @@ Which responds with:
 
 ## sudachi\_normalizedform
 
-The sudachi\_normalizedform token filter replaces terms with their SudachiNormalizedFormAttribute. This acts as a normalizer for spelling variants.
+The `sudachi_normalizedform` token filter replaces terms with their Sudachi normalized form. This acts as a normalizer for spelling variants.
+This filter lemmatizes verbs and adjectives too, so you don't need to use the `sudachi_baseform` filter together with this filter.
 
-This filter lemmatizes verbs and adjectives too. You don't need to use sudachi\_baseform filter with this filter.
+The terms produced by this filter will be overridden by a `sudachi_split`, `sudachi_baseform` or `sudachi_readingform` token filter placed after it in the filter chain.
 
 ### PUT sudachi_sample
 
@@ -535,14 +540,14 @@ Which responds with:
 
 ## sudachi\_readingform
 
-Convert to katakana or romaji reading.
-The sudachi\_readingform token filter replaces the token with its reading form in either katakana or romaji. It accepts the following setting:
+The `sudachi_readingform` token filter replaces the terms with their reading form in either katakana or romaji.
 
-### use_romaji
+The terms produced by this filter will be overridden by a `sudachi_split`, `sudachi_baseform` or `sudachi_normalizedform` token filter placed after it in the filter chain.
 
-Whether romaji reading form should be output instead of katakana. Defaults to false.
+Accepts the following setting:
 
-When using the pre-defined sudachi_readingform filter, use_romaji is set to true. The default when defining a custom sudachi_readingform, however, is false. The only reason to use the custom form is if you need the katakana reading form:
+- use_romaji
+  - Whether romaji reading form should be output instead of katakana. Defaults to false.
 
 ### PUT sudachi_sample