Skip to content

Commit

Permalink
Merge pull request #122 from WorksApplications/feature/issue111
Browse files Browse the repository at this point in the history
Disable MorphemeConsumerAttribute (+bug fix)
  • Loading branch information
mh-northlander authored May 24, 2024
2 parents e4e3012 + 7db5245 commit 28d6e94
Show file tree
Hide file tree
Showing 10 changed files with 290 additions and 33 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ With the `stoptags`, you can filter out the result in any of these forward match
"position": 0
},
{
"token": "美味しい",
"token": "おいしい",
"start_offset": 3,
"end_offset": 7,
"type": "word",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,10 +33,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute
* [org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter] or a custom [TokenFilter] that
* sets the [KeywordAttribute] before this [TokenStream].
*
* Values of [MorphemeAttribute] are used to produce the
* Values of [MorphemeAttribute] are used to produce the term.
*/
abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) {
@JvmField protected val morpheme = existingAttribute<MorphemeAttribute>()
@JvmField protected val morphemeAtt = existingAttribute<MorphemeAttribute>()
@JvmField protected val keywordAtt = addAttribute<KeywordAttribute>()
@JvmField protected val termAtt = addAttribute<CharTermAttribute>()
@JvmField
Expand All @@ -52,25 +52,24 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) {
if (!input.incrementToken()) {
return false
}
val m = morpheme.morpheme ?: return true
var needToSet = consumer.shouldConsume(this)
val m = morphemeAtt.getMorpheme() ?: return true
var term: CharSequence? = null
if (!keywordAtt.isKeyword) {
val term = value(m)
if (term != null) {
termAtt.setEmpty().append(term)
needToSet = false
}
term = value(m)
}
if (needToSet) {
termAtt.setEmpty().append(m.surface())
if (term == null) {
term = m.surface()
}
termAtt.setEmpty().append(term)

return true
}

override fun reset() {
super.reset()
if (!consumer.shouldConsume(this)) {
logger.warn("an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains")
logger.warn(
"an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains")
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2023 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,14 +26,15 @@ class SudachiPartOfSpeechStopFilter(
input: TokenStream?,
private val matcher: ReloadAware<PosMatcher>
) : FilteringTokenFilter(input) {
private val morpheme = addAttribute<MorphemeAttribute>()
private val morphemeAtt = existingAttribute<MorphemeAttribute>()

override fun reset() {
super.reset()
matcher.maybeReload()
}

override fun accept(): Boolean {
return !matcher.get().test(morpheme.morpheme)
val m = morphemeAtt.getMorpheme() ?: return true
return !matcher.get().test(m)
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023 Works Applications Co., Ltd.
* Copyright (c) 2020-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -125,9 +125,7 @@ public final boolean incrementToken() throws IOException {
if (m == null) {
return true;
}
if (consumerAttribute.shouldConsume(this)) {
termAtt.append(m.surface());
}
termAtt.setEmpty().append(m.surface());
if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
posLengthAtt.setPositionLength(length);
Expand Down Expand Up @@ -158,9 +156,7 @@ private void setAUnitAttribute(Morpheme morpheme) {
offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
aUnitOffset += length;
morphemeAtt.setMorpheme(morpheme);
if (consumerAttribute.shouldConsume(this)) {
termAtt.append(morpheme.surface());
}
termAtt.setEmpty().append(morpheme.surface());
}

private void setOOVAttribute() {
Expand All @@ -172,7 +168,7 @@ private void setOOVAttribute() {
posIncAtt.setPositionIncrement(1);
}
char c = oovChars.next();
termAtt.append(c);
termAtt.setEmpty().append(c);
if (Character.isSurrogate(c) && oovChars.hasNext()) {
termAtt.append(oovChars.next());
offsetAtt.setOffset(offset, offset + 2);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -57,14 +57,12 @@ class SudachiTokenizer(
override fun incrementToken(): Boolean {
clearAttributes()
val m = iterator.next() ?: return false
morphemeAtt.morpheme = m
morphemeAtt.setMorpheme(m)
posLenAtt.positionLength = 1
posIncAtt.positionIncrement = 1
val baseOffset = iterator.baseOffset
offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
if (consumer.shouldConsume(this)) {
termAtt.append(m.surface())
}
termAtt.setEmpty().append(m.surface())
return true
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
}

override fun copyTo(target: AttributeImpl?) {
(target as? MorphemeAttributeImpl)?.let { it.morpheme = target.morpheme }
(target as? MorphemeAttributeImpl)?.let { it.setMorpheme(getMorpheme()) }
}

override fun getMorpheme(): Morpheme? {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,7 +27,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_basic": {
"type": "custom",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.elasticsearch.sudachi.index

import org.junit.Rule
import org.junit.Test

/**
 * Tests for custom analyzers that chain the Sudachi tokenizer with MULTIPLE
 * token filters, verifying the exact sequence of terms each chain emits.
 *
 * NOTE(review): several expectations contain "" (empty-string) terms; these
 * appear to correspond to token positions whose filter produced no term text
 * (e.g. tokens without a value for the given filter) — confirm against the
 * filter implementations before relying on that interpretation.
 */
class CustomMultiFilterAnalyzerTest : SearchEngineTestBase {
  // Fresh embedded search-engine environment for every test method.
  @JvmField @Rule var engine = SearchEngineEnv()

  /**
   * Chain: sudachi_baseform -> sudachi_readingform.
   * The reading-form filter runs last, so surviving terms are katakana
   * readings (e.g. トウキョウ for 東京), not the base forms.
   */
  @Test
  fun baseform_readingform() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["sudachi_baseform", "sudachi_readingform"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("東京に行った", "トウキョウ", "", "イッ", "")
  }

  /**
   * Chain: stop (sudachi_ja_stop) -> sudachi_baseform.
   * With the stop filter FIRST it matches pre-baseform terms, so the stopword
   * "行く" does not remove the inflected token (which is only converted to
   * 行く by the later baseform filter), while "に" is removed.
   */
  @Test
  fun stopward_baseform() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["stop", "sudachi_baseform"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "stop": {
                "type": "sudachi_ja_stop",
                "stopwords": ["に", "行く"]
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("東京に行った", "東京", "行く", "")
  }

  /**
   * Chain: sudachi_baseform -> stop (sudachi_ja_stop).
   * Same settings as [stopward_baseform] but with the filter order swapped:
   * the stop filter now sees base forms, so BOTH stopwords ("に", "行く")
   * remove their tokens and only 東京 (plus a trailing empty term) survives.
   */
  @Test
  fun baseform_stopward() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["sudachi_baseform", "stop"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "stop": {
                "type": "sudachi_ja_stop",
                "stopwords": ["に", "行く"]
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("東京に行った", "東京", "")
  }

  /**
   * Chain: sudachi_split (extended mode) -> sudachi_baseform.
   * Extended split emits extra sub-tokens (hence the run of empty terms after
   * アマゾン — presumably its single-character splits; TODO confirm), and the
   * baseform filter still normalizes 行っ to 行く afterwards.
   */
  @Test
  fun split_baseform() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["split_extended", "sudachi_baseform"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "split_extended": {
                "type": "sudachi_split",
                "mode": "extended"
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("アマゾンに行った", "アマゾン", "", "", "", "", "", "行く", "")
  }

  /**
   * Chain: sudachi_split (extended mode) -> sudachi_part_of_speech.
   * The POS filter drops tokens whose part of speech matches the stoptags
   * (particles, auxiliary verbs, punctuation), leaving アマゾン, the extended
   * sub-tokens, and the verb surface 行っ (no baseform filter in this chain).
   */
  @Test
  fun split_pos() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["split_extended", "pos"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "split_extended": {
                "type": "sudachi_split",
                "mode": "extended"
              },
              "pos": {
                "type": "sudachi_part_of_speech",
                "stoptags": [
                  "助詞",
                  "助動詞",
                  "補助記号,句点",
                  "補助記号,読点"
                ]
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("アマゾンに行った", "アマゾン", "", "", "", "", "行っ")
  }
}
Loading

0 comments on commit 28d6e94

Please sign in to comment.