Skip to content

Commit

Permalink
Merge pull request #122 from WorksApplications/feature/issue111
Browse files Browse the repository at this point in the history
Disable MorphemeConsumerAttribute (+bug fix)
  • Loading branch information
mh-northlander authored May 24, 2024
2 parents e4e3012 + 7db5245 commit 28d6e94
Show file tree
Hide file tree
Showing 10 changed files with 290 additions and 33 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ With the `stoptags`, you can filter out the result in any of these forward match
"position": 0
},
{
"token": "美味しい",
"token": "おいしい",
"start_offset": 3,
"end_offset": 7,
"type": "word",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,10 +33,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute
* [org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter] or a custom [TokenFilter] that
* sets the [KeywordAttribute] before this [TokenStream].
*
* Values of [MorphemeAttribute] are used to produce the
* Values of [MorphemeAttribute] are used to produce the term.
*/
abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) {
@JvmField protected val morpheme = existingAttribute<MorphemeAttribute>()
@JvmField protected val morphemeAtt = existingAttribute<MorphemeAttribute>()
@JvmField protected val keywordAtt = addAttribute<KeywordAttribute>()
@JvmField protected val termAtt = addAttribute<CharTermAttribute>()
@JvmField
Expand All @@ -52,25 +52,24 @@ abstract class MorphemeFieldFilter(input: TokenStream) : TokenFilter(input) {
if (!input.incrementToken()) {
return false
}
val m = morpheme.morpheme ?: return true
var needToSet = consumer.shouldConsume(this)
val m = morphemeAtt.getMorpheme() ?: return true
var term: CharSequence? = null
if (!keywordAtt.isKeyword) {
val term = value(m)
if (term != null) {
termAtt.setEmpty().append(term)
needToSet = false
}
term = value(m)
}
if (needToSet) {
termAtt.setEmpty().append(m.surface())
if (term == null) {
term = m.surface()
}
termAtt.setEmpty().append(term)

return true
}

override fun reset() {
super.reset()
if (!consumer.shouldConsume(this)) {
logger.warn("an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains")
logger.warn(
"an instance of ${javaClass.name} is a no-op, it is not a filter which produces terms in one of your filter chains")
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2023 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,14 +26,15 @@ class SudachiPartOfSpeechStopFilter(
input: TokenStream?,
private val matcher: ReloadAware<PosMatcher>
) : FilteringTokenFilter(input) {
private val morpheme = addAttribute<MorphemeAttribute>()
private val morphemeAtt = existingAttribute<MorphemeAttribute>()

override fun reset() {
super.reset()
matcher.maybeReload()
}

override fun accept(): Boolean {
return !matcher.get().test(morpheme.morpheme)
val m = morphemeAtt.getMorpheme() ?: return true
return !matcher.get().test(m)
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023 Works Applications Co., Ltd.
* Copyright (c) 2020-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -125,9 +125,7 @@ public final boolean incrementToken() throws IOException {
if (m == null) {
return true;
}
if (consumerAttribute.shouldConsume(this)) {
termAtt.append(m.surface());
}
termAtt.setEmpty().append(m.surface());
if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
posLengthAtt.setPositionLength(length);
Expand Down Expand Up @@ -158,9 +156,7 @@ private void setAUnitAttribute(Morpheme morpheme) {
offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
aUnitOffset += length;
morphemeAtt.setMorpheme(morpheme);
if (consumerAttribute.shouldConsume(this)) {
termAtt.append(morpheme.surface());
}
termAtt.setEmpty().append(morpheme.surface());
}

private void setOOVAttribute() {
Expand All @@ -172,7 +168,7 @@ private void setOOVAttribute() {
posIncAtt.setPositionIncrement(1);
}
char c = oovChars.next();
termAtt.append(c);
termAtt.setEmpty().append(c);
if (Character.isSurrogate(c) && oovChars.hasNext()) {
termAtt.append(oovChars.next());
offsetAtt.setOffset(offset, offset + 2);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -57,14 +57,12 @@ class SudachiTokenizer(
override fun incrementToken(): Boolean {
clearAttributes()
val m = iterator.next() ?: return false
morphemeAtt.morpheme = m
morphemeAtt.setMorpheme(m)
posLenAtt.positionLength = 1
posIncAtt.positionIncrement = 1
val baseOffset = iterator.baseOffset
offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
if (consumer.shouldConsume(this)) {
termAtt.append(m.surface())
}
termAtt.setEmpty().append(m.surface())
return true
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
}

override fun copyTo(target: AttributeImpl?) {
(target as? MorphemeAttributeImpl)?.let { it.morpheme = target.morpheme }
(target as? MorphemeAttributeImpl)?.let { it.setMorpheme(getMorpheme()) }
}

override fun getMorpheme(): Morpheme? {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,7 +27,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_basic": {
"type": "custom",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.elasticsearch.sudachi.index

import org.junit.Rule
import org.junit.Test

/**
 * Tests for custom analyzers that chain the Sudachi tokenizer with MULTIPLE
 * token filters, verifying the exact sequence of terms each chain emits.
 *
 * NOTE(review): several expectations contain "" (empty-string) terms; these
 * appear to correspond to token positions whose filter produced no term text
 * (e.g. tokens without a value for the given filter) — confirm against the
 * filter implementations before relying on that interpretation.
 */
class CustomMultiFilterAnalyzerTest : SearchEngineTestBase {
  // Fresh embedded search-engine environment for every test method.
  @JvmField @Rule var engine = SearchEngineEnv()

  /**
   * Chain: sudachi_baseform -> sudachi_readingform.
   * The reading-form filter runs last, so surviving terms are katakana
   * readings (e.g. トウキョウ for 東京), not the base forms.
   */
  @Test
  fun baseform_readingform() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["sudachi_baseform", "sudachi_readingform"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("東京に行った", "トウキョウ", "", "イッ", "")
  }

  /**
   * Chain: stop (sudachi_ja_stop) -> sudachi_baseform.
   * With the stop filter FIRST it matches pre-baseform terms, so the stopword
   * "行く" does not remove the inflected token (which is only converted to
   * 行く by the later baseform filter), while "に" is removed.
   */
  @Test
  fun stopward_baseform() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["stop", "sudachi_baseform"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "stop": {
                "type": "sudachi_ja_stop",
                "stopwords": ["に", "行く"]
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("東京に行った", "東京", "行く", "")
  }

  /**
   * Chain: sudachi_baseform -> stop (sudachi_ja_stop).
   * Same settings as [stopward_baseform] but with the filter order swapped:
   * the stop filter now sees base forms, so BOTH stopwords ("に", "行く")
   * remove their tokens and only 東京 (plus a trailing empty term) survives.
   */
  @Test
  fun baseform_stopward() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["sudachi_baseform", "stop"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "stop": {
                "type": "sudachi_ja_stop",
                "stopwords": ["に", "行く"]
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("東京に行った", "東京", "")
  }

  /**
   * Chain: sudachi_split (extended mode) -> sudachi_baseform.
   * Extended split emits extra sub-tokens (hence the run of empty terms after
   * アマゾン — presumably its single-character splits; TODO confirm), and the
   * baseform filter still normalizes 行っ to 行く afterwards.
   */
  @Test
  fun split_baseform() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["split_extended", "sudachi_baseform"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "split_extended": {
                "type": "sudachi_split",
                "mode": "extended"
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("アマゾンに行った", "アマゾン", "", "", "", "", "", "行く", "")
  }

  /**
   * Chain: sudachi_split (extended mode) -> sudachi_part_of_speech.
   * The POS filter drops tokens whose part of speech matches the stoptags
   * (particles, auxiliary verbs, punctuation), leaving アマゾン, the extended
   * sub-tokens, and the verb surface 行っ (no baseform filter in this chain).
   */
  @Test
  fun split_pos() {
    val settings =
        """
        {
          "index.analysis": {
            "analyzer": {
              "sudachi_test": {
                "type": "custom",
                "tokenizer": "sudachi_tokenizer",
                "filter": ["split_extended", "pos"]
              }
            },
            "tokenizer": {
              "sudachi_tokenizer": {
                "type": "sudachi_tokenizer",
                "split_mode": "C"
              }
            },
            "filter": {
              "split_extended": {
                "type": "sudachi_split",
                "mode": "extended"
              },
              "pos": {
                "type": "sudachi_part_of_speech",
                "stoptags": [
                  "助詞",
                  "助動詞",
                  "補助記号,句点",
                  "補助記号,読点"
                ]
              }
            }
          }
        }
        """.jsonSettings()
    val analyzers = engine.indexAnalyzers(settings)
    val analyzer = analyzers.get("sudachi_test")
    analyzer.assertTerms("アマゾンに行った", "アマゾン", "", "", "", "", "行っ")
  }
}
Loading

0 comments on commit 28d6e94

Please sign in to comment.