Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

explain morpheme detail #121

Merged
merged 4 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import org.jetbrains.kotlin.gradle.dsl.JvmTarget
plugins {
id 'java-library'
id 'org.jetbrains.kotlin.jvm' version '1.8.0'
id "org.jetbrains.kotlin.plugin.serialization" version "1.8.0"
id 'com.diffplug.spotless' version '6.16.0'
id 'org.sonarqube' version '4.0.0.2929'
id("org.jetbrains.kotlinx.kover") version "0.7.0"
Expand Down Expand Up @@ -44,6 +45,7 @@ dependencies {
testImplementation('org.jetbrains.kotlin:kotlin-test-junit') {
exclude(group: 'org.hamcrest')
}
testImplementation('org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.3')
kover(project(':integration'))
kover(project(':testlib'))
}
Expand Down
7 changes: 5 additions & 2 deletions buildSrc/src/main/groovy/com/worksap/nlp/tools/engines.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ enum EsSupport implements EngineSupport {

enum OsSupport implements EngineSupport {
Os20("os-2.00"),
Os210("os-2.10")
Os27("os-2.07"),
Os210("os-2.10"),

String tag

Expand All @@ -59,8 +60,10 @@ enum OsSupport implements EngineSupport {


static OsSupport supportVersion(Version version) {
if (version.ge(2, 0) && version.lt(2, 10)) {
if (version.ge(2, 0) && version.lt(2, 7)) {
return Os20
} else if (version.ge(2, 7) && version.lt(2, 10)) {
return Os27
} else if (version.ge(2, 10)) {
return Os210
}
Expand Down
25 changes: 25 additions & 0 deletions src/main/ext/es-7.15-ge/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Aliases for the XContent types located under org.elasticsearch.xcontent, exposed from the
// com.worksap.nlp.lucene.aliases package so callers can import them under one fixed name.
// NOTE(review): presumably this variant is selected for Elasticsearch >= 7.15 (going by the
// es-7.15-ge directory name) — confirm against the build configuration.
typealias ToXContent = org.elasticsearch.xcontent.ToXContent

typealias ToXContentParams = org.elasticsearch.xcontent.ToXContent.Params

typealias XContentBuilder = org.elasticsearch.xcontent.XContentBuilder
25 changes: 25 additions & 0 deletions src/main/ext/es-7.15-lt/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Aliases for the XContent types located under org.elasticsearch.common.xcontent, exposed from
// the com.worksap.nlp.lucene.aliases package so callers can import them under one fixed name.
// NOTE(review): presumably this variant is selected for Elasticsearch < 7.15 (going by the
// es-7.15-lt directory name) — confirm against the build configuration.
typealias ToXContent = org.elasticsearch.common.xcontent.ToXContent

typealias ToXContentParams = org.elasticsearch.common.xcontent.ToXContent.Params

typealias XContentBuilder = org.elasticsearch.common.xcontent.XContentBuilder
25 changes: 25 additions & 0 deletions src/main/ext/os-2.07-ge/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Aliases for the XContent types located under org.opensearch.core.xcontent, exposed from the
// com.worksap.nlp.lucene.aliases package so callers can import them under one fixed name.
// NOTE(review): presumably this variant is selected for OpenSearch >= 2.7 (going by the
// os-2.07-ge directory name) — confirm against the build configuration.
typealias ToXContent = org.opensearch.core.xcontent.ToXContent

typealias ToXContentParams = org.opensearch.core.xcontent.ToXContent.Params

typealias XContentBuilder = org.opensearch.core.xcontent.XContentBuilder
25 changes: 25 additions & 0 deletions src/main/ext/os-2.07-lt/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Aliases for the XContent types located under org.opensearch.common.xcontent, exposed from the
// com.worksap.nlp.lucene.aliases package so callers can import them under one fixed name.
// NOTE(review): presumably this variant is selected for OpenSearch < 2.7 (going by the
// os-2.07-lt directory name) — confirm against the build configuration.
typealias ToXContent = org.opensearch.common.xcontent.ToXContent

typealias ToXContentParams = org.opensearch.common.xcontent.ToXContent.Params

typealias XContentBuilder = org.opensearch.common.xcontent.XContentBuilder
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,13 +16,36 @@

package com.worksap.nlp.lucene.sudachi.ja.attributes

import com.worksap.nlp.lucene.aliases.ToXContent
import com.worksap.nlp.lucene.aliases.ToXContentParams
import com.worksap.nlp.lucene.aliases.XContentBuilder
import com.worksap.nlp.lucene.sudachi.ja.reflect
import com.worksap.nlp.sudachi.Morpheme
import org.apache.lucene.util.AttributeImpl
import org.apache.lucene.util.AttributeReflector

class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
private var morpheme: Morpheme? = null
private var morpheme: MorphemeWrapper? = null

/**
 * Holds a [Morpheme] and implements [ToXContent] for it, so the morpheme can be written out
 * through an [XContentBuilder].
 */
private class MorphemeWrapper(private val morpheme: Morpheme) : ToXContent {
  /** Returns the wrapped [Morpheme] unchanged. */
  fun unwrap(): Morpheme = morpheme

  /**
   * Writes the morpheme to [builder] as one map value with the keys `surface`, `dictionaryForm`,
   * `normalizedForm`, `readingForm` and `partOfSpeech`.
   *
   * @param builder the builder that receives the value
   * @param params not used by this implementation
   * @return the same [builder] instance that was passed in
   */
  override fun toXContent(builder: XContentBuilder, params: ToXContentParams): XContentBuilder =
      builder.also {
        it.value(
            mapOf(
                "surface" to morpheme.surface(),
                "dictionaryForm" to morpheme.dictionaryForm(),
                "normalizedForm" to morpheme.normalizedForm(),
                "readingForm" to morpheme.readingForm(),
                "partOfSpeech" to morpheme.partOfSpeech(),
            ))
      }
}

override fun clear() {
morpheme = null
Expand All @@ -37,10 +60,10 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
}

override fun getMorpheme(): Morpheme? {
return morpheme
return morpheme?.let { m -> m.unwrap() }
}

override fun setMorpheme(morpheme: Morpheme?) {
this.morpheme = morpheme
this.morpheme = morpheme?.let { m -> MorphemeWrapper(m) }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.lucene.sudachi.ja.attributes

import com.worksap.nlp.lucene.aliases.ToXContent
import com.worksap.nlp.lucene.aliases.XContentBuilder
import com.worksap.nlp.search.aliases.XContentType
import com.worksap.nlp.sudachi.Config
import com.worksap.nlp.sudachi.DictionaryFactory
import com.worksap.nlp.sudachi.Morpheme
import com.worksap.nlp.test.TestDictionary
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertNotNull
import kotlin.test.assertNull
import kotlin.test.assertTrue
import kotlinx.serialization.Serializable
import kotlinx.serialization.json.Json
import org.junit.Before
import org.junit.Rule

/**
 * Tests for [MorphemeAttributeImpl]: storing and clearing a morpheme, and writing the attribute
 * out through an [XContentBuilder].
 */
class MorphemeAttributeImplTest {
  @JvmField @Rule var testDic = TestDictionary("system")

  private lateinit var config: Config

  /** Loads `config/sudachi/sudachi.json` from the test dictionary root before each test. */
  @Before
  fun setup() {
    val sudachiDir = testDic.root.toPath().resolve("config/sudachi")
    config = Config.fromFile(sudachiDir.resolve("sudachi.json"))
  }

  /**
   * Tokenizes [text] with a dictionary created from [config].
   *
   * @return the first morpheme, or `null` when tokenization yields nothing
   */
  fun getFirstMorpheme(text: String): Morpheme? {
    val tokenizer = DictionaryFactory().create(config).create()
    return tokenizer.tokenize(text).firstOrNull()
  }

  /** A morpheme that was set is returned by the getter; setting `null` clears it again. */
  @Test
  fun setMorpheme() {
    val attribute = MorphemeAttributeImpl()
    assertNull(attribute.getMorpheme())

    val first = getFirstMorpheme("東京都")!!
    attribute.setMorpheme(first)
    assertEquals(first, attribute.getMorpheme())

    attribute.setMorpheme(null)
    assertNull(attribute.getMorpheme())
  }

  /**
   * The reflected attribute value is a [ToXContent]; once written as JSON and decoded again, its
   * fields equal those of the source morpheme.
   */
  @Test
  fun toXContent() {
    val attribute = MorphemeAttributeImpl()
    val expected = getFirstMorpheme("東京都")!!
    attribute.setMorpheme(expected)

    val builder = XContentBuilder.builder(XContentType.JSON.xContent())
    builder.startObject()
    attribute.reflectWith { attClass, key, value ->
      assertEquals(MorphemeAttribute::class.java, attClass)
      assertEquals("morpheme", key)
      assertTrue(value is ToXContent)

      // Write the reflected value under its key into the surrounding JSON object.
      builder.field(key, value)
    }
    builder.endObject()
    builder.flush()

    val json = builder.getOutputStream().toString()
    val actual = Json.decodeFromString<MorphemeHolder>(json).morpheme

    assertNotNull(actual)
    assertEquals(expected.surface(), actual.surface)
    assertEquals(expected.dictionaryForm(), actual.dictionaryForm)
    assertEquals(expected.normalizedForm(), actual.normalizedForm)
    assertEquals(expected.readingForm(), actual.readingForm)
    assertEquals(expected.partOfSpeech(), actual.partOfSpeech)
  }
}

/** Decoding target for the JSON produced in the test: an object with one `morpheme` field. */
@Serializable data class MorphemeHolder(val morpheme: MorphemeAttributeHolder)

/**
 * Decoding target for the `morpheme` object itself. The property names match the keys the test
 * compares against the source morpheme.
 */
@Serializable
data class MorphemeAttributeHolder(
val surface: String,
val dictionaryForm: String,
val normalizedForm: String,
val readingForm: String,
val partOfSpeech: List<String>,
)
21 changes: 21 additions & 0 deletions test-scripts/01-integration-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,27 @@ def test_tokenize_using_sudachi_tokenizer(self):
self.assertEqual(6, tokens[3]["end_offset"])
return

def test_explain_tokenizer_details(self):
    """Analyze with ``explain`` enabled and check the first token's ``morpheme`` entry.

    Each expected key must be present in the morpheme detail and carry the
    expected value.
    """
    request = {
        "tokenizer": "sudachi_tokenizer",
        "text": "すだち",
        "explain": True,
    }
    resp = es_instance.analyze(request)
    self.assertEqual(200, resp.status)

    payload = json.loads(resp.data)
    morpheme = payload["detail"]["tokenizer"]["tokens"][0]["morpheme"]

    # Checked in this order: presence of the key first, then its value.
    expected_fields = (
        ("surface", "すだち"),
        ("dictionaryForm", "すだち"),
        ("normalizedForm", "酢橘"),
        ("readingForm", "スダチ"),
        ("partOfSpeech", ["名詞", "普通名詞", "一般", "*", "*", "*"]),
    )
    for key, value in expected_fields:
        self.assertIn(key, morpheme)
        self.assertEqual(value, morpheme[key])
    return


class TestICUFiltered(unittest.TestCase):
# requires analysis-icu plugin installed
Expand Down
Loading