search/bleve: write a custom tokenizer and query creator
the tokenizer is in essence a basic whitespace tokenizer, but it also
splits on known japanese boundaries by using kagome.

the query creator is a simple match query, but this time it uses the correct
analyzer (the same one used at index creation).

the analyzer has been changed to use the new tokenizer and to no longer use
shingles. the ngram filter has been changed to no longer include the original
term. some cjk-specific handling has also been removed; it now leans on the
new tokenizer and ngram filter instead (with the shingles removed, this works
reasonably well so far).
Wessie committed Dec 22, 2024
1 parent ef7c33b commit eb48b2a
Showing 7 changed files with 371 additions and 253 deletions.
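The query-creator change described in the commit message lives in one of the files not expanded below. As a rough illustration only, a match query that reuses the index-time analyzer might look like the following sketch; the newRadioQuery helper, the "title" field, and the package name are assumptions for illustration, not code from this commit.

package search

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

// newRadioQuery builds a plain match query that is analyzed with the same
// "radio" analyzer registered at index creation, as the commit message
// describes. The field name is hypothetical.
func newRadioQuery(term string) query.Query {
	q := bleve.NewMatchQuery(term)
	q.SetField("title")  // hypothetical field name
	q.Analyzer = "radio" // reuse the analyzer the index was built with
	return q
}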
12 changes: 6 additions & 6 deletions go.mod
@@ -13,11 +13,10 @@ require (
github.com/Wessie/fdstore v1.2.3
github.com/XSAM/otelsql v0.36.0
github.com/adtac/go-akismet v0.0.0-20181220032308-0ca9e1023047
github.com/agoda-com/opentelemetry-go/otelzerolog v0.0.1
github.com/agoda-com/opentelemetry-logs-go v0.5.1
github.com/alevinval/sse v1.0.2
github.com/alexedwards/scs/v2 v2.8.0
github.com/blevesearch/bleve/v2 v2.4.4
github.com/blevesearch/bleve_index_api v1.2.0
github.com/cenkalti/backoff v2.2.1+incompatible
github.com/cenkalti/backoff/v4 v4.3.0
github.com/davecgh/go-spew v1.1.1
@@ -26,6 +25,8 @@ require (
github.com/golang-migrate/migrate/v4 v4.18.1
github.com/google/subcommands v1.2.0
github.com/gorilla/csrf v1.7.2
github.com/ikawaha/kagome-dict/ipa v1.2.0
github.com/ikawaha/kagome/v2 v2.10.0
github.com/jmoiron/sqlx v1.4.0
github.com/justincormack/go-memfd v0.0.0-20170219213707-6e4af0518993
github.com/jxskiss/base62 v1.1.0
@@ -56,7 +57,6 @@ require (
go.opentelemetry.io/otel/trace v1.33.0
golang.org/x/crypto v0.31.0
golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67
golang.org/x/term v0.27.0
golang.org/x/text v0.21.0
golang.org/x/tools v0.28.0
google.golang.org/grpc v1.69.2
@@ -65,14 +65,13 @@
require (
dario.cat/mergo v1.0.1 // indirect
filippo.io/edwards25519 v1.1.0 // indirect
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 // indirect
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/Microsoft/hcsshim v0.12.9 // indirect
github.com/RoaringBitmap/roaring v1.9.4 // indirect
github.com/andybalholm/brotli v1.1.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.20.0 // indirect
github.com/blevesearch/bleve_index_api v1.2.0 // indirect
github.com/blevesearch/geo v0.1.20 // indirect
github.com/blevesearch/go-faiss v1.0.24 // indirect
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
@@ -90,7 +89,6 @@
github.com/blevesearch/zapx/v15 v15.3.17 // indirect
github.com/blevesearch/zapx/v16 v16.1.10 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/containerd/containerd v1.7.24 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/containerd/platforms v0.2.1 // indirect
github.com/cpuguy83/dockercfg v0.3.2 // indirect
@@ -109,6 +107,7 @@
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect
github.com/ikawaha/kagome-dict v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/lufia/plan9stats v0.0.0-20240909124753-873cd0166683 // indirect
github.com/magiconair/properties v1.8.9 // indirect
@@ -143,6 +142,7 @@
go.etcd.io/bbolt v1.3.11 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 // indirect
go.opentelemetry.io/otel/metric v1.33.0 // indirect
go.opentelemetry.io/proto/otlp v1.4.0 // indirect
golang.org/x/net v0.33.0 // indirect
76 changes: 8 additions & 68 deletions go.sum

Large diffs are not rendered by default.

212 changes: 175 additions & 37 deletions search/bleve/analyzer.go
@@ -3,57 +3,62 @@ package bleve
import (
"bytes"
"fmt"
"strings"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/analyzer/web"
"github.com/blevesearch/bleve/v2/analysis/lang/cjk"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/ngram"
"github.com/blevesearch/bleve/v2/analysis/token/shingle"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/token/unique"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/whitespace"
"github.com/blevesearch/bleve/v2/registry"
"github.com/ikawaha/kagome-dict/ipa"
"github.com/ikawaha/kagome/v2/tokenizer"
"github.com/robpike/nihongo"
)

func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(web.Name)
if err != nil {
return nil, err
}
const NgramFilterMin = 2
const NgramFilterMax = 3

cjkWidth, err := cache.TokenFilterNamed(cjk.WidthName)
if err != nil {
return nil, err
}
var _ analysis.Analyzer = new(multiAnalyzer)

cjkFilter, err := cache.TokenFilterNamed(cjk.BigramName)
if err != nil {
return nil, err
type PrefilterFn func(in []byte) (out []byte)

type multiAnalyzer struct {
prefilter func(in []byte) (out []byte)
analyzers []analysis.Analyzer
}

func (ma *multiAnalyzer) Analyze(text []byte) analysis.TokenStream {
var res analysis.TokenStream

fmt.Println(string(text))
if ma.prefilter != nil {
new := ma.prefilter(text)
if !bytes.Equal(text, new) {
res = ma.analyze(res, new)
}
}
_ = cjkFilter

toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
return ma.analyze(res, text)
}

func (ma *multiAnalyzer) analyze(res analysis.TokenStream, text []byte) analysis.TokenStream {
for _, a := range ma.analyzers {
res = append(res, a.Analyze(text)...)
}
return res
}

rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
cjkWidth,
shingle.NewShingleFilter(2, 4, true, " ", "_"),
FilterFn(RomajiFilter),
toLowerFilter,
cjkFilter,
unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFC),
NgramFilter(2, 3),
},
func NewMultiAnalyzer(pre PrefilterFn, a ...analysis.Analyzer) analysis.Analyzer {
return &multiAnalyzer{
prefilter: pre,
analyzers: a,
}
return &rv, nil
}

func QueryAnalyzerConstructor(config map[string]any, cache *registry.Cache) (analysis.Analyzer, error) {
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
// construct our normal analyzer
tokenizer, err := cache.TokenizerNamed(whitespace.Name)
if err != nil {
return nil, err
@@ -64,20 +69,42 @@ func QueryAnalyzerConstructor(config map[string]any, cache *registry.Cache) (ana
return nil, err
}

rv := analysis.DefaultAnalyzer{
normalizeFilter := unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFC)

normal := &analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
FilterFn(RomajiFilter),
toLowerFilter,
//shingle.NewShingleFilter(2, 4, true, " ", "_"),
normalizeFilter,
NgramFilter(NgramFilterMin, NgramFilterMax),
},
}

return &rv, nil
// construct the japanese specific analyzer
japanese := &analysis.DefaultAnalyzer{
Tokenizer: NewKagomeTokenizer(),
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normalizeFilter,
FilterFn(RomajiFilter),
NgramFilter(NgramFilterMin, NgramFilterMax),
unique.NewUniqueTermFilter(),
},
}

_ = normal
return japanese, nil
/*
return NewMultiAnalyzer(nihongo.Romaji,
japanese,
normal,
), nil
*/
}

func init() {
registry.RegisterAnalyzer("radio", AnalyzerConstructor)
registry.RegisterAnalyzer("radio-query", QueryAnalyzerConstructor)
}

type FilterFn func(input analysis.TokenStream) analysis.TokenStream
Expand Down Expand Up @@ -130,7 +157,7 @@ func NgramFilter(min, max int) analysis.TokenFilter {
for i, tok := range input {
if len(tok.Term) > max {
// add the original token if it's above max
rv = append(rv, tok)
//rv = append(rv, tok)
}
// add the ngram tokens if this isn't a shingle
if tok.Type != analysis.Shingle {
@@ -141,6 +168,117 @@
})
}

type KagomeTokenizer struct {
tok *tokenizer.Tokenizer
}

func NewKagomeTokenizer() *KagomeTokenizer {
tok, err := tokenizer.New(ipa.Dict(), tokenizer.OmitBosEos())
if err != nil {
return nil
}

return &KagomeTokenizer{
tok: tok,
}
}

func (t *KagomeTokenizer) Tokenize(input []byte) analysis.TokenStream {
if len(input) < 1 {
return nil
}

var bytePos int
var surface []byte
var rv analysis.TokenStream
var tokenPos int

appendToken := func(token *analysis.Token) {
rv, tokenPos = append(rv, token), tokenPos+1
}

for _, m := range t.tok.Analyze(string(input), tokenizer.Search) {
bytePos += len(m.Surface) // add to the running byte count

surfaceLen := len(m.Surface) // record before we trim
m.Surface = strings.TrimSpace(m.Surface)
if len(m.Surface) == 0 && len(surface) > 0 {
// we found some whitespace, emit everything we've collected in the surface
token := &analysis.Token{
Term: surface,
Position: tokenPos,
Start: bytePos - len(surface) - surfaceLen,
End: bytePos,
Type: analysis.AlphaNumeric,
}

appendToken(token)
surface = nil
continue
}

if m.Class == tokenizer.KNOWN {
// we hit something that the tokenizer knows, this probably means some
// japanese text, emit whatever is in the current surface first and then
// handle the new token
if len(surface) > 0 {
token := &analysis.Token{
Term: surface,
Position: tokenPos,
Start: bytePos - len(surface),
End: bytePos,
Type: analysis.AlphaNumeric,
}

appendToken(token)
surface = nil
}

// now handle the KNOWN token
token := &analysis.Token{
Term: []byte(m.Surface),
Position: tokenPos,
Start: bytePos - len(m.Surface),
End: bytePos,
Type: analysis.Ideographic,
}
appendToken(token)
continue
}

surface = append(surface, m.Surface...)
}

// end of the input, we might have a straggling surface left over
if len(surface) > 0 {
token := &analysis.Token{
Term: surface,
Position: tokenPos,
Start: bytePos - len(surface),
End: bytePos,
Type: analysis.AlphaNumeric,
}

rv = append(rv, token)
}

/*
fmt.Printf("%s -> ", string(input))
for _, token := range rv {
fmt.Printf("[%s]", string(token.Term))
}
fmt.Printf("\n")
*/

/*
for _, token := range rv {
fmt.Printf("TOKEN: %v\n", token)
fmt.Println(string(input[token.Start:token.End]))
}
*/
return rv
}

/*func KagomeFilter() (FilterFn, error) {
t, err := tokenizer.New(uni.Dict(), tokenizer.OmitBosEos())
if err != nil {
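For context on how the "radio" analyzer registered above would typically be consumed, here is a hedged sketch (not part of this commit) of wiring it into a bleve index mapping. The "song" document type, the "title" field, and the helper name are illustrative assumptions.

package search

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/mapping"
)

// newIndexMapping attaches the "radio" analyzer registered in analyzer.go's
// init() to a text field. Names here are hypothetical and for illustration only.
func newIndexMapping() mapping.IndexMapping {
	titleField := bleve.NewTextFieldMapping()
	titleField.Analyzer = "radio"

	doc := bleve.NewDocumentMapping()
	doc.AddFieldMappingsAt("title", titleField)

	im := bleve.NewIndexMapping()
	im.AddDocumentMapping("song", doc)
	return im
}

An index created with bleve.New(path, newIndexMapping()) would then run its text fields through the custom Kagome-based tokenizer and ngram filter described in the commit message.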
