search/bleve: write a custom tokenizer and query creator
the tokenizer is in essence a basic whitespace tokenizer, but it also
splits on known japanese boundaries by using kagome.

the query creator is a simple match query, but this time it uses the correct
analyzer (the same one used at index creation).

the analyzer has been changed to use the new tokenizer and to no longer use
shingles. the ngram filter has been changed to no longer include the original
term. some cjk-specific handling has also been removed; it now leans on the
new tokenizer and ngram filter instead (with the shingles removed, this works
reasonably well so far).
Wessie committed Dec 22, 2024
1 parent ef7c33b commit eb48b2a
Showing 7 changed files with 371 additions and 253 deletions.
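The query-creator change described in the commit message lives in one of the files not expanded below. As a rough illustration only, a match query that reuses the index-time analyzer might look like the following sketch; the newRadioQuery helper, the "title" field, and the package name are assumptions for illustration, not code from this commit.

package search

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

// newRadioQuery builds a plain match query that is analyzed with the same
// "radio" analyzer registered at index creation, as the commit message
// describes. The field name is hypothetical.
func newRadioQuery(term string) query.Query {
	q := bleve.NewMatchQuery(term)
	q.SetField("title")  // hypothetical field name
	q.Analyzer = "radio" // reuse the analyzer the index was built with
	return q
}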
12 changes: 6 additions & 6 deletions go.mod
@@ -13,11 +13,10 @@ require (
github.com/Wessie/fdstore v1.2.3
github.com/XSAM/otelsql v0.36.0
github.com/adtac/go-akismet v0.0.0-20181220032308-0ca9e1023047
github.com/agoda-com/opentelemetry-go/otelzerolog v0.0.1
github.com/agoda-com/opentelemetry-logs-go v0.5.1
github.com/alevinval/sse v1.0.2
github.com/alexedwards/scs/v2 v2.8.0
github.com/blevesearch/bleve/v2 v2.4.4
github.com/blevesearch/bleve_index_api v1.2.0
github.com/cenkalti/backoff v2.2.1+incompatible
github.com/cenkalti/backoff/v4 v4.3.0
github.com/davecgh/go-spew v1.1.1
@@ -26,6 +25,8 @@ require (
github.com/golang-migrate/migrate/v4 v4.18.1
github.com/google/subcommands v1.2.0
github.com/gorilla/csrf v1.7.2
github.com/ikawaha/kagome-dict/ipa v1.2.0
github.com/ikawaha/kagome/v2 v2.10.0
github.com/jmoiron/sqlx v1.4.0
github.com/justincormack/go-memfd v0.0.0-20170219213707-6e4af0518993
github.com/jxskiss/base62 v1.1.0
@@ -56,7 +57,6 @@ require (
go.opentelemetry.io/otel/trace v1.33.0
golang.org/x/crypto v0.31.0
golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67
golang.org/x/term v0.27.0
golang.org/x/text v0.21.0
golang.org/x/tools v0.28.0
google.golang.org/grpc v1.69.2
@@ -65,14 +65,13 @@
require (
dario.cat/mergo v1.0.1 // indirect
filippo.io/edwards25519 v1.1.0 // indirect
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 // indirect
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/Microsoft/hcsshim v0.12.9 // indirect
github.com/RoaringBitmap/roaring v1.9.4 // indirect
github.com/andybalholm/brotli v1.1.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.20.0 // indirect
github.com/blevesearch/bleve_index_api v1.2.0 // indirect
github.com/blevesearch/geo v0.1.20 // indirect
github.com/blevesearch/go-faiss v1.0.24 // indirect
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
@@ -90,7 +89,6 @@
github.com/blevesearch/zapx/v15 v15.3.17 // indirect
github.com/blevesearch/zapx/v16 v16.1.10 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/containerd/containerd v1.7.24 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/containerd/platforms v0.2.1 // indirect
github.com/cpuguy83/dockercfg v0.3.2 // indirect
@@ -109,6 +107,7 @@
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect
github.com/ikawaha/kagome-dict v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/lufia/plan9stats v0.0.0-20240909124753-873cd0166683 // indirect
github.com/magiconair/properties v1.8.9 // indirect
@@ -143,6 +142,7 @@
go.etcd.io/bbolt v1.3.11 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 // indirect
go.opentelemetry.io/otel/metric v1.33.0 // indirect
go.opentelemetry.io/proto/otlp v1.4.0 // indirect
golang.org/x/net v0.33.0 // indirect
76 changes: 8 additions & 68 deletions go.sum

Large diffs are not rendered by default.

212 changes: 175 additions & 37 deletions search/bleve/analyzer.go
@@ -3,57 +3,62 @@ package bleve
import (
"bytes"
"fmt"
"strings"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/analyzer/web"
"github.com/blevesearch/bleve/v2/analysis/lang/cjk"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/ngram"
"github.com/blevesearch/bleve/v2/analysis/token/shingle"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/token/unique"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/whitespace"
"github.com/blevesearch/bleve/v2/registry"
"github.com/ikawaha/kagome-dict/ipa"
"github.com/ikawaha/kagome/v2/tokenizer"
"github.com/robpike/nihongo"
)

func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(web.Name)
if err != nil {
return nil, err
}
const NgramFilterMin = 2
const NgramFilterMax = 3

cjkWidth, err := cache.TokenFilterNamed(cjk.WidthName)
if err != nil {
return nil, err
}
var _ analysis.Analyzer = new(multiAnalyzer)

cjkFilter, err := cache.TokenFilterNamed(cjk.BigramName)
if err != nil {
return nil, err
type PrefilterFn func(in []byte) (out []byte)

type multiAnalyzer struct {
prefilter func(in []byte) (out []byte)
analyzers []analysis.Analyzer
}

func (ma *multiAnalyzer) Analyze(text []byte) analysis.TokenStream {
var res analysis.TokenStream

fmt.Println(string(text))
if ma.prefilter != nil {
new := ma.prefilter(text)
if !bytes.Equal(text, new) {
res = ma.analyze(res, new)
}
}
_ = cjkFilter

toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
return ma.analyze(res, text)
}

func (ma *multiAnalyzer) analyze(res analysis.TokenStream, text []byte) analysis.TokenStream {
for _, a := range ma.analyzers {
res = append(res, a.Analyze(text)...)
}
return res
}

rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
cjkWidth,
shingle.NewShingleFilter(2, 4, true, " ", "_"),
FilterFn(RomajiFilter),
toLowerFilter,
cjkFilter,
unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFC),
NgramFilter(2, 3),
},
func NewMultiAnalyzer(pre PrefilterFn, a ...analysis.Analyzer) analysis.Analyzer {
return &multiAnalyzer{
prefilter: pre,
analyzers: a,
}
return &rv, nil
}

func QueryAnalyzerConstructor(config map[string]any, cache *registry.Cache) (analysis.Analyzer, error) {
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
// construct our normal analyzer
tokenizer, err := cache.TokenizerNamed(whitespace.Name)
if err != nil {
return nil, err
@@ -64,20 +69,42 @@ func QueryAnalyzerConstructor(config map[string]any, cache *registry.Cache) (ana
return nil, err
}

rv := analysis.DefaultAnalyzer{
normalizeFilter := unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFC)

normal := &analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
FilterFn(RomajiFilter),
toLowerFilter,
//shingle.NewShingleFilter(2, 4, true, " ", "_"),
normalizeFilter,
NgramFilter(NgramFilterMin, NgramFilterMax),
},
}

return &rv, nil
// construct the japanese specific analyzer
japanese := &analysis.DefaultAnalyzer{
Tokenizer: NewKagomeTokenizer(),
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normalizeFilter,
FilterFn(RomajiFilter),
NgramFilter(NgramFilterMin, NgramFilterMax),
unique.NewUniqueTermFilter(),
},
}

_ = normal
return japanese, nil
/*
return NewMultiAnalyzer(nihongo.Romaji,
japanese,
normal,
), nil
*/
}

func init() {
registry.RegisterAnalyzer("radio", AnalyzerConstructor)
registry.RegisterAnalyzer("radio-query", QueryAnalyzerConstructor)
}

type FilterFn func(input analysis.TokenStream) analysis.TokenStream
Expand Down Expand Up @@ -130,7 +157,7 @@ func NgramFilter(min, max int) analysis.TokenFilter {
for i, tok := range input {
if len(tok.Term) > max {
// add the original token if it's above max
rv = append(rv, tok)
//rv = append(rv, tok)
}
// add the ngram tokens if this isn't a shingle
if tok.Type != analysis.Shingle {
@@ -141,6 +168,117 @@
})
}

type KagomeTokenizer struct {
tok *tokenizer.Tokenizer
}

func NewKagomeTokenizer() *KagomeTokenizer {
tok, err := tokenizer.New(ipa.Dict(), tokenizer.OmitBosEos())
if err != nil {
return nil
}

return &KagomeTokenizer{
tok: tok,
}
}

func (t *KagomeTokenizer) Tokenize(input []byte) analysis.TokenStream {
if len(input) < 1 {
return nil
}

var bytePos int
var surface []byte
var rv analysis.TokenStream
var tokenPos int

appendToken := func(token *analysis.Token) {
rv, tokenPos = append(rv, token), tokenPos+1
}

for _, m := range t.tok.Analyze(string(input), tokenizer.Search) {
bytePos += len(m.Surface) // add to the running byte count

surfaceLen := len(m.Surface) // record before we trim
m.Surface = strings.TrimSpace(m.Surface)
if len(m.Surface) == 0 && len(surface) > 0 {
// we found some whitespace, emit everything we've collected in the surface
token := &analysis.Token{
Term: surface,
Position: tokenPos,
Start: bytePos - len(surface) - surfaceLen,
End: bytePos,
Type: analysis.AlphaNumeric,
}

appendToken(token)
surface = nil
continue
}

if m.Class == tokenizer.KNOWN {
// we hit something that the tokenizer knows, this probably means some
// japanese text, emit whatever is in the current surface first and then
// handle the new token
if len(surface) > 0 {
token := &analysis.Token{
Term: surface,
Position: tokenPos,
Start: bytePos - len(surface),
End: bytePos,
Type: analysis.AlphaNumeric,
}

appendToken(token)
surface = nil
}

// now handle the KNOWN token
token := &analysis.Token{
Term: []byte(m.Surface),
Position: tokenPos,
Start: bytePos - len(m.Surface),
End: bytePos,
Type: analysis.Ideographic,
}
appendToken(token)
continue
}

surface = append(surface, m.Surface...)
}

// end of the input, we might have a straggling surface left over
if len(surface) > 0 {
token := &analysis.Token{
Term: surface,
Position: tokenPos,
Start: bytePos - len(surface),
End: bytePos,
Type: analysis.AlphaNumeric,
}

rv = append(rv, token)
}

/*
fmt.Printf("%s -> ", string(input))
for _, token := range rv {
fmt.Printf("[%s]", string(token.Term))
}
fmt.Printf("\n")
*/

/*
for _, token := range rv {
fmt.Printf("TOKEN: %v\n", token)
fmt.Println(string(input[token.Start:token.End]))
}
*/
return rv
}

/*func KagomeFilter() (FilterFn, error) {
t, err := tokenizer.New(uni.Dict(), tokenizer.OmitBosEos())
if err != nil {
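For context on how the "radio" analyzer registered above would typically be consumed, here is a hedged sketch (not part of this commit) of wiring it into a bleve index mapping. The "song" document type, the "title" field, and the helper name are illustrative assumptions.

package search

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/mapping"
)

// newIndexMapping attaches the "radio" analyzer registered in analyzer.go's
// init() to a text field. Names here are hypothetical and for illustration only.
func newIndexMapping() mapping.IndexMapping {
	titleField := bleve.NewTextFieldMapping()
	titleField.Analyzer = "radio"

	doc := bleve.NewDocumentMapping()
	doc.AddFieldMappingsAt("title", titleField)

	im := bleve.NewIndexMapping()
	im.AddDocumentMapping("song", doc)
	return im
}

An index created with bleve.New(path, newIndexMapping()) would then run its text fields through the custom Kagome-based tokenizer and ngram filter described in the commit message.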
