search/bleve: implement a slightly adjusted ngram token filter
this one also emits the original token if that token is longer than
the configured ngram max
Wessie committed Jun 26, 2024
1 parent 5c15059 commit a5f54ac
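For context, a minimal sketch of how the adjusted filter is expected to behave, assuming the NgramFilter added in this commit and bleve's analysis.Token type; the example function and its sample token are illustrative and not part of the commit:

package bleve

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
)

// ExampleNgramFilter is an illustrative sketch: it feeds one token that is
// longer than the configured max of 5 through the adjusted filter added below.
func ExampleNgramFilter() {
	input := analysis.TokenStream{
		&analysis.Token{Term: []byte("radiohead"), Start: 0, End: 9, Position: 1},
	}

	filter := NgramFilter(2, 5)
	for _, tok := range filter.Filter(input) {
		fmt.Println(string(tok.Term))
	}
	// "radiohead" itself is emitted first, because len("radiohead") > 5,
	// followed by the 2- to 5-character ngrams from bleve's ngram filter
}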
Showing 2 changed files with 34 additions and 2 deletions.
35 changes: 33 additions & 2 deletions search/bleve/analyzer.go
@@ -2,13 +2,15 @@ package bleve

import (
	"bytes"
+	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/analyzer/web"
	"github.com/blevesearch/bleve/v2/analysis/lang/cjk"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/token/ngram"
	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
+	"github.com/blevesearch/bleve/v2/analysis/tokenizer/whitespace"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/robpike/nihongo"
)
@@ -43,14 +45,14 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
			cjkFilter,
			toLowerFilter,
			unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFC),
-			ngram.NewNgramFilter(2, 5),
+			NgramFilter(2, 5),
		},
	}
	return &rv, nil
}

func QueryAnalyzerConstructor(config map[string]any, cache *registry.Cache) (analysis.Analyzer, error) {
-	tokenizer, err := cache.TokenizerNamed(web.Name)
+	tokenizer, err := cache.TokenizerNamed(whitespace.Name)
	if err != nil {
		return nil, err
	}
@@ -82,6 +84,17 @@ func (fn FilterFn) Filter(input analysis.TokenStream) analysis.TokenStream {
	return fn(input)
}

+func DebugFilter(prefix string) analysis.TokenFilter {
+	return FilterFn(func(input analysis.TokenStream) analysis.TokenStream {
+		fmt.Printf("======== %s ========\n", prefix)
+		for i, token := range input {
+			fmt.Printf("%d %s\n", i, token)
+		}
+		fmt.Printf("======== %s ========\n", prefix)
+		return input
+	})
+}
+
func RomajiFilter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0, len(input))

@@ -106,6 +119,24 @@ func RomajiFilter(input analysis.TokenStream) analysis.TokenStream {
	return rv
}

+func NgramFilter(min, max int) analysis.TokenFilter {
+	ngram := ngram.NewNgramFilter(min, max)
+
+	return FilterFn(func(input analysis.TokenStream) analysis.TokenStream {
+		rv := make(analysis.TokenStream, 0, len(input))
+
+		for i, tok := range input {
+			if len(tok.Term) > max {
+				// add the original token if it's above max
+				rv = append(rv, tok)
+			}
+			// add the ngram tokens
+			rv = append(rv, ngram.Filter(input[i:i+1])...)
+		}
+		return rv
+	})
+}
+
/*func KagomeFilter() (FilterFn, error) {
	t, err := tokenizer.New(uni.Dict(), tokenizer.OmitBosEos())
	if err != nil {
1 change: 1 addition & 0 deletions search/bleve/main.go
@@ -256,6 +256,7 @@ func constructIndexMapping() (mapping.IndexMapping, error) {
	data := bleve.NewTextFieldMapping()
	data.Index = false
	data.Store = true
+	data.Analyzer = "keyword"
	sm.AddFieldMappingsAt("data", data)

	// register the song mapping
