Skip to content

Commit

Permalink
separate words and numbers (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
sepehrsoh authored Feb 8, 2025
2 parents c762fec + 49e61f7 commit cc37be7
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 11 deletions.
68 changes: 66 additions & 2 deletions internal/lookup_compound.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,21 @@ import (
"regexp"
"strconv"
"strings"
"unicode"

"github.com/snapp-incubator/go-symspell/pkg/items"
verbositypkg "github.com/snapp-incubator/go-symspell/pkg/verbosity"
)

func parseWords(phrase string, preserveCase bool, splitBySpace bool) []string {
func parseWords(phrase string, preserveCase, splitBySpace, splitNumber bool) []string {
if !preserveCase {
phrase = strings.ToLower(phrase)
}

if splitBySpace {
if splitNumber {
return separateNumbers(strings.Split(phrase, " "))
}
return strings.Split(phrase, " ")
}

Expand All @@ -29,7 +33,7 @@ func parseWords(phrase string, preserveCase bool, splitBySpace bool) []string {
var reSplit = regexp.MustCompile(`([\p{L}\d]+(?:['’][\p{L}\d]+)?)`)

func (s *SymSpell) LookupCompound(phrase string, maxEditDistance int) *items.SuggestItem {
terms1 := parseWords(phrase, s.PreserveCase, s.SplitWordBySpace)
terms1 := parseWords(phrase, s.PreserveCase, s.SplitWordBySpace, s.SplitWordAndNumber)
cp := compoundProcessor{
suggestions: make([]items.SuggestItem, 0),
suggestionParts: make([]items.SuggestItem, 0),
Expand Down Expand Up @@ -317,3 +321,63 @@ type compoundProcessor struct {
// tempTerm returns the Term of suggestion1 followed by the Term of
// suggestion2, separated by a single space.
func (c *compoundProcessor) tempTerm() string {
	first, second := c.suggestion1.Term, c.suggestion2.Term
	return fmt.Sprintf("%s %s", first, second)
}

// separateNumbers runs splitWordAndNumber over every non-empty element of
// inputs and returns all resulting pieces in order. Empty elements are
// dropped. The result is nil when nothing was produced.
func separateNumbers(inputs []string) []string {
	var parts []string
	for i := range inputs {
		if word := inputs[i]; word != "" {
			parts = append(parts, splitWordAndNumber(word)...)
		}
	}
	return parts
}

// splitWordAndNumber splits input in two at the first position where the
// character class changes. The classes are digit, letter, and everything
// else (see classifyRune).
//
// Only the first boundary is used, so at most two pieces are returned:
//
//	"15خرداد"   -> ["15", "خرداد"]
//	"abc123def" -> ["abc", "123def"]
//
// If every character belongs to the same class, input is returned unchanged
// as a single-element slice. An empty input yields nil.
func splitWordAndNumber(input string) []string {
	// Guard against empty input: there is no first character to classify.
	if input == "" {
		return nil
	}

	var firstClass int
	// Ranging over a string decodes runes and yields their byte offsets, so
	// the pieces can be sliced straight out of input without converting to
	// []rune and back (which would also rewrite any invalid UTF-8 bytes).
	for i, r := range input {
		if i == 0 {
			firstClass = classifyRune(r)
			continue
		}
		if classifyRune(r) != firstClass {
			return []string{input[:i], input[i:]}
		}
	}

	// No boundary found: the whole string is a single piece.
	return []string{input}
}

// classifyRune reports the character class of r as used by
// splitWordAndNumber: 1 for a digit, 2 for a letter, and 0 for anything else.
func classifyRune(r rune) int {
	switch {
	case unicode.IsDigit(r):
		return 1
	case unicode.IsLetter(r):
		return 2
	default:
		return 0
	}
}
34 changes: 34 additions & 0 deletions internal/lookup_compound_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package internal
import (
"encoding/json"
"os"
"reflect"
"testing"

"github.com/snapp-incubator/go-symspell/pkg/options"
Expand Down Expand Up @@ -82,3 +83,36 @@ func TestLookupCompound(t *testing.T) {
}
}
}

// Test_separateNumbers exercises splitWordAndNumber, the per-token helper
// used by separateNumbers: a token is split at the first boundary between
// character classes, and a token without such a boundary is returned whole.
func Test_separateNumbers(t *testing.T) {
	tests := []struct {
		name  string
		input string
		want  []string
	}{
		{
			name:  "first number",
			input: "15خرداد",
			want:  []string{"15", "خرداد"},
		},
		{
			name:  "first word",
			input: "خرداد15",
			want:  []string{"خرداد", "15"},
		},
		{
			name:  "letters only",
			input: "خرداد",
			want:  []string{"خرداد"},
		},
		{
			name:  "digits only",
			input: "15",
			want:  []string{"15"},
		},
		{
			// Only the first class change is used as a split point.
			name:  "only first boundary",
			input: "abc123def",
			want:  []string{"abc", "123def"},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := splitWordAndNumber(tt.input)
			if !reflect.DeepEqual(got, tt.want) {
				// Name the function actually under test and show the input.
				t.Errorf("splitWordAndNumber(%q) = %v, want %v", tt.input, got, tt.want)
			}
		})
	}
}
2 changes: 2 additions & 0 deletions internal/symspell.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type SymSpell struct {
SplitThreshold int
PreserveCase bool
SplitWordBySpace bool
SplitWordAndNumber bool
MinimumCharToChange int
Words map[string]int
BelowThresholdWords map[string]int
Expand Down Expand Up @@ -60,6 +61,7 @@ func NewSymSpell(opt ...options.Options) (*SymSpell, error) {
SplitThreshold: opts.SplitItemThreshold,
PreserveCase: opts.PreserveCase,
SplitWordBySpace: opts.SplitWordBySpace,
SplitWordAndNumber: opts.SplitWordAndNumber,
MinimumCharToChange: opts.MinimumCharacterToChange,
Words: make(map[string]int),
BelowThresholdWords: make(map[string]int),
Expand Down
24 changes: 16 additions & 8 deletions pkg/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ var DefaultOptions = SymspellOptions{
SplitItemThreshold: 1,
PreserveCase: false,
SplitWordBySpace: false,
SplitWordAndNumber: false,
MinimumCharacterToChange: 1,
}

Expand All @@ -17,6 +18,7 @@ type SymspellOptions struct {
SplitItemThreshold int
PreserveCase bool
SplitWordBySpace bool
SplitWordAndNumber bool
MinimumCharacterToChange int
}

Expand All @@ -32,48 +34,54 @@ func (w FuncConfig) Apply(conf *SymspellOptions) {
w.ops(conf)
}

func NewFuncWireOption(f func(options *SymspellOptions)) *FuncConfig {
func NewFuncOption(f func(options *SymspellOptions)) *FuncConfig {
return &FuncConfig{ops: f}
}

func WithMaxDictionaryEditDistance(maxDictionaryEditDistance int) Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.MaxDictionaryEditDistance = maxDictionaryEditDistance
})
}

func WithPrefixLength(prefixLength int) Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.PrefixLength = prefixLength
})
}

func WithCountThreshold(countThreshold int) Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.CountThreshold = countThreshold
})
}

func WithSplitItemThreshold(splitThreshold int) Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.SplitItemThreshold = splitThreshold
})
}

func WithPreserveCase() Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.PreserveCase = true
})
}

func WithSplitWordBySpace() Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.SplitWordBySpace = true
})
}

func WithMinimumCharacterToChange(charLength int) Options {
return NewFuncWireOption(func(options *SymspellOptions) {
return NewFuncOption(func(options *SymspellOptions) {
options.MinimumCharacterToChange = charLength
})
}

// WithSplitWordAndNumbers returns an option that sets SplitWordAndNumber to
// true on the options it is applied to.
func WithSplitWordAndNumbers() Options {
	enable := func(cfg *SymspellOptions) {
		cfg.SplitWordAndNumber = true
	}
	return NewFuncOption(enable)
}
20 changes: 19 additions & 1 deletion symspell_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,25 @@ func TestSymspellLookupCompoundUnigram(t *testing.T) {
},
want: "بیمارستان ا",
},
{
name: "Split Number",
args: args{
a: "17شهریور",
maxEditDistance: 3,
},
want: "17 شهریور",
},
{
name: "Split Number2",
args: args{
a: "خرداد15",
maxEditDistance: 3,
},
want: "خرداد 15",
},
}
symSpell := NewSymSpellWithLoadBigramDictionary("internal/tests/vocab_fa.txt",
symSpell := NewSymSpellWithLoadBigramDictionary(
"internal/tests/vocab_fa.txt",
"internal/tests/vocab_bigram_fa.txt",
0,
1,
Expand All @@ -204,6 +221,7 @@ func TestSymspellLookupCompoundUnigram(t *testing.T) {
options.WithSplitItemThreshold(100),
options.WithSplitWordBySpace(),
options.WithMinimumCharacterToChange(2),
options.WithSplitWordAndNumbers(),
)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down

0 comments on commit cc37be7

Please sign in to comment.