-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtokenizers.go
64 lines (56 loc) · 1.32 KB
/
tokenizers.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package gortex
import (
"strings"
"fmt"
"unicode"
)
// Tokenizer is the interface implemented by types that break a string into
// a sequence of string tokens.
type Tokenizer interface {
// Split takes the input text str and returns its tokens as a slice of strings.
Split(str string) []string
}
// WhiteSpaceSplitter is a Tokenizer whose tokens are the whitespace-delimited
// fields of the input.
type WhiteSpaceSplitter struct{}

// Split returns the fields of str as reported by strings.Fields: runs of
// white space act as separators and never produce empty tokens.
func (WhiteSpaceSplitter) Split(str string) []string {
	fields := strings.Fields(str)
	return fields
}
// CharSplitter is a Tokenizer that treats every character (rune) of the input
// as its own token.
type CharSplitter struct{}

// Split decodes str into runes and returns one single-rune string per rune,
// in input order. White space is not skipped; it is returned as tokens too.
func (CharSplitter) Split(str string) []string {
	chars := []rune(str)
	tokens := make([]string, 0, len(chars))
	for _, c := range chars {
		tokens = append(tokens, string(c))
	}
	return tokens
}
// WordSplitter is a Tokenizer that splits text into words and stand-alone
// punctuation/symbol tokens.
type WordSplitter struct{}

// Split scans str rune by rune and returns its tokens in input order:
//
//   - every rune for which unicode.IsPunct or unicode.IsSymbol is true becomes
//     a one-rune token of its own and ends the word in progress, if any;
//   - runs of white space (unicode.IsSpace) separate words and are dropped;
//   - any other rune is appended to the word in progress.
//
// The result is nil when str contains no tokens.
func (s WordSplitter) Split(str string) []string {
	var (
		split []string
		token strings.Builder // word currently being accumulated
	)

	// flush emits the pending word, if there is one, and starts a new one.
	flush := func() {
		if token.Len() > 0 {
			split = append(split, token.String())
			token.Reset()
		}
	}

	for _, r := range str {
		switch {
		case unicode.IsPunct(r) || unicode.IsSymbol(r):
			flush()
			split = append(split, string(r))
		case unicode.IsSpace(r):
			// A no-op for leading or repeated white space, since no word is pending.
			flush()
		case !unicode.IsSpace(r):
			token.WriteRune(r)
		default:
			// Unreachable: the cases above are exhaustive. Kept as the original
			// invariant guard; it is also the only use of fmt in this block.
			panic(fmt.Errorf("unknown symbol %q", r))
		}
	}
	flush() // trailing word without a terminating separator
	return split
}