Skip to content

Commit

Permalink
support unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
goropikari committed Sep 10, 2022
1 parent 63a91f3 commit 43230c0
Show file tree
Hide file tree
Showing 29 changed files with 1,816 additions and 2,489 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.PHONY: test
test:
test: build
@go test -shuffle on $(shell go list ./... | grep -v sample)

.PHONY: test-verbose
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

tlex is a lexical analyzer generator similar to Lex.
This is a toy implementation written for my own study, so don't use it in production.
tlex supports only ASCII string, doesn't do unicode.
tlex supports Unicode.


```bash
Expand All @@ -25,7 +25,7 @@ Usage of ./tlex:
$ tlex -src sample.l -pkg main -o main.go
$ go run main.go

func foo123bar() int {
func foo123barあいう () int {
x := 1 * 10 + 123 - 1000 / 5432
y := float64(x)

Expand All @@ -37,6 +37,8 @@ Keyword
"func"
Identifier
"foo123bar"
Hiragana
"あいう"
LParen
"("
RParen
Expand Down
44 changes: 21 additions & 23 deletions automata/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package automata
import (
"crypto/sha256"
stdmath "math"
"unicode"

"github.com/goropikari/tlex/collection"
"github.com/goropikari/tlex/utils/guid"
Expand All @@ -21,11 +22,8 @@ func init() {
}
}

var unicodeRange = []Interval{
NewInterval(0, 127),
NewInterval(49152, 57343),
NewInterval(14680064, 15728639),
NewInterval(4026531840, 4160749567),
var UnicodeRange = []Interval{
NewInterval(0, int(unicode.MaxRune)),
}

type RegexID int
Expand All @@ -36,19 +34,19 @@ type Nothing struct{}
var nothing = Nothing{}

type Interval struct {
l int
r int
L int
R int
}

func NewInterval(s, e int) Interval {
return Interval{
l: s,
r: e,
L: s,
R: e,
}
}

func (x Interval) Overlap(y Interval) bool {
return y.l <= x.r && x.l <= y.r
return y.L <= x.R && x.L <= y.R
}

func (x Interval) Difference(y Interval) []Interval {
Expand All @@ -57,11 +55,11 @@ func (x Interval) Difference(y Interval) []Interval {
}

ret := make([]Interval, 0, 2)
if x.l < y.l {
ret = append(ret, NewInterval(x.l, y.l-1))
if x.L < y.L {
ret = append(ret, NewInterval(x.L, y.L-1))
}
if y.r < x.r {
ret = append(ret, NewInterval(y.r+1, x.r))
if x.R > y.R {
ret = append(ret, NewInterval(y.R+1, x.R))
}

return ret
Expand All @@ -71,10 +69,10 @@ func (x Interval) Difference(y Interval) []Interval {
func Disjoin(intvs []Interval) []Interval {
pq := collection.NewPriorityQueue(func(x, y Interval) bool {
// ascending order
if x.l != y.l {
return x.l > y.l
if x.L != y.L {
return x.L > y.L
}
return x.r > y.r
return x.R > y.R
})

for _, v := range intvs {
Expand All @@ -89,17 +87,17 @@ func Disjoin(intvs []Interval) []Interval {
pq.Pop()

if t1.Overlap(t2) {
if t1.l < t2.l {
nx1 := NewInterval(t1.l, t2.l-1)
nx2 := NewInterval(t2.l, t1.r)
nx3 := NewInterval(t2.l, t2.r)
if t1.L < t2.L {
nx1 := NewInterval(t1.L, t2.L-1)
nx2 := NewInterval(t2.L, t1.R)
nx3 := NewInterval(t2.L, t2.R)
pq.Push(nx1)
pq.Push(nx2)
pq.Push(nx3)
} else { // t1.l == t2.l
pq.Push(t1)
nx := NewInterval(t1.r+1, t2.r)
if t1.r+1 <= t2.r {
nx := NewInterval(t1.R+1, t2.R)
if t1.R+1 <= t2.R {
pq.Push(nx)
}
}
Expand Down
54 changes: 54 additions & 0 deletions automata/dfa.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ func NewDFATransition() *DFATransition {
}
}

// GetMap looks up the outgoing transitions recorded for state sid.
// It returns the interval-to-destination map stored for sid and true,
// or a nil map and false when sid has no entry in the transition table.
func (trans *DFATransition) GetMap(sid StateID) (map[Interval]StateID, bool) {
	row, found := trans.delta[sid]
	return row, found
}

func (trans *DFATransition) Set(from StateID, intv Interval, to StateID) {
_, ok := trans.delta[from]
if !ok {
Expand All @@ -21,6 +26,17 @@ func (trans *DFATransition) Set(from StateID, intv Interval, to StateID) {
trans.delta[from][intv] = to
}

// step finds a transition out of state from whose interval overlaps intv.
// It returns the destination state and true for the first overlapping
// entry encountered while iterating the state's transition map, or
// (0, false) when the state has no transitions or none of them overlaps.
func (trans *DFATransition) step(from StateID, intv Interval) (StateID, bool) {
	row, found := trans.delta[from]
	if !found {
		return 0, false
	}
	for key, dest := range row {
		if key.Overlap(intv) {
			return dest, true
		}
	}
	return 0, false
}

type DFA struct {
size int
intvs []Interval
Expand All @@ -31,12 +47,50 @@ type DFA struct {
stIDToRegID StateIDToRegexID
}

// GetInitState returns the ID of the DFA's initial state.
func (dfa *DFA) GetInitState() StateID {
return dfa.initState
}

// GetFinStates returns the DFA's set of final states.
// The returned pointer is the DFA's own field, not a copy.
func (dfa *DFA) GetFinStates() *collection.Set[StateID] {
return dfa.finStates
}

// GetStates returns the DFA's states as a slice, as produced by the
// Slice method of the underlying state collection.
func (dfa *DFA) GetStates() []StateID {
return dfa.states.Slice()
}

// GetRegexID returns the RegexID that the DFA's state-to-regex table
// holds for state sid.
func (dfa *DFA) GetRegexID(sid StateID) RegexID {
return dfa.stIDToRegID.Get(sid)
}

// GetTransitionTable returns the DFA's transition table.
// The returned pointer is the DFA's own field, not a copy.
func (dfa *DFA) GetTransitionTable() *DFATransition {
return dfa.trans
}

// Accept runs s through the DFA one rune at a time, starting from the
// initial state.
//
// It returns (0, false) as soon as a rune has no transition out of the
// current state. Otherwise it returns the RegexID that the state-to-regex
// table holds for the state reached after the last rune, together with
// whether that state is one of the DFA's final states. For an empty s the
// state reached is the initial state.
func (dfa *DFA) Accept(s string) (RegexID, bool) {
	currSid := dfa.initState
	// Ranging over a string decodes it rune by rune, so no intermediate
	// []rune copy of the input is needed.
	for _, r := range s {
		// Each rune is looked up as the single-point interval [r, r].
		nx, ok := dfa.trans.step(currSid, NewInterval(int(r), int(r)))
		if !ok {
			return 0, false
		}
		currSid = nx
	}
	return dfa.stIDToRegID.Get(currSid), dfa.finStates.Contains(currSid)
}

// stepIntv looks up the transition out of sid keyed by exactly intv.
// The interval passed in is assumed to be one of the intervals in dfa.intvs;
// the lookup is an exact map-key match, not an overlap test.
// nonDeadState is false when no such transition is recorded.
func (dfa *DFA) stepIntv(sid StateID, intv Interval) (stateID StateID, nonDeadState bool) {
	stateID, nonDeadState = dfa.trans.delta[sid][intv]
	return stateID, nonDeadState
}

// state minimization for lexical analyzer
// Compilers: Principles, Techniques, and Tools, 2nd ed., ISBN 9780321486813 (Dragon book)
// p.181 Algorithm 3.39
// p.184 3.9.7 State Minimization in Lexical Analyzers
func (dfa *DFA) grouping() [][]StateID {
regIDMap := map[RegexID][]StateID{}
siter := dfa.states.Iterator()
Expand Down
28 changes: 23 additions & 5 deletions automata/dot.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func (nfa NFA) ToDot() (string, error) {
edges := make(map[collection.Pair[StateID, StateID]]string)
for from, mp := range nfa.trans.mp {
for intv, tos := range mp {
symbols := fmt.Sprintf("[%c-%c]", intv.l, intv.r)
symbols := fmt.Sprintf("[%c-%c]", intv.L, intv.R)
titer := tos.Iterator()
for titer.HasNext() {
to := titer.Next()
Expand Down Expand Up @@ -167,7 +167,7 @@ func (nfa ImdNFA) ToDot() (string, error) {
edges := make(map[collection.Pair[StateID, StateID]]string)
for from, mp := range nfa.trans.mp {
for intv, tos := range mp {
symbols := fmt.Sprintf("[%c-%c]", intv.l, intv.r)
symbols := fmt.Sprintf("[%c-%c]", intv.L, intv.R)
titer := tos.iterator()
for titer.HasNext() {
to := titer.Next()
Expand Down Expand Up @@ -222,8 +222,18 @@ func (nfa ImdNFA) ToDot() (string, error) {

func (dfa DFA) ToDot() (string, error) {
g := graphviz.New()

ftBinary, _ := os.ReadFile("./ipaexg00401/ipaexg.ttf")
var ftBinary []byte
if exists("/usr/share/fonts/opentype/ipaexfont-gothic/ipaexg.ttf") {
ftBinary, _ = os.ReadFile("/usr/share/fonts/opentype/ipaexfont-gothic/ipaexg.ttf")
} else if exists("/usr/share/fonts/OTF/ipaexm.ttf") {
ftBinary, _ = os.ReadFile("/usr/share/fonts/OTF/ipaexm.ttf")
} else {
var err error
ftBinary, err = os.ReadFile("./ipaexg00401/ipaexg.ttf")
if err != nil {
panic(err)
}
}
ft, _ := truetype.Parse(ftBinary)
g.SetFontFace(func(size float64) (font.Face, error) {
opt := &truetype.Options{
Expand Down Expand Up @@ -286,7 +296,10 @@ func (dfa DFA) ToDot() (string, error) {
edges := make(map[collection.Pair[StateID, StateID]]string)
for from, mp := range dfa.trans.delta {
for intv, to := range mp {
symbols := fmt.Sprintf("[%s-%s]", string(rune(intv.l)), string(rune(intv.r)))
var lstr, rstr string
lstr = fmt.Sprintf("%v", intv.L)
rstr = fmt.Sprintf("%v", intv.R)
symbols := fmt.Sprintf("[%s-%s]", lstr, rstr)
p := collection.NewPair(from, to)
if _, ok := edges[p]; ok {
edges[p] = edges[p] + "\n" + symbols
Expand Down Expand Up @@ -323,3 +336,8 @@ func (dfa DFA) ToDot() (string, error) {

return buf.String(), nil
}

// exists reports whether os.Stat succeeds for filename.
// Any Stat error, not only "file not found", is reported as false.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
34 changes: 26 additions & 8 deletions automata/nfa.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,20 @@ type EpsilonTransition struct {
mp map[StateID]*collection.Set[StateID]
}

func NewEpsilonTransition(mp map[StateID]*collection.Set[StateID]) EpsilonTransition {
func NewEpsilonTransition() EpsilonTransition {
return EpsilonTransition{
mp: mp,
mp: make(map[StateID]*collection.Set[StateID]),
}
}

func (t EpsilonTransition) set(from, to StateID) {
func (t EpsilonTransition) Set(from, to StateID) EpsilonTransition {
if _, ok := t.mp[from]; ok {
t.mp[from].Insert(to)
} else {
t.mp[from] = collection.NewSet[StateID]().Insert(to)
}

return t
}

func (trans *EpsilonTransition) merge(other EpsilonTransition) {
Expand All @@ -40,8 +42,24 @@ type NFATransition struct {
mp map[StateID]map[Interval]*collection.Set[StateID]
}

func NewTransition(mp map[StateID]map[Interval]*collection.Set[StateID]) NFATransition {
return NFATransition{mp: mp}
func NewNFATransition() NFATransition {
return NFATransition{
mp: make(map[StateID]map[Interval]*collection.Set[StateID]),
}
}

// Set records a transition from state from to state to on interval intv.
// The per-state interval map and the destination set are created the first
// time they are needed. It returns trans, a copy of the receiver value that
// shares the same underlying map.
func (trans NFATransition) Set(from StateID, intv Interval, to StateID) NFATransition {
	row, found := trans.mp[from]
	if !found {
		row = make(map[Interval]*collection.Set[StateID])
		trans.mp[from] = row
	}
	if _, found = row[intv]; !found {
		row[intv] = collection.NewSet[StateID]()
	}
	row[intv].Insert(to)

	return trans
}

func (trans NFATransition) merge(other NFATransition) {
Expand Down Expand Up @@ -122,7 +140,7 @@ func (nfa *NFA) Concat(other *NFA) *NFA {
iiter := other.initStates.Iterator()
for iiter.HasNext() {
to := iiter.Next()
nfa.epsilonTrans.set(from, to)
nfa.epsilonTrans.Set(from, to)
}
}
nfa.finStates = other.finStates
Expand All @@ -137,12 +155,12 @@ func (nfa *NFA) Star() *NFA {
fiter := nfa.finStates.Iterator()
for fiter.HasNext() {
from := fiter.Next()
nfa.epsilonTrans.set(from, sid)
nfa.epsilonTrans.Set(from, sid)
}
iiter := nfa.initStates.Iterator()
for iiter.HasNext() {
to := iiter.Next()
nfa.epsilonTrans.set(sid, to)
nfa.epsilonTrans.Set(sid, to)
}

states := collection.NewSet[StateID]().Insert(sid)
Expand Down
Loading

0 comments on commit 43230c0

Please sign in to comment.