Skip to content

Commit

Permalink
Merge pull request #3 from jf-tech/scanner
Browse files Browse the repository at this point in the history
Adding NewScannerByDelim/NewScannerByDelim2 and IndexWithEsc/SplitWit…
  • Loading branch information
jf-tech authored May 27, 2020
2 parents 9c8f4b3 + 8a58f9d commit da541c5
Show file tree
Hide file tree
Showing 4 changed files with 416 additions and 0 deletions.
62 changes: 62 additions & 0 deletions scanners.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package iohelper

import (
"bufio"
"io"
)

// ScannerByDelimFlag is the type of flags passed to NewScannerByDelim/NewScannerByDelim2.
// Flags are individual bits and can be combined with bitwise OR.
type ScannerByDelimFlag uint

const (
// ScannerByDelimFlagEofAsDelim specifies that the scanner should treat EOF as the delimiter as well.
ScannerByDelimFlagEofAsDelim ScannerByDelimFlag = 1 << iota
// ScannerByDelimFlagDropDelimInReturn specifies that the delimiter should NOT be included in the return value.
ScannerByDelimFlagDropDelimInReturn
// scannerByDelimFlagEnd is the first bit past the last defined flag; it is only used to derive
// scannerByDelimValidFlags.
scannerByDelimFlagEnd

// ScannerByDelimFlagEofNotAsDelim specifies that the scanner should NOT treat EOF as the delimiter.
// Its value is 0, i.e. the absence of ScannerByDelimFlagEofAsDelim.
ScannerByDelimFlagEofNotAsDelim = 0
// ScannerByDelimFlagIncludeDelimInReturn specifies that the delimiter should be included in the return value.
// Its value is 0, i.e. the absence of ScannerByDelimFlagDropDelimInReturn.
ScannerByDelimFlagIncludeDelimInReturn = 0
)
const (
// ScannerByDelimFlagDefault specifies the most commonly used flags for the scanner:
// EOF terminates the last token and delimiters are dropped from returned tokens.
ScannerByDelimFlagDefault = ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn
// scannerByDelimValidFlags is the bit mask covering every defined flag bit; NewScannerByDelim2
// uses it to clear any undefined bits from caller-supplied flags.
scannerByDelimValidFlags = scannerByDelimFlagEnd - 1
)

// NewScannerByDelim creates a scanner that returns tokens from the source reader separated by a delimiter.
// It is a convenience wrapper around NewScannerByDelim2 that passes a nil escape rune, so no escape
// handling is applied when looking for the delimiter.
func NewScannerByDelim(r io.Reader, delim string, flags ScannerByDelimFlag) *bufio.Scanner {
return NewScannerByDelim2(r, delim, nil, flags)
}

// NewScannerByDelim2 creates a scanner that returns tokens from the source reader separated by a delimiter, with
// consideration of potential presence of escaping sequence.
//
// r is the source reader; delim is the delimiter string; escape, when non-nil, is the rune that escapes the
// rune following it so that an escaped delimiter is not treated as a token boundary; flags is a bitwise OR of
// ScannerByDelimFlag values (undefined bits are ignored).
//
// An empty delim never matches: the remaining input is returned as a single token at EOF when
// ScannerByDelimFlagEofAsDelim is set, and no token is returned otherwise.
//
// Note: the token returned from the scanner will **NOT** do any unescaping, thus keeping the original value.
func NewScannerByDelim2(r io.Reader, delim string, escape *rune, flags ScannerByDelimFlag) *bufio.Scanner {
	// Drop any bits that do not correspond to a defined flag.
	flags &= scannerByDelimValidFlags

	dropDelim := flags&ScannerByDelimFlagDropDelimInReturn != 0
	eofAsDelim := flags&ScannerByDelimFlagEofAsDelim != 0

	split := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if atEOF && len(data) == 0 {
			// Nothing left to tokenize.
			return 0, nil, nil
		}
		// IndexWithEsc reports index 0 for an empty delimiter. Acting on that would hand the scanner an
		// empty token with zero advance on every call, so Scan would never make progress. Only search
		// when there is an actual delimiter to look for.
		if len(delim) > 0 {
			if index := IndexWithEsc(string(data), delim, escape); index >= 0 {
				end := index + len(delim)
				if dropDelim {
					return end, data[:index], nil
				}
				return end, data[:end], nil
			}
		}
		if atEOF && eofAsDelim {
			// No further delimiter: the remaining bytes form the final token.
			return len(data), data, nil
		}
		// Request more data (or, at EOF without eofAsDelim, discard the unterminated tail).
		return 0, nil, nil
	}

	scanner := bufio.NewScanner(r)
	scanner.Split(split)
	return scanner
}
95 changes: 95 additions & 0 deletions scanners_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package iohelper

import (
"io"
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

// TestNewScannerByDelim is a table-driven test of NewScannerByDelim (no escape rune): for each case it
// scans the whole input and compares the collected tokens against expectedTokens.
func TestNewScannerByDelim(t *testing.T) {
for _, test := range []struct {
name string
input io.Reader
delim string
flags ScannerByDelimFlag
expectedTokens []string
}{
{
// A lone '#' is not a delimiter; "####" is two adjacent delimiters yielding an empty token.
// The input ends right after a delimiter, so no extra trailing token is produced.
name: "multi-char delim | eof as delim | drop delim",
input: strings.NewReader("abc#123##efg####???##xyz##"),
delim: "##",
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn,
expectedTokens: []string{"abc#123", "efg", "", "???", "xyz"},
},
{
// The delimiter is kept in the first token; the lone '\r' characters are not delimiters,
// and the final token is terminated by EOF.
name: "CR LF delim | eof as delim | include delim",
input: strings.NewReader("\r\n\rabc\r"),
delim: "\r\n",
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn,
expectedTokens: []string{"\r\n", "\rabc\r"},
},
{
// No input means no tokens at all, even with EOF treated as a delimiter.
name: "empty reader",
input: strings.NewReader(""),
delim: "*",
flags: ScannerByDelimFlagDefault,
expectedTokens: []string{},
},
{
// The input is just the delimiter, so a single empty token precedes it.
name: "empty token",
input: strings.NewReader("*"),
delim: "*",
flags: ScannerByDelimFlagEofNotAsDelim | ScannerByDelimFlagDropDelimInReturn,
expectedTokens: []string{""},
},
{
// The text after the last delimiter is returned as a final token because EOF acts as a delimiter.
name: "trailing newlines",
input: strings.NewReader("*\n"),
delim: "*",
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn,
expectedTokens: []string{"*", "\n"},
},
} {
t.Run(test.name, func(t *testing.T) {
s := NewScannerByDelim(test.input, test.delim, test.flags)
// Start from a non-nil empty slice so that the "no tokens" case compares equal to []string{}.
tokens := []string{}
for s.Scan() {
tokens = append(tokens, s.Text())
}
assert.NoError(t, s.Err())
assert.Equal(t, test.expectedTokens, tokens)
})
}
}

// TestNewScannerByDelim2 is a table-driven test of NewScannerByDelim2, i.e. delimiter scanning with an
// escape rune: for each case it scans the whole input and compares the collected tokens against
// expectedTokens.
func TestNewScannerByDelim2(t *testing.T) {
for _, test := range []struct {
name string
input io.Reader
delim string
esc rune
flags ScannerByDelimFlag
expectedTokens []string
}{
{
// With '?' as the escape rune, the third '?' in "???" escapes the '#' that follows it, so the
// "##" right after "???" is not a token boundary. Tokens are returned without unescaping.
name: "multi-char delim | with delim esc | eof as delim | drop delim",
input: strings.NewReader("abc#123##efg####???##xyz##"),
delim: "##",
esc: rune('?'),
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn,
expectedTokens: []string{"abc#123", "efg", "", "???##xyz"},
},
} {
t.Run(test.name, func(t *testing.T) {
s := NewScannerByDelim2(test.input, test.delim, RunePtr(test.esc), test.flags)
// Start from a non-nil empty slice so that a "no tokens" case would compare equal to []string{}.
tokens := []string{}
for s.Scan() {
tokens = append(tokens, s.Text())
}
assert.NoError(t, s.Err())
assert.Equal(t, test.expectedTokens, tokens)
})
}
}
86 changes: 86 additions & 0 deletions strings.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package iohelper

import (
"strings"
)

// RunePtr returns a pointer to a newly allocated rune holding the value r.
// Each call yields a distinct pointer, so callers may modify the pointee freely.
func RunePtr(r rune) *rune {
	p := new(rune)
	*p = r
	return p
}

// IndexWithEsc is similar to strings.Index but taking escape sequence into consideration.
// For example, IndexWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return 8, not 4.
//
// It returns the byte index of the first occurrence of delim in s that is not escaped, or -1 if there
// is none. A rune is escaped when it immediately follows an unescaped esc rune. If delim is empty, 0 is
// returned; if esc is nil, the result is the same as strings.Index(s, delim).
//
// The returned index is always a true byte offset into s, including when s contains bytes that are not
// valid UTF-8.
func IndexWithEsc(s, delim string, esc *rune) int {
	if len(delim) == 0 {
		return 0
	}
	if len(s) == 0 {
		return -1
	}
	if esc == nil {
		return strings.Index(s, delim)
	}

	escRune := *esc
	// Ranging over the string yields each rune together with its real byte offset (an invalid byte is
	// reported as U+FFFD while advancing a single byte), so offsets never drift from the bytes of s.
	escaped := false
	for i, r := range s {
		if escaped {
			// This rune is the one being escaped: it can neither start a delimiter nor act as an escape.
			escaped = false
			continue
		}
		if r == escRune {
			escaped = true
			continue
		}
		if strings.HasPrefix(s[i:], delim) {
			return i
		}
	}
	return -1
}

// SplitWithEsc is similar to strings.Split but taking escape sequence into consideration.
// For example, SplitWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return []string{"abc%|efg", "xyz"}.
//
// When delim is empty or esc is nil, it simply defers to strings.Split. Otherwise s is cut at every
// delimiter occurrence reported by IndexWithEsc; the pieces are returned as-is, without unescaping.
func SplitWithEsc(s, delim string, esc *rune) []string {
	if esc == nil || len(delim) == 0 {
		return strings.Split(s, delim)
	}

	// Here delim is non-empty and an escape rune is set.
	var parts []string
	rest := s
	for {
		at := IndexWithEsc(rest, delim, esc)
		if at < 0 {
			break
		}
		parts = append(parts, rest[:at])
		rest = rest[at+len(delim):]
	}
	// Whatever follows the last delimiter (possibly the empty string) is the final piece.
	return append(parts, rest)
}

// Unescape unescapes a string with escape sequence.
// For example, Unescape("abc%|efg", RunePtr('%')) would return "abc|efg".
//
// Every unescaped occurrence of the esc rune is removed and the rune following it is kept literally
// (so a doubled escape rune yields a single one). An escape rune at the very end of s is dropped.
// If esc is nil, s is returned unchanged. All other bytes of s, including bytes that are not valid
// UTF-8, are copied through untouched.
func Unescape(s string, esc *rune) string {
	if esc == nil {
		return s
	}
	escRune := *esc

	var b strings.Builder
	b.Grow(len(s))
	start := 0       // byte offset where the not-yet-written literal segment begins
	escaped := false // true when the previous rune was an unescaped escape rune
	for i, r := range s {
		if escaped {
			// The escaped rune is kept literally; the next segment starts at it, which also
			// skips over the escape rune that preceded it.
			start = i
			escaped = false
			continue
		}
		if r == escRune {
			b.WriteString(s[start:i])
			escaped = true
		}
	}
	if escaped {
		// Trailing escape rune with nothing to escape: drop it.
		start = len(s)
	}
	b.WriteString(s[start:])
	return b.String()
}
Loading

0 comments on commit da541c5

Please sign in to comment.