-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from jf-tech/scanner
Adding NewScannerByDelim/NewScannerByDelim2 and IndexWithEsc/SplitWit…
- Loading branch information
Showing
4 changed files
with
416 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package iohelper | ||
|
||
import ( | ||
"bufio" | ||
"io" | ||
) | ||
|
||
// ScannerByDelimFlag is the type of flags passed to NewScannerByDelim/NewScannerByDelim2.
// Flags are individual bits and can be combined with the | operator.
type ScannerByDelimFlag uint

const (
	// ScannerByDelimFlagEofAsDelim specifies that the scanner should treat EOF as the delimiter as well.
	ScannerByDelimFlagEofAsDelim ScannerByDelimFlag = 1 << iota
	// ScannerByDelimFlagDropDelimInReturn specifies that the delimiter should NOT be included in the return value.
	ScannerByDelimFlagDropDelimInReturn
	// scannerByDelimFlagEnd is the first bit past the last defined flag; it is only used to derive
	// scannerByDelimValidFlags.
	scannerByDelimFlagEnd

	// ScannerByDelimFlagEofNotAsDelim specifies that the scanner should NOT treat EOF as the delimiter.
	// It is the zero value, i.e. the absence of ScannerByDelimFlagEofAsDelim.
	ScannerByDelimFlagEofNotAsDelim = 0
	// ScannerByDelimFlagIncludeDelimInReturn specifies that the delimiter should be included in the return value.
	// It is the zero value, i.e. the absence of ScannerByDelimFlagDropDelimInReturn.
	ScannerByDelimFlagIncludeDelimInReturn = 0
)

const (
	// ScannerByDelimFlagDefault specifies the most commonly used flags for the scanner.
	ScannerByDelimFlagDefault = ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn
	// scannerByDelimValidFlags is the bit mask covering every defined flag.
	scannerByDelimValidFlags = scannerByDelimFlagEnd - 1
)
|
||
// NewScannerByDelim creates a scanner that returns tokens from the source reader separated by a delimiter. | ||
func NewScannerByDelim(r io.Reader, delim string, flags ScannerByDelimFlag) *bufio.Scanner { | ||
return NewScannerByDelim2(r, delim, nil, flags) | ||
} | ||
|
||
// NewScannerByDelim2 creates a scanner that returns tokens from the source reader separated by a delimiter, with | ||
// consideration of potential presence of escaping sequence. | ||
// Note: the token returned from the scanner will **NOT** do any unescaping, thus keeping the original value. | ||
func NewScannerByDelim2(r io.Reader, delim string, escape *rune, flags ScannerByDelimFlag) *bufio.Scanner { | ||
flags &= scannerByDelimValidFlags | ||
|
||
includeDelimLenInToken := len(delim) | ||
if flags&ScannerByDelimFlagDropDelimInReturn != 0 { | ||
includeDelimLenInToken = 0 | ||
} | ||
|
||
eofAsDelim := flags&ScannerByDelimFlagEofAsDelim != 0 | ||
|
||
scanner := bufio.NewScanner(r) | ||
scanner.Split( | ||
func(data []byte, atEof bool) (advance int, token []byte, err error) { | ||
if atEof && len(data) == 0 { | ||
return 0, nil, nil | ||
} | ||
if index := IndexWithEsc(string(data), delim, escape); index >= 0 { | ||
return index + len(delim), data[:index+includeDelimLenInToken], nil | ||
} | ||
if atEof && eofAsDelim { | ||
return len(data), data, nil | ||
} | ||
return 0, nil, nil | ||
}) | ||
return scanner | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
package iohelper | ||
|
||
import ( | ||
"io" | ||
"strings" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestNewScannerByDelim(t *testing.T) { | ||
for _, test := range []struct { | ||
name string | ||
input io.Reader | ||
delim string | ||
flags ScannerByDelimFlag | ||
expectedTokens []string | ||
}{ | ||
{ | ||
name: "multi-char delim | eof as delim | drop delim", | ||
input: strings.NewReader("abc#123##efg####???##xyz##"), | ||
delim: "##", | ||
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn, | ||
expectedTokens: []string{"abc#123", "efg", "", "???", "xyz"}, | ||
}, | ||
{ | ||
name: "CR LF delim | eof as delim | include delim", | ||
input: strings.NewReader("\r\n\rabc\r"), | ||
delim: "\r\n", | ||
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn, | ||
expectedTokens: []string{"\r\n", "\rabc\r"}, | ||
}, | ||
{ | ||
name: "empty reader", | ||
input: strings.NewReader(""), | ||
delim: "*", | ||
flags: ScannerByDelimFlagDefault, | ||
expectedTokens: []string{}, | ||
}, | ||
{ | ||
name: "empty token", | ||
input: strings.NewReader("*"), | ||
delim: "*", | ||
flags: ScannerByDelimFlagEofNotAsDelim | ScannerByDelimFlagDropDelimInReturn, | ||
expectedTokens: []string{""}, | ||
}, | ||
{ | ||
name: "trailing newlines", | ||
input: strings.NewReader("*\n"), | ||
delim: "*", | ||
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn, | ||
expectedTokens: []string{"*", "\n"}, | ||
}, | ||
} { | ||
t.Run(test.name, func(t *testing.T) { | ||
s := NewScannerByDelim(test.input, test.delim, test.flags) | ||
tokens := []string{} | ||
for s.Scan() { | ||
tokens = append(tokens, s.Text()) | ||
} | ||
assert.NoError(t, s.Err()) | ||
assert.Equal(t, test.expectedTokens, tokens) | ||
}) | ||
} | ||
} | ||
|
||
func TestNewScannerByDelim2(t *testing.T) { | ||
for _, test := range []struct { | ||
name string | ||
input io.Reader | ||
delim string | ||
esc rune | ||
flags ScannerByDelimFlag | ||
expectedTokens []string | ||
}{ | ||
{ | ||
name: "multi-char delim | with delim esc | eof as delim | drop delim", | ||
input: strings.NewReader("abc#123##efg####???##xyz##"), | ||
delim: "##", | ||
esc: rune('?'), | ||
flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn, | ||
expectedTokens: []string{"abc#123", "efg", "", "???##xyz"}, | ||
}, | ||
} { | ||
t.Run(test.name, func(t *testing.T) { | ||
s := NewScannerByDelim2(test.input, test.delim, RunePtr(test.esc), test.flags) | ||
tokens := []string{} | ||
for s.Scan() { | ||
tokens = append(tokens, s.Text()) | ||
} | ||
assert.NoError(t, s.Err()) | ||
assert.Equal(t, test.expectedTokens, tokens) | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package iohelper | ||
|
||
import (
	"strings"
	"unicode/utf8"
)
|
||
// RunePtr copies r into freshly allocated storage and returns its address. It is handy for passing a
// rune literal where an optional *rune is expected.
func RunePtr(r rune) *rune {
	p := new(rune)
	*p = r
	return p
}
|
||
// IndexWithEsc is similar to strings.Index but taking escape sequence into consideration.
// For example, IndexWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return 8, not 4.
//
// It returns the byte index of the first occurrence of delim in s that does not start at an escaped rune,
// or -1 if there is none. An empty delim always yields 0. If esc is nil, the result is exactly
// strings.Index(s, delim).
//
// The rune right after an escape rune is always skipped, whatever it is; so an escaped escape rune does not
// escape the rune that follows it. An escape rune takes precedence over a delimiter starting at the same
// position.
func IndexWithEsc(s, delim string, esc *rune) int {
	if len(delim) == 0 {
		return 0
	}
	if len(s) == 0 {
		return -1
	}
	if esc == nil {
		return strings.Index(s, delim)
	}
	escRune := *esc

	// Walk s rune by rune while tracking byte offsets directly. This avoids converting s to []rune and
	// back, which both allocates and reports wrong offsets when s contains invalid UTF-8 (each invalid
	// byte would be counted as the 3-byte U+FFFD).
	for i := 0; i+len(delim) <= len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		// (RuneError, 1) denotes an invalid byte, not a real U+FFFD; never treat it as the escape rune.
		if r == escRune && (size > 1 || r != utf8.RuneError) {
			// Skip the escape rune and the escaped rune (aka the rune after the escape rune).
			i += size
			_, escapedSize := utf8.DecodeRuneInString(s[i:])
			i += escapedSize
			continue
		}
		if strings.HasPrefix(s[i:], delim) {
			return i
		}
		i += size
	}
	return -1
}
|
||
// SplitWithEsc is similar to strings.Split but taking escape sequence into consideration. | ||
// For example, SplitWithEsc("abc%|efg|xyz", "|", RunePtr("%")) would return []string{"abc%|efg", "xyz"}. | ||
func SplitWithEsc(s, delim string, esc *rune) []string { | ||
if len(delim) == 0 || esc == nil { | ||
return strings.Split(s, delim) | ||
} | ||
// From here on, delim != empty **and** esc is set. | ||
var split []string | ||
for delimIndex := IndexWithEsc(s, delim, esc); delimIndex >= 0; delimIndex = IndexWithEsc(s, delim, esc) { | ||
split = append(split, s[:delimIndex]) | ||
s = s[delimIndex+len(delim):] | ||
} | ||
split = append(split, s) | ||
return split | ||
} | ||
|
||
// Unescape unescapes a string with escape sequence.
// For example, Unescape("abc%|efg", RunePtr('%')) would return "abc|efg".
//
// Every escape rune is removed and the rune right after it is kept verbatim, even if it is itself the
// escape rune (so "%%" becomes "%"). An escape rune at the very end of s is simply dropped. If esc is nil,
// s is returned unchanged. All other bytes of s, including invalid UTF-8, are copied through untouched.
func Unescape(s string, esc *rune) string {
	if esc == nil {
		return s
	}
	escRune := *esc

	// Single pass into a builder instead of repeatedly shifting a []rune in place, which was quadratic
	// in the number of escape runes.
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		// (RuneError, 1) denotes an invalid byte, not a real U+FFFD; never treat it as the escape rune.
		if r == escRune && (size > 1 || r != utf8.RuneError) {
			// Drop the escape rune and emit the escaped rune as-is. For a trailing escape rune there is
			// nothing left to decode, size becomes 0 and nothing is emitted.
			i += size
			_, size = utf8.DecodeRuneInString(s[i:])
		}
		b.WriteString(s[i : i+size])
		i += size
	}
	return b.String()
}
Oops, something went wrong.