From 8a58f9ddb19be2356d2afd7cb4cd7e3c23e1ac6b Mon Sep 17 00:00:00 2001 From: jf-tech Date: Wed, 27 May 2020 11:17:25 -0500 Subject: [PATCH] Adding NewScannerByDelim/NewScannerByDelim2 and IndexWithEsc/SplitWithEsc/Unescape --- scanners.go | 62 +++++++++++++++++ scanners_test.go | 95 ++++++++++++++++++++++++++ strings.go | 86 +++++++++++++++++++++++ strings_test.go | 173 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 416 insertions(+) create mode 100644 scanners.go create mode 100644 scanners_test.go create mode 100644 strings.go create mode 100644 strings_test.go diff --git a/scanners.go b/scanners.go new file mode 100644 index 0000000..c2c03f4 --- /dev/null +++ b/scanners.go @@ -0,0 +1,62 @@ +package iohelper + +import ( + "bufio" + "io" +) + +// ScannerByDelimFlag is the type of flags passed to NewScannerByDelim/NewScannerByDelim2. +type ScannerByDelimFlag uint + +const ( + // ScannerByDelimFlagEofAsDelim specifies that the scanner should treat EOF as the delimiter as well. + ScannerByDelimFlagEofAsDelim ScannerByDelimFlag = 1 << iota + // ScannerByDelimFlagDropDelimInReturn specifies that the delimiter should NOT be included in the return value. + ScannerByDelimFlagDropDelimInReturn + scannerByDelimFlagEnd + + // ScannerByDelimFlagEofNotAsDelim specifies that the scanner should NOT treat EOF as the delimiter. + ScannerByDelimFlagEofNotAsDelim = 0 + // ScannerByDelimFlagIncludeDelimInReturn specifies that the delimiter should be included in the return value. + ScannerByDelimFlagIncludeDelimInReturn = 0 +) +const ( + // ScannerByDelimFlagDefault specifies the most commonly used flags for the scanner. + ScannerByDelimFlagDefault = ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn + scannerByDelimValidFlags = scannerByDelimFlagEnd - 1 +) + +// NewScannerByDelim creates a scanner that returns tokens from the source reader separated by a delimiter. 
+func NewScannerByDelim(r io.Reader, delim string, flags ScannerByDelimFlag) *bufio.Scanner { + return NewScannerByDelim2(r, delim, nil, flags) +} + +// NewScannerByDelim2 creates a scanner that returns tokens from the source reader separated by a delimiter, with +// consideration of an optional escape rune (nil means no escaping); delimiter lookup is delegated to IndexWithEsc. +// Note: the tokens returned from the scanner are **NOT** unescaped, thus keeping their original value. +func NewScannerByDelim2(r io.Reader, delim string, escape *rune, flags ScannerByDelimFlag) *bufio.Scanner { + flags &= scannerByDelimValidFlags + + includeDelimLenInToken := len(delim) + if flags&ScannerByDelimFlagDropDelimInReturn != 0 { + includeDelimLenInToken = 0 + } + + eofAsDelim := flags&ScannerByDelimFlagEofAsDelim != 0 + + scanner := bufio.NewScanner(r) + scanner.Split( + func(data []byte, atEof bool) (advance int, token []byte, err error) { + if atEof && len(data) == 0 { + return 0, nil, nil + } + if index := IndexWithEsc(string(data), delim, escape); index >= 0 { + return index + len(delim), data[:index+includeDelimLenInToken], nil + } + if atEof && eofAsDelim { + return len(data), data, nil + } + return 0, nil, nil + }) + return scanner +} diff --git a/scanners_test.go b/scanners_test.go new file mode 100644 index 0000000..de263b4 --- /dev/null +++ b/scanners_test.go @@ -0,0 +1,95 @@ +package iohelper + +import ( + "io" + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestNewScannerByDelim(t *testing.T) { + for _, test := range []struct { + name string + input io.Reader + delim string + flags ScannerByDelimFlag + expectedTokens []string + }{ + { + name: "multi-char delim | eof as delim | drop delim", + input: strings.NewReader("abc#123##efg####???##xyz##"), + delim: "##", + flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn, + expectedTokens: []string{"abc#123", "efg", "", "???", "xyz"}, + }, + { + name: "CR LF delim | eof as delim | include delim", + input: 
strings.NewReader("\r\n\rabc\r"), + delim: "\r\n", + flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn, + expectedTokens: []string{"\r\n", "\rabc\r"}, + }, + { + name: "empty reader", + input: strings.NewReader(""), + delim: "*", + flags: ScannerByDelimFlagDefault, + expectedTokens: []string{}, + }, + { + name: "empty token", + input: strings.NewReader("*"), + delim: "*", + flags: ScannerByDelimFlagEofNotAsDelim | ScannerByDelimFlagDropDelimInReturn, + expectedTokens: []string{""}, + }, + { + name: "trailing newlines", + input: strings.NewReader("*\n"), + delim: "*", + flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn, + expectedTokens: []string{"*", "\n"}, + }, + } { + t.Run(test.name, func(t *testing.T) { + s := NewScannerByDelim(test.input, test.delim, test.flags) + tokens := []string{} + for s.Scan() { + tokens = append(tokens, s.Text()) + } + assert.NoError(t, s.Err()) + assert.Equal(t, test.expectedTokens, tokens) + }) + } +} + +func TestNewScannerByDelim2(t *testing.T) { + for _, test := range []struct { + name string + input io.Reader + delim string + esc rune + flags ScannerByDelimFlag + expectedTokens []string + }{ + { + name: "multi-char delim | with delim esc | eof as delim | drop delim", + input: strings.NewReader("abc#123##efg####???##xyz##"), + delim: "##", + esc: rune('?'), + flags: ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn, + expectedTokens: []string{"abc#123", "efg", "", "???##xyz"}, + }, + } { + t.Run(test.name, func(t *testing.T) { + s := NewScannerByDelim2(test.input, test.delim, RunePtr(test.esc), test.flags) + tokens := []string{} + for s.Scan() { + tokens = append(tokens, s.Text()) + } + assert.NoError(t, s.Err()) + assert.Equal(t, test.expectedTokens, tokens) + }) + } +} diff --git a/strings.go b/strings.go new file mode 100644 index 0000000..f80bbdb --- /dev/null +++ b/strings.go @@ -0,0 +1,86 @@ +package iohelper + +import ( + "strings" +) + +// RunePtr returns 
// a pointer to a rune.
func RunePtr(r rune) *rune {
	return &r
}

// IndexWithEsc is similar to strings.Index but takes an escape sequence into consideration: the rune
// that immediately follows an escape rune is skipped and can never start a match.
// For example, IndexWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return 8, not 4.
//
// It returns the byte offset of the first unescaped occurrence of delim in s, or -1 if there is none.
// An empty delim yields 0, and a nil esc makes the call identical to strings.Index(s, delim). Because
// the escape check comes first, a delim that itself starts with the escape rune is never found.
func IndexWithEsc(s, delim string, esc *rune) int {
	if len(delim) == 0 {
		return 0
	}
	if len(s) == 0 {
		return -1
	}
	if esc == nil {
		return strings.Index(s, delim)
	}

	escRune := *esc
	// Ranging over the string yields byte offsets directly, so the result is always a valid index into
	// s, even when s contains invalid UTF-8. (Round-tripping through []rune would turn every bad byte
	// into a 3-byte U+FFFD and inflate the offset.) It also avoids allocating a rune copy of s.
	escaped := false
	for i, r := range s {
		switch {
		case escaped:
			// This rune is protected by the preceding escape rune; it cannot start a delimiter.
			escaped = false
		case r == escRune:
			escaped = true
		case strings.HasPrefix(s[i:], delim):
			return i
		}
	}
	return -1
}

// SplitWithEsc is similar to strings.Split but takes an escape sequence into consideration.
// For example, SplitWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return []string{"abc%|efg", "xyz"}.
//
// When delim is empty or esc is nil the call is forwarded to strings.Split unchanged. The returned
// pieces are not unescaped; escape runes stay in place.
func SplitWithEsc(s, delim string, esc *rune) []string {
	if len(delim) == 0 || esc == nil {
		return strings.Split(s, delim)
	}
	// From here on, delim != empty **and** esc is set.
	var split []string
	for {
		delimIndex := IndexWithEsc(s, delim, esc)
		if delimIndex < 0 {
			break
		}
		split = append(split, s[:delimIndex])
		s = s[delimIndex+len(delim):]
	}
	// Whatever is left after the last delimiter (possibly "") is the final piece.
	return append(split, s)
}

// Unescape unescapes a string with escape sequence.
+// For example, Unescape("abc%|efg", RunePtr('%')) would return "abc|efg". +// Each escape rune is removed and the rune right after it is kept as-is, so a doubled escape yields one literal +// escape rune; an escape rune at the very end of s is simply dropped. A nil esc returns s unchanged. +func Unescape(s string, esc *rune) string { + if esc == nil { + return s + } + sRunes := []rune(s) + escRune := *esc + for i := 0; i < len(sRunes); i++ { + if sRunes[i] != escRune { + continue + } + // Remove the escape rune at i; the loop's i++ then steps over the rune that slid into its place. + copy(sRunes[i:], sRunes[i+1:]) + sRunes = sRunes[:len(sRunes)-1] + } + return string(sRunes) +} diff --git a/strings_test.go b/strings_test.go new file mode 100644 index 0000000..65e7bed --- /dev/null +++ b/strings_test.go @@ -0,0 +1,173 @@ +package iohelper + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIndexWithEsc(t *testing.T) { + for _, test := range []struct { + name string + input string + delim string + esc *rune + expected int + }{ + // All edge cases: + { + name: "delim empty", + input: "abc", + delim: "", + esc: RunePtr(rune('宇')), + expected: 0, + }, + { + name: "esc empty", + input: "abc", + delim: "bc", + esc: nil, + expected: 1, + }, + { + name: "input empty, delim non empty, esc non empty", + input: "", + delim: "abc", + esc: RunePtr(rune('宙')), + expected: -1, + }, + // normal non empty cases: + { + name: "len(input) < len(delim)", + input: "a", + delim: "abc", + esc: RunePtr(rune('洪')), + expected: -1, + }, + { + name: "len(input) == len(delim), esc not present", + input: "abc", + delim: "abc", + esc: RunePtr(rune('荒')), + expected: 0, + }, + { + name: "len(input) > len(delim), esc not present", + input: "мир во всем мире", + delim: "мире", + esc: RunePtr(rune('Ф')), + expected: len("мир во всем "), + }, + { + name: "len(input) > len(delim), esc present", + input: "мир во всем /мире", + delim: "мире", + esc: RunePtr(rune('/')), + expected: -1, + }, + { + name: "len(input) > len(delim), esc present", + input: "мир во всем ξξмире", + delim: "мире", + esc: RunePtr(rune('ξ')), + expected: len("мир во всем ξξ"), + }, + { + name: "len(input) > len(delim), consecutive esc present", + input: "мир во вξξξξξсем ξξмире", + delim: "ире", + 
esc: RunePtr(rune('ξ')), + expected: len("мир во вξξξξξсем ξξм"), + }, + } { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.expected, IndexWithEsc(test.input, test.delim, test.esc)) + if test.expected >= 0 { + assert.True(t, strings.HasPrefix(string([]byte(test.input)[test.expected:]), test.delim)) + } + }) + } +} + +func TestSplitWithEsc(t *testing.T) { + for _, test := range []struct { + name string + input string + delim string + esc *rune + expected []string + }{ + { + name: "delim empty", + input: "abc", + delim: "", + esc: RunePtr(rune('宇')), + expected: []string{"a", "b", "c"}, + }, + { + name: "esc not set", + input: "", + delim: "abc", + esc: nil, + expected: []string{""}, + }, + { + name: "esc set, delim not found", + input: "?xyz", + delim: "xyz", + esc: RunePtr(rune('?')), + expected: []string{"?xyz"}, + }, + { + name: "esc set, delim found", + input: "a*bc/*d*efg", + delim: "*", + esc: RunePtr(rune('/')), + expected: []string{"a", "bc/*d", "efg"}, + }, + { + name: "esc set, delim not empty, input empty", + input: "", + delim: "*", + esc: RunePtr(rune('/')), + expected: []string{""}, + }, + } { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.expected, SplitWithEsc(test.input, test.delim, test.esc)) + }) + } +} + +func TestUnescape(t *testing.T) { + for _, test := range []struct { + name string + input string + esc *rune + expected string + }{ + { + name: "esc not set", + input: "abc", + esc: nil, + expected: "abc", + }, + { + name: "esc set, input empty", + input: "", + esc: RunePtr(rune('宇')), + expected: "", + }, + { + name: "esc set, input non empty", + input: "ξξabcξdξ", + esc: RunePtr(rune('ξ')), + expected: "ξabcd", + }, + } { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.expected, Unescape(test.input, test.esc)) + }) + } +}