Merge pull request #3 from jf-tech/scanner

Adding NewScannerByDelim/NewScannerByDelim2 and IndexWithEsc/SplitWit…
jf-tech · May 27, 2020 · da541c5 · da541c5
2 parents 9c8f4b3 + 8a58f9d
commit da541c5
Show file tree

Hide file tree

Showing 4 changed files with 416 additions and 0 deletions.
diff --git a/scanners.go b/scanners.go
@@ -0,0 +1,62 @@
+package iohelper
+
+import (
+	"bufio"
+	"io"
+)
+
+// ScannerByDelimFlag is the type of flags passed to NewScannerByDelim/NewScannerByDelim2.
+// Individual flags are single bits and can be combined with bitwise OR.
+type ScannerByDelimFlag uint
+
+const (
+	// ScannerByDelimFlagEofAsDelim specifies that the scanner should treat EOF as the delimiter as well.
+	ScannerByDelimFlagEofAsDelim ScannerByDelimFlag = 1 << iota
+	// ScannerByDelimFlagDropDelimInReturn specifies that the delimiter should NOT be included in the return value.
+	ScannerByDelimFlagDropDelimInReturn
+	// scannerByDelimFlagEnd is the first unused flag bit; it is only used to derive scannerByDelimValidFlags.
+	scannerByDelimFlagEnd
+
+	// ScannerByDelimFlagEofNotAsDelim specifies that the scanner should NOT treat EOF as the delimiter.
+	// It is zero, i.e. the absence of ScannerByDelimFlagEofAsDelim.
+	ScannerByDelimFlagEofNotAsDelim = 0
+	// ScannerByDelimFlagIncludeDelimInReturn specifies that the delimiter should be included in the return value.
+	// It is zero, i.e. the absence of ScannerByDelimFlagDropDelimInReturn.
+	ScannerByDelimFlagIncludeDelimInReturn = 0
+)
+const (
+	// ScannerByDelimFlagDefault specifies the most commonly used flags for the scanner:
+	// EOF is treated as a delimiter and the delimiter is dropped from the returned tokens.
+	ScannerByDelimFlagDefault = ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn
+	scannerByDelimValidFlags  = scannerByDelimFlagEnd - 1 // mask covering every defined flag bit
+)
+
+// NewScannerByDelim creates a scanner that returns tokens from the source reader separated by a delimiter.
+// It is a convenience wrapper that calls NewScannerByDelim2 with a nil escape rune.
+func NewScannerByDelim(r io.Reader, delim string, flags ScannerByDelimFlag) *bufio.Scanner {
+	return NewScannerByDelim2(r, delim, nil, flags)
+}
+
+// NewScannerByDelim2 creates a scanner that returns tokens from the source reader separated by a delimiter, with
+// consideration of potential presence of escaping sequence.
+// Note: the token returned from the scanner will **NOT** do any unescaping, thus keeping the original value.
+//
+// Bits in flags that are not defined ScannerByDelimFlag values are ignored. If ScannerByDelimFlagDropDelimInReturn
+// is set, each token excludes the delimiter that terminated it; otherwise the delimiter is kept at the end of the
+// token. If ScannerByDelimFlagEofAsDelim is set, data remaining after the last delimiter is returned as a final
+// token at EOF; otherwise that remaining data is not returned.
+//
+// An empty delim never matches: the input then forms at most one token, returned at EOF when
+// ScannerByDelimFlagEofAsDelim is set.
+func NewScannerByDelim2(r io.Reader, delim string, escape *rune, flags ScannerByDelimFlag) *bufio.Scanner {
+	flags &= scannerByDelimValidFlags
+
+	// Number of delimiter bytes kept at the tail of each token: all of them, or none when dropping.
+	includeDelimLenInToken := len(delim)
+	if flags&ScannerByDelimFlagDropDelimInReturn != 0 {
+		includeDelimLenInToken = 0
+	}
+
+	eofAsDelim := flags&ScannerByDelimFlagEofAsDelim != 0
+
+	scanner := bufio.NewScanner(r)
+	scanner.Split(
+		func(data []byte, atEof bool) (advance int, token []byte, err error) {
+			if atEof && len(data) == 0 {
+				return 0, nil, nil
+			}
+			// IndexWithEsc reports index 0 for an empty delim, which would hand back an empty token without
+			// advancing and leave the scanner unable to make progress, so only search for a non-empty delim.
+			if len(delim) > 0 {
+				if index := IndexWithEsc(string(data), delim, escape); index >= 0 {
+					return index + len(delim), data[:index+includeDelimLenInToken], nil
+				}
+			}
+			if atEof && eofAsDelim {
+				return len(data), data, nil
+			}
+			// No delimiter in the buffered data: ask the scanner for more input.
+			return 0, nil, nil
+		})
+	return scanner
+}
diff --git a/scanners_test.go b/scanners_test.go
@@ -0,0 +1,95 @@
+package iohelper
+
+import (
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestNewScannerByDelim checks the tokens produced by NewScannerByDelim for several delimiter/flag combinations.
+func TestNewScannerByDelim(t *testing.T) {
+	type testCase struct {
+		name           string
+		input          io.Reader
+		delim          string
+		flags          ScannerByDelimFlag
+		expectedTokens []string
+	}
+	cases := []testCase{
+		{
+			name:           "multi-char delim | eof as delim | drop delim",
+			input:          strings.NewReader("abc#123##efg####???##xyz##"),
+			delim:          "##",
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn,
+			expectedTokens: []string{"abc#123", "efg", "", "???", "xyz"},
+		},
+		{
+			name:           "CR LF delim | eof as delim | include delim",
+			input:          strings.NewReader("\r\n\rabc\r"),
+			delim:          "\r\n",
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn,
+			expectedTokens: []string{"\r\n", "\rabc\r"},
+		},
+		{
+			name:           "empty reader",
+			input:          strings.NewReader(""),
+			delim:          "*",
+			flags:          ScannerByDelimFlagDefault,
+			expectedTokens: []string{},
+		},
+		{
+			name:           "empty token",
+			input:          strings.NewReader("*"),
+			delim:          "*",
+			flags:          ScannerByDelimFlagEofNotAsDelim | ScannerByDelimFlagDropDelimInReturn,
+			expectedTokens: []string{""},
+		},
+		{
+			name:           "trailing newlines",
+			input:          strings.NewReader("*\n"),
+			delim:          "*",
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn,
+			expectedTokens: []string{"*", "\n"},
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			scanner := NewScannerByDelim(tc.input, tc.delim, tc.flags)
+			// Start from a non-nil empty slice so that a case expecting no tokens compares equal.
+			got := make([]string, 0)
+			for scanner.Scan() {
+				got = append(got, scanner.Text())
+			}
+			assert.NoError(t, scanner.Err())
+			assert.Equal(t, tc.expectedTokens, got)
+		})
+	}
+}
+
+// TestNewScannerByDelim2 checks that NewScannerByDelim2 does not split on a delimiter preceded by the escape rune.
+func TestNewScannerByDelim2(t *testing.T) {
+	type testCase struct {
+		name           string
+		input          io.Reader
+		delim          string
+		esc            rune
+		flags          ScannerByDelimFlag
+		expectedTokens []string
+	}
+	cases := []testCase{
+		{
+			name:           "multi-char delim | with delim esc | eof as delim | drop delim",
+			input:          strings.NewReader("abc#123##efg####???##xyz##"),
+			delim:          "##",
+			esc:            '?',
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn,
+			expectedTokens: []string{"abc#123", "efg", "", "???##xyz"},
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			scanner := NewScannerByDelim2(tc.input, tc.delim, RunePtr(tc.esc), tc.flags)
+			// Start from a non-nil empty slice, mirroring how expectedTokens is declared.
+			got := make([]string, 0)
+			for scanner.Scan() {
+				got = append(got, scanner.Text())
+			}
+			assert.NoError(t, scanner.Err())
+			assert.Equal(t, tc.expectedTokens, got)
+		})
+	}
+}
diff --git a/strings.go b/strings.go
@@ -0,0 +1,86 @@
+package iohelper
+
+import (
+	"strings"
+)
+
+// RunePtr returns a pointer to a freshly allocated copy of the given rune.
+func RunePtr(r rune) *rune {
+	p := new(rune)
+	*p = r
+	return p
+}
+
+// IndexWithEsc is similar to strings.Index but taking escape sequence into consideration: the rune immediately
+// following an escape rune is never treated as the start of delim.
+// For example, IndexWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return 8, not 4.
+//
+// It returns the byte index of the first unescaped occurrence of delim in s, or -1 if there is none.
+// An empty delim yields 0; a nil esc makes the call equivalent to strings.Index(s, delim).
+func IndexWithEsc(s, delim string, esc *rune) int {
+	if len(delim) == 0 {
+		return 0
+	}
+	if len(s) == 0 {
+		return -1
+	}
+	if esc == nil {
+		return strings.Index(s, delim)
+	}
+
+	escRune := *esc
+
+	// Walk s rune by rune while using the byte offsets that range provides. Deriving the offset by converting
+	// to []rune and back to string is wrong when s contains invalid UTF-8: every invalid byte would be counted
+	// as a 3-byte U+FFFD, yielding an index that can even lie beyond the end of s.
+	escaped := false
+	for i, r := range s {
+		switch {
+		case escaped:
+			// r is the rune right after an escape rune: it neither starts a delim nor acts as an escape.
+			escaped = false
+		case r == escRune:
+			escaped = true
+		case strings.HasPrefix(s[i:], delim):
+			return i
+		}
+	}
+
+	return -1
+}
+
+// SplitWithEsc is similar to strings.Split but taking escape sequence into consideration.
+// For example, SplitWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return []string{"abc%|efg", "xyz"}.
+// With an empty delim or a nil esc it simply defers to strings.Split. The returned pieces are not unescaped.
+func SplitWithEsc(s, delim string, esc *rune) []string {
+	if esc == nil || len(delim) == 0 {
+		return strings.Split(s, delim)
+	}
+	// Past this point delim is non-empty and an escape rune is in effect.
+	var parts []string
+	for {
+		at := IndexWithEsc(s, delim, esc)
+		if at < 0 {
+			// No further unescaped delimiter: whatever is left is the last piece.
+			return append(parts, s)
+		}
+		parts = append(parts, s[:at])
+		s = s[at+len(delim):]
+	}
+}
+
+// Unescape unescapes a string with escape sequence: every escape rune is dropped and the rune right after it is
+// kept verbatim, even if that rune is itself the escape rune. A lone escape rune at the end of s is dropped.
+// For example, Unescape("abc%|efg", RunePtr('%')) would return "abc|efg".
+// If esc is nil, s is returned unchanged.
+func Unescape(s string, esc *rune) string {
+	if esc == nil {
+		return s
+	}
+	escRune := *esc
+
+	// Single pass that copies the runs between escape runes straight from s, so no per-escape shifting is
+	// needed and bytes that are not escape runes (including invalid UTF-8) are carried over untouched.
+	var sb strings.Builder
+	sb.Grow(len(s))
+	start := 0       // byte offset where the pending verbatim run begins
+	escaped := false // whether the previous rune was an unconsumed escape rune
+	for i, r := range s {
+		if escaped {
+			// r is escaped: it starts the next verbatim run and is never interpreted as an escape.
+			start = i
+			escaped = false
+			continue
+		}
+		if r == escRune {
+			// Flush everything before the escape rune; the escape rune itself is left out.
+			sb.WriteString(s[start:i])
+			escaped = true
+		}
+	}
+	if !escaped {
+		sb.WriteString(s[start:])
+	}
+	return sb.String()
+}