From 8a58f9ddb19be2356d2afd7cb4cd7e3c23e1ac6b Mon Sep 17 00:00:00 2001
From: jf-tech <jf.tech.llc@gmail.com>
Date: Wed, 27 May 2020 11:17:25 -0500
Subject: [PATCH] Adding NewScannerByDelim/NewScannerByDelim2 and
 IndexWithEsc/SplitWithEsc/Unescape

---
 scanners.go      |  62 +++++++++++++++++
 scanners_test.go |  95 ++++++++++++++++++++++++++
 strings.go       |  86 +++++++++++++++++++++++
 strings_test.go  | 173 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 416 insertions(+)
 create mode 100644 scanners.go
 create mode 100644 scanners_test.go
 create mode 100644 strings.go
 create mode 100644 strings_test.go

diff --git a/scanners.go b/scanners.go
new file mode 100644
index 0000000..c2c03f4
--- /dev/null
+++ b/scanners.go
@@ -0,0 +1,62 @@
+package iohelper
+
+import (
+	"bufio"
+	"io"
+)
+
+// ScannerByDelimFlag is the type of bit flags passed to NewScannerByDelim/NewScannerByDelim2; combine values with |.
+type ScannerByDelimFlag uint
+
+const (
+	// ScannerByDelimFlagEofAsDelim specifies that the scanner should treat EOF as the delimiter as well.
+	ScannerByDelimFlagEofAsDelim ScannerByDelimFlag = 1 << iota
+	// ScannerByDelimFlagDropDelimInReturn specifies that the delimiter should NOT be included in the return value.
+	ScannerByDelimFlagDropDelimInReturn
+	scannerByDelimFlagEnd // one past the last defined flag bit; only used to derive scannerByDelimValidFlags
+
+	// ScannerByDelimFlagEofNotAsDelim specifies that the scanner should NOT treat EOF as the delimiter (no bit set).
+	ScannerByDelimFlagEofNotAsDelim = 0
+	// ScannerByDelimFlagIncludeDelimInReturn specifies that the delimiter should be included in the return value.
+	ScannerByDelimFlagIncludeDelimInReturn = 0
+)
+const (
+	// ScannerByDelimFlagDefault specifies the most commonly used flags: EOF ends the last token, delimiters are dropped.
+	ScannerByDelimFlagDefault = ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn
+	scannerByDelimValidFlags  = scannerByDelimFlagEnd - 1 // bit mask covering every defined flag
+)
+
+// NewScannerByDelim creates a scanner that returns tokens from the source reader separated by a delimiter.
+func NewScannerByDelim(r io.Reader, delim string, flags ScannerByDelimFlag) *bufio.Scanner {
+	return NewScannerByDelim2(r, delim, nil, flags) // nil escape: no escape handling is applied to delim
+}
+
+// NewScannerByDelim2 creates a scanner that returns tokens from the source reader separated by a delimiter; when
+// escape is non-nil, escaped delimiter occurrences (as defined by IndexWithEsc) are not treated as separators.
+// Note: the token returned from the scanner will **NOT** do any unescaping, thus keeping the original value.
+func NewScannerByDelim2(r io.Reader, delim string, escape *rune, flags ScannerByDelimFlag) *bufio.Scanner {
+	flags &= scannerByDelimValidFlags // ignore any bits that are not defined flags
+
+	includeDelimLenInToken := len(delim) // number of delimiter bytes kept at the end of each token
+	if flags&ScannerByDelimFlagDropDelimInReturn != 0 {
+		includeDelimLenInToken = 0
+	}
+
+	eofAsDelim := flags&ScannerByDelimFlagEofAsDelim != 0
+
+	scanner := bufio.NewScanner(r)
+	scanner.Split(
+		func(data []byte, atEof bool) (advance int, token []byte, err error) {
+			if atEof && len(data) == 0 {
+				return 0, nil, nil // input exhausted, nothing left to tokenize
+			}
+			if index := IndexWithEsc(string(data), delim, escape); index >= 0 {
+				return index + len(delim), data[:index+includeDelimLenInToken], nil // consume token and delimiter
+			}
+			if atEof && eofAsDelim {
+				return len(data), data, nil // unterminated tail becomes the final token
+			}
+			return 0, nil, nil // no unescaped delimiter in data: return no token
+		})
+	return scanner
+}
diff --git a/scanners_test.go b/scanners_test.go
new file mode 100644
index 0000000..de263b4
--- /dev/null
+++ b/scanners_test.go
@@ -0,0 +1,95 @@
+package iohelper
+
+import (
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNewScannerByDelim(t *testing.T) {
+	for _, test := range []struct {
+		name           string
+		input          io.Reader
+		delim          string
+		flags          ScannerByDelimFlag
+		expectedTokens []string
+	}{
+		{
+			name:           "multi-char delim | eof as delim | drop delim",
+			input:          strings.NewReader("abc#123##efg####???##xyz##"),
+			delim:          "##",
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn,
+			expectedTokens: []string{"abc#123", "efg", "", "???", "xyz"}, // "####" is two adjacent delimiters, hence ""
+		},
+		{
+			name:           "CR LF delim | eof as delim | include delim",
+			input:          strings.NewReader("\r\n\rabc\r"),
+			delim:          "\r\n",
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn,
+			expectedTokens: []string{"\r\n", "\rabc\r"}, // a lone "\r" is not the delimiter; the tail ends at EOF
+		},
+		{
+			name:           "empty reader",
+			input:          strings.NewReader(""),
+			delim:          "*",
+			flags:          ScannerByDelimFlagDefault,
+			expectedTokens: []string{}, // empty but non-nil, like the tokens slice built below
+		},
+		{
+			name:           "empty token",
+			input:          strings.NewReader("*"),
+			delim:          "*",
+			flags:          ScannerByDelimFlagEofNotAsDelim | ScannerByDelimFlagDropDelimInReturn,
+			expectedTokens: []string{""}, // one empty token before "*"; nothing follows the delimiter
+		},
+		{
+			name:           "trailing newlines",
+			input:          strings.NewReader("*\n"),
+			delim:          "*",
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagIncludeDelimInReturn,
+			expectedTokens: []string{"*", "\n"}, // "\n" is returned only because EOF acts as a delimiter
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			s := NewScannerByDelim(test.input, test.delim, test.flags)
+			tokens := []string{}
+			for s.Scan() {
+				tokens = append(tokens, s.Text())
+			}
+			assert.NoError(t, s.Err())
+			assert.Equal(t, test.expectedTokens, tokens)
+		})
+	}
+}
+
+func TestNewScannerByDelim2(t *testing.T) {
+	for _, test := range []struct {
+		name           string
+		input          io.Reader
+		delim          string
+		esc            rune
+		flags          ScannerByDelimFlag
+		expectedTokens []string
+	}{
+		{
+			name:           "multi-char delim | with delim esc | eof as delim | drop delim",
+			input:          strings.NewReader("abc#123##efg####???##xyz##"),
+			delim:          "##",
+			esc:            rune('?'),
+			flags:          ScannerByDelimFlagEofAsDelim | ScannerByDelimFlagDropDelimInReturn,
+			expectedTokens: []string{"abc#123", "efg", "", "???##xyz"}, // 3rd '?' escapes the '#' after it; escapes are kept
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			s := NewScannerByDelim2(test.input, test.delim, RunePtr(test.esc), test.flags)
+			tokens := []string{}
+			for s.Scan() {
+				tokens = append(tokens, s.Text())
+			}
+			assert.NoError(t, s.Err())
+			assert.Equal(t, test.expectedTokens, tokens)
+		})
+	}
+}
diff --git a/strings.go b/strings.go
new file mode 100644
index 0000000..f80bbdb
--- /dev/null
+++ b/strings.go
@@ -0,0 +1,86 @@
+package iohelper
+
+import (
+	"strings"
+)
+
+// RunePtr returns a pointer to a copy of r, e.g. to pass a rune literal where a *rune escape argument is expected.
+func RunePtr(r rune) *rune {
+	return &r
+}
+
+// IndexWithEsc is similar to strings.Index but taking escape sequence into consideration; it returns a byte offset or -1.
+// For example, IndexWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return 8, not 4.
+func IndexWithEsc(s, delim string, esc *rune) int {
+	if len(delim) == 0 {
+		return 0 // an empty delimiter matches at the very start
+	}
+	if len(s) == 0 {
+		return -1
+	}
+	if esc == nil {
+		return strings.Index(s, delim) // no escape rune given: plain substring search
+	}
+
+	sRunes := []rune(s)
+	delimRunes := []rune(delim)
+	escRune := *esc
+
+	// A plain double loop over runes: not the most efficient algorithm, but simple to follow, and for the vast majority
+	// of use cases delim is a single rune so the cost is negligible. An escape rune makes the rune after it ineligible
+	// as the start of a match. NOTE(review): []rune(s) maps invalid UTF-8 bytes to U+FFFD, so for such input the byte
+	// offset computed below may not match s — confirm callers only pass valid UTF-8.
+	for i := 0; i < len(sRunes)-len(delimRunes)+1; i++ {
+		if sRunes[i] == escRune {
+			// skip the escaped rune (aka the rune after the escape rune); the loop's own i++ completes the skip
+			i++
+			continue
+		}
+		delimFound := true
+		for j := 0; j < len(delimRunes); j++ {
+			if sRunes[i+j] != delimRunes[j] {
+				delimFound = false
+				break
+			}
+		}
+		if delimFound {
+			return len(string(sRunes[:i])) // translate the rune index back into a byte offset
+		}
+	}
+
+	return -1
+}
+
+// SplitWithEsc is similar to strings.Split but taking escape sequence into consideration.
+// For example, SplitWithEsc("abc%|efg|xyz", "|", RunePtr('%')) would return []string{"abc%|efg", "xyz"}.
+func SplitWithEsc(s, delim string, esc *rune) []string {
+	if len(delim) == 0 || esc == nil {
+		return strings.Split(s, delim) // these cases are delegated to strings.Split unchanged
+	}
+	// From here on, delim != empty **and** esc is set.
+	var split []string
+	for delimIndex := IndexWithEsc(s, delim, esc); delimIndex >= 0; delimIndex = IndexWithEsc(s, delim, esc) {
+		split = append(split, s[:delimIndex]) // pieces are raw slices of s, so escape runes stay in them
+		s = s[delimIndex+len(delim):]
+	}
+	split = append(split, s) // whatever follows the last delimiter (possibly "") is the final piece
+	return split
+}
+
+// Unescape unescapes a string with escape sequence: escape runes are removed and the runes they escape are kept.
+// For example, Unescape("abc%|efg", RunePtr('%')) would return "abc|efg".
+func Unescape(s string, esc *rune) string {
+	if esc == nil {
+		return s
+	}
+	sRunes := []rune(s)
+	escRune := *esc
+	for i := 0; i < len(sRunes); i++ {
+		if sRunes[i] != escRune {
+			continue
+		}
+		copy(sRunes[i:], sRunes[i+1:]) // delete the escape rune; the loop's i++ then steps past the rune it escaped
+		sRunes = sRunes[:len(sRunes)-1]
+	}
+	return string(sRunes)
+}
diff --git a/strings_test.go b/strings_test.go
new file mode 100644
index 0000000..65e7bed
--- /dev/null
+++ b/strings_test.go
@@ -0,0 +1,173 @@
+package iohelper
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestIndexWithEsc(t *testing.T) {
+	for _, test := range []struct {
+		name     string
+		input    string
+		delim    string
+		esc      *rune
+		expected int
+	}{
+		// All edge cases:
+		{
+			name:     "delim empty",
+			input:    "abc",
+			delim:    "",
+			esc:      RunePtr(rune('宇')),
+			expected: 0,
+		},
+		{
+			name:     "esc empty",
+			input:    "abc",
+			delim:    "bc",
+			esc:      nil,
+			expected: 1,
+		},
+		{
+			name:     "input empty, delim non empty, esc non empty",
+			input:    "",
+			delim:    "abc",
+			esc:      RunePtr(rune('宙')),
+			expected: -1,
+		},
+		// normal non empty cases:
+		{
+			name:     "len(input) < len(delim)",
+			input:    "a",
+			delim:    "abc",
+			esc:      RunePtr(rune('洪')),
+			expected: -1,
+		},
+		{
+			name:     "len(input) == len(delim), esc not present",
+			input:    "abc",
+			delim:    "abc",
+			esc:      RunePtr(rune('荒')),
+			expected: 0,
+		},
+		{
+			name:     "len(input) > len(delim), esc not present",
+			input:    "мир во всем мире",
+			delim:    "мире",
+			esc:      RunePtr(rune('Ф')),
+			expected: len("мир во всем "),
+		},
+		{
+			name:     "len(input) > len(delim), esc present",
+			input:    "мир во всем /мире",
+			delim:    "мире",
+			esc:      RunePtr(rune('/')),
+			expected: -1, // the only occurrence of the delimiter is escaped
+		},
+		{
+			name:     "len(input) > len(delim), esc present",
+			input:    "мир во всем ξξмире",
+			delim:    "мире",
+			esc:      RunePtr(rune('ξ')),
+			expected: len("мир во всем ξξ"), // ξξ is an escaped ξ, so the delimiter after it is not escaped
+		},
+		{
+			name:     "len(input) > len(delim), consecutive esc present",
+			input:    "мир во вξξξξξсем ξξмире",
+			delim:    "ире",
+			esc:      RunePtr(rune('ξ')),
+			expected: len("мир во вξξξξξсем ξξм"),
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			assert.Equal(t, test.expected, IndexWithEsc(test.input, test.delim, test.esc))
+			if test.expected >= 0 { // a hit is a byte offset: the delimiter must start exactly there in the input
+				assert.True(t, strings.HasPrefix(string([]byte(test.input)[test.expected:]), test.delim))
+			}
+		})
+	}
+}
+
+func TestSplitWithEsc(t *testing.T) {
+	for _, test := range []struct {
+		name     string
+		input    string
+		delim    string
+		esc      *rune
+		expected []string
+	}{
+		{
+			name:     "delim empty",
+			input:    "abc",
+			delim:    "",
+			esc:      RunePtr(rune('宇')),
+			expected: []string{"a", "b", "c"}, // an empty delim goes straight to strings.Split
+		},
+		{
+			name:     "esc not set",
+			input:    "",
+			delim:    "abc",
+			esc:      nil,
+			expected: []string{""},
+		},
+		{
+			name:     "esc set, delim not found",
+			input:    "?xyz",
+			delim:    "xyz",
+			esc:      RunePtr(rune('?')),
+			expected: []string{"?xyz"}, // the lone delimiter occurrence is escaped, so nothing is split
+		},
+		{
+			name:     "esc set, delim found",
+			input:    "a*bc/*d*efg",
+			delim:    "*",
+			esc:      RunePtr(rune('/')),
+			expected: []string{"a", "bc/*d", "efg"}, // the escaped "*" stays inside its piece, escape rune included
+		},
+		{
+			name:     "esc set, delim not empty, input empty",
+			input:    "",
+			delim:    "*",
+			esc:      RunePtr(rune('/')),
+			expected: []string{""},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			assert.Equal(t, test.expected, SplitWithEsc(test.input, test.delim, test.esc))
+		})
+	}
+}
+
+func TestUnescape(t *testing.T) {
+	for _, test := range []struct {
+		name     string
+		input    string
+		esc      *rune
+		expected string
+	}{
+		{
+			name:     "esc not set",
+			input:    "abc",
+			esc:      nil,
+			expected: "abc",
+		},
+		{
+			name:     "esc set, input empty",
+			input:    "",
+			esc:      RunePtr(rune('宇')),
+			expected: "",
+		},
+		{
+			name:     "esc set, input non empty",
+			input:    "ξξabcξdξ",
+			esc:      RunePtr(rune('ξ')),
+			expected: "ξabcd", // "ξξ" yields a literal ξ, "ξd" yields d, and the trailing lone ξ is dropped
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			assert.Equal(t, test.expected, Unescape(test.input, test.esc))
+		})
+	}
+}