diff --git a/.gitignore b/.gitignore
index 329c708..6b9388b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@
 # Test binary, built with `go test -c`
 *.test
 
+# JetBrains IDE
+*.idea
+
 # Output of the go coverage tool, specifically when used with LiteIDE
 *.out
 
diff --git a/readers.go b/readers.go
index 66b9cbf..bf23dad 100644
--- a/readers.go
+++ b/readers.go
@@ -1,6 +1,7 @@
 package iohelper
 
 import (
+	"bufio"
 	"bytes"
 	"encoding/csv"
 	"fmt"
@@ -132,3 +133,71 @@ func (r *BytesReplacingReader) Read(p []byte) (int, error) {
 		}
 	}
 }
+
+// ReadLine reads a single line from r and returns it without its trailing '\n' or "\r\n".
+// Lines longer than r's buffer are reassembled from several chunks, so line length is not capped
+// by the buffer size. A final line without a line terminator is returned with a nil error;
+// io.EOF is returned only when there is no data left to return.
+func ReadLine(r *bufio.Reader) (string, error) {
+	// Turns out even with various bufio.Reader.Read???() and bufio.Scanner, there is no simple, clean
+	// way of reading a single text line in:
+	// - bufio.ReadSlice('\n') doesn't have '\r' dropping. We want a line returned with neither '\n' nor '\r'.
+	// - bufio.ReadLine() drops '\r' and '\n', but has a fixed buf so may be unable to read a whole line in one call.
+	// - bufio.ReadBytes no buf size issue, but doesn't offer '\r' and '\n' cleanup.
+	// - bufio.ReadString essentially the same as bufio.ReadBytes.
+	// - bufio.Scanner deals with '\r' and '\n' but has fixed buf issue.
+	// Oh, come on!!
+	//
+	// Also found net/textproto's Reader.ReadLine() which meets all the requirements. But to use it
+	// we need to create yet another type of Reader (net.textproto.Reader), as if the
+	// io.Reader -> bufio.Reader isn't enough for us. So decided instead, just shamelessly copy
+	// net.textproto.Reader.ReadLine() here, credit goes to
+	// https://github.com/golang/go/blob/master/src/net/textproto/reader.go. However its test code
+	// coverage is lacking, so create all the new test cases for this ReadLine implementation copy.
+	var line []byte
+	for {
+		l, more, err := r.ReadLine()
+		if err != nil {
+			// The input can end right after a chunk that filled the buffer: that chunk is flagged
+			// as a prefix and the next call reports io.EOF. The chunks gathered so far are then a
+			// complete, unterminated last line, so return them instead of dropping them.
+			if err == io.EOF && len(line) > 0 {
+				return string(line), nil
+			}
+			return "", err
+		}
+		// Avoid the copy if the first call produced a full line.
+		if line == nil && !more {
+			return string(l), nil
+		}
+		line = append(line, l...)
+		if !more {
+			break
+		}
+	}
+	return string(line), nil
+}
+
+const bom = '\uFEFF'
+
+// StripBOM returns a new io.Reader that, if needed, strips away the BOM (byte order marker) of
+// the input io.Reader.
+func StripBOM(reader io.Reader) (io.Reader, error) {
+	br := bufio.NewReader(reader)
+	r, _, err := br.ReadRune()
+	switch {
+	case err == io.EOF:
+		// This is to handle empty file, can't call UnreadRune(), will meet ErrInvalidUnreadRune as
+		// b.lastRuneSize is -1. So simply reset buffer io.
+		br.Reset(reader)
+		return br, nil
+	case err != nil:
+		return nil, err
+	case r == bom:
+		return br, nil
+	default:
+		// Here we shouldn't meet any error during unread rune.
+		_ = br.UnreadRune()
+		return br, nil
+	}
+}
diff --git a/readers_test.go b/readers_test.go
index 446c88f..96f5301 100644
--- a/readers_test.go
+++ b/readers_test.go
@@ -1,7 +1,9 @@
 package iohelper
 
 import (
+	"bufio"
 	"bytes"
+	"errors"
 	"io"
 	"io/ioutil"
 	"math/rand"
@@ -160,3 +162,122 @@ func BenchmarkRegularReader_50KBLength_1000Targets(b *testing.B) {
 		_, _ = ioutil.ReadAll(bytes.NewReader(testInput50KBLength1000Targets))
 	}
 }
+
+func TestReadLine(t *testing.T) {
+	for _, test := range []struct {
+		name           string
+		input          string
+		bufsize        int
+		expectedOutput []string
+	}{
+		{
+			name:           "empty",
+			input:          "",
+			bufsize:        1024,
+			expectedOutput: []string{},
+		},
+		{
+			name:           "single-line with no newline",
+			input:          " word1, word2 - word3 !@#$%^&*()",
+			bufsize:        1024,
+			expectedOutput: []string{" word1, word2 - word3 !@#$%^&*()"},
+		},
+		{
+			name:           "single-line with no newline - length equal to bufsize",
+			input:          "0123456789012345",
+			bufsize:        16, // bufio.minReadBufferSize is 16.
+			expectedOutput: []string{"0123456789012345"},
+		},
+		{
+			name:           "single-line with '\\r' and '\\n'",
+			input:          "line1\r\n",
+			bufsize:        1024,
+			expectedOutput: []string{"line1"},
+		},
+		{
+			name:           "multi-line - bufsize enough",
+			input:          "line1\r\nline2\nline3",
+			bufsize:        1024,
+			expectedOutput: []string{"line1", "line2", "line3"},
+		},
+		{
+			name:           "multi-line - bufsize not enough; also empty line",
+			input:          "line1-0123456789012345\r\n\nline3-0123456789012345",
+			bufsize:        16, // bufio.minReadBufferSize is 16.
+			expectedOutput: []string{"line1-0123456789012345", "", "line3-0123456789012345"},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			r := bufio.NewReaderSize(strings.NewReader(test.input), test.bufsize)
+			output := []string{}
+			for {
+				line, err := ReadLine(r)
+				if err != nil {
+					assert.Equal(t, "", line)
+					assert.Equal(t, io.EOF, err)
+					break
+				}
+				output = append(output, line)
+			}
+			assert.Equal(t, test.expectedOutput, output)
+		})
+	}
+}
+
+func TestStripBOM_Success(t *testing.T) {
+	for _, test := range []struct {
+		name            string
+		fileContent     []byte
+		expectedContent []byte
+	}{
+		{
+			name:            "Empty content",
+			fileContent:     []byte(""),
+			expectedContent: []byte(""),
+		},
+		{
+			name:            "Non-bom unicode",
+			fileContent:     []byte("\u1234test content"),
+			expectedContent: []byte("\u1234test content"),
+		},
+		{
+			name:            "Content without BOM",
+			fileContent:     []byte("test content"),
+			expectedContent: []byte("test content"),
+		},
+		{
+			name:            "Content with BOM",
+			fileContent:     []byte("\uFEFFtest content"),
+			expectedContent: []byte("test content"),
+		},
+		{
+			name:            "Content with BOM only",
+			fileContent:     []byte("\uFEFF"),
+			expectedContent: []byte(""),
+		},
+	} {
+		r := bytes.NewReader(test.fileContent)
+		br, err := StripBOM(r)
+		assert.NoError(t, err)
+		assert.False(t, br == nil)
+		line, _, err := br.(*bufio.Reader).ReadLine()
+		if len(test.expectedContent) <= 0 {
+			assert.Error(t, err)
+			assert.Equal(t, io.EOF, err)
+			continue
+		}
+		assert.NoError(t, err)
+		assert.Equal(t, test.expectedContent, line)
+	}
+}
+
+type failureReader struct{}
+
+func (r *failureReader) Read([]byte) (int, error) { return 0, errors.New("test failure") }
+
+func TestStripBOM_ReadFailure(t *testing.T) {
+	br, err := StripBOM(&failureReader{})
+	assert.True(t, br == nil)
+	assert.Error(t, err)
+	assert.Equal(t, "test failure", err.Error())
+}