Skip to content

Commit

Permalink
Merge pull request #6 from jf-tech/bom
Browse files Browse the repository at this point in the history
Add `ReadLine` and `StripBOM` helpers
  • Loading branch information
jf-tech authored Aug 31, 2020
2 parents 0ae9960 + d94410e commit 473c5dc
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
# Test binary, built with `go test -c`
*.test

# JetBrains IDE
*.idea

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

Expand Down
60 changes: 60 additions & 0 deletions readers.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package iohelper

import (
"bufio"
"bytes"
"encoding/csv"
"fmt"
Expand Down Expand Up @@ -132,3 +133,62 @@ func (r *BytesReplacingReader) Read(p []byte) (int, error) {
}
}
}

// ReadLine reads a single line from r and returns it without its trailing
// line terminator ('\n' or "\r\n").
//
// Unlike bufio.Reader.ReadLine, the returned line is never cut short by the
// reader's buffer size: fragments are accumulated until the whole line has
// been read. A final line that has no terminator is returned with a nil
// error; the call after it returns io.EOF. On any error the returned string
// is empty.
func ReadLine(r *bufio.Reader) (string, error) {
	// None of the standard helpers does exactly what we need:
	// - bufio.Reader.ReadSlice('\n') doesn't drop '\r' or '\n'.
	// - bufio.Reader.ReadLine drops '\r' and '\n', but uses a fixed buffer, so
	//   it may be unable to return a whole line in one call.
	// - bufio.Reader.ReadBytes has no buffer size issue, but doesn't offer
	//   '\r' and '\n' cleanup.
	// - bufio.Reader.ReadString is essentially the same as ReadBytes.
	// - bufio.Scanner deals with '\r' and '\n' but has the fixed buffer issue.
	//
	// net/textproto's Reader.ReadLine meets all the requirements, but using it
	// would require wrapping the bufio.Reader in yet another reader type. So
	// its loop is reproduced here instead; credit goes to
	// https://github.com/golang/go/blob/master/src/net/textproto/reader.go.
	var line []byte
	for {
		l, more, err := r.ReadLine()
		if err != nil {
			// If earlier iterations already collected fragments, hitting EOF
			// now means the input ended with an unterminated line whose end
			// fell exactly on a buffer boundary. Return what was collected
			// instead of dropping it; the next call reports io.EOF.
			if err == io.EOF && len(line) > 0 {
				return string(line), nil
			}
			return "", err
		}
		// Avoid the copy if the first call produced a full line.
		if line == nil && !more {
			return string(l), nil
		}
		// l aliases the reader's internal buffer and is only valid until the
		// next read, so it must be copied before looping.
		line = append(line, l...)
		if !more {
			return string(line), nil
		}
	}
}

// bom is the Unicode byte order mark code point.
const bom = '\uFEFF'

// StripBOM wraps reader in a buffered reader and consumes the first rune if
// it is a BOM (byte order mark). Any other first rune is pushed back, so the
// returned reader yields the input unchanged.
//
// An empty input is not an error: a reader that yields nothing is returned.
// If reading the first rune fails with anything other than io.EOF, that error
// is returned together with a nil reader.
func StripBOM(reader io.Reader) (io.Reader, error) {
	buffered := bufio.NewReader(reader)
	first, _, readErr := buffered.ReadRune()
	if readErr == io.EOF {
		// Empty input. No rune was read, so UnreadRune() would fail with
		// ErrInvalidUnreadRune; reset the buffered reader onto the same
		// source instead.
		buffered.Reset(reader)
		return buffered, nil
	}
	if readErr != nil {
		return nil, readErr
	}
	if first != bom {
		// The last operation was a successful ReadRune, so UnreadRune is not
		// expected to fail here; its result is deliberately ignored.
		_ = buffered.UnreadRune()
	}
	return buffered, nil
}
115 changes: 115 additions & 0 deletions readers_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package iohelper

import (
"bufio"
"bytes"
"errors"
"io"
"io/ioutil"
"math/rand"
Expand Down Expand Up @@ -160,3 +162,116 @@ func BenchmarkRegularReader_50KBLength_1000Targets(b *testing.B) {
_, _ = ioutil.ReadAll(bytes.NewReader(testInput50KBLength1000Targets))
}
}

// TestReadLine drains ReadLine over several inputs and buffer sizes and checks
// that the collected lines match, and that the loop always ends on io.EOF with
// an empty line.
func TestReadLine(t *testing.T) {
	type readLineCase struct {
		name           string
		input          string
		bufsize        int
		expectedOutput []string
	}
	cases := []readLineCase{
		{
			name:           "empty",
			input:          "",
			bufsize:        1024,
			expectedOutput: []string{},
		},
		{
			name:           "single-line with no newline",
			input:          " word1, word2 - word3 !@#$%^&*()",
			bufsize:        1024,
			expectedOutput: []string{" word1, word2 - word3 !@#$%^&*()"},
		},
		{
			name:           "single-line with '\\r' and '\\n'",
			input:          "line1\r\n",
			bufsize:        1024,
			expectedOutput: []string{"line1"},
		},
		{
			name:           "multi-line - bufsize enough",
			input:          "line1\r\nline2\nline3",
			bufsize:        1024,
			expectedOutput: []string{"line1", "line2", "line3"},
		},
		{
			name:           "multi-line - bufsize not enough; also empty line",
			input:          "line1-0123456789012345\r\n\nline3-0123456789012345",
			bufsize:        16, // bufio.minReadBufferSize is 16.
			expectedOutput: []string{"line1-0123456789012345", "", "line3-0123456789012345"},
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			reader := bufio.NewReaderSize(strings.NewReader(tc.input), tc.bufsize)
			// Start from an empty (non-nil) slice so the "empty" case compares
			// equal to its []string{} expectation.
			lines := []string{}
			for {
				line, err := ReadLine(reader)
				if err == nil {
					lines = append(lines, line)
					continue
				}
				// The only acceptable way out of the loop is a clean EOF.
				assert.Equal(t, "", line)
				assert.Equal(t, io.EOF, err)
				break
			}
			assert.Equal(t, tc.expectedOutput, lines)
		})
	}
}

// TestStripBOM_Success checks that StripBOM removes a leading BOM when there
// is one and otherwise leaves the content untouched. Each case runs as a named
// subtest so a failure can be attributed to the input that caused it.
func TestStripBOM_Success(t *testing.T) {
	for _, test := range []struct {
		name            string
		fileContent     []byte
		expectedContent []byte
	}{
		{
			name:            "Empty content",
			fileContent:     []byte(""),
			expectedContent: []byte(""),
		},
		{
			name:            "Non-bom unicode",
			fileContent:     []byte("\u1234test content"),
			expectedContent: []byte("\u1234test content"),
		},
		{
			name:            "Content without BOM",
			fileContent:     []byte("test content"),
			expectedContent: []byte("test content"),
		},
		{
			name:            "Content with BOM",
			fileContent:     []byte("\uFEFFtest content"),
			expectedContent: []byte("test content"),
		},
		{
			name:            "Content with BOM only",
			fileContent:     []byte("\uFEFF"),
			expectedContent: []byte(""),
		},
	} {
		t.Run(test.name, func(t *testing.T) {
			r := bytes.NewReader(test.fileContent)
			br, err := StripBOM(r)
			assert.NoError(t, err)
			assert.False(t, br == nil)
			line, _, err := br.(*bufio.Reader).ReadLine()
			if len(test.expectedContent) == 0 {
				// Nothing is left after stripping, so the first read must
				// report EOF.
				assert.Error(t, err)
				assert.Equal(t, io.EOF, err)
				return
			}
			assert.NoError(t, err)
			assert.Equal(t, test.expectedContent, line)
		})
	}
}

// failureReader is an io.Reader stub used to exercise read-error paths.
type failureReader struct{}

// Read never fills the buffer: it always reports zero bytes read together
// with a "test failure" error.
func (*failureReader) Read(_ []byte) (int, error) {
	return 0, errors.New("test failure")
}

// TestStripBOM_ReadFailure feeds StripBOM a reader that always fails and
// expects a nil reader back along with the reader's own error.
func TestStripBOM_ReadFailure(t *testing.T) {
	stripped, readErr := StripBOM(&failureReader{})
	// Compare against nil directly so only a nil interface value passes.
	assert.True(t, stripped == nil)
	assert.Error(t, readErr)
	assert.Equal(t, "test failure", readErr.Error())
}

0 comments on commit 473c5dc

Please sign in to comment.