Skip to content

Commit

Permalink
Add ColumnsCSV filter
Browse files Browse the repository at this point in the history
Adds a CSV-aware columns filter that preserves quoted columns containing
the delimiter. The existing columns implementation does a simple split
on the delimiter, meaning any column containing the delimiter will be
incorrectly split. For many cases this is fine, but more complex
scenarios require proper CSV parsing.
  • Loading branch information
dyson committed Feb 11, 2024
1 parent 37a5de6 commit 5b16f07
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 28 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ for a good list and examples). These tools do a single job well and have many
powerful features to accomplish anything you might want to do.

On the other hand there can be a bit of a learning curve to remember their
names , flags, and usage - even for basic tasks.
names, flags, and usage - even for basic tasks.

Pipesore is intended to be a single command that covers the most useful use
cases of these tools while being intuitive to even someone who has never seen
Expand Down Expand Up @@ -69,6 +69,8 @@ first line of the input and return all other lines.
| Filter | |
| ------ | ------- |
| Columns(delimiter *string*, columns *string*) | Returns the selected `columns` in order where `columns` is a 1-indexed comma separated list of column positions. Columns are defined by splitting with the 'delimiter'. |

| ColumnsCSV(delimiter *string*, columns *string*) | Returns the selected `columns` in order where `columns` is a 1-indexed comma separated list of column positions. Parsing is CSV aware, so quoted columns that contain the `delimiter` are preserved rather than split. |
| CountLines() | Returns the line count. Lines are delimited by `\r?\n`. |
| CountRunes() | Returns the rune (Unicode code points) count. Erroneous and short encodings are treated as single runes of width 1 byte. |
| CountWords() | Returns the word count. Words are delimited by<br />`\t\|\n\|\v\|\f\|\r\| \|0x85\|0xA0`. |
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module github.com/dyson/pipesore

go 1.19
go 1.21
66 changes: 65 additions & 1 deletion pkg/pipeline/filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package pipeline
import (
"bufio"
"container/ring"
"encoding/csv"
"fmt"
"io"
"reflect"
Expand All @@ -16,6 +17,7 @@ import (
var (
Filters = map[string]reflect.Value{
"columns": reflect.ValueOf(Columns),
"columnscsv": reflect.ValueOf(ColumnsCSV),
"countlines": reflect.ValueOf(CountLines),
"countrunes": reflect.ValueOf(CountRunes),
"countwords": reflect.ValueOf(CountWords),
Expand Down Expand Up @@ -43,7 +45,7 @@ func Columns(delimiter string, columns string) func(io.Reader, io.Writer) error
for _, column := range strings.Split(columns, ",") {
index, err := strconv.Atoi(strings.TrimSpace(column))
if err != nil {
return fmt.Errorf("list of columns must be comma serarated list of ints, got: %v", columns)
return fmt.Errorf("list of columns must be comma separated list of ints, got: %v", columns)
}

order = append(order, index)
Expand All @@ -66,7 +68,69 @@ func Columns(delimiter string, columns string) func(io.Reader, io.Writer) error

return scanner.Err()
}
}

// ColumnsCSV returns a CSV aware filter that writes the selected 'columns' in
// the order provided where 'columns' is a 1-indexed comma separated list of
// column positions. Records are parsed with encoding/csv using 'delimiter' as
// the field separator, so quoted fields that contain the delimiter are kept
// intact instead of being split.
//
// The returned filter reports an error when 'delimiter' is not exactly one
// rune, when 'columns' is not a comma separated list of ints, when a column
// position is less than 1, when the input cannot be parsed, or when writing
// the output fails. Positions beyond the end of a record are skipped for that
// record.
//
// Output records are written with a csv.Writer left at its default settings,
// which quotes fields only when required.
func ColumnsCSV(delimiter string, columns string) func(io.Reader, io.Writer) error {
	return func(r io.Reader, w io.Writer) error {
		// An empty delimiter would decode to utf8.RuneError below and only
		// surface later as an opaque csv error, so reject it up front.
		if utf8.RuneCountInString(delimiter) != 1 {
			return fmt.Errorf("delimiter must be a single rune, got: %q", delimiter)
		}

		fields := strings.Split(columns, ",")
		order := make([]int, 0, len(fields))
		for _, column := range fields {
			index, err := strconv.Atoi(strings.TrimSpace(column))
			if err != nil {
				return fmt.Errorf("list of columns must be comma separated list of ints, got: %v", columns)
			}
			// Positions are 1-indexed; zero or negative values would index
			// before the start of the record and panic.
			if index < 1 {
				return fmt.Errorf("column positions are 1-indexed and must be greater than 0, got: %v", columns)
			}

			order = append(order, index)
		}

		reader := csv.NewReader(r)
		reader.Comma, _ = utf8.DecodeRuneInString(delimiter)
		// We really shouldn't be tolerant of malformed CSV input (and should
		// error) however we can set LazyQuotes to be less strict for commonly
		// incorrect quoting.
		//
		// Unfortunately how incorrect quoting should be interpreted is highly
		// dependent on how it was incorrectly implemented and so with LazyQuotes
		// enabled we will in some cases silently parse malformed CSV in a possibly
		// unexpected way to the user.
		//
		// On the other hand users don't always have control over the generation of
		// the CSV input and so it is hoped that the trade-off in using LazyQuotes
		// will allow for a better experience overall. If this is not the case we
		// can disable LazyQuotes and only parse valid rfc4180
		// (https://www.rfc-editor.org/rfc/rfc4180.html) csv.
		reader.LazyQuotes = true

		writer := csv.NewWriter(w)

		for {
			lineColumns, err := reader.Read()
			if err == io.EOF {
				break
			}
			if err != nil {
				// Emit the records that were parsed successfully before
				// reporting the parse failure.
				writer.Flush()
				return err
			}

			output := make([]string, 0, len(order))
			for _, v := range order {
				if v <= len(lineColumns) {
					output = append(output, lineColumns[v-1])
				}
			}

			if err := writer.Write(output); err != nil {
				return err
			}
		}

		// csv.Writer buffers its output; Flush does not return an error, so
		// any write failure has to be collected from Error afterwards.
		writer.Flush()
		return writer.Error()
	}
}

// CountLines returns a filter that writes the number of lines read.
Expand Down
5 changes: 4 additions & 1 deletion pkg/pipeline/filters_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ func TestFilters(t *testing.T) {
{Columns(",", "3,2,1"), "one\t\tthree\n", "one\t\tthree\n"},
{Columns("\t", "9"), "one\t\tthree\n", "\n"},
{Columns("\t", "3,2,1"), "one\t\tthree\n", "three\t\tone\n"},
{ColumnsCSV(",", "3,2,1"), "one\t\tthree\n", "one\t\tthree\n"},
{ColumnsCSV("\t", "9"), "one\t\tthree\n", "\n"},
{ColumnsCSV(",", "3,2,1"), "one,\"t,w,o\",\"th\"\"ree\"\n", "\"th\"\"ree\",\"t,w,o\",one\n"},
{CountLines(), "", "0\n"},
{CountLines(), input, "3\n"},
{CountRunes(), "", "0\n"},
Expand Down Expand Up @@ -69,7 +72,7 @@ func TestFilters(t *testing.T) {

err := tc.filter(strings.NewReader(tc.input), got)
if err != nil {
t.Fatalf("(test: %d) error executing filter: %v", k, err)
t.Fatalf("(test: %d) error executing filter: %v: input: %v", k, err, tc.input)
}

if tc.want != got.String() {
Expand Down
24 changes: 0 additions & 24 deletions pkg/pipeline/pipeline.go
Original file line number Diff line number Diff line change
@@ -1,27 +1,3 @@
// MIT License

// Copyright (c) 2019 John Arundel, 2022 Dyson Simmons

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// MIT License

package pipeline

import (
Expand Down

0 comments on commit 5b16f07

Please sign in to comment.