Skip to content

Commit

Permalink
Merge pull request #1522 from zazuko/1495
Browse files Browse the repository at this point in the history
CSV handling improvements
  • Loading branch information
tpluscode authored Jun 6, 2024
2 parents 3a03048 + f178fad commit cfdb26d
Show file tree
Hide file tree
Showing 14 changed files with 349 additions and 308 deletions.
5 changes: 5 additions & 0 deletions .changeset/breezy-paws-leave.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@cube-creator/core-api": patch
---

Updates `csv-parse` to v5
5 changes: 5 additions & 0 deletions .changeset/wicked-trees-sort.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@cube-creator/cli": patch
---

Empty lines will be ignored when parsing CSVs (fixes #1495)
5 changes: 5 additions & 0 deletions .changeset/wise-ghosts-think.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@cube-creator/cli": patch
---

Whitespace will be trimmed from CSV headers. A message will be displayed to the user in that case. (fixes #1232)
5 changes: 5 additions & 0 deletions apis/core/lib/domain/csv-source/CsvSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ interface ApiCsvSource {
*/
setDialect(dialect: Partial<Csvw.Dialect>): boolean
appendOrUpdateColumn(params: CreateOrUpdateColumn): CsvColumn.CsvColumn
setTrimError(): void
}

declare module '@cube-creator/model' {
Expand Down Expand Up @@ -84,6 +85,10 @@ export default function Mixin<Base extends Constructor<Omit<CsvSource, keyof Api

return column
}

setTrimError() {
this.errorMessages.push('Some column names had leading or trailing whitespace. They were trimmed. Please adjust the CSV if that was intentional.')
}
}

return Impl
Expand Down
6 changes: 5 additions & 1 deletion apis/core/lib/domain/csv-source/replace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ async function validateNewFile(csvSource: CsvSource, media: MediaObject, storage
}

// Check header
const { header } = await parse(head, parserOptions)
const { header, headerTrimmed } = await parse(head, parserOptions)

if (headerTrimmed) {
csvSource.setTrimError()
}

const missingColumns = csvSource.columns.filter((column) => !header.includes(column.name)).map((column) => column.name)
if (missingColumns.length > 0) {
Expand Down
6 changes: 5 additions & 1 deletion apis/core/lib/domain/csv-source/update.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,16 @@ export async function createOrUpdateColumns(csvSource: CsvSource, getStorage: Ge
const storage = getStorage(csvSource.associatedMedia)
const fileStream = await storage.getStream(csvSource.associatedMedia)
const head = await loadFileHeadString(fileStream, 500)
const { header, rows } = await parse(head, {
const { header, rows, headerTrimmed } = await parse(head, {
bom: true,
delimiter: csvSource.dialect.delimiter,
quote: csvSource.dialect.quoteChar,
})

if (headerTrimmed) {
csvSource.setTrimError()
}

const sampleCol = sampleValues(header, rows)

for (let index = 0; index < header.length; index++) {
Expand Down
27 changes: 22 additions & 5 deletions apis/core/lib/domain/csv/index.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,41 @@
import CSVparse from 'csv-parse'
import { parse as CSVparse, Options } from 'csv-parse'
import CSVSniffer from 'csv-sniffer'

const csvDelimiters = [',', ';', '\t']
const sniffer = new (CSVSniffer())(csvDelimiters)

export function parse(csv: string, options: CSVparse.Options): Promise<{ header: any[]; rows: any[] }> {
return new Promise((resolve, reject) => CSVparse(csv, options, (err, records) => {
interface Parsed {
header: string[]
rows: string[][]
headerTrimmed: boolean

}

export function parse(csv: string, options: Options): Promise<Parsed> {
return new Promise((resolve, reject) => CSVparse(csv, options, (err, records: string[][]) => {
if (err) {
reject(err)
return
}
const [header, ...rows] = records

let headerTrimmed = false

resolve({
header,
header: header.map((name) => {
const trimmed = name.trim()
if (trimmed !== name) {
headerTrimmed = true
}
return trimmed
}),
headerTrimmed,
rows,
})
}))
}

export async function sniffParse(csv: string): Promise<{ dialect: {delimiter: string; quote: string}; header: any[]; rows: any[] }> {
export async function sniffParse(csv: string): Promise<Parsed & { dialect: {delimiter: string; quote: string} }> {
const detectedCsvFormat = sniffer.sniff(csv)
const csvDialect = {
bom: true,
Expand Down
2 changes: 1 addition & 1 deletion apis/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"commander": "^6.1.0",
"content-disposition": "^0.5.3",
"cors": "^2.8.5",
"csv-parse": "^4.12.0",
"csv-parse": "^5",
"csv-sniffer": "^0.1.1",
"debug": "^4.1.1",
"express": "^4.17.1",
Expand Down
62 changes: 62 additions & 0 deletions apis/core/test/domain/csv/file-head.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { createReadStream, promises as fs } from 'fs'
import { resolve } from 'path'
import { Readable } from 'stream'
import { describe, it } from 'mocha'
import { expect } from 'chai'
import { sniffParse } from '../../../lib/domain/csv'
import { loadFileHeadString } from '../../../lib/domain/csv/file-head'

describe('domain/csv/file-head', () => {
it('sniffs and parses', async () => {
const path = resolve(__dirname, '../../fixtures/CH_yearly_air_immission_aggregation_id.csv')
const input = await fs.readFile(path)
const { dialect, header, rows } = await sniffParse(input.toString())
const [lastRow] = rows.slice(-1)

expect(dialect).to.contain({ delimiter: ',', quote: '"' })
expect(header).to.deep.eq(['aggregation_id', 'aggregation_name_de', 'aggregation_name_fr', 'aggregation_name_it', 'aggregation_name_en'])
expect(lastRow).to.deep.eq(['dosisaot40f', 'Dosis AOT40f', 'Dose AOT40f', 'Dose AOT40f', 'Dosis AOT40f'])
})

it('reads parts of a file ', async () => {
const path = resolve(__dirname, '../../fixtures/CH_yearly_air_immission_basetable.csv')
const fileContent = await fs.readFile(path)

const input1 = fileContent.toString()
const input2 = await loadFileHeadString(createReadStream(path))

expect(input1).not.to.eq(input2)

const lines1 = input1?.split('\n') || []
const lines2 = input2?.split('\n') || []
const firstLine2 = lines2[0]
expect(lines1[0]).to.eq(firstLine2)
expect(lines2.length).to.eq(21)
})

it('reads parts of a file with CRLF line endings', async () => {
const input = `"station_id","pollutant_id","aggregation_id","limitvalue","year","value","unit_id","value_remark"\r
"blBAS","so2","annualmean",30,1984,31.9,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1985,40.2,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1985,40.2,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1985,40.2,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1986,33.6,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1987,33,"µg/m3","incomplete series"`
const stream = new Readable()
stream.push(input)
stream.push(null)
const head = await loadFileHeadString(stream)

const lines = head.split('\n')
expect(lines[0]).to.eq('"station_id","pollutant_id","aggregation_id","limitvalue","year","value","unit_id","value_remark"')
expect(lines.length).to.eq(5)
})

it('parses all lines on short file', async () => {
const path = resolve(__dirname, '../../fixtures/CH_yearly_air_immission_unit_id.csv')

const input = await loadFileHeadString(createReadStream(path))
const lines = input?.split('\n') || []
expect(lines.length).be.eq(11)
})
})
68 changes: 17 additions & 51 deletions apis/core/test/domain/csv/parse.test.ts
Original file line number Diff line number Diff line change
@@ -1,62 +1,28 @@
import { createReadStream, promises as fs } from 'fs'
import { resolve } from 'path'
import { Readable } from 'stream'
import { describe, it } from 'mocha'
import { expect } from 'chai'
import { sniffParse } from '../../../lib/domain/csv'
import { loadFileHeadString } from '../../../lib/domain/csv/file-head'
import { parse } from '../../../lib/domain/csv'

describe('domain/csv/parse', () => {
it('sniffs and parses', async () => {
const path = resolve(__dirname, '../../fixtures/CH_yearly_air_immission_aggregation_id.csv')
const input = await fs.readFile(path)
const { dialect, header, rows } = await sniffParse(input.toString())
const [lastRow] = rows.slice(-1)
it('trims headers', async () => {
// given
const input = '" station_id ","\tpollutant_id\t","aggregation_id\t","\tlimitvalue","year"'

expect(dialect).to.contain({ delimiter: ',', quote: '"' })
expect(header).to.deep.eq(['aggregation_id', 'aggregation_name_de', 'aggregation_name_fr', 'aggregation_name_it', 'aggregation_name_en'])
expect(lastRow).to.deep.eq(['dosisaot40f', 'Dosis AOT40f', 'Dose AOT40f', 'Dose AOT40f', 'Dosis AOT40f'])
})

it('reads parts of a file ', async () => {
const path = resolve(__dirname, '../../fixtures/CH_yearly_air_immission_basetable.csv')
const fileContent = await fs.readFile(path)

const input1 = fileContent.toString()
const input2 = await loadFileHeadString(createReadStream(path))
// when
const { header, headerTrimmed } = await parse(input, {})

expect(input1).not.to.eq(input2)

const lines1 = input1?.split('\n') || []
const lines2 = input2?.split('\n') || []
const firstLine2 = lines2[0]
expect(lines1[0]).to.eq(firstLine2)
expect(lines2.length).to.eq(21)
// then
expect(headerTrimmed).to.be.true
expect(header).to.contain.ordered.members(['station_id', 'pollutant_id', 'aggregation_id', 'limitvalue', 'year'])
})

it('reads parts of a file with CRLF line endings', async () => {
const input = `"station_id","pollutant_id","aggregation_id","limitvalue","year","value","unit_id","value_remark"\r
"blBAS","so2","annualmean",30,1984,31.9,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1985,40.2,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1985,40.2,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1985,40.2,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1986,33.6,"µg/m3","incomplete series"\r
"blBAS","so2","annualmean",30,1987,33,"µg/m3","incomplete series"`
const stream = new Readable()
stream.push(input)
stream.push(null)
const head = await loadFileHeadString(stream)

const lines = head.split('\n')
expect(lines[0]).to.eq('"station_id","pollutant_id","aggregation_id","limitvalue","year","value","unit_id","value_remark"')
expect(lines.length).to.eq(5)
})
it('parses header', async () => {
// given
const input = '"station_id","pollutant_id","aggregation_id","limitvalue","year"'

it('parses all lines on short file', async () => {
const path = resolve(__dirname, '../../fixtures/CH_yearly_air_immission_unit_id.csv')
// when
const { header, headerTrimmed } = await parse(input, {})

const input = await loadFileHeadString(createReadStream(path))
const lines = input?.split('\n') || []
expect(lines.length).be.eq(11)
// then
expect(headerTrimmed).to.be.false
expect(header).to.contain.ordered.members(['station_id', 'pollutant_id', 'aggregation_id', 'limitvalue', 'year'])
})
})
8 changes: 4 additions & 4 deletions cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@
"aws-sdk": "^2.559.0",
"barnard59": "^5.0.2",
"barnard59-base": "^2.4.2",
"barnard59-cube": "^1.4.6",
"barnard59-formats": "^2.1.1",
"barnard59-cube": "^1.4.7",
"barnard59-formats": "^3",
"barnard59-graph-store": "^5.1.2",
"barnard59-http": "^2.0.0",
"barnard59-rdf": "^3.4.0",
"barnard59-shacl": "^1.4.5",
"barnard59-shacl": "^1.4.7",
"body-parser": "^1.19.0",
"clownface": "^1",
"commander": "^4.1.1",
Expand All @@ -68,7 +68,7 @@
"once": "^1.4.0",
"rdf-ext": "^1.3.0",
"rdf-literal": "^1.3.0",
"rdf-parser-csvw": "^0.15",
"@zazuko/rdf-parser-csvw": "^0.16",
"rdf-stream-to-dataset-stream": "^1.0.0",
"rdf-utils-fs": "^2.1.0",
"rdf-validate-datatype": "^0.1.3",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"yarn-deduplicate": "^6.0.2"
},
"devDependencies": {
"@changesets/cli": "^2.26.0",
"@changesets/cli": "^2.27.5",
"@tpluscode/eslint-config": "^0.3.3",
"@types/node": "^14.14.7",
"@types/webpack-env": "^1.15.3",
Expand Down
22 changes: 0 additions & 22 deletions patches/rdf-parser-csvw+0.15.2.patch

This file was deleted.

Loading

0 comments on commit cfdb26d

Please sign in to comment.