Skip to content

Commit

Permalink
Fix handling of ";" in values
Browse files Browse the repository at this point in the history
The format with unquoted file names is no longer supported.
  • Loading branch information
jlaasonen committed Mar 14, 2019
1 parent 90c987a commit 026b11f
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 57 deletions.
16 changes: 1 addition & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Usage:
node index.js <a path to the catalogue directory> <MongoDB URI> <MongoDB database>
```

The catalogue directory should contain "csv" files with the following formats:
The catalogue directory should contain "csv" files with the following format:

```
"Search Results"
Expand Down Expand Up @@ -43,20 +43,6 @@ The catalogue directory should contain "csv" files with the following formats:
```

```
"Search Results"
"Summary"
"Saved on";"27.07.2018 12:19:10"
"Searched for";"K 14021"
"In index(es)";"X:\\yyy\\yyy\\_OA\\Indice\\Index.pdx"
"Number of document(s) found";"1"
"Number of instance(s) found";"1"
"File name";"Title";"Page";"Search Instance"
"BC 3.pdf";"";"467";"houses, etc. [K. 14021] Left-hand corner, 1ain. by 1lin.; ......... + 6 "
```

The fragment ID is parsed from the file name:
- `<number>.csv` => `K.<number without leading zeroes>`
- `Sm <number>.csv` => `Sm.<number without leading zeroes>`
Expand Down
13 changes: 7 additions & 6 deletions lib/extractDocuments.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
const _ = require('lodash')
const Papa = require('papaparse')
const Reference = require('./Reference')

function createReference () {
return ([document, title, page]) => new Reference(
_.trim(document, '"').replace(/.pdf$/, ''),
[_.trim(page, '"')].filter(page => /\d+/.test(page)).map(_.toNumber)
/**
 * Builds a Reference from one parsed catalogue row.
 *
 * @param {string[]} row - Row cells in the order [file name, title, page];
 *   the title cell is destructured but not used.
 * @returns {Reference} A Reference constructed with the file name stripped of
 *   a trailing ".pdf" and an array holding the page as a number, or an empty
 *   array when the page cell contains no digit.
 */
function createReference ([document, title, page]) {
  // The dot is escaped so only a literal ".pdf" extension is removed; an
  // unescaped dot would also truncate names that merely end in e.g. "epdf".
  const name = document.replace(/\.pdf$/, '')
  const pages = [page].filter(cell => /\d+/.test(cell)).map(_.toNumber)
  return new Reference(name, pages)
}

Expand All @@ -15,8 +16,8 @@ module.exports = function countHits (catalog) {
.reject(line => /^\s*$/.test(line))
.reject(line => /"?File name"?;"Title";"Page";"Search Instance"/.test(line))
.reject(line => /^"?BC .+\.pdf/.test(line))
.map(line => _(line).split(';').take(3))
.map(createReference())
.map(line => _.take(Papa.parse(line, { delimiter: ';' }).data[0], 3))
.map(createReference)
.groupBy('document')
.mapValues(Reference.mergeAll)
.values()
Expand Down
51 changes: 17 additions & 34 deletions lib/extractDocuments.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,54 +11,37 @@ Number of document(s) found;"9" \t\t\t\t\t\t
Number of instance(s) found;"16" \t\t\t\t\t\t
\t\t\t\t\t\t
File name;"Title";"Page";"Search Instance"\t\t\t\t\t\t
Fincke 2003-2004 The Babylonian Texts of Nineveh AfO 50.pdf;"";"";"Ashurbanipal (54): 48-7-20\t 115; 80-7-19\t 35; 40; 46; 82-"\t\t\t\t
"Fincke 2003-2004 The Babylonian Texts of Nineveh AfO 50.pdf";"";"";"Ashurbanipal (54): 48-7-20\t 115; 80-7-19\t 35; 40; 46; 82-"\t\t\t\t
\t\t\t\t\t\t
File name;"Title";"Page";"Search Instance"\t\t\t\t\t\t
0(7).pdf;"";"7";"14\t [412] 48-7-20\t115\t obv. 12\t [464] K. 1519\t obv. "\t
0(7).pdf;"";"7";"21\t [412] 48-7-20\t115\t rv. 3\t11\t14\t [418] Sm. "
0(7).pdf;"";"";"21\t [412] 48-7-20\t115\t rv. 14\t [571] K. 998\t obv. "\t
0(7).pdf;"";"10";"4\t [412] 48-7-20\t115\t obv. 15\t [460] K. 1250\t obv. "\t
"0(7).pdf";"";"7";"14\t [412] 48-7-20\t115\t obv. 12\t [464] K. 1519\t obv. "\t
"0(7).pdf";"";"7";"21\t [412] 48-7-20\t115\t rv. 3\t11\t14\t [418] Sm. "
"0(7).pdf";"";"";"21\t [412] 48-7-20\t115\t rv. 14\t [571] K. 998\t obv. "\t
"0(7).pdf";"";"10";"4\t [412] 48-7-20\t115\t obv. 15\t [460] K. 1250\t obv. "\t
\t\t\t\t\t\t
File name;"Title";"Page";"Search Instance"\t\t\t\t\t\t
ABL 4-6.pdf;"";"97";"433 [412.] 48-7-20\t 115. OBVERSE. T; ^ft H ' T "\t\t\t\t\t
ABL 4-6.pdf;"";"98";"434 [412.] 48-7-20\t 115 (continued). REVERSE. raTA T T e "\t\t\t\t\t
ABL 4-6.pdf;"";"127";"Rm. 2\t 5 48-7-20\t 115 Bu. 91-5-9\t 12 Rm. 77 "\t\t\t
"ABL 4-6.pdf";"";"97";"433 [412.] 48-7-20\t 115. OBVERSE. T; ^ft H ' T "\t\t\t\t\t
"ABL 4-6.pdf";"";"98";"434 [412.] 48-7-20\t 115 (continued). REVERSE. raTA T T e "\t\t\t\t\t
"ABL 4-6.pdf";"";"127";"Rm. 2\t 5 48-7-20\t 115 Bu. 91-5-9\t 12 Rm. 77 "\t\t\t
\t\t\t\t\t\t
File name;"Title";"Page";"Search Instance"\t\t\t\t\t\t
BC Abcd 123.pdf;"";"330";"p. 372. [48-7-20\t 115] Upper half\t 2 in. by 1 in.; "\t\t\t\t
"BC Abcd 123.pdf";"";"330";"p. 372. [48-7-20\t 115] Upper half\t 2 in. by 1 in.; "\t\t\t\t
\t\t\t\t\t\t
File name;"Title";"Page";"Search Instance"\t\t\t\t\t\t
BC 5.pdf;"";"194";"(p. 219). 48-7-20\t 115 (p. 1688). 67-4-2\t 1 "\t\t\t\t
BC 5.pdf;"";"208";"to the king. 48-7-20\t 115 (p. 1688). Letter to the - "\t\t\t\t\t
CAD 5.pdf;"";"not a number";"to the king. 48-7-20\t 115 (p. 1688). Letter to the - "\t\t\t\t\t`

const alternativeCatalog = `"Search Results"
"Summary"
"Saved on";"27.07.2018 12:19:10"
"Searched for";"K 14021"
"In index(es)";"X:\\yyy\\yyy\\_OA\\Indice\\Index.pdx"
"Number of document(s) found";"1"
"Number of instance(s) found";"1"
"File name";"Title";"Page";"Search Instance"
"ABL 4-6.pdf";"";"97";"433 [412.] 48-7-20\t 115. OBVERSE. T; ^ft H ' T "
"BC 3.pdf";"";"467";"houses, etc. [K. 14021] Left-hand corner, 1ain. by 1lin.; ......... + 6 "
"CAD 3.pdf";"";"";"houses, etc. [K. 14021] Left-hand corner, 1ain. by 1lin.; ......... + 6 "
"BC 5.pdf";"";"194";"(p. 219). 48-7-20\t 115 (p. 1688). 67-4-2\t 1 "\t\t\t\t
"BC 5.pdf";"";"208";"to the king. 48-7-20\t 115 (p. 1688). Letter to the - "\t\t\t\t\t
"CAD 5.pdf";"";"not a number";"to the king. 48-7-20\t 115 (p. 1688). Letter to the - "\t\t\t\t\t
"Abraham, Szuszan in the Egibi Texts (Susan), OLP 28 1997.pdf";"0548;OLP28; 04Abraham";"3";""
"Biggs 1992 Rv Durand ARM 26 1; Charpin et al ARM 26 2 JNES 51.pdf";"<product> <source> <xref ref-type=""transliteration"" rid=""trans12"" ptype=""t545838"" citart=""citart1"">Archives <html_ent glyph=""@#233;"" ascii=""e""></html_ent>pistolaires de Mari</xref> </source> <contrib contrib-type=""author""> <name> <given-names>Jean-Marie</given-names> <surname>Durand</surname> </name> </contrib> </product><product> <source> <xref ref-type=""transliteration"" rid=""trans13"" ptype=""t545838"" citart=""citart1"">Archives <html_ent glyph=""@#233;"" ascii=""e""></html_ent>pistolaires de Mari</xref> </source> <contrib contrib-type=""author""> <name> <given-names>Dominique</given-names> <surname>Charpin</surname> </name> </contrib> <contrib contrib-type=""author""> <name> <given-names>Francis</given-names> <surname>Jonnes</surname> </name> </contrib> <contrib contrib-type=""author""> <name> <given-names>Sylvie</given-names> <surname>Lackenbacher</surname> </name> </contrib> <contrib contrib-type=""author""> <name> <given-names>Bertrand</given-names> <surname>Lafont</surname> </name> </contrib> </product>";"3";"p. 51, n. 243, Durand refers to YOS 10 36. The text concerns the "
`

test('Returns documents not starting with "BC" or "CAD"', () => {
expect(extractDocuments(catalog)).toEqual([
new Reference('Fincke 2003-2004 The Babylonian Texts of Nineveh AfO 50', []),
new Reference('0(7)', [7, 10]),
new Reference('ABL 4-6', [97, 98, 127]),
new Reference('CAD 5', [])
])
})

test('Returns documents not starting with "BC" or "CAD" in alternative format', () => {
expect(extractDocuments(alternativeCatalog)).toEqual([
new Reference('ABL 4-6', [97]),
new Reference('CAD 3', [])
new Reference('CAD 5', []),
new Reference('Abraham, Szuszan in the Egibi Texts (Susan), OLP 28 1997', [3]),
new Reference('Biggs 1992 Rv Durand ARM 26 1; Charpin et al ARM 26 2 JNES 51', [3])
])
})
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "fragment-hit-counter",
"version": "2.0.1",
"description": "A script to extract fragment hits from a catalogue",
"version": "3.0.0",
"description": "A script to extract references from a catalogue of CSV files",
"main": "index.js",
"repository": "https://github.com/ElectronicBabylonianLiterature/fragment-hit-counter.git",
"author": "Jussi Laasonen <[email protected]>",
Expand Down Expand Up @@ -30,6 +30,7 @@
"dependencies": {
"lodash": "^4.17.10",
"mongodb": "^3.1.13",
"papaparse": "^4.6.3",
"progress": "https://github.com/turbopope/node-progress.git"
}
}
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3207,6 +3207,11 @@ p-try@^2.0.0:
resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.0.0.tgz#85080bb87c64688fa47996fe8f7dfbe8211760b1"
integrity sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==

papaparse@^4.6.3:
version "4.6.3"
resolved "https://registry.yarnpkg.com/papaparse/-/papaparse-4.6.3.tgz#742e5eaaa97fa6c7e1358d2934d8f18f44aee781"
integrity sha512-LRq7BrHC2kHPBYSD50aKuw/B/dGcg29omyJbKWY3KsYUZU69RKwaBHu13jGmCYBtOc4odsLCrFyk6imfyNubJQ==

parse-json@^2.2.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/parse-json/-/parse-json-2.2.0.tgz#f480f40434ef80741f8469099f8dea18f55a4dc9"
Expand Down

0 comments on commit 026b11f

Please sign in to comment.