Skip to content

Commit

Permalink
Extract value cleaning methods into a new utils class and implement t…
Browse files Browse the repository at this point in the history
…he html entity replacement as per POR-155
  • Loading branch information
mdoering committed Mar 3, 2015
1 parent 93f452a commit e57c26d
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 45 deletions.
26 changes: 26 additions & 0 deletions src/main/java/org/gbif/dwc/record/CleanUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.gbif.dwc.record;

import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;

/**
 * Static helpers for cleaning raw string values: optional replacement of literal null markers
 * and optional unescaping of html/xml entities.
 */
public class CleanUtils {
  /**
   * Finds values that are empty, whitespace only, or just a literal null marker
   * ("null" or "\N", any case) optionally surrounded by whitespace.
   */
  private static final Pattern LITERAL_NULL = Pattern.compile("^\\s*(null|\\\\N)?\\s*$", Pattern.CASE_INSENSITIVE);

  /** Utility class, not meant to be instantiated. */
  private CleanUtils() {
  }

  /**
   * Cleans a single string value according to the requested options.
   *
   * @param value the original string, may be null
   * @param nulls if true, blank values and common literal NULL markers such as "\N" or "NULL"
   *        are turned into a real null
   * @param entities if true, entities in the value are unescaped via
   *        {@code StringEscapeUtils.unescapeHtml4}
   * @return null if the input is null or was recognised as a literal null, otherwise the
   *         (optionally unescaped) value
   */
  public static String clean(String value, boolean nulls, boolean entities) {
    if (value == null) {
      return null;
    }
    // the null check runs on the raw value, before any entity is unescaped
    if (nulls && LITERAL_NULL.matcher(value).find()) {
      return null;
    }
    if (entities) {
      return StringEscapeUtils.unescapeHtml4(value);
    }
    return value;
  }

}
48 changes: 24 additions & 24 deletions src/main/java/org/gbif/dwc/record/RecordImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,63 +9,63 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

public class RecordImpl implements Record {

private static final Pattern NULL_REPL = Pattern.compile("^\\s*(null|\\\\N)?\\s*$", Pattern.CASE_INSENSITIVE);
private static final TermFactory TERM_FACTORY = TermFactory.instance();

private final ArchiveField id;
private final Map<Term, ArchiveField> fields;
protected String[] row;
private final Term rowType;
private final boolean replaceNulls;
private final boolean replaceEntities;

public RecordImpl(ArchiveField id, Collection<ArchiveField> fields, Term rowType, boolean replaceNulls) {
/**
* @param replaceNulls if true record values will have literal nulls replaced with NULL.
* @param replaceEntities if true html & xml entities in record values will be replaced with the interpreted value.
*/
public RecordImpl(ArchiveField id, Collection<ArchiveField> fields, Term rowType, boolean replaceNulls, boolean replaceEntities) {
this.id = id;
this.fields = new HashMap<Term, ArchiveField>();
for (ArchiveField f : fields) {
this.fields.put(f.getTerm(), f);
}
this.rowType = rowType;
this.replaceNulls = replaceNulls;
this.replaceEntities = replaceEntities;
}

public RecordImpl(ArchiveField id, Map<Term, ArchiveField> fields, Term rowType, boolean replaceNulls) {
/**
* @param replaceNulls if true record values will have literal nulls replaced with NULL.
* @param replaceEntities if true html & xml entities in record values will be replaced with the interpreted value.
*/
public RecordImpl(ArchiveField id, Map<Term, ArchiveField> fields, Term rowType, boolean replaceNulls, boolean replaceEntities) {
this.id = id;
this.fields = fields;
this.rowType = rowType;
this.replaceNulls = replaceNulls;
this.replaceEntities = replaceEntities;
}

public RecordImpl(ArchiveFile af, boolean replaceNulls) {
/**
* @param replaceNulls if true record values will have literal nulls replaced with NULL.
* @param replaceEntities if true html & xml entities in record values will be replaced with the interpreted value.
*/
public RecordImpl(ArchiveFile af, boolean replaceNulls, boolean replaceEntities) {
this.id = af.getId();
this.fields = af.getFields();
this.rowType = af.getRowType();
this.replaceNulls = replaceNulls;
}

/**
* Method that replaces common, literal NULL values with real nulls.
* For example you often find "null", "NULL" or "\N" as values in text files.
* This method is not used by the value() methods under the hood to allow access to raw data in case NULL makes
* sense.
*
* @return the input string or null in case its a literal form of NULL
*/
protected String replaceNull(String val) {
if (val == null || NULL_REPL.matcher(val).find()) {
return null;
}
return val;
this.replaceEntities = replaceEntities;
}

public String column(int index) {
if (row.length > index) {
return row[index];
// if requested return column value cleaned
return CleanUtils.clean(row[index], replaceNulls, replaceEntities);
}
return null;
}
Expand Down Expand Up @@ -101,12 +101,12 @@ private String value(ArchiveField f) {
return f.getDefaultValue();
}
String val = column(f.getIndex());
if (StringUtils.trimToNull(val) == null) {
if (StringUtils.isBlank(val)) {
// if column is empty use default value
return f.getDefaultValue();
}
// otherwise return column value, if requested with cleaned nulls
return replaceNulls ? replaceNull(val) : val;
// otherwise return already cleaned column value
return val;
}
return null;
}
Expand Down
15 changes: 11 additions & 4 deletions src/main/java/org/gbif/dwc/record/RecordIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,20 @@ public class RecordIterator implements ClosableIterator<Record> {
private final ArchiveField id;
private final Map<Term, ArchiveField> fields;
private final Term rowType;
private final boolean replaceEntities;
private final boolean replaceNulls;

/**
* @param replaceNulls if true record values will have literal nulls replaced with NULL.
* @param replaceEntities if true html & xml entities in record values will be replaced with the interpreted value.
*/
public RecordIterator(ClosableIterator<String[]> recordSource, ArchiveField id, Map<Term, ArchiveField> fields,
Term rowType, boolean replaceNulls) {
Term rowType, boolean replaceNulls, boolean replaceEntities) {
this.id = id;
this.fields = fields;
this.rowType = rowType;
this.replaceNulls = replaceNulls;
this.replaceEntities = replaceEntities;
closable = recordSource;
if (closable == null) {
Iterator<String[]> empty = Iterators.emptyIterator();
Expand All @@ -60,11 +66,12 @@ public RecordIterator(ClosableIterator<String[]> recordSource, ArchiveField id,

/**
* @param replaceNulls if true record values will have literal nulls replaced with NULL.
* @param replaceEntities if true html & xml entities in record values will be replaced with the interpreted value.
*/
public static RecordIterator build(ArchiveFile source, boolean replaceNulls) {
public static RecordIterator build(ArchiveFile source, boolean replaceNulls, boolean replaceEntities) {
try {
CSVReader csvr = CSVReader.build(source);
return new RecordIterator(csvr, source.getId(), source.getFields(), source.getRowType(), replaceNulls);
return new RecordIterator(csvr, source.getId(), source.getFields(), source.getRowType(), replaceNulls, replaceEntities);
} catch (IOException e) {
LOG.error("Can't open archive file " + source + " for building a record iterator", e);
}
Expand All @@ -84,7 +91,7 @@ public Record next() {
RecordImpl record = null;
try {
// update record with cached row
record = new RecordImpl(id, fields, rowType, replaceNulls);
record = new RecordImpl(id, fields, rowType, replaceNulls, replaceEntities);
String[] row = iter.next();
while (row.length == 0) {
// ignore rows without a single column
Expand Down
24 changes: 13 additions & 11 deletions src/main/java/org/gbif/dwc/text/Archive.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ static class ArchiveDwcIterator implements ClosableIterator<DarwinCoreRecord> {
private final Set<Term> mappedTerms = new HashSet<Term>();

ArchiveDwcIterator(Archive archive) {
record = new RecordImpl(archive.getCore(), true);
record = new RecordImpl(archive.getCore(), true, true);
core = archive.getCore();
// remember used DwC and DC terms
for (DwcTerm term : DwcTerm.values()) {
Expand Down Expand Up @@ -135,19 +135,19 @@ class ArchiveIterator implements ClosableIterator<StarRecordImpl> {
/**
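* @param replaceNulls if true replaces common literal null values in all records
* @param replaceEntities if true replaces html & xml entities in all records with the interpreted value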
*/
ArchiveIterator(Archive archive, boolean replaceNulls) {
ArchiveIterator(Archive archive, boolean replaceNulls, boolean replaceEntities) {
List<Term> rowTypes = new ArrayList<Term>();

try {
if (extensions.isEmpty()) {
// no need to sort
coreIter = RecordIterator.build(archive.getCore(), replaceNulls);
coreIter = RecordIterator.build(archive.getCore(), replaceNulls, replaceEntities);
} else {
// sort data files to align extension records into a single star record
if (!archive.sorted) {
archive.sortFiles();
}
coreIter = buildSortedIterator(archive.getCore(), replaceNulls);
coreIter = buildSortedIterator(archive.getCore(), replaceNulls, replaceEntities);
}
} catch (IOException e) {
LOG.warn("IOException opening core file", e);
Expand All @@ -156,7 +156,8 @@ class ArchiveIterator implements ClosableIterator<StarRecordImpl> {
for (ArchiveFile af : archive.getExtensions()) {
rowTypes.add(af.getRowType());
RecordIterator iter =
extensions.isEmpty() ? RecordIterator.build(af, replaceNulls) : buildSortedIterator(af, replaceNulls);
// extension iterators must get the same null AND entity settings as the core iterator;
// the sorted branch previously passed replaceNulls twice and so ignored replaceEntities
extensions.isEmpty() ? RecordIterator.build(af, replaceNulls, replaceEntities) : buildSortedIterator(af,
replaceNulls, replaceEntities);
closables.add(iter);
extensionIters.put(af.getRowType(), Iterators.peekingIterator(iter));
extensionRecordsSkipped.put(af.getRowType(), 0);
Expand All @@ -165,13 +166,13 @@ class ArchiveIterator implements ClosableIterator<StarRecordImpl> {
rec = new StarRecordImpl(rowTypes);
}

private RecordIterator buildSortedIterator(ArchiveFile af, boolean replaceNulls) {
private RecordIterator buildSortedIterator(ArchiveFile af, boolean replaceNulls, boolean replaceEntities) {
// we need to sort the data files
String original = af.getLocation();
// temporarily modify archive file to create iterator over sorted file
af.getLocations().clear();
af.addLocation(ArchiveFile.getLocationSorted(original));
RecordIterator iter = RecordIterator.build(af, replaceNulls);
RecordIterator iter = RecordIterator.build(af, replaceNulls, replaceEntities);
// revert to original
af.getLocations().clear();
af.addLocation(original);
Expand Down Expand Up @@ -331,18 +332,19 @@ public Map<String, File> getConstituentMetadata() {
}

/**
* @return a complete iterator using star records with all extension records that replace literal null values.
* @return a complete iterator using star records with all extension records that replace literal null values and
* html entities.
*/
public ClosableIterator<StarRecordImpl> iterator() {
return new ArchiveIterator(this, true);
return new ArchiveIterator(this, true, true);
}

/**
* @return a complete iterator using star records with all extension records that are not replacing literal null
* values.
* values or html entities.
*/
public ClosableIterator<StarRecordImpl> iteratorRaw() {
return new ArchiveIterator(this, false);
return new ArchiveIterator(this, false, false);
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/gbif/dwc/text/ArchiveFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ public boolean hasTerm(String term) {
}

public Iterator<Record> iterator() {
return RecordIterator.build(this, true);
return RecordIterator.build(this, true, true);
}

public void setArchive(Archive archive) {
Expand Down
33 changes: 33 additions & 0 deletions src/test/java/org/gbif/dwc/record/CleanUtilsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package org.gbif.dwc.record;

import junit.framework.TestCase;
import org.junit.Test;

/**
 * Tests for {@link CleanUtils#clean(String, boolean, boolean)}.
 * NOTE(review): the class extends the JUnit 3 TestCase while also carrying JUnit 4 @Test
 * annotations - confirm which runner style is intended.
 */
public class CleanUtilsTest extends TestCase {

  /** With both options switched off every value must come back unchanged. */
  @Test
  public void testCleanFalse() throws Exception {
    String[] untouched = {"", " ", " ", ".", "a ", " Me &amp; Bobby McGee"};
    for (int i = 0; i < untouched.length; i++) {
      String raw = untouched[i];
      assertEquals(raw, CleanUtils.clean(raw, false, false));
    }
  }

  /** With both options switched on literal nulls become null and entities are unescaped. */
  @Test
  public void testClean() throws Exception {
    // empty, null, blank and literal null markers all end up as a real null
    String[] nullLike = {"", null, " ", "\\N", "NULL"};
    for (String raw : nullLike) {
      assertNull(CleanUtils.clean(raw, true, true));
    }

    // named entity, surrounding whitespace is kept
    assertEquals(" Me & Bobby McGee", CleanUtils.clean(" Me &amp; Bobby McGee", true, true));

    // decimal and hexadecimal numerical entities
    String[] numerical = {
      "Me &#0038; Bobby McGee",
      "Me &#38; Bobby McGee",
      "Me &#x26; Bobby McGee",
      "Me &#X26; Bobby McGee"
    };
    for (String raw : numerical) {
      assertEquals("Me & Bobby McGee", CleanUtils.clean(raw, true, true));
    }

    // incomplete or broken entities are expected to be left as they are
    String[] broken = {"Me &amp", "Me &amp ;", "Me & amp;"};
    for (String raw : broken) {
      assertEquals(raw, CleanUtils.clean(raw, true, true));
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public void testDefaultValue() {
ArchiveField id = new ArchiveField(0, DwcTerm.taxonID, null, null);
Set<ArchiveField> fields = new HashSet<ArchiveField>();
fields.add(new ArchiveField(1, DwcTerm.datasetName, DATASET, null));
RecordImpl rec = new RecordImpl(id, fields, DwcTerm.Taxon, true);
RecordImpl rec = new RecordImpl(id, fields, DwcTerm.Taxon, true, true);

String[] row = {"5432", "IPNI"};
rec.setRow(row);
Expand Down
8 changes: 4 additions & 4 deletions src/test/java/org/gbif/dwc/record/RecordImplTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ public void testReplaceNull() {
af.setIndex(0);
af.setType(ArchiveField.DataType.string);
List<ArchiveField> fields = Lists.newArrayList(af);
final RecordImpl r = new RecordImpl(null, fields, null, true);
final RecordImpl r2 = new RecordImpl(null, fields, null, false);
final RecordImpl r = new RecordImpl(null, fields, null, true, false);
final RecordImpl r2 = new RecordImpl(null, fields, null, false, false);

String val = setRows(null, r, r2);
assertNull(r.value(t));
Expand Down Expand Up @@ -118,7 +118,7 @@ public void testGetFullScientificName() {
fields.add(new ArchiveField(4, DwcTerm.phylum, null, null));
fields.add(new ArchiveField(6, DwcTerm.class_, null, null));
fields.add(new ArchiveField(7, DwcTerm.acceptedNameUsage, null, null));
RecordImpl rec = new RecordImpl(id, fields, DwcTerm.Taxon, true);
RecordImpl rec = new RecordImpl(id, fields, DwcTerm.Taxon, true, true);

String[] row =
{"5432", "Abies alba Mill.", "Mill.", "Harry", "Monocotyledonae", "Bertram", "Pincodiae", "Picea picaea L."};
Expand All @@ -143,7 +143,7 @@ public void testDefaultValue() {
ArchiveField id = new ArchiveField(0, DwcTerm.taxonID, null, null);
Set<ArchiveField> fields = new HashSet<ArchiveField>();
fields.add(new ArchiveField(1, DwcTerm.datasetName, DATASET, null));
RecordImpl rec = new RecordImpl(id, fields, DwcTerm.Taxon, true);
RecordImpl rec = new RecordImpl(id, fields, DwcTerm.Taxon, true, true);

String[] row = {"5432", "IPNI"};
rec.setRow(row);
Expand Down

0 comments on commit e57c26d

Please sign in to comment.