diff --git a/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapter.java b/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapter.java index ac9687d798..260b8f5fa5 100644 --- a/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapter.java +++ b/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapter.java @@ -1,35 +1,113 @@ package ai.libs.jaicore.ml.core.dataset.serialization; +import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; +import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; import java.util.stream.Collectors; import java.util.stream.IntStream; import org.api4.java.ai.ml.core.dataset.descriptor.IDatasetDescriptor; +import org.api4.java.ai.ml.core.dataset.schema.ILabeledInstanceSchema; import org.api4.java.ai.ml.core.dataset.schema.attribute.IAttribute; import org.api4.java.ai.ml.core.dataset.schema.attribute.ICategoricalAttribute; import org.api4.java.ai.ml.core.dataset.serialization.DatasetDeserializationFailedException; import org.api4.java.ai.ml.core.dataset.serialization.IDatasetDeserializer; import org.api4.java.ai.ml.core.dataset.supervised.ILabeledDataset; import org.api4.java.ai.ml.core.dataset.supervised.ILabeledInstance; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import ai.libs.jaicore.basic.reconstruction.ReconstructionInstruction; +import ai.libs.jaicore.ml.core.dataset.Dataset; +import ai.libs.jaicore.ml.core.dataset.DenseInstance; +import ai.libs.jaicore.ml.core.dataset.schema.LabeledInstanceSchema; +import ai.libs.jaicore.ml.core.dataset.schema.attribute.IntBasedCategoricalAttribute; +import ai.libs.jaicore.ml.core.dataset.schema.attribute.NumericAttribute; +import 
ai.libs.jaicore.ml.core.dataset.schema.attribute.StringAttribute; public class CSVDatasetAdapter implements IDatasetDeserializer> { + /** Column kinds that are assigned to each CSV column before attributes are built. */ + private enum ColumnType { + Numeric, String, Categorical + } + + private final Logger logger = LoggerFactory.getLogger(getClass()); + public CSVDatasetAdapter() { // nothing to do here } - + + /** Reads the CSV file named by a {@link CSVFileDatasetDescriptor} into a labeled dataset: the first line is taken as the header, ignored columns are dropped, a type is computed per column, and the label column is separated from the value attributes. @param datasetDescriptor must be a CSVFileDatasetDescriptor @throws IllegalArgumentException for any other descriptor type @throws DatasetDeserializationFailedException if the file cannot be read, is empty, or does not contain the label column */ @Override - public ILabeledDataset deserializeDataset(final IDatasetDescriptor datasetDescription) throws DatasetDeserializationFailedException, InterruptedException { - throw new UnsupportedOperationException("Not yet supported!"); + public ILabeledDataset deserializeDataset(final IDatasetDescriptor datasetDescriptor) throws DatasetDeserializationFailedException, InterruptedException { + if (!(datasetDescriptor instanceof CSVFileDatasetDescriptor)) { + /* String.format expects %s; an SLF4J-style {} would be printed literally */ + throw new IllegalArgumentException(String.format("Only %ss supported", CSVFileDatasetDescriptor.class.getSimpleName())); + } + + CSVFileDatasetDescriptor descriptor = (CSVFileDatasetDescriptor) datasetDescriptor; + File csvFile = new File(descriptor.getCsvFile()); + + List columnNames; + List> data; + /* try-with-resources closes the reader on every path */ + try (BufferedReader reader = new BufferedReader(new FileReader(csvFile))) { + String header = reader.readLine(); + if (header == null) { + /* an empty file has no header line; report it instead of failing with a NullPointerException */ + throw new DatasetDeserializationFailedException("CSV file is empty: " + csvFile.getAbsolutePath()); + } + String[] allColumnNames = header.split(",", -1); + + List ignoredColumnIndices = computeIgnoredColumnIndices(allColumnNames, descriptor.getIgnoredColumns()); + columnNames = filterValues(allColumnNames, ignoredColumnIndices); + data = readAndFilterData(reader, ignoredColumnIndices, allColumnNames.length); + } catch (IOException e) { + throw new DatasetDeserializationFailedException(e); + } + + List columnTypes = computeColumnTypes(columnNames, descriptor.getCategoricalColumns(), data); + int labelColumnIndex = computeLabelColumnIndex(columnNames, descriptor.getLabelColumn()); + + List allAttributes = computeAttributes(columnNames, columnTypes, data); + + List valueAttributes = new ArrayList<>(allAttributes); + IAttribute labelAttribute = 
valueAttributes.remove(labelColumnIndex); + ILabeledInstanceSchema schema = new LabeledInstanceSchema(csvFile.getName(), valueAttributes, labelAttribute); + + Dataset dataset = new Dataset(schema); + for (int row = 0; row < data.size(); row++) { + /* set(row, null) returns the raw row and drops the list's reference to it in constant time; remove(0) shifted all remaining rows on every call */ + dataset.add(createLabeledInstance(allAttributes, labelColumnIndex, data.set(row, null))); + } + data.clear(); + dataset.addInstruction(createReconstructionInstruction(descriptor)); + + logger.info("Successfully created dataset from CSV, {} columns, {} rows, file: {}", columnNames.size(), dataset.size(), csvFile.getAbsolutePath()); + logger.debug("Column types:"); + for (IAttribute attribute : allAttributes) { + logger.debug(" {}: {}", attribute.getName(), attribute.getClass().getSimpleName()); + } + + return dataset; + } + + /** Reads a CSV file with the given label column, passing empty lists for the categorical and ignored columns. */ + public static ILabeledDataset readDataset(final String csvInputFile, final String labelColumn) throws DatasetDeserializationFailedException, InterruptedException { + return readDataset(csvInputFile, labelColumn, new ArrayList<>()); } - public static ILabeledDataset readDataset(final File csvInputFile) { - throw new UnsupportedOperationException("Not yet supported!"); + /** Reads a CSV file with the given label and categorical columns, passing an empty list for the ignored columns. */ + public static ILabeledDataset readDataset(final String csvInputFile, final String labelColumn, List categoricalColumns) throws DatasetDeserializationFailedException, InterruptedException { + return readDataset(csvInputFile, labelColumn, categoricalColumns, new ArrayList<>()); } + /** Wraps the arguments in a CSVFileDatasetDescriptor and deserializes it with a new CSVDatasetAdapter. @param csvInputFile path of the CSV file @param labelColumn name of the label column @param categoricalColumns names of the columns to treat as categorical @param ignoredColumns names of the columns to drop */ + public static ILabeledDataset readDataset(final String csvInputFile, final String labelColumn, List categoricalColumns, List ignoredColumns) throws DatasetDeserializationFailedException, InterruptedException { + CSVFileDatasetDescriptor desc = new CSVFileDatasetDescriptor(csvInputFile, labelColumn, categoricalColumns, ignoredColumns); + return new CSVDatasetAdapter().deserializeDataset(desc); + } + public static void writeDataset(final File arffOutputFile, final ILabeledDataset data) throws IOException { try (BufferedWriter bw = new BufferedWriter(new FileWriter(arffOutputFile))) { // write header 
line for csv @@ -64,4 +142,162 @@ private static String serializeAttributeValue(final IAttribute att, final Object return value; } -} + /** Returns the positions of the header entries whose normalized name is contained in ignoredColumns. */ + private List computeIgnoredColumnIndices(String[] allColumnNames, List ignoredColumns) { + ArrayList ignoredColumnIndices = new ArrayList<>(); + for (int col = 0; col < allColumnNames.length; col++) { + String columnName = normalize(allColumnNames[col]); + if (ignoredColumns.contains(columnName)) { + ignoredColumnIndices.add(col); + } + } + return ignoredColumnIndices; + } + + /** Returns the normalized entries of allValues, leaving out the positions listed in ignoredColumnIndices. */ + private List filterValues(String[] allValues, List ignoredColumnIndices) { + ArrayList values = new ArrayList<>(); + for (int col = 0; col < allValues.length; col++) { + if (!ignoredColumnIndices.contains(col)) { + values.add(normalize(allValues[col])); + } + } + return values; + } + + /** Reads all remaining lines from reader and splits each on commas (trailing empty fields are kept by the -1 limit). Lines whose field count differs from targetValueNumber are skipped with a warning. The line counter used in the warning starts at 2. */ + private List> readAndFilterData(BufferedReader reader, List ignoredColumnIndices, int targetValueNumber) + throws IOException { + ArrayList> data = new ArrayList<>(); + String line = reader.readLine(); + int lineNumber = 2; + while (line != null) { + String[] allValues = line.split(",", -1); + if (allValues.length == targetValueNumber) { + List values = filterValues(allValues, ignoredColumnIndices); + data.add(values); + } else { + logger.warn("Ignored line {}: should have {} elements, but has {}", lineNumber, targetValueNumber, allValues.length); + } + + line = reader.readLine(); + lineNumber++; + } + return data; + } + + /** Assigns a type to every column: Categorical when the name is listed in categoricalColumns, otherwise Numeric unless some non-blank value is rejected by Double.parseDouble, in which case String. */ + private List computeColumnTypes(List columnNames, List categoricalColumns, List> data) { + ArrayList columnTypes = new ArrayList<>(); + for (int col = 0; col < columnNames.size(); col++) { + ColumnType columnType; + if (categoricalColumns.contains(columnNames.get(col))) { + columnType = ColumnType.Categorical; + } else { + columnType = ColumnType.Numeric; + for (List values : data) { + String value = values.get(col); + if (!value.trim().isEmpty()) { + try { + Double.parseDouble(value); + } catch (NumberFormatException e) { + columnType = ColumnType.String; + 
break; + } + } + } + } + columnTypes.add(columnType); + } + + return columnTypes; + } + + /** Returns the position of labelColumn within columnNames. @throws DatasetDeserializationFailedException if no column carries exactly that name */ + private int computeLabelColumnIndex(List columnNames, String labelColumn) throws DatasetDeserializationFailedException { + for (int col = 0; col < columnNames.size(); col++) { + if (columnNames.get(col).equals(labelColumn)) { + return col; + } + } + + /* String.format expects %s; with SLF4J-style {} the label and column names never appeared in the message */ + String msg = String.format("Label column %s not found, columns: %s", labelColumn, String.join(", ", columnNames)); + throw new DatasetDeserializationFailedException(msg); + } + + /** Builds one attribute per column from its name and computed type; categorical attributes get the distinct values of their column as domain. */ + private List computeAttributes(List columnNames, List columnTypes, + List> data) { + List attributes = new ArrayList<>(); + for (int col = 0; col < columnTypes.size(); col++) { + switch (columnTypes.get(col)) { + case Numeric: + attributes.add(new NumericAttribute(columnNames.get(col))); + break; + case String: + attributes.add(new StringAttribute(columnNames.get(col))); + break; + case Categorical: + List domain = computeDistinctValues(data, col); + attributes.add(new IntBasedCategoricalAttribute(columnNames.get(col), domain)); + break; + default: + throw new RuntimeException("Unsupported literal: " + columnTypes.get(col)); + } + } + return attributes; + } + + /** Returns the distinct non-blank values found in the given column, in no particular order. */ + private List computeDistinctValues(List> data, int column) { + HashSet distinctValues = new HashSet<>(); + for (List values : data) { + String value = values.get(column); + /* blank cells are missing values (computeValue turns them into null), so they must not become a category */ + if (!value.trim().isEmpty()) { + distinctValues.add(value); + } + } + return new ArrayList(distinctValues); + } + + /** Converts one row of raw strings into an instance: every column except the label column becomes an attribute value, the label column becomes the label. */ + private ILabeledInstance createLabeledInstance(List allAttributes, int labelColumnIndex, + List stringValues) { + ArrayList values = new ArrayList<>(); + for (int col = 0; col < stringValues.size(); col++) { + if (col != labelColumnIndex) { + IAttribute attribute = allAttributes.get(col); + String stringValue = stringValues.get(col); + values.add(computeValue(attribute, stringValue)); + } + } + + IAttribute attribute = allAttributes.get(labelColumnIndex); + String stringValue = stringValues.get(labelColumnIndex); + Object labelValue = computeValue(attribute, stringValue); + 
+ return new DenseInstance(values, labelValue); + } + + /** Converts a raw cell into an attribute value. A blank cell yields the empty string for a StringAttribute and null for every other attribute; any other cell is passed to the attribute's deserializeAttributeValue. */ + private Object computeValue(IAttribute attribute, String stringValue) { + Object value; + if (stringValue.trim().isEmpty()) { + if (attribute instanceof StringAttribute) { + value = ""; + } else { + value = null; + } + } else { + value = attribute.deserializeAttributeValue(stringValue); + } + return value; + } + + /** Creates the instruction that records how the dataset was read: the public four-argument readDataset method, looked up reflectively, together with the descriptor's four values as arguments. @throws DatasetDeserializationFailedException if the method lookup fails */ + private ReconstructionInstruction createReconstructionInstruction(CSVFileDatasetDescriptor descriptor) throws DatasetDeserializationFailedException { + try { + Method method = getClass().getMethod("readDataset", String.class, String.class, List.class, List.class); + Object[] args = new Object[] { + descriptor.getCsvFile(), descriptor.getLabelColumn(), + descriptor.getCategoricalColumns(), descriptor.getIgnoredColumns()}; + return new ReconstructionInstruction(method, args); + } catch (NoSuchMethodException | SecurityException e) { + throw new DatasetDeserializationFailedException(e); + } + } + + /** Strips one pair of enclosing double quotes from s, if present; any other string is returned unchanged. */ + private String normalize(String s) { + /* the length check keeps a field that is a single quote character from matching as both start and end, which made substring(1, 0) throw */ + if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { + s = s.substring(1, s.length() - 1); + } + return s; + } +} \ No newline at end of file diff --git a/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVFileDatasetDescriptor.java b/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVFileDatasetDescriptor.java new file mode 100644 index 0000000000..aae7c8204b --- /dev/null +++ b/JAICore/jaicore-ml/src/main/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVFileDatasetDescriptor.java @@ -0,0 +1,52 @@ +package ai.libs.jaicore.ml.core.dataset.serialization; + +import java.util.List; + +import org.api4.java.ai.ml.core.dataset.descriptor.IDatasetDescriptor; + +import com.google.gson.Gson; + +/** Describes a CSV dataset by file path, label column, categorical columns and ignored columns. */ +class CSVFileDatasetDescriptor implements IDatasetDescriptor { + + private final String csvFile; + private final String labelColumn; + private final List categoricalColumns; + private final List ignoredColumns; + + public 
CSVFileDatasetDescriptor(String csvFile, String labelColumn, + List categoricalColumns, List ignoredColumns) { + this.csvFile = csvFile; + this.labelColumn = labelColumn; + this.categoricalColumns = categoricalColumns; + this.ignoredColumns = ignoredColumns; + } + + /** Creates a descriptor from its JSON form: the string is parsed with Gson into an object of this class and its four fields are copied into this instance. */ + public CSVFileDatasetDescriptor(String json) { + CSVFileDatasetDescriptor desc = new Gson().fromJson(json, getClass()); + this.csvFile = desc.csvFile; + this.labelColumn = desc.labelColumn; + this.categoricalColumns = desc.categoricalColumns; + this.ignoredColumns = desc.ignoredColumns; + } + + /** Returns the stored CSV file string. */ + public String getCsvFile() { + return this.csvFile; + } + + /** Returns the stored label column name. */ + public String getLabelColumn() { + return this.labelColumn; + } + + /** Returns the stored list of categorical column names itself, not a copy. */ + public List getCategoricalColumns() { + return this.categoricalColumns; + } + + /** Returns the stored list of ignored column names itself, not a copy. */ + public List getIgnoredColumns() { + return this.ignoredColumns; + } + + /** Returns this descriptor serialized to JSON with Gson. */ + @Override + public String getDatasetDescription() { + return new Gson().toJson(this); + } +} \ No newline at end of file diff --git a/JAICore/jaicore-ml/src/test/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapterTest.java b/JAICore/jaicore-ml/src/test/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapterTest.java index fd7e030f7e..958259de88 100644 --- a/JAICore/jaicore-ml/src/test/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapterTest.java +++ b/JAICore/jaicore-ml/src/test/java/ai/libs/jaicore/ml/core/dataset/serialization/CSVDatasetAdapterTest.java @@ -1,15 +1,28 @@ package ai.libs.jaicore.ml.core.dataset.serialization; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import org.api4.java.ai.ml.core.dataset.schema.ILabeledInstanceSchema; +import org.api4.java.ai.ml.core.dataset.schema.attribute.IAttribute; import 
org.api4.java.ai.ml.core.dataset.serialization.DatasetDeserializationFailedException; import org.api4.java.ai.ml.core.dataset.supervised.ILabeledDataset; import org.api4.java.ai.ml.core.dataset.supervised.ILabeledInstance; +import org.api4.java.ai.ml.core.exception.DatasetCreationException; import org.junit.jupiter.api.Test; +import ai.libs.jaicore.ml.core.dataset.schema.attribute.IntBasedCategoricalAttribute; +import ai.libs.jaicore.ml.core.dataset.schema.attribute.NumericAttribute; +import ai.libs.jaicore.ml.core.dataset.schema.attribute.StringAttribute; + public class CSVDatasetAdapterTest { @Test @@ -20,4 +33,140 @@ public void testWriteDataset() throws DatasetDeserializationFailedException, IOE assertTrue(f.exists()); } + /** Loads OpenML_42731.csv with "price" as label and runs assertDataSetOk on both the loaded dataset and the result of createCopy(). */ + @Test + public void testReadDataset_OpenML() + throws DatasetDeserializationFailedException, DatasetCreationException, InterruptedException { + File testCsvFile = new File("testrsc/dataset/csv/OpenML_42731.csv"); + List categoricalColumns = Arrays.asList("waterfront", "view", "condition", "grade", "zipcode"); + List ignoredColumns = Arrays.asList("id"); + + ILabeledDataset dataset = CSVDatasetAdapter.readDataset(testCsvFile.getAbsolutePath(), + "price", categoricalColumns, ignoredColumns); + + assertDataSetOk(dataset, categoricalColumns, ignoredColumns); + assertDataSetOk((ILabeledDataset) dataset.createCopy(), categoricalColumns, ignoredColumns); + } + + /** Loads another_housing.csv with "price" as label and runs assertDataSetOk on the loaded dataset. */ + @Test + public void testReadDataset_Custom() + throws DatasetDeserializationFailedException, DatasetCreationException, InterruptedException { + File testCsvFile = new File("testrsc/dataset/csv/another_housing.csv"); + List categoricalColumns = Arrays.asList("waterfront", "view", "condition", "grade", "zipcode"); + List ignoredColumns = Arrays.asList("id"); + + ILabeledDataset dataset = CSVDatasetAdapter.readDataset(testCsvFile.getAbsolutePath(), + "price", categoricalColumns, ignoredColumns); + + assertDataSetOk(dataset, categoricalColumns, ignoredColumns); + 
assertDataSetOk((ILabeledDataset) dataset.createCopy(), categoricalColumns, ignoredColumns); + + IAttribute dateAttribute = findAttribute(dataset.getInstanceSchema(), "date"); + assertNotNull(dateAttribute); + assertTrue(dateAttribute instanceof StringAttribute); + } + + /** Loads OpenML_42731_missing_values.csv and checks, for selected rows, that the label is reported absent and that numeric and categorical attribute values are null. */ + @Test + public void testReadDataset_MissingValues() + throws DatasetDeserializationFailedException, DatasetCreationException, InterruptedException { + File testCsvFile = new File("testrsc/dataset/csv/OpenML_42731_missing_values.csv"); + List categoricalColumns = Arrays.asList("waterfront", "view", "condition", "grade", "zipcode"); + List ignoredColumns = Arrays.asList("id"); + + ILabeledDataset dataset = CSVDatasetAdapter.readDataset(testCsvFile.getAbsolutePath(), + "price", categoricalColumns, ignoredColumns); + + assertDataSetOk(dataset, categoricalColumns, ignoredColumns); + + assertTrue(dataset.getInstanceSchema().getLabelAttribute() instanceof NumericAttribute); + assertFalse(dataset.get(0).isLabelPresent()); + assertTrue(dataset.get(1).isLabelPresent()); + assertTrue(dataset.get(2).isLabelPresent()); + assertFalse(dataset.get(3).isLabelPresent()); + assertFalse(dataset.get(4).isLabelPresent()); + + int index = getIndexOfColumn("bedrooms", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof NumericAttribute); + assertNull(dataset.get(0).getAttributeValue(index)); + + index = getIndexOfColumn("bathrooms", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof NumericAttribute); + assertNull(dataset.get(2).getAttributeValue(index)); + + index = getIndexOfColumn("sqft_living", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof NumericAttribute); + assertNull(dataset.get(4).getAttributeValue(index)); + + index = getIndexOfColumn("waterfront", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof IntBasedCategoricalAttribute); + assertNull(dataset.get(0).getAttributeValue(index)); + 
index = getIndexOfColumn("view", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof IntBasedCategoricalAttribute); + assertNull(dataset.get(2).getAttributeValue(index)); + + index = getIndexOfColumn("condition", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof IntBasedCategoricalAttribute); + assertNull(dataset.get(4).getAttributeValue(index)); + + index = getIndexOfColumn("string_1", dataset); + assertTrue(dataset.getInstanceSchema().getAttribute(index) instanceof StringAttribute); + assertEquals("", dataset.get(0).getAttributeValue(index)); + assertEquals("foo", dataset.get(1).getAttributeValue(index)); + assertEquals("bar", dataset.get(2).getAttributeValue(index)); + assertEquals("", dataset.get(3).getAttributeValue(index)); + assertEquals("", dataset.get(4).getAttributeValue(index)); + } + + /** Loads OpenML_42731_invalid_data.csv and expects the resulting dataset to contain 3 instances. */ + @Test + public void testReadDataset_InvalidLines() + throws DatasetDeserializationFailedException, DatasetCreationException, InterruptedException { + File testCsvFile = new File("testrsc/dataset/csv/OpenML_42731_invalid_data.csv"); + List categoricalColumns = Arrays.asList("waterfront", "view", "condition", "grade", "zipcode"); + List ignoredColumns = Arrays.asList("id"); + + ILabeledDataset dataset = CSVDatasetAdapter.readDataset(testCsvFile.getAbsolutePath(), + "price", categoricalColumns, ignoredColumns); + + assertEquals(3, dataset.size()); + } + + /** Asserts that the dataset has 5 instances, a NumericAttribute label, an IntBasedCategoricalAttribute for every categorical column, and no attribute for any ignored column. */ + private void assertDataSetOk(ILabeledDataset dataset, List categoricalColumns, + List ignoredColumns) { + assertEquals(5, dataset.size()); + ILabeledInstanceSchema schema = dataset.getInstanceSchema(); + + IAttribute labelAttribute = schema.getLabelAttribute(); + assertNotNull(labelAttribute); + assertTrue(labelAttribute instanceof NumericAttribute); + + for (String categoricalColumn : categoricalColumns) { + IAttribute attribute = findAttribute(schema, categoricalColumn); + assertNotNull(attribute); + assertTrue(attribute instanceof 
IntBasedCategoricalAttribute); + } + for (String ignoredColumn : ignoredColumns) { + IAttribute attribute = findAttribute(schema, ignoredColumn); + assertNull(attribute); + } + } + + /** Returns the first attribute in schema.getAttributeList() whose name equals the given name, or null if none matches. */ + private IAttribute findAttribute(ILabeledInstanceSchema schema, String name) { + for (IAttribute attribute : schema.getAttributeList()) { + if (attribute.getName().equals(name)) { + return attribute; + } + } + return null; + } + + /** Returns the position of the attribute with the given name within the schema's getAttributeList(), or -1 if none matches. */ + private int getIndexOfColumn(String name, ILabeledDataset dataset) { + List attributes = dataset.getInstanceSchema().getAttributeList(); + for (int i = 0; i < attributes.size(); i++) { + if (attributes.get(i).getName().equals(name)) { + return i; + } + } + return -1; + } + } diff --git a/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731.csv b/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731.csv new file mode 100644 index 0000000000..49e4591a0f --- /dev/null +++ b/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731.csv @@ -0,0 +1,6 @@ +"id","price","bedrooms","bathrooms","sqft_living","sqft_lot","floors","waterfront","view","condition","grade","sqft_above","sqft_basement","yr_built","yr_renovated","zipcode","lat","long","sqft_living15","sqft_lot15","date_year","date_month","date_day" +7129300520,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,10,13 +6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721000000000004,-122.319,1690,7639,2014,12,9 +5631500400,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.23299999999999,2720,8062,2015,2,25 +2487200875,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.39299999999999,1360,5000,2014,12,9 +1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015,2,18 diff --git a/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731_invalid_data.csv b/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731_invalid_data.csv new file mode 100644 index 0000000000..d201ec57e7 --- 
/dev/null +++ b/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731_invalid_data.csv @@ -0,0 +1,6 @@ +"id","price","bedrooms","bathrooms","sqft_living","sqft_lot","floors","waterfront","view","condition","grade","sqft_above","sqft_basement","yr_built","yr_renovated","zipcode","lat","long","sqft_living15","sqft_lot15","date_year","date_month","date_day" +7129300520,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,10,13 +6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721000000000004,-122.319,1690,7639,2014,12,9, +5631500400,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.23299999999999,2720,8062,2015,2 +2487200875,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.39299999999999,1360,5000,2014,12,9 +1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015,2,18 diff --git a/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731_missing_values.csv b/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731_missing_values.csv new file mode 100644 index 0000000000..4db52d3360 --- /dev/null +++ b/JAICore/jaicore-ml/testrsc/dataset/csv/OpenML_42731_missing_values.csv @@ -0,0 +1,6 @@ +"id","price","bedrooms","bathrooms","sqft_living","sqft_lot","floors","waterfront","view","condition","grade","sqft_above","sqft_basement","yr_built","yr_renovated","zipcode","lat","long","sqft_living15","sqft_lot15","date_year","date_month","date_day","string_1" +7129300520,,,1.0,1180,5650,1.0,,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,10,13, +6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721000000000004,-122.319,1690,7639,2014,12,9,foo +5631500400,180000.0,2,,770,10000,1.0,0,,3,6,770,0,1933,0,98028,47.7379,-122.23299999999999,2720,8062,2015,2,25,"bar" +2487200875,,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.39299999999999,1360,5000,2014,12,9, 
+1954400510,,3,2.0,,8080,1.0,0,0,,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015,2,18, diff --git a/JAICore/jaicore-ml/testrsc/dataset/csv/another_housing.csv b/JAICore/jaicore-ml/testrsc/dataset/csv/another_housing.csv new file mode 100644 index 0000000000..b6a0ec3a04 --- /dev/null +++ b/JAICore/jaicore-ml/testrsc/dataset/csv/another_housing.csv @@ -0,0 +1,6 @@ +id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15 +"7129300520","20141013T000000",221900,3,1,1180,5650,"1",0,0,3,7,1180,0,1955,0,"98178",47.5112,-122.257,1340,5650 +"6414100192","20141209T000000",538000,3,2.25,2570,7242,"2",0,0,3,7,2170,400,1951,1991,"98125",47.721,-122.319,1690,7639 +"5631500400","20150225T000000",180000,2,1,770,10000,"1",0,0,3,6,770,0,1933,0,"98028",47.7379,-122.233,2720,8062 +"2487200875","20141209T000000",604000,4,3,1960,5000,"1",0,0,5,7,1050,910,1965,0,"98136",47.5208,-122.393,1360,5000 +"1954400510","20150218T000000",510000,3,2,1680,8080,"1",0,0,3,8,1680,0,1987,0,"98074",47.6168,-122.045,1800,7503