diff --git a/README.rst b/README.rst index a4bd3241..89b1741c 100644 --- a/README.rst +++ b/README.rst @@ -111,6 +111,9 @@ Example Float ``DoubleColumn`` ``d`` Example boolean ``BoolColumn`` ``b`` =============== ================= ==================== +In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the +``omero metadata populate`` command, numeric columns containing missing values will be detected as ``DoubleColumn`` and the +missing values will be stored as NaN. However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index edce60b6..3836051b 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -241,11 +241,13 @@ def _configure(self, parser): populate.add_argument("--localcfg", help=( "Local configuration file or a JSON object string")) - populate.add_argument("--allow_nan", action="store_true", help=( - "Allow empty values to become Nan in Long or Double columns")) + populate.add_argument( + "--allow-nan", "--allow_nan", action="store_true", help=( + "Allow empty values to become Nan in Long or Double columns")) - populate.add_argument("--manual_header", action="store_true", help=( - "Disable automatic header detection during population")) + populate.add_argument( + "--manual-header", "--manual_header", action="store_true", help=( + "Disable automatic header detection during population")) populateroi.add_argument( "--measurement", type=int, default=None, @@ -489,7 +491,7 @@ def testtables(self, args): self.ctx.die(100, "Failed to initialize Table") @staticmethod - def detect_headers(csv_path): + def detect_headers(csv_path, keep_default_na=True): ''' Function to automatically detect 
headers from a CSV file. This function loads the table to pandas to detects the column type and match headers @@ -497,7 +499,7 @@ def detect_headers(csv_path): conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi'] headers = [] - table = pd.read_csv(csv_path) + table = pd.read_csv(csv_path, keep_default_na=keep_default_na) col_types = table.dtypes.values.tolist() cols = list(table.columns) @@ -577,7 +579,8 @@ def populate(self, args): if not args.manual_header and \ not first_row[0].str.contains('# header').bool(): omero_metadata.populate.log.info("Detecting header types") - header_type = MetadataControl.detect_headers(args.file) + header_type = MetadataControl.detect_headers( + args.file, keep_default_na=args.allow_nan) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") else: diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py old mode 100644 new mode 100755 index 619bc968..42af38e7 --- a/test/integration/metadata/test_populate.py +++ b/test/integration/metadata/test_populate.py @@ -175,6 +175,13 @@ def assert_columns(self, columns): col_names = "Well,Well Type,Concentration,Well Name" assert col_names == ",".join([c.name for c in columns]) + def assert_values(self, row_values): + # Unsure where the lower-casing is happening + if "A1" in row_values or "a1" in row_values: + assert "Control" in row_values + elif "A2" in row_values or "a2" in row_values: + assert "Treatment" in row_values + def assert_child_annotations(self, oas): for ma, wid, wr, wc in oas: assert isinstance(ma, MapAnnotationI) @@ -767,6 +774,14 @@ def assert_columns(self, columns): def assert_row_count(self, rows): assert rows == len(self.roi_names) + def assert_values(self, row_values): + if "roi1" in row_values: + assert 0.5 in row_values + assert 100 in row_values + elif "roi2" in row_values: + assert 'nan' in [str(value) for value in row_values] + assert 200 in row_values + def get_target(self): if not 
self.image: image = self.test.make_image() @@ -1218,17 +1233,7 @@ def _assert_parsing_context_values(self, t, fixture): row_values = [col.values[0] for col in t.read( list(range(len(cols))), hit, hit+1).columns] assert len(row_values) == fixture.count - # Unsure where the lower-casing is happening - if "A1" in row_values or "a1" in row_values: - assert "Control" in row_values - elif "A2" in row_values or "a2" in row_values: - assert "Treatment" in row_values - elif "roi1" in row_values: - assert 0.5 in row_values - assert 100 in row_values - elif "roi2" in row_values: - assert 'nan' in [str(value) for value in row_values] - assert 200 in row_values + fixture.assert_values(row_values) def _test_bulk_to_map_annotation_context(self, fixture, batch_size): # self._testPopulateMetadataPlate() diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py old mode 100644 new mode 100755 index 3e553525..2fd0bcc7 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -17,43 +17,83 @@ StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn -def test_detect_headers(): - ''' - Test of the default automatic column type detection behaviour - ''' - d = { - 'measurement 1': [11, 22, 33], - 'measurement 2': [0.1, 0.2, 0.3], - 'measurement 3': ['a', 'b', 'c'], - 'measurement 4': [True, True, False], - 'measurement 5': [11, 0.1, True] - } - prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] - # Create a dictionary with every combination of headers - # eg plate_name/platename/plate name/plate_id/plateid/plate id - for prefix in prefix_list: - d[f'{prefix}_name'] = ['a', 'b', 'c'] - d[f'{prefix} name'] = ['a', 'b', 'c'] - d[f'{prefix}name'] = ['a', 'b', 'c'] - d[f'{prefix}_id'] = [1, 2, 3] - d[f'{prefix} id'] = [1, 2, 3] - d[f'{prefix}id'] = [1, 2, 3] - d[f'{prefix}'] = [1, 2, 3] - - df = pd.DataFrame(data=d) - tmp = tempfile.NamedTemporaryFile() - df.to_csv(tmp.name, index=False) - header = 
MetadataControl.detect_headers(tmp.name) - expected_header = [ - 'l', 'd', 's', 'b', 's', - 's', 's', 's', 'l', 'l', 'l', 'l', - 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', - 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', - 'well', 'well', 'well', 'l', 'l', 'l', 'well', - 's', 's', 's', 'image', 'image', 'image', 'image', - 's', 's', 's', 'roi', 'roi', 'roi', 'roi' - ] - assert header == expected_header +class TestDetectHeaders: + """Test the MetadataControl.detect_headers API""" + def assert_detect_headers(self, **kwargs): + df = pd.DataFrame(data=self.d) + tmp = tempfile.NamedTemporaryFile() + df.to_csv(tmp.name, index=False) + header = MetadataControl.detect_headers(tmp.name, **kwargs) + assert header == self.expected_header + + def create_objects_dictionary(self): + # Create a dictionary with every combination of headers + # eg plate_name/platename/plate name/plate_id/plateid/plate id + self.d = {} + prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] + for prefix in prefix_list: + self.d[f'{prefix}_name'] = ['a', 'b', 'c'] + self.d[f'{prefix} name'] = ['a', 'b', 'c'] + self.d[f'{prefix}name'] = ['a', 'b', 'c'] + self.d[f'{prefix}_id'] = [1, 2, 3] + self.d[f'{prefix} id'] = [1, 2, 3] + self.d[f'{prefix}id'] = [1, 2, 3] + self.d[f'{prefix}'] = [1, 2, 3] + self.expected_header = [ + 's', 's', 's', 'l', 'l', 'l', 'l', + 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', + 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', + 'well', 'well', 'well', 'l', 'l', 'l', 'well', + 's', 's', 's', 'image', 'image', 'image', 'image', + 's', 's', 's', 'roi', 'roi', 'roi', 'roi' + ] + + def test_objects_columns(self): + self.create_objects_dictionary() + self.assert_detect_headers() + + def test_dense_columns(self): + ''' + Test of the default automatic column type detection behaviour + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, 22, 33], + 'measurement 2': [0.1, 0.2, 0.3], + 'measurement 3': 
['a', 'b', 'c'], + 'measurement 4': [True, True, False], + 'measurement 5': [11, 0.1, True] + }) + self.expected_header.extend(['l', 'd', 's', 'b', 's']) + self.assert_detect_headers() + + def test_sparse_default_na(self): + ''' + Test default handling of missing values + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['d', 'd', 's', 's']) + self.assert_detect_headers(keep_default_na=True) + + def test_sparse_no_default_na(self): + ''' + Test handling of missing values as string columns + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['s', 's', 's', 's']) + self.assert_detect_headers(keep_default_na=False) class TestColumnTypes: