From 464680f191059e1551d9b185b5b79d9145f5d248 Mon Sep 17 00:00:00 2001
From: William Moore <w.moore@dundee.ac.uk>
Date: Thu, 16 Dec 2021 14:54:00 +0000
Subject: [PATCH 1/6] Support images_by_name for Screen and Plate

---
 src/omero_metadata/populate.py | 78 ++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py
index b6b68a39..ab5ed502 100644
--- a/src/omero_metadata/populate.py
+++ b/src/omero_metadata/populate.py
@@ -27,7 +27,7 @@
 from builtins import chr
 from builtins import str
 from builtins import range
-from future.utils import native_str
+from future.utils import isidentifier, native_str
 from past.builtins import basestring
 from builtins import object
 import logging
@@ -225,11 +225,6 @@ def create_columns(self):
     def columns_sanity_check(self, columns):
         column_types = [column.__class__ for column in columns]
         column_names = [column.name for column in columns]
-        if WellColumn in column_types and ImageColumn in column_types:
-            log.debug(column_types)
-            raise MetadataError(
-                ('Well Column and Image Column cannot be resolved at '
-                 'the same time. Pick one.'))
         if RoiColumn in column_types and ROI_NAME_COLUMN in column_names:
             log.debug('Found both ROI names and IDs. Not appending either.')
             return False
@@ -382,8 +377,8 @@ def get_well_name(self, well_id, plate=None):
         row = self.AS_ALPHA[row]
         return '%s%d' % (row, col + 1)
 
-    def get_image_id_by_name(self, iname, dname=None):
-        return self.wrapper.get_image_id_by_name(iname, dname)
+    def get_image_id_by_name(self, iname, did=None):
+        return self.wrapper.get_image_id_by_name(iname, did)
 
     def get_image_name_by_id(self, iid, pid=None):
         return self.wrapper.get_image_name_by_id(iid, pid)
@@ -568,7 +563,7 @@ def get_image_name_by_id(self, iid, pid=None):
             raise Exception("Cannot resolve image to plate")
         return self.images_by_id[pid][iid].name.val
 
-    def parse_plate(self, plate, wells_by_location, wells_by_id, images_by_id):
+    def parse_plate(self, plate, wells_by_location, wells_by_id, images_by_id, images_by_name):
         """
         Accepts PlateData instances
         """
@@ -588,6 +583,7 @@ def parse_plate(self, plate, wells_by_location, wells_by_id, images_by_id):
             for well_sample in well.well_samples:
                 image = well_sample.image
                 images_by_id[image.id.val] = image
+                images_by_name[image.name.val] = image
         log.debug('Completed parsing plate: %s' % plate.name.val)
         for row in wells_by_location:
             log.debug('%s: %r' % (row, list(wells_by_location[row].keys())))
@@ -632,6 +628,9 @@ def __init__(self, value_resolver):
         super(ScreenWrapper, self).__init__(value_resolver)
         self._load()
 
+    def get_image_id_by_name(self, iname, pid=None):
+        return self.images_by_name[pid][iname].id.val
+
     def get_plate_name_by_id(self, plate):
         plate = self.plates_by_id[plate]
         return plate.name.val
@@ -660,6 +659,7 @@ def _load(self):
         if self.target_object is None:
             raise MetadataError('Could not find target object!')
         self.target_name = unwrap(self.target_object.getName())
+        self.images_by_name = dict()
         self.images_by_id = dict()
         self.wells_by_location = dict()
         self.wells_by_id = dict()
@@ -683,8 +683,10 @@ def _load(self):
             wells_by_id = dict()
             self.wells_by_location[plate.name.val] = wells_by_location
             self.wells_by_id[plate.id.val] = wells_by_id
+            images_by_name = dict()
+            self.images_by_name[plate.id.val] = images_by_name
             self.parse_plate(
-                plate, wells_by_location, wells_by_id, images_by_id
+                plate, wells_by_location, wells_by_id, images_by_id, images_by_name
             )
 
 
@@ -736,18 +738,22 @@ def _load(self):
         self.images_by_id = dict()
         images_by_id = dict()
 
+        self.images_by_name = dict()
+        images_by_name = dict()
+        self.images_by_name[self.target_object.id.val] = images_by_name
+
         self.wells_by_location[self.target_object.name.val] = wells_by_location
         self.wells_by_id[self.target_object.id.val] = wells_by_id
         self.images_by_id[self.target_object.id.val] = images_by_id
         self.parse_plate(
             PlateData(self.target_object),
-            wells_by_location, wells_by_id, images_by_id
+            wells_by_location, wells_by_id, images_by_id, images_by_name
         )
 
 
 class PDIWrapper(ValueWrapper):
 
-    def get_image_id_by_name(self, iname, dname=None):
+    def get_image_id_by_name(self, iname, did=None):
         raise Exception("to be implemented by subclasses")
 
 
@@ -759,7 +765,7 @@ def __init__(self, value_resolver):
         self.images_by_name = dict()
         self._load()
 
-    def get_image_id_by_name(self, iname, dname=None):
+    def get_image_id_by_name(self, iname, did=None):
         return self.images_by_name[iname].id.val
 
     def get_image_name_by_id(self, iid, did):
@@ -817,8 +823,8 @@ def __init__(self, value_resolver):
         self.datasets_by_name = dict()
         self._load()
 
-    def get_image_id_by_name(self, iname, dname=None):
-        return self.images_by_name[dname][iname].id.val
+    def get_image_id_by_name(self, iname, did=None):
+        return self.images_by_name[did][iname].id.val
 
     def get_image_name_by_id(self, iid, did=None):
         return self.images_by_id[did][iid].name.val
@@ -1129,12 +1135,7 @@ def parse(self):
                     return self.parse_from_handle_stream(f2)
 
     def preprocess_data(self, reader):
-        # Get count of data columns - e.g. NOT Well Name
-        column_count = 0
-        for column in self.columns:
-            if column.name not in ADDED_COLUMN_NAMES:
-                column_count += 1
-        for i, row in enumerate(reader):
+        for row in reader:
             row = [(self.columns[i], value) for i, value in enumerate(row)]
             for column, original_value in row:
                 log.debug('Original value %s, %s',
@@ -1355,11 +1356,11 @@ def post_process(self):
                     DatasetI is target_class or
                     ProjectI is target_class) and \
                     resolve_image_ids and not resolve_image_names:
+                # PDI - need to know Image IDs from Names
                 iid = -1
+                log.debug(image_column)
+                iname = image_name_column.values[i]
                 try:
-                    log.debug(image_column)
-                    iname = image_name_column.values[i]
-                    did = self.target_object.id.val
                     if "dataset name" in columns_by_name \
                             and target_class is not DatasetI:
                         dname = columns_by_name["dataset name"].values[i]
@@ -1372,12 +1373,14 @@ def post_process(self):
                         iname, did)
                 except KeyError:
                     log.warn(
-                        "%d not found in image ids" % iid)
+                        "%s not found in image names" % iname)
                 assert i == len(image_column.values)
                 image_column.values.append(iid)
             elif image_name_column is not None and (
                     ScreenI is target_class or
-                    PlateI is target_class):
+                    PlateI is target_class) and \
+                    resolve_image_names and not resolve_image_ids:
+                # HCS - we need to know image Names from IDs
                 iid = image_column.values[i]
                 log.info("Checking image %s", iid)
                 pid = None
@@ -1388,6 +1391,29 @@ def post_process(self):
                 image_name_column.size = max(
                     image_name_column.size, len(iname)
                 )
+            elif image_name_column is not None and (
+                    ScreenI is target_class or
+                    PlateI is target_class) and \
+                    resolve_image_ids and not resolve_image_names:
+                # HCS - we need to know image IDs from Names
+                log.debug(image_column)
+                iname = image_name_column.values[i]
+                iid = -1
+                try:
+                    if "plate" in columns_by_name:
+                        pid = int(columns_by_name["plate"].values[i])
+                    elif "plate name" in columns_by_name \
+                            and target_class is not PlateI:
+                        pname = columns_by_name["plate name"].values[i]
+                        pid = self.value_resolver.wrapper.plates_by_name[
+                            pname].id.val
+                    log.debug("Using Plate:%d" % pid)
+                    iid = self.value_resolver.get_image_id_by_name(
+                        iname, pid)
+                except KeyError:
+                    log.warn(
+                        "%s not found in image names" % iname)
+                image_column.values.append(iid)
             elif target_class is not ImageI:
                 log.info('Missing image name column, skipping.')
 

From 1bb59af54fc25c314df8f23509d46d62e510994a Mon Sep 17 00:00:00 2001
From: William Moore <w.moore@dundee.ac.uk>
Date: Fri, 17 Dec 2021 11:10:54 +0000
Subject: [PATCH 2/6] Handle images_by_name when target_object is Plate

---
 src/omero_metadata/populate.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py
index ab5ed502..90d44776 100644
--- a/src/omero_metadata/populate.py
+++ b/src/omero_metadata/populate.py
@@ -701,6 +701,10 @@ def get_well_by_id(self, well_id, plate=None):
         wells = self.wells_by_id[plate]
         return wells[well_id]
 
+    def get_image_id_by_name(self, iname, pid=None):
+        plate = self.target_object.id.val
+        return self.images_by_name[plate][iname].id.val
+
     def subselect(self, rows, names):
         """
         If we're processing a plate but the bulk-annotations file contains
@@ -1400,13 +1404,11 @@ def post_process(self):
                 iname = image_name_column.values[i]
                 iid = -1
                 try:
+                    # If target class is Screen, a plate column should exist
                     if "plate" in columns_by_name:
                         pid = int(columns_by_name["plate"].values[i])
-                    elif "plate name" in columns_by_name \
-                            and target_class is not PlateI:
-                        pname = columns_by_name["plate name"].values[i]
-                        pid = self.value_resolver.wrapper.plates_by_name[
-                            pname].id.val
+                    elif target_class is PlateI:
+                        pid = self.target_object.id.val
                     log.debug("Using Plate:%d" % pid)
                     iid = self.value_resolver.get_image_id_by_name(
                         iname, pid)

From 33a95544f814c2f1fd025fa7eb04e2bab6deeffe Mon Sep 17 00:00:00 2001
From: William Moore <w.moore@dundee.ac.uk>
Date: Mon, 20 Dec 2021 10:57:31 +0000
Subject: [PATCH 3/6] add testing of Image Name to ID in Plate

---
 src/omero_metadata/populate.py             | 15 +++---
 test/integration/metadata/test_populate.py | 61 ++++++++++++++++------
 2 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py
index 90d44776..e5e38d56 100644
--- a/src/omero_metadata/populate.py
+++ b/src/omero_metadata/populate.py
@@ -27,7 +27,7 @@
 from builtins import chr
 from builtins import str
 from builtins import range
-from future.utils import isidentifier, native_str
+from future.utils import native_str
 from past.builtins import basestring
 from builtins import object
 import logging
@@ -563,7 +563,8 @@ def get_image_name_by_id(self, iid, pid=None):
             raise Exception("Cannot resolve image to plate")
         return self.images_by_id[pid][iid].name.val
 
-    def parse_plate(self, plate, wells_by_location, wells_by_id, images_by_id, images_by_name):
+    def parse_plate(self, plate, wells_by_location, wells_by_id,
+                    images_by_id, images_by_name):
         """
         Accepts PlateData instances
         """
@@ -686,7 +687,8 @@ def _load(self):
             images_by_name = dict()
             self.images_by_name[plate.id.val] = images_by_name
             self.parse_plate(
-                plate, wells_by_location, wells_by_id, images_by_id, images_by_name
+                plate, wells_by_location, wells_by_id,
+                images_by_id, images_by_name
             )
 
 
@@ -1139,7 +1141,7 @@ def parse(self):
                     return self.parse_from_handle_stream(f2)
 
     def preprocess_data(self, reader):
-        for row in reader:
+        for i, row in enumerate(reader):
             row = [(self.columns[i], value) for i, value in enumerate(row)]
             for column, original_value in row:
                 log.debug('Original value %s, %s',
@@ -1362,9 +1364,10 @@ def post_process(self):
                     resolve_image_ids and not resolve_image_names:
                 # PDI - need to know Image IDs from Names
                 iid = -1
-                log.debug(image_column)
-                iname = image_name_column.values[i]
                 try:
+                    log.debug(image_column)
+                    iname = image_name_column.values[i]
+                    did = self.target_object.id.val
                     if "dataset name" in columns_by_name \
                             and target_class is not DatasetI:
                         dname = columns_by_name["dataset name"].values[i]
diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py
index 62a366fd..0e5c79bd 100644
--- a/test/integration/metadata/test_populate.py
+++ b/test/integration/metadata/test_populate.py
@@ -37,6 +37,7 @@
 import shutil
 
 from omero.api import RoiOptions
+from omero.gateway import BlitzGateway
 from omero.grid import ImageColumn
 from omero.grid import RoiColumn
 from omero.grid import StringColumn
@@ -143,12 +144,26 @@ def create_screen(self, row_count, col_count):
                                          plate_cols=col_count)[0]
         plate2 = self.test.import_plates(plate_rows=row_count,
                                          plate_cols=col_count)[0]
+        # Rename Images like "A1_Field-0" to match names in csv
+        conn = BlitzGateway(client_obj=self.test.client)
+        update = conn.getUpdateService()
+        images_by_id = {}
+        for pid in [plate1.id.val, plate2.id.val]:
+            plate = conn.getObject("Plate", pid)
+            for well in plate.listChildren():
+                for field_index, ws in enumerate(well.listChildren()):
+                    img = ws.getImage()._obj
+                    img.name = rstring(f'{ well.getWellPos() }_Field-{field_index}')
+                    img = update.saveAndReturnObject(img)
+                    images_by_id[img.id.val] = img
         plate1 = self.set_name(plate1, "P001")
         plate2 = self.set_name(plate2, "P002")
         screen = ScreenI()
         screen.name = rstring("Screen")
         screen.linkPlate(plate1.proxy())
         screen.linkPlate(plate2.proxy())
+        # cache images_by_id for checking result
+        self.images_by_id = images_by_id
         return self.test.client.sf.getUpdateService().\
             saveAndReturnObject(screen)
 
@@ -174,6 +189,20 @@ def assert_columns(self, columns):
         col_names = "Well,Well Type,Concentration,Well Name"
         assert col_names == ",".join([c.name for c in columns])
 
+    def assert_table_row(self, row_values, row_index):
+        # Check rows, based on self.create_csv()
+        # Unsure where the lower-casing is happening
+        if "A1" in row_values or "a1" in row_values:
+            assert "Control" in row_values
+        elif "A2" in row_values or "a2" in row_values:
+            assert "Treatment" in row_values
+        elif "roi1" in row_values:
+            assert 0.5 in row_values
+            assert 100 in row_values
+        elif "roi2" in row_values:
+            assert 'nan' in [str(value) for value in row_values]
+            assert 200 in row_values
+
     def assert_child_annotations(self, oas):
         for ma, wid, wr, wc in oas:
             assert isinstance(ma, MapAnnotationI)
@@ -200,14 +229,14 @@ def get_all_map_annotations(self):
 class Screen2Plates(Fixture):
 
     def __init__(self):
-        self.count = 6
+        self.count = 8
         self.ann_count = 4
         self.row_count = 1
         self.col_count = 2
         self.csv = self.create_csv(
-            col_names="Plate,Well,Well Type,Concentration",
-            row_data=("P001,A1,Control,0", "P001,A2,Treatment,10",
-                      "P002,A1,Control,0", "P002,A2,Treatment,10"))
+            col_names="Plate,Well,Image Name,Well Type,Concentration",
+            row_data=("P001,A1,A1_Field-0,Control,0", "P001,A2,A2_Field-0,Treatment,10",
+                      "P002,A1,A1_Field-0,Control,0", "P002,A2,A2_Field-0,Treatment,10"))
         self.screen = None
 
     def assert_row_count(self, rows):
@@ -217,10 +246,18 @@ def assert_row_count(self, rows):
         assert rows == 2 * self.row_count * self.col_count
 
     def assert_columns(self, columns):
-        # Adds Plate Name,Well Name columns
-        col_names = "Plate,Well,Well Type,Concentration,Plate Name,Well Name"
+        # Adds Plate Name,Well Name, Image columns
+        col_names = "Plate,Well,Image Name,Well Type,Concentration,Plate Name,Well Name,Image"
         assert col_names == ",".join([c.name for c in columns])
 
+    def assert_table_row(self, row_values, row_index):
+        super(Screen2Plates, self).assert_table_row(row_values, row_index)
+        # last column should contain valid Image ID
+        image_id = row_values[-1]
+        image_name = row_values[2]
+        assert image_id in self.images_by_id
+        assert self.images_by_id[image_id].name.val == image_name
+
     def get_target(self):
         if not self.screen:
             self.screen = self.create_screen(self.row_count, self.col_count)
@@ -1082,17 +1119,7 @@ def _assert_parsing_context_values(self, t, fixture):
             row_values = [col.values[0] for col in t.read(
                 list(range(len(cols))), hit, hit+1).columns]
             assert len(row_values) == fixture.count
-            # Unsure where the lower-casing is happening
-            if "A1" in row_values or "a1" in row_values:
-                assert "Control" in row_values
-            elif "A2" in row_values or "a2" in row_values:
-                assert "Treatment" in row_values
-            elif "roi1" in row_values:
-                assert 0.5 in row_values
-                assert 100 in row_values
-            elif "roi2" in row_values:
-                assert 'nan' in [str(value) for value in row_values]
-                assert 200 in row_values
+            fixture.assert_table_row(row_values, hit)
 
     def _test_bulk_to_map_annotation_context(self, fixture, batch_size):
         # self._testPopulateMetadataPlate()

From 803f97c555ea9e5cfd6fc66e49b8aa79131785e8 Mon Sep 17 00:00:00 2001
From: William Moore <w.moore@dundee.ac.uk>
Date: Mon, 20 Dec 2021 11:04:55 +0000
Subject: [PATCH 4/6] flake8 fixes

---
 test/integration/metadata/test_populate.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py
index 0e5c79bd..a70b1ead 100644
--- a/test/integration/metadata/test_populate.py
+++ b/test/integration/metadata/test_populate.py
@@ -153,7 +153,8 @@ def create_screen(self, row_count, col_count):
             for well in plate.listChildren():
                 for field_index, ws in enumerate(well.listChildren()):
                     img = ws.getImage()._obj
-                    img.name = rstring(f'{ well.getWellPos() }_Field-{field_index}')
+                    img.name = rstring(
+                        f'{ well.getWellPos() }_Field-{field_index}')
                     img = update.saveAndReturnObject(img)
                     images_by_id[img.id.val] = img
         plate1 = self.set_name(plate1, "P001")
@@ -235,8 +236,10 @@ def __init__(self):
         self.col_count = 2
         self.csv = self.create_csv(
             col_names="Plate,Well,Image Name,Well Type,Concentration",
-            row_data=("P001,A1,A1_Field-0,Control,0", "P001,A2,A2_Field-0,Treatment,10",
-                      "P002,A1,A1_Field-0,Control,0", "P002,A2,A2_Field-0,Treatment,10"))
+            row_data=("P001,A1,A1_Field-0,Control,0",
+                      "P001,A2,A2_Field-0,Treatment,10",
+                      "P002,A1,A1_Field-0,Control,0",
+                      "P002,A2,A2_Field-0,Treatment,10"))
         self.screen = None
 
     def assert_row_count(self, rows):
@@ -247,7 +250,8 @@ def assert_row_count(self, rows):
 
     def assert_columns(self, columns):
         # Adds Plate Name,Well Name, Image columns
-        col_names = "Plate,Well,Image Name,Well Type,Concentration,Plate Name,Well Name,Image"
+        col_names = ("Plate,Well,Image Name,Well Type,"
+                     "Concentration,Plate Name,Well Name,Image")
         assert col_names == ",".join([c.name for c in columns])
 
     def assert_table_row(self, row_values, row_index):

From edc08533ee1d81be076605acde00837bb46baaa1 Mon Sep 17 00:00:00 2001
From: William Moore <w.moore@dundee.ac.uk>
Date: Wed, 8 Feb 2023 14:50:24 +0000
Subject: [PATCH 5/6] Add comments to try and help understanding

---
 src/omero_metadata/populate.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py
index 94e4e0f5..999d9ace 100644
--- a/src/omero_metadata/populate.py
+++ b/src/omero_metadata/populate.py
@@ -1231,6 +1231,9 @@ def parse(self):
 
     def preprocess_data(self, reader):
         for i, row in enumerate(reader):
+            # For each row in the table,
+            # add a single value to the columns.values
+            # then call post.process() to resolve ID -> name or name -> ID
             row = [(self.columns[i], value) for i, value in enumerate(row)]
             for column, original_value in row:
                 log.debug('Original value %s, %s',
@@ -1315,8 +1318,10 @@ def populate_from_reader(self,
                     self.populate_row(row)
                     row_count = row_count + 1
                     if row_count >= batch_size:
+                        # Call post_process() for this batch
                         self.post_process()
                         table.addData(self.columns)
+                        # clear row data ready for next batch
                         for column in self.columns:
                             column.values = []
                         row_count = 0
@@ -1325,6 +1330,7 @@ def populate_from_reader(self,
         if row_count != 0:
             log.debug("DATA TO ADD")
             log.debug(self.columns)
+            # Call post_process for final remaining rows (less than batch_size)
             self.post_process()
             table.addData(self.columns)
 
@@ -1357,6 +1363,11 @@ def populate(self, rows):
                 log.warning('Skip empty row %d', r + 1)
 
     def post_process(self):
+        # post_process is called at 2 points in the populate workflow...
+        # First called during preprocess_data() on each row at a time (when
+        # each column.values list has a single value)
+        # then again during populate_from_reader(), when all rows are processed
+        # in batches.
         target_class = self.target_object.__class__
         columns_by_name = dict()
         well_column = None
@@ -1436,6 +1447,7 @@ def post_process(self):
                     DatasetI is target_class or
                     ProjectI is target_class) and \
                     resolve_image_names and not resolve_image_ids:
+                # PDI - need to know Image Names from Image IDs
                 iname = ""
                 try:
                     log.debug(image_name_column)

From 78e55539b68e351ed5ef202854718b49a37729a7 Mon Sep 17 00:00:00 2001
From: William Moore <w.moore@dundee.ac.uk>
Date: Wed, 8 Feb 2023 14:52:20 +0000
Subject: [PATCH 6/6] Use log.debug() instead of warn() or info()

The script is working fine, so it looks like these are invalid warnings
but it is hard to understand the working of the script
---
 src/omero_metadata/populate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py
index 999d9ace..e09f619c 100644
--- a/src/omero_metadata/populate.py
+++ b/src/omero_metadata/populate.py
@@ -1384,7 +1384,7 @@ def post_process(self):
         for column in self.columns:
             columns_by_name[column.name.lower()] = column
             if column.__class__ is PlateColumn:
-                log.warn("PlateColumn is unimplemented")
+                log.debug("PlateColumn is unimplemented")
             elif column.__class__ is WellColumn:
                 well_column = column
             elif column.name == WELL_NAME_COLUMN:
@@ -1531,7 +1531,7 @@ def post_process(self):
                         "%s not found in image names" % iname)
                 image_column.values.append(iid)
             elif target_class is not ImageI:
-                log.info('Missing image name column, skipping.')
+                log.debug('Missing image name column, skipping.')
 
             if plate_name_column is not None:
                 plate = columns_by_name['plate'].values[i]   # FIXME