Commit

fix(ingest): Enforce each metadata field either being grouped or segmented (#2370)

* Get rid of the usually_identical_fields option and make grouped the default.
anna-parker authored Aug 14, 2024
1 parent 68b0144 commit 8a52d45
Showing 2 changed files with 6 additions and 46 deletions.
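
In short: the script previously split metadata fields three ways — must-identical fields listed in the shared_fields config option, segment-specific fields, and a leftover "usually identical" bucket whose conflicting values were merged into combined strings. This commit removes the third bucket, so every field is either segment-specific or required to be identical across a group. A minimal sketch of the resulting rule, using illustrative field names (SPECIAL_FIELDS stands in here for the script's constant of the same name):

# Hedged sketch of the two-way field split this commit enforces.
# Field names below are illustrative, not taken from the repository.
SPECIAL_FIELDS = {"accession", "submissionId"}  # placeholder for the script's constant

def split_fields(all_fields: set[str], segment_specific: set[str]) -> tuple[set[str], set[str]]:
    """Return (segmented, grouped): every field lands in exactly one bucket."""
    segmented = segment_specific | {"hash"}  # the hash is always per segment
    grouped = all_fields - segmented - SPECIAL_FIELDS
    return segmented, grouped

For example, with all_fields = {"hash", "ncbi_release_date", "authors"} and segment_specific = {"hash"}, ncbi_release_date and authors are grouped (must match across segments) while hash stays per segment; nothing is silently combined anymore.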
17 changes: 0 additions & 17 deletions ingest/config/defaults.yaml
@@ -16,7 +16,6 @@ keep:
   - ncbi_host_name
   - ncbi_host_tax_id
   - ncbi_is_lab_host
-  - ncbi_protein_count
   - ncbi_release_date
   - ncbi_update_date
   - ncbi_sourcedb
@@ -25,22 +24,6 @@ keep:
   - sequence_md5
   - genbank_accession
   - joint_accession
-# Fields that are shared at sample level
-# Used to deduplicate samples
-# Used for segmented viruses only
-shared_fields:
-  - bioproject_accession
-  - biosample_accession
-  - geo_loc_country
-  - geo_loc_admin_1
-  - sample_collection_date
-  - host_name_scientific
-  - host_taxon_id
-  - is_lab_host
-  - specimen_collector_sample_id
-  - author_affiliations
-  - authors
-  - ncbi_release_date #TODO (#2171): Allow segments to have different dates
 all_fields:
   - accession
   - bioprojects
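
With shared_fields gone from defaults.yaml, downstream configs no longer enumerate the grouping fields by hand; the grouping set is derived from the metadata itself at runtime. A hedged sketch of that derivation from a metadata TSV header — derive_shared_fields is a hypothetical helper for illustration, not a function from the repository:

# Derive grouping fields from the metadata header instead of a config list.
import csv

def derive_shared_fields(metadata_path: str, segment_specific: set[str],
                         special: set[str]) -> set[str]:
    with open(metadata_path) as f:
        header = next(csv.reader(f, delimiter="\t"))  # first row = column names
    return set(header) - (segment_specific | {"hash"}) - special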
35 changes: 6 additions & 29 deletions ingest/scripts/group_segments.py
@@ -44,9 +44,6 @@ class Config:
     compound_country_field: str
     fasta_id_field: str
     insdc_segment_specific_fields: list[str]  # What does this field mean?
-    shared_fields: list[
-        str
-    ]  # Fields that are expected to be identical across all segments for a given isolate
     nucleotide_sequences: list[str]
     segmented: bool
 
@@ -96,15 +93,18 @@ def main(
 
     # Group segments according to isolate, collection date and isolate specific values
     # These are the fields that are expected to be identical across all segments for a given isolate
-    shared_fields = config.shared_fields
-    logger.info(f"Fields required to be identical for grouping: {shared_fields}")
-
     first_row = next(iter(segment_metadata.values()))
     if not first_row:
         msg = "No data found in metadata file"
         raise ValueError(msg)
+    all_fields = first_row.keys()
+
+    insdc_segment_specific_fields = set(config.insdc_segment_specific_fields)
+    insdc_segment_specific_fields.add("hash")
+
+    shared_fields = set(all_fields) - insdc_segment_specific_fields - SPECIAL_FIELDS
+
     # Build equivalence classes based on shared fields
     # Use shared fields as the key to group the data
     type SegmentName = str
@@ -167,16 +167,6 @@ def main(
         }
     )
 
-    must_identical_fields = set(config.shared_fields)
-    insdc_segment_specific_fields = set(config.insdc_segment_specific_fields)
-    insdc_segment_specific_fields.add("hash")
-
-    # These need to be treated specially: always single string, but complex if necessary
-    # e.g. "L:2024\nM:2023"
-    usually_identical_fields = (
-        set(all_fields) - must_identical_fields - insdc_segment_specific_fields - SPECIAL_FIELDS
-    )
-
     # Add segment specific metadata for the segments
     metadata: dict[str, dict[str, str]] = {}
     # Map from original accession to the new concatenated accession
@@ -197,7 +187,7 @@ def main(
         for segment, accession in group.items():
             fasta_id_map[accession] = f"{joint_key}_{segment}"
 
-        for field in must_identical_fields:
+        for field in shared_fields:
            values = {segment: segment_metadata[group[segment]][field] for segment in group}
            deduplicated_values = set(values.values())
            if len(deduplicated_values) != 1:
@@ -211,19 +201,6 @@ def main(
                    segment_metadata[group[segment]][field] if segment in group else ""
                )
 
-        for field in usually_identical_fields:
-            values = {segment: segment_metadata[group[segment]][field] for segment in group}
-            deduplicated_values = set(values.values())
-            if len(deduplicated_values) != 1:
-                combined = "\n".join([f"{segment}:{value}" for segment, value in values.items()])
-                row[field] = combined
-                logger.warning(
-                    f"Values for field: {field} in group: {group} are not identical: {values}. "
-                    f"Passing combined nested string: {combined!r}"
-                )
-                continue
-            row[field] = deduplicated_values.pop()
-
         row["submissionId"] = joint_key
 
         row["hash"] = hashlib.md5(
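
The practical effect of the strict loop: a shared field whose values differ within a group now fails grouping outright, where the old code only logged a warning and emitted a combined "L:2024\nM:2023"-style string. The mismatch handling itself falls in lines the diff view elides, so the ValueError in this sketch is an assumption:

# Hedged sketch of the strict check introduced above; error handling on
# mismatch is not shown in the diff, so the ValueError is an assumption.
def grouped_row(group: dict[str, str],
                segment_metadata: dict[str, dict[str, str]],
                shared_fields: set[str]) -> dict[str, str]:
    row: dict[str, str] = {}
    for field in shared_fields:
        # group maps segment name -> accession; look up each segment's value
        values = {seg: segment_metadata[acc][field] for seg, acc in group.items()}
        if len(set(values.values())) != 1:
            msg = f"Shared field {field!r} differs across segments: {values}"
            raise ValueError(msg)  # assumed; old code combined the values instead
        row[field] = next(iter(values.values()))
    return row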
