diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 72d026b81..87da0dd97 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -16,7 +16,6 @@ keep: - ncbi_host_name - ncbi_host_tax_id - ncbi_is_lab_host - - ncbi_protein_count - ncbi_release_date - ncbi_update_date - ncbi_sourcedb @@ -25,22 +24,6 @@ keep: - sequence_md5 - genbank_accession - joint_accession -# Fields that are shared at sample level -# Used to deduplicate samples -# Used for segmented viruses only -shared_fields: - - bioproject_accession - - biosample_accession - - geo_loc_country - - geo_loc_admin_1 - - sample_collection_date - - host_name_scientific - - host_taxon_id - - is_lab_host - - specimen_collector_sample_id - - author_affiliations - - authors - - ncbi_release_date #TODO (#2171): Allow segments to have different dates all_fields: - accession - bioprojects diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py index 646a94fbe..f281d71d2 100644 --- a/ingest/scripts/group_segments.py +++ b/ingest/scripts/group_segments.py @@ -44,9 +44,6 @@ class Config: compound_country_field: str fasta_id_field: str insdc_segment_specific_fields: list[str] # What does this field mean? - shared_fields: list[ - str - ] # Fields that are expected to be identical across all segments for a given isolate nucleotide_sequences: list[str] segmented: bool @@ -96,8 +93,6 @@ def main( # Group segments according to isolate, collection date and isolate specific values # These are the fields that are expected to be identical across all segments for a given isolate - shared_fields = config.shared_fields - logger.info(f"Fields required to be identical for grouping: {shared_fields}") first_row = next(iter(segment_metadata.values())) if not first_row: @@ -105,6 +100,11 @@ def main( raise ValueError(msg) all_fields = first_row.keys() + insdc_segment_specific_fields = set(config.insdc_segment_specific_fields) + insdc_segment_specific_fields.add("hash") + + shared_fields = set(all_fields) - insdc_segment_specific_fields - SPECIAL_FIELDS + # Build equivalence classes based on shared fields # Use shared fields as the key to group the data type SegmentName = str @@ -167,16 +167,6 @@ def main( } ) - must_identical_fields = set(config.shared_fields) - insdc_segment_specific_fields = set(config.insdc_segment_specific_fields) - insdc_segment_specific_fields.add("hash") - - # These need to be treated specially: always single string, but complex if necessary - # e.g. "L:2024/nM:2023" - usually_identical_fields = ( - set(all_fields) - must_identical_fields - insdc_segment_specific_fields - SPECIAL_FIELDS - ) - # Add segment specific metadata for the segments metadata: dict[str, dict[str, str]] = {} # Map from original accession to the new concatenated accession @@ -197,7 +187,7 @@ def main( for segment, accession in group.items(): fasta_id_map[accession] = f"{joint_key}_{segment}" - for field in must_identical_fields: + for field in shared_fields: values = {segment: segment_metadata[group[segment]][field] for segment in group} deduplicated_values = set(values.values()) if len(deduplicated_values) != 1: @@ -211,19 +201,6 @@ def main( segment_metadata[group[segment]][field] if segment in group else "" ) - for field in usually_identical_fields: - values = {segment: segment_metadata[group[segment]][field] for segment in group} - deduplicated_values = set(values.values()) - if len(deduplicated_values) != 1: - combined = "\n".join([f"{segment}:{value}" for segment, value in values.items()]) - row[field] = combined - logger.warning( - f"Values for field: {field} in group: {group} are not identical: {values}. " - f"Passing combined nested string: {combined!r}" - ) - continue - row[field] = deduplicated_values.pop() - row["submissionId"] = joint_key row["hash"] = hashlib.md5(