Commit

fix(ingest): Enforce each metadata field either being grouped or segmented (#2370)

* Get rid of the usually_identical_fields option and make grouped the default.
anna-parker authored Aug 14, 2024
1 parent 68b0144 commit 8a52d45
Showing 2 changed files with 6 additions and 46 deletions.
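
In short: the script previously split metadata fields three ways — must-identical fields listed in the shared_fields config option, segment-specific fields, and a leftover "usually identical" bucket whose conflicting values were merged into combined strings. This commit removes the third bucket, so every field is either segment-specific or required to be identical across a group. A minimal sketch of the resulting rule, using illustrative field names (SPECIAL_FIELDS stands in here for the script's constant of the same name):

# Hedged sketch of the two-way field split this commit enforces.
# Field names below are illustrative, not taken from the repository.
SPECIAL_FIELDS = {"accession", "submissionId"}  # placeholder for the script's constant

def split_fields(all_fields: set[str], segment_specific: set[str]) -> tuple[set[str], set[str]]:
    """Return (segmented, grouped): every field lands in exactly one bucket."""
    segmented = segment_specific | {"hash"}  # the hash is always per segment
    grouped = all_fields - segmented - SPECIAL_FIELDS
    return segmented, grouped

For example, with all_fields = {"hash", "ncbi_release_date", "authors"} and segment_specific = {"hash"}, ncbi_release_date and authors are grouped (must match across segments) while hash stays per segment; nothing is silently combined anymore.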
17 changes: 0 additions & 17 deletions ingest/config/defaults.yaml
@@ -16,7 +16,6 @@ keep:
   - ncbi_host_name
   - ncbi_host_tax_id
   - ncbi_is_lab_host
-  - ncbi_protein_count
   - ncbi_release_date
   - ncbi_update_date
   - ncbi_sourcedb
@@ -25,22 +24,6 @@ keep:
   - sequence_md5
   - genbank_accession
   - joint_accession
-# Fields that are shared at sample level
-# Used to deduplicate samples
-# Used for segmented viruses only
-shared_fields:
-  - bioproject_accession
-  - biosample_accession
-  - geo_loc_country
-  - geo_loc_admin_1
-  - sample_collection_date
-  - host_name_scientific
-  - host_taxon_id
-  - is_lab_host
-  - specimen_collector_sample_id
-  - author_affiliations
-  - authors
-  - ncbi_release_date #TODO (#2171): Allow segments to have different dates
 all_fields:
   - accession
   - bioprojects
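
With shared_fields gone from defaults.yaml, downstream configs no longer enumerate the grouping fields by hand; the grouping set is derived from the metadata itself at runtime. A hedged sketch of that derivation from a metadata TSV header — derive_shared_fields is a hypothetical helper for illustration, not a function from the repository:

# Derive grouping fields from the metadata header instead of a config list.
import csv

def derive_shared_fields(metadata_path: str, segment_specific: set[str],
                         special: set[str]) -> set[str]:
    with open(metadata_path) as f:
        header = next(csv.reader(f, delimiter="\t"))  # first row = column names
    return set(header) - (segment_specific | {"hash"}) - special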
35 changes: 6 additions & 29 deletions ingest/scripts/group_segments.py
@@ -44,9 +44,6 @@ class Config:
     compound_country_field: str
     fasta_id_field: str
     insdc_segment_specific_fields: list[str]  # What does this field mean?
-    shared_fields: list[
-        str
-    ]  # Fields that are expected to be identical across all segments for a given isolate
     nucleotide_sequences: list[str]
     segmented: bool
 
@@ -96,15 +93,18 @@ def main(
 
     # Group segments according to isolate, collection date and isolate specific values
     # These are the fields that are expected to be identical across all segments for a given isolate
-    shared_fields = config.shared_fields
-    logger.info(f"Fields required to be identical for grouping: {shared_fields}")
-
     first_row = next(iter(segment_metadata.values()))
     if not first_row:
         msg = "No data found in metadata file"
         raise ValueError(msg)
+    all_fields = first_row.keys()
+
+    insdc_segment_specific_fields = set(config.insdc_segment_specific_fields)
+    insdc_segment_specific_fields.add("hash")
+
+    shared_fields = set(all_fields) - insdc_segment_specific_fields - SPECIAL_FIELDS
+
     # Build equivalence classes based on shared fields
     # Use shared fields as the key to group the data
     type SegmentName = str
@@ -167,16 +167,6 @@ def main(
         }
     )
 
-    must_identical_fields = set(config.shared_fields)
-    insdc_segment_specific_fields = set(config.insdc_segment_specific_fields)
-    insdc_segment_specific_fields.add("hash")
-
-    # These need to be treated specially: always single string, but complex if necessary
-    # e.g. "L:2024\nM:2023"
-    usually_identical_fields = (
-        set(all_fields) - must_identical_fields - insdc_segment_specific_fields - SPECIAL_FIELDS
-    )
-
     # Add segment specific metadata for the segments
     metadata: dict[str, dict[str, str]] = {}
     # Map from original accession to the new concatenated accession
@@ -197,7 +187,7 @@ def main(
         for segment, accession in group.items():
             fasta_id_map[accession] = f"{joint_key}_{segment}"
 
-        for field in must_identical_fields:
+        for field in shared_fields:
            values = {segment: segment_metadata[group[segment]][field] for segment in group}
            deduplicated_values = set(values.values())
            if len(deduplicated_values) != 1:
@@ -211,19 +201,6 @@ def main(
                    segment_metadata[group[segment]][field] if segment in group else ""
                )
 
-        for field in usually_identical_fields:
-            values = {segment: segment_metadata[group[segment]][field] for segment in group}
-            deduplicated_values = set(values.values())
-            if len(deduplicated_values) != 1:
-                combined = "\n".join([f"{segment}:{value}" for segment, value in values.items()])
-                row[field] = combined
-                logger.warning(
-                    f"Values for field: {field} in group: {group} are not identical: {values}. "
-                    f"Passing combined nested string: {combined!r}"
-                )
-                continue
-            row[field] = deduplicated_values.pop()
-
         row["submissionId"] = joint_key
 
         row["hash"] = hashlib.md5(
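
The practical effect of the strict loop: a shared field whose values differ within a group now fails grouping outright, where the old code only logged a warning and emitted a combined "L:2024\nM:2023"-style string. The mismatch handling itself falls in lines the diff view elides, so the ValueError in this sketch is an assumption:

# Hedged sketch of the strict check introduced above; error handling on
# mismatch is not shown in the diff, so the ValueError is an assumption.
def grouped_row(group: dict[str, str],
                segment_metadata: dict[str, dict[str, str]],
                shared_fields: set[str]) -> dict[str, str]:
    row: dict[str, str] = {}
    for field in shared_fields:
        # group maps segment name -> accession; look up each segment's value
        values = {seg: segment_metadata[acc][field] for seg, acc in group.items()}
        if len(set(values.values())) != 1:
            msg = f"Shared field {field!r} differs across segments: {values}"
            raise ValueError(msg)  # assumed; old code combined the values instead
        row[field] = next(iter(values.values()))
    return row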
