Skip to content

Commit

Permalink
Add function to find and delete Lipidomics records
Browse files Browse the repository at this point in the history
Also update to filter for non-conforming IDs
  • Loading branch information
mbthornton-lbl committed Mar 11, 2024
1 parent 4a31b40 commit 72136cd
Showing 1 changed file with 41 additions and 19 deletions.
60 changes: 41 additions & 19 deletions nmdc_automation/re_iding/scripts/re_id_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,9 @@ def extract_records(ctx, study_id, api_base_url):

for data_object_id in omics_processing_has_outputs:
data_object_record = api_client.get_data_object(data_object_id)
# If the data object is an orphan, fail the omics processing record and its data objects
# If the data object is Missing, fail the omics processing record and its data objects
if not data_object_record:
logging.error(f"OmicsProcessingOrphanDataObject: {data_object_id} for {omics_id}")
logging.error(f"OmicsProcessingMissingDataObject: {data_object_id} for {omics_id}")
is_failed_data = True
is_omics_missing_has_output = True
omics_level_failure_count += 1
Expand Down Expand Up @@ -549,7 +549,7 @@ def delete_old_records(ctx, old_records_file):
@click.pass_context
def delete_old_binning_data(ctx, mongo_uri, database_name, direct_connection, no_delete=False):
"""
Delete old binning data from the MongoDB database.
Delete old binning data with non-conforming IDs from the MongoDB database.
Some binning data objects can be found by their data_object_type: 'Metagenome Bins' or 'CheckM Statistics'
Un-typed data objects can be found by looking for 'metabat2' in the description
Expand All @@ -569,8 +569,11 @@ def delete_old_binning_data(ctx, mongo_uri, database_name, direct_connection, no
logging.info(f"Connected to MongoDB server at {mongo_uri}")
db_client = client[database_name]

# Find and delete old binning data with a known data object type
logging.info("Searching for old binning data records with a known data object type and non-comforming IDs")
# Find and delete old binning data with a known data object type and non-conforming IDs
binning_data_query = {
# Exclude data objects with conforming IDs nmdc:dobj-*
"id": {"$not": {"$regex": "^nmdc:dobj-"}},
"data_object_type": {"$in": ["Metagenome Bins", "CheckM Statistics"]},
}
binning_data = db_client["data_object_set"].find(binning_data_query)
Expand All @@ -586,8 +589,12 @@ def delete_old_binning_data(ctx, mongo_uri, database_name, direct_connection, no
for record in binning_data:
logging.info(f"Skipping delete for record: {record['id']} {record['data_object_type']} {record['description']}")

# Find and delete old binning data with a null data object type and 'metabat2' in the description
# Find and delete old binning data with a null data object type and 'metabat2' in the description and
# non-conforming IDs
logging.info("Searching for old binning data records with a null data object type and 'metabat2' in the description")
null_binning_data_query = {
# Exclude data objects with conforming IDs nmdc:dobj-*
"id": {"$not": {"$regex": "^nmdc:dobj-"}},
"data_object_type": None,
"description": {"$regex": "metabat2"},
}
Expand All @@ -604,22 +611,37 @@ def delete_old_binning_data(ctx, mongo_uri, database_name, direct_connection, no
for record in null_binning_data:
logging.info(f"Skipping delete for record: {record['id']} /{record['description']}")

# Find and delete old proteomics data objects
proteomics_data_query = {
"id": {"$regex": "emsl:output_"},
# Find Lipidomics OmicsProcessing and their associated DataObjects and delete them
logging.info("Searching for Lipidomics OmicsProcessing records and their associated DataObjects")
lipidomics_omics_processing_query = {
"omics_type.has_raw_value": "Lipidomics",
}
proteomics_data = db_client["data_object_set"].find(proteomics_data_query)
logging.info(f"Found {len(list(proteomics_data.clone()))} old proteomics data records")
lipidomics_omics_processing = db_client["omics_processing_set"].find(lipidomics_omics_processing_query)
logging.info(f"Found {len(list(lipidomics_omics_processing.clone()))} lipidomics omics processing records")

# Go through the lipidomics omics processing records and get the data object IDs to be deleted
lipidomics_data_object_ids = set()
for record in lipidomics_omics_processing:
logging.info(f"Found lipidomics omics processing record: {record['id']}")
for data_object_id in record["has_output"]:
lipidomics_data_object_ids.add(data_object_id)
logging.info(f"Found {len(lipidomics_data_object_ids)} lipidomics data object records")

if not no_delete:
for record in proteomics_data:
logging.info(f"Deleting proteomics data record: {record['id']} {record['description']}")
logging.info(f"Deleting old proteomics data records")
delete_result = db_client["data_object_set"].delete_many(proteomics_data_query)
logging.info(f"Deleted {delete_result.deleted_count} old proteomics data records")
for data_object_id in lipidomics_data_object_ids:
logging.info(f"Deleting lipidomics data object record: {data_object_id}")
logging.info(f"Deleting lipidomics data object records")
delete_result = db_client["data_object_set"].delete_many({"id": {"$in": list(lipidomics_data_object_ids)}})
logging.info(f"Deleted {delete_result.deleted_count} lipidomics data object records")
# delete the lipidomics omics processing records
delete_result = db_client["omics_processing_set"].delete_many(lipidomics_omics_processing_query)
logging.info(f"Deleted {delete_result.deleted_count} lipidomics omics processing records")
else:
logging.info("No-delete flag is set, skipping delete")
for record in proteomics_data:
logging.info(f"Skipping delete for record: {record['id']} /{record['description']}")
for data_object_id in lipidomics_data_object_ids:
logging.info(f"Skipping delete for lipidomics data object record: {data_object_id}")
for record in lipidomics_omics_processing:
logging.info(f"Skipping delete for lipidomics omics processing record: {record['id']}")


logging.info(f"Elapsed time: {time.time() - start_time}")
Expand All @@ -641,7 +663,7 @@ def orphan_data_objects(ctx, study_id, api_base_url, untyped_data_objects=False)
Write the results to a JSON file of nmdc DataObject instances.
"""
start_time = time.time()
logging.info(f"Scanning for orphaned data objects for {study_id}")
logging.info(f"Scanning for missing data objects for {study_id}")


api_client = NmdcApi(api_base_url)
Expand Down Expand Up @@ -692,7 +714,7 @@ def orphan_data_objects(ctx, study_id, api_base_url, untyped_data_objects=False)
with open(f"{study_id}_untyped_data_objects.json", "w") as f:
f.write(json.dumps(untyped_data_objects, indent=4))
else:
logging.info(f"Found {len(orphan_data_object_ids)} orphaned data objects")
logging.info(f"Found {len(orphan_data_object_ids)} missing data objects")
# get orphaned data objects from the data_objects_by_id if present
orphaned_data_objects = []
for data_object_id in orphan_data_object_ids:
Expand Down

0 comments on commit 72136cd

Please sign in to comment.