Merge pull request #46 from microbiomedata/43-update-re-iding-process-records-to-handle-data-objects-wo-data-file-description-or-url

43 update re iding process records to handle data objects wo data file description or url or with ID not found in the Database
microbiomedata · Feb 13, 2024 · efe919b · efe919b
2 parents fe595e4 + 810570a
commit efe919b
Show file tree

Hide file tree

Showing 26 changed files with 614,149 additions and 28,004 deletions.
diff --git a/nmdc_automation/re_iding/base.py b/nmdc_automation/re_iding/base.py
diff --git a/nmdc_automation/re_iding/changesheets_output/dryrun_changesheet-20240123-095223.tsv b/nmdc_automation/re_iding/changesheets_output/dryrun_changesheet-20240123-095223.tsv
@@ -0,0 +1,3 @@
+id	action	attribute	value
+nmdc:omprc-11-bn8jcq58	remove item	has_output	nmdc:dobj-11-xkx6jy64|
+nmdc:omprc-11-bn8jcq58	insert	has_output	nmdc:dobj-11-77e9jm46|
diff --git a/nmdc_automation/re_iding/db_utils.py b/nmdc_automation/re_iding/db_utils.py
@@ -12,7 +12,7 @@
 OMICS_PROCESSING_SET = "omics_processing_set"
 DATA_OBJECT_SET = "data_object_set"
 READS_QC_SET = "read_qc_analysis_activity_set"
-READS_BASED_TAXONOMY_ANALYSIS_ACTIVITY_SET = "read_based_taxonomy_analysis_activity_set"
+READ_BASED_TAXONOMY_ANALYSIS_ACTIVITY_SET = "read_based_taxonomy_analysis_activity_set"
 METAGENOME_ASSEMBLY_SET = "metagenome_assembly_set"
 METAGENOME_ANNOTATION_ACTIVITY_SET = "metagenome_annotation_activity_set"
 METAGENOME_SEQUENCING_ACTIVITY_SET = "metagenome_sequencing_activity_set"
@@ -24,7 +24,7 @@
 
 ANALYSIS_ACTIVITIES = [
     READS_QC_SET,
-    READS_BASED_TAXONOMY_ANALYSIS_ACTIVITY_SET,
+    READ_BASED_TAXONOMY_ANALYSIS_ACTIVITY_SET,
     METAGENOME_ANNOTATION_ACTIVITY_SET,
     METAGENOME_SEQUENCING_ACTIVITY_SET,
     METAGENOME_ASSEMBLY_SET,
@@ -37,7 +37,6 @@
 
 
 
-
 def get_omics_processing_id(db_record: Dict) -> str:
     """
     Get the ID of the OmicsProcessing record in the given Database instance.
@@ -71,12 +70,3 @@ def get_data_object_record_by_id(db_record: Dict, id: str)-> Optional[Dict]:
     elif len(data_objects) > 1:
         raise ValueError(f"Multiple data objects found with id: {id}")
     return data_objects[0]
-
-
-
-
-
-
-
-
-
diff --git a/nmdc_automation/re_iding/file_utils.py b/nmdc_automation/re_iding/file_utils.py
@@ -35,17 +35,66 @@ def find_data_object_type(data_object_rec: Dict)-> Optional[str]:
     Returns:
     - str: The determined data type or None if the type could not be determined.
     """
+    url = data_object_rec.get("url")
+    name = data_object_rec.get("name")
     if "data_object_type" in data_object_rec:
         return data_object_rec["data_object_type"]
-    url = data_object_rec["url"]
-    if url.endswith("_covstats.txt"):
+    elif url:
+        return _infer_data_object_type_from_url(data_object_rec)
+    elif name:
+        return _infer_data_object_type_from_name(data_object_rec)
+    else:
+        logger.error(f"Could not determine data object type for: {data_object_rec}")
+        return None
+
+
+
+def _infer_data_object_type_from_url(data_object_rec: Dict) -> Optional[str]:
+    """
+    Determine the data_object_type for a DO record based on its URL extension.
+
+    Args:
+    - data_object_rec (dict): Data object record whose 'url' suffix (and, for
+    .fastq.gz files, 'description') is inspected to determine the data type.
+
+    Returns:
+    - str: The determined data type or None if the type could not be determined.
+    """
+    if data_object_rec['url'].endswith("_covstats.txt"):
         return "Assembly Coverage Stats"
-    elif url.endswith("_gottcha2_report.tsv"):
+    elif data_object_rec['url'].endswith("_gottcha2_report.tsv"):
         return "GOTTCHA2 Classification Report"
-    elif url.endswith("_gottcha2_report_full.tsv"):
+    elif data_object_rec['url'].endswith("_gottcha2_report_full.tsv"):
         return "GOTTCHA2 Report Full"
+    elif data_object_rec['url'].endswith(".fastq.gz") and "Filtered Reads" in data_object_rec['description']:
+        return "Filtered Sequencing Reads"
     else:
-        logger.error(f"Missing type: {url}")
+        logger.error(f"Cannot infer type from url for: {data_object_rec}")
+        return None
+
+def _infer_data_object_type_from_name(data_object_rec: Dict) -> Optional[str]:
+    """
+    Determine the data_object_type for a DO record based on its name.
+
+    Args:
+    - data_object_rec (dict): Dictionary containing the 'name' key which
+    will be inspected to determine the data type.
+
+    Returns:
+    - str: The determined data type or None if the type could not be determined.
+    """
+    if data_object_rec['name'] == "mapping_stats.txt":
+        return "Assembly Coverage Stats"
+    elif data_object_rec['name'] == "assembly_contigs.fna":
+        return "Assembly Contigs"
+    elif data_object_rec['name'] == "assembly_scaffolds.fna":
+        return "Assembly Scaffolds"
+    elif data_object_rec['name'] == "assembly.agp":
+        return "Assembly AGP"
+    elif data_object_rec['name'] == "pairedMapped_sorted.bam":
+        return "Assembly Coverage BAM"
+    else:
+        logger.error(f"Cannot infer type from name for: {data_object_rec}")
         return None
 
 def md5_sum(fn: str) -> str:
@@ -191,11 +240,10 @@ def assembly_file_operations(data_object_record, data_object_type,
     return md5, size
 
 
-def compute_new_paths_and_link(
+def compute_new_data_file_path(
         old_url: str,
         new_base_dir: Union[str, os.PathLike],
         act_id: str,
-        old_base_dir: Optional[Union[str, os.PathLike]] = None,
 ) -> Path:
     """
     Compute the new path for the file based on the old url and the new base directory.
@@ -205,19 +253,19 @@ def compute_new_paths_and_link(
     file_extenstion = file_name.lstrip("nmdc_").split("_", maxsplit=1)[-1]
     new_file_name = f"{act_id}_{file_extenstion}"
     modified_new_file_name = new_file_name.replace(":", "_")
-    destination = Path(new_base_dir, modified_new_file_name)
+    new_data_file_path = Path(new_base_dir, modified_new_file_name)
 
-    if old_base_dir is None:
-        return destination
-    # create a link between the old file and the new file if old_base_dir is provided
+    return new_data_file_path
+
+
+def link_data_file_paths(old_url: str, old_base_dir: Union[str, os.PathLike], new_path: Union[str, os.PathLike])-> \
+        None:
     suffix = old_url.split("https://data.microbiomedata.org/data/")[1]
     old_file_path = Path(old_base_dir, suffix)
     try:
-        os.link(old_file_path, destination)
-        logging.info(f"Successfully created link between {old_file_path} and {destination}")
+        os.link(old_file_path, new_path)
+        logging.info(f"Successfully created link between {old_file_path} and {new_path}")
     except OSError as e:
         logging.error(f"An error occurred while linking the file: {e}")
     except Exception as e:
-        logging.error(f"Unexpected error: {e}")
-
-    return destination
+        logging.error(f"Unexpected error: {e}")