diff --git a/.gitignore b/.gitignore index 1db8504a..60d9ec3a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ htmlcov/ attic .idea/ +configs/.local_napa_config.toml +nmdc_automation/re_iding/scripts/data/dryrun_data/ diff --git a/configs/re_iding_worklfows.yaml b/configs/re_iding_worklfows.yaml index a763e3e0..49801ac4 100644 --- a/configs/re_iding_worklfows.yaml +++ b/configs/re_iding_worklfows.yaml @@ -184,4 +184,37 @@ Workflows: data_object_type: Read Based Analysis Info File description: Read based analysis info for {id} name: File containing reads based analysis information - suffix: profiler.info \ No newline at end of file + suffix: profiler.info + + - Name: Metatranscriptome Activity + Type: nmdc:MetatranscriptomeActivity + Enabled: False + Git_repo: https://github.com/microbiomedata/MetatranscriptomeActivity + Version: 0.0.0 + Collection: metatranscriptome_activity_set + ActivityRange: MetatranscriptomeActivity + Predecessors: + - Reads QC + - Reads QC Interleave + Inputs: + input_file: do:Filtered Sequencing Reads + proj: "{activity_id}" + Activity: + name: "Metatranscriptome Activity for {id}" + type: nmdc:MetatranscriptomeActivity + Outputs: + - output: read_count_and_rpkm + name: Read count and RPKM + suffix: ".json" + data_object_type: Read Count and RPKM + description: "Read count and RPKM for {id}" + - output: qc_non_rRNA_R1 + name: Non-rRNA reads R1 + suffix: "filtered_R1.fastq" + data_object_type: QC non-rRNA R1 + description: "R1 reads without the ribosomal sequences for {id}" + - output: qc_non_rRNA_R2 + name: Non-rRNA reads R2 + suffix: "filtered_R2.fastq" + data_object_type: QC non-rRNA R2 + description: "R2 reads without the ribosomal sequences for {id}" diff --git a/nmdc_automation/api/nmdcapi.py b/nmdc_automation/api/nmdcapi.py index aa386997..9e32e02b 100755 --- a/nmdc_automation/api/nmdcapi.py +++ b/nmdc_automation/api/nmdcapi.py @@ -374,66 +374,6 @@ def request(self, method, url_path, params_or_json_data=None): rv.raise_for_status() return rv - def get_omics_processing_records_for_nmdc_study(self, nmdc_study_id: str): - """ - Retrieve all OmicsProcessing records for the given NMDC study ID. - """ - url = "queries:run" - params = {"find": "omics_processing_set", - "filter": {"part_of": {"$elemMatch": {"$eq": nmdc_study_id}}}} - response = self.request("POST", url, params_or_json_data=params) - if response.status_code != 200: - raise Exception( - f"Error retrieving OmicsProcessing records for study {nmdc_study_id}" - ) - omics_processing_records = response.json()["cursor"]["firstBatch"] - return omics_processing_records - - def get_workflow_activity_informed_by(self, workflow_activity_set: str, - informed_by_id: str): - """ - Retrieve a workflow activity record for the given workflow activity set - and informed by a given OmicsProcessing ID. - """ - url = "queries:run" - params = {"find": workflow_activity_set, - "filter": {"was_informed_by": informed_by_id}} - response = self.request("POST", url, params_or_json_data=params) - if response.status_code != 200: - raise Exception( - f"Error retrieving {workflow_activity_set} record informed by {informed_by_id}" - ) - workflow_activity_record = response.json()["cursor"]["firstBatch"] - return workflow_activity_record - - def get_data_objects_by_description(self, description: str): - """ - Retrieve data objects the given description in its description. - """ - response = self.request( - "POST", - "queries:run", - params_or_json_data={ - "find": "data_object_set", - "filter": {"description": {"$regex": description, "$options": "i"}}, - }, - ) - response.raise_for_status() - return response.json()["cursor"]["firstBatch"] - - def get_data_object_by_id(self, data_object_id: str): - """ - Retrieve a data object record for the given data object ID. - """ - url = f"data_objects/{data_object_id}" - response = self.request("GET", url) - if response.status_code != 200: - raise Exception( - f"Error retrieving data object record for {data_object_id}" - ) - data_object_record = response.json() - return data_object_record - def run_query(self, query: dict): """ Function to run a query using the Microbiome Data API. diff --git a/nmdc_automation/nmdc_common/__init__.py b/nmdc_automation/nmdc_common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nmdc_automation/nmdc_common/client.py b/nmdc_automation/nmdc_common/client.py new file mode 100644 index 00000000..9042cb20 --- /dev/null +++ b/nmdc_automation/nmdc_common/client.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +"""Client for the NMDC API.""" +# TODO: move all of this to a separate project nmdc-common. But for now, just +# copy it here. + +import logging +import requests +from typing import Optional + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class NmdcApi: + """ + Basic API Client for GET requests. + """ + + def __init__(self, api_base_url): + if not api_base_url.endswith("/"): + api_base_url += "/" + self.base_url = api_base_url + self.headers = {'accept': 'application/json', 'Content-Type': 'application/json'} + + + def get_biosamples_part_of_study(self, study_id: str) -> list[dict]: + """ + Get the biosamples that are part of a study. + """ + biosample_records = [] + params = { + 'filter': '{"part_of": "'+study_id+'"}', + 'max_page_size': '1000', + } + url = self.base_url + "nmdcschema/biosample_set" + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + biosample_records.extend(response.json()["resources"]) + # Get the next page of results, if any + while response.json().get("next_page_token") is not None: + params['page_token'] = response.json()["next_page_token"] + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + biosample_records.extend(response.json()["resources"]) + + + return biosample_records + + def get_omics_processing_records_part_of_study(self, study_id: str) -> list[dict]: + """ + Get the OmicsProcessing records that are part of a study. + """ + omics_processing_records = [] + params = { + 'filter': '{"part_of": "'+study_id+'"}', + 'max_page_size': '1000', + } + url = self.base_url + "nmdcschema/omics_processing_set" + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + omics_processing_records.extend(response.json()["resources"]) + # Get the next page of results, if any + while response.json().get("next_page_token") is not None: + params['page_token'] = response.json()["next_page_token"] + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + omics_processing_records.extend(response.json()["resources"]) + return omics_processing_records + + def get_workflow_activities_informed_by(self, workflow_activity_set: str, + informed_by_id: str) -> list[dict]: + """ + Retrieve workflow activity record(s) for the given workflow + activity set and informed by a given OmicsProcessing ID. + """ + params = { + 'filter': '{"was_informed_by": "'+informed_by_id+'"}', + } + url = self.base_url + "nmdcschema/" + workflow_activity_set + response = requests.get(url, params=params, headers=self.headers) + logger.info(response.url) + response.raise_for_status() + workflow_activity_record = response.json()["resources"] + return workflow_activity_record + + def get_data_object(self, data_object_id: str) -> Optional[dict]: + """ + Retrieve a data object record by ID. + """ + url = self.base_url + "nmdcschema/data_object_set/" + data_object_id + try: + response = requests.get(url, headers=self.headers) + response.raise_for_status() + data_object_record = response.json() + except requests.exceptions.HTTPError as err: + if err.response.status_code == 404: + return None + else: + raise + return data_object_record + + def get_data_objects_by_description(self, description: str): + """ + Retrieve data object records by description. + """ + params = { + 'filter': '{"description.search": "'+description+'"}', + } + url = self.base_url + "data_objects" + response = requests.get(url, params=params, headers=self.headers) + response.raise_for_status() + data_object_records = response.json()["results"] + return data_object_records diff --git a/nmdc_automation/re_iding/base.py b/nmdc_automation/re_iding/base.py index ea9493f0..d00ab29e 100644 --- a/nmdc_automation/re_iding/base.py +++ b/nmdc_automation/re_iding/base.py @@ -18,13 +18,13 @@ from nmdc_automation.re_iding.db_utils import (OMICS_PROCESSING_SET, READS_QC_SET, METAGENOME_ASSEMBLY_SET, + METATRANSCRIPTOME_ACTIVITY_SET, check_for_single_omics_processing_record, get_data_object_record_by_id, get_omics_processing_id) from nmdc_automation.re_iding.file_utils import (find_data_object_type, - compute_new_paths, - get_new_paths, - assembly_file_operations) + compute_new_paths_and_link, + assembly_file_operations) NAPA_TEMPLATE = "../../../configs/re_iding_worklfows.yaml" DATA_BASE_URL = "https://data.microbiomedata.org/data" @@ -60,6 +60,8 @@ def _workflow_template_for_type(self, workflow_type: str) -> Dict: workflow_type = workflow_type.replace("QC", "Qc") if workflow_type == "nmdc:ReadbasedAnalysis": workflow_type = "nmdc:ReadBasedTaxonomyAnalysisActivity" + if workflow_type == "nmdc:MetaT": + workflow_type = "nmdc:MetatranscriptomeActivity" for t in self.workflow_template: type = t["Type"] @@ -163,7 +165,7 @@ def update_reads_qc_analysis_activity_set(self, db_record: Dict, logger.info(f"old_do_id: {old_do_id}") old_do_rec = get_data_object_record_by_id(db_record, old_do_id) data_object_type = find_data_object_type(old_do_rec) - new_file_path = compute_new_paths( + new_file_path = compute_new_paths_and_link( old_do_rec["url"], new_readsqc_base_dir, new_activity_id, self.data_dir ) logging.info(f"New file path computed for {data_object_type}: {new_file_path}") @@ -221,7 +223,7 @@ def update_metagenome_assembly_set(self, db_record: Dict, data_object_type = find_data_object_type(old_do_rec) if not data_object_type: continue - new_file_path = get_new_paths(old_do_rec["url"],new_assembly_base_dir, new_activity_id) + new_file_path = compute_new_paths_and_link(old_do_rec["url"], new_assembly_base_dir, new_activity_id) updated_md5, updated_file_size = assembly_file_operations( old_do_rec, data_object_type, new_file_path, new_activity_id, self.data_dir) @@ -284,7 +286,7 @@ def update_read_based_taxonomy_analysis_activity_set(self, db_record: Dict, data_object_type = find_data_object_type(old_do_rec) if not data_object_type: continue - new_file_path = compute_new_paths( + new_file_path = compute_new_paths_and_link( old_do_rec["url"], new_readbased_base_dir, new_activity_id, self.data_dir ) logging.info(f"New file path computed for {data_object_type}: {new_file_path}") @@ -314,7 +316,81 @@ def update_read_based_taxonomy_analysis_activity_set(self, db_record: Dict, new_db.read_based_taxonomy_analysis_activity_set.append(new_read_based) return new_db - + + def update_metatranscriptome_activity_set(self, db_record: Dict, + new_db: NmdcDatabase) -> (NmdcDatabase): + """ + Return a new Database instance with the metatranscriptome_activity_set + and its data objects updated to new IDs. + """ + logger.info(f"Updating metatranscriptome_activity_set for " + f"{db_record[OMICS_PROCESSING_SET][0]['id']}") + new_omics_processing = new_db.omics_processing_set[0] + + for metatranscriptome_rec in db_record[METATRANSCRIPTOME_ACTIVITY_SET]: + # old records have non-conforming type e.g. nmdc:MetaT, + # nmdc:metaT etc. - fix it + activity_type = "nmdc:MetatranscriptomeActivity" + metatranscriptome_rec["type"] = activity_type + omics_processing_id = new_omics_processing.id + has_input = [self._get_input_do_id(new_db, "Filtered Sequencing Reads")] + + + + new_activity_id = self.api_client.minter(activity_type) + "." + self.workflow_iteration + logging.info(f"New activity id created for {omics_processing_id} activity type {activity_type}: {new_activity_id}") + new_metatranscriptome_base_dir = os.path.join(self.data_dir, omics_processing_id, + new_activity_id) + logging.info(f"New metatranscriptome base dir: {new_metatranscriptome_base_dir}") + os.makedirs(new_metatranscriptome_base_dir, exist_ok=True) + + updated_has_output = [] + # Get Metatranscriptome data objects and update IDs + for old_do_id in metatranscriptome_rec["has_output"]: + logger.info(f"old_do_id: {old_do_id}") + old_do_rec = get_data_object_record_by_id(db_record, old_do_id) + # there are some data objects that are not in the database + if not old_do_rec: + logger.warning(f"Data object record not found for {old_do_id}") + continue + + data_object_type = find_data_object_type(old_do_rec) + logging.info(f"data_object_type: {data_object_type}") + # TODO: how do we handle data objects w/o type? + if not data_object_type: + logger.warning(f"Data object type not found for {old_do_id}") + # continue + # link data object to new location + new_file_path = compute_new_paths_and_link( + old_do_rec["url"], new_metatranscriptome_base_dir, new_activity_id, self.data_dir) + logging.info(f"New file path computed for {data_object_type}: {new_file_path}") + + new_do = self.make_new_data_object( + omics_processing_id, activity_type, new_activity_id, old_do_rec, data_object_type + ) + # add new data object to new database and update has_output + new_db.data_object_set.append(new_do) + updated_has_output.append(new_do.id) + + # Get new Metatranscriptome activity set + new_metatranscriptome = self._make_new_activity_set_object( + omics_processing_id, new_activity_id, metatranscriptome_rec, has_input, + updated_has_output + ) + # update activity-specific properties + # get new_metatranscriptome properties with no set value + unset_properties = [ + p for p in new_metatranscriptome.__dict__ if not new_metatranscriptome.__dict__[p] + ] + # check for that value in old record + for p in unset_properties: + if p in metatranscriptome_rec: + setattr(new_metatranscriptome, p, metatranscriptome_rec[p]) + + new_db.metatranscriptome_activity_set.append(new_metatranscriptome) + return new_db + + def _get_input_do_id(self, new_db, data_object_type: str): """Returns the string representation of a data object id given data object type""" @@ -334,6 +410,8 @@ def _make_new_activity_set_object(self, omics_processing_id: str, new_activity_i activity_type = activity_set_rec["type"].replace("QC", "Qc") if activity_type == "nmdc:ReadbasedAnalysis": activity_type = "nmdc:ReadBasedTaxonomyAnalysisActivity" + if activity_type == "nmdc:MetaT": + activity_type = "nmdc:MetatranscriptomeActivity" template = self._workflow_template_for_type(activity_type) activity_class = getattr(nmdc, template["ActivityRange"]) diff --git a/nmdc_automation/re_iding/db_utils.py b/nmdc_automation/re_iding/db_utils.py index dee607eb..23f06e3e 100644 --- a/nmdc_automation/re_iding/db_utils.py +++ b/nmdc_automation/re_iding/db_utils.py @@ -3,8 +3,9 @@ db_utils.py: Provides utility functions for working with NMDC Database records and data objects as dicts. """ +import logging from dataclasses import dataclass -from typing import Dict, List +from typing import Dict, Optional from nmdc_schema.nmdc import Database, DataObject # Some constants for set names we care about @@ -12,6 +13,7 @@ DATA_OBJECT_SET = "data_object_set" READS_QC_SET = "read_qc_analysis_activity_set" METAGENOME_ASSEMBLY_SET = "metagenome_assembly_set" +METATRANSCRIPTOME_ACTIVITY_SET = "metatranscriptome_activity_set" @@ -38,13 +40,14 @@ def check_for_single_omics_processing_record(db_record: Dict) -> bool: raise ValueError("Multiple omics_processing_set found in db_record") return True -def get_data_object_record_by_id(db_record: Dict, id: str)-> Dict: +def get_data_object_record_by_id(db_record: Dict, id: str)-> Optional[Dict]: """ Return the data object record with the given ID. """ data_objects = [d for d in db_record[DATA_OBJECT_SET] if d["id"] == id] if len(data_objects) == 0: - raise ValueError(f"No data object found with id: {id}") + logging.warning(f"No data object found with id: {id}") + return None elif len(data_objects) > 1: raise ValueError(f"Multiple data objects found with id: {id}") return data_objects[0] diff --git a/nmdc_automation/re_iding/file_utils.py b/nmdc_automation/re_iding/file_utils.py index b5bbae2e..5e7baead 100644 --- a/nmdc_automation/re_iding/file_utils.py +++ b/nmdc_automation/re_iding/file_utils.py @@ -3,17 +3,18 @@ file_utils.py: Provides utility functions for working with files. """ import logging +from pathlib import Path import os import hashlib import json import gzip from subprocess import check_output -from typing import Dict, Optional +from typing import Dict, Optional, Union, Tuple # BASE_DIR = "/global/cfs/cdirs/m3408/results" -bam_script = os.path.abspath("rewrite_bam.sh") -base = "https://data.microbiomedata.org/data" +BAM_SCRIPT = Path("rewrite_bam.sh").resolve() +API_BASE_URL = "https://data.microbiomedata.org/data/" logging.basicConfig( level=logging.INFO, @@ -47,7 +48,7 @@ def find_data_object_type(data_object_rec: Dict)-> Optional[str]: logger.error(f"Missing type: {url}") return None -def md5_sum(fn): +def md5_sum(fn: str) -> str: """ Calculate the MD5 hash of a file. @@ -64,7 +65,7 @@ def md5_sum(fn): return file_hash.hexdigest() -def read_json_file(filename): +def read_json_file(filename: str)-> Dict[str, str]: """ Read a JSON file and return its content as a dictionary. @@ -79,7 +80,8 @@ def read_json_file(filename): return data -def rewrite_id(src, dst, old_id, new_id, prefix=None): +def rewrite_id(src: str, dst: str, old_id: str, new_id: str, prefix: str = +None) -> Tuple[str, int]: """ Rewrite lines in a file, replacing occurrences of an old ID with a new ID. An optional prefix can be specified to limit which lines are modified. @@ -114,7 +116,7 @@ def find_assembly_id(src): def assembly_contigs(src, dst, act_id): - scaf = src.replace("_contigs", "_scaffolds") + scaf = str(src).replace("_contigs", "_scaffolds") old_id = find_assembly_id(scaf) return rewrite_id(src, dst, old_id, act_id, prefix=">") @@ -125,13 +127,13 @@ def assembly_scaffolds(src, dst, act_id): def assembly_coverage_stats(src, dst, act_id): - scaf = src.replace("_covstats.txt", "_scaffolds.fna") + scaf = str(src).replace("_covstats.txt", "_scaffolds.fna") old_id = find_assembly_id(scaf) return rewrite_id(src, dst, old_id, act_id) def assembly_agp(src, dst, act_id): - scaf = src.replace("_assembly.agp", "_scaffolds.fna") + scaf = str(src).replace("_assembly.agp", "_scaffolds.fna") old_id = find_assembly_id(scaf) return rewrite_id(src, dst, old_id, act_id) @@ -145,7 +147,7 @@ def convert_script(script, src, dst, old_id, act_id): def assembly_coverage_bam(script, src, dst, act_id): - scaf = src.replace("_pairedMapped_sorted.bam", "_scaffolds.fna") + scaf = str(src).replace("_pairedMapped_sorted.bam", "_scaffolds.fna") old_id = find_assembly_id(scaf) return convert_script(script, src, dst, old_id, act_id) @@ -156,18 +158,22 @@ def rewrite_sam(input_sam, output_sam, old_id, new_id): f_out.write(line.replace(old_id, new_id)) -def get_old_file_path(data_object_record, old_base_dir): +def get_old_file_path(data_object_record: dict, old_base_dir: Union[str, os.PathLike]) -> Path: old_url = data_object_record["url"] suffix = old_url.split("https://data.microbiomedata.org/data/")[1] - old_file_path = old_base_dir + "/" + suffix + old_file_path = Path(old_base_dir, suffix) return old_file_path def assembly_file_operations(data_object_record, data_object_type, destination, act_id, old_base_dir): + logging.info(f"Processing {data_object_type} for {act_id}") + logging.info(f"Destination: {destination}") + logging.info(f"Old base dir: {old_base_dir}") # get old file path upfront old_file_path = get_old_file_path(data_object_record, old_base_dir) + logging.info(f"Old file path: {old_file_path}") if data_object_type == "Assembly Coverage Stats": md5, size = assembly_coverage_stats(old_file_path, destination, act_id) @@ -179,36 +185,33 @@ def assembly_file_operations(data_object_record, data_object_type, md5, size = assembly_agp(old_file_path, destination, act_id) elif data_object_type == "Assembly Coverage BAM": md5, size = assembly_coverage_bam( - bam_script, old_file_path, destination, act_id + BAM_SCRIPT, old_file_path, destination, act_id ) return md5, size -def get_new_paths(old_url, new_base_dir, act_id): - """ - Use the url to return the string value of name path and url - """ - file_name = old_url.split("/")[-1] - file_extenstion = file_name.lstrip("nmdc_").split("_", maxsplit=1)[-1] - new_file_name = f"{act_id}_{file_extenstion}" - modified_new_file_name = new_file_name.replace(":", "_") - destination = os.path.join(new_base_dir, modified_new_file_name) - return destination - - -def compute_new_paths(old_url, new_base_dir, act_id, old_base_dir): +def compute_new_paths_and_link( + old_url: str, + new_base_dir: Union[str, os.PathLike], + act_id: str, + old_base_dir: Optional[Union[str, os.PathLike]] = None, +) -> Path: """ - Use the url to compute the new file name path and url + Compute the new path for the file based on the old url and the new base directory. + If the old base directory is provided, create a link between the old file and the new file. """ file_name = old_url.split("/")[-1] - suffix = old_url.split("https://data.microbiomedata.org/data/")[1] - old_file_path = old_base_dir + "/" + suffix file_extenstion = file_name.lstrip("nmdc_").split("_", maxsplit=1)[-1] new_file_name = f"{act_id}_{file_extenstion}" modified_new_file_name = new_file_name.replace(":", "_") - destination = os.path.join(new_base_dir, modified_new_file_name) + destination = Path(new_base_dir, modified_new_file_name) + if old_base_dir is None: + return destination + # create a link between the old file and the new file if old_base_dir is provided + suffix = old_url.split("https://data.microbiomedata.org/data/")[1] + old_file_path = Path(old_base_dir, suffix) try: os.link(old_file_path, destination) logging.info(f"Successfully created link between {old_file_path} and {destination}") @@ -217,5 +220,4 @@ def compute_new_paths(old_url, new_base_dir, act_id, old_base_dir): except Exception as e: logging.error(f"Unexpected error: {e}") - return destination \ No newline at end of file diff --git a/nmdc_automation/re_iding/nmdc:sty-11-aygzgv51_updated_records_example_dry_run.json b/nmdc_automation/re_iding/nmdc:sty-11-aygzgv51_updated_records_example_dry_run.json deleted file mode 100644 index 4d4beae5..00000000 --- a/nmdc_automation/re_iding/nmdc:sty-11-aygzgv51_updated_records_example_dry_run.json +++ /dev/null @@ -1,243 +0,0 @@ -{ - "data_object_set": [ - { - "id": "nmdc:dobj-12-3zw9ed72", - "name": "Reads QC result fastq (clean data)", - "description": "Filtered Reads for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 2571324879, - "md5_checksum": "7bf778baef033d36f118f8591256d6ef", - "data_object_type": "Filtered Sequencing Reads", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-12-7bfbg150/nmdc_wfrqc-12-7bfbg150_filtered.fastq.gz", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-f992g204", - "name": "Reads QC summary statistics", - "description": "Filtered Stats for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 290, - "md5_checksum": "b99ce8adc125c95f0bfdadf36a3f6848", - "data_object_type": "QC Statistics", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-12-7bfbg150/nmdc_wfrqc-12-7bfbg150_filterStats.txt", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-g19r1r98", - "name": "Final assembly contigs fasta", - "description": "Assembled contigs fasta for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 90794959, - "md5_checksum": "148dffaee63c7eccc62db4022d916fe1", - "data_object_type": "Assembly Contigs", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-12-nv5zhv63/nmdc:wfmgas-12-nv5zhv63_contigs.fna", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-6qs0sb92", - "name": "Final assembly scaffolds fasta", - "description": "Assembled scaffold fasta for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 90283295, - "md5_checksum": "5f66de5a1fa911f3f5e2e4027af8bb8c", - "data_object_type": "Assembly Scaffolds", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-12-nv5zhv63/nmdc:wfmgas-12-nv5zhv63_scaffolds.fna", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-0aqx6k97", - "name": "Assembled contigs coverage information", - "description": "Metagenome Contig Coverage Stats for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 14091491, - "md5_checksum": "b2c4779abd596ab9604b06687f804360", - "data_object_type": "Assembly Coverage Stats", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-12-nv5zhv63/nmdc:wfmgas-12-nv5zhv63_covstats.txt", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-mxghxv82", - "name": "An AGP format file that describes the assembly", - "description": "Assembled AGP file for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 13901555, - "md5_checksum": "88e65190df33ce4082a224075e8e0ff4", - "data_object_type": "Assembly AGP", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-12-nv5zhv63/nmdc:wfmgas-12-nv5zhv63_assembly.agp", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-evyw0b45", - "name": "Sorted bam file of reads mapping back to the final assembly", - "description": "Metagenome Alignment BAM file for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 0, - "md5_checksum": "d41d8cd98f00b204e9800998ecf8427e", - "data_object_type": "Assembly Coverage BAM", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-12-nv5zhv63/nmdc:wfmgas-12-nv5zhv63_pairedMapped_sorted.bam", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-a1c08w33", - "name": "GOTTCHA2 classification report file", - "description": "Gottcha2 TSV report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 13174, - "md5_checksum": "bc7c1bda004aab357c8f6cf5a42242f9", - "data_object_type": "GOTTCHA2 Classification Report", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_gottcha2_report.tsv", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-y7y6d813", - "name": "GOTTCHA2 report file", - "description": "Gottcha2 full TSV report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 1035818, - "md5_checksum": "9481434cadd0d6c154e2ec4c11ef0e04", - "data_object_type": "GOTTCHA2 Report Full", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_gottcha2_report_full.tsv", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-5aweg275", - "name": "GOTTCHA2 krona plot HTML file", - "description": "Gottcha2 Krona HTML report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 262669, - "md5_checksum": "6b5bc6ce7f11c1336a5f85a98fc18541", - "data_object_type": "GOTTCHA2 Krona Plot", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_gottcha2_krona.html", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-1xzbt082", - "name": "Centrifuge output read classification file", - "description": "Centrifuge classification TSV report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 2189843623, - "md5_checksum": "933c71bbc2f4a2e84d50f0d3864cf940", - "data_object_type": "Centrifuge Taxonomic Classification", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_centrifuge_classification.tsv", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-zaj6ec82", - "name": "Centrifuge output report file", - "description": "Centrifuge TSV report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 260134, - "md5_checksum": "1a208e2519770ef50740ac39f1b9ba9a", - "data_object_type": "Centrifuge Classification Report", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_centrifuge_report.tsv", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-gnrpbj27", - "name": "Centrifug krona plot HTML file", - "description": "Centrifuge Krona HTML report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 2343980, - "md5_checksum": "f112a3840464ae7a9cf4a3bf295edd5c", - "data_object_type": "Centrifuge Krona Plot", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_centrifuge_krona.html", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-avm4zb05", - "name": "Kraken2 output read classification file", - "description": "Kraken classification TSV report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 1785563917, - "md5_checksum": "7ca01ea379f0baed96f87d1435925f95", - "data_object_type": "Kraken2 Taxonomic Classification", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_kraken2_classification.tsv", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-mf77cq08", - "name": "Kraken2 output report file", - "description": "Kraken2 TSV report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 699896, - "md5_checksum": "c85f2f2b4a518c4adb23970448a5cb45", - "data_object_type": "Kraken2 Classification Report", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_kraken2_report.tsv", - "type": "nmdc:DataObject" - }, - { - "id": "nmdc:dobj-12-gz9j7220", - "name": "Kraken2 Krona plot HTML file", - "description": "Kraken2 Krona HTML report for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 4221977, - "md5_checksum": "94ee1bc2dc74830a21d5c3471d6cf223", - "data_object_type": "Kraken2 Krona Plot", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-12-bw74zf55/nmdc_wfrbt-12-bw74zf55_kraken2_krona.html", - "type": "nmdc:DataObject" - } - ], - "metagenome_assembly_set": [ - { - "id": "nmdc:wfmgas-12-nv5zhv63", - "name": "Metagenome Assembly Activity for nmdc:omprc-11-bn8jcq58", - "started_at_time": "2021-10-11T02:28:26Z", - "ended_at_time": "2021-10-11T04:56:04+00:00", - "was_informed_by": "nmdc:omprc-11-bn8jcq58", - "execution_resource": "NERSC - Perlmutter", - "git_url": "https://github.com/microbiomedata/metaAssembly", - "has_input": [ - "nmdc:dobj-12-3zw9ed72" - ], - "has_output": [ - "nmdc:dobj-12-g19r1r98", - "nmdc:dobj-12-6qs0sb92", - "nmdc:dobj-12-0aqx6k97", - "nmdc:dobj-12-mxghxv82", - "nmdc:dobj-12-evyw0b45" - ], - "type": "nmdc:MetagenomeAssembly", - "part_of": [ - "nmdc:omprc-11-bn8jcq58" - ], - "version": "v1.0.3" - } - ], - "read_qc_analysis_activity_set": [ - { - "id": "nmdc:wfrqc-12-7bfbg150", - "name": "Read QC Activity for nmdc:omprc-11-bn8jcq58", - "started_at_time": "2021-10-11T02:28:26Z", - "ended_at_time": "2021-10-11T04:56:04+00:00", - "was_informed_by": "nmdc:omprc-11-bn8jcq58", - "execution_resource": "NERSC - Perlmutter", - "git_url": "https://github.com/microbiomedata/ReadsQC", - "has_input": [ - "nmdc:30a06664f29cffbbbc49abad86eae6fc" - ], - "has_output": [ - "nmdc:dobj-12-3zw9ed72", - "nmdc:dobj-12-f992g204" - ], - "type": "nmdc:ReadQcAnalysisActivity", - "part_of": [ - "nmdc:omprc-11-bn8jcq58" - ], - "version": "v1.0.8" - } - ], - "read_based_taxonomy_analysis_activity_set": [ - { - "id": "nmdc:wfrbt-12-bw74zf55", - "name": "Readbased Taxonomy Analysis Activity for nmdc:omprc-11-bn8jcq58", - "started_at_time": "2021-10-11T02:28:26Z", - "ended_at_time": "2021-10-11T04:56:04+00:00", - "was_informed_by": "nmdc:omprc-11-bn8jcq58", - "execution_resource": "NERSC - Perlmutter", - "git_url": "https://github.com/microbiomedata/ReadbasedAnalysis", - "has_input": [ - "nmdc:dobj-12-3zw9ed72" - ], - "has_output": [ - "nmdc:dobj-12-a1c08w33", - "nmdc:dobj-12-y7y6d813", - "nmdc:dobj-12-5aweg275", - "nmdc:dobj-12-1xzbt082", - "nmdc:dobj-12-zaj6ec82", - "nmdc:dobj-12-gnrpbj27", - "nmdc:dobj-12-avm4zb05", - "nmdc:dobj-12-mf77cq08", - "nmdc:dobj-12-gz9j7220" - ], - "type": "nmdc:ReadBasedTaxonomyAnalysisActivity", - "part_of": [ - "nmdc:omprc-11-bn8jcq58" - ], - "version": "v1.0.5" - } - ] -} \ No newline at end of file diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_associated_record_dump.json b/nmdc_automation/re_iding/scripts/data/dryrun_associated_record_dump.json index 64fa1542..45f626d8 100644 --- a/nmdc_automation/re_iding/scripts/data/dryrun_associated_record_dump.json +++ b/nmdc_automation/re_iding/scripts/data/dryrun_associated_record_dump.json @@ -506,6 +506,36 @@ "data_object_type": "Structural Annotation GFF", "type": "nmdc:DataObject" }, + { + "file_size_bytes": 37615194, + "id": "nmdc:d00ab33c0bdcf34789be58df977551f3", + "type": "nmdc:DataObject", + "data_object_type": "Read Count and RPKM", + "md5_checksum": "d00ab33c0bdcf34789be58df977551f3", + "url": "https://data.microbiomedata.org/data/503568_190752/metat_out_json/output.json", + "name": "output.json", + "description": "JSON records of features and the associated read counts and RPKMs for gold:Gp0324008" + }, + { + "file_size_bytes": 2396106683, + "id": "nmdc:2b48a6ce9afc35f4698af47bb57c11cb", + "type": "nmdc:DataObject", + "data_object_type": "QC non-rRNA R1", + "md5_checksum": "2b48a6ce9afc35f4698af47bb57c11cb", + "url": "https://data.microbiomedata.org/data/503568_190752/non_ribo_reads/filtered_R1.fastq", + "name": "filtered_R1.fastq", + "description": "R1 reads without the ribosomal sequences for gold:Gp0324008" + }, + { + "file_size_bytes": 2395867661, + "id": "nmdc:6b171f897e3e447d2e728b9d2573cdcd", + "type": "nmdc:DataObject", + "data_object_type": "QC non-rRNA R2", + "md5_checksum": "6b171f897e3e447d2e728b9d2573cdcd", + "url": "https://data.microbiomedata.org/data/503568_190752/non_ribo_reads/filtered_R2.fastq", + "name": "filtered_R2.fastq", + "description": "R2 reads without the ribosomal sequences for gold:Gp0324008" + }, { "_id": { "$oid": "649b00401ae706d7b5b16da8" @@ -740,6 +770,27 @@ "scaf_pct_gt50k": 0.08160224 } ], + "metatranscriptome_activity_set": [ + { + "git_url": "https://github.com/microbiomedata/metaT/releases/tag/v0.0.2", + "ended_at_time": "2021-01-21T00:00:00Z", + "execution_resource": "NERSC - Cori", + "id": "nmdc:f30c9099e7dfaa6e90f5e3a7cf5112de", + "name": "metaT activity for gold:Gp0324008", + "type": "nmdc:metaT", + "has_output": [ + "nmdc:8ef63c54cbfef72c19733e48ad0d1961", + "nmdc:d00ab33c0bdcf34789be58df977551f3", + "nmdc:2b48a6ce9afc35f4698af47bb57c11cb", + "nmdc:6b171f897e3e447d2e728b9d2573cdcd" + ], + "was_informed_by": "gold:Gp0324008", + "has_input": [ + "nmdc:7bf778baef033d36f118f8591256d6ef" + ], + "started_at_time": "2021-01-21T00:00:00Z" + } + ], "omics_processing_set": [ { "_id": { diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-wh95cd94.1/nmdc_wfmgas-11-wh95cd94.1_contigs.fna b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-wh95cd94.1/nmdc_wfmgas-11-wh95cd94.1_contigs.fna new file mode 100644 index 00000000..77c6782f --- /dev/null +++ b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-wh95cd94.1/nmdc_wfmgas-11-wh95cd94.1_contigs.fna @@ -0,0 +1,2 @@ +nmdc:wfmgas-11-wh95cd94.1>nmdc:wfmgas-11-wh95cd94.1Cnmdc:wfmgas-11-wh95cd94.1onmdc:wfmgas-11-wh95cd94.1nnmdc:wfmgas-11-wh95cd94.1tnmdc:wfmgas-11-wh95cd94.1inmdc:wfmgas-11-wh95cd94.1gnmdc:wfmgas-11-wh95cd94.1_nmdc:wfmgas-11-wh95cd94.10nmdc:wfmgas-11-wh95cd94.10nmdc:wfmgas-11-wh95cd94.10nmdc:wfmgas-11-wh95cd94.11nmdc:wfmgas-11-wh95cd94.1 +nmdc:wfmgas-11-wh95cd94.1 diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-wh95cd94.1/nmdc_wfmgas-11-wh95cd94.1_scaffolds.fna b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-wh95cd94.1/nmdc_wfmgas-11-wh95cd94.1_scaffolds.fna new file mode 100644 index 00000000..cb130d2c --- /dev/null +++ b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-wh95cd94.1/nmdc_wfmgas-11-wh95cd94.1_scaffolds.fna @@ -0,0 +1,2 @@ +nmdc:wfmgas-11-wh95cd94.1>nmdc:wfmgas-11-wh95cd94.1snmdc:wfmgas-11-wh95cd94.1enmdc:wfmgas-11-wh95cd94.1qnmdc:wfmgas-11-wh95cd94.1unmdc:wfmgas-11-wh95cd94.1enmdc:wfmgas-11-wh95cd94.1nnmdc:wfmgas-11-wh95cd94.1cnmdc:wfmgas-11-wh95cd94.1enmdc:wfmgas-11-wh95cd94.1Inmdc:wfmgas-11-wh95cd94.1Dnmdc:wfmgas-11-wh95cd94.1-nmdc:wfmgas-11-wh95cd94.10nmdc:wfmgas-11-wh95cd94.10nmdc:wfmgas-11-wh95cd94.11nmdc:wfmgas-11-wh95cd94.1 nmdc:wfmgas-11-wh95cd94.1dnmdc:wfmgas-11-wh95cd94.1enmdc:wfmgas-11-wh95cd94.1snmdc:wfmgas-11-wh95cd94.1cnmdc:wfmgas-11-wh95cd94.1rnmdc:wfmgas-11-wh95cd94.1inmdc:wfmgas-11-wh95cd94.1pnmdc:wfmgas-11-wh95cd94.1tnmdc:wfmgas-11-wh95cd94.1inmdc:wfmgas-11-wh95cd94.1onmdc:wfmgas-11-wh95cd94.1nnmdc:wfmgas-11-wh95cd94.1 +nmdc:wfmgas-11-wh95cd94.1ACGT diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_assembly.agp b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_assembly.agp new file mode 100644 index 00000000..e69de29b diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_contigs.fna b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_contigs.fna new file mode 100644 index 00000000..0f734c64 --- /dev/null +++ b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_contigs.fna @@ -0,0 +1,2 @@ +nmdc:wfmgas-11-x3r8km80.1>nmdc:wfmgas-11-x3r8km80.1Cnmdc:wfmgas-11-x3r8km80.1onmdc:wfmgas-11-x3r8km80.1nnmdc:wfmgas-11-x3r8km80.1tnmdc:wfmgas-11-x3r8km80.1inmdc:wfmgas-11-x3r8km80.1gnmdc:wfmgas-11-x3r8km80.1_nmdc:wfmgas-11-x3r8km80.10nmdc:wfmgas-11-x3r8km80.10nmdc:wfmgas-11-x3r8km80.10nmdc:wfmgas-11-x3r8km80.11nmdc:wfmgas-11-x3r8km80.1 +nmdc:wfmgas-11-x3r8km80.1 diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_covstats.txt b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_covstats.txt new file mode 100644 index 00000000..e69de29b diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_pairedMapped_sorted.bam b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_pairedMapped_sorted.bam new file mode 100644 index 00000000..e69de29b diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_scaffolds.fna b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_scaffolds.fna new file mode 100644 index 00000000..8004e191 --- /dev/null +++ b/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-x3r8km80.1/nmdc_wfmgas-11-x3r8km80.1_scaffolds.fna @@ -0,0 +1,2 @@ +nmdc:wfmgas-11-x3r8km80.1>nmdc:wfmgas-11-x3r8km80.1snmdc:wfmgas-11-x3r8km80.1enmdc:wfmgas-11-x3r8km80.1qnmdc:wfmgas-11-x3r8km80.1unmdc:wfmgas-11-x3r8km80.1enmdc:wfmgas-11-x3r8km80.1nnmdc:wfmgas-11-x3r8km80.1cnmdc:wfmgas-11-x3r8km80.1enmdc:wfmgas-11-x3r8km80.1Inmdc:wfmgas-11-x3r8km80.1Dnmdc:wfmgas-11-x3r8km80.1-nmdc:wfmgas-11-x3r8km80.10nmdc:wfmgas-11-x3r8km80.10nmdc:wfmgas-11-x3r8km80.11nmdc:wfmgas-11-x3r8km80.1 nmdc:wfmgas-11-x3r8km80.1dnmdc:wfmgas-11-x3r8km80.1enmdc:wfmgas-11-x3r8km80.1snmdc:wfmgas-11-x3r8km80.1cnmdc:wfmgas-11-x3r8km80.1rnmdc:wfmgas-11-x3r8km80.1inmdc:wfmgas-11-x3r8km80.1pnmdc:wfmgas-11-x3r8km80.1tnmdc:wfmgas-11-x3r8km80.1inmdc:wfmgas-11-x3r8km80.1onmdc:wfmgas-11-x3r8km80.1nnmdc:wfmgas-11-x3r8km80.1 +nmdc:wfmgas-11-x3r8km80.1ACGT diff --git a/nmdc_automation/re_iding/scripts/data/dryrun_re_ided_record_dump.json b/nmdc_automation/re_iding/scripts/data/dryrun_re_ided_record_dump.json index ee84252d..63e6d2b2 100644 --- a/nmdc_automation/re_iding/scripts/data/dryrun_re_ided_record_dump.json +++ b/nmdc_automation/re_iding/scripts/data/dryrun_re_ided_record_dump.json @@ -2,7 +2,7 @@ { "data_object_set": [ { - "id": "nmdc:dobj-11-k7vny888", + "id": "nmdc:dobj-11-77e9jm46", "name": "9422.8.132674.GTTTCG.fastq.gz", "description": "Raw sequencer read data", "file_size_bytes": 2861414297, @@ -10,169 +10,199 @@ "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-019yes10", - "name": "nmdc_wfrqc-11-zma0ys31.1_filtered.fastq.gz", + "id": "nmdc:dobj-11-xwqq5x15", + "name": "nmdc_wfrqc-11-9dahz742.1_filtered.fastq.gz", "description": "Filtered Reads for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 2571324879, "md5_checksum": "7bf778baef033d36f118f8591256d6ef", "data_object_type": "Filtered Sequencing Reads", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-zma0ys31.1/nmdc_wfrqc-11-zma0ys31.1_filtered.fastq.gz", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-9dahz742.1/nmdc_wfrqc-11-9dahz742.1_filtered.fastq.gz", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-hty12n62", - "name": "nmdc_wfrqc-11-zma0ys31.1_filterStats.txt", + "id": "nmdc:dobj-11-23h4kn88", + "name": "nmdc_wfrqc-11-9dahz742.1_filterStats.txt", "description": "Filtered Stats for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 290, "md5_checksum": "b99ce8adc125c95f0bfdadf36a3f6848", "data_object_type": "QC Statistics", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-zma0ys31.1/nmdc_wfrqc-11-zma0ys31.1_filterStats.txt", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-9dahz742.1/nmdc_wfrqc-11-9dahz742.1_filterStats.txt", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-gast3j11", - "name": "nmdc_wfmgas-11-3jvymb63.1_contigs.fna", + "id": "nmdc:dobj-11-gqy02f96", + "name": "nmdc_wfmgas-11-7fe0dg09.1_contigs.fna", "description": "Assembled contigs fasta for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 91134523, - "md5_checksum": "b96c8e7796616a8eefe473bff2c62e52", + "file_size_bytes": 364, + "md5_checksum": "2a0786d07c686d2065575cde034ac10a", "data_object_type": "Assembly Contigs", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-3jvymb63.1/nmdc_wfmgas-11-3jvymb63.1_contigs.fna", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_contigs.fna", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-bkza5366", - "name": "nmdc_wfmgas-11-3jvymb63.1_scaffolds.fna", + "id": "nmdc:dobj-11-e3jbsp53", + "name": "nmdc_wfmgas-11-7fe0dg09.1_scaffolds.fna", "description": "Assembled scaffold fasta for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 90622585, - "md5_checksum": "6ca496a8b9b298278ad2b4010a7c8cb2", + "file_size_bytes": 758, + "md5_checksum": "5f616779d9e3db4c2afd6001c1e95a1a", "data_object_type": "Assembly Scaffolds", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-3jvymb63.1/nmdc_wfmgas-11-3jvymb63.1_scaffolds.fna", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_scaffolds.fna", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-v9xfxp70", - "name": "nmdc_wfmgas-11-3jvymb63.1_covstats.txt", + "id": "nmdc:dobj-11-mennt828", + "name": "nmdc_wfmgas-11-7fe0dg09.1_covstats.txt", "description": "Metagenome Contig Coverage Stats for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 14431055, - "md5_checksum": "19782102f68575b03b7c12dd3d48e840", + "file_size_bytes": 0, + "md5_checksum": "d41d8cd98f00b204e9800998ecf8427e", "data_object_type": "Assembly Coverage Stats", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-3jvymb63.1/nmdc_wfmgas-11-3jvymb63.1_covstats.txt", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_covstats.txt", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-dz2mw103", - "name": "nmdc_wfmgas-11-3jvymb63.1_assembly.agp", + "id": "nmdc:dobj-11-bvgw0f35", + "name": "nmdc_wfmgas-11-7fe0dg09.1_assembly.agp", "description": "Assembled AGP file for nmdc:omprc-11-bn8jcq58", - "file_size_bytes": 14581247, - "md5_checksum": "419b294106e3fca4a06d18fd3c8e9181", + "file_size_bytes": 0, + "md5_checksum": "d41d8cd98f00b204e9800998ecf8427e", "data_object_type": "Assembly AGP", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-3jvymb63.1/nmdc_wfmgas-11-3jvymb63.1_assembly.agp", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_assembly.agp", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-75skzn36", - "name": "nmdc_wfmgas-11-3jvymb63.1_pairedMapped_sorted.bam", + "id": "nmdc:dobj-11-5wnxfs74", + "name": "nmdc_wfmgas-11-7fe0dg09.1_pairedMapped_sorted.bam", "description": "Metagenome Alignment BAM file for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 0, "md5_checksum": "d41d8cd98f00b204e9800998ecf8427e", "data_object_type": "Assembly Coverage BAM", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-3jvymb63.1/nmdc_wfmgas-11-3jvymb63.1_pairedMapped_sorted.bam", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_pairedMapped_sorted.bam", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-ppa5pg23", - "name": "nmdc_wfrbt-11-e79d5x03.1_gottcha2_report.tsv", + "id": "nmdc:dobj-11-3xr4an92", + "name": "nmdc_wfrbt-11-1xhefc54.1_gottcha2_report.tsv", "description": "Gottcha2 TSV report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 13174, "md5_checksum": "bc7c1bda004aab357c8f6cf5a42242f9", "data_object_type": "GOTTCHA2 Classification Report", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_gottcha2_report.tsv", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_report.tsv", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-0yn4b055", - "name": "nmdc_wfrbt-11-e79d5x03.1_gottcha2_report_full.tsv", + "id": "nmdc:dobj-11-jcqgy707", + "name": "nmdc_wfrbt-11-1xhefc54.1_gottcha2_report_full.tsv", "description": "Gottcha2 full TSV report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 1035818, "md5_checksum": "9481434cadd0d6c154e2ec4c11ef0e04", "data_object_type": "GOTTCHA2 Report Full", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_gottcha2_report_full.tsv", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_report_full.tsv", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-ty0z3p61", - "name": "nmdc_wfrbt-11-e79d5x03.1_gottcha2_krona.html", + "id": "nmdc:dobj-11-cc16z813", + "name": "nmdc_wfrbt-11-1xhefc54.1_gottcha2_krona.html", "description": "Gottcha2 Krona HTML report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 262669, "md5_checksum": "6b5bc6ce7f11c1336a5f85a98fc18541", "data_object_type": "GOTTCHA2 Krona Plot", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_gottcha2_krona.html", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_krona.html", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-e6h68y35", - "name": "nmdc_wfrbt-11-e79d5x03.1_centrifuge_classification.tsv", + "id": "nmdc:dobj-11-d0srbc38", + "name": "nmdc_wfrbt-11-1xhefc54.1_centrifuge_classification.tsv", "description": "Centrifuge classification TSV report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 2189843623, "md5_checksum": "933c71bbc2f4a2e84d50f0d3864cf940", "data_object_type": "Centrifuge Taxonomic Classification", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_centrifuge_classification.tsv", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_classification.tsv", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-chgp8k25", - "name": "nmdc_wfrbt-11-e79d5x03.1_centrifuge_report.tsv", + "id": "nmdc:dobj-11-359kgj15", + "name": "nmdc_wfrbt-11-1xhefc54.1_centrifuge_report.tsv", "description": "Centrifuge TSV report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 260134, "md5_checksum": "1a208e2519770ef50740ac39f1b9ba9a", "data_object_type": "Centrifuge Classification Report", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_centrifuge_report.tsv", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_report.tsv", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-0wbjqw24", - "name": "nmdc_wfrbt-11-e79d5x03.1_centrifuge_krona.html", + "id": "nmdc:dobj-11-d74t5230", + "name": "nmdc_wfrbt-11-1xhefc54.1_centrifuge_krona.html", "description": "Centrifuge Krona HTML report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 2343980, "md5_checksum": "f112a3840464ae7a9cf4a3bf295edd5c", "data_object_type": "Centrifuge Krona Plot", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_centrifuge_krona.html", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_krona.html", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-xteq6n75", - "name": "nmdc_wfrbt-11-e79d5x03.1_kraken2_classification.tsv", + "id": "nmdc:dobj-11-k27fbb42", + "name": "nmdc_wfrbt-11-1xhefc54.1_kraken2_classification.tsv", "description": "Kraken classification TSV report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 1785563917, "md5_checksum": "7ca01ea379f0baed96f87d1435925f95", "data_object_type": "Kraken2 Taxonomic Classification", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_kraken2_classification.tsv", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_classification.tsv", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-1n5y1278", - "name": "nmdc_wfrbt-11-e79d5x03.1_kraken2_report.tsv", + "id": "nmdc:dobj-11-pqbahb35", + "name": "nmdc_wfrbt-11-1xhefc54.1_kraken2_report.tsv", "description": "Kraken2 TSV report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 699896, "md5_checksum": "c85f2f2b4a518c4adb23970448a5cb45", "data_object_type": "Kraken2 Classification Report", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_kraken2_report.tsv", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_report.tsv", "type": "nmdc:DataObject" }, { - "id": "nmdc:dobj-11-rtjb8n73", - "name": "nmdc_wfrbt-11-e79d5x03.1_kraken2_krona.html", + "id": "nmdc:dobj-11-dpdc3287", + "name": "nmdc_wfrbt-11-1xhefc54.1_kraken2_krona.html", "description": "Kraken2 Krona HTML report for nmdc:omprc-11-bn8jcq58", "file_size_bytes": 4221977, "md5_checksum": "94ee1bc2dc74830a21d5c3471d6cf223", "data_object_type": "Kraken2 Krona Plot", - "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-e79d5x03.1/nmdc_wfrbt-11-e79d5x03.1_kraken2_krona.html", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_krona.html", + "type": "nmdc:DataObject" + }, + { + "id": "nmdc:dobj-11-08050793", + "name": "nmdc_wfmt-11-y9cf0x90.1_output.json", + "description": "JSON records of features and the associated read counts and RPKMs for nmdc:omprc-11-bn8jcq58", + "file_size_bytes": 37615194, + "md5_checksum": "d00ab33c0bdcf34789be58df977551f3", + "data_object_type": "Read Count and RPKM", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_output.json", + "type": "nmdc:DataObject" + }, + { + "id": "nmdc:dobj-11-r8avrg23", + "name": "nmdc_wfmt-11-y9cf0x90.1_R1.fastq", + "description": "R1 reads without the ribosomal sequences for nmdc:omprc-11-bn8jcq58", + "file_size_bytes": 2396106683, + "md5_checksum": "2b48a6ce9afc35f4698af47bb57c11cb", + "data_object_type": "QC non-rRNA R1", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_R1.fastq", + "type": "nmdc:DataObject" + }, + { + "id": "nmdc:dobj-11-92xdtw03", + "name": "nmdc_wfmt-11-y9cf0x90.1_R2.fastq", + "description": "R2 reads without the ribosomal sequences for nmdc:omprc-11-bn8jcq58", + "file_size_bytes": 2395867661, + "md5_checksum": "6b171f897e3e447d2e728b9d2573cdcd", + "data_object_type": "QC non-rRNA R2", + "url": "https://data.microbiomedata.org/data/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_R2.fastq", "type": "nmdc:DataObject" } ], "metagenome_assembly_set": [ { - "id": "nmdc:wfmgas-11-3jvymb63.1", + "id": "nmdc:wfmgas-11-7fe0dg09.1", "name": "Metagenome Assembly Activity for nmdc:omprc-11-bn8jcq58", "started_at_time": "2021-10-11T02:28:26Z", "ended_at_time": "2021-10-11T04:56:04+00:00", @@ -180,20 +210,20 @@ "execution_resource": "NERSC-Cori", "git_url": "https://github.com/microbiomedata/metaAssembly", "has_input": [ - "nmdc:dobj-11-019yes10" + "nmdc:dobj-11-xwqq5x15" ], "has_output": [ - "nmdc:dobj-11-gast3j11", - "nmdc:dobj-11-bkza5366", - "nmdc:dobj-11-v9xfxp70", - "nmdc:dobj-11-dz2mw103", - "nmdc:dobj-11-75skzn36" + "nmdc:dobj-11-gqy02f96", + "nmdc:dobj-11-e3jbsp53", + "nmdc:dobj-11-mennt828", + "nmdc:dobj-11-bvgw0f35", + "nmdc:dobj-11-5wnxfs74" ], "type": "nmdc:MetagenomeAssembly", "part_of": [ "nmdc:omprc-11-bn8jcq58" ], - "version": "v1.0.3", + "version": "1.0.2", "asm_score": 6.577, "scaffolds": 169645, "scaf_logsum": 215363, @@ -221,6 +251,30 @@ "gc_avg": 0.46001 } ], + "metatranscriptome_activity_set": [ + { + "id": "nmdc:wfmt-11-y9cf0x90.1", + "name": "Metatranscriptome Activity for nmdc:omprc-11-bn8jcq58", + "started_at_time": "2021-01-21T00:00:00Z", + "ended_at_time": "2021-01-21T00:00:00Z", + "was_informed_by": "nmdc:omprc-11-bn8jcq58", + "execution_resource": "NERSC-Cori", + "git_url": "https://github.com/microbiomedata/MetatranscriptomeActivity", + "has_input": [ + "nmdc:dobj-11-xwqq5x15" + ], + "has_output": [ + "nmdc:dobj-11-08050793", + "nmdc:dobj-11-r8avrg23", + "nmdc:dobj-11-92xdtw03" + ], + "type": "nmdc:MetatranscriptomeActivity", + "part_of": [ + "nmdc:omprc-11-bn8jcq58" + ], + "version": "0.0.0" + } + ], "omics_processing_set": [ { "id": "nmdc:omprc-11-bn8jcq58", @@ -234,7 +288,7 @@ "gold:Gp0115663" ], "has_output": [ - "nmdc:dobj-11-k7vny888" + "nmdc:dobj-11-77e9jm46" ], "mod_date": "2021-06-15", "ncbi_project_name": "Sand microcosm microbial communities from a hyporheic zone in Columbia River, Washington, USA - GW-RW T2_23-Sept-14", @@ -253,7 +307,7 @@ ], "read_qc_analysis_activity_set": [ { - "id": "nmdc:wfrqc-11-zma0ys31.1", + "id": "nmdc:wfrqc-11-9dahz742.1", "name": "Read QC Activity for nmdc:omprc-11-bn8jcq58", "started_at_time": "2021-10-11T02:28:26Z", "ended_at_time": "2021-10-11T04:56:04+00:00", @@ -261,17 +315,17 @@ "execution_resource": "NERSC-Cori", "git_url": "https://github.com/microbiomedata/ReadsQC", "has_input": [ - "nmdc:dobj-11-k7vny888" + "nmdc:dobj-11-77e9jm46" ], "has_output": [ - "nmdc:dobj-11-019yes10", - "nmdc:dobj-11-hty12n62" + "nmdc:dobj-11-xwqq5x15", + "nmdc:dobj-11-23h4kn88" ], "type": "nmdc:ReadQcAnalysisActivity", "part_of": [ "nmdc:omprc-11-bn8jcq58" ], - "version": "v1.0.8", + "version": "1.0.2", "input_read_count": 32238374, "output_read_count": 30774080, "input_read_bases": 4867994474, @@ -280,7 +334,7 @@ ], "read_based_taxonomy_analysis_activity_set": [ { - "id": "nmdc:wfrbt-11-e79d5x03.1", + "id": "nmdc:wfrbt-11-1xhefc54.1", "name": "Readbased Taxonomy Analysis Activity for nmdc:omprc-11-bn8jcq58", "started_at_time": "2021-10-11T02:28:26Z", "ended_at_time": "2021-10-11T04:56:04+00:00", @@ -288,24 +342,24 @@ "execution_resource": "NERSC-Cori", "git_url": "https://github.com/microbiomedata/ReadbasedAnalysis", "has_input": [ - "nmdc:dobj-11-019yes10" + "nmdc:dobj-11-xwqq5x15" ], "has_output": [ - "nmdc:dobj-11-ppa5pg23", - "nmdc:dobj-11-0yn4b055", - "nmdc:dobj-11-ty0z3p61", - "nmdc:dobj-11-e6h68y35", - "nmdc:dobj-11-chgp8k25", - "nmdc:dobj-11-0wbjqw24", - "nmdc:dobj-11-xteq6n75", - "nmdc:dobj-11-1n5y1278", - "nmdc:dobj-11-rtjb8n73" + "nmdc:dobj-11-3xr4an92", + "nmdc:dobj-11-jcqgy707", + "nmdc:dobj-11-cc16z813", + "nmdc:dobj-11-d0srbc38", + "nmdc:dobj-11-359kgj15", + "nmdc:dobj-11-d74t5230", + "nmdc:dobj-11-k27fbb42", + "nmdc:dobj-11-pqbahb35", + "nmdc:dobj-11-dpdc3287" ], "type": "nmdc:ReadBasedTaxonomyAnalysisActivity", "part_of": [ "nmdc:omprc-11-bn8jcq58" ], - "version": "v1.0.5" + "version": "v1.0.2" } ] } diff --git a/nmdc_automation/re_iding/scripts/dryrun_process.log b/nmdc_automation/re_iding/scripts/dryrun_process.log new file mode 100644 index 00000000..71f53225 --- /dev/null +++ b/nmdc_automation/re_iding/scripts/dryrun_process.log @@ -0,0 +1,162 @@ +/Users/MBThornton/Library/Caches/pypoetry/virtualenvs/nmdc-automation-VEpwcKpc-py3.9/lib/python3.9/site-packages/urllib3/__init__.py:34: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020 + warnings.warn( +INFO:root:Processing workflow records for study_id: nmdc:sty-11-aygzgv51 +INFO:root:Running in dryrun mode +INFO:root:Running in dryrun mode +INFO:root:Using data_dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results +INFO:root:Using db_infile: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_associated_record_dump.json +INFO:root:Read 1 records from db_infile +INFO:root:omics_processing_id: nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:nmdcDataObject jgi:55d740280d8785342fcf7e39 nmdc:dobj-11-77e9jm46 +INFO:nmdc_automation.re_iding.base:Updating reads_qc_analysis_activity_set for nmdc:omprc-11-bn8jcq58 +INFO:root:New activity id created for nmdc:omprc-11-bn8jcq58 activity type nmdc:ReadQcAnalysisActivity: nmdc:wfrqc-11-9dahz742.1 +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:7bf778baef033d36f118f8591256d6ef +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/qa/nmdc_mga0h9dt75_filtered.fastq.gz' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-9dahz742.1/nmdc_wfrqc-11-9dahz742.1_filtered.fastq.gz' +INFO:root:New file path computed for Filtered Sequencing Reads: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-9dahz742.1/nmdc_wfrqc-11-9dahz742.1_filtered.fastq.gz +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:7bf778baef033d36f118f8591256d6ef nmdc:dobj-11-xwqq5x15 +INFO:nmdc_automation.re_iding.base:new_description: Filtered Reads for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrqc-11-9dahz742.1_filtered.fastq.gz +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:b99ce8adc125c95f0bfdadf36a3f6848 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/qa/nmdc_mga0h9dt75_filterStats.txt' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-9dahz742.1/nmdc_wfrqc-11-9dahz742.1_filterStats.txt' +INFO:root:New file path computed for QC Statistics: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrqc-11-9dahz742.1/nmdc_wfrqc-11-9dahz742.1_filterStats.txt +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:b99ce8adc125c95f0bfdadf36a3f6848 nmdc:dobj-11-23h4kn88 +INFO:nmdc_automation.re_iding.base:new_description: Filtered Stats for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrqc-11-9dahz742.1_filterStats.txt +INFO:nmdc_automation.re_iding.base:nmdc:ReadQcAnalysisActivity nmdc:b31abf9d7fe53e2f802bb53e2d13542b nmdc:wfrqc-11-9dahz742.1 +INFO:nmdc_automation.re_iding.base:Updating metagenome_assembly_set for nmdc:omprc-11-bn8jcq58 +INFO:root:New activity id created for nmdc:omprc-11-bn8jcq58 activity type nmdc:MetagenomeAssembly: nmdc:wfmgas-11-7fe0dg09.1 +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:deddd162bf0128fba13b3bc1ca38d1aa +INFO:root:Processing Assembly Contigs for nmdc:wfmgas-11-7fe0dg09.1 +INFO:root:Destination: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_contigs.fna +INFO:root:Old base dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results +INFO:root:Old file path: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/assembly/nmdc_mga0h9dt75_contigs.fna +INFO:root:New file path computed for Assembly Contigs: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_contigs.fna +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:deddd162bf0128fba13b3bc1ca38d1aa nmdc:dobj-11-gqy02f96 +INFO:nmdc_automation.re_iding.base:new_description: Assembled contigs fasta for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmgas-11-7fe0dg09.1_contigs.fna +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:b3573e3cda5a06611de71ca04c5c14cc +INFO:root:Processing Assembly Scaffolds for nmdc:wfmgas-11-7fe0dg09.1 +INFO:root:Destination: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_scaffolds.fna +INFO:root:Old base dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results +INFO:root:Old file path: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/assembly/nmdc_mga0h9dt75_scaffolds.fna +INFO:root:New file path computed for Assembly Scaffolds: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_scaffolds.fna +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:b3573e3cda5a06611de71ca04c5c14cc nmdc:dobj-11-e3jbsp53 +INFO:nmdc_automation.re_iding.base:new_description: Assembled scaffold fasta for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmgas-11-7fe0dg09.1_scaffolds.fna +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:c6d0d4cea985ca6fb50a060e15b4a856 +INFO:root:Processing Assembly Coverage Stats for nmdc:wfmgas-11-7fe0dg09.1 +INFO:root:Destination: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_covstats.txt +INFO:root:Old base dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results +INFO:root:Old file path: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/assembly/nmdc_mga0h9dt75_covstats.txt +INFO:root:New file path computed for Assembly Coverage Stats: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_covstats.txt +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:c6d0d4cea985ca6fb50a060e15b4a856 nmdc:dobj-11-mennt828 +INFO:nmdc_automation.re_iding.base:new_description: Metagenome Contig Coverage Stats for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmgas-11-7fe0dg09.1_covstats.txt +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:f450e3800e17691d5874c89fc46c186a +INFO:root:Processing Assembly AGP for nmdc:wfmgas-11-7fe0dg09.1 +INFO:root:Destination: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_assembly.agp +INFO:root:Old base dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results +INFO:root:Old file path: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/assembly/nmdc_mga0h9dt75_assembly.agp +INFO:root:New file path computed for Assembly AGP: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_assembly.agp +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:f450e3800e17691d5874c89fc46c186a nmdc:dobj-11-bvgw0f35 +INFO:nmdc_automation.re_iding.base:new_description: Assembled AGP file for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmgas-11-7fe0dg09.1_assembly.agp +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:31dc958d116d02122509e90b0883954f +INFO:root:Processing Assembly Coverage BAM for nmdc:wfmgas-11-7fe0dg09.1 +INFO:root:Destination: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_pairedMapped_sorted.bam +INFO:root:Old base dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results +INFO:root:Old file path: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/assembly/nmdc_mga0h9dt75_pairedMapped_sorted.bam +INFO:root:New file path computed for Assembly Coverage BAM: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmgas-11-7fe0dg09.1/nmdc_wfmgas-11-7fe0dg09.1_pairedMapped_sorted.bam +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:31dc958d116d02122509e90b0883954f nmdc:dobj-11-5wnxfs74 +INFO:nmdc_automation.re_iding.base:new_description: Metagenome Alignment BAM file for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmgas-11-7fe0dg09.1_pairedMapped_sorted.bam +INFO:nmdc_automation.re_iding.base:nmdc:MetagenomeAssembly nmdc:b31abf9d7fe53e2f802bb53e2d13542b nmdc:wfmgas-11-7fe0dg09.1 +INFO:nmdc_automation.re_iding.base:Updating read_based_taxonomy_analysis_activity_set for nmdc:omprc-11-bn8jcq58 +INFO:root:New activity id created for nmdc:omprc-11-bn8jcq58 activity type nmdc:ReadBasedTaxonomyAnalysisActivity: nmdc:wfrbt-11-1xhefc54.1 +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:bc7c1bda004aab357c8f6cf5a42242f9 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_gottcha2_report.tsv' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_report.tsv' +INFO:root:New file path computed for GOTTCHA2 Classification Report: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_report.tsv +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:bc7c1bda004aab357c8f6cf5a42242f9 nmdc:dobj-11-3xr4an92 +INFO:nmdc_automation.re_iding.base:new_description: Gottcha2 TSV report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_gottcha2_report.tsv +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:9481434cadd0d6c154e2ec4c11ef0e04 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_gottcha2_report_full.tsv' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_report_full.tsv' +INFO:root:New file path computed for GOTTCHA2 Report Full: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_report_full.tsv +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:9481434cadd0d6c154e2ec4c11ef0e04 nmdc:dobj-11-jcqgy707 +INFO:nmdc_automation.re_iding.base:new_description: Gottcha2 full TSV report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_gottcha2_report_full.tsv +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:6b5bc6ce7f11c1336a5f85a98fc18541 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_gottcha2_krona.html' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_krona.html' +INFO:root:New file path computed for GOTTCHA2 Krona Plot: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_gottcha2_krona.html +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:6b5bc6ce7f11c1336a5f85a98fc18541 nmdc:dobj-11-cc16z813 +INFO:nmdc_automation.re_iding.base:new_description: Gottcha2 Krona HTML report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_gottcha2_krona.html +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:933c71bbc2f4a2e84d50f0d3864cf940 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_centrifuge_classification.tsv' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_classification.tsv' +INFO:root:New file path computed for Centrifuge Taxonomic Classification: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_classification.tsv +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:933c71bbc2f4a2e84d50f0d3864cf940 nmdc:dobj-11-d0srbc38 +INFO:nmdc_automation.re_iding.base:new_description: Centrifuge classification TSV report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_centrifuge_classification.tsv +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:1a208e2519770ef50740ac39f1b9ba9a +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_centrifuge_report.tsv' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_report.tsv' +INFO:root:New file path computed for Centrifuge Classification Report: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_report.tsv +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:1a208e2519770ef50740ac39f1b9ba9a nmdc:dobj-11-359kgj15 +INFO:nmdc_automation.re_iding.base:new_description: Centrifuge TSV report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_centrifuge_report.tsv +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:f112a3840464ae7a9cf4a3bf295edd5c +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_centrifuge_krona.html' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_krona.html' +INFO:root:New file path computed for Centrifuge Krona Plot: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_centrifuge_krona.html +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:f112a3840464ae7a9cf4a3bf295edd5c nmdc:dobj-11-d74t5230 +INFO:nmdc_automation.re_iding.base:new_description: Centrifuge Krona HTML report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_centrifuge_krona.html +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:7ca01ea379f0baed96f87d1435925f95 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_kraken2_classification.tsv' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_classification.tsv' +INFO:root:New file path computed for Kraken2 Taxonomic Classification: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_classification.tsv +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:7ca01ea379f0baed96f87d1435925f95 nmdc:dobj-11-k27fbb42 +INFO:nmdc_automation.re_iding.base:new_description: Kraken classification TSV report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_kraken2_classification.tsv +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:c85f2f2b4a518c4adb23970448a5cb45 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_kraken2_report.tsv' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_report.tsv' +INFO:root:New file path computed for Kraken2 Classification Report: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_report.tsv +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:c85f2f2b4a518c4adb23970448a5cb45 nmdc:dobj-11-pqbahb35 +INFO:nmdc_automation.re_iding.base:new_description: Kraken2 TSV report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_kraken2_report.tsv +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:94ee1bc2dc74830a21d5c3471d6cf223 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:mga0h9dt75/ReadbasedAnalysis/nmdc_mga0h9dt75_kraken2_krona.html' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_krona.html' +INFO:root:New file path computed for Kraken2 Krona Plot: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfrbt-11-1xhefc54.1/nmdc_wfrbt-11-1xhefc54.1_kraken2_krona.html +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:94ee1bc2dc74830a21d5c3471d6cf223 nmdc:dobj-11-dpdc3287 +INFO:nmdc_automation.re_iding.base:new_description: Kraken2 Krona HTML report for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfrbt-11-1xhefc54.1_kraken2_krona.html +INFO:nmdc_automation.re_iding.base:nmdc:ReadBasedTaxonomyAnalysisActivity nmdc:b31abf9d7fe53e2f802bb53e2d13542b nmdc:wfrbt-11-1xhefc54.1 +INFO:nmdc_automation.re_iding.base:Updating metatranscriptome_activity_set for nmdc:omprc-11-bn8jcq58 +INFO:root:New activity id created for nmdc:omprc-11-bn8jcq58 activity type nmdc:MetatranscriptomeActivity: nmdc:wfmt-11-y9cf0x90.1 +INFO:root:New metatranscriptome base dir: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1 +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:8ef63c54cbfef72c19733e48ad0d1961 +WARNING:root:No data object found with id: nmdc:8ef63c54cbfef72c19733e48ad0d1961 +WARNING:nmdc_automation.re_iding.base:Data object record not found for nmdc:8ef63c54cbfef72c19733e48ad0d1961 +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:d00ab33c0bdcf34789be58df977551f3 +INFO:root:data_object_type: Read Count and RPKM +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/503568_190752/metat_out_json/output.json' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_output.json' +INFO:root:New file path computed for Read Count and RPKM: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_output.json +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:d00ab33c0bdcf34789be58df977551f3 nmdc:dobj-11-08050793 +INFO:nmdc_automation.re_iding.base:new_description: JSON records of features and the associated read counts and RPKMs for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmt-11-y9cf0x90.1_output.json +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:2b48a6ce9afc35f4698af47bb57c11cb +INFO:root:data_object_type: QC non-rRNA R1 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/503568_190752/non_ribo_reads/filtered_R1.fastq' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_R1.fastq' +INFO:root:New file path computed for QC non-rRNA R1: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_R1.fastq +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:2b48a6ce9afc35f4698af47bb57c11cb nmdc:dobj-11-r8avrg23 +INFO:nmdc_automation.re_iding.base:new_description: R1 reads without the ribosomal sequences for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmt-11-y9cf0x90.1_R1.fastq +INFO:nmdc_automation.re_iding.base:old_do_id: nmdc:6b171f897e3e447d2e728b9d2573cdcd +INFO:root:data_object_type: QC non-rRNA R2 +ERROR:root:An error occurred while linking the file: [Errno 2] No such file or directory: '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/503568_190752/non_ribo_reads/filtered_R2.fastq' -> '/Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_R2.fastq' +INFO:root:New file path computed for QC non-rRNA R2: /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_data/results/nmdc:omprc-11-bn8jcq58/nmdc:wfmt-11-y9cf0x90.1/nmdc_wfmt-11-y9cf0x90.1_R2.fastq +INFO:nmdc_automation.re_iding.base:nmdcDataObject nmdc:6b171f897e3e447d2e728b9d2573cdcd nmdc:dobj-11-92xdtw03 +INFO:nmdc_automation.re_iding.base:new_description: R2 reads without the ribosomal sequences for nmdc:omprc-11-bn8jcq58 +INFO:nmdc_automation.re_iding.base:new_filename: nmdc_wfmt-11-y9cf0x90.1_R2.fastq +INFO:nmdc_automation.re_iding.base:nmdc:MetatranscriptomeActivity nmdc:f30c9099e7dfaa6e90f5e3a7cf5112de nmdc:wfmt-11-y9cf0x90.1 +Writing 1 records to /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_re_ided_record_dump.json +INFO:__main__:Writing 1 records to /Users/MBThornton/Documents/code/nmdc_automation/nmdc_automation/re_iding/scripts/data/dryrun_re_ided_record_dump.json +Elapsed time: 13.911892175674438 +INFO:__main__:Elapsed time: 13.911892175674438 diff --git a/nmdc_automation/re_iding/scripts/re_id_tool.py b/nmdc_automation/re_iding/scripts/re_id_tool.py index f39d83d9..9d5b5c0f 100755 --- a/nmdc_automation/re_iding/scripts/re_id_tool.py +++ b/nmdc_automation/re_iding/scripts/re_id_tool.py @@ -13,7 +13,7 @@ from linkml_runtime.dumpers import json_dumper from nmdc_automation.api import NmdcRuntimeApi, NmdcRuntimeUserApi -from nmdc_automation.config import Config +from nmdc_automation.nmdc_common.client import NmdcApi import nmdc_schema.nmdc as nmdc from nmdc_automation.re_iding.base import ReIdTool from nmdc_automation.re_iding.changesheets import Changesheet, ChangesheetLineItem @@ -22,21 +22,25 @@ # Defaults GOLD_STUDY_ID = "gold:Gs0114663" STUDY_ID = "nmdc:sty-11-aygzgv51" -NAPA_CONFIG = Path("../../../configs/napa_config.toml") +NAPA_CONFIG = Path("../../../configs/.local_napa_config.toml") +NAPA_BASE_URL = "https://api-napa.microbiomedata.org/" BASE_DATAFILE_DIR = "/global/cfs/cdirs/m3408/results" -DRYRUN_DATAFILE_DIR = "/global/cfs/cdirs/m3408/results" DATA_DIR = Path(__file__).parent.absolute().joinpath("data") +DRYRUN_DATAFILE_DIR = DATA_DIR.joinpath("dryrun_data/results") +LOG_PATH = DATA_DIR.joinpath("re_id_tool.log") logging.basicConfig( + filename="re_id.log", + filemode="w", level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], ) logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) @click.group() @@ -56,12 +60,14 @@ def cli(ctx, site_config): @cli.command() @click.option( - "--study_id", + "--study-id", default=STUDY_ID, help=f"Optional updated study ID. Default: {STUDY_ID}", ) +@click.option("--api-base-url", default=NAPA_BASE_URL, + help=f"Optional base URL for the NMDC API. Default: {NAPA_BASE_URL}") @click.pass_context -def extract_records(ctx, study_id): +def extract_records(ctx, study_id, api_base_url): """ Extract metagenome workflow activities and their data object records that are informed_by the legacy ID (GOLD Study ID) for a re-ID-ed Study/ @@ -70,17 +76,19 @@ def extract_records(ctx, study_id): Write the results, as a list of nmdc-schema Database instances to a JSON file. """ start_time = time.time() - logging.info(f"Extracting workflow records for study_id: {study_id}") - logging.info(f"study_id: {study_id}") + logger.info(f"Extracting workflow records for study_id: {study_id}") + logger.info(f"study_id: {study_id}") config = ctx.obj["site_config"] - api_client = NmdcRuntimeUserApi(config) + # api_client = NmdcRuntimeUserApi(config) + api_client = NmdcApi(api_base_url) # 1. Retrieve all OmicsProcessing records for the updated NMDC study ID - omics_processing_records = api_client.get_omics_processing_records_for_nmdc_study( + omics_processing_records = ( + api_client.get_omics_processing_records_part_of_study( study_id - ) - logging.info( + )) + logger.info( f"Retrieved {len(omics_processing_records)} OmicsProcessing records for study {study_id}" ) @@ -88,19 +96,32 @@ def extract_records(ctx, study_id): # 2. For each OmicsProcessing record, find the legacy identifier: for omics_processing_record in omics_processing_records: db = nmdc.Database() - logging.info(f"omics_processing_record: " f"{omics_processing_record['id']}") + logger.info(f"omics_processing_record: " f"{omics_processing_record['id']}") legacy_id = _get_legacy_id(omics_processing_record) - logging.info(f"legacy_id: {legacy_id}") - - if omics_processing_record["omics_type"]["has_raw_value"] != "Metagenome": - logging.info( - f"omics_processing_record {omics_processing_record['id']} " - f"is not a Metagenome" + logger.info(f"legacy_id: {legacy_id}") + + omics_type = omics_processing_record["omics_type"]["has_raw_value"] + omics_id = omics_processing_record["id"] + if omics_type not in ["Metagenome", "Metatranscriptome"]: + logger.info( + f"omics_processing_record {omics_id}: {omics_type}] " + f"is not a Metagenome or Metatranscriptome, skipping" ) continue db.omics_processing_set.append(omics_processing_record) for data_object_id in omics_processing_record["has_output"]: - data_object_record = api_client.get_data_object_by_id(data_object_id) + data_object_record = api_client.get_data_object(data_object_id) + if not data_object_record: + logger.warning(f"no data object found for {data_object_id}") + continue + data_object_type = data_object_record.get("data_object_type") + data_object_description = data_object_record.get("description") + logger.info( + f"has_output: " + f"{data_object_record['id']}, " + f"Type: {data_object_type}, " + f" Description: {data_object_description}" + ) db.data_object_set.append(data_object_record) # downstream workflow activity sets @@ -110,7 +131,8 @@ def extract_records(ctx, study_id): metagenome_assembly_records, metagenome_annotation_records, mags_records, - ) = ([], [], [], [], []) + metatranscriptome_activity_records, + ) = ([], [], [], [], [], []) downstream_workflow_activity_sets = { "read_qc_analysis_activity_set": read_qc_records, @@ -118,30 +140,47 @@ def extract_records(ctx, study_id): "metagenome_assembly_set": metagenome_assembly_records, "metagenome_annotation_activity_set": metagenome_annotation_records, "mags_activity_set": mags_records, + "metatranscriptome_activity_set": metatranscriptome_activity_records, } - for set_name, records in downstream_workflow_activity_sets.items(): - records = api_client.get_workflow_activity_informed_by(set_name, legacy_id) - db.__setattr__(set_name, records) - # Add the data objects referenced by the `has_output` property - for record in records: - logging.info(f"record: {record['id']}, {record['name']}") - for data_object_id in record["has_output"]: - data_object_record = api_client.get_data_object_by_id( + for set_name, workflow_records in downstream_workflow_activity_sets.items(): + logger.info(f"set_name: {set_name} for {legacy_id}") + workflow_records = api_client.get_workflow_activities_informed_by(set_name, + legacy_id) + logger.info(f"found {len(workflow_records)} records") + db.__setattr__(set_name, workflow_records) + for workflow_record in workflow_records: + logger.info(f"record: {workflow_record['id']}, {workflow_record['name']}") + input_output_data_object_ids = [] + if "has_input" in workflow_record: + input_output_data_object_ids.extend(workflow_record["has_input"]) + if "has_output" in workflow_record: + input_output_data_object_ids.extend(workflow_record["has_output"]) + + for data_object_id in input_output_data_object_ids: + data_object_record = api_client.get_data_object( data_object_id ) - logging.info( - f"data_object_record: " - f"{data_object_record['id']}, {data_object_record['description']}" + if not data_object_record: + logger.warning(f"no data object found for {data_object_id}") + continue + data_object_type = data_object_record.get("data_object_type") + data_object_description = data_object_record.get("description") + logger.info( + f"has_output: " + f"{data_object_record['id']}, " + f"Type: {data_object_type}, " + f" Description: {data_object_description}" ) - db.data_object_set.append(data_object_record) + if data_object_record not in db.data_object_set: + db.data_object_set.append(data_object_record) # Search for orphaned data objects with the legacy ID in the description orphaned_data_objects = api_client.get_data_objects_by_description(legacy_id) # check that we don't already have the data object in the set for data_object in orphaned_data_objects: - if data_object["id"] not in [d["id"] for d in db.data_object_set]: + if data_object not in db.data_object_set: db.data_object_set.append(data_object) - logging.info( + logger.info( f"Added orphaned data object: " f"{data_object['id']}, {data_object['description']}" ) @@ -150,6 +189,8 @@ def extract_records(ctx, study_id): json_data = json.loads(json_dumper.dumps(retrieved_databases, inject_type=False)) db_outfile = DATA_DIR.joinpath(f"{study_id}_associated_record_dump.json") + logger.info(f"Writing {len(retrieved_databases)} records to {db_outfile}") + logger.info(f"Elapsed time: {time.time() - start_time}") with open(db_outfile, "w") as f: f.write(json.dumps(json_data, indent=4)) @@ -221,9 +262,13 @@ def process_records(ctx, dryrun, study_id, data_dir): new_db = reid_tool.update_read_based_taxonomy_analysis_activity_set( db_record, new_db ) + # update Metatraanscriptome Activity + new_db = reid_tool.update_metatranscriptome_activity_set(db_record, new_db) re_ided_db_records.append(new_db) + logger.info(f"Writing {len(re_ided_db_records)} records to {db_outfile}") + logger.info(f"Elapsed time: {time.time() - start_time}") json_data = json.loads(json_dumper.dumps(re_ided_db_records, inject_type=False)) with open(db_outfile, "w") as f: f.write(json.dumps(json_data, indent=4)) @@ -422,9 +467,9 @@ def _get_legacy_id(omics_processing_record: dict) -> str: legacy_ids.extend(alternative_ids) if len(legacy_ids) == 0: logging.warning( - f"No legacy IDs found for omics_processing_record: {omics_processing_record['id']}" + f"No legacy IDs found for: {omics_processing_record['id']} using ID instead" ) - return None + return omics_processing_record["id"] elif len(legacy_ids) > 1: logging.warning( f"Multiple legacy IDs found for omics_processing_record: {omics_processing_record['id']}" diff --git a/nmdc_automation/re_iding/tests/test_data/db_record.json b/nmdc_automation/re_iding/tests/test_data/db_record.json index 657e1469..cf67cc47 100644 --- a/nmdc_automation/re_iding/tests/test_data/db_record.json +++ b/nmdc_automation/re_iding/tests/test_data/db_record.json @@ -305,6 +305,36 @@ "md5_checksum": "27c07072f175571200b5931550adb8aa", "id": "nmdc:27c07072f175571200b5931550adb8aa", "file_size_bytes": 1114314 + }, + { + "file_size_bytes": 37615194, + "id": "nmdc:d00ab33c0bdcf34789be58df977551f3", + "type": "nmdc:DataObject", + "data_object_type": "Read Count and RPKM", + "md5_checksum": "d00ab33c0bdcf34789be58df977551f3", + "url": "https://data.microbiomedata.org/data/503568_190752/metat_out_json/output.json", + "name": "output.json", + "description": "JSON records of features and the associated read counts and RPKMs for gold:Gp0324008" + }, + { + "file_size_bytes": 2396106683, + "id": "nmdc:2b48a6ce9afc35f4698af47bb57c11cb", + "type": "nmdc:DataObject", + "data_object_type": "QC non-rRNA R1", + "md5_checksum": "2b48a6ce9afc35f4698af47bb57c11cb", + "url": "https://data.microbiomedata.org/data/503568_190752/non_ribo_reads/filtered_R1.fastq", + "name": "filtered_R1.fastq", + "description": "R1 reads without the ribosomal sequences for gold:Gp0324008" + }, + { + "file_size_bytes": 2395867661, + "id": "nmdc:6b171f897e3e447d2e728b9d2573cdcd", + "type": "nmdc:DataObject", + "data_object_type": "QC non-rRNA R2", + "md5_checksum": "6b171f897e3e447d2e728b9d2573cdcd", + "url": "https://data.microbiomedata.org/data/503568_190752/non_ribo_reads/filtered_R2.fastq", + "name": "filtered_R2.fastq", + "description": "R2 reads without the ribosomal sequences for gold:Gp0324008" } ], "dissolving_activity_set": [], @@ -535,7 +565,27 @@ ], "metagenome_sequencing_activity_set": [], "metaproteomics_analysis_activity_set": [], - "metatranscriptome_activity_set": [], + "metatranscriptome_activity_set": [ + { + "git_url": "https://github.com/microbiomedata/metaT/releases/tag/v0.0.2", + "ended_at_time": "2021-01-21T00:00:00Z", + "execution_resource": "NERSC - Cori", + "id": "nmdc:f30c9099e7dfaa6e90f5e3a7cf5112de", + "name": "metaT activity for Gp0115663", + "type": "nmdc:metaT", + "has_output": [ + "nmdc:8ef63c54cbfef72c19733e48ad0d1961", + "nmdc:d00ab33c0bdcf34789be58df977551f3", + "nmdc:2b48a6ce9afc35f4698af47bb57c11cb", + "nmdc:6b171f897e3e447d2e728b9d2573cdcd" + ], + "was_informed_by": "gold:Gp0115663", + "has_input": [ + "nmdc:7bf778baef033d36f118f8591256d6ef" + ], + "started_at_time": "2021-01-21T00:00:00Z" + } + ], "nom_analysis_activity_set": [], "omics_processing_set": [ {