Update re-IDing process records to handle data objects w/o data file description or URL, or with ID not found in the database (#46, from the branch for issue #43)

Merged
Changes from all commits (33 commits)
All commits are by mbthornton-lbl:

54a968f  new dryrun input and processed data files to test Metatranscriptome (Jan 24, 2024)
5411677  add constant for read_based_taxonomy_analysis_set (Jan 24, 2024)
beec6a4  generated dryrun changesheet (Jan 24, 2024)
9e3d791  update base to deal with missing activity sets (Jan 24, 2024)
dbb8cb2  remove hard-coded URL (Jan 30, 2024)
bbf7749  add logic to infer data object type from filename (Jan 30, 2024)
0c93d7e  update dryrun testing data (Jan 30, 2024)
c148ed2  stegen deletion log (Jan 30, 2024)
f943ba6  handle missing type and make logging a bit more consistent (Jan 30, 2024)
32d04c9  add record dumps and logs for additional studies (Jan 31, 2024)
64e7414  improve logging in re-id tool (Jan 31, 2024)
76cfb46  Add all workflow activity set names as constants (Feb 5, 2024)
01cf630  improve logging (Feb 5, 2024)
6ed76e3  Extracted unique data_objects.json from the filesystem (Feb 5, 2024)
add554f  move outputs to attic (Feb 6, 2024)
fdacedf  fix logging issue (Feb 6, 2024)
91b4502  Merge branch 'main' into 43-update-re-iding-process-records-to-handle… (Feb 6, 2024)
884baa4  Merge branch '50-implement-tool-to-find-all-missing-data-objects-from… (Feb 6, 2024)
33b294e  delete earlier logs and record dumps (Feb 7, 2024)
c99cd3e  Ensure read_qc workflow records are processed first for each omics re… (Feb 7, 2024)
70afd2a  Updates to re_id_tool and libraries (Feb 8, 2024)
2d8f5bc  Updated extract-records results and logs for 4 re-ID studies (Feb 8, 2024)
f822255  make sure os.makedirs only happens if update-links is true (Feb 8, 2024)
cab7cec  handle omics_processing records with empty has_output (Feb 8, 2024)
775cf03  update functions to infer data object type (Feb 8, 2024)
5e45752  update extract-records to handle empty Omics has_output (Feb 9, 2024)
641a69b  delete old output and logs and re-run (Feb 9, 2024)
b7e787e  delete extract-records output and logs prior to re-run (Feb 9, 2024)
aad9d77  handle omics-level failures and missing metatranascriptome (Feb 9, 2024)
04e256f  update extraction and processing for nmdc:sty-11-33fbta56 results and… (Feb 9, 2024)
8ff939a  handle different variants of "ReadBasedAnalysis" (Feb 12, 2024)
471fc9a  Handle omics processing without has_output (Feb 12, 2024)
810570a  extract-records and process-records output and logs for 4 studies (Feb 13, 2024)
nmdc_automation/nmdc_common/client.py (2 changes: 1 addition & 1 deletion)

@@ -77,7 +77,7 @@ def get_workflow_activities_informed_by(self, workflow_activity_set: str,
         }
         url = self.base_url + "nmdcschema/" + workflow_activity_set
         response = requests.get(url, params=params, headers=self.headers)
-        logger.info(response.url)
+        logger.debug(response.url)
         response.raise_for_status()
         workflow_activity_record = response.json()["resources"]
         return workflow_activity_record
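The only change here demotes the request-URL log line from info to debug, so routine API calls no longer flood the logs. For context, a minimal usage sketch of this client method follows; only get_workflow_activities_informed_by appears in the diff above, so the class name, constructor argument, and second parameter are illustrative assumptions:

```python
# A minimal sketch, assuming a client class (here called NmdcApi) in
# nmdc_automation.nmdc_common.client that wraps the NMDC runtime API.
# The class name and constructor argument are assumptions for illustration.
from nmdc_automation.nmdc_common.client import NmdcApi

api = NmdcApi("https://api.microbiomedata.org/")
activities = api.get_workflow_activities_informed_by(
    "read_qc_analysis_activity_set",  # one of the workflow activity collections
    "nmdc:omprc-11-bn8jcq58",         # an OmicsProcessing ID (informed_by)
)
print(len(activities))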
nmdc_automation/re_iding/base.py (140 changes: 109 additions & 31 deletions)

Large diffs are not rendered by default.
New file (3 lines; path not captured in this view, presumably the generated dryrun changesheet from commit beec6a4)

@@ -0,0 +1,3 @@
+id action attribute value
+nmdc:omprc-11-bn8jcq58 remove item has_output nmdc:dobj-11-xkx6jy64|
+nmdc:omprc-11-bn8jcq58 insert has_output nmdc:dobj-11-77e9jm46|
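Each row pairs a record id with an action on one attribute: together, these two rows swap the old data object ID out of the OmicsProcessing record's has_output list and insert its re-minted replacement. A minimal sketch of reading such a sheet; the tab delimiter and the trailing-pipe multivalue convention are assumptions inferred from the rows above, not a documented spec:

```python
# A minimal sketch for reading a changesheet like the one above. Assumes
# tab-delimited columns (id, action, attribute, value) and treats a trailing
# "|" as a multivalued-field delimiter; both are inferences, not a spec.
import csv

def read_changesheet(path):
    with open(path, newline="") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            # Drop the empty entry produced by the trailing "|"
            row["value"] = [v for v in row["value"].split("|") if v]
            yield row

for change in read_changesheet("dryrun_changesheet.tsv"):  # made-up filename
    print(change["id"], change["action"], change["attribute"], change["value"])
```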
nmdc_automation/re_iding/db_utils.py (28 changes: 19 additions & 9 deletions)

@@ -12,9 +12,28 @@
 OMICS_PROCESSING_SET = "omics_processing_set"
 DATA_OBJECT_SET = "data_object_set"
 READS_QC_SET = "read_qc_analysis_activity_set"
+READ_BASED_TAXONOMY_ANALYSIS_ACTIVITY_SET = "read_based_taxonomy_analysis_activity_set"
+METAGENOME_ASSEMBLY_SET = "metagenome_assembly_set"
+METAGENOME_ANNOTATION_ACTIVITY_SET = "metagenome_annotation_activity_set"
+METAGENOME_SEQUENCING_ACTIVITY_SET = "metagenome_sequencing_activity_set"
+MAGS_ACTIVITY_SET = "mags_activity_set"
+METATRANSCRIPTOME_ACTIVITY_SET = "metatranscriptome_activity_set"
+METAPROTEOMICS_ANALYSIS_ACTIVITY_SET = "metaproteomics_analysis_activity_set"
+METABOLOMICS_ANALYSIS_ACTIVITY_SET = "metabolomics_analysis_activity_set"
+NOM_ANALYSIS_ACTIVITY_SET = "nom_analysis_activity_set"
+
+ANALYSIS_ACTIVITIES = [
+    READS_QC_SET,
+    READ_BASED_TAXONOMY_ANALYSIS_ACTIVITY_SET,
+    METAGENOME_ANNOTATION_ACTIVITY_SET,
+    METAGENOME_SEQUENCING_ACTIVITY_SET,
+    METAGENOME_ASSEMBLY_SET,
+    MAGS_ACTIVITY_SET,
+    METATRANSCRIPTOME_ACTIVITY_SET,
+    METAPROTEOMICS_ANALYSIS_ACTIVITY_SET,
+    METABOLOMICS_ANALYSIS_ACTIVITY_SET,
+    NOM_ANALYSIS_ACTIVITY_SET
+]

@@ -51,12 +70,3 @@ def get_data_object_record_by_id(db_record: Dict, id: str)-> Optional[Dict]:
     elif len(data_objects) > 1:
         raise ValueError(f"Multiple data objects found with id: {id}")
     return data_objects[0]
-
-
-
-
-
-
-
-
-
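The new ANALYSIS_ACTIVITIES list lets the re-ID tooling iterate over every workflow activity collection by name instead of hard-coding one, which is what "update base to deal with missing activity sets" relies on. A minimal sketch of how it might be used; the loop body is illustrative, not the PR's actual code:

```python
# A minimal sketch, not the PR's actual processing loop: walk each analysis
# activity collection in an extracted study record and count its activities.
from nmdc_automation.re_iding.db_utils import ANALYSIS_ACTIVITIES

def summarize_activities(db_record: dict) -> dict:
    # db_record maps collection names (e.g. "mags_activity_set") to record lists;
    # .get(..., []) tolerates activity sets missing from the record.
    return {
        set_name: len(db_record.get(set_name, []))
        for set_name in ANALYSIS_ACTIVITIES
    }

record = {"read_qc_analysis_activity_set": [{"id": "nmdc:wfrqc-11-abc123"}]}  # made-up ID
print(summarize_activities(record))  # non-zero count only for the read_qc set
```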
nmdc_automation/re_iding/file_utils.py (80 changes: 64 additions & 16 deletions)

@@ -35,17 +35,66 @@ def find_data_object_type(data_object_rec: Dict)-> Optional[str]:
     Returns:
     - str: The determined data type or None if the type could not be determined.
     """
+    url = data_object_rec.get("url")
+    name = data_object_rec.get("name")
     if "data_object_type" in data_object_rec:
         return data_object_rec["data_object_type"]
-    url = data_object_rec["url"]
-    if url.endswith("_covstats.txt"):
+    elif url:
+        return _infer_data_object_type_from_url(data_object_rec)
+    elif name:
+        return _infer_data_object_type_from_name(data_object_rec)
+    else:
+        logger.error(f"Could not determine data object type for: {data_object_rec}")
+        return None
+
+
+def _infer_data_object_type_from_url(data_object_rec: Dict) -> Optional[str]:
+    """
+    Determine the data_object_type for a DO record based on its URL extension.
+
+    Args:
+    - data_object_record (dict): Dictionary containing the 'url' key which
+    will be inspected to determine the data type.
+
+    Returns:
+    - str: The determined data type or None if the type could not be determined.
+    """
+    if data_object_rec['url'].endswith("_covstats.txt"):
         return "Assembly Coverage Stats"
-    elif url.endswith("_gottcha2_report.tsv"):
+    elif data_object_rec['url'].endswith("_gottcha2_report.tsv"):
         return "GOTTCHA2 Classification Report"
-    elif url.endswith("_gottcha2_report_full.tsv"):
+    elif data_object_rec['url'].endswith("_gottcha2_report_full.tsv"):
         return "GOTTCHA2 Report Full"
+    elif data_object_rec['url'].endswith(".fastq.gz") and "Filtered Reads" in data_object_rec['description']:
+        return "Filtered Sequencing Reads"
     else:
-        logger.error(f"Missing type: {url}")
+        logger.error(f"Cannot infer type from url for: {data_object_rec}")
         return None
+
+
+def _infer_data_object_type_from_name(data_object_rec: Dict) -> Optional[str]:
+    """
+    Determine the data_object_type for a DO record based on its name.
+
+    Args:
+    - data_object_record (dict): Dictionary containing the 'name' key which
+    will be inspected to determine the data type.
+
+    Returns:
+    - str: The determined data type or None if the type could not be determined.
+    """
+    if data_object_rec['name'] == "mapping_stats.txt":
+        return "Assembly Coverage Stats"
+    elif data_object_rec['name'] == "assembly_contigs.fna":
+        return "Assembly Contigs"
+    elif data_object_rec['name'] == "assembly_scaffolds.fna":
+        return "Assembly Scaffolds"
+    elif data_object_rec['name'] == "assembly.agp":
+        return "Assembly AGP"
+    elif data_object_rec['name'] == "pairedMapped_sorted.bam":
+        return "Assembly Coverage BAM"
+    else:
+        logger.error(f"Cannot infer type from name for: {data_object_rec}")
+        return None

 def md5_sum(fn: str) -> str:
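With this change, find_data_object_type falls back from an explicit data_object_type field, to URL-based inference, to name-based inference, logging an error only when all three fail. A minimal sketch of the fallback chain using illustrative records, not real NMDC data:

```python
# Illustrative records (not real NMDC data), each exercising one branch
# of the find_data_object_type fallback chain shown in the diff above.
from nmdc_automation.re_iding.file_utils import find_data_object_type

explicit = {"data_object_type": "Filtered Sequencing Reads"}
by_url = {"url": "https://data.microbiomedata.org/data/example_covstats.txt",
          "description": "coverage stats"}
by_name = {"name": "assembly_contigs.fna"}

print(find_data_object_type(explicit))  # "Filtered Sequencing Reads" (explicit field wins)
print(find_data_object_type(by_url))    # "Assembly Coverage Stats" (inferred from URL suffix)
print(find_data_object_type(by_name))   # "Assembly Contigs" (inferred from file name)
```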
@@ -191,11 +240,10 @@ def assembly_file_operations(data_object_record, data_object_type,
     return md5, size


-def compute_new_paths_and_link(
+def compute_new_data_file_path(
     old_url: str,
     new_base_dir: Union[str, os.PathLike],
     act_id: str,
-    old_base_dir: Optional[Union[str, os.PathLike]] = None,
 ) -> Path:
     """
     Compute the new path for the file based on the old url and the new base directory.
@@ -205,19 +253,19 @@
     file_extenstion = file_name.lstrip("nmdc_").split("_", maxsplit=1)[-1]
     new_file_name = f"{act_id}_{file_extenstion}"
     modified_new_file_name = new_file_name.replace(":", "_")
-    destination = Path(new_base_dir, modified_new_file_name)
+    new_data_file_path = Path(new_base_dir, modified_new_file_name)

-    if old_base_dir is None:
-        return destination
-    # create a link between the old file and the new file if old_base_dir is provided
+    return new_data_file_path
+
+
+def link_data_file_paths(old_url: str, old_base_dir: Union[str, os.PathLike],
+                         new_path: Union[str, os.PathLike]) -> None:
     suffix = old_url.split("https://data.microbiomedata.org/data/")[1]
     old_file_path = Path(old_base_dir, suffix)
     try:
-        os.link(old_file_path, destination)
-        logging.info(f"Successfully created link between {old_file_path} and {destination}")
+        os.link(old_file_path, new_path)
+        logging.info(f"Successfully created link between {old_file_path} and {new_path}")
     except OSError as e:
         logging.error(f"An error occurred while linking the file: {e}")
     except Exception as e:
-        logging.error(f"Unexpected error: {e}")
-
-    return destination
+        logging.error(f"Unexpected error: {e}")
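Splitting path computation from linking lets a dry run compute the renamed file path without touching the filesystem, and create hard links only when update-links is requested (see commit f822255). A minimal sketch with made-up paths and IDs:

```python
# A minimal sketch with made-up paths and IDs: compute the re-IDed data file
# path, then hard-link the old file into place only when links are requested.
from nmdc_automation.re_iding.file_utils import (
    compute_new_data_file_path,
    link_data_file_paths,
)

old_url = "https://data.microbiomedata.org/data/nmdc_old_filtered.fastq.gz"
new_path = compute_new_data_file_path(
    old_url,
    new_base_dir="/results/nmdc_omprc-11-bn8jcq58",  # made-up destination root
    act_id="nmdc:wfrqc-11-abc123.1",                 # made-up new activity ID
)
print(new_path)  # e.g. /results/nmdc_omprc-11-bn8jcq58/nmdc_wfrqc-11-abc123.1_filtered.fastq.gz

update_links = True  # mirrors the tool's update-links flag
if update_links:
    link_data_file_paths(old_url, old_base_dir="/global/data", new_path=new_path)
```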