diff --git a/docs/workflows/public_data_sharing/fetch_srr_accession.md b/docs/workflows/public_data_sharing/fetch_srr_accession.md index b48be2bcb..eb9ccac22 100644 --- a/docs/workflows/public_data_sharing/fetch_srr_accession.md +++ b/docs/workflows/public_data_sharing/fetch_srr_accession.md @@ -16,11 +16,11 @@ The workflow uses the fastq-dl tool to fetch metadata from SRA and specifically | **Terra Task Name** | **Variable** | **Type** | **Description**| **Default Value** | **Terra Status** | | --- | --- | --- | --- | --- | --- | -| fetch_srr_metadata | **sample_accession** | String | SRA-compatible accession, such as a **BioSample ID** (e.g., "SAMN00000000") or **SRA Experiment ID** (e.g., "SRX000000"), used to retrieve SRR metadata. | | Required | -| fetch_srr_metadata | **docker**| String | Docker image for metadata retrieval. | `us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0` | Optional | -| fetch_srr_metadata | **disk_size** | Int | Disk space in GB allocated for the task. | 10 | Optional | | fetch_srr_metadata | **cpu** | Int | Number of CPUs allocated for the task. | 2 | Optional | +| fetch_srr_metadata | **disk_size** | Int | Disk space in GB allocated for the task. | 10 | Optional | +| fetch_srr_metadata | **docker** | String | Docker image for metadata retrieval. | `us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0` | Optional | | fetch_srr_metadata | **memory** | Int | Memory in GB allocated for the task. | 8 | Optional | +| fetch_srr_metadata | **sample_accession** | String | SRA-compatible accession, such as a **BioSample ID** (e.g., "SAMN00000000") or **SRA Experiment ID** (e.g., "SRX000000"), used to retrieve SRR metadata. 
| | Required | ### Workflow Tasks @@ -35,7 +35,7 @@ This workflow has a single task that performs metadata retrieval for the specifi | Task | [Task on GitHub](https://github.com/theiagen-org/phb-workflows/blob/main/tasks/utilities/data_handling/task_fetch_srr_metadata.wdl) | | Software Source Code | [fastq-dl Source](https://github.com/rvalieris/fastq-dl) | | Software Documentation | [fastq-dl Documentation](https://github.com/rvalieris/fastq-dl#documentation) | - | Original Publication | [fastq-dl Publication](https://doi.org/10.1186/s12859-021-04346-3) | + | Original Publication | [fastq-dl: A fast and reliable tool for downloading SRA metadata](https://doi.org/10.1186/s12859-021-04346-3) | ### Outputs diff --git a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl index c8913587a..ab8f98440 100644 --- a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl +++ b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl @@ -11,7 +11,6 @@ task fetch_srr_accession { meta { volatile: true } - command <<< set -euo pipefail @@ -19,43 +18,32 @@ task fetch_srr_accession { date -u | tee DATE fastq-dl --version | tee VERSION - # Fetch metadata for the sample accession - echo "Fetching metadata for valid biosample ID or SRA: ~{sample_accession}" - if fastq-dl --accession ~{sample_accession} --only-download-metadata --verbose 2> stderr; then - if [[ -f fastq-run-info.tsv ]]; then - echo "Metadata written for valid biosample ID or SRA: ~{sample_accession}" - cat fastq-run-info.tsv + echo "Fetching metadata for accession: ~{sample_accession}" - # Extract SRR accessions from the TSV file - SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + # Run fastq-dl and capture stderr + fastq-dl --accession ~{sample_accession} --only-download-metadata -m 2 --verbose 2> stderr.log || true - if [[ -z "${SRR_accessions}" ]]; then - # Valid biosample ID or SRA, but no SRR accessions 
found - echo "No SRR accession found for valid biosample ID or SRA: ~{sample_accession}" > srr_accession.txt - else - # Valid biosample ID or SRA with SRR accessions - echo "Extracted SRR accessions: ${SRR_accessions}" - echo "${SRR_accessions}" > srr_accession.txt - fi - else - # No metadata file generated, treat as no SRRs found for valid biosample - echo "No metadata file found for valid biosample ID or SRA: ~{sample_accession}" - echo "No SRR accession found" > srr_accession.txt - fi + # Handle whether the ID/accession is valid and contains SRR metadata based on stderr + if grep -q "No results found for" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "received an empty response" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr.log; then + echo "Invalid accession: ~{sample_accession}" >&2 + exit 1 + elif [[ ! 
-f fastq-run-info.tsv ]]; then + echo "No metadata file found for accession: ~{sample_accession}" >&2 + exit 1 else - # Check stderr for specific error messages - if grep -q "Query was successful, but received an empty response" stderr; then - # Valid biosample ID or SRA, but no data found output No SRR accession found - echo "No SRR accession found for valid biosample ID or SRA: ~{sample_accession} -Query was successful, but received an empty response" > srr_accession.txt + # Extract SRR accessions from the TSV file if it exists + SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + if [[ -z "${SRR_accessions}" ]]; then echo "No SRR accession found" > srr_accession.txt - elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr; then - # Invalid accession ID or SRA Fail workflow - echo "Invalid biosample ID or SRA: ~{sample_accession}" - exit 1 else - # Unexpected error - echo "fastq-dl failed for ~{sample_accession} due to an unknown error." - exit 1 + echo "Extracted SRR accessions: ${SRR_accessions}" + echo "${SRR_accessions}" > srr_accession.txt fi fi >>> diff --git a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl index d28d042e9..e40e54a0f 100644 --- a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl +++ b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl @@ -3,7 +3,7 @@ version 1.0 import "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl" as srr_task import "../../../tasks/task_versioning.wdl" as versioning_task -workflow fetch_srr { +workflow fetch_srr_accession { meta { description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession." 
} @@ -13,14 +13,14 @@ workflow fetch_srr { call versioning_task.version_capture { input: } - call srr_task.fetch_srr_accession { + call srr_task.fetch_srr_accession as fetch_srr { input: sample_accession = sample_accession } output { - String srr_accession = fetch_srr_accession.srr_accession + String srr_accession = fetch_srr.srr_accession # Version Captures - String phb_version = version_capture.phb_version - String fetch_srr_date = version_capture.date + String fetch_srr_accession_version = version_capture.phb_version + String fetch_srr_accession_analysis_date = version_capture.date } -} \ No newline at end of file +}