Skip to content

Commit

Permalink
update logic for no SRR accessions and invalid samples
Browse files Browse the repository at this point in the history
  • Loading branch information
fraser-combe committed Nov 22, 2024
1 parent 26d8c49 commit 770233c
Show file tree
Hide file tree
Showing 30 changed files with 403 additions and 42 deletions.
36 changes: 36 additions & 0 deletions 20241122_125512_fetch_srr_accession/call-fetch_srr/command
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

set -euo pipefail

# Rendered task command: look up the run (SRR) accessions that fastq-dl reports
# for accession SAMD00010204 and write the outcome to srr_accession.txt.
#   - runs found      -> comma-separated list of run accessions
#   - nothing on file -> the literal text "No SRR accession found"
#   - invalid input or missing metadata file -> message on stderr, exit 1

# Output the current date and fastq-dl version for debugging
date -u | tee DATE
fastq-dl --version | tee VERSION

echo "Fetching metadata for accession: SAMD00010204"

# Run fastq-dl and keep a copy of BOTH streams: some of its messages are printed
# on stdout rather than stderr. stdout is still echoed through tee so the task's
# stdout is unchanged. A non-zero exit is tolerated here because the failure
# mode is classified from the captured messages below.
fastq-dl --accession "SAMD00010204" --only-download-metadata -m 2 --verbose 2> stderr.log | tee fastq_dl_stdout.log || true

# The verbose log wraps long messages over several lines, so a multi-word phrase
# can be split by a newline. Collapse every run of whitespace into one space
# before matching. The flattened text goes to a file instead of being piped into
# `grep -q`, whose early exit could otherwise fail the pipeline under pipefail.
cat fastq_dl_stdout.log stderr.log | tr -s '[:space:]' ' ' > fastq_dl_messages.txt

# Handle whether the ID/accession is valid and contains SRR metadata
if grep -q -e "No results found for" -e "received an empty response" fastq_dl_messages.txt; then
  # The query was accepted but no run is registered for this accession.
  echo "No SRR accession found" > srr_accession.txt
  echo "No SRR accession found for accession: SAMD00010204"
elif grep -q "is not a Study, Sample, Experiment, or Run accession" fastq_dl_messages.txt; then
  echo "Invalid accession: SAMD00010204" >&2
  exit 1
elif [[ ! -f fastq-run-info.tsv ]]; then
  # No recognised message and no metadata table: treat as a hard failure.
  echo "No metadata file found for accession: SAMD00010204" >&2
  exit 1
else
  # Column 1 of every non-header row is taken as the run accession; rows are
  # joined into a single comma-separated line.
  SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -)
  if [[ -z "${SRR_accessions}" ]]; then
    echo "No SRR accession found" > srr_accession.txt
  else
    echo "Extracted SRR accessions: ${SRR_accessions}"
    echo "${SRR_accessions}" > srr_accession.txt
  fi
fi

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"sample_accession": "SAMD00010204"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"fetch_srr_accession.fastq_dl_version": "fastq-dl, version 2.0.4",
"fetch_srr_accession.srr_accession": "No SRR accession found"
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
17043145
0
6 changes: 6 additions & 0 deletions 20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Fri Nov 22 18:55:14 UTC 2024
fastq-dl, version 2.0.4
Fetching metadata for accession: SAMD00010204
No results found for SAMD00010204
No results found for SAMD00010204
No SRR accession found for accession: SAMD00010204
17 changes: 17 additions & 0 deletions 20241122_125512_fetch_srr_accession/call-fetch_srr/task.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
2024-11-22 12:55:12.808 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE task setup :: name: "fetch_srr_accession", source: "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl", line: 3, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-fetch_srr", thread: 132198141265472
2024-11-22 12:55:12.958 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker swarm resources :: workers: 1, max_cpus: 4, max_mem_bytes: 16767336448, total_cpus: 4, total_mem_bytes: 16767336448
2024-11-22 12:55:12.958 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO input :: name: "sample_accession", value: "SAMD00010204"
2024-11-22 12:55:12.959 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "memory", value: 8
2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "docker", value: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0"
2024-11-22 12:55:12.961 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "cpu", value: 2
2024-11-22 12:55:12.962 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "disk_size", value: 10
2024-11-22 12:55:12.963 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO effective runtime :: docker: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0", cpu: 2, memory_reservation: 8000000000, preemptible: 1
2024-11-22 12:55:12.964 wdl.w:fetch_srr_accession.t:call-fetch_srr WARNING ignored runtime settings :: keys: ["disks", "disk"]
2024-11-22 12:55:12.978 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker image :: tag: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0", id: "sha256:c6689b7f5754d89574331af9a748cdb84e89107ecfafe8855fcdc745d41f0674", RepoDigest: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl@sha256:c0a1484561017e0f14e9cb8ceddfac2f28e3576a9bf1a8b743bd12183f4e38b4"
2024-11-22 12:55:14.613 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker task running :: service: "b752vpzdc7g1", task: "rl8xyzauey", node: "t2vz2h1tc6", message: "started"
2024-11-22 12:55:40.942 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO docker task complete :: service: "b752vpzdc7g1", task: "rl8xyzauey", node: "t2vz2h1tc6", message: "finished"
2024-11-22 12:55:40.942 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker task exit :: state: "complete", exit_code: 0
2024-11-22 12:55:41.247 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO command stdout unused; consider output `File cmd_out = stdout()` or redirect command to stderr log >&2 :: stdout_file: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt"
2024-11-22 12:55:41.247 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO output :: name: "srr_accession", value: "No SRR accession found"
2024-11-22 12:55:41.248 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO output :: name: "fastq_dl_version", value: "fastq-dl, version 2.0.4"
2024-11-22 12:55:41.250 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE done
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fri Nov 22 18:55:14 UTC 2024
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fastq-dl, version 2.0.4
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
No SRR accession found
98 changes: 98 additions & 0 deletions 20241122_125512_fetch_srr_accession/call-fetch_srr/work/stderr.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
2024-11-22 18:55:17 DEBUG 2024-11-22 18:55:17:root:DEBUG - fastq_dl.py:500
Querying ENA for metadata (Attempt
1 of 2)
DEBUG 2024-11-22 connectionpool.py:1048
18:55:17:urllib3.connectionp
ool:DEBUG - Starting new
HTTPS connection (1):
www.ebi.ac.uk:443
2024-11-22 18:55:18 DEBUG 2024-11-22 connectionpool.py:546
18:55:18:urllib3.connectionpo
ol:DEBUG -
https://www.ebi.ac.uk:443
"GET
/ena/portal/api/search?result
=read_run&format=tsv&query=%2
2(sample_accession=SAMD000102
04%20OR%20secondary_sample_ac
cession=SAMD00010204)%22&fiel
ds=all HTTP/1.1" 200 2973
WARNING 2024-11-22 18:55:18:root:WARNING - fastq_dl.py:531
Querying ENA was unsuccessful,
retrying after (10 seconds)
2024-11-22 18:55:28 DEBUG 2024-11-22 18:55:28:root:DEBUG - fastq_dl.py:504
Querying SRA for metadata (Attempt
1 of 2)
DEBUG 2024-11-22 connectionpool.py:1048
18:55:28:urllib3.connectionp
ool:DEBUG - Starting new
HTTPS connection (1):
www.ebi.ac.uk:443
2024-11-22 18:55:29 DEBUG 2024-11-22 connectionpool.py:546
18:55:29:urllib3.connectionpo
ol:DEBUG -
https://www.ebi.ac.uk:443
"GET
/ena/portal/api/search?result
=read_run&format=tsv&query=%2
2(sample_accession=SAMD000102
04%20OR%20secondary_sample_ac
cession=SAMD00010204)%22&fiel
ds=all HTTP/1.1" 200 2973
DEBUG 2024-11-22 18:55:29:root:DEBUG - fastq_dl.py:514
Failed to get metadata from ENA.
Trying SRA...
DEBUG 2024-11-22 connectionpool.py:1048
18:55:29:urllib3.connectionp
ool:DEBUG - Starting new
HTTPS connection (1):
eutils.ncbi.nlm.nih.gov:443
DEBUG 2024-11-22 connectionpool.py:546
18:55:29:urllib3.connectionpo
ol:DEBUG -
https://eutils.ncbi.nlm.nih.g
ov:443 "POST
/entrez/eutils/esearch.fcgi
HTTP/1.1" 200 None
WARNING 2024-11-22 18:55:29:root:WARNING - fastq_dl.py:525
Querying SRA was unsuccessful,
retrying after (10 seconds)
2024-11-22 18:55:39 DEBUG 2024-11-22 18:55:39:root:DEBUG - fastq_dl.py:504
Querying SRA for metadata (Attempt
2 of 2)
DEBUG 2024-11-22 connectionpool.py:1048
18:55:39:urllib3.connectionp
ool:DEBUG - Starting new
HTTPS connection (1):
www.ebi.ac.uk:443
DEBUG 2024-11-22 connectionpool.py:546
18:55:39:urllib3.connectionpo
ol:DEBUG -
https://www.ebi.ac.uk:443
"GET
/ena/portal/api/search?result
=read_run&format=tsv&query=%2
2(sample_accession=SAMD000102
04%20OR%20secondary_sample_ac
cession=SAMD00010204)%22&fiel
ds=all HTTP/1.1" 200 2973
DEBUG 2024-11-22 connectionpool.py:1048
18:55:39:urllib3.connectionp
ool:DEBUG - Starting new
HTTPS connection (1):
eutils.ncbi.nlm.nih.gov:443
2024-11-22 18:55:40 DEBUG 2024-11-22 connectionpool.py:546
18:55:40:urllib3.connectionpo
ol:DEBUG -
https://eutils.ncbi.nlm.nih.g
ov:443 "POST
/entrez/eutils/esearch.fcgi
HTTP/1.1" 200 None
ERROR 2024-11-22 18:55:40:root:ERROR - fastq_dl.py:519
There was an issue querying ENA and
SRA, exiting...
ERROR 2024-11-22 18:55:40:root:ERROR - fastq_dl.py:520
STATUS: 200
ERROR 2024-11-22 18:55:40:root:ERROR - fastq_dl.py:521
TEXT: Query was successful, but
received an empty response
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

# Rendered version_capture command: records the PHB release string and the
# current date in two small files that the task reads back as outputs.
phb_release="PHB v2.2.1"

# Current date as YYYY-MM-DD.
date +"%Y-%m-%d" > TODAY
# Release string followed by a single newline.
printf '%s\n' "$phb_release" > PHB_VERSION

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"version_capture.date": "2024-11-22",
"version_capture.phb_version": "PHB v2.2.1"
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
17043147
0
Empty file.
11 changes: 11 additions & 0 deletions 20241122_125512_fetch_srr_accession/call-version_capture/task.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
2024-11-22 12:55:12.807 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE task setup :: name: "version_capture", source: "../../../tasks/task_versioning.wdl", line: 3, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-version_capture", thread: 132198151751232
2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-version_capture INFO eval :: name: "docker", value: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0"
2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-version_capture INFO eval :: name: "timezone", value: null
2024-11-22 12:55:12.963 wdl.w:fetch_srr_accession.t:call-version_capture INFO effective runtime :: docker: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0", cpu: 1, memory_reservation: 1000000000, preemptible: 1
2024-11-22 12:55:12.964 wdl.w:fetch_srr_accession.t:call-version_capture WARNING ignored runtime settings :: keys: ["disks", "dx_instance_type"]
2024-11-22 12:55:12.980 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE docker image :: tag: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0", id: "sha256:e5b3b43b59e1cd3267788b867d9d4c84d4ffc8236278541b3cc6963784c57a5f", RepoDigest: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash@sha256:f62289e07dea809f88322fbed3a42057f95177e44c8622a38baf22e8113d1ab0"
2024-11-22 12:55:15.058 wdl.w:fetch_srr_accession.t:call-version_capture INFO docker task complete :: service: "q7zvoncm26cc", task: "k6un27duii", node: "t2vz2h1tc6", message: "finished"
2024-11-22 12:55:15.058 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE docker task exit :: state: "complete", exit_code: 0
2024-11-22 12:55:15.412 wdl.w:fetch_srr_accession.t:call-version_capture INFO output :: name: "date", value: "2024-11-22"
2024-11-22 12:55:15.413 wdl.w:fetch_srr_accession.t:call-version_capture INFO output :: name: "phb_version", value: "PHB v2.2.1"
2024-11-22 12:55:15.415 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE done
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PHB v2.2.1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2024-11-22
3 changes: 3 additions & 0 deletions 20241122_125512_fetch_srr_accession/inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"fetch_srr_accession.sample_accession": "SAMD00010204"
}
5 changes: 5 additions & 0 deletions 20241122_125512_fetch_srr_accession/outputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"fetch_srr_accession.fetch_srr_accession_analysis_date": "2024-11-22",
"fetch_srr_accession.fetch_srr_accession_version": "PHB v2.2.1",
"fetch_srr_accession.srr_accession": "No SRR accession found"
}
1 change: 1 addition & 0 deletions 20241122_125512_fetch_srr_accession/rerun
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pushd /home/frasc/bioinformatics_projects/public_health_bioinformatics && miniwdl run --verbose /home/frasc/bioinformatics_projects/public_health_bioinformatics/workflows/utilities/data_import/wf_fetch_srr_accession.wdl -- sample_accession=SAMD00010204; popd
30 changes: 30 additions & 0 deletions 20241122_125512_fetch_srr_accession/wdl/tasks/task_versioning.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
version 1.0

task version_capture {
  # Emits the PHB release string and the current date so workflows can report
  # which release produced their results and when.
  input {
    # Container image the command runs in.
    String docker = "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0"
    # Optional TZ value; when supplied it is exported before `date` is called.
    String? timezone
  }
  command <<<
    # Renders to an empty line when no timezone was supplied.
    ~{default='' 'export TZ=' + timezone}
    phb_release="PHB v2.2.1"
    # Current date as YYYY-MM-DD.
    date +"%Y-%m-%d" > TODAY
    printf '%s\n' "$phb_release" > PHB_VERSION
  >>>
  output {
    String phb_version = read_string("PHB_VERSION")
    String date = read_string("TODAY")
  }
  runtime {
    docker: docker
    cpu: 1
    memory: "1 GB"
    disks: "local-disk 10 HDD"
    preemptible: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
  }
  meta {
    # The date output differs from run to run, so the task is flagged volatile.
    volatile: true
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
version 1.0

task fetch_srr_accession {
  # Looks up the run (SRR) accessions that fastq-dl reports for an accession.
  #
  # Inputs
  #   sample_accession  accession to query
  #   docker            fastq-dl container image
  #   disk_size         disk request in GB
  #   cpu               CPU request
  #   memory            memory request in GB
  #
  # Outputs
  #   srr_accession     comma-separated run accessions, or the literal text
  #                     "No SRR accession found" when nothing is registered
  #   fastq_dl_version  output of `fastq-dl --version`
  #
  # The task fails (exit 1) when fastq-dl rejects the accession as invalid, or
  # when no recognised message and no fastq-run-info.tsv were produced.
  input {
    String sample_accession
    String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0"
    Int disk_size = 10
    Int cpu = 2
    Int memory = 8
  }
  meta {
    volatile: true
  }

  command <<<
    set -euo pipefail

    # Output the current date and fastq-dl version for debugging
    date -u | tee DATE
    fastq-dl --version | tee VERSION

    echo "Fetching metadata for accession: ~{sample_accession}"

    # Run fastq-dl and keep a copy of BOTH streams: some of its messages are
    # printed on stdout rather than stderr. stdout is still echoed through tee so
    # the task's stdout is unchanged. The accession is quoted so that whitespace
    # or glob characters in the input cannot split or expand the argument.
    # A non-zero exit is tolerated here because the failure mode is classified
    # from the captured messages below.
    fastq-dl --accession "~{sample_accession}" --only-download-metadata -m 2 --verbose 2> stderr.log | tee fastq_dl_stdout.log || true

    # The verbose log wraps long messages over several lines, so a multi-word
    # phrase can be split by a newline. Collapse every run of whitespace into one
    # space before matching. The flattened text goes to a file instead of being
    # piped into `grep -q`, whose early exit could otherwise fail the pipeline
    # under pipefail.
    cat fastq_dl_stdout.log stderr.log | tr -s '[:space:]' ' ' > fastq_dl_messages.txt

    # Handle whether the ID/accession is valid and contains SRR metadata
    if grep -q -e "No results found for" -e "received an empty response" fastq_dl_messages.txt; then
      # The query was accepted but no run is registered for this accession.
      echo "No SRR accession found" > srr_accession.txt
      echo "No SRR accession found for accession: ~{sample_accession}"
    elif grep -q "is not a Study, Sample, Experiment, or Run accession" fastq_dl_messages.txt; then
      echo "Invalid accession: ~{sample_accession}" >&2
      exit 1
    elif [[ ! -f fastq-run-info.tsv ]]; then
      # No recognised message and no metadata table: treat as a hard failure.
      echo "No metadata file found for accession: ~{sample_accession}" >&2
      exit 1
    else
      # Column 1 of every non-header row is taken as the run accession; rows are
      # joined into a single comma-separated line.
      SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -)
      if [[ -z "${SRR_accessions}" ]]; then
        echo "No SRR accession found" > srr_accession.txt
      else
        echo "Extracted SRR accessions: ${SRR_accessions}"
        echo "${SRR_accessions}" > srr_accession.txt
      fi
    fi
  >>>

  output {
    String srr_accession = read_string("srr_accession.txt")
    String fastq_dl_version = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: "~{memory} GB"
    cpu: cpu
    disks: "local-disk " + disk_size + " SSD"
    disk: disk_size + " GB"
    preemptible: 1
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
version 1.0

import "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl" as srr_task
import "../../../tasks/task_versioning.wdl" as versioning_task

workflow fetch_srr_accession {
  meta {
    description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession."
  }
  input {
    # Accession handed unchanged to the lookup task.
    String sample_accession
  }
  # Resolve the run accession(s) for the requested accession.
  call srr_task.fetch_srr_accession as fetch_srr {
    input:
      sample_accession = sample_accession
  }
  # Capture the PHB release and run date; every task input is left at its default.
  call versioning_task.version_capture
  output {
    # Version Captures
    String fetch_srr_accession_analysis_date = version_capture.date
    String fetch_srr_accession_version = version_capture.phb_version
    # Lookup result, passed through from the fetch_srr call.
    String srr_accession = fetch_srr.srr_accession
  }
}
Loading

0 comments on commit 770233c

Please sign in to comment.