From d7231880c42f421c9e02070e32cd6e133ee6fa55 Mon Sep 17 00:00:00 2001 From: SooLee Date: Tue, 12 Dec 2017 12:21:48 -0500 Subject: [PATCH 1/7] added 4dn-style meta to repliseq-parta.cwl --- cwl_awsem/repliseq/repliseq-parta.cwl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index 2ce0db3..959bed0 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -1,4 +1,10 @@ { + "4dn_description": { + "data_types": [ "Repli-seq" ], + "category": "clip + align + filter + sort + dedup + count", + "workflow_type": "Repli-seq data processing", + "description": "GITAR Hi-C data processing pipeline" + }, "outputs": [ { "type": [ From 2382b74f6139601b5fa6deb4234f180e3062e7f3 Mon Sep 17 00:00:00 2001 From: SooLee Date: Tue, 12 Dec 2017 12:28:07 -0500 Subject: [PATCH 2/7] added 4dn-style meta to repliseq-parta.cwl --- cwl_awsem/repliseq/repliseq-parta.cwl | 35 +++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index 959bed0..5ecf315 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -95,7 +95,12 @@ "id": "#clip.input_fastq" } ], - "id": "#clip" + "id": "#clip", + "4dn_description": { + "software": "cutadapt", + "description": "Adapter removal according to the Repli-seq pipeline", + "analysis_step_types": [ "adapter removal" ] + } }, { "outputs": [ @@ -118,7 +123,12 @@ "id": "#align.nThreads" } ], - "id": "#align" + "id": "#align", + "4dn_description": { + "software": "bwa", + "description": "Alignment according to the Repli-seq pipeline", + "analysis_step_types": [ "alignment" ] + } }, { "outputs": [ @@ -141,7 +151,12 @@ "id": "#filtersort.memperthread" } ], - "id": "#filtersort" + "id": "#filtersort", + "4dn_description": { + "software": "samtools", + "description": "Filtering and sorting 
according to the Repli-seq pipeline", + "analysis_step_types": [ "filtering", "sorting" ] + } }, { "outputs": [ @@ -156,7 +171,12 @@ "id": "#dedup.input_bam" } ], - "id": "#dedup" + "id": "#dedup", + "4dn_description": { + "software": "samtools", + "description": "PCR Duplicate removal according to the Repli-seq pipeline", + "analysis_step_types": [ "duplicate removal" ] + } }, { "outputs": [ @@ -179,7 +199,12 @@ "id": "#count.winsize" } ], - "id": "#count" + "id": "#count", + "4dn_description": { + "software": "bedtools", + "description": "Read aggregation according to the Repli-seq pipeline", + "analysis_step_types": [ "aggregation" ] + } } ], "requirements": [ From 8fbdc9e9100b3ca40452355505eb0af8eb333d88 Mon Sep 17 00:00:00 2001 From: SooLee Date: Tue, 12 Dec 2017 13:44:48 -0500 Subject: [PATCH 3/7] added more 4dn tags. updated readme to add documentation for the 4dn tags. --- README.md | 16 +++++++++ cwl_awsem/repliseq/repliseq-parta.cwl | 47 +++++++++++++++------------ 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 58425e6..9f743bb 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,22 @@ * This is a collection of pipelines built by 4DN-DCIC that were created and run either on the SevenBridges platform or on the 4DN platform, AWSEM. * The repo contains a benchmarking script for some of the CWLs, that returns total space, mem and CPUs required per given input size and a recommended AWS EC2 instance type. + +### Specification for 4DN +* Currently, 4DN DCIC uses CWL `draft-3`. +* The following 4DN custom fields are added, for automated conversion from cwl to the workflow metadata used by the 4DN Data Portal. + * `4dn_meta` (top level field) : a dictionary that contains `data_types`, `category`, `workflow_type`, `description`. + * `data_types` : an array of strings that correspond to the data types to be processed. e.g.) [ 'Repli-seq' ] + * `category` : a string describing the steps. e.g.) 
'clip + align + filter + sort + dedup + count' + * `workflow_type` : a string describing the purpose of the workflow in short. e.g.) 'Repli-seq data processing' + * `description` : a string describing the workflow. e.g.) 'Repli-seq data processing pipeline' + * `4dn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`. + * `software` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ] + * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline' + * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ] + * `4dn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam' + + ### How to run the cwl To run docker through CWL, you need a cwl executor - we use `cwltool` (https://github.com/common-workflow-language/cwltool) to run CWL with a json/yml file describing input data. Some example input data are inside the `tests/test_input_json` directory and you can see some `cwltool` (=`cwl-runner`) commands inside the `tests/tests.sh` script. 
diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index 5ecf315..5d9fa57 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -1,9 +1,9 @@ { - "4dn_description": { + "4dn_meta": { "data_types": [ "Repli-seq" ], "category": "clip + align + filter + sort + dedup + count", "workflow_type": "Repli-seq data processing", - "description": "GITAR Hi-C data processing pipeline" + "description": "Repli-seq data processing pipeline" }, "outputs": [ { @@ -11,28 +11,32 @@ "File" ], "source": "#align.out_bam", - "id": "#bam" + "id": "#bam", + "4dn_format": "bam" }, { "type": [ "File" ], "source": "#filtersort.out_filtered_sorted_bam", - "id": "#filtered_sorted_bam" + "id": "#filtered_sorted_bam", + "4dn_format": "bam" }, { "type": [ "File" ], "source": "#dedup.out_deduped_bam", - "id": "#filtered_sorted_deduped_bam" + "id": "#filtered_sorted_deduped_bam", + "4dn_format": "bam" }, { "type": [ "File" ], "source": "#count.out_count_bg", - "id": "#count_bg" + "id": "#count_bg", + "4dn_format": "bg" } ], "inputs": [ @@ -40,19 +44,22 @@ "type": [ "File" ], - "id": "#fastq" + "id": "#fastq", + "4dn_format": "fastq" }, { "type": [ "File" ], - "id": "#bwaIndex" + "id": "#bwaIndex", + "4dn_format": "bwaIndex" }, { "type": [ "File" ], - "id": "#chromsizes" + "id": "#chromsizes", + "4dn_format": "chromsizes" }, { "type": [ @@ -68,7 +75,7 @@ "string" ], "id": "#memperthread", - "default": "5G" + "default": "2G" }, { "type": [ @@ -96,8 +103,8 @@ } ], "id": "#clip", - "4dn_description": { - "software": "cutadapt", + "4dn_step_meta": { + "software": [ "cutadapt" ], "description": "Adapter removal according to the Repli-seq pipeline", "analysis_step_types": [ "adapter removal" ] } @@ -124,8 +131,8 @@ } ], "id": "#align", - "4dn_description": { - "software": "bwa", + "4dn_step_meta": { + "software": [ "bwa" ], "description": "Alignment according to the Repli-seq pipeline", "analysis_step_types": [ "alignment" ] 
} @@ -152,8 +159,8 @@ } ], "id": "#filtersort", - "4dn_description": { - "software": "samtools", + "4dn_step_meta": { + "software": [ "samtools" ], "description": "Filtering and sorting according to the Repli-seq pipeline", "analysis_step_types": [ "filtering", "sorting" ] } @@ -172,8 +179,8 @@ } ], "id": "#dedup", - "4dn_description": { - "software": "samtools", + "4dn_step_meta": { + "software": [ "samtools" ], "description": "PCR Duplicate removal according to the Repli-seq pipeline", "analysis_step_types": [ "duplicate removal" ] } @@ -200,8 +207,8 @@ } ], "id": "#count", - "4dn_description": { - "software": "bedtools", + "4dn_step_meta": { + "software": [ "bedtools" ], "description": "Read aggregation according to the Repli-seq pipeline", "analysis_step_types": [ "aggregation" ] } From 39cc978e9b4d7bce2b488ba904e09d049fe93da7 Mon Sep 17 00:00:00 2001 From: SooLee Date: Tue, 12 Dec 2017 14:12:41 -0500 Subject: [PATCH 4/7] updated 4dn tags for repliseq-parta --- cwl_awsem/repliseq/repliseq-parta.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index 5d9fa57..7e45b0c 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -1,7 +1,7 @@ { "4dn_meta": { "data_types": [ "Repli-seq" ], - "category": "clip + align + filter + sort + dedup + count", + "category": "align + filter + count", "workflow_type": "Repli-seq data processing", "description": "Repli-seq data processing pipeline" }, @@ -210,7 +210,7 @@ "4dn_step_meta": { "software": [ "bedtools" ], "description": "Read aggregation according to the Repli-seq pipeline", - "analysis_step_types": [ "aggregation" ] + "analysis_step_types": [ "binning", "aggregation" ] } } ], From 82da37e0511bd97097e94f8d0086ca4275cbd84a Mon Sep 17 00:00:00 2001 From: SooLee Date: Tue, 12 Dec 2017 16:19:11 -0500 Subject: [PATCH 5/7] 4dn->fdn for 4dn tags for cwl --- README.md | 6 +++--- 
cwl_awsem/repliseq/repliseq-parta.cwl | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 9f743bb..7d445ba 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,16 @@ ### Specification for 4DN * Currently, 4DN DCIC uses CWL `draft-3`. * The following 4DN custom fields are added, for automated conversion from cwl to the workflow metadata used by the 4DN Data Portal. - * `4dn_meta` (top level field) : a dictionary that contains `data_types`, `category`, `workflow_type`, `description`. + * `fdn_meta` (top level field) : a dictionary that contains `data_types`, `category`, `workflow_type`, `description`. * `data_types` : an array of strings that correspond to the data types to be processed. e.g.) [ 'Repli-seq' ] * `category` : a string describing the steps. e.g.) 'clip + align + filter + sort + dedup + count' * `workflow_type` : a string describing the purpose of the workflow in short. e.g.) 'Repli-seq data processing' * `description` : a string describing the workflow. e.g.) 'Repli-seq data processing pipeline' - * `4dn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`. + * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`. * `software` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ] * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline' * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ] - * `4dn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam' + * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 
'bam' ### How to run the cwl diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index 7e45b0c..c51f0b5 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -1,5 +1,5 @@ { - "4dn_meta": { + "fdn_meta": { "data_types": [ "Repli-seq" ], "category": "align + filter + count", "workflow_type": "Repli-seq data processing", @@ -12,7 +12,7 @@ ], "source": "#align.out_bam", "id": "#bam", - "4dn_format": "bam" + "fdn_format": "bam" }, { "type": [ @@ -20,7 +20,7 @@ ], "source": "#filtersort.out_filtered_sorted_bam", "id": "#filtered_sorted_bam", - "4dn_format": "bam" + "fdn_format": "bam" }, { "type": [ @@ -28,7 +28,7 @@ ], "source": "#dedup.out_deduped_bam", "id": "#filtered_sorted_deduped_bam", - "4dn_format": "bam" + "fdn_format": "bam" }, { "type": [ @@ -36,7 +36,7 @@ ], "source": "#count.out_count_bg", "id": "#count_bg", - "4dn_format": "bg" + "fdn_format": "bg" } ], "inputs": [ @@ -45,21 +45,21 @@ "File" ], "id": "#fastq", - "4dn_format": "fastq" + "fdn_format": "fastq" }, { "type": [ "File" ], "id": "#bwaIndex", - "4dn_format": "bwaIndex" + "fdn_format": "bwaIndex" }, { "type": [ "File" ], "id": "#chromsizes", - "4dn_format": "chromsizes" + "fdn_format": "chromsizes" }, { "type": [ @@ -103,7 +103,7 @@ } ], "id": "#clip", - "4dn_step_meta": { + "fdn_step_meta": { "software": [ "cutadapt" ], "description": "Adapter removal according to the Repli-seq pipeline", "analysis_step_types": [ "adapter removal" ] @@ -131,7 +131,7 @@ } ], "id": "#align", - "4dn_step_meta": { + "fdn_step_meta": { "software": [ "bwa" ], "description": "Alignment according to the Repli-seq pipeline", "analysis_step_types": [ "alignment" ] @@ -159,7 +159,7 @@ } ], "id": "#filtersort", - "4dn_step_meta": { + "fdn_step_meta": { "software": [ "samtools" ], "description": "Filtering and sorting according to the Repli-seq pipeline", "analysis_step_types": [ "filtering", "sorting" ] @@ -179,7 +179,7 @@ } ], "id": "#dedup", - 
"4dn_step_meta": { + "fdn_step_meta": { "software": [ "samtools" ], "description": "PCR Duplicate removal according to the Repli-seq pipeline", "analysis_step_types": [ "duplicate removal" ] @@ -207,7 +207,7 @@ } ], "id": "#count", - "4dn_step_meta": { + "fdn_step_meta": { "software": [ "bedtools" ], "description": "Read aggregation according to the Repli-seq pipeline", "analysis_step_types": [ "binning", "aggregation" ] From 7a43b957bce27d31c2969f9f1d3289672963e1a7 Mon Sep 17 00:00:00 2001 From: SooLee Date: Wed, 13 Dec 2017 10:54:05 -0500 Subject: [PATCH 6/7] software->software_used for 4dn tags for cwl --- README.md | 4 ++-- cwl_awsem/repliseq/repliseq-parta.cwl | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7d445ba..48ea584 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,8 @@ * `category` : a string describing the steps. e.g.) 'clip + align + filter + sort + dedup + count' * `workflow_type` : a string describing the purpose of the workflow in short. e.g.) 'Repli-seq data processing' * `description` : a string describing the workflow. e.g.) 'Repli-seq data processing pipeline' - * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`. - * `software` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ] + * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software_used`, `description`, `analysis_step_types`. + * `software_used` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ] * `description` : a string that describes the step. e.g.) 
'Adapter removal according to the Repli-seq pipeline' * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ] * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam' diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index c51f0b5..d75872d 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -104,7 +104,7 @@ ], "id": "#clip", "fdn_step_meta": { - "software": [ "cutadapt" ], + "software_used": [ "cutadapt" ], "description": "Adapter removal according to the Repli-seq pipeline", "analysis_step_types": [ "adapter removal" ] } @@ -132,7 +132,7 @@ ], "id": "#align", "fdn_step_meta": { - "software": [ "bwa" ], + "software_used": [ "bwa" ], "description": "Alignment according to the Repli-seq pipeline", "analysis_step_types": [ "alignment" ] } @@ -160,7 +160,7 @@ ], "id": "#filtersort", "fdn_step_meta": { - "software": [ "samtools" ], + "software_used": [ "samtools" ], "description": "Filtering and sorting according to the Repli-seq pipeline", "analysis_step_types": [ "filtering", "sorting" ] } @@ -180,7 +180,7 @@ ], "id": "#dedup", "fdn_step_meta": { - "software": [ "samtools" ], + "software_used": [ "samtools" ], "description": "PCR Duplicate removal according to the Repli-seq pipeline", "analysis_step_types": [ "duplicate removal" ] } @@ -208,7 +208,7 @@ ], "id": "#count", "fdn_step_meta": { - "software": [ "bedtools" ], + "software_used": [ "bedtools" ], "description": "Read aggregation according to the Repli-seq pipeline", "analysis_step_types": [ "binning", "aggregation" ] } From 2a3359136068473fca157a3456cf436dc895f366 Mon Sep 17 00:00:00 2001 From: SooLee Date: Wed, 13 Dec 2017 12:14:02 -0500 Subject: [PATCH 7/7] added fdn_output_type --- README.md | 5 ++++- cwl_awsem/repliseq/repliseq-parta.cwl | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md 
b/README.md index 48ea584..2f2dcc8 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,10 @@ * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline' * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ] * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam' - + * `fdn_output_type` (within each top-level `outputs` element) : a string that is one of the following three values - 'processed', 'QC', 'report' + * `processed` : a generic output file + * `QC` : the output will be used to generate a quality_metric object (e.g. a fastqc report) + * `report` : the output will be used to add a metric to the input (e.g. md5) ### How to run the cwl To run docker through CWL, you need a cwl executor - we use `cwltool` (https://github.com/common-workflow-language/cwltool) to run CWL with a json/yml file describing input data. Some example input data are inside the `tests/test_input_json` directory and you can see some `cwltool` (=`cwl-runner`) commands inside the `tests/tests.sh` script. diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl index d75872d..fbb8ab3 100644 --- a/cwl_awsem/repliseq/repliseq-parta.cwl +++ b/cwl_awsem/repliseq/repliseq-parta.cwl @@ -12,7 +12,8 @@ ], "source": "#align.out_bam", "id": "#bam", - "fdn_format": "bam" + "fdn_format": "bam", + "fdn_output_type": "processed" }, { "type": [ @@ -20,6 +21,7 @@ ], "source": "#filtersort.out_filtered_sorted_bam", "id": "#filtered_sorted_bam", + "fdn_output_type": "processed", "fdn_format": "bam" }, { @@ -28,6 +30,7 @@ ], "source": "#dedup.out_deduped_bam", "id": "#filtered_sorted_deduped_bam", + "fdn_output_type": "processed", "fdn_format": "bam" }, { @@ -36,6 +39,7 @@ ], "source": "#count.out_count_bg", "id": "#count_bg", + "fdn_output_type": "processed", "fdn_format": "bg" } ],