diff --git a/README.md b/README.md
index 58425e6..2f2dcc8 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,25 @@
 * This is a collection of pipelines built by 4DN-DCIC that were created and run either on the SevenBridges platform or on the 4DN platform, AWSEM.
 * The repo contains a benchmarking script for some of the CWLs, that returns total space, mem and CPUs required per given input size and a recommended AWS EC2 instance type.
+
+### Specification for 4DN
+* Currently, 4DN DCIC uses CWL `draft-3`.
+* The following 4DN custom fields are added to allow automated conversion from CWL to the workflow metadata used by the 4DN Data Portal.
+  * `fdn_meta` (top-level field) : a dictionary that contains `data_types`, `category`, `workflow_type` and `description`.
+    * `data_types` : an array of strings corresponding to the data types to be processed, e.g. [ 'Repli-seq' ]
+    * `category` : a string describing the steps, e.g. 'clip + align + filter + sort + dedup + count'
+    * `workflow_type` : a short string describing the purpose of the workflow, e.g. 'Repli-seq data processing'
+    * `description` : a string describing the workflow, e.g. 'Repli-seq data processing pipeline'
+  * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software_used`, `description` and `analysis_step_types`.
+    * `software_used` : an array of strings referring to the names of the software used. Each name must match a name used in `downloads.sh` in the accompanying Docker source repo, e.g. [ 'cutadapt' ]
+    * `description` : a string that describes the step, e.g. 'Adapter removal according to the Repli-seq pipeline'
+    * `analysis_step_types` : an array of strings referring to the step types (i.e. purpose), e.g. [ 'adapter removal' ]
+  * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string, e.g. 'bam'
+  * `fdn_output_type` (within each top-level `outputs` element) : a string that must be one of the following three values - 'processed', 'QC' or 'report'.
+    * processed : a generic output file
+    * QC : the output will be used to generate a quality_metric object (e.g. a fastqc report)
+    * report : the output will be used to add a metric to the input (e.g. md5)
+
 ### How to run the cwl
 To run docker through CWL, you need a cwl executor - we use `cwltool` (https://github.com/common-workflow-language/cwltool) to run CWL with a json/yml file describing input data. Some example input data are inside the `tests/test_input_json` directory and you can see some `cwltool` (=`cwl-runner`) commands inside the `tests/tests.sh` script.
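As a rough illustration of the "How to run the cwl" note above, here is a minimal sketch of a job file for the `repliseq-parta.cwl` workflow changed in the next diff. The paths and file names are invented placeholders (the real examples live under `tests/test_input_json`), `memperthread` appears optional since the CWL defines a default, and inputs not visible in this diff (e.g. thread count or window size) may also be required. It would be passed to the executor with something like `cwltool repliseq-parta.cwl repliseq_input.json`.

```json
{
    "fastq": { "class": "File", "path": "/path/to/sample.fastq.gz" },
    "bwaIndex": { "class": "File", "path": "/path/to/genome.bwaIndex.tgz" },
    "chromsizes": { "class": "File", "path": "/path/to/genome.chrom.sizes" },
    "memperthread": "2G"
}
```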
diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index 2ce0db3..fbb8ab3 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -1,32 +1,46 @@
 {
+    "fdn_meta": {
+        "data_types": [ "Repli-seq" ],
+        "category": "align + filter + count",
+        "workflow_type": "Repli-seq data processing",
+        "description": "Repli-seq data processing pipeline"
+    },
     "outputs": [
         {
             "type": [
                 "File"
             ],
             "source": "#align.out_bam",
-            "id": "#bam"
+            "id": "#bam",
+            "fdn_format": "bam",
+            "fdn_output_type": "processed"
         },
         {
             "type": [
                 "File"
             ],
             "source": "#filtersort.out_filtered_sorted_bam",
-            "id": "#filtered_sorted_bam"
+            "id": "#filtered_sorted_bam",
+            "fdn_output_type": "processed",
+            "fdn_format": "bam"
         },
         {
             "type": [
                 "File"
             ],
             "source": "#dedup.out_deduped_bam",
-            "id": "#filtered_sorted_deduped_bam"
+            "id": "#filtered_sorted_deduped_bam",
+            "fdn_output_type": "processed",
+            "fdn_format": "bam"
        },
         {
             "type": [
                 "File"
             ],
             "source": "#count.out_count_bg",
-            "id": "#count_bg"
+            "id": "#count_bg",
+            "fdn_output_type": "processed",
+            "fdn_format": "bg"
         }
     ],
     "inputs": [
@@ -34,19 +48,22 @@
             "type": [
                 "File"
             ],
-            "id": "#fastq"
+            "id": "#fastq",
+            "fdn_format": "fastq"
         },
         {
             "type": [
                 "File"
             ],
-            "id": "#bwaIndex"
+            "id": "#bwaIndex",
+            "fdn_format": "bwaIndex"
         },
         {
             "type": [
                 "File"
             ],
-            "id": "#chromsizes"
+            "id": "#chromsizes",
+            "fdn_format": "chromsizes"
         },
         {
             "type": [
@@ -62,7 +79,7 @@
                 "string"
             ],
             "id": "#memperthread",
-            "default": "5G"
+            "default": "2G"
         },
         {
             "type": [
@@ -89,7 +106,12 @@
                     "id": "#clip.input_fastq"
                 }
             ],
-            "id": "#clip"
+            "id": "#clip",
+            "fdn_step_meta": {
+                "software_used": [ "cutadapt" ],
+                "description": "Adapter removal according to the Repli-seq pipeline",
+                "analysis_step_types": [ "adapter removal" ]
+            }
         },
         {
             "outputs": [
@@ -112,7 +134,12 @@
                     "id": "#align.nThreads"
                 }
             ],
-            "id": "#align"
+            "id": "#align",
+            "fdn_step_meta": {
+                "software_used": [ "bwa" ],
+                "description": "Alignment according to the Repli-seq pipeline",
+                "analysis_step_types": [ "alignment" ]
+            }
         },
         {
             "outputs": [
@@ -135,7 +162,12 @@
                     "id": "#filtersort.memperthread"
                 }
             ],
-            "id": "#filtersort"
+            "id": "#filtersort",
+            "fdn_step_meta": {
+                "software_used": [ "samtools" ],
+                "description": "Filtering and sorting according to the Repli-seq pipeline",
+                "analysis_step_types": [ "filtering", "sorting" ]
+            }
         },
         {
             "outputs": [
@@ -150,7 +182,12 @@
                     "id": "#dedup.input_bam"
                 }
             ],
-            "id": "#dedup"
+            "id": "#dedup",
+            "fdn_step_meta": {
+                "software_used": [ "samtools" ],
+                "description": "PCR Duplicate removal according to the Repli-seq pipeline",
+                "analysis_step_types": [ "duplicate removal" ]
+            }
         },
         {
             "outputs": [
@@ -173,7 +210,12 @@
                     "id": "#count.winsize"
                 }
             ],
-            "id": "#count"
+            "id": "#count",
+            "fdn_step_meta": {
+                "software_used": [ "bedtools" ],
+                "description": "Read aggregation according to the Repli-seq pipeline",
+                "analysis_step_types": [ "binning", "aggregation" ]
+            }
         }
     ],
     "requirements": [