From d7231880c42f421c9e02070e32cd6e133ee6fa55 Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Tue, 12 Dec 2017 12:21:48 -0500
Subject: [PATCH 1/7] added 4dn-style meta to repliseq-parta.cwl

---
 cwl_awsem/repliseq/repliseq-parta.cwl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index 2ce0db3..959bed0 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -1,4 +1,10 @@
 {
+    "4dn_description": {
+        "data_types": [ "Repli-seq" ],
+        "category": "clip + align + filter + sort + dedup + count",
+        "workflow_type": "Repli-seq data processing",
+        "description": "GITAR Hi-C data processing pipeline"
+    },
     "outputs": [
         {
             "type": [

From 2382b74f6139601b5fa6deb4234f180e3062e7f3 Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Tue, 12 Dec 2017 12:28:07 -0500
Subject: [PATCH 2/7] added 4dn-style step meta to repliseq-parta.cwl

---
 cwl_awsem/repliseq/repliseq-parta.cwl | 35 +++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index 959bed0..5ecf315 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -95,7 +95,12 @@
                     "id": "#clip.input_fastq"
                 }
             ],
-            "id": "#clip"
+            "id": "#clip",
+            "4dn_description": {
+                "software": "cutadapt",
+                "description": "Adapter removal according to the Repli-seq pipeline",
+                "analysis_step_types": [ "adapter removal" ]
+            }
         },
         {
             "outputs": [
@@ -118,7 +123,12 @@
                     "id": "#align.nThreads"
                 }
             ],
-            "id": "#align"
+            "id": "#align",
+            "4dn_description": {
+                "software": "bwa",
+                "description": "Alignment according to the Repli-seq pipeline",
+                "analysis_step_types": [ "alignment" ]
+            }
         },
         {
             "outputs": [
@@ -141,7 +151,12 @@
                     "id": "#filtersort.memperthread"
                 }
             ],
-            "id": "#filtersort"
+            "id": "#filtersort",
+            "4dn_description": {
+                "software": "samtools",
+                "description": "Filtering and sorting according to the Repli-seq pipeline",
+                "analysis_step_types": [ "filtering", "sorting" ]
+            }
         },
         {
             "outputs": [
@@ -156,7 +171,12 @@
                     "id": "#dedup.input_bam"
                 }
             ],
-            "id": "#dedup"
+            "id": "#dedup",
+            "4dn_description": {
+                "software": "samtools",
+                "description": "PCR Duplicate removal according to the Repli-seq pipeline",
+                "analysis_step_types": [ "duplicate removal" ]
+            }
         },
         {
             "outputs": [
@@ -179,7 +199,12 @@
                     "id": "#count.winsize"
                 }
             ],
-            "id": "#count"
+            "id": "#count",
+            "4dn_description": {
+                "software": "bedtools",
+                "description": "Read aggregation according to the Repli-seq pipeline",
+                "analysis_step_types": [ "aggregation" ]
+            }
         }
     ],
     "requirements": [

From 8fbdc9e9100b3ca40452355505eb0af8eb333d88 Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Tue, 12 Dec 2017 13:44:48 -0500
Subject: [PATCH 3/7] added more 4dn tags. updated readme to add documentation
 for the 4dn tags.

---
 README.md                             | 16 +++++++++
 cwl_awsem/repliseq/repliseq-parta.cwl | 47 +++++++++++++++------------
 2 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 58425e6..9f743bb 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,22 @@
 * This is a collection of pipelines built by 4DN-DCIC that were created and run either on the SevenBridges platform or on the 4DN platform, AWSEM.
 * The repo contains a benchmarking script for some of the CWLs, that returns total space, mem and CPUs required per given input size and a recommended AWS EC2 instance type.
 
+
+### Specification for 4DN
+* Currently, 4DN DCIC uses CWL `draft-3`.
+* The following 4DN custom fields are added, for automated conversion from cwl to the workflow metadata used by the 4DN Data Portal.
+  * `4dn_meta` (top level field) : a dictionary that contains `data_types`, `category`, `workflow_type`, `description`.
+    * `data_types` : an array of strings that correspond to the data types to be processed. e.g.) [ 'Repli-seq' ]
+    * `category` : a string describing the steps. e.g.) 'clip + align + filter + sort + dedup + count'
+    * `workflow_type` : a string describing the purpose of the workflow in short. e.g.) 'Repli-seq data processing'
+    * `description` : a string describing the workflow. e.g.) 'Repli-seq data processing pipeline'
+  * `4dn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`.
+    * `software` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ]
+    * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline'
+    * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ]
+  * `4dn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam'
+
+
 ### How to run the cwl
 To run docker through CWL, you need a cwl executor - we use `cwltool` (https://github.com/common-workflow-language/cwltool) to run CWL with a json/yml file describing input data. Some example input data are inside the `tests/test_input_json` directory and you can see some `cwltool` (=`cwl-runner`) commands inside the `tests/tests.sh` script.
 
diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index 5ecf315..5d9fa57 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -1,9 +1,9 @@
 {
-    "4dn_description": {
+    "4dn_meta": {
         "data_types": [ "Repli-seq" ],
         "category": "clip + align + filter + sort + dedup + count",
         "workflow_type": "Repli-seq data processing",
-        "description": "GITAR Hi-C data processing pipeline"
+        "description": "Repli-seq data processing pipeline"
     },
     "outputs": [
         {
@@ -11,28 +11,32 @@
                 "File"
             ],
             "source": "#align.out_bam",
-            "id": "#bam"
+            "id": "#bam",
+            "4dn_format": "bam"
         },
         {
             "type": [
                 "File"
             ],
             "source": "#filtersort.out_filtered_sorted_bam",
-            "id": "#filtered_sorted_bam"
+            "id": "#filtered_sorted_bam",
+            "4dn_format": "bam"
         },
         {
             "type": [
                 "File"
             ],
             "source": "#dedup.out_deduped_bam",
-            "id": "#filtered_sorted_deduped_bam"
+            "id": "#filtered_sorted_deduped_bam",
+            "4dn_format": "bam"
         },
         {
             "type": [
                 "File"
             ],
             "source": "#count.out_count_bg",
-            "id": "#count_bg"
+            "id": "#count_bg",
+            "4dn_format": "bg"
         }
     ],
     "inputs": [
@@ -40,19 +44,22 @@
             "type": [
                 "File"
             ],
-            "id": "#fastq"
+            "id": "#fastq",
+            "4dn_format": "fastq"
         },
         {
             "type": [
                 "File"
             ],
-            "id": "#bwaIndex"
+            "id": "#bwaIndex",
+            "4dn_format": "bwaIndex"
         },
         {
             "type": [
                 "File"
             ],
-            "id": "#chromsizes"
+            "id": "#chromsizes",
+            "4dn_format": "chromsizes"
         },
         {
             "type": [
@@ -68,7 +75,7 @@
                 "string"
             ],
             "id": "#memperthread",
-            "default": "5G"
+            "default": "2G"
         },
         {
             "type": [
@@ -96,8 +103,8 @@
                 }
             ],
             "id": "#clip",
-            "4dn_description": {
-                "software": "cutadapt",
+            "4dn_step_meta": {
+                "software": [ "cutadapt" ],
                 "description": "Adapter removal according to the Repli-seq pipeline",
                 "analysis_step_types": [ "adapter removal" ]
             }
@@ -124,8 +131,8 @@
                 }
             ],
             "id": "#align",
-            "4dn_description": {
-                "software": "bwa",
+            "4dn_step_meta": {
+                "software": [ "bwa" ],
                 "description": "Alignment according to the Repli-seq pipeline",
                 "analysis_step_types": [ "alignment" ]
             }
@@ -152,8 +159,8 @@
                 }
             ],
             "id": "#filtersort",
-            "4dn_description": {
-                "software": "samtools",
+            "4dn_step_meta": {
+                "software": [ "samtools" ],
                 "description": "Filtering and sorting according to the Repli-seq pipeline",
                 "analysis_step_types": [ "filtering", "sorting" ]
             }
@@ -172,8 +179,8 @@
                 }
             ],
             "id": "#dedup",
-            "4dn_description": {
-                "software": "samtools",
+            "4dn_step_meta": {
+                "software": [ "samtools" ],
                 "description": "PCR Duplicate removal according to the Repli-seq pipeline",
                 "analysis_step_types": [ "duplicate removal" ]
             }
@@ -200,8 +207,8 @@
                 }
             ],
             "id": "#count",
-            "4dn_description": {
-                "software": "bedtools",
+            "4dn_step_meta": {
+                "software": [ "bedtools" ],
                 "description": "Read aggregation according to the Repli-seq pipeline",
                 "analysis_step_types": [ "aggregation" ]
             }

From 39cc978e9b4d7bce2b488ba904e09d049fe93da7 Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Tue, 12 Dec 2017 14:12:41 -0500
Subject: [PATCH 4/7] updated 4dn tags for repliseq-parta

---
 cwl_awsem/repliseq/repliseq-parta.cwl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index 5d9fa57..7e45b0c 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -1,7 +1,7 @@
 {
     "4dn_meta": {
         "data_types": [ "Repli-seq" ],
-        "category": "clip + align + filter + sort + dedup + count",
+        "category": "align + filter + count",
         "workflow_type": "Repli-seq data processing",
         "description": "Repli-seq data processing pipeline"
     },
@@ -210,7 +210,7 @@
             "4dn_step_meta": {
                 "software": [ "bedtools" ],
                 "description": "Read aggregation according to the Repli-seq pipeline",
-                "analysis_step_types": [ "aggregation" ]
+                "analysis_step_types": [ "binning", "aggregation" ]
             }
         }
     ],

From 82da37e0511bd97097e94f8d0086ca4275cbd84a Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Tue, 12 Dec 2017 16:19:11 -0500
Subject: [PATCH 5/7] 4dn->fdn for 4dn tags for cwl

---
 README.md                             |  6 +++---
 cwl_awsem/repliseq/repliseq-parta.cwl | 26 +++++++++++++-------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 9f743bb..7d445ba 100644
--- a/README.md
+++ b/README.md
@@ -6,16 +6,16 @@
 ### Specification for 4DN
 * Currently, 4DN DCIC uses CWL `draft-3`.
 * The following 4DN custom fields are added, for automated conversion from cwl to the workflow metadata used by the 4DN Data Portal.
-  * `4dn_meta` (top level field) : a dictionary that contains `data_types`, `category`, `workflow_type`, `description`.
+  * `fdn_meta` (top level field) : a dictionary that contains `data_types`, `category`, `workflow_type`, `description`.
     * `data_types` : an array of strings that correspond to the data types to be processed. e.g.) [ 'Repli-seq' ]
     * `category` : a string describing the steps. e.g.) 'clip + align + filter + sort + dedup + count'
     * `workflow_type` : a string describing the purpose of the workflow in short. e.g.) 'Repli-seq data processing'
     * `description` : a string describing the workflow. e.g.) 'Repli-seq data processing pipeline'
-  * `4dn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`.
+  * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`.
     * `software` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ]
     * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline'
     * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ]
-  * `4dn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam'
+  * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam'
 
 
 ### How to run the cwl
diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index 7e45b0c..c51f0b5 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -1,5 +1,5 @@
 {
-    "4dn_meta": {
+    "fdn_meta": {
         "data_types": [ "Repli-seq" ],
         "category": "align + filter + count",
         "workflow_type": "Repli-seq data processing",
@@ -12,7 +12,7 @@
             ],
             "source": "#align.out_bam",
             "id": "#bam",
-            "4dn_format": "bam"
+            "fdn_format": "bam"
         },
         {
             "type": [
@@ -20,7 +20,7 @@
             ],
             "source": "#filtersort.out_filtered_sorted_bam",
             "id": "#filtered_sorted_bam",
-            "4dn_format": "bam"
+            "fdn_format": "bam"
         },
         {
             "type": [
@@ -28,7 +28,7 @@
             ],
             "source": "#dedup.out_deduped_bam",
             "id": "#filtered_sorted_deduped_bam",
-            "4dn_format": "bam"
+            "fdn_format": "bam"
         },
         {
             "type": [
@@ -36,7 +36,7 @@
             ],
             "source": "#count.out_count_bg",
             "id": "#count_bg",
-            "4dn_format": "bg"
+            "fdn_format": "bg"
         }
     ],
     "inputs": [
@@ -45,21 +45,21 @@
                 "File"
             ],
             "id": "#fastq",
-            "4dn_format": "fastq"
+            "fdn_format": "fastq"
         },
         {
             "type": [
                 "File"
             ],
             "id": "#bwaIndex",
-            "4dn_format": "bwaIndex"
+            "fdn_format": "bwaIndex"
         },
         {
             "type": [
                 "File"
             ],
             "id": "#chromsizes",
-            "4dn_format": "chromsizes"
+            "fdn_format": "chromsizes"
         },
         {
             "type": [
@@ -103,7 +103,7 @@
                 }
             ],
             "id": "#clip",
-            "4dn_step_meta": {
+            "fdn_step_meta": {
                 "software": [ "cutadapt" ],
                 "description": "Adapter removal according to the Repli-seq pipeline",
                 "analysis_step_types": [ "adapter removal" ]
@@ -131,7 +131,7 @@
                 }
             ],
             "id": "#align",
-            "4dn_step_meta": {
+            "fdn_step_meta": {
                 "software": [ "bwa" ],
                 "description": "Alignment according to the Repli-seq pipeline",
                 "analysis_step_types": [ "alignment" ]
@@ -159,7 +159,7 @@
                 }
             ],
             "id": "#filtersort",
-            "4dn_step_meta": {
+            "fdn_step_meta": {
                 "software": [ "samtools" ],
                 "description": "Filtering and sorting according to the Repli-seq pipeline",
                 "analysis_step_types": [ "filtering", "sorting" ]
@@ -179,7 +179,7 @@
                 }
             ],
             "id": "#dedup",
-            "4dn_step_meta": {
+            "fdn_step_meta": {
                 "software": [ "samtools" ],
                 "description": "PCR Duplicate removal according to the Repli-seq pipeline",
                 "analysis_step_types": [ "duplicate removal" ]
@@ -207,7 +207,7 @@
                 }
             ],
             "id": "#count",
-            "4dn_step_meta": {
+            "fdn_step_meta": {
                 "software": [ "bedtools" ],
                 "description": "Read aggregation according to the Repli-seq pipeline",
                 "analysis_step_types": [ "binning", "aggregation" ]

From 7a43b957bce27d31c2969f9f1d3289672963e1a7 Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Wed, 13 Dec 2017 10:54:05 -0500
Subject: [PATCH 6/7] software->software_used for 4dn tags for cwl

---
 README.md                             |  4 ++--
 cwl_awsem/repliseq/repliseq-parta.cwl | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 7d445ba..48ea584 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,8 @@
     * `category` : a string describing the steps. e.g.) 'clip + align + filter + sort + dedup + count'
     * `workflow_type` : a string describing the purpose of the workflow in short. e.g.) 'Repli-seq data processing'
     * `description` : a string describing the workflow. e.g.) 'Repli-seq data processing pipeline'
-  * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software`, `description`, `analysis_step_types`.
-    * `software` : an array of strings that refer to the names of the software used. The name must match the names used in the `downloads.sh` in an accompanying Docker source repo. e.g.) [ 'cutadapt' ]
+  * `fdn_step_meta` (within each `steps` element) : a dictionary that contains `software_used`, `description`, `analysis_step_types`.
+    * `software_used` : an array of strings that refer to the names of the software used. Each name must match a name used in `downloads.sh` in the accompanying Docker source repo. e.g.) [ 'cutadapt' ]
     * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline'
     * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ]
   * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam'
diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index c51f0b5..d75872d 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -104,7 +104,7 @@
             ],
             "id": "#clip",
             "fdn_step_meta": {
-                "software": [ "cutadapt" ],
+                "software_used": [ "cutadapt" ],
                 "description": "Adapter removal according to the Repli-seq pipeline",
                 "analysis_step_types": [ "adapter removal" ]
             }
@@ -132,7 +132,7 @@
             ],
             "id": "#align",
             "fdn_step_meta": {
-                "software": [ "bwa" ],
+                "software_used": [ "bwa" ],
                 "description": "Alignment according to the Repli-seq pipeline",
                 "analysis_step_types": [ "alignment" ]
             }
@@ -160,7 +160,7 @@
             ],
             "id": "#filtersort",
             "fdn_step_meta": {
-                "software": [ "samtools" ],
+                "software_used": [ "samtools" ],
                 "description": "Filtering and sorting according to the Repli-seq pipeline",
                 "analysis_step_types": [ "filtering", "sorting" ]
             }
@@ -180,7 +180,7 @@
             ],
             "id": "#dedup",
             "fdn_step_meta": {
-                "software": [ "samtools" ],
+                "software_used": [ "samtools" ],
                 "description": "PCR Duplicate removal according to the Repli-seq pipeline",
                 "analysis_step_types": [ "duplicate removal" ]
             }
@@ -208,7 +208,7 @@
             ],
             "id": "#count",
             "fdn_step_meta": {
-                "software": [ "bedtools" ],
+                "software_used": [ "bedtools" ],
                 "description": "Read aggregation according to the Repli-seq pipeline",
                 "analysis_step_types": [ "binning", "aggregation" ]
             }

From 2a3359136068473fca157a3456cf436dc895f366 Mon Sep 17 00:00:00 2001
From: SooLee <duplexa@gmail.com>
Date: Wed, 13 Dec 2017 12:14:02 -0500
Subject: [PATCH 7/7] added fdn_output_type

---
 README.md                             | 5 ++++-
 cwl_awsem/repliseq/repliseq-parta.cwl | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 48ea584..2f2dcc8 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,10 @@
     * `description` : a string that describes the step. e.g.) 'Adapter removal according to the Repli-seq pipeline'
     * `analysis_step_types` : an array of strings that refer to the step types (i.e. purpose). e.g.) [ 'adapter removal' ]
   * `fdn_format` (within each top-level `inputs` and `outputs` element) : a string. e.g.) 'bam'
-
+  * `fdn_output_type` (within each top-level `outputs` element) : a string that must be one of the following three values: 'processed', 'QC', 'report'.
+    * `processed` : a generic output file
+    * `QC` : the output will be used to generate a quality_metric object (e.g. a fastqc report)
+    * `report` : the output will be used to add a metric to the input (e.g. md5)
 
 ### How to run the cwl
 To run docker through CWL, you need a cwl executor - we use `cwltool` (https://github.com/common-workflow-language/cwltool) to run CWL with a json/yml file describing input data. Some example input data are inside the `tests/test_input_json` directory and you can see some `cwltool` (=`cwl-runner`) commands inside the `tests/tests.sh` script.
diff --git a/cwl_awsem/repliseq/repliseq-parta.cwl b/cwl_awsem/repliseq/repliseq-parta.cwl
index d75872d..fbb8ab3 100644
--- a/cwl_awsem/repliseq/repliseq-parta.cwl
+++ b/cwl_awsem/repliseq/repliseq-parta.cwl
@@ -12,7 +12,8 @@
             ],
             "source": "#align.out_bam",
             "id": "#bam",
-            "fdn_format": "bam"
+            "fdn_format": "bam",
+            "fdn_output_type": "processed"
         },
         {
             "type": [
@@ -20,6 +21,7 @@
             ],
             "source": "#filtersort.out_filtered_sorted_bam",
             "id": "#filtered_sorted_bam",
+            "fdn_output_type": "processed",
             "fdn_format": "bam"
         },
         {
@@ -28,6 +30,7 @@
             ],
             "source": "#dedup.out_deduped_bam",
             "id": "#filtered_sorted_deduped_bam",
+            "fdn_output_type": "processed",
             "fdn_format": "bam"
         },
         {
@@ -36,6 +39,7 @@
             ],
             "source": "#count.out_count_bg",
             "id": "#count_bg",
+            "fdn_output_type": "processed",
             "fdn_format": "bg"
         }
     ],