-
Notifications
You must be signed in to change notification settings - Fork 4
/
chipseq_workflow.Rmd
153 lines (108 loc) · 3.91 KB
/
chipseq_workflow.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
---
title: "ChIP-Seq Workflow"
output:
html_notebook: default
---
<style>
.main-container { width: 1200px; max-width:2800px;}
</style>
## Genome Preparation
```{r, engine = 'bash', eval = FALSE}
#!/bin/sh
# genome_prep.sh
# Download genome
wget ftp://ftp.ensembl.org/pub/release-90/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.dna.toplevel.fa.gz
# Extract fasta file
gunzip Drosophila_melanogaster.BDGP6.dna.toplevel.fa.gz
# Select chromosomes
samtools faidx Drosophila_melanogaster.BDGP6.dna.toplevel.fa X 2L 2R 3L 3R > Drosophila_melanogaster.BDGP6.dna.selected.fa
# Index new fasta file
samtools faidx Drosophila_melanogaster.BDGP6.dna.selected.fa
# generate chromInfo table with chromosome length
cut -f1,2 Drosophila_melanogaster.BDGP6.dna.selected.fa.fai > Drosophila_melanogaster.BDGP6.dna.selected.chromInfo.txt
# build bowtie2 index
bowtie2-build Drosophila_melanogaster.BDGP6.dna.selected.fa Drosophila_melanogaster.BDGP6.dna.selected
```
## Data Download
```{r, engine = 'bash', eval = FALSE}
#!/bin/sh
# data_download.sh
# Download ChIP-seq data from Sequence Read Archive
wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX252/SRX2520631/SRR5206750/SRR5206750.sra
wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX252/SRX2520632/SRR5206751/SRR5206751.sra
wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX252/SRX2520634/SRR5206753/SRR5206753.sra
wget ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX252/SRX2520635/SRR5206754/SRR5206754.sra
```
## Workflow Run
```{r, engine = 'bash', eval = FALSE}
#!/bin/sh
# workflow.sh
# Iterate through files
for FILENAME in *.sra
do
# extract file basename
FILEBASE=`echo ${FILENAME} | sed -e "s/.sra//g"`
# convert sra to fastq
fastq-dump ${FILEBASE}.sra
# run quality control
fastqc ${FILEBASE}.fastq
# run bowtie2 alignment
bowtie2 -x Drosophila_melanogaster.BDGP6.dna.selected -U ${FILEBASE}.fastq -p 4 -S ${FILEBASE}.sam 2> ${FILEBASE}.stats
# convert sam to bam, sort by coordinate and index
samtools view -bS -q 0 ${FILEBASE}.sam | samtools sort - | tee ${FILEBASE}.bam | samtools index - ${FILEBASE}.bam.bai
# convert bam to bedgraph
genomeCoverageBed -ibam ${FILEBASE}.bam -bg -g Drosophila_melanogaster.BDGP6.dna.selected.chromInfo.txt > ${FILEBASE}.bedgraph
# convert bedgraph to tdf
igvtools toTDF -z 5 ${FILEBASE}.bedgraph ${FILEBASE}.tdf Drosophila_melanogaster.BDGP6.dna.selected.fa
done
```
## Homer Run
```{r, engine = 'bash', eval = FALSE}
#!/bin/sh
# homer.sh
# first argument sample table
TABLE="$1"
# collect file names
FILES=`cut -f 1 $TABLE `
# make tag directory for bam files
for FILE in $FILES
do
echo $FILE
makeTagDirectory ${FILE}.dir ${FILE}.bam
done
# iterate through samples
SAMPLES=`cut -f 2 $TABLE | uniq`
for SAMPLE in $SAMPLES
do
# get input
INPUT=`cat $TABLE | grep $SAMPLE | grep "INPUT" | cut -f 1`
# get non-inputs
ASSAYS=`cat $TABLE | grep $SAMPLE | grep -v "INPUT" | cut -f 1`
for ASSAY in $ASSAYS
do
# show ChIP - Input pairs
echo $ASSAY $INPUT
# create total reads and Input normalized bedgraph
makeUCSCfile ${ASSAY}.dir/ -i ${INPUT}.dir/ -o ${ASSAY}.INPnorm.bedgraph
# peak finding against input
findPeaks ${ASSAY}.dir/ -i ${INPUT}.dir/ -style factor -F 2 -o ${ASSAY}.factor.F2.txt
# convert txt to BED
cat ${ASSAY}.factor.F2.txt | grep -v "#" | cut -f 2,3,4 > ${ASSAY}.factor.F2.bed
done
done
```
## Links
Mapping Quality:
http://biofinysics.blogspot.de/2014/05/how-does-bowtie2-assign-mapq-scores.html
http://biofinysics.blogspot.de/2014/05/the-slow-death-of-term-uniquely.html
Homer:
http://homer.ucsd.edu/homer/ngs/ucsc.html
http://homer.ucsd.edu/homer/ngs/peaks.html
## Comments
```{r, engine = 'bash', eval = FALSE}
# make sure files are executable
chmod +x genome_prep.sh
chmod +x data_download.sh
chmod +x workflow.sh
chmod +x homer.sh
```