-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_reference.sh
executable file
·150 lines (125 loc) · 4.88 KB
/
process_reference.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#! /usr/bin/env bash
## This script is to prepare the same reference dataset
## used in 10x cellranger
## Please visit 10x document website for details:
## https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#header
## Code for data preparing were directly modified from 10x's document site
process_fasta(){
local fasta_in=$1 # only support gzipped
# Modify sequence headers in the Ensembl FASTA to match the file
# "GRCh38.primary_assembly.genome.fa" from GENCODE. Unplaced and unlocalized
# sequences such as "KI270728.1" have the same names in both versions.
#
# Input FASTA:
# >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
#
# Output FASTA:
# >chr1 1
# sed commands:
# 1. Replace metadata after space with original contig name, as in GENCODE
# 2. Add "chr" to names of autosomes and sex chromosomes
# 3. Handle the mitochrondrial chromosome
gunzip -c "$fasta_in" \
| sed -E 's/^>(\S+).*/>\1 \1/' \
| sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
| sed -E 's/^>MT />chrM /' \
> genome.fa
}
process_gtf(){
local gtf_in=$1 # only support gzipped
# Remove version suffix from transcript, gene, and exon IDs in order to match
# previous Cell Ranger reference packages
#
# Input GTF:
# ... gene_id "ENSG00000223972.5"; ...
# Output GTF:
# ... gene_id "ENSG00000223972"; gene_version "5"; ...
gtf_modified=$(mktemp -p ./ tmp.XXXXXX.gtf)
# Pattern matches Ensembl gene, transcript, and exon IDs for human or mouse:
ID="(ENS([A-Z]{3})?[GTE][0-9]+)\.([0-9]+)"
gunzip -c "$gtf_in" \
| sed -E 's/gene_id "'"$ID"'";/gene_id "\1"; gene_version "\3";/' \
| sed -E 's/transcript_id "'"$ID"'";/transcript_id "\1"; transcript_version "\3";/' \
| sed -E 's/exon_id "'"$ID"'";/exon_id "\1"; exon_version "\3";/' \
> "$gtf_modified"
# Define string patterns for GTF tags
# NOTES:
# - Since GENCODE release 31/M22 (Ensembl 97), the "lincRNA" and "antisense"
# biotypes are part of a more generic "lncRNA" biotype.
# - These filters are relevant only to GTF files from GENCODE. The GTFs from
# Ensembl release 98 have the following differences:
# - The names "gene_biotype" and "transcript_biotype" are used instead of
# "gene_type" and "transcript_type".
# - Readthrough transcripts are present but are not marked with the
# "readthrough_transcript" tag.
# - Only the X chromosome versions of genes in the pseudoautosomal regions
# are present, so there is no "PAR" tag.
BIOTYPE_PATTERN="(protein_coding|lncRNA|\
IG_C_gene|IG_D_gene|IG_J_gene|IG_LV_gene|IG_V_gene|\
IG_V_pseudogene|IG_J_pseudogene|IG_C_pseudogene|\
TR_C_gene|TR_D_gene|TR_J_gene|TR_V_gene|\
TR_V_pseudogene|TR_J_pseudogene)"
GENE_PATTERN="gene_biotype \"${BIOTYPE_PATTERN}\""
TX_PATTERN="transcript_biotype \"${BIOTYPE_PATTERN}\""
READTHROUGH_PATTERN="tag \"readthrough_transcript\""
PAR_PATTERN="tag \"PAR\""
# Construct the gene ID allowlist. We filter the list of all transcripts
# based on these criteria:
# - allowable gene_type (biotype)
# - allowable transcript_type (biotype)
# - no "PAR" tag (only present for Y chromosome PAR)
# - no "readthrough_transcript" tag
# We then collect the list of gene IDs that have at least one associated
# transcript passing the filters.
cat "$gtf_modified" \
| awk '$3 == "transcript"' \
| grep -E "$GENE_PATTERN" \
| grep -E "$TX_PATTERN" \
| grep -Ev "$READTHROUGH_PATTERN" \
| grep -Ev "$PAR_PATTERN" \
| sed -E 's/.*(gene_id "[^"]+").*/\1/' \
| sort \
| uniq \
> gene_allowlist
# Filter the GTF file based on the gene allowlist
# Copy header lines beginning with "#"
grep -E "^#" "$gtf_modified" > genes.gtf
# Filter to the gene allowlist
grep -Ff gene_allowlist "$gtf_modified" \
>> genes.gtf
rm $gtf_modified gene_allowlist
}
usage(){
cat <<-EOF
This script will help you to generate 10X style genome fasta and gtf files,
output files are "genome.fa" and "genes.gtf".
The detail preparing procedures could be referred from 10x's documentation website:
https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#header
usage:
$(basename $0) --fasta inputGenome.fa.gz --gtf inputGTF.gtf.gz
EOF
exit 0;
}
if [[ -z $1 ]] || [[ $1 -eq "-h" ]] || [[ $1 -eq "-help" ]] || [[ $1 -eq "--help" ]]
then
usage
fi
while [[ ! -z $1 ]]
do
case $1 in
"--fasta")
inputFasta=$2
shift 2
;;
"--gtf")
inputGTF=$2
shift 2
;;
*)
echo "argument is not supported: $1"
usage
;;
esac
done
process_fasta $inputFasta
process_gtf $inputGTF