This respository enables the reproduction of the analysis described in:
Motheramgari et al. Expanding the Chinese hamster ovary cell long non-coding RNA transcriptome using RNASeq Biotechnology and Bioengineering 2020 https://doi.org/10.1002/bit.27467
NCBI Sequence Read Archive
European Nucleotide Archive
All the programmes must be added to the PATH to run the workflow
-
R 3.5.2
This is a simple way to dowload from ENA, for higher speed download use the Aspera client Total data download size: ~95G
mkdir -p data/ena
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/057/SRR10572657/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/058/SRR10572658/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/059/SRR10572659/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/060/SRR10572660/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/061/SRR10572661/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/062/SRR10572662/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/063/SRR10572663/*" -P data/ena || { handle ; error ; }
wget -q "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/064/SRR10572664/*" -P data/ena || { handle ; error ; }
Adapter trimming for the Illumina TruSeq adapters
mkdir data/cutadapt
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
./scripts/trim_adapter.sh -s $sample -i data/ena -o data/cutadapt
done
Trimmomatic for removing low quality bases and filtering resulting reads that are
too short. The script makes two subfolders in the trimmomatic folder for paired and unpaired reads
mkdir data/preprocessed
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
./scripts/trim_quality.sh -s $sample -i data/cutadapt -o data/preprocessed -p 32
done
. The sequence is also prepped for further analysis by retaining only the scaffold ID. In addition, a complementary annotation file is created from NCBI to help with annotation later
mkdir reference_genome
./scripts/prepare_genome.sh -v 98 -o reference_genome
Use the reference genome to build an index for mapping the RNASeq reads
./scripts/make_star_index.sh -g reference_genome/ensembl_chok1_genome.fa -a reference_genome/ensembl_chok1_genome.gtf -p 32 -d reference_genome
mkdir data/mapped
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
./scripts/star_mapping.sh -s $sample -i data/preprocessed/paired -g reference_genome/star_index -o data/mapped -p 32
done
mkdir transcriptome_assembly
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
./scripts/run_stringtie.sh -s $sample -i data/mapped -g reference_genome/ensembl_chok1_genome.gtf -o transcriptome_assembly -p 32
done
In this code the stringtie is separated into transcripts originating from annotated protein coding genes and non-coding RNAs
./scripts/stringtie_merge.sh -t transcriptome_assembly -g reference_genome/ensembl_chok1_genome.gtf -r reference_genome
Make a directory to hold each of the different analyses
mkdir lncrna_annotation
The TPM expression value is calculated for each transcript assembled by StringTie
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
./scripts/calc_tpm.sh -p 32 -s $sample -g transcriptome_assembly/stringtie.all.transcripts.gtf -o lncrna_annotation/TPM -b data/mapped
done
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
awk 'NR>1 {print $2}' data/sample_info.txt > lncrna_annotation/TPM/sample_list.txt
./scripts/tpm_matrix.sh -s lncrna_annotation/TPM/sample_list.txt -o lncrna_annotation/TPM
done
The stringtie transcriptome assembly is used to predict lncRNAs using FEELNc
./scripts/run_feelnc.sh -G reference_genome/ensembl_chok1_genome.gtf -g transcriptome_assembly/non_protein_coding_stringtie.gtf -f reference_genome/ensembl_chok1_genome.fa -o lncrna_annotation/FEELnc
Assess FEELNc output using additional protein potential calculators, PFAM search and BLAST against protein and RNA databases
./scripts/run_transdecoder.sh -g lncrna_annotation/FEELnc/feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf -f reference_genome/ensembl_chok1_genome.fa -o lncrna_annotation/TRANSDECODER
Determine protein coding potential using CPAT
./scripts/run_cpat.sh -f lncrna_annotation/TRANSDECODER/candidate_lncrna_cdna.fa -o lncrna_annotation/CPAT
Determine protein coding potential using CPC2
./scripts/run_cpc2.sh -f lncrna_annotation/TRANSDECODER/candidate_lncrna_cdna.fa -o lncrna_annotation/CPC2
Search for known PFAM domains
./scripts/run_hmmscan.sh -t 32 -e 1e-5 -p lncrna_annotation/TRANSDECODER/longest_orfs.pep -o lncrna_annotation/PFAM
Search SWISSPROT,RFAM & MIRBASE
./scripts/run_blast.sh -t 32 -e 1e-5 -s lncrna_annotation/SWISSPROT -p lncrna_annotation/TRANSDECODER/longest_orfs.pep -m lncrna_annotation/MIRBASE -r lncrna_annotation/RFAM -n lncrna_annotation/TRANSDECODER/candidate_lncrna_cdna.fa
Remove candidates from FEELNc that don't meet criteria
/usr/bin/Rscript R/filter_lncrna.R \
"lncrna_annotation/FEELnc/feelnc_codpot_out/candidate_lncRNA.gtf.lncRNA.gtf" \
"lncrna_annotation/CPC2/CPC2.analysis.txt" \
"lncrna_annotation/CPAT/CPAT.analysis.txt" \
"lncrna_annotation/SWISSPROT/blastp.outfmt6" \
"lncrna_annotation/MIRBASE/blastn.outfmt6" \
"lncrna_annotation/PFAM/pfam_domain_transcripts" \
"lncrna_annotation/RFAM/blastn.outfmt6" \
"lncrna_annotation/TPM/transcript_tpm_all_samples.tsv" \
"lncrna_annotation/FEELnc/lncRNA_classes.txt" \
"transcriptome_assembly/non_protein_coding_stringtie.gtf" \
"lncrna_annotation"
Here we further filter potential lncRNAs by removing monoexonic transcripts
./scripts/orthology_analysis.sh -s lncrna_annotation/firstpass_filter/lncRNA.fasta -o lncrna_annotation
- antisense to a protein coding gene 2) are orthologous with human or mouse 3) annotated as lncRNA in ensembl
./scripts/filter_monoexonic_lncrnas.sh \
-o lncrna_annotation \
-g reference_genome/ensembl_chok1_protein.gtf \
-l lncrna_annotation/firstpass_filter/all_lncrna.gtf \
-k reference_genome/ensembl_lncrna_transcript.txt \
-a transcriptome_assembly/non_protein_coding_stringtie.gtf
/usr/bin/Rscript R/Rscript R/simplify_class.R \
"lncrna_annotation/classification/second_classification.txt" \
"lncrna_annotation/classification/final_classification.txt"
Determine the lncRNAs that are differentially expressed between the temperature shift and non-temperature shift conditions
mkdir differential_expression
cat data/sample_info.txt | cut -f 2 | tail -n 8 | while read sample; do
./scripts/htseq_count.sh -s $sample -m data/mapped -g transcriptome_assembly/stringtie.all.transcripts.gtf -o differential_expression/counts&
done
mkdir differential_expression/results
Rscript R/run_deseq.R "differential_expression/counts" "differential_expression/results"