-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCleaning+Alignment.sh
106 lines (91 loc) · 3.88 KB
/
Cleaning+Alignment.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash
############################################################################################
# Script Name: Single Cell RNA-seq Processing Pipeline
# Description: This script processes single-cell RNA-seq
# data by cleaning the FASTQ files with fastp
# and then aligning with Salmon. Additionally,
# metadata is extracted and saved.
# Author: Jonathan Anzules
# Contact: [email protected]
# Date Created: September 23, 2023
# Usage: ./Cleaning+Alignment.sh
# Dependencies:
# - fastp
# - salmon
# - Bash
############################################################################################
# ---------------------------------------------------------------------------
# Configuration
# Every value below is only a default: export a variable of the same name
# before launching the script to override it without editing this file.
# ---------------------------------------------------------------------------
# Text file that lists the directories holding the raw FASTQ files
FILE_LIST="${FILE_LIST:-/mnt/c/Users/jonan/Documents/1Work/scWork/Code-for-Single-Cell-Analysis/file_list_T2D.txt}"
# Directory that receives the salmon output folders and the metadata CSV
OUTPUT_DIR="${OUTPUT_DIR:-/mnt/c/Users/jonan/Documents/1Work/scWork/Data/PANCDB/Cleaned-N-Aligned-hpapdata}"
# Salmon index handed to `salmon quant -i`
SALM_INDEX="${SALM_INDEX:-/mnt/c/Users/jonan/Documents/Genomes/Homo-Sapiens/GRCh38/GRCh38-Transcript-Salmon-index}"
# Name of the metadata file, without the .csv extension
scMETADATA_NAME="${scMETADATA_NAME:-metadata_T2D}"

#Testing count (uncomment to limit how many samples are processed while testing)
# echo "How many times should we test?"
# read count_end
# count=0

# The output directory must exist before the metadata file can be written
# into it; -p makes this a no-op when it is already there.
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "ERROR: could not create output directory: $OUTPUT_DIR" >&2
fi

# Initialize the metadata file: any file left by a previous run is
# overwritten and only the CSV header remains.
METADATA_FILE="$OUTPUT_DIR/${scMETADATA_NAME}.csv"
echo "quant_file,donor,cell" > "$METADATA_FILE"

# Initializing the temporary folder (relative to the current working
# directory). -p keeps a re-run from failing when the folder already exists.
mkdir -p temp
#######################################
# Split a sample base name into its donor and cell identifiers.
# The name is split on '_': piece 0 is the donor and piece 2 carries the
# cell tag. The cell tag is then split on '-' and its second piece is used
# as the cell ID. Missing pieces come out as empty strings.
# Arguments: $1 - FASTQ base name with the R1 suffix already removed
# Outputs:   "donor,cell" on stdout
#######################################
parse_donor_cell() {
  local sample_name=$1
  local -a name_fields=() cell_fields=()
  IFS='_' read -ra name_fields <<< "$sample_name"
  IFS='-' read -ra cell_fields <<< "${name_fields[2]:-}"
  printf '%s,%s\n' "${name_fields[0]:-}" "${cell_fields[1]:-}"
}

# Read the list of input directories, one path per line. Reading by line
# (instead of word-splitting the output of cat) keeps paths that contain
# spaces or glob characters intact.
input_dirs=()
if [[ -r "${FILE_LIST:-}" ]]; then
  mapfile -t input_dirs < "$FILE_LIST"
else
  echo "ERROR: cannot read file list: ${FILE_LIST:-<unset>}" >&2
fi

#Loop over each directory named in the file list
for file_pwd in "${input_dirs[@]}"; do
  # Tolerate lists saved with Windows line endings, and skip blank lines
  file_pwd=${file_pwd%$'\r'}
  [[ -n "$file_pwd" ]] || continue

  echo "#############################################################"
  echo "#############################################################"
  echo "Processing file: $file_pwd"
  echo "#############################################################"
  echo "#############################################################"

  for fastq_file in "$file_pwd"/*-R1_fastq-data.fastq.gz; do
    # When nothing matches, the glob stays a literal pattern; skip it
    [[ -e "$fastq_file" ]] || continue

    # Optional test limit: only active when count_end holds a positive
    # integer (see the commented-out prompt near the top of the script).
    # When it is unset, every sample is processed. The check happens before
    # the increment so exactly count_end samples are processed, and
    # `break 2` stops the whole run rather than just this directory.
    if [[ "${count_end:-}" =~ ^[1-9][0-9]*$ ]]; then
      if (( ${count:-0} >= count_end )); then
        break 2
      fi
      count=$(( ${count:-0} + 1 ))
    fi

    # Removing the R1 suffix to later find the R2 strand
    base_name=$(basename "$fastq_file" -R1_fastq-data.fastq.gz)
    paired_file="$file_pwd/${base_name}-R2_fastq-data.fastq.gz"
    clean_r1="./temp/${base_name}_clean_R1.fastq.gz"
    clean_r2="./temp/${base_name}_clean_R2.fastq.gz"

    echo "--------------------"
    echo "file name:"
    echo "$base_name"
    echo ""
    echo "Fastq file pwd:"
    echo "$fastq_file"
    echo ""
    echo "R2 strand pwd"
    echo "$paired_file"
    echo "--------------------"
    echo ""
    echo ""
    echo ""

    # Step 1: make sure cleaned reads exist for this sample.
    # Cleaned files already present in ./temp are reused as they are;
    # otherwise fastp is run to create them. fastp's JSON/HTML reports are
    # discarded.
    sample_ok=1
    if [[ -s "$clean_r1" && -s "$clean_r2" ]]; then
      echo "Reusing cleaned reads already present in ./temp for $base_name"
    elif [[ ! -f "$paired_file" ]]; then
      echo "WARNING: R2 file not found, skipping sample: $paired_file" >&2
      sample_ok=0
    else
      mkdir -p ./temp
      if ! fastp -i "$fastq_file" \
        -o "$clean_r1" \
        -I "$paired_file" \
        -O "$clean_r2" \
        --json /dev/null \
        --html /dev/null; then
        echo "WARNING: fastp failed, skipping sample: $base_name" >&2
        # Do not leave partial output behind: it would be mistaken for a
        # finished cleaning step on the next run
        rm -f -- "$clean_r1" "$clean_r2"
        sample_ok=0
      fi
    fi

    # Step 2: align with salmon, and record the sample only if it succeeded.
    # The command's own exit status is tested directly, so no other command
    # can overwrite $? between the run and the check.
    if (( sample_ok )); then
      if salmon quant -i "$SALM_INDEX" \
        -l A \
        -1 "$clean_r1" \
        -2 "$clean_r2" \
        -o "$OUTPUT_DIR/${base_name}_quant"; then
        # rm "./temp/${base_name}_clean_R1.fastq.gz" "./temp/${base_name}_clean_R2.fastq.gz"

        # Part of the file name is used to name the cell. OG metadata has a
        # similar cellID.
        # Append to the metadata file
        printf '%s,%s\n' \
          "$OUTPUT_DIR/${base_name}_quant/quant.sf" \
          "$(parse_donor_cell "$base_name")" >> "$METADATA_FILE"
      else
        echo "WARNING: salmon failed, sample not added to metadata: $base_name" >&2
      fi
    fi

    echo "--------------------"
  done
done
# removing temp folder
# rm -r temp