Skip to content

Commit bb987d1

Browse files
committed
Add feature_analysis pipeline
1 parent b43974b commit bb987d1

File tree

7 files changed

+283
-4
lines changed

7 files changed

+283
-4
lines changed

.github/workflows/test.yml

+9-1
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,16 @@ jobs:
4040
- name: metafx chisq (with depth)
4141
run: |
4242
export PATH=bin:$PATH
43-
metafx chisq -t 6 -m 6G -k 31 -i test_data/sample_list.txt -w wd_chisq_4 --skip-graph -n 1000
43+
metafx chisq -t 6 -m 6G -k 31 -i test_data/sample_list.txt -w wd_chisq_4 -n 1000
4444
metafx chisq -t 6 -m 6G -k 31 -i test_data/sample_list_3.txt -w wd_chisq_3 --skip-graph -n 10000 --depth 15
45+
- name: metafx feature_analysis
46+
run: |
47+
export PATH=bin:$PATH
48+
mkdir reads
49+
ln -s test_data/3* reads/
50+
ln -s test_data/4* reads/
51+
ln -s test_data/test/* reads/
52+
metafx feature_analysis -k 31 -t 6 -m 6G -f wd_chisq_4/ -n A_19 -r reads/ --relab 0.5 -w wd_feat_analysis
4553
- name: metafx stats
4654
run: |
4755
export PATH=bin:$PATH

bin/metafx

+6-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ help_message () {
2525
echo " predict Machine Learning methods to classify new samples based on pre-trained model"
2626
echo " fit_predict Machine Learning methods to train classification model based on extracted features and immediately apply it to classify new samples"
2727
echo " cv Machine Learning methods to train classification model based on extracted features and check accuracy via cross-validation"
28-
echo " bandage Module to train classifier and prepare for visualisation in Bandage (https://github.com/ctlab/BandageNG)"
28+
echo " bandage Module to train classifier and prepare for visualisation in BandageNG (https://github.com/ctlab/BandageNG)"
29+
echo " feature_analysis Module to analyze selected feature in multiple samples and visualize in BandageNG (https://github.com/ctlab/BandageNG)"
2930
echo ""
3031
echo " calc_features Module to count values for new samples based on previously extracted features"
3132
echo " extract_kmers Module to extract k-mers from samples (to speed up multiple calculations)"
@@ -101,6 +102,10 @@ elif [ "$1" = bandage ]; then
101102
echo metafx bandage ${@:2} | tee -a $LOGFILE
102103
{ time ${PIPES}/bandage_pipe.sh ${@:2} 2>&1; echo $? >> $LOGFILE; } | tee -a $LOGFILE
103104
exit `tail -1 $LOGFILE`
105+
elif [ "$1" = feature_analysis ]; then
106+
echo metafx feature_analysis ${@:2} | tee -a $LOGFILE
107+
{ time ${PIPES}/feature_analysis.sh ${@:2} 2>&1; echo $? >> $LOGFILE; } | tee -a $LOGFILE
108+
exit `tail -1 $LOGFILE`
104109
elif [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
105110
help_message
106111
exit 0
+188
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/usr/bin/env bash
2+
##########################################################################################
3+
##### MetaFX feature_analysis module – analyze selected feature in multiple samples #####
4+
##########################################################################################
5+
6+
help_message () {
7+
echo ""
8+
echo "$(metafx -v)"
9+
echo "MetaFX feature_analysis module – pipeline to build de Bruijn graphs for samples with selected feature and visualize them in BandageNG (https://github.com/ctlab/BandageNG)"
10+
echo "Usage: metafx feature_analysis [<Launch options>] [<Input parameters>]"
11+
echo ""
12+
echo "Launch options:"
13+
echo " -h | --help show this help message and exit"
14+
echo " -t | --threads <int> number of threads to use [default: all]"
15+
echo " -m | --memory <MEM> memory to use (values with suffix: 1500M, 4G, etc.) [default: 90% of free RAM]"
16+
echo " -w | --work-dir <dirname> working directory [default: workDir/]"
17+
echo ""
18+
echo "Input parameters:"
19+
echo " -k | --k <int> k-mer size to build de Bruij graphs (in nucleotides, maximum value is 31) [mandatory]"
20+
echo " -f | --feature-dir <dirname> directory containing folders with contigs for each category, feature_table.tsv and categories_samples.tsv files. Usually, it is workDir from other MetaFX modules (unique, stats, colored, metafast, metaspades) [mandatory]"
21+
echo " -n | --feature-name <string> name of the feature of interest (should be one of the values from first column of feature_table.tsv) [mandatory]"
22+
echo " -r | --reads-dir <dirname> directory containing files with reads for samples. FASTQ, FASTA, gzip- or bzip2-compressed [mandatory]"
23+
echo " --relab <int> minimal relative abundance of feature in sample to include sample for further analysis [optional, default: 0.1]"
24+
echo "";}
25+
26+
27+
# Paths to pipelines and scripts
28+
mfx_path=$(which metafx)
29+
bin_path=${mfx_path%/*}
30+
SOFT=${bin_path}/metafx-scripts
31+
PIPES=${bin_path}/metafx-modules
32+
pwd=`dirname "$0"`
33+
34+
comment () { ${SOFT}/pretty_print.py "$1" "-"; }
35+
warning () { ${SOFT}/pretty_print.py "$1" "*"; }
36+
error () { ${SOFT}/pretty_print.py "$1" "*"; exit 1; }
37+
38+
39+
w="workDir"
40+
POSITIONAL=()
41+
while [[ $# -gt 0 ]]
42+
do
43+
key="$1"
44+
case $key in
45+
-h|--help)
46+
help_message
47+
exit 0
48+
;;
49+
-k|--k)
50+
k="$2"
51+
shift
52+
shift
53+
;;
54+
-f|--feature-dir)
55+
featDir="$2"
56+
shift
57+
shift
58+
;;
59+
-n|--feature-name)
60+
featName="$2"
61+
shift
62+
shift
63+
;;
64+
-r|--reads-dir)
65+
readsDir="$2"
66+
shift
67+
shift
68+
;;
69+
--relab)
70+
relab="$2"
71+
shift
72+
shift
73+
;;
74+
-m|--memory)
75+
m="$2"
76+
shift
77+
shift
78+
;;
79+
-t|--threads)
80+
p="$2"
81+
shift
82+
shift
83+
;;
84+
-w|--work-dir)
85+
w="$2"
86+
shift
87+
shift
88+
;;
89+
*) # unknown option
90+
POSITIONAL+=("$1") # save it in an array for later
91+
shift
92+
;;
93+
esac
94+
done
95+
set -- "${POSITIONAL[@]}" # restore positional parameters
96+
97+
98+
99+
100+
if [ ! -d ${featDir} ]; then
101+
error "Invalid directory with features provided"
102+
fi
103+
104+
if [ ! -d ${readsDir} ]; then
105+
error "Invalid directory with samples' reads provided"
106+
fi
107+
108+
if [ ! -f ${featDir}/feature_table.tsv ]; then
109+
error "feature_table.tsv file missing in ${featDir}"
110+
fi
111+
112+
# ==== Step 1 ====
113+
comment "Running step 1: selecting samples containing feature '${featName}'"
114+
cnt=`awk -v var="${featName}" '$1==var {CNT++} END{ print CNT }' ${featDir}/feature_table.tsv`
115+
if [[ cnt -ne 1 ]]; then
116+
error "Cannot find feature '${featName}' in ${featDir}/feature_table.tsv"
117+
fi
118+
119+
mkdir ${w}
120+
cmd1="python ${SOFT}/select_samples_by_feature.py --work-dir ${featDir} --feature ${featName} --res-dir ${w} "
121+
if [[ $relab ]]; then
122+
cmd+="--board ${relab}"
123+
fi
124+
125+
echo "$cmd1"
126+
$cmd1
127+
if [[ $? -eq 0 ]]; then
128+
echo "Total `wc -l ${w}/samples_list_feature_${featName}.txt` samples were selected"
129+
echo "List of samples containing feature '${featName}' saved to ${w}/samples_list_feature_${featName}.txt"
130+
echo "Nucleotide sequence for feature '${featName}' saved to ${w}/seq_feature_${featName}.fasta"
131+
comment "Step 1 finished successfully!"
132+
else
133+
error "Error during step 1!"
134+
exit 1
135+
fi
136+
137+
138+
# ===== Step 2 =====
139+
140+
comment "Running step 2: constructing de Brujn graphs for each selected sample"
141+
142+
143+
cmd2="${PIPES}/metacherchant.sh "
144+
if [[ $k ]]; then
145+
cmd2+="-k $k "
146+
fi
147+
if [[ $m ]]; then
148+
cmd2+="-m $m "
149+
fi
150+
if [[ $p ]]; then
151+
cmd2+="-p $p "
152+
fi
153+
154+
cmd2+="--coverage 1 --maxradius 1000 --bothdirs true --chunklength 10 --merge true "
155+
cmd2+="--seq ${w}/seq_feature_${featName}.fasta "
156+
157+
mkdir ${w}/graphs
158+
159+
while read sample ; do
160+
cmd2_i=${cmd2}
161+
cmd2_i+="-w ${w}/wd_${sample} "
162+
cmd2_i+="-o ${w}/wd_${sample}/output "
163+
readsFiles=`find ${readsDir}/${sample}_* ${readsDir}/${sample}.* 2>/dev/null | paste -sd " "`
164+
cmd2_i+="--reads ${readsFiles}"
165+
echo -n "Processing sample ${sample} (log saved to ${w}/metacherchant.log) ... "
166+
167+
echo "${cmd2_i}" >> ${w}/metacherchant.log
168+
${cmd2_i} &>> ${w}/metacherchant.log
169+
if [[ $? -eq 0 ]]; then
170+
echo "DONE"
171+
else
172+
error "Error during step 2!"
173+
fi
174+
ln -s `realpath $w`/wd_${sample}/output/merged/graph.gfa ${w}/graphs/${sample}.gfa
175+
done<${w}/samples_list_feature_${featName}.txt
176+
177+
178+
if [[ $? -eq 0 ]]; then
179+
comment "All de Bruijn graphs saved to: ${w}/graphs/. To visualise them simultaneously in BandageNG follow instructions from https://github.com/ctlab/BandageNG/wiki#multigraph-mode"
180+
comment "Step 2 finished successfully!"
181+
else
182+
error "Error during step 2!"
183+
exit 1
184+
fi
185+
186+
187+
comment "MetaFX feature_analysis module finished successfully!"
188+
exit 0

bin/metafx-modules/metacherchant.sh

24.7 MB
Binary file not shown.

bin/metafx-scripts/graph2contigs.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
for line in open(wd + "/components-graph.gfa"):
1212
if line.split()[0] == 'S':
1313
_, name, seq, *_ = line.strip().split(sep="\t")
14+
raw_name = name
1415
name = int(name.split("_")[1][1:])
1516
if name != comp:
1617
comp += 1
1718
comp_i = 0
1819
comp_i += 1
19-
print(">" + str(comp) + "_" + str(comp_i), file=file)
20+
print(">" + str(comp) + "_" + str(comp_i) + "\t" + raw_name, file=file)
2021
print(seq, file=file)
2122
file.close()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python
2+
# Utility select samples from feature_table.tsv with feature score greater than board value (default 0.1)
3+
# -*- coding: UTF-8 -*-
4+
5+
import sys
6+
import getopt
7+
import pandas as pd
8+
9+
if __name__ == "__main__":
10+
inputFile = ''
11+
outputFile = ''
12+
feature = ''
13+
category = ''
14+
featureId = ''
15+
board = 0.1
16+
17+
helpString = 'Please add all mandatory parameters --work-dir, --feature, --res-dir and use optional float parameter --board'
18+
19+
argv = sys.argv[1:]
20+
try:
21+
opts, args = getopt.getopt(argv, "h", ["work-dir=", "feature=", "res-dir=", "board="])
22+
except getopt.GetoptError:
23+
print(helpString)
24+
sys.exit(2)
25+
for opt, arg in opts:
26+
if opt == "-h":
27+
print(helpString)
28+
sys.exit()
29+
elif opt == "--work-dir":
30+
workDir = arg
31+
if workDir[0] == "'" or workDir[0] == '"':
32+
workDir = workDir[1:]
33+
if workDir[-1] == "'" or workDir[-1] == '"':
34+
workDir = workDir[:-1]
35+
elif opt == "--feature":
36+
feature = arg
37+
if feature[0] == "'" or feature[0] == '"':
38+
feature = feature[1:]
39+
if feature[-1] == "'" or feature[-1] == '"':
40+
feature = feature[:-1]
41+
category = feature.split("_")[0]
42+
featureId = feature.split("_")[1]
43+
elif opt == "--res-dir":
44+
resDir = arg
45+
if resDir[0] == "'" or resDir[0] == '"':
46+
resDir = resDir[1:]
47+
if resDir[-1] == "'" or resDir[-1] == '"':
48+
resDir = resDir[:-1]
49+
elif opt == "--board":
50+
board = float(arg)
51+
52+
data = pd.read_csv(workDir + '/feature_table.tsv', header=0, index_col=0, sep = '\t')
53+
data = data.T
54+
filteredData = data[feature][data[feature] > board].keys().tolist()
55+
samplesList = open(resDir + '/samples_list_feature_' + feature + '.txt', 'w')
56+
print(*filteredData, sep = "\n", file=samplesList)
57+
samplesList.close()
58+
59+
resSeqFile = open(resDir + '/seq_feature_' + feature + '.fasta', 'w')
60+
featuresFasta = open(workDir + '/contigs_' + category + '/components.seq.fasta', 'r')
61+
62+
while True:
63+
line = featuresFasta.readline()
64+
if not line:
65+
break
66+
if len(line)==0:
67+
continue
68+
if line[0] == '>':
69+
line = line.strip()[1:]
70+
seqName = line.split('_')[0]
71+
if seqName == featureId:
72+
seq = featuresFasta.readline().strip()
73+
print(">"+line, file=resSeqFile)
74+
print(seq, file=resSeqFile)
75+
resSeqFile.close()
76+
featuresFasta.close()

bin/metafx-scripts/tax_to_csv.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@
4747
listLine = line.split('\t')
4848
if (listLine[0] == 'C'):
4949
tax_id = listLine[2].split('taxid')[1][1:-1]
50-
tax_ids.append((listLine[1], tax_id))
50+
#tax_ids.append((listLine[1], tax_id))
51+
tax_ids.append((listLine[1].split("\t")[1], tax_id))
5152
fileR.close()
5253

5354
ncbi = NCBITaxa()

0 commit comments

Comments
 (0)