Add feature_analysis pipeline

ivartb · ivartb · commit bb987d195517 · 2023-12-25T01:29:38.000+03:00
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -40,8 +40,16 @@ jobs:
     - name: metafx chisq (with depth)
       run: |
         export PATH=bin:$PATH
-        metafx chisq -t 6 -m 6G -k 31 -i test_data/sample_list.txt -w wd_chisq_4 --skip-graph -n 1000
+        metafx chisq -t 6 -m 6G -k 31 -i test_data/sample_list.txt -w wd_chisq_4 -n 1000
         metafx chisq -t 6 -m 6G -k 31 -i test_data/sample_list_3.txt -w wd_chisq_3 --skip-graph -n 10000 --depth 15
+    - name: metafx feature_analysis
+      run: |
+        export PATH=bin:$PATH
+        # A symlink's relative target is resolved against the directory that
+        # holds the link, so "test_data/..." targets placed inside reads/ would
+        # dangle. Absolute targets keep the links valid.
+        mkdir -p reads
+        ln -s "$PWD"/test_data/3* reads/
+        ln -s "$PWD"/test_data/4* reads/
+        ln -s "$PWD"/test_data/test/* reads/
+        metafx feature_analysis -k 31 -t 6 -m 6G -f wd_chisq_4/ -n A_19 -r reads/ --relab 0.5 -w wd_feat_analysis
     - name: metafx stats
       run: |
         export PATH=bin:$PATH
diff --git a/bin/metafx b/bin/metafx
@@ -25,7 +25,8 @@ help_message () {
     echo "    predict           Machine Learning methods to classify new samples based on pre-trained model"
     echo "    fit_predict       Machine Learning methods to train classification model based on extracted features and immediately apply it to classify new samples"
     echo "    cv                Machine Learning methods to train classification model based on extracted features and check accuracy via cross-validation"
-    echo "    bandage           Module to train classifier and prepare for visualisation in Bandage (https://github.com/ctlab/BandageNG)"
+    echo "    bandage           Module to train classifier and prepare for visualisation in BandageNG (https://github.com/ctlab/BandageNG)"
+    echo "    feature_analysis  Module to analyze selected feature in multiple samples and visualize in BandageNG (https://github.com/ctlab/BandageNG)"
     echo ""
     echo "    calc_features     Module to count values for new samples based on previously extracted features"
     echo "    extract_kmers     Module to extract k-mers from samples (to speed up multiple calculations)"
@@ -101,6 +102,10 @@ elif [ "$1" = bandage ]; then
     echo metafx bandage ${@:2} | tee -a $LOGFILE
     { time ${PIPES}/bandage_pipe.sh ${@:2} 2>&1; echo $? >> $LOGFILE; } | tee -a $LOGFILE
     exit `tail -1 $LOGFILE`
+elif [ "$1" = feature_analysis ]; then
+    echo metafx feature_analysis ${@:2} | tee -a $LOGFILE
+    { time ${PIPES}/feature_analysis.sh ${@:2} 2>&1; echo $? >> $LOGFILE; } | tee -a $LOGFILE
+    exit `tail -1 $LOGFILE`
 elif [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
     help_message
     exit 0
diff --git a/bin/metafx-modules/feature_analysis.sh b/bin/metafx-modules/feature_analysis.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+##########################################################################################
+##### MetaFX feature_analysis module – analyze selected feature in multiple samples  #####
+##########################################################################################
+
+help_message () {
+    echo ""
+    echo "$(metafx -v)"
+    echo "MetaFX feature_analysis module – pipeline to build de Bruijn graphs for samples with selected feature and visualize them in BandageNG (https://github.com/ctlab/BandageNG)"
+    echo "Usage: metafx feature_analysis [<Launch options>] [<Input parameters>]"
+    echo ""
+    echo "Launch options:"
+    echo "    -h | --help                       show this help message and exit"
+    echo "    -t | --threads       <int>        number of threads to use [default: all]"
+    echo "    -m | --memory        <MEM>        memory to use (values with suffix: 1500M, 4G, etc.) [default: 90% of free RAM]"
+    echo "    -w | --work-dir      <dirname>    working directory [default: workDir/]"
+    echo ""
+    echo "Input parameters:"
+    echo "    -k | --k             <int>        k-mer size to build de Bruij graphs (in nucleotides, maximum value is 31) [mandatory]"
+    echo "    -f | --feature-dir   <dirname>    directory containing folders with contigs for each category, feature_table.tsv and categories_samples.tsv files. Usually, it is workDir from other MetaFX modules (unique, stats, colored, metafast, metaspades) [mandatory]"
+    echo "    -n | --feature-name  <string>     name of the feature of interest (should be one of the values from first column of feature_table.tsv) [mandatory]"
+    echo "    -r | --reads-dir     <dirname>    directory containing files with reads for samples. FASTQ, FASTA, gzip- or bzip2-compressed [mandatory]"
+    echo "         --relab         <int>        minimal relative abundance of feature in sample to include sample for further analysis [optional, default: 0.1]"
+    echo "";}
+
+
+# Paths to pipelines and scripts
+mfx_path=$(which metafx)
+bin_path=${mfx_path%/*}
+SOFT=${bin_path}/metafx-scripts
+PIPES=${bin_path}/metafx-modules
+pwd=`dirname "$0"`
+
+comment () { ${SOFT}/pretty_print.py "$1" "-"; }
+warning () { ${SOFT}/pretty_print.py "$1" "*"; }
+error   () { ${SOFT}/pretty_print.py "$1" "*"; exit 1; }
+
+
+# Command-line parsing. Recognised options store their value in a global
+# variable; anything else is collected in POSITIONAL and restored afterwards.
+# The working directory defaults to "workDir".
+w="workDir"
+POSITIONAL=()
+while (( $# > 0 )); do
+    opt="$1"
+    case "${opt}" in
+        -h|--help)
+            help_message
+            exit 0
+            ;;
+        -k|--k)
+            k="$2"; shift; shift ;;
+        -f|--feature-dir)
+            featDir="$2"; shift; shift ;;
+        -n|--feature-name)
+            featName="$2"; shift; shift ;;
+        -r|--reads-dir)
+            readsDir="$2"; shift; shift ;;
+        --relab)
+            relab="$2"; shift; shift ;;
+        -m|--memory)
+            m="$2"; shift; shift ;;
+        -t|--threads)
+            p="$2"; shift; shift ;;
+        -w|--work-dir)
+            w="$2"; shift; shift ;;
+        *)
+            # Unknown option: keep it so it can be put back as a positional argument.
+            POSITIONAL+=("$1"); shift ;;
+    esac
+done
+# Restore the arguments that were not consumed above.
+set -- "${POSITIONAL[@]}"
+
+
+
+
+# Validate mandatory inputs before doing any work.
+# The variables are quoted on purpose: an unquoted empty value would turn the
+# test into `[ ! -d ]`, which is false, so a missing option would slip through.
+if [[ ! -d "${featDir}" ]]; then
+    error "Invalid directory with features provided"
+fi
+
+if [[ ! -d "${readsDir}" ]]; then
+    error "Invalid directory with samples' reads provided"
+fi
+
+if [[ ! -f "${featDir}/feature_table.tsv" ]]; then
+    error "feature_table.tsv file missing in ${featDir}"
+fi
+
+# ==== Step 1 ====
+comment "Running step 1: selecting samples containing feature '${featName}'"
+cnt=`awk -v var="${featName}" '$1==var {CNT++} END{ print CNT }' ${featDir}/feature_table.tsv`
+if [[ cnt -ne 1 ]]; then
+    error "Cannot find feature '${featName}' in ${featDir}/feature_table.tsv"
+fi
+
+mkdir ${w}
+cmd1="python ${SOFT}/select_samples_by_feature.py --work-dir ${featDir} --feature ${featName} --res-dir ${w} "
+if [[ $relab ]]; then
+    cmd+="--board ${relab}"
+fi
+
+echo "$cmd1"
+$cmd1
+if [[ $? -eq 0 ]]; then
+    echo "Total `wc -l ${w}/samples_list_feature_${featName}.txt` samples were selected"
+    echo "List of samples containing feature '${featName}' saved to ${w}/samples_list_feature_${featName}.txt"
+    echo "Nucleotide sequence for feature '${featName}' saved to ${w}/seq_feature_${featName}.fasta"
+    comment "Step 1 finished successfully!"
+else
+    error "Error during step 1!"
+    exit 1
+fi
+
+
+# ===== Step 2 =====
+
+comment "Running step 2: constructing de Brujn graphs for each selected sample"
+
+
+cmd2="${PIPES}/metacherchant.sh "
+if [[ $k ]]; then
+    cmd2+="-k $k "
+fi
+if [[ $m ]]; then
+    cmd2+="-m $m "
+fi
+if [[ $p ]]; then
+    cmd2+="-p $p "
+fi
+
+cmd2+="--coverage 1 --maxradius 1000 --bothdirs true --chunklength 10  --merge true "
+cmd2+="--seq ${w}/seq_feature_${featName}.fasta "
+
+mkdir ${w}/graphs
+
+while read sample ; do
+    cmd2_i=${cmd2}
+    cmd2_i+="-w ${w}/wd_${sample} "
+    cmd2_i+="-o ${w}/wd_${sample}/output "
+    readsFiles=`find ${readsDir}/${sample}_* ${readsDir}/${sample}.* 2>/dev/null | paste -sd " "`
+    cmd2_i+="--reads ${readsFiles}"
+    echo -n "Processing sample ${sample} (log saved to ${w}/metacherchant.log) ...    "
+    
+    echo "${cmd2_i}" >> ${w}/metacherchant.log
+    ${cmd2_i} &>> ${w}/metacherchant.log
+    if [[ $? -eq 0 ]]; then
+        echo "DONE"
+    else
+        error "Error during step 2!"
+    fi
+    ln -s `realpath $w`/wd_${sample}/output/merged/graph.gfa ${w}/graphs/${sample}.gfa
+done<${w}/samples_list_feature_${featName}.txt
+
+
+if [[ $? -eq 0 ]]; then
+    comment "All de Bruijn graphs saved to: ${w}/graphs/. To visualise them simultaneously in BandageNG follow instructions from https://github.com/ctlab/BandageNG/wiki#multigraph-mode"
+    comment "Step 2 finished successfully!"
+else
+    error "Error during step 2!"
+    exit 1
+fi
+
+
+comment "MetaFX feature_analysis module finished successfully!"
+exit 0
diff --git a/bin/metafx-modules/metacherchant.sh b/bin/metafx-modules/metacherchant.sh
diff --git a/bin/metafx-scripts/graph2contigs.py b/bin/metafx-scripts/graph2contigs.py
@@ -11,11 +11,12 @@
     for line in open(wd + "/components-graph.gfa"):
         if line.split()[0] == 'S':
             _, name, seq, *_ = line.strip().split(sep="\t")
+            raw_name = name
             name = int(name.split("_")[1][1:])
             if name != comp:
                 comp += 1
                 comp_i = 0
             comp_i += 1
-            print(">" + str(comp) + "_" + str(comp_i), file=file)
+            print(">" + str(comp) + "_" + str(comp_i) + "\t" + raw_name, file=file)
             print(seq, file=file)
     file.close()
diff --git a/bin/metafx-scripts/select_samples_by_feature.py b/bin/metafx-scripts/select_samples_by_feature.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# Utility select samples from feature_table.tsv with feature score greater than board value (default 0.1)
+# -*- coding: UTF-8 -*-
+
+import sys
+import getopt
+import pandas as pd
+
+if __name__ == "__main__":
+    inputFile = ''
+    outputFile = ''
+    feature = ''
+    category = ''
+    featureId = ''
+    board = 0.1
+
+    helpString = 'Please add all mandatory parameters --work-dir, --feature, --res-dir and use optional float parameter --board'
+
+    argv = sys.argv[1:]
+    try:
+        opts, args = getopt.getopt(argv, "h", ["work-dir=", "feature=", "res-dir=", "board="])
+    except getopt.GetoptError:
+        print(helpString)
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == "-h":
+            print(helpString)
+            sys.exit()
+        elif opt == "--work-dir":
+            workDir = arg
+            if workDir[0] == "'" or workDir[0] == '"':
+                workDir = workDir[1:]
+            if workDir[-1] == "'" or workDir[-1] == '"':
+                workDir = workDir[:-1]
+        elif opt == "--feature":
+            feature = arg
+            if feature[0] == "'" or feature[0] == '"':
+                feature = feature[1:]
+            if feature[-1] == "'" or feature[-1] == '"':
+                feature = feature[:-1]
+            category = feature.split("_")[0]
+            featureId = feature.split("_")[1]
+        elif opt == "--res-dir":
+            resDir = arg
+            if resDir[0] == "'" or resDir[0] == '"':
+                resDir = resDir[1:]
+            if resDir[-1] == "'" or resDir[-1] == '"':
+                resDir = resDir[:-1]
+        elif opt == "--board":
+            board = float(arg)
+
+    data = pd.read_csv(workDir + '/feature_table.tsv', header=0, index_col=0, sep = '\t')
+    data = data.T
+    filteredData = data[feature][data[feature] > board].keys().tolist()
+    samplesList = open(resDir + '/samples_list_feature_' + feature + '.txt', 'w')
+    print(*filteredData, sep = "\n", file=samplesList)
+    samplesList.close()
+
+    resSeqFile = open(resDir + '/seq_feature_' + feature + '.fasta', 'w')
+    featuresFasta = open(workDir + '/contigs_' + category + '/components.seq.fasta', 'r')
+
+    while True:
+        line = featuresFasta.readline()
+        if not line:
+            break
+        if len(line)==0:
+            continue
+        if line[0] == '>':
+            line = line.strip()[1:]
+            seqName = line.split('_')[0]
+            if seqName == featureId:
+                seq = featuresFasta.readline().strip()
+                print(">"+line, file=resSeqFile)
+                print(seq, file=resSeqFile)
+    resSeqFile.close()
+    featuresFasta.close()
diff --git a/bin/metafx-scripts/tax_to_csv.py b/bin/metafx-scripts/tax_to_csv.py
@@ -47,7 +47,8 @@
         listLine = line.split('\t')
         if (listLine[0] == 'C'):
             tax_id = listLine[2].split('taxid')[1][1:-1]
-            tax_ids.append((listLine[1], tax_id))
+            #tax_ids.append((listLine[1], tax_id))
+            tax_ids.append((listLine[1].split("\t")[1], tax_id))
     fileR.close()
 
     ncbi = NCBITaxa()