diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index e69de29..0000000 diff --git a/.gitignore b/.gitignore index d8a10c3..bdea4dc 100644 --- a/.gitignore +++ b/.gitignore @@ -7,10 +7,8 @@ VCFs/ Gencode/ Cache/ *nohup* -Annotations/ PAMs/* samplesIDs/* -guides/ *.csv *.pkl *.zip diff --git a/Dockerfile b/Dockerfile index d3c2316..df6a8a7 100755 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ FROM mambaorg/micromamba # Set the variables for version control during installation ARG crispritz_version=2.6.6 -ARG crisprme_version=2.1.5 +ARG crisprme_version=2.1.6 # set the shell to bash ENV SHELL bash diff --git a/LICENSE b/LICENSE index 456a677..b01a24e 100755 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,16 @@ -CRISRPme has a dual license. It is made available for free to academic researchers under the Affero License (https://www.gnu.org/licenses/agpl-3.0.en.html). -If you plan to use the CRISRPme for-profit, you will need to purchase a license. Please contact rosalba.giugno@univr.it and lpinello@mgh.harvard.edu for more information. +CRISPRme is distributed under a dual-license model: + +1. Academic Use + CRISPRme is freely available for academic research under the GNU Affero + General Public License v3.0 (AGPL-3.0) + (https://www.gnu.org/licenses/agpl-3.0.en.html). + +2. Commercial Use + For-profit institutions or users intending to use CRISPRme for commercial + purposes must acquire a commercial license. For inquiries and licensing + details, please contact: + - Luca Pinello: lpinello@mgh.harvard.edu + - Rosalba Giugno: rosalba.giugno@univr.it + +For more information on licensing terms and conditions, please reach out to the +contacts above. \ No newline at end of file diff --git a/PostProcess/analisi_indels_NNN.sh b/PostProcess/analisi_indels_NNN.sh index 98e8079..feb4e77 100755 --- a/PostProcess/analisi_indels_NNN.sh +++ b/PostProcess/analisi_indels_NNN.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -e # trace all errors # Script per l'analisi dei targets della ricerca REF e ENR con PAM NNN # Il file dei targets della ricerca sul genoma reference si chiama $REFtargets -> INPUT $1 @@ -47,10 +46,8 @@ touch $REFtargets.corrected # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total #echo 'Creazione file .total.txt' -./extraction.sh "$REFtargets.corrected" "$ENRtargets" "$jobid" || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} # OUTPUT $jobid.common_targets.txt -> Non usato +./extraction.sh "$REFtargets.corrected" "$ENRtargets" "$jobid" +# OUTPUT $jobid.common_targets.txt -> Non usato # $jobid.semi_common_targets.txt # $jobid.unique_targets.txt @@ -73,10 +70,7 @@ rm "$jobid.semi_common_targets.minmaxdisr.txt" #echo 'Creazione cluster del file .total.txt' # 3) Clustering -./cluster.dict.py "$jobid.total.txt" 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' || { - echo "CRISPRme ERROR: indels clustering failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} # OUTPUT $jobid.total.cluster.txt +./cluster.dict.py "$jobid.total.txt" 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' # OUTPUT $jobid.total.cluster.txt #sed -i ':a;N;$!ba;s/\n/\tn\tn\tn\n/g' $jobid.total.cluster.txt #sed -i '$s/$/\tn\tn\tn/g' $jobid.total.cluster.txt @@ -104,10 +98,7 @@ rm "$jobid.total.txt" #echo 'Estrazione sample dal file .total.cluster.txt' -./analisi_indels_NNN.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" "$mismatch" "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} +./analisi_indels_NNN.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" "$mismatch" "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA # OUTPUT $jobid.bestCFD_INDEL.txt # $jobid.CFDGraph.txt (per fare l'area graph dei CFD REF vs ENR) # NOTA AnnotatorAllTargets.py salva su disco SOLO il target con CFD più alto nel cluster e tra le scomposizioni esistenti @@ -133,18 +124,9 @@ echo 'Sorting and adjusting results' # #tail file w/o header and sort for realguide,chr,cluster_pos,score # tail -n +2 $jobid.bestCRISTA_INDEL.txt | LC_ALL=C sort -k15,15 -k4,4 -k6,6n -k21,21rg -T ./ >>$jobid.tmp && mv $jobid.tmp $jobid.bestCRISTA_INDEL.txt -./adjust_cols.py "$jobid.bestCFD_INDEL.txt" || { - echo "CRISPRme ERROR: CFD indels report failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} -./adjust_cols.py "$jobid.bestCRISTA_INDEL.txt" || { - echo "CRISPRme ERROR: CRISTA indels report failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} -./adjust_cols.py "$jobid.bestmmblg_INDEL.txt" || { - echo "CRISPRme ERROR: mismatch+bulges indels report failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} +./adjust_cols.py "$jobid.bestCFD_INDEL.txt" +./adjust_cols.py "$jobid.bestCRISTA_INDEL.txt" +./adjust_cols.py "$jobid.bestmmblg_INDEL.txt" # sed -i '1s/.*/MMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref/' $jobid.bestmmblg_INDEL.txt # sed -i '1s/.*/MMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref/' $jobid.altmmblg.txt @@ -152,18 +134,9 @@ echo 'Sorting and adjusting results' # pr -m -t -J $jobid.bestCFD_INDEL.txt $jobid.bestmmblg_INDEL.txt >$jobid.bestMerge.txt # pr -m -t -J $jobid.altCFD.txt $jobid.altmmblg.txt >$jobid.altMerge.txt -./remove_bad_indel_targets.py "$jobid.bestCFD_INDEL.txt" || { - echo "CRISPRme ERROR: CFD indels report cleaning failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} -./remove_bad_indel_targets.py "$jobid.bestCRISTA_INDEL.txt" || { - echo "CRISPRme ERROR: CRISTA indels report cleaning failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} -./remove_bad_indel_targets.py "$jobid.bestmmblg_INDEL.txt" || { - echo "CRISPRme ERROR: mismatch+bulges indels report cleaning failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} +./remove_bad_indel_targets.py "$jobid.bestCFD_INDEL.txt" +./remove_bad_indel_targets.py "$jobid.bestCRISTA_INDEL.txt" +./remove_bad_indel_targets.py "$jobid.bestmmblg_INDEL.txt" #merge targets in same chr when they are at distance 3 from each other (inclusive) preserving the highest scoring one # ./merge_close_targets_cfd.sh $jobid.bestCFD_INDEL.txt $jobid.bestCFD_INDEL.txt.trimmed 3 'score' diff --git a/PostProcess/extraction.sh b/PostProcess/extraction.sh index b663668..9bb25b7 100755 --- a/PostProcess/extraction.sh +++ b/PostProcess/extraction.sh @@ -1,7 +1,6 @@ #!/bin/bash ##NOTE AWK & GREP REPORT NO STDOUT IF NO MATCHES ARE FOUND (AWK DO NOT PRODUCE ANY OUTPUT) -# set -e # trace all errors #PARAM $1 is ref targets file #PARAM $2 is var targets file diff --git a/PostProcess/merge_alt_chr.sh b/PostProcess/merge_alt_chr.sh index 86b7dd9..deca1af 100755 --- a/PostProcess/merge_alt_chr.sh +++ b/PostProcess/merge_alt_chr.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all failures - dir=$(dirname $1) fileIn=$1 fileOut=$2 @@ -15,7 +13,8 @@ head -1 $fileIn >$fileOut for chrom in ${chroms[@]}; do echo $chrom - awk "/${chrom}\t/" test.targets.txt >$fileIn.$chrom.ref + # awk "/${chrom}\t/" test.targets.txt >$fileIn.$chrom.ref + grep -F -w "$chrom" $fileIn >$fileIn.$chrom.ref cut -f 3 $fileIn.$chrom.ref | LC_ALL=C sort -T "$dir" | uniq >$fileIn.$chrom.ref.targets awk -v chrom="$chrom" '$0 ~ chrom"_" {print($0)}' $fileIn >$fileIn.$chrom.alt awk 'NR==FNR{a[$0];next} !($0 in a)' $fileIn.$chrom.ref.targets $fileIn.$chrom.alt >$fileIn.$chrom.merged diff --git a/PostProcess/merge_close_targets_cfd.sh b/PostProcess/merge_close_targets_cfd.sh index e329593..6568973 100755 --- a/PostProcess/merge_close_targets_cfd.sh +++ b/PostProcess/merge_close_targets_cfd.sh @@ -1,7 +1,5 @@ #!/bin/bash -# set -e # capture any failure - fileIn=$1 fileOut=$2 thresh=$3 #threshold to use in order to merge near targets @@ -27,10 +25,16 @@ echo "Sorting done in $(($ENDTIME - $STARTTIME)) seconds" # echo -e $header | cat - $fileIn.sorted.tmp > $fileIn.sorted # rm $fileIn.sorted.tmp echo "Merging contiguous targets" + +if [[ "${sort_pivot}" == "score" ]]; then + criteria=$sorting_criteria_scoring +else + criteria=$sorting_criteria +fi +python merge_contiguous_targets.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $criteria # python remove_contiguous_samples_cfd.py $fileIn.sorted $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd -python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria -# python remove_contiguous_samples_cfd_new.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria || { -# echo "CRISPRme ERROR: contigous SNP removal failed (script: ${0} line $((LINENO-1)))" >&2 -# exit 1 -# } + + +# python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot +# python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria # rm $fileIn.sorted diff --git a/PostProcess/merge_contiguous_targets.py b/PostProcess/merge_contiguous_targets.py new file mode 100644 index 0000000..86bcd9d --- /dev/null +++ b/PostProcess/merge_contiguous_targets.py @@ -0,0 +1,615 @@ +""" +This module provides functionality for merging target data from input files based +on specified criteria. It includes functions for parsing command line arguments, +processing target data, and writing results to output files. + +Key functions include: +- `parse_commandline`: Parses and validates command line arguments for target + merging configuration. +- `split_target_row`: Splits a target row string into its individual components. +- `update_target_fields`: Updates specified fields in the target list by appending + corresponding values from another list. +- `distribute_targets`: Distributes targets between reference and variant targets + from a given cluster. +- `target_only_var`: Updates a target list to indicate whether it is a variant-only + target. +- `remove_duplicate_targets`: Removes duplicate values from specified fields in a + target list. +- `unfold_variant_targets`: Recovers and processes all variant targets from a given + dictionary. +- `sorting_score`: Generates a sorting key function based on specified criteria + for sorting. +- `sorting_fewest`: Creates a sorting key function based on the fewest specified + criteria. +- `initialize_sorting_criteria`: Initializes a sorting criteria function based on + the provided parameters. +- `retrieve_best_target`: Identifies and retrieves the best target from a given + cluster of targets. +- `merge_targets`: Merges target data from an input file and writes the best + targets to an output file. +- `main`: The entry point of the module that orchestrates the merging process. + +This module is designed to facilitate the analysis of genomic target data, allowing +users to efficiently merge and sort targets based on various criteria. +""" + +from typing import List, Tuple, Dict, Callable +from time import time +from io import TextIOWrapper + +import sys +import os + +SORTCRITERIA = {"mm": 2, "bulges": 1, "mm+bulges": 0} + + +def parse_commandline( + args: List[str], +) -> Tuple[str, str, int, int, int, int, int, int, int, str, List[str]]: + """ + Parses command line arguments for target merging configuration. + This function validates the input arguments and returns the necessary + parameters for processing target files. + + Args: + args (List[str]): A list of command line arguments where: + - args[0] is the targets file name. + - args[1] is the output file name. + - args[2] is the targets merge range in base pairs. + - args[3] is the chromosome column index (1-based). + - args[4] is the position column index (1-based). + - args[5] is the mm+bulges column index (1-based). + - args[6] is the guide column index (1-based). + - args[7] is the SNP info column index (1-based). + - args[8] is the score column index (1-based). + - args[9] is the sorting pivot (score or mm+bulges). + - args[10] is a comma-separated list of sorting criteria. + + Returns: + Tuple[str, str, int, int, int, int, int, int, int, str, List[str]]: + A tuple containing the parsed parameters: + - targets_fname: The targets file name. + - outfname: The output file name. + - rangebp: The targets merge range in base pairs. + - chromidx: The chromosome column index (0-based). + - posidx: The position column index (0-based). + - mmbidx: The mm+bulges column index (0-based). + - guideidx: The guide column index (0-based). + - snpidx: The SNP info column index (0-based). + - scoreidx: The score column index (0-based). + - pivot: The sorting pivot. + - sortcrit: A list of sorting criteria. + + Raises: + FileNotFoundError: If the targets file cannot be found. + ValueError: If the merge range is less than 1 or if invalid sort criteria are provided. + """ + + targets_fname = args[0] # targets file + if not os.path.isfile(targets_fname): + raise FileNotFoundError(f"Unable to locate {targets_fname}") + outfname = args[1] # output file + rangebp = int(args[2]) # targets merge range (bp) + if rangebp < 1: + raise ValueError(f"Forbidden targets merge range ({rangebp})") + chromidx = int(args[3]) - 1 # chromosome col idx + posidx = int(args[4]) - 1 # position col idx + mmbidx = int(args[5]) - 1 # mm+bulges col idx + guideidx = int(args[6]) - 1 # guide col idx + snpidx = int(args[7]) - 1 # snp info col idx + scoreidx = int(args[8]) - 1 # score col idx + pivot = args[9] # targets sorting pivot (score or mm+bulges) + # comma-separated list of criteria to use while sorting targets + sortcrit = args[10].split(",") + if len(sortcrit) > 3 or any(c not in SORTCRITERIA for c in sortcrit): + offending_vals = ",".join([c for c in sortcrit if c not in SORTCRITERIA]) + raise ValueError(f"Forbidden sort criteria selected: {offending_vals}") + return ( + targets_fname, + outfname, + rangebp, + chromidx, + posidx, + mmbidx, + guideidx, + snpidx, + scoreidx, + pivot, + sortcrit, + ) + + +def split_target_row( + target_row: str, guideidx: int, chromidx: int, posidx: int +) -> Tuple[str, str, int]: + """ + Splits a target row string into its individual components. + This function retrieves the guide, chromosome, and position from a target + row based on specified indices. + + Args: + target_row (str): A string representing a single target row, with fields + separated by whitespace. + guideidx (int): The index of the guide field in the target row. + chromidx (int): The index of the chromosome field in the target row. + posidx (int): The index of the position field in the target row. + + Returns: + Tuple[str, str, int]: A tuple containing the extracted guide, chromosome, + and position: + - guide (str): The guide extracted from the target row. + - chromosome (str): The chromosome extracted from the target row. + - position (int): The position extracted from the target row, converted + to an integer. + """ + + fields = target_row.strip().split() + return fields[guideidx], fields[chromidx], int(fields[posidx]) + + +def update_target_fields( + target: List[str], fields: List[str], samplesidx: int, snpid_idx: int, afidx: int +) -> List[str]: + """Update specified fields in the target list by appending corresponding + values from the fields list. + + This function modifies the target list by concatenating values from the fields + list at given indices, which is useful for aggregating information related to + samples, SNP IDs, and allele frequencies. + + Args: + target (List[str]): The list of target values to be updated. + fields (List[str]): The list of new values to append to the target. + samplesidx (int): The index in the target list for sample information. + snpid_idx (int): The index in the target list for SNP ID information. + afidx (int): The index in the target list for allele frequency information. + + Returns: + List[str]: The updated target list with concatenated values. + """ + + target[samplesidx] = f"{target[samplesidx]},{fields[samplesidx]}" + target[snpid_idx] = f"{target[snpid_idx]},{fields[snpid_idx]}" + target[afidx] = f"{target[afidx]},{fields[afidx]}" + return target + + +def distribute_targets( + cluster: List[str], + snpidx: int, + posidx: int, + snpid_idx: int, + samplesidx: int, + afidx: int, +) -> Tuple[List[List[str]], Dict[str, List[List[str]]]]: + """ + Distributes targets between reference and variant targets from a given cluster. + It merges identical targets found in different datasets into a structured + format. + + This function processes a list of target strings, categorizing them into + reference targets and variant targets based on specific indices. Reference + targets are collected in a list, while variant targets are stored in a dictionary, + allowing for the merging of identical targets across datasets. + + Args: + cluster (List[str]): A list of target strings to be processed. + snpidx (int): The index indicating the SNP status of the target. + posidx (int): The index indicating the position of the target. + snpid_idx (int): The index for SNP IDs in the target fields. + samplesidx (int): The index for sample information in the target fields. + afidx (int): The index for allele frequencies in the target fields. + + Returns: + Tuple[List[List[str]], Dict[str, List[List[str]]]]: A tuple containing a + list of reference + targets and a dictionary of variant targets, where each key corresponds + to a unique target and its value is a list of associated target fields. + + Raises: + ValueError: If the input data is malformed or indices are out of range. + """ + + # distribute targets between reference and variant targets + # dict used to merge identical targets found in different datasets + reftargets, vartargets = [], {} + for target in cluster: + fields = target.strip().split() # retrieve target fields + if fields[snpidx] == "n": # target found in reference + reftargets.append(fields) + else: # target found in variant genomes + targetkey = f"{fields[posidx]}_{fields[snpidx]}" + current_target = vartargets.get(targetkey) + if current_target: + # update current sample list, snp ids, and allele freqs + vartargets[targetkey][0] = update_target_fields( + current_target[0], fields, samplesidx, snpid_idx, afidx + ) + else: + vartargets[targetkey] = [fields] # first target at position + return reftargets, vartargets + + +def target_only_var(target: List[str], varonly: bool) -> List[str]: + """ + Updates a target list to indicate whether it is a variant-only target. + This function modifies the target's status based on the provided flag. + + The function checks the `varonly` flag and updates the 13th element of the + target list to "y" if the flag is set to True, indicating that the target is + only found in variant genomes. It returns the modified target list. + + Args: + target (List[str]): A list representing the target information. + varonly (bool): A flag indicating whether the target is variant-only. + + Returns: + List[str]: The updated target list with the appropriate status. + """ + + # set apprpriate flag if no target in reference in current cluster + target[12] = "y" if varonly else target[12] + return target + + +def remove_duplicate_targets( + target: List[str], snpidx: int, snpid_idx: int, afidx: int, samplesidx: int +) -> List[str]: + """ + Removes duplicate values from specified fields in a target list. + This function ensures that SNP IDs, SNP information, allele frequencies, and + sample data contain only unique entries. + + The function takes a target list and specified indices for SNPs, SNP IDs, + allele frequencies, and samples. It processes each of these fields to eliminate + duplicates by converting them into sets and then back into comma-separated + strings, returning the modified target list. + + Args: + target (List[str]): A list representing the target information. + snpidx (int): The index for SNP information in the target list. + snpid_idx (int): The index for SNP IDs in the target list. + afidx (int): The index for allele frequencies in the target list. + samplesidx (int): The index for sample information in the target list. + + Returns: + List[str]: The updated target list with duplicates removed from specified + fields. + """ + + # remove duplicate values from snp ids, snp info, allele freqs, and samples + target[snpidx] = ",".join(set(target[snpidx].split(","))) + target[snpid_idx] = ",".join(set(target[snpid_idx].split(","))) + target[afidx] = ",".join(set(target[afidx].split(","))) + target[samplesidx] = ",".join(set(target[samplesidx].split(","))) + return target + + +def unfold_variant_targets( + vartargets: Dict[str, List[List[str]]], + varonly: bool, + snpidx: int, + snpid_idx: int, + afidx: int, + samplesidx: int, +) -> List[List[str]]: + """ + Recovers and processes all variant targets from a given dictionary. + This function compiles variant targets into a single list while applying + necessary transformations and removing duplicates. + + The function iterates through the provided dictionary of variant targets, + applying the `target_only_var` function to each target based on the `varonly` + flag. It then removes duplicates from the resulting list of targets using the + `remove_duplicate_targets` function, returning a cleaned list of variant targets. + + Args: + vartargets (Dict[str, List[List[str]]]): A dictionary of variant targets, + where each key corresponds to a unique target identifier and the value + is a list of target fields. + varonly (bool): A flag indicating whether to mark targets as variant-only. + snpidx (int): The index for SNP information in the target list. + snpid_idx (int): The index for SNP IDs in the target list. + afidx (int): The index for allele frequencies in the target list. + samplesidx (int): The index for sample information in the target list. + + Returns: + List[List[str]]: A list of processed variant targets with duplicates removed. + """ + + # recover all variant targets and store in a list + vartargets_list = [] + for targets in vartargets.values(): + vartargets_list.extend([target_only_var(t, varonly) for t in targets]) + # remove duplicate values in targets + return [ + remove_duplicate_targets(t, snpidx, snpid_idx, afidx, samplesidx) + for t in vartargets_list + ] + + +def sorting_score(criteria: List[str], score_idx: int, mmbidx: int) -> Callable: + """ + Generates a sorting key function based on specified criteria for sorting. + This function allows for dynamic sorting of items based on one to three criteria, + prioritizing scores and additional metrics. + + The function returns a callable that can be used as a key in sorting operations. + Depending on the number of criteria provided, it constructs a tuple that includes + the negative score (to sort in descending order) and additional metrics derived + from the specified indices, ensuring that items are sorted according to the defined + priorities. + + Args: + criteria (List[str]): A list of criteria used for sorting. + score_idx (int): The index of the score in the items to be sorted. + mmbidx (int): The base index used to calculate additional metrics from + the criteria. + + Returns: + Callable: A function that takes an item and returns a tuple for sorting purposes. + """ + + if len(criteria) == 1: # single criterion + return lambda x: ( + -float(x[score_idx]), + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + ) + elif len(criteria) == 2: + return lambda x: ( + -float(x[score_idx]), + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + ) + # base case (all three ) + return lambda x: ( + -float(x[score_idx]), + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + int(x[mmbidx - SORTCRITERIA[criteria[2]]]), + ) + + +def sorting_fewest(criteria: List[str], mmbidx: int) -> Callable: + """ + Creates a sorting key function based on the fewest specified criteria. + This function allows for sorting items by one to three criteria, focusing on + the values derived from the specified indices. + + The function returns a callable that can be used as a key in sorting operations. + Depending on the number of criteria provided, it constructs a tuple of integer + values from the specified indices, enabling sorting based on the defined + priorities. + + Args: + criteria (List[str]): A list of criteria used for sorting. + mmbidx (int): The base index used to calculate values from the criteria. + + Returns: + Callable: A function that takes an item and returns a tuple for sorting + purposes. + """ + + if len(criteria) == 1: # one criterion + return lambda x: (int(x[mmbidx - SORTCRITERIA[criteria[0]]])) + elif len(criteria) == 2: + return lambda x: ( + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + # int(x[mmbidx - 2]), + # int(x[mmbidx - 1]), + ) + # base case (all three ) + return lambda x: ( + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + int(x[mmbidx - SORTCRITERIA[criteria[2]]]), + ) + + +def initialize_sorting_criteria( + criteria: List[str], scoreidx: int, mmbidx: int, score: bool +) -> Callable: + """ + Initializes a sorting criteria function based on the provided parameters. + This function determines whether to sort by score or by the fewest criteria + based on the input flag. + + Depending on the value of the `score` flag, the function returns either a + sorting function that prioritizes scores or one that focuses on the fewest + specified criteria. This allows for flexible sorting behavior based on the + user's needs. + + Args: + criteria (List[str]): A list of criteria used for sorting. + scoreidx (int): The index of the score in the items to be sorted. + mmbidx (int): The base index used to calculate additional metrics from + the criteria. + score (bool): A flag indicating whether to sort by score or by the fewest + criteria. + + Returns: + Callable: A function that can be used as a key in sorting operations. + """ + + if score: + return sorting_score(criteria, scoreidx, mmbidx) + return sorting_fewest(criteria, mmbidx) + + +def retrieve_best_target( + cluster: List[str], + snpidx: int, + posidx: int, + guideidx: int, + scoreidx: int, + mmbidx: int, + pivot: str, + sorting_criteria: List[str], + outfile: TextIOWrapper, + outfile_disc: TextIOWrapper, +) -> None: + """ + Identifies and retrieves the best target from a given cluster of targets. + This function processes the targets based on specified criteria and outputs + the best target along with alternative alignments to the provided output files. + + The function first distributes the targets into reference and variant categories, + then unfolds the variant targets into a list. It sorts both reference and variant + targets according to the specified sorting criteria, determines the best target + based on the pivot condition, and writes the results to the specified output files, + including the count of remaining targets. + + Args: + cluster (List[str]): A list of target strings representing the cluster. + snpidx (int): The index indicating the SNP status of the target. + posidx (int): The index indicating the position of the target. + guideidx (int): The index for guide information in the target fields. + scoreidx (int): The index for scores in the target fields. + mmbidx (int): The base index used to calculate additional metrics from + the criteria. + pivot (str): A string indicating the pivot for sorting (e.g., "score"). + sorting_criteria (List[str]): A list of criteria used for sorting the + targets. + outfile (TextIOWrapper): The output file for the best target. + outfile_disc (TextIOWrapper): The output file for alternative alignments. + + Returns: + None: This function does not return a value but writes output to the + specified files. + """ + + if not cluster: # opening the first cluster, it will be empty + return # do nothing + reftargets, vartargets = distribute_targets( + cluster, snpidx, posidx, snpidx - 2, guideidx - 2, snpidx - 1 + ) + varonly = not reftargets # check if found only variant targets + # retrieve variant targets in list + vartargets = unfold_variant_targets( + vartargets, varonly, snpidx, snpidx - 2, snpidx - 1, guideidx - 2 + ) + # sort targets using the criteria specified in input + score = pivot == "score" + if reftargets: + reftargets = sorted( + reftargets, + key=initialize_sorting_criteria(sorting_criteria, scoreidx, mmbidx, score), + ) + if vartargets: + vartargets = sorted( + vartargets, + key=initialize_sorting_criteria(sorting_criteria, scoreidx, mmbidx, score), + ) + if varonly: + target = vartargets.pop(0) # retrieve best target + # count the targets remaining in the cluster + target[scoreidx - 1] = str(len(vartargets)) + elif reftargets and vartargets: + if score: # check on score + target = ( + vartargets.pop(0) + if float(vartargets[0][scoreidx]) > float(reftargets[0][scoreidx]) + else reftargets.pop(0) + ) + elif int(vartargets[0][mmbidx]) < int(reftargets[0][mmbidx]): + target = vartargets.pop(0) + else: + target = reftargets.pop(0) + # count the targets remaining in the cluster + target[scoreidx - 1] = str(len(reftargets) + len(vartargets)) + else: + target = reftargets.pop(0) + target[scoreidx - 1] = str(len(reftargets)) + outfile.write("\t".join(target) + "\n") # report the best target + # write alternative alignments + for target in reftargets + vartargets: + target[scoreidx - 1] = str(len(reftargets) + len(vartargets)) + outfile_disc.write("\t".join(target) + "\n") # report the alternative target + + +def merge_targets( + inargs: Tuple[str, str, int, int, int, int, int, int, int, str, List[str]] +) -> None: + """ + Merges target data from an input file and writes the best targets to an output + file. This function processes clusters of targets based on specified criteria + and handles discarded samples in a separate output file. + + The function reads target data from the input file, grouping targets into + clusters based on guide, chromosome, and position. It retrieves the best target + from each cluster and writes the results to the specified output files, ensuring + that discarded samples are also recorded. + + Args: + inargs (Tuple[str, str, int, int, int, int, int, int, int, str, List[str]]): + A tuple containing input parameters, including input and output file + names, indices for various target fields, and sorting criteria. + + Returns: + None: This function does not return a value but writes output to the + specified files. + """ + + outfname_disc = f"{inargs[1]}.discarded_samples" # discarded targets file + # initialize variables used during merge + prevpos, prevguide, prevchrom, cluster = -(inargs[2] + 1), "", "", [] + with open(inargs[0], mode="r") as infile: + with open(inargs[1], mode="w") as outfile: + with open(outfname_disc, mode="w") as outfile_disc: + # header placed in both outfiles + header = infile.readline() + outfile_disc.write(header) + outfile.write(header) + for line in infile: # start reading targets + # retrieve guide chromosome and position + guide, chrom, pos = split_target_row( + line, inargs[6], inargs[3], inargs[4] + ) + # open new targets cluster and retrieve the best target from previous cluster + if ( + prevguide != guide + or prevchrom != chrom + or (pos - prevpos) > inargs[2] + ): + retrieve_best_target( + cluster, + inargs[7], + inargs[4], + inargs[6], + inargs[8], + inargs[5], + inargs[9], + inargs[10], + outfile, + outfile_disc, + ) + cluster = [line] + else: # append target data to current cluster + cluster.append(line) + # update lookup variables + prevpos, prevguide, prevchrom = pos, guide, chrom + retrieve_best_target( + cluster, + inargs[7], + inargs[4], + inargs[6], + inargs[8], + inargs[5], + inargs[9], + inargs[10], + outfile, + outfile_disc, + ) # process the last cluster + + +def main(): + # read input args + inargs = parse_commandline(sys.argv[1:]) + start = time() + merge_targets(inargs) + sys.stdout.write(f"Targets merge completed in {(time() - start):.2f}s\n") + + +if __name__ == "__main__": + main() diff --git a/PostProcess/new_simple_analysis.py b/PostProcess/new_simple_analysis.py index 976c42a..198ef5f 100755 --- a/PostProcess/new_simple_analysis.py +++ b/PostProcess/new_simple_analysis.py @@ -12,10 +12,9 @@ # For scoring of CFD And Doench -tab = str.maketrans("ACTGRYSWMKHDBVactgryswmkhdbv", - "TGACYRSWKMDHVBtgacyrswkmdhvb") +tab = str.maketrans("ACTGRYSWMKHDBVactgryswmkhdbv", "TGACYRSWKMDHVBtgacyrswkmdhvb") -iupac_nucleotides = set('RYSWKMBDHVryswkmbdhv') +iupac_nucleotides = set("RYSWKMBDHVryswkmbdhv") iupac_code_set = { "R": {"A", "G"}, "Y": {"C", "T"}, @@ -45,7 +44,7 @@ "t": {"t"}, "c": {"c"}, "g": {"g"}, - 'N': {'A', 'T', 'G', 'C'} + "N": {"A", "T", "G", "C"}, } @@ -70,7 +69,7 @@ "d": ("A", "G", "T"), "h": ("A", "C", "T"), "v": ("A", "C", "G"), - 'N': ('A', 'T', 'C', 'G') + "N": ("A", "T", "C", "G"), } @@ -79,10 +78,10 @@ def reverse_complement_table(seq): class reversor: - ''' + """ Nel caso debba ordinare più campi però con reverse diversi, eg uno True e l'altro False, posso usare questa classe nella chiave per simulare il contrario del reverse applicato - ''' + """ def __init__(self, obj): self.obj = obj @@ -100,8 +99,8 @@ def calc_cfd(guide_seq, sg, pam, mm_scores, pam_scores, do_scores): # score = -1 # return score score = 1 - sg = sg.replace('T', 'U') - guide_seq = guide_seq.replace('T', 'U') + sg = sg.replace("T", "U") + guide_seq = guide_seq.replace("T", "U") s_list = list(sg) guide_seq_list = list(guide_seq) @@ -110,14 +109,15 @@ def calc_cfd(guide_seq, sg, pam, mm_scores, pam_scores, do_scores): score *= 1 else: try: # Catch exception if IUPAC character - key = 'r' + guide_seq_list[i] + \ - ':d' + revcom(sl) + ',' + str(i + 1) + key = "r" + guide_seq_list[i] + ":d" + revcom(sl) + "," + str(i + 1) except Exception as e: score = 0 break try: score *= mm_scores[key] - except Exception as e: # If '-' is in first position, i do not have the score for that position + except ( + Exception + ) as e: # If '-' is in first position, i do not have the score for that position pass score *= pam_scores[pam] @@ -125,31 +125,35 @@ def calc_cfd(guide_seq, sg, pam, mm_scores, pam_scores, do_scores): def revcom(s): - basecomp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'A', '-': '-'} + basecomp = {"A": "T", "C": "G", "G": "C", "T": "A", "U": "A", "-": "-"} letters = list(s[::-1]) try: letters = [basecomp[base] for base in letters] except: return None # If some IUPAC were not translated - return ''.join(letters) + return "".join(letters) def get_mm_pam_scores(): # print(os.path.dirname(os.path.realpath(__file__))) try: - mm_scores = pickle.load(open(os.path.dirname( - os.path.realpath(__file__)) + '/mismatch_score.pkl', 'rb')) - pam_scores = pickle.load(open(os.path.dirname( - os.path.realpath(__file__)) + '/PAM_scores.pkl', 'rb')) + mm_scores = pickle.load( + open( + os.path.dirname(os.path.realpath(__file__)) + "/mismatch_score.pkl", + "rb", + ) + ) + pam_scores = pickle.load( + open(os.path.dirname(os.path.realpath(__file__)) + "/PAM_scores.pkl", "rb") + ) return (mm_scores, pam_scores) except: - raise Exception( - "Could not find file with mismatch scores or PAM scores") + raise Exception("Could not find file with mismatch scores or PAM scores") def retrieveFromDict(chr_pos): try: - entry = mydict[current_chr+','+str(chr_pos+1)] + entry = mydict[current_chr + "," + str(chr_pos + 1)] except: snp_list = [] sample_list = [] @@ -157,40 +161,48 @@ def retrieveFromDict(chr_pos): rsID_list = [] snp_info_list = [] sample_list.append([]) # no samples - snp_list.append('C') # fake snp - rsID_list.append('.') # no rsid - AF_list.append('0') # fake AF + snp_list.append("C") # fake snp + rsID_list.append(".") # no rsid + AF_list.append("0") # fake AF snp_info_list.append( - current_chr+'_'+str(chr_pos+1)+'_'+'C'+'_'+'G') # fake snp info list + current_chr + "_" + str(chr_pos + 1) + "_" + "C" + "_" + "G" + ) # fake snp info list return snp_list, sample_list, rsID_list, AF_list, snp_info_list - multi_entry = entry.split('$') + multi_entry = entry.split("$") snp_list = [] sample_list = [] AF_list = [] rsID_list = [] snp_info_list = [] for entry in multi_entry: - split_entry = entry.split(';') - samples = split_entry[0].strip().split(',') - if samples[0] == '': + split_entry = entry.split(";") + samples = split_entry[0].strip().split(",") + if samples[0] == "": samples = [] sample_list.append(samples) - snp_list.append(split_entry[1].strip().split(',')[1]) + snp_list.append(split_entry[1].strip().split(",")[1]) rsID_list.append(split_entry[2].strip()) AF_list.append(split_entry[3].strip()) snp_info_list.append( - current_chr+'_'+str(chr_pos+1)+'_'+split_entry[1].split(',')[0]+'_'+split_entry[1].split(',')[1]) + current_chr + + "_" + + str(chr_pos + 1) + + "_" + + split_entry[1].split(",")[0] + + "_" + + split_entry[1].split(",")[1] + ) return snp_list, sample_list, rsID_list, AF_list, snp_info_list -def iupac_decomposition(split, guide_no_bulge, - guide_no_pam, cluster_to_save): +def iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save): realTarget = split[2] - replaceTarget = split[2].replace('-', '') - refSeq = genomeStr[int(split[4]): int(split[4])+len(replaceTarget)].upper() + replaceTarget = split[2].replace("-", "") + refSeq = genomeStr[int(split[4]) : int(split[4]) + len(replaceTarget)].upper() + refseqlist = list(refSeq) revert = False - if split[6] == '-': + if split[6] == "-": revert = True replaceTarget = reverse_complement_table(replaceTarget) @@ -203,31 +215,44 @@ def iupac_decomposition(split, guide_no_bulge, totalDict[1][0] = dict() countIUPAC = 0 for pos_c, c in enumerate(replaceTarget): + if pos_c >= len(refseqlist) or pos_c < 0: # should not be necessary + countIUPAC = 0 + break if c in iupac_code: countIUPAC += 1 snpToReplace, sampleSet, rsID, AF_var, snpInfo = retrieveFromDict( - pos_c+int(split[4])) + pos_c + int(split[4]) + ) for i, elem in enumerate(snpToReplace): - listReplaceTarget = list(refSeq) + listReplaceTarget = [nt for nt in refseqlist] listReplaceTarget[pos_c] = elem listInfo = [[rsID[i], AF_var[i], snpInfo[i]]] if haplotype_check: haploSamples = {0: [], 1: []} for count, sample in enumerate(sampleSet[i]): - sampleInfo = sample.split(':') + sampleInfo = sample.split(":") for haplo in totalDict: - if sampleInfo[1].split('|')[haplo] != '0': + if sampleInfo[1].split("|")[haplo] != "0": haploSamples[haplo].append(sampleInfo[0]) totalDict[0][0][(pos_c, elem)] = [ - listReplaceTarget, set(haploSamples[0]), listInfo] + listReplaceTarget, + set(haploSamples[0]), + listInfo, + ] totalDict[1][0][(pos_c, elem)] = [ - listReplaceTarget, set(haploSamples[1]), listInfo] + listReplaceTarget, + set(haploSamples[1]), + listInfo, + ] else: sampleList = list() for count, sample in enumerate(sampleSet[i]): - sampleList.append(sample.split(':')[0]) + sampleList.append(sample.split(":")[0]) totalDict[0][0][(pos_c, elem)] = [ - listReplaceTarget, set(sampleList), listInfo] + listReplaceTarget, + set(sampleList), + listInfo, + ] if countIUPAC > 0: # if found valid alternative targets if revert: @@ -239,113 +264,122 @@ def iupac_decomposition(split, guide_no_bulge, createdNewLayer = False else: break - totalDict[count][size+1] = dict() + totalDict[count][size + 1] = dict() # for each snp in target (fixpoint) for key in totalDict[count][size]: # for each other snp in target (> fixpoint) for newkey in totalDict[count][0]: if newkey[-2] > key[-2]: resultSet = totalDict[count][size][key][1].intersection( - totalDict[count][0][newkey][1]) # extract intersection of sample to generate possible multisnp target + totalDict[count][0][newkey][1] + ) # extract intersection of sample to generate possible multisnp target if len(resultSet) > 0: # if set is not null createdNewLayer = True # add new snp to preceding target seq with snp - replaceTarget1 = totalDict[count][0][newkey][0].copy( - ) - replaceTarget2 = totalDict[count][size][key][0].copy( - ) - replaceTarget2[newkey[0] - ] = replaceTarget1[newkey[0]] - listInfo2 = totalDict[count][size][key][2].copy( - ) - listInfo2.extend( - totalDict[count][0][newkey][2]) + replaceTarget1 = totalDict[count][0][newkey][0].copy() + replaceTarget2 = totalDict[count][size][key][0].copy() + replaceTarget2[newkey[0]] = replaceTarget1[newkey[0]] + listInfo2 = totalDict[count][size][key][2].copy() + listInfo2.extend(totalDict[count][0][newkey][2]) # add to next level the modified seq and set of samples and info of snp combinedKey = key + newkey - totalDict[count][size+1][combinedKey] = [ - replaceTarget2, resultSet, listInfo2] + totalDict[count][size + 1][combinedKey] = [ + replaceTarget2, + resultSet, + listInfo2, + ] # remove the new generated sample set from all lower levels # (this should be done only with phased VCF since unphased cannot be verified) if haplotype_check: - totalDict[count][size][key][1] = totalDict[count][size][key][1] - \ - totalDict[count][size + - 1][combinedKey][1] - totalDict[count][0][newkey][1] = totalDict[count][0][newkey][1] - \ - totalDict[count][size + - 1][combinedKey][1] + totalDict[count][size][key][1] = ( + totalDict[count][size][key][1] + - totalDict[count][size + 1][combinedKey][1] + ) + totalDict[count][0][newkey][1] = ( + totalDict[count][0][newkey][1] + - totalDict[count][size + 1][combinedKey][1] + ) refSeq_with_bulges = list(refSeq) for pos, char in enumerate(realTarget): - if char == '-': - refSeq_with_bulges.insert(pos, '-') + if char == "-": + refSeq_with_bulges.insert(pos, "-") - for position_t, char_t in enumerate(refSeq_with_bulges[pos_beg: pos_end]): + for position_t, char_t in enumerate(refSeq_with_bulges[pos_beg:pos_end]): if char_t.upper() != guide_no_pam[position_t]: tmp_pos_mms = position_t - if guide_no_pam[position_t] != '-': - refSeq_with_bulges[pos_beg + - position_t] = char_t.lower() + if guide_no_pam[position_t] != "-": + refSeq_with_bulges[pos_beg + position_t] = char_t.lower() # ref sequence with bulges - refSeq_with_bulges = ''.join(refSeq_with_bulges) + refSeq_with_bulges = "".join(refSeq_with_bulges) for level in totalDict[count]: for key in totalDict[count][level]: if len(totalDict[count][level][key][1]) > 0: if revert: totalDict[count][level][key][0] = reverse_complement_table( - ''.join(totalDict[count][level][key][0])) + "".join(totalDict[count][level][key][0]) + ) else: - totalDict[count][level][key][0] = ''.join( - totalDict[count][level][key][0]) + totalDict[count][level][key][0] = "".join( + totalDict[count][level][key][0] + ) final_line = split.copy() target_to_list = list(totalDict[count][level][key][0]) for pos, char in enumerate(realTarget): - if char == '-': - target_to_list.insert(pos, '-') + if char == "-": + target_to_list.insert(pos, "-") mm_new_t = 0 tmp_pos_mms = 0 - for position_t, char_t in enumerate(target_to_list[pos_beg: pos_end]): + for position_t, char_t in enumerate( + target_to_list[pos_beg:pos_end] + ): if char_t.upper() != guide_no_pam[position_t]: mm_new_t += 1 tmp_pos_mms = position_t - if guide_no_pam[position_t] != '-': - target_to_list[pos_beg + - position_t] = char_t.lower() + if guide_no_pam[position_t] != "-": + target_to_list[pos_beg + position_t] = ( + char_t.lower() + ) # pam respect input PAM after IUPAC resolution pam_ok = True - for pam_chr_pos, pam_chr in enumerate(target_to_list[pam_begin: pam_end]): + for pam_chr_pos, pam_chr in enumerate( + target_to_list[pam_begin:pam_end] + ): if pam_chr.upper() not in iupac_code_set[pam[pam_chr_pos]]: pam_ok = False - target_pam_ref = refSeq_with_bulges[pam_begin: pam_end] + target_pam_ref = refSeq_with_bulges[pam_begin:pam_end] found_creation = False for pos_pam, pam_char in enumerate(target_pam_ref): # ref char not in set of general pam char - if not iupac_code_set[pam[pos_pam]] & iupac_code_set[pam_char]: + if ( + not iupac_code_set[pam[pos_pam]] + & iupac_code_set[pam_char] + ): found_creation = True # value of mm and bulges is over allowed threshold, discard target if mm_new_t - int(split[8]) > allowed_mms: continue elif pam_ok: - final_line[2] = ''.join(target_to_list) + final_line[2] = "".join(target_to_list) final_line[7] = str(mm_new_t - int(final_line[8])) # total differences between targets and guide (mismatches + bulges) final_line[9] = str(mm_new_t) if found_creation: - final_line[10] = ''.join( - target_to_list[pam_begin: pam_end]) - final_line[12] = ','.join( - totalDict[count][level][key][1]) - tmp_matrix = np.array( - totalDict[count][level][key][2]) + final_line[10] = "".join( + target_to_list[pam_begin:pam_end] + ) + final_line[12] = ",".join(totalDict[count][level][key][1]) + tmp_matrix = np.array(totalDict[count][level][key][2]) if tmp_matrix.shape[0] > 1: - final_line[15] = ','.join(tmp_matrix[:, 0]) - final_line[16] = ','.join(tmp_matrix[:, 1]) - final_line[17] = ','.join(tmp_matrix[:, 2]) + final_line[15] = ",".join(tmp_matrix[:, 0]) + final_line[16] = ",".join(tmp_matrix[:, 1]) + final_line[17] = ",".join(tmp_matrix[:, 2]) else: final_line[15] = str(tmp_matrix[0][0]) final_line[16] = str(tmp_matrix[0][1]) @@ -363,27 +397,55 @@ def iupac_decomposition(split, guide_no_bulge, def preprocess_CFD_score(target): # preprocess target then calculate CFD score if do_scores: - if target[0] == 'DNA': - cfd_score = calc_cfd(target[1][int(target[bulge_pos]):], target[2].upper()[int( - target[bulge_pos]):-3], target[2].upper()[-2:], mm_scores, pam_scores, do_scores) + if target[0] == "DNA": + cfd_score = calc_cfd( + target[1][int(target[bulge_pos]) :], + target[2].upper()[int(target[bulge_pos]) : -3], + target[2].upper()[-2:], + mm_scores, + pam_scores, + do_scores, + ) # append to target the CFD score of the aligned sequence (alt or ref) target.append("{:.3f}".format(cfd_score)) # -3 position is a placeholder for ref score - if target[-3] == 55: # if 55 sequence is ref so no score have to be calculated + if ( + target[-3] == 55 + ): # if 55 sequence is ref so no score have to be calculated target[-3] = "{:.3f}".format(cfd_score) - if target[-3] == 33: # if 33 sequence is alt so ref score must be calculated - cfd_ref_score = calc_cfd(target[1][int(target[bulge_pos]):], target[-4].upper()[int( - target[bulge_pos]):-3], target[-4].upper()[-2:], mm_scores, pam_scores, do_scores) + if ( + target[-3] == 33 + ): # if 33 sequence is alt so ref score must be calculated + cfd_ref_score = calc_cfd( + target[1][int(target[bulge_pos]) :], + target[-4].upper()[int(target[bulge_pos]) : -3], + target[-4].upper()[-2:], + mm_scores, + pam_scores, + do_scores, + ) target[-3] = "{:.3f}".format(cfd_ref_score) else: - cfd_score = calc_cfd(target[1], target[2].upper()[ - :-3], target[2].upper()[-2:], mm_scores, pam_scores, do_scores) + cfd_score = calc_cfd( + target[1], + target[2].upper()[:-3], + target[2].upper()[-2:], + mm_scores, + pam_scores, + do_scores, + ) target.append("{:.3f}".format(cfd_score)) if target[-3] == 55: target[-3] = "{:.3f}".format(cfd_score) if target[-3] == 33: - cfd_ref_score = calc_cfd(target[1], target[-4].upper()[ - :-3], target[-4].upper()[-2:], mm_scores, pam_scores, do_scores) + cfd_ref_score = calc_cfd( + target[1], + target[-4].upper()[:-3], + target[-4].upper()[-2:], + mm_scores, + pam_scores, + do_scores, + ) target[-3] = "{:.3f}".format(cfd_ref_score) else: # no score calculated, append -1 in CFD score and in position -3 insert -1 value (-1 means no score calculated) @@ -421,43 +483,47 @@ def preprocess_CRISTA_score(cluster_targets): # process all found targets for index, target in enumerate(cluster_targets): # list with non-aligned sgRNA - sgRNA_non_aligned_list.append( - str(target[1])[:len(str(target[1]))-3]+'NGG') + sgRNA_non_aligned_list.append(str(target[1])[: len(str(target[1])) - 3] + "NGG") # list with aligned DNA DNA_aligned_list.append(str(target[2])) # first 5 nucleotide to add to protospacer - pre_protospacer_DNA = genomeStr[int( - target[4])-5:int(target[4])].upper() + pre_protospacer_DNA = genomeStr[int(target[4]) - 5 : int(target[4])].upper() # protospacer taken directly from the aligned target - protospacerDNA = str(target[2]).replace('-', '') - if target[6] == '-': + protospacerDNA = str(target[2]).replace("-", "") + if target[6] == "-": protospacerDNA = reverse_complement_table(protospacerDNA) # last 5 nucleotides to add to protospacer - post_protospacer_DNA = genomeStr[int( - target[4])+len(target[1]):int(target[4])+len(target[1])+5].upper() + post_protospacer_DNA = genomeStr[ + int(target[4]) + len(target[1]) : int(target[4]) + len(target[1]) + 5 + ].upper() # DNA seq extracted from genome and append to aligned DNA seq from CRISPRme - complete_DNA_seq = str(pre_protospacer_DNA) + \ - protospacerDNA+str(post_protospacer_DNA) + complete_DNA_seq = ( + str(pre_protospacer_DNA) + protospacerDNA + str(post_protospacer_DNA) + ) for elem in iupac_nucleotides: if elem in complete_DNA_seq: - complete_DNA_seq = complete_DNA_seq.replace(elem, '') + complete_DNA_seq = complete_DNA_seq.replace(elem, "") # trim the 3' and 5' end to avoid sequences longer than 29 len_DNA_seq = len(complete_DNA_seq) - first_half = complete_DNA_seq[int(len_DNA_seq/2)-14:int(len_DNA_seq/2)] - second_half = complete_DNA_seq[int( - len_DNA_seq/2):int(len_DNA_seq/2)+15] - complete_DNA_seq = first_half+second_half - if target[6] == '-': + first_half = complete_DNA_seq[int(len_DNA_seq / 2) - 14 : int(len_DNA_seq / 2)] + second_half = complete_DNA_seq[int(len_DNA_seq / 2) : int(len_DNA_seq / 2) + 15] + complete_DNA_seq = first_half + second_half + if target[6] == "-": complete_DNA_seq = reverse_complement_table(complete_DNA_seq) # if 'N' is present in the reference DNA seq, we must use a fake DNA seq to complete the aligned # that will be discarded after - if 'N' in complete_DNA_seq or 'n' in complete_DNA_seq or 'N' in DNA_aligned_list[-1] or 'n' in DNA_aligned_list[-1]: - complete_DNA_seq = 'A'*29 - DNA_aligned_list[-1] = 'A'*len(str(target[2])) + if ( + "N" in complete_DNA_seq + or "n" in complete_DNA_seq + or "N" in DNA_aligned_list[-1] + or "n" in DNA_aligned_list[-1] + ): + complete_DNA_seq = "A" * 29 + DNA_aligned_list[-1] = "A" * len(str(target[2])) index_to_null.append(index) # append sequence to DNA list @@ -467,7 +533,8 @@ def preprocess_CRISTA_score(cluster_targets): crista_score_list_alt = list() if do_scores: crista_score_list_alt = CRISTA_predict_list( - sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list) + sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list + ) # preprocess target then calculate CRISTA score sgRNA_non_aligned_list = list() @@ -476,45 +543,48 @@ def preprocess_CRISTA_score(cluster_targets): # process all ref sequences in targets for index, target in enumerate(cluster_targets): # list with non-aligned sgRNA - sgRNA_non_aligned_list.append( - str(target[1])[:len(str(target[1]))-3]+'NGG') + sgRNA_non_aligned_list.append(str(target[1])[: len(str(target[1])) - 3] + "NGG") # list with aligned DNA - if 'n' not in target[-3]: + if "n" not in target[-3]: DNA_aligned_list.append(str(target[-3])) else: DNA_aligned_list.append(str(target[2])) # first 5 nucleotide to add to protospacer - pre_protospacer_DNA = genomeStr[int( - target[4])-5:int(target[4])] + pre_protospacer_DNA = genomeStr[int(target[4]) - 5 : int(target[4])] # protospacer taken directly from the ref genome - protospacerDNA = genomeStr[int(target[4]):int( - target[4])+len(target[1])] + protospacerDNA = genomeStr[int(target[4]) : int(target[4]) + len(target[1])] # last 5 nucleotides to add to protospacer - post_protospacer_DNA = genomeStr[int( - target[4])+len(target[1]):int(target[4])+len(target[1])+5] + post_protospacer_DNA = genomeStr[ + int(target[4]) + len(target[1]) : int(target[4]) + len(target[1]) + 5 + ] # DNA seq extracted from genome and append to aligned DNA seq from CRISPRme - complete_DNA_seq = str(pre_protospacer_DNA) + \ - protospacerDNA+str(post_protospacer_DNA) + complete_DNA_seq = ( + str(pre_protospacer_DNA) + protospacerDNA + str(post_protospacer_DNA) + ) for elem in iupac_nucleotides: if elem in complete_DNA_seq: - complete_DNA_seq = complete_DNA_seq.replace(elem, '') + complete_DNA_seq = complete_DNA_seq.replace(elem, "") # trim the 3' and 5' end to avoid sequences longer than 29 len_DNA_seq = len(complete_DNA_seq) - first_half = complete_DNA_seq[int(len_DNA_seq/2)-14:int(len_DNA_seq/2)] - second_half = complete_DNA_seq[int( - len_DNA_seq/2):int(len_DNA_seq/2)+15] - complete_DNA_seq = first_half+second_half - if target[6] == '-': + first_half = complete_DNA_seq[int(len_DNA_seq / 2) - 14 : int(len_DNA_seq / 2)] + second_half = complete_DNA_seq[int(len_DNA_seq / 2) : int(len_DNA_seq / 2) + 15] + complete_DNA_seq = first_half + second_half + if target[6] == "-": complete_DNA_seq = reverse_complement_table(complete_DNA_seq) # if 'N' is present in the reference DNA seq, we must use a fake DNA seq to complete the aligned # that will be discarded after - if 'N' in complete_DNA_seq or 'n' in complete_DNA_seq or 'N' in DNA_aligned_list[-1] or 'n' in DNA_aligned_list[-1]: - complete_DNA_seq = 'A'*29 - DNA_aligned_list[-1] = 'A'*len(str(target[2])) + if ( + "N" in complete_DNA_seq + or "n" in complete_DNA_seq + or "N" in DNA_aligned_list[-1] + or "n" in DNA_aligned_list[-1] + ): + complete_DNA_seq = "A" * 29 + DNA_aligned_list[-1] = "A" * len(str(target[2])) index_to_null.append(index) # append sequence to DNA list @@ -524,7 +594,8 @@ def preprocess_CRISTA_score(cluster_targets): crista_score_list_ref = list() if do_scores: crista_score_list_ref = CRISTA_predict_list( - sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list) + sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list + ) for index, target in enumerate(cluster_targets): target_CRISTA = target.copy() @@ -536,15 +607,11 @@ def preprocess_CRISTA_score(cluster_targets): else: # else report the correct score if target_CRISTA[-2] == 55: # reference target have duplicate score - target_CRISTA[-2] = "{:.3f}".format( - crista_score_list_alt[index]) - target_CRISTA.append("{:.3f}".format( - crista_score_list_alt[index])) + target_CRISTA[-2] = "{:.3f}".format(crista_score_list_alt[index]) + target_CRISTA.append("{:.3f}".format(crista_score_list_alt[index])) if target_CRISTA[-2] == 33: # alternative target scoring - target_CRISTA[-2] = "{:.3f}".format( - crista_score_list_ref[index]) - target_CRISTA.append("{:.3f}".format( - crista_score_list_alt[index])) + target_CRISTA[-2] = "{:.3f}".format(crista_score_list_ref[index]) + target_CRISTA.append("{:.3f}".format(crista_score_list_alt[index])) # append to final score cluster cluster_scored.append(target_CRISTA) @@ -565,8 +632,7 @@ def calculate_scores(cluster_to_save): # process score for each target in cluster, at the same time to improve execution time cluster_with_CRISTA_score = preprocess_CRISTA_score(cluster_to_save) - - #REMOVED TO CHECK IF FILE IS RETURN WITH IDENTICAL ROWS COUNT + # REMOVED TO CHECK IF FILE IS RETURN WITH IDENTICAL ROWS COUNT # analyze CFD scored targets, returning for each guide,chr,cluster_pos the highest scoring target (or multiple in case of equal) # df_CFD = pd.DataFrame(cluster_with_CFD_score, columns=['Bulge_type', 'crRNA', 'DNA', 'Chromosome', @@ -619,16 +685,16 @@ def calculate_scores(cluster_to_save): # INPUT AND SETTINGS # fasta of the reference chromosome -inFasta = open(sys.argv[1], 'r') -current_chr = inFasta.readline().strip().replace('>', '') # lettura fasta del chr +inFasta = open(sys.argv[1], "r") +current_chr = inFasta.readline().strip().replace(">", "") # lettura fasta del chr genomeStr = inFasta.readlines() # lettura fasta del chr -genomeStr = ''.join(genomeStr).upper() +genomeStr = "".join(genomeStr).upper() # string of the whole chromosome on single line -genomeStr = genomeStr.replace('\n', '') +genomeStr = genomeStr.replace("\n", "") # targets clusterized by chr and ordered by position -inTarget = open(sys.argv[3], 'r') +inTarget = open(sys.argv[3], "r") # text file with PAM sequence and length -inPAMfile = open(sys.argv[4], 'r') +inPAMfile = open(sys.argv[4], "r") # outfile path outputFile = sys.argv[5] # max allowed mismatches in search (to validate ref targets in alternative case) @@ -636,44 +702,44 @@ def calculate_scores(cluster_to_save): # column of bulges count bulge_pos = 8 # header to insert into final file -header = '#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tReference' +header = "#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tReference" # cfd graphs pre-processing (deprecated) -cfd_for_graph = {'ref': [0]*101, 'var': [0]*101} +cfd_for_graph = {"ref": [0] * 101, "var": [0] * 101} # OUT BEST FILES FOR EACH SCORING SYSTEM # file with best CFD targets -cfd_best = open(outputFile + '.bestCFD.txt', 'w+') -cfd_best.write(header + '\tCFD\n') # Write header +cfd_best = open(outputFile + ".bestCFD.txt", "w+") +cfd_best.write(header + "\tCFD\n") # Write header # file with best mm+bul targets -mmblg_best = open(outputFile + '.bestmmblg.txt', 'w+') -mmblg_best.write(header + '\tCFD\n') # Write header +mmblg_best = open(outputFile + ".bestmmblg.txt", "w+") +mmblg_best.write(header + "\tCFD\n") # Write header # file with best CRISTA targets -crista_best = open(outputFile + '.bestCRISTA.txt', 'w+') -crista_best.write(header + '\tCFD\n') # Write header +crista_best = open(outputFile + ".bestCRISTA.txt", "w+") +crista_best.write(header + "\tCFD\n") # Write header # check if dictionaries has haplotypes haplotype_check = False try: - inDict = open(sys.argv[2], 'r') + inDict = open(sys.argv[2], "r") mydict = json.load(inDict) for entry in mydict: - if '|' in mydict[entry]: + if "|" in mydict[entry]: haplotype_check = True break - elif '/' in mydict[entry]: + elif "/" in mydict[entry]: break - print('Haplotype processing', haplotype_check) + print("Haplotype processing", haplotype_check) except: print("No dict found for", current_chr) # check PAM position and relative coordinates on targets pam_at_beginning = False line = inPAMfile.read().strip() -pam = line.split(' ')[0] -len_pam = int(line.split(' ')[1]) +pam = line.split(" ")[0] +len_pam = int(line.split(" ")[1]) guide_len = len(pam) - len_pam pos_beg = 0 pos_end = None @@ -689,7 +755,7 @@ def calculate_scores(cluster_to_save): pam_end = len_pam pam_at_beginning = True else: - pam = pam[(len_pam * (-1)):] + pam = pam[(len_pam * (-1)) :] pos_beg = 0 pos_end = len_pam * (-1) pam_begin = len_pam * (-1) @@ -721,23 +787,22 @@ def calculate_scores(cluster_to_save): # read lines from target file for line in inTarget: # split target into list - split = line.strip().split('\t') + split = line.strip().split("\t") # sgRNA sequence (with bulges and PAM) guide = split[1] # found target on DNA (with bulges, mismatches and PAM) target = split[2] - guide_no_bulge = split[1].replace('-', '') + guide_no_bulge = split[1].replace("-", "") guide_no_pam = guide[pos_beg:pos_end] # check if targets cointains IUPAC nucleotide if any((c in iupac_nucleotides) for c in target): - iupac_decomposition(split, guide_no_bulge, - guide_no_pam, cluster_to_save) + iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save) else: # process_iupac = False # append to respect file format for post analysis # null ref sequence - split.append('n') + split.append("n") # specific value to represent a ref target to avoid recount score split.append(55) # count of mm_bul for ref sequence in case of alternative target @@ -754,17 +819,14 @@ def calculate_scores(cluster_to_save): # remove count of tmp_mms target.pop(-2) # save CFD targets - cfd_best.write( - '\t'.join(target)+'\t'+str(0)+'\n') + cfd_best.write("\t".join(target) + "\t" + str(0) + "\n") # save mm-bul targets - mmblg_best.write( - '\t'.join(target)+'\t'+str(0)+'\n') + mmblg_best.write("\t".join(target) + "\t" + str(0) + "\n") if count == 1: # CRISTA target # remove count of tmp_mms target.pop(-2) # save CRISTA targets - crista_best.write('\t'.join(target)+'\t' + - str(0)+'\n') + crista_best.write("\t".join(target) + "\t" + str(0) + "\n") cluster_to_save = list() @@ -777,14 +839,26 @@ def calculate_scores(cluster_to_save): mmblg_best.close() crista_best.close() # rewrite header file - os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCFD.txt') - os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestmmblg.txt') - os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCRISTA.txt') + os.system( + "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' " + + outputFile + + ".bestCFD.txt" + ) + os.system( + "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' " + + outputFile + + ".bestmmblg.txt" + ) + os.system( + "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' " + + outputFile + + ".bestCRISTA.txt" + ) # cfd dataframe write cfd_dataframe = pd.DataFrame.from_dict(cfd_for_graph) - cfd_dataframe.to_csv(outputFile + '.CFDGraph.txt', sep='\t', index=False) + cfd_dataframe.to_csv(outputFile + ".CFDGraph.txt", sep="\t", index=False) # print complete and exit with no error - print('ANALYSIS COMPLETE IN', time.time() - global_start) + print("ANALYSIS COMPLETE IN", time.time() - global_start) exit(0) # process cluster of targets if less then 1mln rows total @@ -797,28 +871,37 @@ def calculate_scores(cluster_to_save): # remove count of tmp_mms target.pop(-2) # save CFD targets - cfd_best.write( - '\t'.join(target)+'\t'+str(0)+'\n') + cfd_best.write("\t".join(target) + "\t" + str(0) + "\n") # save mm-bul targets - mmblg_best.write( - '\t'.join(target)+'\t'+str(0)+'\n') + mmblg_best.write("\t".join(target) + "\t" + str(0) + "\n") if count == 1: # CRISTA target # remove count of tmp_mms target.pop(-2) # save CRISTA targets - crista_best.write('\t'.join(target)+'\t' + - str(0)+'\n') + crista_best.write("\t".join(target) + "\t" + str(0) + "\n") cfd_best.close() mmblg_best.close() crista_best.close() -os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCFD.txt') -os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestmmblg.txt') -os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCRISTA.txt') +os.system( + "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' " + + outputFile + + ".bestCFD.txt" +) +os.system( + "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' " + + outputFile + + ".bestmmblg.txt" +) +os.system( + "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' " + + outputFile + + ".bestCRISTA.txt" +) cfd_dataframe = pd.DataFrame.from_dict(cfd_for_graph) -cfd_dataframe.to_csv(outputFile + '.CFDGraph.txt', sep='\t', index=False) +cfd_dataframe.to_csv(outputFile + ".CFDGraph.txt", sep="\t", index=False) -print('ANALYSIS COMPLETE IN', time.time() - global_start) +print("ANALYSIS COMPLETE IN", time.time() - global_start) diff --git a/PostProcess/pool_post_analisi_snp.py b/PostProcess/pool_post_analisi_snp.py index 36f2e51..71596c9 100755 --- a/PostProcess/pool_post_analisi_snp.py +++ b/PostProcess/pool_post_analisi_snp.py @@ -22,18 +22,16 @@ def start_analysis(f): # splitted = f.split('.') - chrom = str(f).replace('.fa', '') + chrom = str(f).replace(".fa", "") # for elem in splitted: # if "chr" in elem: # chrom = elem os.system( - f"./post_analisi_snp.sh \"{output_folder}\" \"{ref_folder}\" \"{vcf_name}\" \"{guide_file}\" \"{mm}\" \"{bDNA}\" \"{bRNA}\" {annotation_file} {pam_file} {dict_folder} {final_res} {final_res_alt} {chrom}") + f'./post_analisi_snp.sh "{output_folder}" "{ref_folder}" "{vcf_name}" "{guide_file}" "{mm}" "{bDNA}" "{bRNA}" {annotation_file} {pam_file} {dict_folder} {final_res} {final_res_alt} {chrom}' + ) -chrs = [] -for f in os.listdir(ref_folder): - if '.fa' in f and '.fai' not in f: - chrs.append(f) +chrs = [f for f in os.listdir(ref_folder) if ".fa" in f and ".fai" not in f] # t = 6 # if ncpus < 6: diff --git a/PostProcess/post_analisi_indel.sh b/PostProcess/post_analisi_indel.sh index 16aaf9a..d117a12 100755 --- a/PostProcess/post_analisi_indel.sh +++ b/PostProcess/post_analisi_indel.sh @@ -1,14 +1,5 @@ #!/bin/bash -# set -e - -output_folder=$1 -ref_folder=$2 -ref_name=$(basename $2) -vcf_folder=$3 #!/bin/bash - -set -e - output_folder=$1 ref_folder=$2 ref_name=$(basename $2) @@ -40,44 +31,41 @@ awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_ta header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -vcf_name=$(basename $3) -guide_file=$4 -guide_name=$(basename $4) -mm=$5 -bDNA=$6 -bRNA=$7 -annotation_file=$8 -annotation_name=$(basename $8) -pam_file=$9 -pam_name=$(basename $9) -# sampleID=${10} -dict_folder=${10} +# vcf_name=$(basename $3) +# guide_file=$4 +# guide_name=$(basename $4) +# mm=$5 +# bDNA=$6 +# bRNA=$7 +# annotation_file=$8 +# annotation_name=$(basename $8) +# pam_file=$9 +# pam_name=$(basename $9) +# # sampleID=${10} +# dict_folder=${10} -final_res=${11} -final_res_alt=${12} +# final_res=${11} +# final_res_alt=${12} -key=${13} +# key=${13} -echo "Processing INDELs results for $key, starting post-analysis" -true_chr=$key -fake_chr="fake$true_chr" +# echo "Processing INDELs results for $key, starting post-analysis" +# true_chr=$key +# fake_chr="fake$true_chr" -# create file to prevent void grep failing -touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -# create file to prevent void grep failing -touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) -sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# # create file to prevent void grep failing +# touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# # create file to prevent void grep failing +# touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) +# sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +# ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { +# echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 +# exit 1 +# } diff --git a/PostProcess/post_analisi_snp.sh b/PostProcess/post_analisi_snp.sh index cfb2d28..75e328f 100755 --- a/PostProcess/post_analisi_snp.sh +++ b/PostProcess/post_analisi_snp.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all command failures - output_folder=$1 ref_folder=$2 ref_name=$(basename $2) @@ -30,10 +28,7 @@ if ! [ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_ else awk -v key="$key" '$0 ~ key' "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" fi -./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "${dict_folder}/my_dict_${key}.json" "${ref_folder}/${key}.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "${dict_folder}/my_dict_${key}.json" "${ref_folder}/${key}.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" rm "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" # header=$(head -1 "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestMerge.txt") diff --git a/PostProcess/post_analysis_only.sh b/PostProcess/post_analysis_only.sh index 1d2c6e8..0dd9053 100755 --- a/PostProcess/post_analysis_only.sh +++ b/PostProcess/post_analysis_only.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all failures - #file for automated search of guide+pam in reference and variant genomes ref_folder=$(realpath $1) @@ -121,10 +119,7 @@ if [ "$vcf_name" != "_" ]; then exit fi - ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID $dict_folder $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: indels postprocessing failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID $dict_folder $final_res $final_res_alt $ncpus echo "Post-analysis SNPs End: "$(date +%F-%T) >>$output_folder/$log @@ -155,10 +150,7 @@ else echo "Processing $key" awk -v key="$key" '$0 ~ key { print }' "$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" touch "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" - ./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "_" "$ref_folder" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$sampleID" "$output_folder" || { - echo "CRISPRme ERROR: analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "_" "$ref_folder" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$sampleID" "$output_folder" rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" rm "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" header=$(head -1 "$output_folder/${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestMerge.txt") @@ -180,10 +172,7 @@ if [ "$vcf_name" != "_" ]; then cd "$starting_dir" echo "Post-analysis INDELs Start: "$(date +%F-%T) >>$output_folder/$log - ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID "$output_folder/log_indels_$vcf_name" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR:indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID "$output_folder/log_indels_$vcf_name" $final_res $final_res_alt $ncpus echo "Post-analysis INDELs End: "$(date +%F-%T) >>$output_folder/$log for key in "${array_fake_chroms[@]}"; do tail -n +2 "$output_folder/${key}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestMerge.txt" >>"$final_res" #"$output_folder/${fake_chr}_${guide_name}_${mm}_${bDNA}_${bRNA}.bestCFD.txt.tmp" @@ -197,10 +186,7 @@ fi cd "$starting_dir" echo "Merging Close Targets Start: "$(date +%F-%T) >>$output_folder/$log -./merge_close_targets_cfd.sh $final_res $final_res.trimmed $merge_t || { - echo "CRISPRme ERROR: CFD targets merge failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./merge_close_targets_cfd.sh $final_res $final_res.trimmed $merge_t mv $final_res.trimmed $final_res mv $final_res.trimmed.discarded_samples $final_res_alt @@ -209,10 +195,7 @@ mv $final_res.trimmed.discarded_samples $final_res_alt echo "Merging Close Targets End: "$(date +%F-%T) >>$output_folder/$log echo "Merging Alternative Chromosomes Start: "$(date +%F-%T) >>$output_folder/$log -./merge_alt_chr.sh $final_res $final_res.chr_merged || { - echo "CRISPRme ERROR: alternative targets merge failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./merge_alt_chr.sh $final_res $final_res.chr_merged #rm $final_res.trimmed #./merge_alt_chr.sh $final_res_alt.trimmed $final_res_alt.trimmed.chr_merged @@ -230,10 +213,7 @@ echo "Cleaning directory" if ! [ -d "$output_folder/cfd_graphs" ]; then mkdir $output_folder/cfd_graphs fi -./assemble_cfd_graphs.py $output_folder || { - echo "CRISPRme ERROR: CFD graph creation failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./assemble_cfd_graphs.py $output_folder mv $output_folder/snps.CFDGraph.txt $output_folder/cfd_graphs mv $output_folder/indels.CFDGraph.txt $output_folder/cfd_graphs diff --git a/PostProcess/post_process.sh b/PostProcess/post_process.sh index 79fec33..70f978d 100755 --- a/PostProcess/post_process.sh +++ b/PostProcess/post_process.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e - #$1 is CFD targets file #$2 is genecode annotation #$3 is empirical data for the guide @@ -42,10 +40,7 @@ echo 'Sorting final annotation results using original NR to correspond with orig # LC_ALL=C sort -T $dir -k4,4rg $1.found.bed -o $1.found.bed LC_ALL=C sort -T $dir -k4,4n $1.found.bed -o $1.found.bed echo 'Starting integration with empirical data (this may take a while)' -"$starting_dir"./resultIntegrator.py $1 $3 $1.found.bed $4 $6/ true $5 $7 $9 ${10} ${11} || { - echo "CRISPRme ERROR: result integration failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} +"$starting_dir"./resultIntegrator.py $1 $3 $1.found.bed $4 $6/ true $5 $7 $9 ${10} ${11} echo 'Removing unnecessary files' rm -f $1.bed $1.found.bed $1.redirectFile.out $1.temp.bed # sed -i 1i"#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score" "$1" diff --git a/PostProcess/remove_contiguous_samples_cfd.py b/PostProcess/remove_contiguous_samples_cfd.py old mode 100755 new mode 100644 index ca80ac6..20739de --- a/PostProcess/remove_contiguous_samples_cfd.py +++ b/PostProcess/remove_contiguous_samples_cfd.py @@ -1,680 +1,219 @@ +#!/usr/bin/env python """ -This script parses the input arguments and removes contiguous samples from a file based on specified criteria. +Created on Fri Aug 28 15:58:04 2020 -The script expects the following input arguments: -- infname (str): The input file name. -- outfname (str): The output file name. -- rangebp (int): The maximum range in base pairs for samples to be considered contiguous. -- chrom_idx (int): The index of the chromosome column in the input file. -- position_idx (int): The index of the position column in the input file. -- mm_bul_count_idx (int): The index of the mismatch and bulge count column in the input file. -- guide_idx (int): The index of the guide column in the input file. -- snp_info_idx (int): The index of the SNP info column in the input file. -- score_idx (int): The index of the score column in the input file. -- sort_criterion (str): The sorting criterion for the output file. - -The script reads the input file, parses the specified columns, and removes contiguous samples based on the given range. The resulting data is written to the output file. - -Examples: - $ python remove_contiguous_samples_cfd.py input.txt output.txt 10 2 3 4 5 6 7 8 score +@author: francesco """ - -from typing import List, Tuple, Dict, Callable, Union -from io import TextIOWrapper -from time import time - import sys -import os - -INPUT_ARG_COUNT = 12 -FLAG_ALT_ONLY = 12 -SORTING_PIVOTS = ["score", "total"] -SORTING_CRITERIA = {"mm+bulges": 0, "mm": 2, "bulges": 1} - - -class MergeTargets: - """ - Represents a class for merging targets with specified criteria for sorting - and indexing. - - Args: - args (List[str]): List of arguments containing input parameters for - target merging. +import time - Returns: - None - Raises: - FileNotFoundError: If the input targets file is not found. - ValueError: If there are issues with the specified merge range, column - indices, or sorting criteria. - """ - - def __init__(self, args: List[str]) -> None: - self._infname = args[0] # input targets - if not os.path.exists(self._infname) or not os.path.isfile(self._infname): - raise FileNotFoundError(f"{self._infname} not found") - self._outfname = args[1] # output merged targets - self._rangebp = int(args[2]) # merge bp range - if self._rangebp <= 0: - raise ValueError(f"Invalid merge range ({self._rangebp})") - self._chromidx = int(args[3]) - 1 # chromosome index - if self._chromidx != 4: - raise ValueError( - f"Chromosome data is expected on column 5, got {self._chromidx}" - ) - self._posidx = int(args[4]) - 1 # position index - if self._posidx != 6: - raise ValueError( - f"Position data is expected on column 7, got {self._posidx}" +def get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info): + # avoid crush when cluster is empty in the first call + if not cluster: + return + list_ref = list() + dict_var = dict() + for ele in cluster: + if ele[snp_info] == "n": + list_ref.append(ele) + else: + # merge samples of identical targets (coming from different VCF datasets) + if (ele[pos], ele[snp_info]) in dict_var.keys(): + dict_var[(ele[pos], ele[snp_info])][0][true_guide - 2] = ( + dict_var[(ele[pos], ele[snp_info])][0][true_guide - 2] + + "," + + ele[true_guide - 2] + ) # true_guide - 2 points to samples column + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 2] = ( + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 2] + + "," + + ele[snp_info - 2] + ) # snp_info - 2 points to rsID column + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 1] = ( + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 1] + + "," + + ele[snp_info - 1] + ) # ttuesnp_info_guide - 2 points to AF column + else: + dict_var[(ele[pos], ele[snp_info])] = [ele] + + final_list_best_ref = list() + var_only = False + for target in list_ref: + final_list_best_ref.append(target) + if not final_list_best_ref: + var_only = True + + final_list_best_var = list() + # for each snp_info in dict, extract the targets + for key in dict_var.keys(): + list_var = dict_var[key] + # copy the targets in the variant list, adding unique if no ref target is found + for target in list_var: + if var_only: + target[12] = "y" + final_list_best_var.append(target) + + temp_final_list_best_var = list() + # for target in final_list_best_var: + for target in final_list_best_var: + # remove duplicates into snp info col + target[snp_info] = ",".join(set(target[snp_info].split(","))) + # remove duplicate into rsID col + target[snp_info - 2] = ",".join(set(target[snp_info - 2].split(","))) + # remove duplicate into AF col + target[snp_info - 1] = ",".join(set(target[snp_info - 1].split(","))) + # remove duplicate into samples col + target[true_guide - 2] = ",".join(set(target[true_guide - 2].split(","))) + # append to temp list + temp_final_list_best_var.append(target) + + # final list with polished targets (no duplicates in snp data) + final_list_best_var = temp_final_list_best_var + + # check if lists are empty + validity_check_ref = False + validity_check_var = False + if final_list_best_ref: + validity_check_ref = True + if final_list_best_var: + validity_check_var = True + + # extract best target for each criteria + if sort_order == "score": + # sort per score (CFD or CRISTA) + if validity_check_ref: + final_list_best_ref = sorted( + final_list_best_ref, key=lambda x: (-float(x[cfd]), int(x[total])) ) - self._mmb_idx = int(args[5]) - 1 # mm+bulges index - if self._mmb_idx != 10: - raise ValueError( - f"MM+Bulges data is expected on column 11, got {self._mmb_idx}" + if validity_check_var: + final_list_best_var = sorted( + final_list_best_var, key=lambda x: (-float(x[cfd]), int(x[total])) ) - self._guide_idx = int(args[6]) - 1 # guide index - if self._guide_idx != 15: - raise ValueError( - f"Guide data is expected on column 16, got {self._guide_idx}" + if var_only: # no ref found + # count the residual targets in the list + final_list_best_var[0][cfd - 1] = str(len(final_list_best_var) - 1) + # append the best target to best_file + fileOut.write("\t".join(final_list_best_var[0])) + # pop the best target from the list + bestTarget = final_list_best_var.pop(0) + elif validity_check_ref and validity_check_var: # ref and var targets found + if float(final_list_best_ref[0][cfd]) >= float(final_list_best_var[0][cfd]): + final_list_best_ref[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + else: + final_list_best_var[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut.write("\t".join(final_list_best_var[0])) + bestTarget = final_list_best_var.pop(0) + else: # only ref + final_list_best_ref[0][cfd - 1] = str(len(final_list_best_ref) - 1) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + # write all the remaining targets in the alt file + for count, elem in enumerate(final_list_best_ref): + final_list_best_ref[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 ) - self._snp_idx = int(args[7]) - 1 # snp info index - if self._snp_idx != 18: - raise ValueError(f"SNP data is expected on column 19, got {self._snp_idx}") - self._score_idx = int(args[8]) - 1 # score index - if self._score_idx != 20: - raise ValueError( - f"Score data is expected on column 21, got {self._score_idx}" + fileOut_disc.write(("\t".join(ele))) + for count, elem in enumerate(final_list_best_var): + final_list_best_var[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 ) - self._sort_criterion = args[9] # sorting criterion (pivot) - if self._sort_criterion not in SORTING_PIVOTS: - raise ValueError( - f"Allowed sort pivots: {SORTING_PIVOTS}, got {self._sort_criterion}" + fileOut_disc.write(("\t".join(ele))) + else: + # sort for total (mm+bul) in target + if validity_check_ref: + final_list_best_ref = sorted( + final_list_best_ref, + key=lambda x: (int(x[total - 2]), int(x[total - 1])), ) - self._sorting_criteria_scoring = args[10] # sorting criteria (score is pivot) - self._sorting_criteria = args[11] # sorting criteria (mm+bulges is pivot) - - @property - def targets_fname(self) -> str: - return self._infname - - @property - def targets_fname_merged(self) -> str: - return self._outfname - - @property - def targets_fname_discarded(self) -> str: - return f"{self._outfname}.discarded_samples" - - @property - def rangebp(self) -> int: - return self._rangebp - - @property - def chromidx(self) -> int: - return self._chromidx - - @property - def posidx(self) -> int: - return self._posidx - - @property - def mmbidx(self) -> int: - return self._mmb_idx - - @property - def guideidx(self) -> int: - return self._guide_idx - - @property - def snpidx(self) -> int: - return self._snp_idx - - @property - def scoreidx(self) -> int: - return self._score_idx - - @property - def sort_pivot(self) -> str: - return self._sort_criterion - - @property - def sorting_criteria_scoring(self) -> str: - return self._sorting_criteria_scoring - - @property - def sorting_criteria(self) -> str: - return self._sorting_criteria - - -def parse_input_args(args: List[str]) -> MergeTargets: - """ - Parses the input arguments to create an instance of MergeTargets for handling - target merging based on the provided arguments. - - Args: - args (List[str]): List of input arguments to be parsed for target merging. - - Returns: - MergeTargets: An instance of MergeTargets class for further processing. - """ - - if not args: - raise ValueError("No input argument provided") - if len(args) != INPUT_ARG_COUNT: - raise ValueError(f"Expected {INPUT_ARG_COUNT} arguments, got {len(args)}") - return MergeTargets(args) # parse input arguments - - -def initialize_targets_cluster(rangebp: int) -> Tuple[int, str, str, str, List[str]]: - """ - Initializes the targets cluster by setting the previous cluster position, - guide, chromosome, SNP, and an open starting cluster. - - Args: - rangebp (int): The range of base pairs for the cluster. - - Returns: - Tuple[int, str, str, str, List[str]]: Tuple containing the initialized - cluster parameters. - """ - - pos_prev = -(rangebp + 1) # previous cluster position - guide_prev = "" # previous cluster's guide - chrom_prev = "" # previous cluster's chromosome - snp_prev = "" # previous cluster snp - cluster = [] # open starting cluster - return pos_prev, guide_prev, chrom_prev, snp_prev, cluster - - -def parse_target(target: str) -> List[str]: - """ - Parses the fields of a target string and returns a list of the parsed fields. - - Args: - target (str): The target string to be parsed. - - Returns: - List[str]: List of parsed fields extracted from the target string. - """ - - # parse target's fields - return target.strip().split() - - -def open_targets_cluster( - guide: str, - guide_prev: str, - chrom: str, - chrom_prev: str, - pos: int, - pos_prev: int, - rangebp: int, -) -> bool: - """ - Determines whether a new targets cluster should be opened based on changes - in guide, chromosome, and position within a specified range of base pairs. - - Args: - guide (str): Current guide. - guide_prev (str): Previous guide. - chrom (str): Current chromosome. - chrom_prev (str): Previous chromosome. - pos (int): Current position. - pos_prev (int): Previous position. - rangebp (int): Range of base pairs for cluster merging. - - Returns: - bool: True if a new targets cluster should be opened, False otherwise. - """ - - return guide != guide_prev or chrom != chrom_prev or pos - pos_prev > rangebp - - -def cluster_targets_by_pos_snp( - cluster: List[List[str]], guideidx: int, snpidx: int, posidx: int -) -> Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]]: - """ - Clusters targets based on position and SNP information, separating them into - reference and alternative targets. - - Args: - cluster (List[List[str]]): List of target clusters. - guideidx (int): Index of the guide. - snpidx (int): Index of the SNP information. - posidx (int): Index of the position. - - Returns: - Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]: A tuple - containing the reference targets list and a dictionary of alternative - targets grouped by position and SNP information. - """ - - # initialize reference and alternative targets lists - reference_targets, alternative_targets = [], {} - samplesidx, snpididx, afidx = guideidx - 2, snpidx - 2, snpidx - 1 - for target in cluster: - if target[snpidx] == "n": # reference target - reference_targets.append(target) - continue - # alternative target - pos_t, snp_info_t = target[posidx], target[snpidx] - if (pos_t, snp_info_t) in alternative_targets: - alternative_targets[(pos_t, snp_info_t)][0][ - samplesidx - ] += f",{target[samplesidx]}" # add samples - alternative_targets[(pos_t, snp_info_t)][0][ - snpididx - ] += f",{target[snpididx]}" # add snp ids - alternative_targets[(pos_t, snp_info_t)][0][ - afidx - ] += f",{target[afidx]}" # add allele frequencies - else: - alternative_targets[(pos_t, snp_info_t)] = [target] - return reference_targets, alternative_targets - - -def recover_alternative_targets_list( - alternative_targets: Dict[Tuple[int, str], List[List[str]]], noref: bool -) -> List[List[str]]: - """ - Recovers the list of alternative targets from a dictionary of alternative - targets grouped by position and SNP information. - - Args: - alternative_targets (Dict[Tuple[int, str], List[List[str]]): Dictionary - of alternative targets grouped by position and SNP information. - noref (bool): Flag indicating whether to include reference targets. - - Returns: - List[List[str]]: List of alternative targets with updated flags. - """ - - alternative_targets_list = [] # initialize the list - for v in alternative_targets.values(): - for target in v: - # check whether the target is uniquely found on alternative sequences - target[FLAG_ALT_ONLY] = "y" if noref else target[FLAG_ALT_ONLY] - alternative_targets_list.append(target) - return alternative_targets_list - - -def remove_duplicates(target: List[str], idx: int) -> str: - """ - Removes duplicates from a target list at the specified index and returns the - unique values as a comma-separated string. - - Args: - target (List[str]): List of target values. - idx (int): Index of the target list to remove duplicates from. - - Returns: - str: Comma-separated string of unique values after removing duplicates. - """ - - return ",".join(set(target[idx].split(","))) - - -def unique_values( - targets: List[List[str]], snpidx: int, guideidx: int -) -> List[List[str]]: - """ - Returns a list of targets with unique values for SNP information, SNP ID, - allele frequency, and samples. - - Args: - targets (List[List[str]]): List of target clusters. - snpidx (int): Index of the SNP information. - guideidx (int): Index of the guide. - - Returns: - List[List[str]]: List of targets with unique values for specified fields. - """ - - snpididx, afidx, samplesidx = snpidx - 2, snpidx - 1, guideidx - 2 - targets_filt = [] - # remove duplicate values - for target in targets: - target[snpidx] = remove_duplicates(target, snpidx) # snp info - target[snpididx] = remove_duplicates(target, snpididx) # snp id - target[afidx] = remove_duplicates(target, afidx) # af - target[samplesidx] = remove_duplicates(target, samplesidx) # samples - targets_filt.append(target) # update target - return targets_filt - - -def construct_cluster( - cluster: List[List[str]], - guideidx: int, - snpidx: int, - posidx: int, - scoreidx: int, - mmbidx: int, - sort_pivot: str, - sorting_criteria: str, - sorting_criteria_scoring: str, - outfile: TextIOWrapper, - outfiledisc: TextIOWrapper, -) -> None: - """ - Constructs target clusters by processing and sorting reference and - alternative targets based on specified criteria, and writes the results to - output files. - - Args: - cluster (List[List[str]]): List of target clusters. - guideidx (int): Index of the guide. - snpidx (int): Index of the SNP information. - posidx (int): Index of the position. - scoreidx (int): Index of the score. - mmbidx (int): Index of the MM+Bulges. - sort_pivot (str): Sorting pivot. - sorting_criteria (str): Sorting criteria. - sorting_criteria_scoring (str): Sorting criteria for scoring. - outfile (TextIOWrapper): Output file for reported alignments. - outfiledisc (TextIOWrapper): Output file for alternative alignments. - - Returns: - None - """ - - if not cluster: # avoid crashes when cluster is empty - return - # recover reference and alternative targets - reference_targets, alternative_targets = cluster_targets_by_pos_snp( - cluster, guideidx, snpidx, posidx - ) - noref = ( - not reference_targets - ) # check whether only alternative targets have been found - # recover alternative targets list - alternative_targets = recover_alternative_targets_list(alternative_targets, noref) - # remove duplicates values on snp info, snp id, af, and samples columns - alternative_targets = unique_values(alternative_targets, snpidx, guideidx) - # sort targets - score = sort_pivot == SORTING_PIVOTS[0] - sorting_criteria = sorting_criteria_scoring if score else sorting_criteria - criteria = initialize_sorting_criteria(sorting_criteria, score, scoreidx, mmbidx) - if reference_targets: - reference_targets = sort_targets(reference_targets, criteria) - if alternative_targets: - alternative_targets = sort_targets(alternative_targets, criteria) - # write targets to reported or alternative alignments files - write_best_targets( - reference_targets, - alternative_targets, - noref, - score, - scoreidx, - mmbidx, - outfile, - outfiledisc, - ) - - -def write_best_targets( - reference_targets: List[List[str]], - alternative_targets: List[List[str]], - noref: bool, - score: bool, - scoreidx: int, - mmbidx: int, - outfile: TextIOWrapper, - outfiledisc: TextIOWrapper, -) -> None: - """ - Writes the best targets to the reported alignments file and alternative - alignments file based on specified criteria. - - Args: - reference_targets (List[List[str]]): List of reference targets. - alternative_targets (List[List[str]]): List of alternative targets. - noref (bool): Flag indicating if no reference target is found. - score (bool): Flag indicating if scoring is used for comparison. - scoreidx (int): Index of the score. - mmbidx (int): Index of the MM+Bulges. - outfile (TextIOWrapper): Output file for reported alignments. - outfiledisc (TextIOWrapper): Output file for alternative alignments. - - Returns: - None - """ - - if noref: # no reference target found - target = alternative_targets.pop(0) # pop best target - target[scoreidx - 1] = str(len(alternative_targets)) - elif reference_targets and alternative_targets: # targets found in both - if score: - target = ( - alternative_targets.pop(0) - if float(reference_targets[0][scoreidx]) - < float(alternative_targets[0][scoreidx]) - else reference_targets.pop(0) + if validity_check_var: + final_list_best_var = sorted( + final_list_best_var, + key=lambda x: (int(x[total - 2]), int(x[total - 1])), ) - elif int(alternative_targets[0][mmbidx]) < int(reference_targets[0][mmbidx]): - target = alternative_targets.pop(0) - else: - target = reference_targets.pop(0) - target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) - else: # no alternative target found - target = reference_targets.pop(0) - target[scoreidx - 1] = str(len(reference_targets)) - target = "\t".join(target) - outfile.write(f"{target}\n") - # write the alternative alignments targets - for target in reference_targets + alternative_targets: - target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) - target = "\t".join(target) - outfiledisc.write(f"{target}\n") - - -def sort_targets(targets: List[List[str]], criteria: Callable) -> List[List[str]]: - """ - Sorts the list of targets based on the specified criteria function. - - Args: - targets (List[List[str]]): List of targets to be sorted. - criteria (Callable): Sorting criteria function. - - Returns: - List[List[str]]: Sorted list of targets based on the provided criteria. - """ - - return sorted(targets, key=criteria) - - -def sorting_score(criteria: List[str], score_idx: int, mmb_idx: int) -> Callable: - """ - Returns a sorting function based on the specified criteria for scoring, - MM+Bulges index, and multiple criteria. - - Args: - criteria (List[str]): List of sorting criteria. - score_idx (int): Index of the score. - mmb_idx (int): Index of the MM+Bulges. - - Returns: - Callable: Sorting function based on the provided criteria. - """ - - if len(criteria) == 1: # single criterion - return lambda x: ( - -float(x[score_idx]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - ) - elif len(criteria) == 2: - return lambda x: ( - -float(x[score_idx]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - ) - # base case (all three ) - return lambda x: ( - -float(x[score_idx]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), - ) - - -def sorting_fewest(criteria: List[str], mmb_idx: int) -> Callable: - """ - Returns a sorting function based on the specified criteria for the fewest - MM+Bulges index. - - Args: - criteria (List[str]): List of sorting criteria. - mmb_idx (int): Index of the MM+Bulges. - - Returns: - Callable: Sorting function based on the provided criteria for the fewest - MM+Bulges index. - """ - - if len(criteria) == 1: # one criterion - return lambda x: (int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]])) - elif len(criteria) == 2: - return lambda x: ( - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - ) - # base case (all three ) - return lambda x: ( - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), - ) - - -def initialize_sorting_criteria( - sorting_criteria: str, score: bool, score_idx: int, mmb_idx: int -) -> Callable: - """ - Initializes the sorting criteria function based on the specified sorting - criteria, score flag, score index, and MM+Bulges index. - - Args: - sorting_criteria (str): Comma-separated string of sorting criteria. - score (bool): Flag indicating if scoring is used for sorting. - score_idx (int): Index of the score. - mmb_idx (int): Index of the MM+Bulges. - - Returns: - Callable: Sorting criteria function based on the provided parameters. - """ - - criteria = sorting_criteria.split(",") - if len(criteria) > 3: - raise ValueError("Mismatching sorting criteria selected") - if any(c not in SORTING_CRITERIA for c in criteria): - raise ValueError("Unknown sorting criteria") - if score: - return sorting_score(criteria, score_idx, mmb_idx) - return sorting_fewest(criteria, mmb_idx) - - -def split_targets(input_args: MergeTargets) -> None: - """ - Splits and processes the input targets file into best and alternative targets - files based on specified criteria. - - Args: - input_args (MergeTargets): Object containing input arguments for target - splitting and processing. - - Returns: - None - - Raises: - OSError: If an error occurs during the process of splitting and processing - the targets. - """ - - # open best, and alternative targets files - try: - with open(input_args.targets_fname, mode="r") as infile, open( - input_args.targets_fname_merged, mode="w" - ) as outfile, open(input_args.targets_fname_discarded, mode="w") as discfile: - # write header to outfiles - header = infile.readline().strip() - outfile.write(f"{header}\n") - discfile.write(f"{header}\n") - # begin dividing targets - ( - pos_prev, - guide_prev, - chrom_prev, - snp_prev, - cluster, - ) = initialize_targets_cluster(input_args.rangebp) - for line in infile: - target = parse_target(line) - guide, chrom, pos, snp = ( - target[input_args.guideidx], - target[input_args.chromidx], - int(target[input_args.posidx]), - target[input_args.snpidx], + if var_only: # no ref found + # count the residual targets in the list + final_list_best_var[0][cfd - 1] = str(len(final_list_best_var) - 1) + # append the best target to best_file + fileOut.write("\t".join(final_list_best_var[0])) + # pop the best target from the list + bestTarget = final_list_best_var.pop(0) + elif validity_check_ref and validity_check_var: # ref and var targets found + if int(final_list_best_ref[0][total]) <= int(final_list_best_var[0][total]): + final_list_best_ref[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + else: + final_list_best_var[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 ) - if open_targets_cluster( - guide, - guide_prev, - chrom, - chrom_prev, - pos, - pos_prev, - input_args.rangebp, + fileOut.write("\t".join(final_list_best_var[0])) + bestTarget = final_list_best_var.pop(0) + else: # only ref + final_list_best_ref[0][cfd - 1] = str(len(final_list_best_ref) - 1) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + # write all the remaining targets in the alt file + for count, elem in enumerate(final_list_best_ref): + final_list_best_ref[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut_disc.write(("\t".join(elem))) + for count, elem in enumerate(final_list_best_var): + final_list_best_var[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut_disc.write(("\t".join(elem))) + + +tau = int(sys.argv[3]) # range in bp to merge targets +chrom = int(sys.argv[4]) - 1 # chromoso +pos = int(sys.argv[5]) - 1 # position of target +total = int(sys.argv[6]) - 1 # mm+bul value +true_guide = int(sys.argv[7]) - 1 # real guide used in the search +snp_info = int(sys.argv[8]) - 1 # snp_info (ref_alt_allele) +cfd = int(sys.argv[9]) - 1 # CFD score +sort_order = str(sys.argv[10]) +# -1 is to get the correct "python enumeration" from the bash script + +start = time.time() +with open(sys.argv[1], "r") as fileIn: + header = fileIn.readline() + with open(sys.argv[2], "w") as fileOut: + with open(sys.argv[2] + ".discarded_samples", "w") as fileOut_disc: + fileOut.write(header) + fileOut_disc.write(header) + prev_pos = -(tau + 1) + best_row = "" + prev_guide = "" + prev_chr = "" + prev_snp = "" + cluster = [] + for line in fileIn: + splitted = line.split("\t") + if ( + prev_guide != splitted[true_guide] + or prev_chr != splitted[chrom] + or int(splitted[pos]) - prev_pos > tau ): - construct_cluster( - cluster, - input_args.guideidx, - input_args.snpidx, - input_args.posidx, - input_args.scoreidx, - input_args.mmbidx, - input_args.sort_pivot, - input_args.sorting_criteria, - input_args.sorting_criteria_scoring, - outfile, - discfile, - ) - cluster = [target] + get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info) + cluster = [splitted] else: - cluster.append(target) - guide_prev = guide - pos_prev = pos - chrom_prev = chrom - snp_prev = snp - construct_cluster( - cluster, - input_args.guideidx, - input_args.snpidx, - input_args.posidx, - input_args.scoreidx, - input_args.mmbidx, - input_args.sort_pivot, - input_args.sorting_criteria, - input_args.sorting_criteria_scoring, - outfile, - discfile, - ) - except IOError as e: - raise OSError("An error occurred while merging contiguous targets") from e - - -def merge_targets() -> None: - """ - Merges targets by parsing input arguments, splitting the targets, and - displaying the completion time. - - Returns: - None - """ - - start = time() # start time point - input_args = parse_input_args(sys.argv[1:]) - split_targets(input_args) - sys.stdout.write(f"Merge completed in {(time() - start)}s\n") + cluster.append(splitted) + prev_guide = splitted[true_guide] + prev_pos = int(splitted[pos]) + prev_chr = splitted[chrom] + prev_snp = splitted[snp_info] + get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info) -if __name__ == "__main__": - merge_targets() +print("Mergin done in: " + str(time.time() - start)) diff --git a/PostProcess/remove_contiguous_samples_cfd_new.py b/PostProcess/remove_contiguous_samples_cfd_new.py new file mode 100755 index 0000000..ca80ac6 --- /dev/null +++ b/PostProcess/remove_contiguous_samples_cfd_new.py @@ -0,0 +1,680 @@ +""" +This script parses the input arguments and removes contiguous samples from a file based on specified criteria. + +The script expects the following input arguments: +- infname (str): The input file name. +- outfname (str): The output file name. +- rangebp (int): The maximum range in base pairs for samples to be considered contiguous. +- chrom_idx (int): The index of the chromosome column in the input file. +- position_idx (int): The index of the position column in the input file. +- mm_bul_count_idx (int): The index of the mismatch and bulge count column in the input file. +- guide_idx (int): The index of the guide column in the input file. +- snp_info_idx (int): The index of the SNP info column in the input file. +- score_idx (int): The index of the score column in the input file. +- sort_criterion (str): The sorting criterion for the output file. + +The script reads the input file, parses the specified columns, and removes contiguous samples based on the given range. The resulting data is written to the output file. + +Examples: + $ python remove_contiguous_samples_cfd.py input.txt output.txt 10 2 3 4 5 6 7 8 score +""" + +from typing import List, Tuple, Dict, Callable, Union +from io import TextIOWrapper +from time import time + +import sys +import os + +INPUT_ARG_COUNT = 12 +FLAG_ALT_ONLY = 12 +SORTING_PIVOTS = ["score", "total"] +SORTING_CRITERIA = {"mm+bulges": 0, "mm": 2, "bulges": 1} + + +class MergeTargets: + """ + Represents a class for merging targets with specified criteria for sorting + and indexing. + + Args: + args (List[str]): List of arguments containing input parameters for + target merging. + + Returns: + None + + Raises: + FileNotFoundError: If the input targets file is not found. + ValueError: If there are issues with the specified merge range, column + indices, or sorting criteria. + """ + + def __init__(self, args: List[str]) -> None: + self._infname = args[0] # input targets + if not os.path.exists(self._infname) or not os.path.isfile(self._infname): + raise FileNotFoundError(f"{self._infname} not found") + self._outfname = args[1] # output merged targets + self._rangebp = int(args[2]) # merge bp range + if self._rangebp <= 0: + raise ValueError(f"Invalid merge range ({self._rangebp})") + self._chromidx = int(args[3]) - 1 # chromosome index + if self._chromidx != 4: + raise ValueError( + f"Chromosome data is expected on column 5, got {self._chromidx}" + ) + self._posidx = int(args[4]) - 1 # position index + if self._posidx != 6: + raise ValueError( + f"Position data is expected on column 7, got {self._posidx}" + ) + self._mmb_idx = int(args[5]) - 1 # mm+bulges index + if self._mmb_idx != 10: + raise ValueError( + f"MM+Bulges data is expected on column 11, got {self._mmb_idx}" + ) + self._guide_idx = int(args[6]) - 1 # guide index + if self._guide_idx != 15: + raise ValueError( + f"Guide data is expected on column 16, got {self._guide_idx}" + ) + self._snp_idx = int(args[7]) - 1 # snp info index + if self._snp_idx != 18: + raise ValueError(f"SNP data is expected on column 19, got {self._snp_idx}") + self._score_idx = int(args[8]) - 1 # score index + if self._score_idx != 20: + raise ValueError( + f"Score data is expected on column 21, got {self._score_idx}" + ) + self._sort_criterion = args[9] # sorting criterion (pivot) + if self._sort_criterion not in SORTING_PIVOTS: + raise ValueError( + f"Allowed sort pivots: {SORTING_PIVOTS}, got {self._sort_criterion}" + ) + self._sorting_criteria_scoring = args[10] # sorting criteria (score is pivot) + self._sorting_criteria = args[11] # sorting criteria (mm+bulges is pivot) + + @property + def targets_fname(self) -> str: + return self._infname + + @property + def targets_fname_merged(self) -> str: + return self._outfname + + @property + def targets_fname_discarded(self) -> str: + return f"{self._outfname}.discarded_samples" + + @property + def rangebp(self) -> int: + return self._rangebp + + @property + def chromidx(self) -> int: + return self._chromidx + + @property + def posidx(self) -> int: + return self._posidx + + @property + def mmbidx(self) -> int: + return self._mmb_idx + + @property + def guideidx(self) -> int: + return self._guide_idx + + @property + def snpidx(self) -> int: + return self._snp_idx + + @property + def scoreidx(self) -> int: + return self._score_idx + + @property + def sort_pivot(self) -> str: + return self._sort_criterion + + @property + def sorting_criteria_scoring(self) -> str: + return self._sorting_criteria_scoring + + @property + def sorting_criteria(self) -> str: + return self._sorting_criteria + + +def parse_input_args(args: List[str]) -> MergeTargets: + """ + Parses the input arguments to create an instance of MergeTargets for handling + target merging based on the provided arguments. + + Args: + args (List[str]): List of input arguments to be parsed for target merging. + + Returns: + MergeTargets: An instance of MergeTargets class for further processing. + """ + + if not args: + raise ValueError("No input argument provided") + if len(args) != INPUT_ARG_COUNT: + raise ValueError(f"Expected {INPUT_ARG_COUNT} arguments, got {len(args)}") + return MergeTargets(args) # parse input arguments + + +def initialize_targets_cluster(rangebp: int) -> Tuple[int, str, str, str, List[str]]: + """ + Initializes the targets cluster by setting the previous cluster position, + guide, chromosome, SNP, and an open starting cluster. + + Args: + rangebp (int): The range of base pairs for the cluster. + + Returns: + Tuple[int, str, str, str, List[str]]: Tuple containing the initialized + cluster parameters. + """ + + pos_prev = -(rangebp + 1) # previous cluster position + guide_prev = "" # previous cluster's guide + chrom_prev = "" # previous cluster's chromosome + snp_prev = "" # previous cluster snp + cluster = [] # open starting cluster + return pos_prev, guide_prev, chrom_prev, snp_prev, cluster + + +def parse_target(target: str) -> List[str]: + """ + Parses the fields of a target string and returns a list of the parsed fields. + + Args: + target (str): The target string to be parsed. + + Returns: + List[str]: List of parsed fields extracted from the target string. + """ + + # parse target's fields + return target.strip().split() + + +def open_targets_cluster( + guide: str, + guide_prev: str, + chrom: str, + chrom_prev: str, + pos: int, + pos_prev: int, + rangebp: int, +) -> bool: + """ + Determines whether a new targets cluster should be opened based on changes + in guide, chromosome, and position within a specified range of base pairs. + + Args: + guide (str): Current guide. + guide_prev (str): Previous guide. + chrom (str): Current chromosome. + chrom_prev (str): Previous chromosome. + pos (int): Current position. + pos_prev (int): Previous position. + rangebp (int): Range of base pairs for cluster merging. + + Returns: + bool: True if a new targets cluster should be opened, False otherwise. + """ + + return guide != guide_prev or chrom != chrom_prev or pos - pos_prev > rangebp + + +def cluster_targets_by_pos_snp( + cluster: List[List[str]], guideidx: int, snpidx: int, posidx: int +) -> Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]]: + """ + Clusters targets based on position and SNP information, separating them into + reference and alternative targets. + + Args: + cluster (List[List[str]]): List of target clusters. + guideidx (int): Index of the guide. + snpidx (int): Index of the SNP information. + posidx (int): Index of the position. + + Returns: + Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]: A tuple + containing the reference targets list and a dictionary of alternative + targets grouped by position and SNP information. + """ + + # initialize reference and alternative targets lists + reference_targets, alternative_targets = [], {} + samplesidx, snpididx, afidx = guideidx - 2, snpidx - 2, snpidx - 1 + for target in cluster: + if target[snpidx] == "n": # reference target + reference_targets.append(target) + continue + # alternative target + pos_t, snp_info_t = target[posidx], target[snpidx] + if (pos_t, snp_info_t) in alternative_targets: + alternative_targets[(pos_t, snp_info_t)][0][ + samplesidx + ] += f",{target[samplesidx]}" # add samples + alternative_targets[(pos_t, snp_info_t)][0][ + snpididx + ] += f",{target[snpididx]}" # add snp ids + alternative_targets[(pos_t, snp_info_t)][0][ + afidx + ] += f",{target[afidx]}" # add allele frequencies + else: + alternative_targets[(pos_t, snp_info_t)] = [target] + return reference_targets, alternative_targets + + +def recover_alternative_targets_list( + alternative_targets: Dict[Tuple[int, str], List[List[str]]], noref: bool +) -> List[List[str]]: + """ + Recovers the list of alternative targets from a dictionary of alternative + targets grouped by position and SNP information. + + Args: + alternative_targets (Dict[Tuple[int, str], List[List[str]]): Dictionary + of alternative targets grouped by position and SNP information. + noref (bool): Flag indicating whether to include reference targets. + + Returns: + List[List[str]]: List of alternative targets with updated flags. + """ + + alternative_targets_list = [] # initialize the list + for v in alternative_targets.values(): + for target in v: + # check whether the target is uniquely found on alternative sequences + target[FLAG_ALT_ONLY] = "y" if noref else target[FLAG_ALT_ONLY] + alternative_targets_list.append(target) + return alternative_targets_list + + +def remove_duplicates(target: List[str], idx: int) -> str: + """ + Removes duplicates from a target list at the specified index and returns the + unique values as a comma-separated string. + + Args: + target (List[str]): List of target values. + idx (int): Index of the target list to remove duplicates from. + + Returns: + str: Comma-separated string of unique values after removing duplicates. + """ + + return ",".join(set(target[idx].split(","))) + + +def unique_values( + targets: List[List[str]], snpidx: int, guideidx: int +) -> List[List[str]]: + """ + Returns a list of targets with unique values for SNP information, SNP ID, + allele frequency, and samples. + + Args: + targets (List[List[str]]): List of target clusters. + snpidx (int): Index of the SNP information. + guideidx (int): Index of the guide. + + Returns: + List[List[str]]: List of targets with unique values for specified fields. + """ + + snpididx, afidx, samplesidx = snpidx - 2, snpidx - 1, guideidx - 2 + targets_filt = [] + # remove duplicate values + for target in targets: + target[snpidx] = remove_duplicates(target, snpidx) # snp info + target[snpididx] = remove_duplicates(target, snpididx) # snp id + target[afidx] = remove_duplicates(target, afidx) # af + target[samplesidx] = remove_duplicates(target, samplesidx) # samples + targets_filt.append(target) # update target + return targets_filt + + +def construct_cluster( + cluster: List[List[str]], + guideidx: int, + snpidx: int, + posidx: int, + scoreidx: int, + mmbidx: int, + sort_pivot: str, + sorting_criteria: str, + sorting_criteria_scoring: str, + outfile: TextIOWrapper, + outfiledisc: TextIOWrapper, +) -> None: + """ + Constructs target clusters by processing and sorting reference and + alternative targets based on specified criteria, and writes the results to + output files. + + Args: + cluster (List[List[str]]): List of target clusters. + guideidx (int): Index of the guide. + snpidx (int): Index of the SNP information. + posidx (int): Index of the position. + scoreidx (int): Index of the score. + mmbidx (int): Index of the MM+Bulges. + sort_pivot (str): Sorting pivot. + sorting_criteria (str): Sorting criteria. + sorting_criteria_scoring (str): Sorting criteria for scoring. + outfile (TextIOWrapper): Output file for reported alignments. + outfiledisc (TextIOWrapper): Output file for alternative alignments. + + Returns: + None + """ + + if not cluster: # avoid crashes when cluster is empty + return + # recover reference and alternative targets + reference_targets, alternative_targets = cluster_targets_by_pos_snp( + cluster, guideidx, snpidx, posidx + ) + noref = ( + not reference_targets + ) # check whether only alternative targets have been found + # recover alternative targets list + alternative_targets = recover_alternative_targets_list(alternative_targets, noref) + # remove duplicates values on snp info, snp id, af, and samples columns + alternative_targets = unique_values(alternative_targets, snpidx, guideidx) + # sort targets + score = sort_pivot == SORTING_PIVOTS[0] + sorting_criteria = sorting_criteria_scoring if score else sorting_criteria + criteria = initialize_sorting_criteria(sorting_criteria, score, scoreidx, mmbidx) + if reference_targets: + reference_targets = sort_targets(reference_targets, criteria) + if alternative_targets: + alternative_targets = sort_targets(alternative_targets, criteria) + # write targets to reported or alternative alignments files + write_best_targets( + reference_targets, + alternative_targets, + noref, + score, + scoreidx, + mmbidx, + outfile, + outfiledisc, + ) + + +def write_best_targets( + reference_targets: List[List[str]], + alternative_targets: List[List[str]], + noref: bool, + score: bool, + scoreidx: int, + mmbidx: int, + outfile: TextIOWrapper, + outfiledisc: TextIOWrapper, +) -> None: + """ + Writes the best targets to the reported alignments file and alternative + alignments file based on specified criteria. + + Args: + reference_targets (List[List[str]]): List of reference targets. + alternative_targets (List[List[str]]): List of alternative targets. + noref (bool): Flag indicating if no reference target is found. + score (bool): Flag indicating if scoring is used for comparison. + scoreidx (int): Index of the score. + mmbidx (int): Index of the MM+Bulges. + outfile (TextIOWrapper): Output file for reported alignments. + outfiledisc (TextIOWrapper): Output file for alternative alignments. + + Returns: + None + """ + + if noref: # no reference target found + target = alternative_targets.pop(0) # pop best target + target[scoreidx - 1] = str(len(alternative_targets)) + elif reference_targets and alternative_targets: # targets found in both + if score: + target = ( + alternative_targets.pop(0) + if float(reference_targets[0][scoreidx]) + < float(alternative_targets[0][scoreidx]) + else reference_targets.pop(0) + ) + elif int(alternative_targets[0][mmbidx]) < int(reference_targets[0][mmbidx]): + target = alternative_targets.pop(0) + else: + target = reference_targets.pop(0) + target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) + else: # no alternative target found + target = reference_targets.pop(0) + target[scoreidx - 1] = str(len(reference_targets)) + target = "\t".join(target) + outfile.write(f"{target}\n") + # write the alternative alignments targets + for target in reference_targets + alternative_targets: + target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) + target = "\t".join(target) + outfiledisc.write(f"{target}\n") + + +def sort_targets(targets: List[List[str]], criteria: Callable) -> List[List[str]]: + """ + Sorts the list of targets based on the specified criteria function. + + Args: + targets (List[List[str]]): List of targets to be sorted. + criteria (Callable): Sorting criteria function. + + Returns: + List[List[str]]: Sorted list of targets based on the provided criteria. + """ + + return sorted(targets, key=criteria) + + +def sorting_score(criteria: List[str], score_idx: int, mmb_idx: int) -> Callable: + """ + Returns a sorting function based on the specified criteria for scoring, + MM+Bulges index, and multiple criteria. + + Args: + criteria (List[str]): List of sorting criteria. + score_idx (int): Index of the score. + mmb_idx (int): Index of the MM+Bulges. + + Returns: + Callable: Sorting function based on the provided criteria. + """ + + if len(criteria) == 1: # single criterion + return lambda x: ( + -float(x[score_idx]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + ) + elif len(criteria) == 2: + return lambda x: ( + -float(x[score_idx]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + ) + # base case (all three ) + return lambda x: ( + -float(x[score_idx]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), + ) + + +def sorting_fewest(criteria: List[str], mmb_idx: int) -> Callable: + """ + Returns a sorting function based on the specified criteria for the fewest + MM+Bulges index. + + Args: + criteria (List[str]): List of sorting criteria. + mmb_idx (int): Index of the MM+Bulges. + + Returns: + Callable: Sorting function based on the provided criteria for the fewest + MM+Bulges index. + """ + + if len(criteria) == 1: # one criterion + return lambda x: (int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]])) + elif len(criteria) == 2: + return lambda x: ( + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + ) + # base case (all three ) + return lambda x: ( + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), + ) + + +def initialize_sorting_criteria( + sorting_criteria: str, score: bool, score_idx: int, mmb_idx: int +) -> Callable: + """ + Initializes the sorting criteria function based on the specified sorting + criteria, score flag, score index, and MM+Bulges index. + + Args: + sorting_criteria (str): Comma-separated string of sorting criteria. + score (bool): Flag indicating if scoring is used for sorting. + score_idx (int): Index of the score. + mmb_idx (int): Index of the MM+Bulges. + + Returns: + Callable: Sorting criteria function based on the provided parameters. + """ + + criteria = sorting_criteria.split(",") + if len(criteria) > 3: + raise ValueError("Mismatching sorting criteria selected") + if any(c not in SORTING_CRITERIA for c in criteria): + raise ValueError("Unknown sorting criteria") + if score: + return sorting_score(criteria, score_idx, mmb_idx) + return sorting_fewest(criteria, mmb_idx) + + +def split_targets(input_args: MergeTargets) -> None: + """ + Splits and processes the input targets file into best and alternative targets + files based on specified criteria. + + Args: + input_args (MergeTargets): Object containing input arguments for target + splitting and processing. + + Returns: + None + + Raises: + OSError: If an error occurs during the process of splitting and processing + the targets. + """ + + # open best, and alternative targets files + try: + with open(input_args.targets_fname, mode="r") as infile, open( + input_args.targets_fname_merged, mode="w" + ) as outfile, open(input_args.targets_fname_discarded, mode="w") as discfile: + # write header to outfiles + header = infile.readline().strip() + outfile.write(f"{header}\n") + discfile.write(f"{header}\n") + # begin dividing targets + ( + pos_prev, + guide_prev, + chrom_prev, + snp_prev, + cluster, + ) = initialize_targets_cluster(input_args.rangebp) + for line in infile: + target = parse_target(line) + guide, chrom, pos, snp = ( + target[input_args.guideidx], + target[input_args.chromidx], + int(target[input_args.posidx]), + target[input_args.snpidx], + ) + if open_targets_cluster( + guide, + guide_prev, + chrom, + chrom_prev, + pos, + pos_prev, + input_args.rangebp, + ): + construct_cluster( + cluster, + input_args.guideidx, + input_args.snpidx, + input_args.posidx, + input_args.scoreidx, + input_args.mmbidx, + input_args.sort_pivot, + input_args.sorting_criteria, + input_args.sorting_criteria_scoring, + outfile, + discfile, + ) + cluster = [target] + else: + cluster.append(target) + guide_prev = guide + pos_prev = pos + chrom_prev = chrom + snp_prev = snp + construct_cluster( + cluster, + input_args.guideidx, + input_args.snpidx, + input_args.posidx, + input_args.scoreidx, + input_args.mmbidx, + input_args.sort_pivot, + input_args.sorting_criteria, + input_args.sorting_criteria_scoring, + outfile, + discfile, + ) + except IOError as e: + raise OSError("An error occurred while merging contiguous targets") from e + + +def merge_targets() -> None: + """ + Merges targets by parsing input arguments, splitting the targets, and + displaying the completion time. + + Returns: + None + """ + + start = time() # start time point + input_args = parse_input_args(sys.argv[1:]) + split_targets(input_args) + sys.stdout.write(f"Merge completed in {(time() - start)}s\n") + + +if __name__ == "__main__": + merge_targets() diff --git a/PostProcess/scriptAnalisiNNN_v3.sh b/PostProcess/scriptAnalisiNNN_v3.sh index afc1f86..d16119d 100755 --- a/PostProcess/scriptAnalisiNNN_v3.sh +++ b/PostProcess/scriptAnalisiNNN_v3.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all commands failures - # Script per l'analisi dei targets della ricerca REF e ENR con PAM NNN # Il file dei targets della ricerca sul genoma reference si chiama $REFtargets -> INPUT $1 # Il file dei targets della ricerca sul genoma enriched si chiama $ENRtargets -> INPUT $2 @@ -41,10 +39,7 @@ output_folder=${12} echo $jobid # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total #echo 'Creazione file .total.txt' -./extraction.sh $REFtargets $ENRtargets $jobid || { - echo "CRISPRme ERROR: targets extraction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} # OUTPUT $jobid.common_targets.txt -> Non usato +./extraction.sh $REFtargets $ENRtargets $jobid # OUTPUT $jobid.common_targets.txt -> Non usato # $jobid.semi_common_targets.txt # $jobid.unique_targets.txt rm $jobid.common_targets.txt @@ -62,10 +57,7 @@ rm $jobid.semi_common_targets.minmaxdisr.txt #echo 'Creazione cluster del file .total.txt' # 3) Clustering -./cluster.dict.py $jobid.total.txt 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' || { - echo "CRISPRme ERROR: targets clustering failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} # OUTPUT $jobid.total.cluster.txt +./cluster.dict.py $jobid.total.txt 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' # OUTPUT $jobid.total.cluster.txt rm $jobid.total.txt #sed -i ':a;N;$!ba;s/\n/\tn\tn\tn\n/g' $jobid.total.cluster.txt @@ -94,10 +86,7 @@ rm $jobid.total.txt #echo 'Estrazione sample dal file .total.cluster.txt' # ./simpleAnalysis_v3.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" $mismatch "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA -./new_simple_analysis.py "$referencegenome" "$dictionaries" "$jobid.total.cluster.txt" "${pam_file}" "$jobid" "$mismatch" || { - echo "CRISPRme ERROR: annotation analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./new_simple_analysis.py "$referencegenome" "$dictionaries" "$jobid.total.cluster.txt" "${pam_file}" "$jobid" "$mismatch" # cp $jobid.bestCFD.txt $jobid.bestCFD.txt.check_analysis # cp $jobid.bestmmblg.txt $jobid.bestmmblg.txt.check_analysis # cp $jobid.bestCRISTA.txt $jobid.bestCRISTA.txt.check_analysis @@ -129,18 +118,9 @@ echo 'Sorting and adjusting results' # cp $jobid.bestCRISTA.txt $jobid.bestCRISTA.txt.after_sort #adjustin columns to have the correct order and remove uncessary ones -./adjust_cols.py $jobid.bestCFD.txt || { - echo "CRISPRme ERROR: CFD report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} -./adjust_cols.py $jobid.bestmmblg.txt || { - echo "CRISPRme ERROR: mismatch+bulges report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} -./adjust_cols.py $jobid.bestCRISTA.txt || { - echo "CRISPRme ERROR: CRISTA report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./adjust_cols.py $jobid.bestCFD.txt +./adjust_cols.py $jobid.bestmmblg.txt +./adjust_cols.py $jobid.bestCRISTA.txt # ./adjust_cols.py $jobid.altmmblg.txt # sed -i 1i"#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\t_#Bulge_type\t_crRNA\t_DNA\t_Reference\t_Chromosome\t_Position\t_Cluster_Position\t_Direction\t_Mismatches\t_Bulge_Size\t_Total\t_PAM_gen\t_Var_uniq\t_Samples\t_Annotation_Type\t_Real_Guide\t_rsID\t_AF\t_SNP\t_#Seq_in_cluster\t_CFD\t_CFD_ref\tCRISTA_#Bulge_type\tCRISTA_crRNA\tCRISTA_DNA\tCRISTA_Reference\tCRISTA_Chromosome\tCRISTA_Position\tCRISTA_Cluster_Position\tCRISTA_Direction\tCRISTA_Mismatches\tCRISTA_Bulge_Size\tCRISTA_Total\tCRISTA_PAM_gen\tCRISTA_Var_uniq\tCRISTA_Samples\tCRISTA_Annotation_Type\tCRISTA_Real_Guide\tCRISTA_rsID\tCRISTA_AF\tCRISTA_SNP\tCRISTA_#Seq_in_cluster\tCRISTA_CFD\tCRISTA_CFD_ref" "$final_res" diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index c68621d..fc37f96 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -1,33 +1,54 @@ #!/bin/bash -set -e # capture any failure +# This script automates the search for guide RNA and PAM sequences in reference +# and variant genomes. It processes input files, manages directories, and executes +# various analyses to identify potential CRISPR targets. +# The script handles both reference and variant genomes, performs indexing, +# searching, and post-analysis, and generates results in specified output formats. +# +# Args: +# $1: Reference genome folder path. +# $2: List of VCF files. +# $3: Guide RNA file path. +# $4: PAM file path. +# $5: Annotation file path. +# $6: Sample ID file path. +# $7: Maximum bulge size. +# $8: Mismatches allowed. +# $9: Bulge DNA size. +# ${10}: Bulge RNA size. +# ${11}: Merge threshold. +# ${12}: Output folder path. +# ${13}: Starting directory path. +# ${14}: Number of CPUs to use. +# ${15}: Current working directory path. +# ${16}: Gene proximity file path. +# ${17}: Email address for notifications. +# +# Returns: +# Generates various output files including target lists, merged results, and a database. +# Sends an email notification upon completion if an email address is provided. #file for automated search of guide+pam in reference and variant genomes -ref_folder=$(realpath $1) -vcf_list=$(realpath $2) -# IFS=',' read -ra vcf_list <<< $2 -guide_file=$(realpath $3) -pam_file=$(realpath $4) -annotation_file=$(realpath $5) -sampleID=$(realpath $6) +ref_folder=$(realpath $1) # reference genome folder +vcf_list=$(realpath $2) # vcf folders list +guide_file=$(realpath $3) # guide +pam_file=$(realpath $4) # pam +annotation_file=$(realpath $5) # annotation bed +sampleID=$(realpath $6) # sample ids +bMax=$7 # max number of bulges +mm=$8 # mismatches +bDNA=$9 # dna bulges +bRNA=${10} # rna bulges +merge_t=${11} # targets merge threshold (bp) +output_folder=$(realpath ${12}) # output folder +starting_dir=$(realpath ${13}) # root dir +ncpus=${14} # number of threads +current_working_directory=$(realpath ${15}) # current working directory +gene_proximity=$(realpath ${16}) # gene annotation bed +email=${17} # email address (website only) -bMax=$7 -mm=$8 -bDNA=$9 -bRNA=${10} - -merge_t=${11} - -output_folder=$(realpath ${12}) - -starting_dir=$(realpath ${13}) -ncpus=${14} -current_working_directory=$(realpath ${15}) - -gene_proximity=$(realpath ${16}) - -email=${17} echo -e "MAIL: $email" echo -e "CPU used: $ncpus" @@ -40,20 +61,22 @@ base_check_set=${20} sorting_criteria_scoring=${21} sorting_criteria=${22} -log="$output_folder/log.txt" -touch $log +# create log files +log="${output_folder}/log.txt" +touch $log +logerror="${output_folder}/log_error.txt" # log error -> trace errors #echo -e 'Job\tStart\t'$(date) > $log start_time='Job\tStart\t'$(date) # output=$output_folder/output.txt # touch $output -##CREATE DUMMY FILE WITH ONE LINE +## CREATE DUMMY FILE WITH ONE LINE echo -e "dummy_file" >"${output_folder}/.dummy.txt" dummy_file="${output_folder}/.dummy.txt" -##CREATE EMPTY FILE +## CREATE EMPTY FILE touch "${output_folder}/.empty.txt" empty_file="${output_folder}/.empty.txt" -##CREATE EMPTY DIR +## CREATE EMPTY DIR mkdir -p "${output_folder}/.empty" empty_dir="${output_folder}/.empty" @@ -68,6 +91,7 @@ if [ $6 != "_" ]; then echo >>$6 fi +# perform target search and processing for each variants dataset while read vcf_f; do if [ -z "$vcf_f" ]; then continue @@ -149,6 +173,8 @@ while read vcf_f; do mkdir "$output_folder/crispritz_targets" fi + # STEP 1: Enrich genome adding SNPs and INDELs from the input VCF files + # track haplotypes if input VCF is phased if [ "$vcf_name" != "_" ]; then cd "$current_working_directory/Genomes" @@ -156,35 +182,53 @@ while read vcf_f; do echo -e 'Add-variants\tStart\t'$(date) >>$log # echo -e 'Add-variants\tStart\t'$(date) >&2 echo -e "Adding variants" - crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" || { - echo "CRISPRme ERROR: genome enrichment failed (script: ${0} line $((LINENO - 1)))" >&2 + crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" + # check for add-variants failures + if [ -s $logerror ]; then + printf "ERROR: Genome enrichment failed!\n" >&2 exit 1 - } - #if ! [ -d "${ref_name}+${vcf_name}" ]; then - # mkdir "${ref_name}+${vcf_name}" - #fi + fi mv "$current_working_directory/Genomes/variants_genome/SNPs_genome/${ref_name}_enriched/" "./${ref_name}+${vcf_name}/" if ! [ -d "$current_working_directory/Dictionaries/dictionaries_${vcf_name}/" ]; then mkdir "$current_working_directory/Dictionaries/dictionaries_${vcf_name}/" fi + # check for snp dictionary failures + if [ -s $logerror ]; then + printf "ERROR: SNP dictionary construction failed!\n" >&2 + exit 1 + fi if ! [ -d "$current_working_directory/Dictionaries/log_indels_${vcf_name}/" ]; then mkdir "$current_working_directory/Dictionaries/log_indels_${vcf_name}/" fi + # check for indel dictionary failures + if [ -s $logerror ]; then + printf "ERROR: Indel dictionary construction failed!\n" >&2 + exit 1 + fi mv $current_working_directory/Genomes/variants_genome/SNPs_genome/*.json $current_working_directory/Dictionaries/dictionaries_${vcf_name}/ mv $current_working_directory/Genomes/variants_genome/SNPs_genome/log*.txt $current_working_directory/Dictionaries/log_indels_${vcf_name}/ cd "$current_working_directory/" if ! [ -d "genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" ]; then mkdir "genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" fi + # check for genome library failures + if [ -s $logerror ]; then + printf "ERROR: Genome library construction failed!\n" >&2 + exit 1 + fi echo -e 'Add-variants\tEnd\t'$(date) >>$log # echo -e 'Add-variants\tEnd\t'$(date) >&2 + + # STEP 2: indels indexing echo -e 'Indexing Indels\tStart\t'$(date) >>$log # echo -e 'Indexing Indels\tStart\t'$(date) >&2 - ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/variants_genome/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus || { - echo "CRISPRme ERROR: indels indexing failed (script: ${0} line $((LINENO - 1)))" >&2 + ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/variants_genome/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus + # check for indels indexing failures + if [ -s $logerror ]; then + printf "ERROR: Indels indexing failed!\n" >&2 exit 1 - } + fi echo -e 'Indexing Indels\tEnd\t'$(date) >>$log # echo -e 'Indexing Indels\tEnd\t'$(date) >&2 if ! [ -d $current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS ]; then @@ -204,12 +248,14 @@ while read vcf_f; do if ! [ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" ]; then echo -e 'Indexing Indels\tStart\t'$(date) >>$log # echo -e 'Indexing Indels\tStart\t'$(date) >&2 - ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus || { - echo "CRISPRme ERROR: indels indexing failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus echo -e 'Indexing Indels\tEnd\t'$(date) >>$log # echo -e 'Indexing Indels\tEnd\t'$(date) >&2 + # check for indels indexing failures + if [ -s $logerror ]; then + printf "ERROR: Indels indexing failed!\n" >&2 + exit 1 + fi fi fi @@ -217,6 +263,7 @@ while read vcf_f; do rm -r "$current_working_directory/Dictionaries/fake_chrom_$vcf_name" fi + # STEP 3: index reference genome cd "$current_working_directory/" if ! [ -d "$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}" ]; then @@ -226,11 +273,13 @@ while read vcf_f; do # echo -e 'Index-genome Reference\tStart\t'$(date) >&2 # echo -e 'Indexing_Reference' > $output echo -e "Indexing reference genome" - crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! + # check for reference genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log # echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" @@ -243,11 +292,13 @@ while read vcf_f; do # echo -e 'Index-genome Reference\tStart\t'$(date) >&2 # echo -e 'Indexing_Reference' > $output echo -e "Indexing reference genome" - crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! + # check for reference genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log # echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" @@ -265,6 +316,7 @@ while read vcf_f; do idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" fi + # STEP 4: index variant genome if [ "$vcf_name" != "_" ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}" ]; then @@ -274,11 +326,13 @@ while read vcf_f; do # echo -e 'Index-genome Variant\tStart\t'$(date) >&2 # echo -e 'Indexing_Enriched' > $output echo -e "Indexing variant genome" - crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus pid_index_var=$! + # check for variant genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Variant\tEnd\t'$(date) >>$log # echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" @@ -291,11 +345,13 @@ while read vcf_f; do # echo -e 'Index-genome Variant\tStart\t'$(date) >&2 # echo -e 'Indexing_Enriched' > $output echo -e "Indexing variant genome" - crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! + # check for variant genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Variant\tEnd\t'$(date) >>$log # echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" @@ -321,6 +377,7 @@ while read vcf_f; do ceiling_result=1 fi + # STEP 5: reference genome search #start searches cd "$output_folder" echo $idx_ref @@ -330,43 +387,50 @@ while read vcf_f; do # echo -e 'Search Reference\tStart\t'$(date) >&2 # echo -e 'Search Reference' > $output if [ "$bDNA" -ne 0 ] || [ "$bRNA" -ne 0 ]; then - crispritz.py search $idx_ref "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result & - wait || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } # TODO: + crispritz.py search $idx_ref "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result & pid_search_ref=$! + # check for reference genome search (brute-force) failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome search (brute-force) failed!\n" >&2 + exit 1 + fi + echo -e 'Search Reference\tEnd\t'$(date) >>$log else crispritz.py search "$current_working_directory/Genomes/${ref_name}/" "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -mm $mm -r -th $ceiling_result & - wait || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } pid_search_ref=$! + # check for reference genome search (TST) failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome search (TST) failed!\n" >&2 + exit 1 + fi + echo -e 'Search Reference\tEnd\t'$(date) >>$log fi else echo -e "Search for reference already done" fi + # STEP 6: variant genome search if [ "$vcf_name" != "_" ]; then - #TODO RICERCA ALTERNATIVE PARALLELA A REF if ! [ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e 'Search Variant\tStart\t'$(date) >>$log # echo -e 'Search Variant\tStart\t'$(date) >&2 # echo -e 'Search Variant' > $output if [ "$bDNA" -ne 0 ] || [ "$bRNA" -ne 0 ]; then - crispritz.py search "$idx_var" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result -var || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py search "$idx_var" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result -var # mv "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" echo -e 'Search Variant\tEnd\t'$(date) >>$log + # check for variant genome search (brute-force) failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome search (brute-force) failed!\n" >&2 + exit 1 + fi else crispritz.py search "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -mm $mm -r -th $ceiling_result & - wait || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 + # check for variant genome search (TST) failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome search (TST) failed!\n" >&2 exit 1 - } + fi echo -e 'Search Variant\tEnd\t'$(date) >>$log # mv "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" fi @@ -374,6 +438,7 @@ while read vcf_f; do echo -e "Search for variant already done" fi + # STEP 7: search on indels if ! [ -f "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e "Search INDELs Start" echo -e 'Search INDELs\tStart\t'$(date) >>$log @@ -381,15 +446,14 @@ while read vcf_f; do cd $starting_dir #commented to avoid indels search #TODO REMOVE POOL SCRIPT FROM PROCESSING - ./pool_search_indels.py "$ref_folder" "$vcf_folder" "$vcf_name" "$guide_file" "$pam_file" $bMax $mm $bDNA $bRNA "$output_folder" $true_pam "$current_working_directory/" "$ncpus" || { - echo "CRISPRme ERROR: off-targets search on indels failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_search_indels.py "$ref_folder" "$vcf_folder" "$vcf_name" "$guide_file" "$pam_file" $bMax $mm $bDNA $bRNA "$output_folder" $true_pam "$current_working_directory/" "$ncpus" + # check for indels genome search failures + if [ -s $logerror ]; then + printf "ERROR: Indels genome search failed!\n" >&2 exit 1 - } + fi # mv "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" - awk '($3 !~ "n") {print $0}' "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" || { - echo "CRISPRme ERROR: off-targets report construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + awk '($3 !~ "n") {print $0}' "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" mv "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" echo -e "Search INDELs End" echo -e 'Search INDELs\tEnd\t'$(date) >>$log @@ -400,22 +464,33 @@ while read vcf_f; do fi while kill "-0" $pid_search_ref &>/dev/null; do - echo -e "Waiting for search genome reference" + echo -e "Waiting for search genome" sleep 100 done - echo -e 'Search Reference\tEnd\t'$(date) >>$log + echo -e 'Search\tEnd\t'$(date) >>$log # echo -e 'Search Reference\tEnd\t'$(date) >&2 # move all targets into targets directory mv $output_folder/*.targets.txt $output_folder/crispritz_targets + # check for targets folder creation failures + if [ -s $logerror ]; then + printf "ERROR: Targets folder creation failed!\n" >&2 + exit 1 + fi if ! [ -d "$output_folder/crispritz_prof" ]; then mkdir $output_folder/crispritz_prof fi mv $output_folder/*profile* $output_folder/crispritz_prof/ &>/dev/null + # check for profile folder creation failures + if [ -s $logerror ]; then + printf "ERROR: Profile folder creation failed!\n" >&2 + exit 1 + fi cd "$starting_dir" + # STEP 8: snp analysis echo -e "Start post-analysis" # echo -e 'Post analysis' > $output @@ -432,11 +507,12 @@ while read vcf_f; do fi #TODO ANALISI DEGLI SNP IN PARALLELO - ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $dict_folder $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $dict_folder $final_res $final_res_alt $ncpus + # check for snp analysis failures + if [ -s $logerror ]; then + printf "ERROR: SNP analysis failed!\n" >&2 exit 1 - } - + fi #CONCATENATE REF&VAR RESULTS for key in "${real_chroms[@]}"; do echo "Concatenating $key" @@ -452,6 +528,11 @@ while read vcf_f; do rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestmmblg.txt" rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestCRISTA.txt" done + # check for reports creation failures + if [ -s $logerror ]; then + printf "ERROR: Temporary reports (snp) creation failed!\n" >&2 + exit 1 + fi echo -e 'Post-analysis SNPs\tEnd\t'$(date) >>$log @@ -467,10 +548,12 @@ while read vcf_f; do touch "$final_res_alt" fi - ./pool_post_analisi_snp.py $output_folder $ref_folder "_" $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "_" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_post_analisi_snp.py $output_folder $ref_folder "_" $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "_" $final_res $final_res_alt $ncpus + # check for targets analysis failures + if [ -s $logerror ]; then + printf "ERROR: Targets analysis failed!\n" >&2 exit 1 - } + fi #CONCATENATE REF&VAR RESULTS for key in "${real_chroms[@]}"; do @@ -487,10 +570,15 @@ while read vcf_f; do rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestmmblg.txt" rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestCRISTA.txt" done + # check for reports creation failures + if [ -s $logerror ]; then + printf "ERROR: Temporary reports (reference) creation failed!\n" >&2 + exit 1 + fi echo -e 'Post-analysis\tEnd\t'$(date) >>$log - fi + # STEP 9: indels analysis if [ "$vcf_name" != "_" ]; then echo -e "SNPs analysis ended. Starting INDELs analysis" cd "$starting_dir" @@ -499,11 +587,12 @@ while read vcf_f; do #SKIP INDELS ANALYSIS IF NO RESULTS FOUND if [ $(wc -l <"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt") -gt 1 ]; then - ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "$current_working_directory/Dictionaries/" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "$current_working_directory/Dictionaries/" $final_res $final_res_alt $ncpus + # check for indels analysis failures + if [ -s $logerror ]; then + printf "ERROR: Indels analysis failed!\n" >&2 exit 1 - } - + fi #CONCATENATE INDELS RESULTS for key in "${array_fake_chroms[@]}"; do echo "Concatenating $key" @@ -520,6 +609,11 @@ while read vcf_f; do rm -f "$output_folder/${key}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestCRISTA_INDEL.txt" rm -f "$output_folder/${key}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestmmblg_INDEL.txt" done + # check for reports creation failures + if [ -s $logerror ]; then + printf "ERROR: Temporary reports (indels) creation failed!\n" >&2 + exit 1 + fi fi echo -e 'Post-analysis INDELs\tEnd\t'$(date) >>$log @@ -534,13 +628,15 @@ while read samples; do fi awk '!/^#/ { print }' "${current_working_directory}/samplesIDs/$samples" >>"$output_folder/.sampleID.txt" done <"$sampleID" +# check for samples ids reading failures +if [ -s $logerror ]; then + printf "ERROR: Reading sample IDs failed!\n" >&2 + exit 1 +fi # done <$sampleID # if [ "$vcf_name" != "_" ]; then touch "$output_folder/.sampleID.txt" -sed -i 1i"#SAMPLE_ID\tPOPULATION_ID\tSUPERPOPULATION_ID\tSEX" "$output_folder/.sampleID.txt" || { - echo "CRISPRme ERROR: Samples report construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +sed -i 1i"#SAMPLE_ID\tPOPULATION_ID\tSUPERPOPULATION_ID\tSEX" "$output_folder/.sampleID.txt" # fi sampleID=$output_folder/.sampleID.txt @@ -557,28 +653,64 @@ sed -i '1 i\#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Po printf $header >$final_res_alt.bestCFD.txt printf $header >$final_res_alt.bestmmblg.txt printf $header >$final_res_alt.bestCRISTA.txt +# check for reports creation failures +if [ -s $logerror ]; then + printf "ERROR: Report files creation failed!\n" >&2 + exit 1 +fi +# STEP 10: Merging contiguous targets echo -e 'Merging Targets\tStart\t'$(date) >>$log #SORT FILE TO HAVE CHR AND POS IN PROXIMITY TO MERGE THEM #sort using guide_seq,chr,cluster_pos,score,total(mm+bul) head -1 $final_res.bestCFD.txt >$final_res.tmp tail -n +2 $final_res.bestCFD.txt | LC_ALL=C sort -k16,16 -k5,5 -k7,7n -k21,21rg -k11,11n -T $output_folder >>$final_res.tmp && mv $final_res.tmp $final_res.bestCFD.txt +# check for CFD report sorting failures +if [ -s $logerror ]; then + printf "ERROR: Sorting CFD report failed!\n" >&2 + exit 1 +fi #sort using guide_seq,chr,cluster_pos,score,total(mm+bul) head -1 $final_res.bestCRISTA.txt >$final_res.tmp tail -n +2 $final_res.bestCRISTA.txt | LC_ALL=C sort -k16,16 -k5,5 -k7,7n -k21,21rg -k11,11n -T $output_folder >>$final_res.tmp && mv $final_res.tmp $final_res.bestCRISTA.txt +# check for CRISTA report sorting failures +if [ -s $logerror ]; then + printf "ERROR: Sorting CRISTA report failed!\n" >&2 + exit 1 +fi #sort using guide_seq,chr,cluster_pos,total(mm+bul) head -1 $final_res.bestmmblg.txt >$final_res.tmp tail -n +2 $final_res.bestmmblg.txt | LC_ALL=C sort -k16,16 -k5,5 -k7,7n -k11,11n -T $output_folder >>$final_res.tmp && mv $final_res.tmp $final_res.bestmmblg.txt +# check for mm+bulges report sorting failures +if [ -s $logerror ]; then + printf "ERROR: Sorting mm+bulges report failed!\n" >&2 + exit 1 +fi # cp $final_res.bestCFD.txt $final_res.sorted.bestCFD.txt #MERGE BEST FILES TARGETS TO REMOVE CONTIGOUS #TODO CHECK MERGE #SCORE CFD ./merge_close_targets_cfd.sh $final_res.bestCFD.txt $final_res.bestCFD.txt.trimmed $merge_t 'score' $sorting_criteria_scoring $sorting_criteria & +# check for targets merge on CFD failures +if [ -s $logerror ]; then + printf "ERROR: merging targets in CFD report failed!\n" >&2 + exit 1 +fi #TOTAL (MM+BUL) ./merge_close_targets_cfd.sh $final_res.bestmmblg.txt $final_res.bestmmblg.txt.trimmed $merge_t 'total' $sorting_criteria_scoring $sorting_criteria & +# check for targets merge on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: merging targets in mm+bulges report failed!\n" >&2 + exit 1 +fi #SCORE CRISTA ./merge_close_targets_cfd.sh $final_res.bestCRISTA.txt $final_res.bestCRISTA.txt.trimmed $merge_t 'score' $sorting_criteria_scoring $sorting_criteria & +# check for targets merge on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: merging targets in CRISTA report failed!\n" >&2 + exit 1 +fi wait #CHANGE NAME TO BEST AND ALT FILES mv $final_res.bestCFD.txt.trimmed $final_res.bestCFD.txt @@ -598,133 +730,148 @@ echo -e 'Merging Targets\tEnd\t'$(date) >>$log echo -e 'Annotating results\tStart\t'$(date) >>$log +# STEP 11: targets annotation #ANNOTATE BEST TARGETS #TODO SISTEMARE ANNOTAZIONE (DIVISIONE INTERVAL TREE / PARALLEL SEARCH) ./annotate_final_results.py $final_res.bestCFD.txt $annotation_file $final_res.bestCFD.txt.annotated & -wait || { - echo "CRISPRme ERROR: CFD annotation failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CFD report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res.bestmmblg.txt $annotation_file $final_res.bestmmblg.txt.annotated & -wait || { - echo "CRISPRme ERROR: CRISTA annotation failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in mm+bulges report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res.bestCRISTA.txt $annotation_file $final_res.bestCRISTA.txt.annotated & -wait || { - echo "CRISPRme ERROR: mismatch+bulges annotation failed - reference(script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CRISTA report failed!\n" >&2 exit 1 -} +fi wait mv $final_res.bestCFD.txt.annotated $final_res.bestCFD.txt mv $final_res.bestmmblg.txt.annotated $final_res.bestmmblg.txt mv $final_res.bestCRISTA.txt.annotated $final_res.bestCRISTA.txt #ANNOTATE ALT TARGETS ./annotate_final_results.py $final_res_alt.bestCFD.txt $annotation_file $final_res_alt.bestCFD.txt.annotated & -wait || { - echo "CRISPRme ERROR: CFD annotation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CFD alternative report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res_alt.bestmmblg.txt $annotation_file $final_res_alt.bestmmblg.txt.annotated & -wait || { - echo "CRISPRme ERROR: CRISTA annotation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in mm+bulges alternative report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res_alt.bestCRISTA.txt $annotation_file $final_res_alt.bestCRISTA.txt.annotated & -wait || { - echo "CRISPRme ERROR: mismatch+bulges annotation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CRISTA alternative report failed!\n" >&2 exit 1 -} +fi wait mv $final_res_alt.bestCFD.txt.annotated $final_res_alt.bestCFD.txt mv $final_res_alt.bestmmblg.txt.annotated $final_res_alt.bestmmblg.txt mv $final_res_alt.bestCRISTA.txt.annotated $final_res_alt.bestCRISTA.txt +# STEP 12: compute risk scores #SCORING BEST RESULTS ./add_risk_score.py $final_res.bestCFD.txt $final_res.bestCFD.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CFD risk score analysis failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CFD report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res.bestmmblg.txt $final_res.bestmmblg.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CRISTA risk score analysis failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in mm+bulges report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res.bestCRISTA.txt $final_res.bestCRISTA.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: mismatch+bulges risk score analysis failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CRISTA report failed!\n" >&2 exit 1 -} +fi wait mv $final_res.bestCFD.txt.risk $final_res.bestCFD.txt mv $final_res.bestmmblg.txt.risk $final_res.bestmmblg.txt mv $final_res.bestCRISTA.txt.risk $final_res.bestCRISTA.txt #SCORING ALT RESULTS ./add_risk_score.py $final_res_alt.bestCFD.txt $final_res_alt.bestCFD.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CFD risk score analysis failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CFD alternative report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res_alt.bestmmblg.txt $final_res_alt.bestmmblg.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CRISTA risk score analysis failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on mm_bulges failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in mm+bulges alternative report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res_alt.bestCRISTA.txt $final_res_alt.bestCRISTA.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: mismatch+bulges risk score analysis failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CRISTA alternative report failed!\n" >&2 exit 1 -} +fi wait mv $final_res_alt.bestCFD.txt.risk $final_res_alt.bestCFD.txt mv $final_res_alt.bestmmblg.txt.risk $final_res_alt.bestmmblg.txt mv $final_res_alt.bestCRISTA.txt.risk $final_res_alt.bestCRISTA.txt +# STEP 13: clean reports from dots and NaN values #remove N's and dots from rsID from BEST FILES python remove_n_and_dots.py $final_res.bestCFD.txt & -wait || { - echo "CRISPRme ERROR: CFD reports cleaning failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CFD failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CFD report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res.bestmmblg.txt & -wait || { - echo "CRISPRme ERROR: CRISTA reports cleaning failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in mm+bulges report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res.bestCRISTA.txt & -wait || { - echo "CRISPRme ERROR: mismatch+bulges reports cleaning failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CRISTA report failed!\n" >&2 exit 1 -} +fi wait #remove N's and dots from rsID from ALT FILES python remove_n_and_dots.py $final_res_alt.bestCFD.txt & -wait || { - echo "CRISPRme ERROR: CFD reports cleaning failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CFD failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CFD alternative report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res_alt.bestmmblg.txt & -wait || { - echo "CRISPRme ERROR: CRISTA reports cleaning failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in mm+bulges alternative report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res_alt.bestCRISTA.txt & -wait || { - echo "CRISPRme ERROR: mismatch+bulges reports cleaning failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CRISTA alternative report failed!\n" >&2 exit 1 -} +fi wait #join targets by columns for BEST and ALT files -pr -m -t -J $final_res.bestCFD.txt $final_res.bestmmblg.txt $final_res.bestCRISTA.txt >$final_res || { - echo "CRISPRme ERROR: final report generation failed - reference (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} -pr -m -t -J $final_res_alt.bestCFD.txt $final_res_alt.bestmmblg.txt $final_res_alt.bestCRISTA.txt >$final_res_alt || { - echo "CRISPRme ERROR: final report generation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +pr -m -t -J $final_res.bestCFD.txt $final_res.bestmmblg.txt $final_res.bestCRISTA.txt >$final_res +pr -m -t -J $final_res_alt.bestCFD.txt $final_res_alt.bestmmblg.txt $final_res_alt.bestCRISTA.txt >$final_res_alt #MERGE ALTERNATIVE CHR IF SAME SEQUENCE OF ALIGNED CHR # ./merge_alt_chr.sh $final_res $final_res.chr_merged @@ -733,9 +880,15 @@ pr -m -t -J $final_res_alt.bestCFD.txt $final_res_alt.bestmmblg.txt $final_res_a #update header for final_res and final_res_alt sed -i '1 s/^.*$/#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score\tCRISTA_#Bulge_type\tCRISTA_crRNA\tCRISTA_DNA\tCRISTA_Reference\tCRISTA_Chromosome\tCRISTA_Position\tCRISTA_Cluster_Position\tCRISTA_Direction\tCRISTA_Mismatches\tCRISTA_Bulge_Size\tCRISTA_Total\tCRISTA_PAM_gen\tCRISTA_Var_uniq\tCRISTA_Samples\tCRISTA_Annotation_Type\tCRISTA_Real_Guide\tCRISTA_rsID\tCRISTA_AF\tCRISTA_SNP\tCRISTA_#Seq_in_cluster\tCRISTA_CFD\tCRISTA_CFD_ref\tCRISTA_CFD_Risk_Score\tCRISTA_CFD_Absolute_Risk_Score/' "$final_res" sed -i '1 s/^.*$/#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score\tCRISTA_#Bulge_type\tCRISTA_crRNA\tCRISTA_DNA\tCRISTA_Reference\tCRISTA_Chromosome\tCRISTA_Position\tCRISTA_Cluster_Position\tCRISTA_Direction\tCRISTA_Mismatches\tCRISTA_Bulge_Size\tCRISTA_Total\tCRISTA_PAM_gen\tCRISTA_Var_uniq\tCRISTA_Samples\tCRISTA_Annotation_Type\tCRISTA_Real_Guide\tCRISTA_rsID\tCRISTA_AF\tCRISTA_SNP\tCRISTA_#Seq_in_cluster\tCRISTA_CFD\tCRISTA_CFD_ref\tCRISTA_CFD_Risk_Score\tCRISTA_CFD_Absolute_Risk_Score/' "$final_res_alt" +# check for report headers update failures +if [ -s $logerror ]; then + printf "ERROR: Updating report headers failed!\n" >&2 + exit 1 +fi echo -e 'Annotating results\tEnd\t'$(date) >>$log +# STEP 14: figures creation # echo -e 'Creating images' > $output echo -e 'Creating images\tStart\t'$(date) >>$log @@ -753,32 +906,44 @@ mv $final_res_alt "${output_folder}/$(basename ${output_folder}).altMerge.txt" cd $starting_dir if [ "$vcf_name" != "_" ]; then # ./process_summaries.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $guide_file $sampleID $mm $bMax "${output_folder}" "var" - ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CFD" || { - echo "CRISPRme ERROR: CFD report summary failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CFD" + # check for variant summary processing on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Variant summary process on CFD report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges report summary failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "fewest" + # check for summary processing on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Variant summary process on mm+bulges report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CRISTA" || { - echo "CRISPRme ERROR: CRISTA report summary failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CRISTA" + # check for summary processing on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Variant summary process on CRISTA report failed!\n" >&2 exit 1 - } + fi else # ./process_summaries.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $guide_file $sampleID $mm $bMax "${output_folder}" "ref" - ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CFD" || { - echo "CRISPRme ERROR: CFD report summary failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CFD" + # check for reference summary processing on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Reference summary process on CFD report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges report summary failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "fewest" + # check for reference summary processing on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Reference summary process on mm+bulges report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CRISTA" || { - echo "CRISPRme ERROR: CRISTA report summary failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CRISTA" + # check for reference summary processing on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Reference summary process on CRISTA report failed!\n" >&2 exit 1 - } + fi fi if ! [ -d "$output_folder/imgs" ]; then @@ -790,18 +955,24 @@ if [ "$vcf_name" != "_" ]; then while IFS= read -r line || [ -n "$line" ]; do for total in $(seq 0 $(expr $mm + $bMax)); do # python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution.txt" $total $line - python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CFD.txt" $total $line "CFD" || { - echo "CRISPRme ERROR: CFD population report failed (script: ${0} line $((LINENO - 1)))" >&2 + python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CFD.txt" $total $line "CFD" + # check for population distribution on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Population distribution on CFD report failed!\n" >&2 exit 1 - } - python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CRISTA.txt" $total $line "CRISTA" || { - echo "CRISPRme ERROR: CRISTA population report failed (script: ${0} line $((LINENO - 1)))" >&2 + fi + python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CRISTA.txt" $total $line "CRISTA" + # check for population distribution on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Population distribution on CRISTA report failed!\n" >&2 exit 1 - } - python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_fewest.txt" $total $line "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges population report failed (script: ${0} line $((LINENO - 1)))" >&2 + fi + python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_fewest.txt" $total $line "fewest" + # check for population distribution on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Population distribution on mm+bulges report failed!\n" >&2 exit 1 - } + fi done done <$guide_file @@ -810,34 +981,47 @@ fi cd $starting_dir if [ "$vcf_name" != "_" ]; then # ./radar_chart_dict_generator.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax - ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" || { - echo "CRISPRme ERROR: CFD radar chart report failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" + # check for radar chart generation on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on variant CFD report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" || { - echo "CRISPRme ERROR: CRISTA radar chart report failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" + # check for radar chart generation on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on variant CRISTA report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges radar chart report failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" + # check for radar chart generation on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on variant mm+bulges report failed!\n" >&2 exit 1 - } + fi else - ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" || { - echo "CRISPRme ERROR: CFD radar chart report failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" + # check for radar chart generation on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on reference CFD report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" || { - echo "CRISPRme ERROR: CRISTA radar chart report failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" + # check for radar chart generation on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on reference CRISTA report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges radar chart report failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" + # check for radar chart generation on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on reference mm+bulges report failed!\n" >&2 exit 1 - } + fi fi echo -e 'Creating images\tEnd\t'$(date) >>$log +# STEP 15: targets gene annotation echo $gene_proximity echo -e 'Integrating results\tStart\t'$(date) >>$log echo >>$guide_file @@ -845,18 +1029,24 @@ echo >>$guide_file if [ $gene_proximity != "_" ]; then genome_version=$(echo ${ref_name} | sed 's/_ref//' | sed -e 's/\n//') #${output_folder}/Params.txt | awk '{print $2}' | sed 's/_ref//' | sed -e 's/\n//') echo $genome_version - bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set || { - echo "CRISPRme ERROR: postprocessing failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set + # check for gene annotation of primary targets failures + if [ -s $logerror ]; then + printf "ERROR: Gene annotation on primary targets failed!\n" >&2 exit 1 - } - bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).altMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set || { - echo "CRISPRme ERROR: postprocessing failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).altMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set + # check for gene annotation of alternative targets failures + if [ -s $logerror ]; then + printf "ERROR: Gene annotation on alternative targets failed!\n" >&2 exit 1 - } - python $starting_dir/CRISPRme_plots.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/imgs/" &>"${output_folder}/warnings.txt" || { - echo "CRISPRme ERROR: plots generation failed (script: ${0} line $((LINENO - 1)))" >&2 + fi + python $starting_dir/CRISPRme_plots.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/imgs/" &>"${output_folder}/warnings.txt" + # check for plot failures + if [ -s $logerror ]; then + printf "ERROR: Plots generation failed!\n" >&2 exit 1 - } + fi rm -f "${output_folder}/warnings.txt" #delete warnings file fi @@ -874,10 +1064,12 @@ if [ -f "${output_folder}/$(basename ${output_folder}).db" ]; then rm -f "${output_folder}/$(basename ${output_folder}).db" fi #python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${output_folder}/$(basename ${output_folder})" -python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/.$(basename ${output_folder})" || { - echo "CRISPRme ERROR: database creation failed (script: ${0} line $((LINENO - 1)))" >&2 +python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/.$(basename ${output_folder})" +# check for database generation failures +if [ -s $logerror ]; then + printf "ERROR: Database generation failed!\n" >&2 exit 1 -} +fi echo -e 'Creating database\tEnd\t'$(date) >>$log # echo -e 'Creating database\tEnd\t'$(date) >&2 @@ -899,14 +1091,34 @@ if [ $(wc -l <"$guide_file") -gt 1 ]; then mv "${output_folder}/$(basename ${output_folder}).altMerge.txt.integrated_results.tsv" "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" #generate zipped version for file zip -j "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.zip" "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.tsv" + # check for compression on multiguide integrated results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for multiguide primary targets report failed!\n" >&2 + exit 1 + fi zip -j "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.zip" "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" + # check for compression on multiguide alternative results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for multiguide alternative targets report failed!\n" >&2 + exit 1 + fi else guide_elem=$(head -1 $guide_file) mv "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.tsv" mv "${output_folder}/$(basename ${output_folder}).altMerge.txt.integrated_results.tsv" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" #generate zipped version for file zip -j "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.zip" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.tsv" + # check for compression on single guide integrated results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for single guide primary targets report failed!\n" >&2 + exit 1 + fi zip -j "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.zip" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" + # check for compression on single alternative results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for single guide alternative targets report failed!\n" >&2 + exit 1 + fi fi echo -e "JOB END" @@ -915,7 +1127,7 @@ if [ "$email" != "_" ]; then fi #keep log_error but no block visualization -mv $output_folder/log_error.txt $output_folder/log_error_no_check.txt +mv $logerror $output_folder/log_error_no_check.txt #removing single best files after use and clean merged file to save space #keep the two integrated files with all the targets #save these files to test diff --git a/README.md b/README.md index 39de407..8c42a86 100755 --- a/README.md +++ b/README.md @@ -1,255 +1,1390 @@ -# CRISPRme - [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/crisprme/README.html) ![GitHub release (latest by date)](https://img.shields.io/github/v/release/pinellolab/crisprme) ![Conda](https://img.shields.io/conda/dn/bioconda/crisprme) ![license](https://img.shields.io/badge/license-AGPL--3.0-lightgrey) -CRISPRme is a tool for comprehensive off-target assessment available as a web application [online](http://crisprme.di.univr.it/), offline, and command line. It integrates human genetic variant datasets with orthogonal genomic annotations to predict and prioritize CRISPR-Cas off-target sites at scale. The method considers both single-nucleotide variants (SNVs) and indels, accounts for bona fide haplotypes, accepts spacer:protospacer mismatches and bulges, and is suitable for population and personal genome analyses. CRISPRme takes care of all steps in the process including data download, executing the complete search, and presents an exhaustive report with tables and figures within interactive web-based GUI. +
+ +
-The software has the following main functionalities: +# CRISPRme -- ```complete-search``` performs a search from scratch with the given inputs (including gRNA, reference genome, and genetic variants). -- ```targets-integration``` integrates the search results with GENCODE data to identify genes close to the candidate off-targets and collect the top ranking candidates in term of CFD score, CRISTA score, or number of mismatches/bulges. -- ```web-interface``` starts a local instance of the web interface accessible from any browser. +CRISPRme is a comprehensive tool designed for thorough off-target assessment in +CRISPR-Cas systems. Available as a web application +([http://crisprme.di.univr.it/](http://crisprme.di.univr.it/)), offline tool, and +command-line interface, it integrates human genetic variant datasets with orthogonal +genomic annotations to predict and prioritize potential off-target sites at scale. +CRISPRme accounts for single-nucleotide variants (SNVs) and indels, considers +*bona fide* haplotypes, and allows for spacer:protospacer mismatches and bulges, +making it well-suited for both population-wide and personal genome analyses. CRISPRme +automates the entire workflow, from data download to executing the search, and delivers +detailed reports complete with tables and figures through an interactive web-based +interface. -## Installation +## Table Of Contents -CRISPRme can be installed both via **Conda** (only Linux users) and **Docker** (all operating systems, including OSX and Windows). +0 [System Requirements](#0-system-requirements) ++ +
+ +### 2.2 CRISPRme Functions +--- + +This section provides a comprehensive overview of CRISPRme's core functions, +detailing each feature, the required input data and formats, and the resulting +outputs. The following is a summary of CRISPRme's key features: + +- [**Complete Search**](#221-complete-search) (`complete-search`) +