diff --git a/.editorconfig b/.editorconfig
deleted file mode 100644
index e69de29..0000000
diff --git a/.gitignore b/.gitignore
index d8a10c3..bdea4dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,10 +7,8 @@ VCFs/
 Gencode/
 Cache/
 *nohup*
-Annotations/
 PAMs/*
 samplesIDs/*
-guides/
 *.csv
 *.pkl
 *.zip
diff --git a/Dockerfile b/Dockerfile
index d3c2316..df6a8a7 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,7 +6,7 @@ FROM mambaorg/micromamba
 # Set the variables for version control during installation
 ARG crispritz_version=2.6.6
-ARG crisprme_version=2.1.5
+ARG crisprme_version=2.1.6
 # set the shell to bash
 ENV SHELL bash
diff --git a/LICENSE b/LICENSE
index 456a677..b01a24e 100755
--- a/LICENSE
+++ b/LICENSE
@@ -1,2 +1,16 @@
-CRISRPme has a dual license. It is made available for free to academic researchers under the Affero License (https://www.gnu.org/licenses/agpl-3.0.en.html).
-If you plan to use the CRISRPme for-profit, you will need to purchase a license. Please contact rosalba.giugno@univr.it and lpinello@mgh.harvard.edu for more information.
+CRISPRme is distributed under a dual-license model:
+
+1. Academic Use
+   CRISPRme is freely available for academic research under the GNU Affero
+   General Public License v3.0 (AGPL-3.0)
+   (https://www.gnu.org/licenses/agpl-3.0.en.html).
+
+2. Commercial Use
+   For-profit institutions or users intending to use CRISPRme for commercial
+   purposes must acquire a commercial license. For inquiries and licensing
+   details, please contact:
+   - Luca Pinello: lpinello@mgh.harvard.edu
+   - Rosalba Giugno: rosalba.giugno@univr.it
+
+For more information on licensing terms and conditions, please reach out to the
+contacts above.
\ No newline at end of file
diff --git a/PostProcess/analisi_indels_NNN.sh b/PostProcess/analisi_indels_NNN.sh
index 98e8079..feb4e77 100755
--- a/PostProcess/analisi_indels_NNN.sh
+++ b/PostProcess/analisi_indels_NNN.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
-set -e # trace all errors
 
 # Script per l'analisi dei targets della ricerca REF e ENR con PAM NNN
 # Il file dei targets della ricerca sul genoma reference si chiama $REFtargets -> INPUT $1
@@ -47,10 +46,8 @@ touch $REFtargets.corrected
 # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total
 #echo 'Creazione file .total.txt'
-./extraction.sh "$REFtargets.corrected" "$ENRtargets" "$jobid" || {
-    echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-} # OUTPUT $jobid.common_targets.txt -> Non usato
+./extraction.sh "$REFtargets.corrected" "$ENRtargets" "$jobid"
+# OUTPUT $jobid.common_targets.txt -> Not used
 #        $jobid.semi_common_targets.txt
 #        $jobid.unique_targets.txt
@@ -73,10 +70,7 @@ rm "$jobid.semi_common_targets.minmaxdisr.txt"
 #echo 'Creazione cluster del file .total.txt'
 # 3) Clustering
-./cluster.dict.py "$jobid.total.txt" 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' || {
-    echo "CRISPRme ERROR: indels clustering failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-} # OUTPUT $jobid.total.cluster.txt
+./cluster.dict.py "$jobid.total.txt" 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' # OUTPUT $jobid.total.cluster.txt
 #sed -i ':a;N;$!ba;s/\n/\tn\tn\tn\n/g' $jobid.total.cluster.txt
 #sed -i '$s/$/\tn\tn\tn/g' $jobid.total.cluster.txt
@@ -104,10 +98,7 @@ rm "$jobid.total.txt"
 #echo 'Estrazione sample dal file .total.cluster.txt'
-./analisi_indels_NNN.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" "$mismatch" "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA || {
-    echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
+./analisi_indels_NNN.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" "$mismatch" "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA
 # OUTPUT $jobid.bestCFD_INDEL.txt
 #        $jobid.CFDGraph.txt (per fare l'area graph dei CFD REF vs ENR)
 # NOTA AnnotatorAllTargets.py salva su disco SOLO il target con CFD più alto nel cluster e tra le scomposizioni esistenti
@@ -133,18 +124,9 @@ echo 'Sorting and adjusting results'
 # #tail file w/o header and sort for realguide,chr,cluster_pos,score
 # tail -n +2 $jobid.bestCRISTA_INDEL.txt | LC_ALL=C sort -k15,15 -k4,4 -k6,6n -k21,21rg -T ./ >>$jobid.tmp && mv $jobid.tmp $jobid.bestCRISTA_INDEL.txt
-./adjust_cols.py "$jobid.bestCFD_INDEL.txt" || {
-    echo "CRISPRme ERROR: CFD indels report failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
-./adjust_cols.py "$jobid.bestCRISTA_INDEL.txt" || {
-    echo "CRISPRme ERROR: CRISTA indels report failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
-./adjust_cols.py "$jobid.bestmmblg_INDEL.txt" || {
-    echo "CRISPRme ERROR: mismatch+bulges indels report failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
+./adjust_cols.py "$jobid.bestCFD_INDEL.txt"
+./adjust_cols.py "$jobid.bestCRISTA_INDEL.txt"
+./adjust_cols.py "$jobid.bestmmblg_INDEL.txt"
 # sed -i '1s/.*/MMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref/' $jobid.bestmmblg_INDEL.txt
 # sed -i '1s/.*/MMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref/' $jobid.altmmblg.txt
@@ -152,18 +134,9 @@ echo 'Sorting and adjusting results'
 # pr -m -t -J $jobid.bestCFD_INDEL.txt $jobid.bestmmblg_INDEL.txt >$jobid.bestMerge.txt
 # pr -m -t -J $jobid.altCFD.txt $jobid.altmmblg.txt >$jobid.altMerge.txt
-./remove_bad_indel_targets.py "$jobid.bestCFD_INDEL.txt" || {
-    echo "CRISPRme ERROR: CFD indels report cleaning failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
-./remove_bad_indel_targets.py "$jobid.bestCRISTA_INDEL.txt" || {
-    echo "CRISPRme ERROR: CRISTA indels report cleaning failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
-./remove_bad_indel_targets.py "$jobid.bestmmblg_INDEL.txt" || {
-    echo "CRISPRme ERROR: mismatch+bulges indels report cleaning failed (script: ${0} line $((LINENO-1)))" >&2
-    exit 1
-}
+./remove_bad_indel_targets.py "$jobid.bestCFD_INDEL.txt"
+./remove_bad_indel_targets.py "$jobid.bestCRISTA_INDEL.txt"
+./remove_bad_indel_targets.py "$jobid.bestmmblg_INDEL.txt"
 #merge targets in same chr when they are at distance 3 from each other (inclusive) preserving the highest scoring one
 # ./merge_close_targets_cfd.sh $jobid.bestCFD_INDEL.txt $jobid.bestCFD_INDEL.txt.trimmed 3 'score'
diff --git a/PostProcess/extraction.sh b/PostProcess/extraction.sh
index b663668..9bb25b7 100755
--- a/PostProcess/extraction.sh
+++ b/PostProcess/extraction.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 ##NOTE AWK & GREP REPORT NO STDOUT IF NO MATCHES ARE FOUND (AWK DO NOT PRODUCE ANY OUTPUT)
-# set -e # trace all errors
 
 #PARAM $1 is ref targets file
 #PARAM $2 is var targets file
diff --git a/PostProcess/merge_alt_chr.sh b/PostProcess/merge_alt_chr.sh
index 86b7dd9..deca1af 100755
--- a/PostProcess/merge_alt_chr.sh
+++ b/PostProcess/merge_alt_chr.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
-set -e # trace all failures
-
 dir=$(dirname $1)
 fileIn=$1
 fileOut=$2
@@ -15,7 +13,8 @@ head -1 $fileIn >$fileOut
 for chrom in ${chroms[@]}; do
     echo $chrom
-    awk "/${chrom}\t/" test.targets.txt >$fileIn.$chrom.ref
+    # awk "/${chrom}\t/" test.targets.txt >$fileIn.$chrom.ref
+    grep -F -w "$chrom" $fileIn >$fileIn.$chrom.ref
     cut -f 3 $fileIn.$chrom.ref | LC_ALL=C sort -T "$dir" | uniq >$fileIn.$chrom.ref.targets
     awk -v chrom="$chrom" '$0 ~ chrom"_" {print($0)}' $fileIn >$fileIn.$chrom.alt
     awk 'NR==FNR{a[$0];next} !($0 in a)' $fileIn.$chrom.ref.targets $fileIn.$chrom.alt >$fileIn.$chrom.merged
diff --git a/PostProcess/merge_close_targets_cfd.sh b/PostProcess/merge_close_targets_cfd.sh
index e329593..6568973 100755
--- a/PostProcess/merge_close_targets_cfd.sh
+++ b/PostProcess/merge_close_targets_cfd.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
-# set -e # capture any failure
-
 fileIn=$1
 fileOut=$2
 thresh=$3 #threshold to use in order to merge near targets
@@ -27,10 +25,16 @@ echo "Sorting done in $(($ENDTIME - $STARTTIME)) seconds"
 # echo -e $header | cat - $fileIn.sorted.tmp > $fileIn.sorted
 # rm $fileIn.sorted.tmp
 echo "Merging contiguous targets"
+
+if [[ "${sort_pivot}" == "score" ]]; then
+    criteria=$sorting_criteria_scoring
+else
+    criteria=$sorting_criteria
+fi
+python merge_contiguous_targets.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $criteria
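+# A sketch of the call above with placeholder values (the column variables are
+# assigned by the caller; pivot and criteria must match what
+# merge_contiguous_targets.py accepts, i.e. pivot "score" or "mm+bulges" and
+# criteria drawn from mm, bulges, mm+bulges):
+#   python merge_contiguous_targets.py <targets.txt> <merged.txt> 3 \
+#       <chrom_col> <pos_col> <mmb_col> <guide_col> <snpinfo_col> <score_col> \
+#       score "mm+bulges,mm"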
 # python remove_contiguous_samples_cfd.py $fileIn.sorted $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd
-python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria
-# python remove_contiguous_samples_cfd_new.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria || {
-#     echo "CRISPRme ERROR: contigous SNP removal failed (script: ${0} line $((LINENO-1)))" >&2
-#     exit 1
-# }
+
+
+# python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot
+# python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_pivot $sorting_criteria_scoring $sorting_criteria
 # rm $fileIn.sorted
diff --git a/PostProcess/merge_contiguous_targets.py b/PostProcess/merge_contiguous_targets.py
new file mode 100644
index 0000000..86bcd9d
--- /dev/null
+++ b/PostProcess/merge_contiguous_targets.py
@@ -0,0 +1,615 @@
+"""
+This module provides functionality for merging target data from input files based
+on specified criteria. It includes functions for parsing command line arguments,
+processing target data, and writing results to output files.
+
+Key functions include:
+- `parse_commandline`: Parses and validates command line arguments for target
+  merging configuration.
+- `split_target_row`: Splits a target row string into its individual components.
+- `update_target_fields`: Updates specified fields in the target list by appending
+  corresponding values from another list.
+- `distribute_targets`: Distributes targets between reference and variant targets
+  from a given cluster.
+- `target_only_var`: Updates a target list to indicate whether it is a variant-only
+  target.
+- `remove_duplicate_targets`: Removes duplicate values from specified fields in a
+  target list.
+- `unfold_variant_targets`: Recovers and processes all variant targets from a given
+  dictionary.
+- `sorting_score`: Generates a sorting key function based on specified criteria
+  for sorting.
+- `sorting_fewest`: Creates a sorting key function based on the fewest specified
+  criteria.
+- `initialize_sorting_criteria`: Initializes a sorting criteria function based on
+  the provided parameters.
+- `retrieve_best_target`: Identifies and retrieves the best target from a given
+  cluster of targets.
+- `merge_targets`: Merges target data from an input file and writes the best
+  targets to an output file.
+- `main`: The entry point of the module that orchestrates the merging process.
+
+This module is designed to facilitate the analysis of genomic target data, allowing
+users to efficiently merge and sort targets based on various criteria.
+"""
+
+from typing import List, Tuple, Dict, Callable
+from time import time
+from io import TextIOWrapper
+
+import sys
+import os
+
+SORTCRITERIA = {"mm": 2, "bulges": 1, "mm+bulges": 0}
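+# The offsets above are applied relative to the 0-based mm+bulges column index
+# (mmbidx) given on the command line: "mm+bulges" reads column mmbidx itself,
+# "bulges" column mmbidx - 1, and "mm" column mmbidx - 2. This assumes the
+# Mismatches, Bulge_Size and Total columns are adjacent and in that order, as
+# in the CRISPRme report header used elsewhere in this PR.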
+ """ + + targets_fname = args[0] # targets file + if not os.path.isfile(targets_fname): + raise FileNotFoundError(f"Unable to locate {targets_fname}") + outfname = args[1] # output file + rangebp = int(args[2]) # targets merge range (bp) + if rangebp < 1: + raise ValueError(f"Forbidden targets merge range ({rangebp})") + chromidx = int(args[3]) - 1 # chromosome col idx + posidx = int(args[4]) - 1 # position col idx + mmbidx = int(args[5]) - 1 # mm+bulges col idx + guideidx = int(args[6]) - 1 # guide col idx + snpidx = int(args[7]) - 1 # snp info col idx + scoreidx = int(args[8]) - 1 # score col idx + pivot = args[9] # targets sorting pivot (score or mm+bulges) + # comma-separated list of criteria to use while sorting targets + sortcrit = args[10].split(",") + if len(sortcrit) > 3 or any(c not in SORTCRITERIA for c in sortcrit): + offending_vals = ",".join([c for c in sortcrit if c not in SORTCRITERIA]) + raise ValueError(f"Forbidden sort criteria selected: {offending_vals}") + return ( + targets_fname, + outfname, + rangebp, + chromidx, + posidx, + mmbidx, + guideidx, + snpidx, + scoreidx, + pivot, + sortcrit, + ) + + +def split_target_row( + target_row: str, guideidx: int, chromidx: int, posidx: int +) -> Tuple[str, str, int]: + """ + Splits a target row string into its individual components. + This function retrieves the guide, chromosome, and position from a target + row based on specified indices. + + Args: + target_row (str): A string representing a single target row, with fields + separated by whitespace. + guideidx (int): The index of the guide field in the target row. + chromidx (int): The index of the chromosome field in the target row. + posidx (int): The index of the position field in the target row. + + Returns: + Tuple[str, str, int]: A tuple containing the extracted guide, chromosome, + and position: + - guide (str): The guide extracted from the target row. + - chromosome (str): The chromosome extracted from the target row. + - position (int): The position extracted from the target row, converted + to an integer. + """ + + fields = target_row.strip().split() + return fields[guideidx], fields[chromidx], int(fields[posidx]) + + +def update_target_fields( + target: List[str], fields: List[str], samplesidx: int, snpid_idx: int, afidx: int +) -> List[str]: + """Update specified fields in the target list by appending corresponding + values from the fields list. + + This function modifies the target list by concatenating values from the fields + list at given indices, which is useful for aggregating information related to + samples, SNP IDs, and allele frequencies. + + Args: + target (List[str]): The list of target values to be updated. + fields (List[str]): The list of new values to append to the target. + samplesidx (int): The index in the target list for sample information. + snpid_idx (int): The index in the target list for SNP ID information. + afidx (int): The index in the target list for allele frequency information. + + Returns: + List[str]: The updated target list with concatenated values. + """ + + target[samplesidx] = f"{target[samplesidx]},{fields[samplesidx]}" + target[snpid_idx] = f"{target[snpid_idx]},{fields[snpid_idx]}" + target[afidx] = f"{target[afidx]},{fields[afidx]}" + return target + + +def distribute_targets( + cluster: List[str], + snpidx: int, + posidx: int, + snpid_idx: int, + samplesidx: int, + afidx: int, +) -> Tuple[List[List[str]], Dict[str, List[List[str]]]]: + """ + Distributes targets between reference and variant targets from a given cluster. 
+ It merges identical targets found in different datasets into a structured + format. + + This function processes a list of target strings, categorizing them into + reference targets and variant targets based on specific indices. Reference + targets are collected in a list, while variant targets are stored in a dictionary, + allowing for the merging of identical targets across datasets. + + Args: + cluster (List[str]): A list of target strings to be processed. + snpidx (int): The index indicating the SNP status of the target. + posidx (int): The index indicating the position of the target. + snpid_idx (int): The index for SNP IDs in the target fields. + samplesidx (int): The index for sample information in the target fields. + afidx (int): The index for allele frequencies in the target fields. + + Returns: + Tuple[List[List[str]], Dict[str, List[List[str]]]]: A tuple containing a + list of reference + targets and a dictionary of variant targets, where each key corresponds + to a unique target and its value is a list of associated target fields. + + Raises: + ValueError: If the input data is malformed or indices are out of range. + """ + + # distribute targets between reference and variant targets + # dict used to merge identical targets found in different datasets + reftargets, vartargets = [], {} + for target in cluster: + fields = target.strip().split() # retrieve target fields + if fields[snpidx] == "n": # target found in reference + reftargets.append(fields) + else: # target found in variant genomes + targetkey = f"{fields[posidx]}_{fields[snpidx]}" + current_target = vartargets.get(targetkey) + if current_target: + # update current sample list, snp ids, and allele freqs + vartargets[targetkey][0] = update_target_fields( + current_target[0], fields, samplesidx, snpid_idx, afidx + ) + else: + vartargets[targetkey] = [fields] # first target at position + return reftargets, vartargets + + +def target_only_var(target: List[str], varonly: bool) -> List[str]: + """ + Updates a target list to indicate whether it is a variant-only target. + This function modifies the target's status based on the provided flag. + + The function checks the `varonly` flag and updates the 13th element of the + target list to "y" if the flag is set to True, indicating that the target is + only found in variant genomes. It returns the modified target list. + + Args: + target (List[str]): A list representing the target information. + varonly (bool): A flag indicating whether the target is variant-only. + + Returns: + List[str]: The updated target list with the appropriate status. + """ + + # set apprpriate flag if no target in reference in current cluster + target[12] = "y" if varonly else target[12] + return target + + +def remove_duplicate_targets( + target: List[str], snpidx: int, snpid_idx: int, afidx: int, samplesidx: int +) -> List[str]: + """ + Removes duplicate values from specified fields in a target list. + This function ensures that SNP IDs, SNP information, allele frequencies, and + sample data contain only unique entries. + + The function takes a target list and specified indices for SNPs, SNP IDs, + allele frequencies, and samples. It processes each of these fields to eliminate + duplicates by converting them into sets and then back into comma-separated + strings, returning the modified target list. + + Args: + target (List[str]): A list representing the target information. + snpidx (int): The index for SNP information in the target list. + snpid_idx (int): The index for SNP IDs in the target list. 
+ afidx (int): The index for allele frequencies in the target list. + samplesidx (int): The index for sample information in the target list. + + Returns: + List[str]: The updated target list with duplicates removed from specified + fields. + """ + + # remove duplicate values from snp ids, snp info, allele freqs, and samples + target[snpidx] = ",".join(set(target[snpidx].split(","))) + target[snpid_idx] = ",".join(set(target[snpid_idx].split(","))) + target[afidx] = ",".join(set(target[afidx].split(","))) + target[samplesidx] = ",".join(set(target[samplesidx].split(","))) + return target + + +def unfold_variant_targets( + vartargets: Dict[str, List[List[str]]], + varonly: bool, + snpidx: int, + snpid_idx: int, + afidx: int, + samplesidx: int, +) -> List[List[str]]: + """ + Recovers and processes all variant targets from a given dictionary. + This function compiles variant targets into a single list while applying + necessary transformations and removing duplicates. + + The function iterates through the provided dictionary of variant targets, + applying the `target_only_var` function to each target based on the `varonly` + flag. It then removes duplicates from the resulting list of targets using the + `remove_duplicate_targets` function, returning a cleaned list of variant targets. + + Args: + vartargets (Dict[str, List[List[str]]]): A dictionary of variant targets, + where each key corresponds to a unique target identifier and the value + is a list of target fields. + varonly (bool): A flag indicating whether to mark targets as variant-only. + snpidx (int): The index for SNP information in the target list. + snpid_idx (int): The index for SNP IDs in the target list. + afidx (int): The index for allele frequencies in the target list. + samplesidx (int): The index for sample information in the target list. + + Returns: + List[List[str]]: A list of processed variant targets with duplicates removed. + """ + + # recover all variant targets and store in a list + vartargets_list = [] + for targets in vartargets.values(): + vartargets_list.extend([target_only_var(t, varonly) for t in targets]) + # remove duplicate values in targets + return [ + remove_duplicate_targets(t, snpidx, snpid_idx, afidx, samplesidx) + for t in vartargets_list + ] + + +def sorting_score(criteria: List[str], score_idx: int, mmbidx: int) -> Callable: + """ + Generates a sorting key function based on specified criteria for sorting. + This function allows for dynamic sorting of items based on one to three criteria, + prioritizing scores and additional metrics. + + The function returns a callable that can be used as a key in sorting operations. + Depending on the number of criteria provided, it constructs a tuple that includes + the negative score (to sort in descending order) and additional metrics derived + from the specified indices, ensuring that items are sorted according to the defined + priorities. + + Args: + criteria (List[str]): A list of criteria used for sorting. + score_idx (int): The index of the score in the items to be sorted. + mmbidx (int): The base index used to calculate additional metrics from + the criteria. + + Returns: + Callable: A function that takes an item and returns a tuple for sorting purposes. 
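+# Illustrative example (hypothetical target): with pivot "score" and criteria
+# ["mm+bulges", "mm"], a target with CFD 0.8, 3 mismatches and 1 bulge
+# (mm+bulges = 4) is keyed as (-0.8, 4, 3): highest score first, ties broken
+# by fewest mm+bulges, then fewest mismatches.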
+ """ + + if len(criteria) == 1: # single criterion + return lambda x: ( + -float(x[score_idx]), + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + ) + elif len(criteria) == 2: + return lambda x: ( + -float(x[score_idx]), + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + ) + # base case (all three ) + return lambda x: ( + -float(x[score_idx]), + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + int(x[mmbidx - SORTCRITERIA[criteria[2]]]), + ) + + +def sorting_fewest(criteria: List[str], mmbidx: int) -> Callable: + """ + Creates a sorting key function based on the fewest specified criteria. + This function allows for sorting items by one to three criteria, focusing on + the values derived from the specified indices. + + The function returns a callable that can be used as a key in sorting operations. + Depending on the number of criteria provided, it constructs a tuple of integer + values from the specified indices, enabling sorting based on the defined + priorities. + + Args: + criteria (List[str]): A list of criteria used for sorting. + mmbidx (int): The base index used to calculate values from the criteria. + + Returns: + Callable: A function that takes an item and returns a tuple for sorting + purposes. + """ + + if len(criteria) == 1: # one criterion + return lambda x: (int(x[mmbidx - SORTCRITERIA[criteria[0]]])) + elif len(criteria) == 2: + return lambda x: ( + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + # int(x[mmbidx - 2]), + # int(x[mmbidx - 1]), + ) + # base case (all three ) + return lambda x: ( + int(x[mmbidx - SORTCRITERIA[criteria[0]]]), + int(x[mmbidx - SORTCRITERIA[criteria[1]]]), + int(x[mmbidx - SORTCRITERIA[criteria[2]]]), + ) + + +def initialize_sorting_criteria( + criteria: List[str], scoreidx: int, mmbidx: int, score: bool +) -> Callable: + """ + Initializes a sorting criteria function based on the provided parameters. + This function determines whether to sort by score or by the fewest criteria + based on the input flag. + + Depending on the value of the `score` flag, the function returns either a + sorting function that prioritizes scores or one that focuses on the fewest + specified criteria. This allows for flexible sorting behavior based on the + user's needs. + + Args: + criteria (List[str]): A list of criteria used for sorting. + scoreidx (int): The index of the score in the items to be sorted. + mmbidx (int): The base index used to calculate additional metrics from + the criteria. + score (bool): A flag indicating whether to sort by score or by the fewest + criteria. + + Returns: + Callable: A function that can be used as a key in sorting operations. + """ + + if score: + return sorting_score(criteria, scoreidx, mmbidx) + return sorting_fewest(criteria, mmbidx) + + +def retrieve_best_target( + cluster: List[str], + snpidx: int, + posidx: int, + guideidx: int, + scoreidx: int, + mmbidx: int, + pivot: str, + sorting_criteria: List[str], + outfile: TextIOWrapper, + outfile_disc: TextIOWrapper, +) -> None: + """ + Identifies and retrieves the best target from a given cluster of targets. + This function processes the targets based on specified criteria and outputs + the best target along with alternative alignments to the provided output files. + + The function first distributes the targets into reference and variant categories, + then unfolds the variant targets into a list. 
+    reftargets, vartargets = distribute_targets(
+        cluster, snpidx, posidx, snpidx - 2, guideidx - 2, snpidx - 1
+    )
+    varonly = not reftargets  # check if found only variant targets
+    # retrieve variant targets in list
+    vartargets = unfold_variant_targets(
+        vartargets, varonly, snpidx, snpidx - 2, snpidx - 1, guideidx - 2
+    )
+    # sort targets using the criteria specified in input
+    score = pivot == "score"
+    if reftargets:
+        reftargets = sorted(
+            reftargets,
+            key=initialize_sorting_criteria(sorting_criteria, scoreidx, mmbidx, score),
+        )
+    if vartargets:
+        vartargets = sorted(
+            vartargets,
+            key=initialize_sorting_criteria(sorting_criteria, scoreidx, mmbidx, score),
+        )
+    if varonly:
+        target = vartargets.pop(0)  # retrieve best target
+        # count the targets remaining in the cluster
+        target[scoreidx - 1] = str(len(vartargets))
+    elif reftargets and vartargets:
+        if score:  # check on score
+            target = (
+                vartargets.pop(0)
+                if float(vartargets[0][scoreidx]) > float(reftargets[0][scoreidx])
+                else reftargets.pop(0)
+            )
+        elif int(vartargets[0][mmbidx]) < int(reftargets[0][mmbidx]):
+            target = vartargets.pop(0)
+        else:
+            target = reftargets.pop(0)
+        # count the targets remaining in the cluster
+        target[scoreidx - 1] = str(len(reftargets) + len(vartargets))
+    else:
+        target = reftargets.pop(0)
+        target[scoreidx - 1] = str(len(reftargets))
+    outfile.write("\t".join(target) + "\n")  # report the best target
+    # write alternative alignments
+    for target in reftargets + vartargets:
+        target[scoreidx - 1] = str(len(reftargets) + len(vartargets))
+        outfile_disc.write("\t".join(target) + "\n")  # report the alternative target
+
+
+def merge_targets(
+    inargs: Tuple[str, str, int, int, int, int, int, int, int, str, List[str]]
+) -> None:
+    """
+    Merges target data from an input file and writes the best targets to an output
+    file. This function processes clusters of targets based on specified criteria
+    and handles discarded samples in a separate output file.
+
+    The function reads target data from the input file, grouping targets into
+    clusters based on guide, chromosome, and position. It retrieves the best target
+    from each cluster and writes the results to the specified output files,
+    ensuring that discarded samples are also recorded.
+
+    Args:
+        inargs (Tuple[str, str, int, int, int, int, int, int, int, str, List[str]]):
+            A tuple containing input parameters, including input and output file
+            names, indices for various target fields, and sorting criteria.
+
+    Returns:
+        None: This function does not return a value but writes output to the
+        specified files.
+    """
+
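+    # Positional meaning of inargs, following the tuple returned by
+    # parse_commandline: [0] targets file, [1] output file, [2] merge range in
+    # bp, [3] chromosome column, [4] position column, [5] mm+bulges column,
+    # [6] guide column, [7] SNP info column, [8] score column, [9] sorting
+    # pivot, [10] sorting criteria.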
+    outfname_disc = f"{inargs[1]}.discarded_samples"  # discarded targets file
+    # initialize variables used during merge
+    prevpos, prevguide, prevchrom, cluster = -(inargs[2] + 1), "", "", []
+    with open(inargs[0], mode="r") as infile:
+        with open(inargs[1], mode="w") as outfile:
+            with open(outfname_disc, mode="w") as outfile_disc:
+                # header placed in both outfiles
+                header = infile.readline()
+                outfile_disc.write(header)
+                outfile.write(header)
+                for line in infile:  # start reading targets
+                    # retrieve guide chromosome and position
+                    guide, chrom, pos = split_target_row(
+                        line, inargs[6], inargs[3], inargs[4]
+                    )
+                    # open new targets cluster and retrieve the best target from previous cluster
+                    if (
+                        prevguide != guide
+                        or prevchrom != chrom
+                        or (pos - prevpos) > inargs[2]
+                    ):
+                        retrieve_best_target(
+                            cluster,
+                            inargs[7],
+                            inargs[4],
+                            inargs[6],
+                            inargs[8],
+                            inargs[5],
+                            inargs[9],
+                            inargs[10],
+                            outfile,
+                            outfile_disc,
+                        )
+                        cluster = [line]
+                    else:  # append target data to current cluster
+                        cluster.append(line)
+                    # update lookup variables
+                    prevpos, prevguide, prevchrom = pos, guide, chrom
+                retrieve_best_target(
+                    cluster,
+                    inargs[7],
+                    inargs[4],
+                    inargs[6],
+                    inargs[8],
+                    inargs[5],
+                    inargs[9],
+                    inargs[10],
+                    outfile,
+                    outfile_disc,
+                )  # process the last cluster
+
+
+def main():
+    # read input args
+    inargs = parse_commandline(sys.argv[1:])
+    start = time()
+    merge_targets(inargs)
+    sys.stdout.write(f"Targets merge completed in {(time() - start):.2f}s\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PostProcess/new_simple_analysis.py b/PostProcess/new_simple_analysis.py
index 976c42a..198ef5f 100755
--- a/PostProcess/new_simple_analysis.py
+++ b/PostProcess/new_simple_analysis.py
@@ -12,10 +12,9 @@
 # For scoring of CFD And Doench
-tab = str.maketrans("ACTGRYSWMKHDBVactgryswmkhdbv",
-                    "TGACYRSWKMDHVBtgacyrswkmdhvb")
+tab = str.maketrans("ACTGRYSWMKHDBVactgryswmkhdbv", "TGACYRSWKMDHVBtgacyrswkmdhvb")
 
-iupac_nucleotides = set('RYSWKMBDHVryswkmbdhv')
+iupac_nucleotides = set("RYSWKMBDHVryswkmbdhv")
 iupac_code_set = {
     "R": {"A", "G"},
     "Y": {"C", "T"},
@@ -45,7 +44,7 @@
     "t": {"t"},
     "c": {"c"},
     "g": {"g"},
-    'N': {'A', 'T', 'G', 'C'}
+    "N": {"A", "T", "G", "C"},
 }
@@ -70,7 +69,7 @@
     "d": ("A", "G", "T"),
     "h": ("A", "C", "T"),
     "v": ("A", "C", "G"),
-    'N': ('A', 'T', 'C', 'G')
+    "N": ("A", "T", "C", "G"),
 }
@@ -79,10 +78,10 @@ def reverse_complement_table(seq):
 
 class reversor:
-    '''
+    """
     Nel caso debba ordinare più campi però con reverse diversi, eg uno True e l'altro False, posso usare
     questa classe nella chiave per simulare il contrario del reverse applicato
-    '''
+    """
 
     def __init__(self, obj):
         self.obj = obj
@@ -100,8 +99,8 @@ def calc_cfd(guide_seq, sg, pam, mm_scores, pam_scores, do_scores):
     # score = -1
     # return score
     score = 1
-    sg = sg.replace('T', 'U')
-    guide_seq = guide_seq.replace('T', 'U')
+    sg = sg.replace("T", "U")
+    guide_seq = guide_seq.replace("T", "U")
     s_list = list(sg)
     guide_seq_list = list(guide_seq)
@@ -110,14 +109,15 @@
             score *= 1
         else:
             try:  # Catch exception if IUPAC character
-                key = 'r' + guide_seq_list[i] + \
-                    ':d' + revcom(sl) + ',' + str(i + 1)
+                key = "r" + guide_seq_list[i] + ":d" + revcom(sl) + "," + str(i + 1)
             except Exception as e:
                 score = 0
                 break
             try:
                 score *= mm_scores[key]
-            except Exception as e:  # If '-' is in first position, i do not have the score for that position
+            except (
+                Exception
+            ) as e:  # If '-' is in first position, i do not have the score for that position
                 pass
     score *= pam_scores[pam]
@@ -125,31 +125,35 @@
 
 def revcom(s):
-    basecomp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'A', '-': '-'}
+    basecomp = {"A": "T", "C": "G", "G": "C", "T": "A", "U": "A", "-": "-"}
     letters = list(s[::-1])
     try:
         letters = [basecomp[base] for base in letters]
     except:
        return None  # If some IUPAC were not translated
-    return ''.join(letters)
+    return "".join(letters)
 
 
 def get_mm_pam_scores():
     # print(os.path.dirname(os.path.realpath(__file__)))
     try:
-        mm_scores = pickle.load(open(os.path.dirname(
-            os.path.realpath(__file__)) + '/mismatch_score.pkl', 'rb'))
-        pam_scores = pickle.load(open(os.path.dirname(
-            os.path.realpath(__file__)) + '/PAM_scores.pkl', 'rb'))
+        mm_scores = pickle.load(
+            open(
+                os.path.dirname(os.path.realpath(__file__)) + "/mismatch_score.pkl",
+                "rb",
+            )
+        )
+        pam_scores = pickle.load(
+            open(os.path.dirname(os.path.realpath(__file__)) + "/PAM_scores.pkl", "rb")
+        )
         return (mm_scores, pam_scores)
     except:
-        raise Exception(
-            "Could not find file with mismatch scores or PAM scores")
+        raise Exception("Could not find file with mismatch scores or PAM scores")
 
 
 def retrieveFromDict(chr_pos):
     try:
-        entry = mydict[current_chr+','+str(chr_pos+1)]
+        entry = mydict[current_chr + "," + str(chr_pos + 1)]
     except:
         snp_list = []
         sample_list = []
@@ -157,40 +161,48 @@
         AF_list = []
         rsID_list = []
         snp_info_list = []
         sample_list.append([])  # no samples
-        snp_list.append('C')  # fake snp
-        rsID_list.append('.')  # no rsid
-        AF_list.append('0')  # fake AF
+        snp_list.append("C")  # fake snp
+        rsID_list.append(".")  # no rsid
+        AF_list.append("0")  # fake AF
         snp_info_list.append(
-            current_chr+'_'+str(chr_pos+1)+'_'+'C'+'_'+'G')  # fake snp info list
+            current_chr + "_" + str(chr_pos + 1) + "_" + "C" + "_" + "G"
+        )  # fake snp info list
         return snp_list, sample_list, rsID_list, AF_list, snp_info_list
-    multi_entry = entry.split('$')
+    multi_entry = entry.split("$")
     snp_list = []
     sample_list = []
     AF_list = []
     rsID_list = []
     snp_info_list = []
     for entry in multi_entry:
-        split_entry = entry.split(';')
-        samples = split_entry[0].strip().split(',')
-        if samples[0] == '':
+        split_entry = entry.split(";")
+        samples = split_entry[0].strip().split(",")
+        if samples[0] == "":
             samples = []
         sample_list.append(samples)
-        snp_list.append(split_entry[1].strip().split(',')[1])
+        snp_list.append(split_entry[1].strip().split(",")[1])
         rsID_list.append(split_entry[2].strip())
        AF_list.append(split_entry[3].strip())
         snp_info_list.append(
-            current_chr+'_'+str(chr_pos+1)+'_'+split_entry[1].split(',')[0]+'_'+split_entry[1].split(',')[1])
+            current_chr
+            + "_"
+            + str(chr_pos + 1)
+            + "_"
+            + split_entry[1].split(",")[0]
+            + "_"
+            + split_entry[1].split(",")[1]
+        )
     return snp_list, sample_list, rsID_list, AF_list, snp_info_list
 
 
-def iupac_decomposition(split, guide_no_bulge,
-                        guide_no_pam, cluster_to_save):
+def iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save):
     realTarget = split[2]
-    replaceTarget = split[2].replace('-', '')
-    refSeq = genomeStr[int(split[4]): int(split[4])+len(replaceTarget)].upper()
+    replaceTarget = split[2].replace("-", "")
+    refSeq = genomeStr[int(split[4]) : int(split[4]) + len(replaceTarget)].upper()
+    refseqlist = list(refSeq)
 
     revert = False
-    if split[6] == '-':
+    if split[6] == "-":
         revert = True
         replaceTarget = reverse_complement_table(replaceTarget)
@@ -203,31 +215,44 @@
     totalDict[1][0] = dict()
     countIUPAC = 0
     for pos_c, c in enumerate(replaceTarget):
+        if pos_c >= len(refseqlist) or pos_c < 0:  # should not be necessary
+            countIUPAC = 0
+            break
         if c in iupac_code:
             countIUPAC += 1
             snpToReplace, sampleSet, rsID, AF_var, snpInfo = retrieveFromDict(
-                pos_c+int(split[4]))
+                pos_c + int(split[4])
+            )
             for i, elem in enumerate(snpToReplace):
-                listReplaceTarget = list(refSeq)
+                listReplaceTarget = [nt for nt in refseqlist]
                 listReplaceTarget[pos_c] = elem
                 listInfo = [[rsID[i], AF_var[i], snpInfo[i]]]
                 if haplotype_check:
                     haploSamples = {0: [], 1: []}
                     for count, sample in enumerate(sampleSet[i]):
-                        sampleInfo = sample.split(':')
+                        sampleInfo = sample.split(":")
                         for haplo in totalDict:
-                            if sampleInfo[1].split('|')[haplo] != '0':
+                            if sampleInfo[1].split("|")[haplo] != "0":
                                 haploSamples[haplo].append(sampleInfo[0])
                     totalDict[0][0][(pos_c, elem)] = [
-                        listReplaceTarget, set(haploSamples[0]), listInfo]
+                        listReplaceTarget,
+                        set(haploSamples[0]),
+                        listInfo,
+                    ]
                     totalDict[1][0][(pos_c, elem)] = [
-                        listReplaceTarget, set(haploSamples[1]), listInfo]
+                        listReplaceTarget,
+                        set(haploSamples[1]),
+                        listInfo,
+                    ]
                 else:
                     sampleList = list()
                     for count, sample in enumerate(sampleSet[i]):
-                        sampleList.append(sample.split(':')[0])
+                        sampleList.append(sample.split(":")[0])
                     totalDict[0][0][(pos_c, elem)] = [
-                        listReplaceTarget, set(sampleList), listInfo]
+                        listReplaceTarget,
+                        set(sampleList),
+                        listInfo,
+                    ]
 
     if countIUPAC > 0:  # if found valid alternative targets
         if revert:
@@ -239,113 +264,122 @@
                     createdNewLayer = False
                 else:
                     break
-                totalDict[count][size+1] = dict()
+                totalDict[count][size + 1] = dict()
                 # for each snp in target (fixpoint)
                 for key in totalDict[count][size]:
                     # for each other snp in target (> fixpoint)
                     for newkey in totalDict[count][0]:
                         if newkey[-2] > key[-2]:
                             resultSet = totalDict[count][size][key][1].intersection(
-                                totalDict[count][0][newkey][1])  # extract intersection of sample to generate possible multisnp target
+                                totalDict[count][0][newkey][1]
+                            )  # extract intersection of sample to generate possible multisnp target
                             if len(resultSet) > 0:  # if set is not null
                                 createdNewLayer = True
                                 # add new snp to preceding target seq with snp
-                                replaceTarget1 = totalDict[count][0][newkey][0].copy(
-                                )
-                                replaceTarget2 = totalDict[count][size][key][0].copy(
-                                )
-                                replaceTarget2[newkey[0]
-                                               ] = replaceTarget1[newkey[0]]
-                                listInfo2 = totalDict[count][size][key][2].copy(
-                                )
-                                listInfo2.extend(
-                                    totalDict[count][0][newkey][2])
+                                replaceTarget1 = totalDict[count][0][newkey][0].copy()
+                                replaceTarget2 = totalDict[count][size][key][0].copy()
+                                replaceTarget2[newkey[0]] = replaceTarget1[newkey[0]]
+                                listInfo2 = totalDict[count][size][key][2].copy()
+                                listInfo2.extend(totalDict[count][0][newkey][2])
                                 # add to next level the modified seq and set of samples and info of snp
                                 combinedKey = key + newkey
-                                totalDict[count][size+1][combinedKey] = [
-                                    replaceTarget2, resultSet, listInfo2]
+                                totalDict[count][size + 1][combinedKey] = [
+                                    replaceTarget2,
+                                    resultSet,
+                                    listInfo2,
+                                ]
                                 # remove the new generated sample set from all lower levels
                                 # (this should be done only with phased VCF since unphased cannot be verified)
                                 if haplotype_check:
-                                    totalDict[count][size][key][1] = totalDict[count][size][key][1] - \
-                                        totalDict[count][size +
-                                                         1][combinedKey][1]
-                                    totalDict[count][0][newkey][1] = totalDict[count][0][newkey][1] - \
-                                        totalDict[count][size +
-                                                         1][combinedKey][1]
+                                    totalDict[count][size][key][1] = (
+                                        totalDict[count][size][key][1]
+                                        - totalDict[count][size + 1][combinedKey][1]
+                                    )
+                                    totalDict[count][0][newkey][1] = (
+                                        totalDict[count][0][newkey][1]
+                                        - totalDict[count][size + 1][combinedKey][1]
+                                    )
 
         refSeq_with_bulges = list(refSeq)
         for pos, char in enumerate(realTarget):
-            if char == '-':
-                refSeq_with_bulges.insert(pos, '-')
+            if char == "-":
+                refSeq_with_bulges.insert(pos, "-")
 
-        for position_t, char_t in enumerate(refSeq_with_bulges[pos_beg: pos_end]):
+        for position_t, char_t in enumerate(refSeq_with_bulges[pos_beg:pos_end]):
             if char_t.upper() != guide_no_pam[position_t]:
                 tmp_pos_mms = position_t
-                if guide_no_pam[position_t] != '-':
-                    refSeq_with_bulges[pos_beg +
-                                       position_t] = char_t.lower()
+                if guide_no_pam[position_t] != "-":
+                    refSeq_with_bulges[pos_beg + position_t] = char_t.lower()
         # ref sequence with bulges
-        refSeq_with_bulges = ''.join(refSeq_with_bulges)
+        refSeq_with_bulges = "".join(refSeq_with_bulges)
 
         for level in totalDict[count]:
             for key in totalDict[count][level]:
                 if len(totalDict[count][level][key][1]) > 0:
                     if revert:
                         totalDict[count][level][key][0] = reverse_complement_table(
-                            ''.join(totalDict[count][level][key][0]))
+                            "".join(totalDict[count][level][key][0])
+                        )
                     else:
-                        totalDict[count][level][key][0] = ''.join(
-                            totalDict[count][level][key][0])
+                        totalDict[count][level][key][0] = "".join(
+                            totalDict[count][level][key][0]
+                        )
                     final_line = split.copy()
                     target_to_list = list(totalDict[count][level][key][0])
                     for pos, char in enumerate(realTarget):
-                        if char == '-':
-                            target_to_list.insert(pos, '-')
+                        if char == "-":
+                            target_to_list.insert(pos, "-")
                     mm_new_t = 0
                     tmp_pos_mms = 0
-                    for position_t, char_t in enumerate(target_to_list[pos_beg: pos_end]):
+                    for position_t, char_t in enumerate(
+                        target_to_list[pos_beg:pos_end]
+                    ):
                         if char_t.upper() != guide_no_pam[position_t]:
                             mm_new_t += 1
                             tmp_pos_mms = position_t
-                            if guide_no_pam[position_t] != '-':
-                                target_to_list[pos_beg +
-                                               position_t] = char_t.lower()
+                            if guide_no_pam[position_t] != "-":
+                                target_to_list[pos_beg + position_t] = (
+                                    char_t.lower()
+                                )
                     # pam respect input PAM after IUPAC resolution
                     pam_ok = True
-                    for pam_chr_pos, pam_chr in enumerate(target_to_list[pam_begin: pam_end]):
+                    for pam_chr_pos, pam_chr in enumerate(
+                        target_to_list[pam_begin:pam_end]
+                    ):
                        if pam_chr.upper() not in iupac_code_set[pam[pam_chr_pos]]:
                             pam_ok = False
-                    target_pam_ref = refSeq_with_bulges[pam_begin: pam_end]
+                    target_pam_ref = refSeq_with_bulges[pam_begin:pam_end]
                     found_creation = False
                     for pos_pam, pam_char in enumerate(target_pam_ref):
                         # ref char not in set of general pam char
-                        if not iupac_code_set[pam[pos_pam]] & iupac_code_set[pam_char]:
+                        if (
+                            not iupac_code_set[pam[pos_pam]]
+                            & iupac_code_set[pam_char]
+                        ):
                             found_creation = True
                     # value of mm and bulges is over allowed threshold, discard target
                     if mm_new_t - int(split[8]) > allowed_mms:
                         continue
                     elif pam_ok:
-                        final_line[2] = ''.join(target_to_list)
+                        final_line[2] = "".join(target_to_list)
                         final_line[7] = str(mm_new_t - int(final_line[8]))
                         # total differences between targets and guide (mismatches + bulges)
                         final_line[9] = str(mm_new_t)
                         if found_creation:
-                            final_line[10] = ''.join(
-                                target_to_list[pam_begin: pam_end])
-                        final_line[12] = ','.join(
-                            totalDict[count][level][key][1])
-                        tmp_matrix = np.array(
-                            totalDict[count][level][key][2])
+                            final_line[10] = "".join(
+                                target_to_list[pam_begin:pam_end]
+                            )
+                        final_line[12] = ",".join(totalDict[count][level][key][1])
+                        tmp_matrix = np.array(totalDict[count][level][key][2])
                         if tmp_matrix.shape[0] > 1:
-                            final_line[15] = ','.join(tmp_matrix[:, 0])
-                            final_line[16] = ','.join(tmp_matrix[:, 1])
-                            final_line[17] = ','.join(tmp_matrix[:, 2])
+                            final_line[15] = ",".join(tmp_matrix[:, 0])
+                            final_line[16] = ",".join(tmp_matrix[:, 1])
+                            final_line[17] = ",".join(tmp_matrix[:, 2])
                         else:
                             final_line[15] = str(tmp_matrix[0][0])
                             final_line[16] = str(tmp_matrix[0][1])
@@ -363,27 +397,55 @@ def preprocess_CFD_score(target):
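+    # NOTE: 55 and 33 are sentinel values appended upstream in place of the
+    # reference score: 55 marks a reference-derived alignment (its CFD is
+    # simply duplicated as CFD_ref), while 33 marks an alternative alignment,
+    # whose CFD_ref must be recomputed against the reference sequence.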
     # preprocess target then calculate CFD score
     if do_scores:
-        if target[0] == 'DNA':
-            cfd_score = calc_cfd(target[1][int(target[bulge_pos]):], target[2].upper()[int(
-                target[bulge_pos]):-3], target[2].upper()[-2:], mm_scores, pam_scores, do_scores)
+        if target[0] == "DNA":
+            cfd_score = calc_cfd(
+                target[1][int(target[bulge_pos]) :],
+                target[2].upper()[int(target[bulge_pos]) : -3],
+                target[2].upper()[-2:],
+                mm_scores,
+                pam_scores,
+                do_scores,
+            )
             # append to target the CFD score of the aligned sequence (alt or ref)
             target.append("{:.3f}".format(cfd_score))
             # -3 position is a placeholder for ref score
-            if target[-3] == 55:  # if 55 sequence is ref so no score have to be calculated
+            if (
+                target[-3] == 55
+            ):  # if 55 sequence is ref so no score have to be calculated
                 target[-3] = "{:.3f}".format(cfd_score)
-            if target[-3] == 33:  # if 33 sequence is alt so ref score must be calculated
-                cfd_ref_score = calc_cfd(target[1][int(target[bulge_pos]):], target[-4].upper()[int(
-                    target[bulge_pos]):-3], target[-4].upper()[-2:], mm_scores, pam_scores, do_scores)
+            if (
+                target[-3] == 33
+            ):  # if 33 sequence is alt so ref score must be calculated
+                cfd_ref_score = calc_cfd(
+                    target[1][int(target[bulge_pos]) :],
+                    target[-4].upper()[int(target[bulge_pos]) : -3],
+                    target[-4].upper()[-2:],
+                    mm_scores,
+                    pam_scores,
+                    do_scores,
+                )
                 target[-3] = "{:.3f}".format(cfd_ref_score)
         else:
-            cfd_score = calc_cfd(target[1], target[2].upper()[
-                                 :-3], target[2].upper()[-2:], mm_scores, pam_scores, do_scores)
+            cfd_score = calc_cfd(
+                target[1],
+                target[2].upper()[:-3],
+                target[2].upper()[-2:],
+                mm_scores,
+                pam_scores,
+                do_scores,
+            )
             target.append("{:.3f}".format(cfd_score))
             if target[-3] == 55:
                 target[-3] = "{:.3f}".format(cfd_score)
             if target[-3] == 33:
-                cfd_ref_score = calc_cfd(target[1], target[-4].upper()[
-                    :-3], target[-4].upper()[-2:], mm_scores, pam_scores, do_scores)
+                cfd_ref_score = calc_cfd(
+                    target[1],
+                    target[-4].upper()[:-3],
+                    target[-4].upper()[-2:],
+                    mm_scores,
+                    pam_scores,
+                    do_scores,
+                )
                 target[-3] = "{:.3f}".format(cfd_ref_score)
     else:
         # no score calculated, append -1 in CFD score and in position -3 insert -1 value (-1 means no score calculated)
@@ -421,43 +483,47 @@ def preprocess_CRISTA_score(cluster_targets):
     # process all found targets
     for index, target in enumerate(cluster_targets):
         # list with non-aligned sgRNA
-        sgRNA_non_aligned_list.append(
-            str(target[1])[:len(str(target[1]))-3]+'NGG')
+        sgRNA_non_aligned_list.append(str(target[1])[: len(str(target[1])) - 3] + "NGG")
         # list with aligned DNA
         DNA_aligned_list.append(str(target[2]))
         # first 5 nucleotide to add to protospacer
-        pre_protospacer_DNA = genomeStr[int(
-            target[4])-5:int(target[4])].upper()
+        pre_protospacer_DNA = genomeStr[int(target[4]) - 5 : int(target[4])].upper()
         # protospacer taken directly from the aligned target
-        protospacerDNA = str(target[2]).replace('-', '')
-        if target[6] == '-':
+        protospacerDNA = str(target[2]).replace("-", "")
+        if target[6] == "-":
             protospacerDNA = reverse_complement_table(protospacerDNA)
         # last 5 nucleotides to add to protospacer
-        post_protospacer_DNA = genomeStr[int(
-            target[4])+len(target[1]):int(target[4])+len(target[1])+5].upper()
+        post_protospacer_DNA = genomeStr[
+            int(target[4]) + len(target[1]) : int(target[4]) + len(target[1]) + 5
+        ].upper()
         # DNA seq extracted from genome and append to aligned DNA seq from CRISPRme
-        complete_DNA_seq = str(pre_protospacer_DNA) + \
-            protospacerDNA+str(post_protospacer_DNA)
+        complete_DNA_seq = (
+            str(pre_protospacer_DNA) + protospacerDNA + str(post_protospacer_DNA)
+        )
         for elem in iupac_nucleotides:
             if elem in complete_DNA_seq:
-                complete_DNA_seq = complete_DNA_seq.replace(elem, '')
+                complete_DNA_seq = complete_DNA_seq.replace(elem, "")
         # trim the 3' and 5' end to avoid sequences longer than 29
         len_DNA_seq = len(complete_DNA_seq)
-        first_half = complete_DNA_seq[int(len_DNA_seq/2)-14:int(len_DNA_seq/2)]
-        second_half = complete_DNA_seq[int(
-            len_DNA_seq/2):int(len_DNA_seq/2)+15]
-        complete_DNA_seq = first_half+second_half
+        first_half = complete_DNA_seq[int(len_DNA_seq / 2) - 14 : int(len_DNA_seq / 2)]
+        second_half = complete_DNA_seq[int(len_DNA_seq / 2) : int(len_DNA_seq / 2) + 15]
+        complete_DNA_seq = first_half + second_half
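+        # Illustrative arithmetic (assuming no IUPAC symbols were removed):
+        # a 23-nt protospacer gives 5 + 23 + 5 = 33 nt; keeping 14 nt left of
+        # the midpoint and 15 nt right of it trims this to the 29 nt expected
+        # downstream.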
-        if target[6] == '-':
+        if target[6] == "-":
             complete_DNA_seq = reverse_complement_table(complete_DNA_seq)
         # if 'N' is present in the reference DNA seq, we must use a fake DNA seq to complete the aligned
         # that will be discarded after
-        if 'N' in complete_DNA_seq or 'n' in complete_DNA_seq or 'N' in DNA_aligned_list[-1] or 'n' in DNA_aligned_list[-1]:
-            complete_DNA_seq = 'A'*29
-            DNA_aligned_list[-1] = 'A'*len(str(target[2]))
+        if (
+            "N" in complete_DNA_seq
+            or "n" in complete_DNA_seq
+            or "N" in DNA_aligned_list[-1]
+            or "n" in DNA_aligned_list[-1]
+        ):
+            complete_DNA_seq = "A" * 29
+            DNA_aligned_list[-1] = "A" * len(str(target[2]))
             index_to_null.append(index)
         # append sequence to DNA list
@@ -467,7 +533,8 @@
     crista_score_list_alt = list()
     if do_scores:
         crista_score_list_alt = CRISTA_predict_list(
-            sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list)
+            sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list
+        )
 
     # preprocess target then calculate CRISTA score
     sgRNA_non_aligned_list = list()
@@ -476,45 +543,48 @@
     # process all ref sequences in targets
     for index, target in enumerate(cluster_targets):
         # list with non-aligned sgRNA
-        sgRNA_non_aligned_list.append(
-            str(target[1])[:len(str(target[1]))-3]+'NGG')
+        sgRNA_non_aligned_list.append(str(target[1])[: len(str(target[1])) - 3] + "NGG")
         # list with aligned DNA
-        if 'n' not in target[-3]:
+        if "n" not in target[-3]:
             DNA_aligned_list.append(str(target[-3]))
         else:
             DNA_aligned_list.append(str(target[2]))
         # first 5 nucleotide to add to protospacer
-        pre_protospacer_DNA = genomeStr[int(
-            target[4])-5:int(target[4])]
+        pre_protospacer_DNA = genomeStr[int(target[4]) - 5 : int(target[4])]
         # protospacer taken directly from the ref genome
-        protospacerDNA = genomeStr[int(target[4]):int(
-            target[4])+len(target[1])]
+        protospacerDNA = genomeStr[int(target[4]) : int(target[4]) + len(target[1])]
         # last 5 nucleotides to add to protospacer
-        post_protospacer_DNA = genomeStr[int(
-            target[4])+len(target[1]):int(target[4])+len(target[1])+5]
+        post_protospacer_DNA = genomeStr[
+            int(target[4]) + len(target[1]) : int(target[4]) + len(target[1]) + 5
+        ]
         # DNA seq extracted from genome and append to aligned DNA seq from CRISPRme
-        complete_DNA_seq = str(pre_protospacer_DNA) + \
-            protospacerDNA+str(post_protospacer_DNA)
+        complete_DNA_seq = (
+            str(pre_protospacer_DNA) + protospacerDNA + str(post_protospacer_DNA)
+        )
         for elem in iupac_nucleotides:
             if elem in complete_DNA_seq:
-                complete_DNA_seq = complete_DNA_seq.replace(elem, '')
+                complete_DNA_seq = complete_DNA_seq.replace(elem, "")
         # trim the 3' and 5' end to avoid sequences longer than 29
         len_DNA_seq = len(complete_DNA_seq)
-        first_half = complete_DNA_seq[int(len_DNA_seq/2)-14:int(len_DNA_seq/2)]
-        second_half = complete_DNA_seq[int(
-            len_DNA_seq/2):int(len_DNA_seq/2)+15]
-        complete_DNA_seq = first_half+second_half
-        if target[6] == '-':
+        first_half = complete_DNA_seq[int(len_DNA_seq / 2) - 14 : int(len_DNA_seq / 2)]
+        second_half = complete_DNA_seq[int(len_DNA_seq / 2) : int(len_DNA_seq / 2) + 15]
+        complete_DNA_seq = first_half + second_half
+        if target[6] == "-":
             complete_DNA_seq = reverse_complement_table(complete_DNA_seq)
         # if 'N' is present in the reference DNA seq, we must use a fake DNA seq to complete the aligned
         # that will be discarded after
-        if 'N' in complete_DNA_seq or 'n' in complete_DNA_seq or 'N' in DNA_aligned_list[-1] or 'n' in DNA_aligned_list[-1]:
-            complete_DNA_seq = 'A'*29
-            DNA_aligned_list[-1] = 'A'*len(str(target[2]))
+        if (
+            "N" in complete_DNA_seq
+            or "n" in complete_DNA_seq
+            or "N" in DNA_aligned_list[-1]
+            or "n" in DNA_aligned_list[-1]
+        ):
+            complete_DNA_seq = "A" * 29
+            DNA_aligned_list[-1] = "A" * len(str(target[2]))
             index_to_null.append(index)
         # append sequence to DNA list
@@ -524,7 +594,8 @@
     crista_score_list_ref = list()
     if do_scores:
         crista_score_list_ref = CRISTA_predict_list(
-            sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list)
+            sgRNA_non_aligned_list, DNA_aligned_list, DNAseq_from_genome_list
+        )
 
     for index, target in enumerate(cluster_targets):
         target_CRISTA = target.copy()
@@ -536,15 +607,11 @@
         else:
             # else report the correct score
             if target_CRISTA[-2] == 55:  # reference target have duplicate score
-                target_CRISTA[-2] = "{:.3f}".format(
-                    crista_score_list_alt[index])
-                target_CRISTA.append("{:.3f}".format(
-                    crista_score_list_alt[index]))
+                target_CRISTA[-2] = "{:.3f}".format(crista_score_list_alt[index])
+                target_CRISTA.append("{:.3f}".format(crista_score_list_alt[index]))
             if target_CRISTA[-2] == 33:  # alternative target scoring
-                target_CRISTA[-2] = "{:.3f}".format(
-                    crista_score_list_ref[index])
-                target_CRISTA.append("{:.3f}".format(
-                    crista_score_list_alt[index]))
+                target_CRISTA[-2] = "{:.3f}".format(crista_score_list_ref[index])
+                target_CRISTA.append("{:.3f}".format(crista_score_list_alt[index]))
         # append to final score cluster
         cluster_scored.append(target_CRISTA)
@@ -565,8 +632,7 @@
     # process score for each target in cluster, at the same time to improve execution time
     cluster_with_CRISTA_score = preprocess_CRISTA_score(cluster_to_save)
-
-    #REMOVED TO CHECK IF FILE IS RETURN WITH IDENTICAL ROWS COUNT
+    # REMOVED TO CHECK IF FILE IS RETURN WITH IDENTICAL ROWS COUNT
     # analyze CFD scored targets, returning for each guide,chr,cluster_pos the highest scoring target (or multiple in case of equal)
     # df_CFD = pd.DataFrame(cluster_with_CFD_score, columns=['Bulge_type', 'crRNA', 'DNA', 'Chromosome',
@@ -619,16 +685,16 @@
 # INPUT AND SETTINGS
 # fasta of the reference chromosome
-inFasta = open(sys.argv[1], 'r')
-current_chr = inFasta.readline().strip().replace('>', '')  # lettura fasta del chr
+inFasta = open(sys.argv[1], "r")
+current_chr = inFasta.readline().strip().replace(">", "")  # lettura fasta del chr
 genomeStr = inFasta.readlines()  # lettura fasta del chr
-genomeStr = ''.join(genomeStr).upper()
+genomeStr = "".join(genomeStr).upper()
 # string of the whole chromosome on single line
-genomeStr = genomeStr.replace('\n', '')
+genomeStr = genomeStr.replace("\n", "")
 # targets clusterized by chr and ordered by position
-inTarget = open(sys.argv[3], 'r')
+inTarget = open(sys.argv[3], "r")
 # text file with PAM sequence and length
-inPAMfile = open(sys.argv[4], 'r')
+inPAMfile = open(sys.argv[4], "r")
 # outfile path
 outputFile = sys.argv[5]
 # max allowed mismatches in search (to validate ref targets in alternative case)
@@ -636,44 +702,44 @@
 # column of bulges count
 bulge_pos = 8
 # header to insert into final file
-header = '#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tReference'
+header = "#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tReference"
 # cfd graphs pre-processing (deprecated)
-cfd_for_graph = {'ref': [0]*101, 'var': [0]*101}
+cfd_for_graph = {"ref": [0] * 101, "var": [0] * 101}
 
 # OUT BEST FILES FOR EACH SCORING SYSTEM
 # file with best CFD targets
-cfd_best = open(outputFile + '.bestCFD.txt', 'w+')
-cfd_best.write(header + '\tCFD\n')  # Write header
+cfd_best = open(outputFile + ".bestCFD.txt", "w+")
+cfd_best.write(header + "\tCFD\n")  # Write header
 # file with best mm+bul targets
-mmblg_best = open(outputFile + '.bestmmblg.txt', 'w+')
-mmblg_best.write(header + '\tCFD\n')  # Write header
+mmblg_best = open(outputFile + ".bestmmblg.txt", "w+")
+mmblg_best.write(header + "\tCFD\n")  # Write header
 # file with best CRISTA targets
-crista_best = open(outputFile + '.bestCRISTA.txt', 'w+')
-crista_best.write(header + '\tCFD\n')  # Write header
+crista_best = open(outputFile + ".bestCRISTA.txt", "w+")
+crista_best.write(header + "\tCFD\n")  # Write header
 
 # check if dictionaries has haplotypes
 haplotype_check = False
 try:
-    inDict = open(sys.argv[2], 'r')
+    inDict = open(sys.argv[2], "r")
     mydict = json.load(inDict)
     for entry in mydict:
-        if '|' in mydict[entry]:
+        if "|" in mydict[entry]:
             haplotype_check = True
             break
-        elif '/' in mydict[entry]:
+        elif "/" in mydict[entry]:
             break
-    print('Haplotype processing', haplotype_check)
+    print("Haplotype processing", haplotype_check)
 except:
     print("No dict found for", current_chr)
 
 # check PAM position and relative coordinates on targets
 pam_at_beginning = False
 line = inPAMfile.read().strip()
-pam = line.split(' ')[0]
-len_pam = int(line.split(' ')[1])
+pam = line.split(" ")[0]
+len_pam = int(line.split(" ")[1])
 guide_len = len(pam) - len_pam
 pos_beg = 0
 pos_end = None
@@ -689,7 +755,7 @@
     pam_end = len_pam
     pam_at_beginning = True
 else:
-    pam = pam[(len_pam * (-1)):]
+    pam = pam[(len_pam * (-1)) :]
     pos_beg = 0
     pos_end = len_pam * (-1)
     pam_begin = len_pam * (-1)
@@ -721,23 +787,22 @@
 # read lines from target file
 for line in inTarget:
     # split target into list
-    split = line.strip().split('\t')
+    split = line.strip().split("\t")
     # sgRNA sequence (with bulges and PAM)
     guide = split[1]
     # found target on DNA (with bulges, mismatches and PAM)
     target = split[2]
-    guide_no_bulge = split[1].replace('-', '')
+    guide_no_bulge = split[1].replace("-", "")
     guide_no_pam = guide[pos_beg:pos_end]
     # check if targets cointains IUPAC nucleotide
     if any((c in iupac_nucleotides) for c in target):
-        iupac_decomposition(split, guide_no_bulge,
-                            guide_no_pam, cluster_to_save)
+        iupac_decomposition(split, guide_no_bulge, guide_no_pam, cluster_to_save)
     else:
         # process_iupac = False
         # append to respect file format for post analysis
         # null ref sequence
-        split.append('n')
+        split.append("n")
         # specific value to represent a ref target to avoid recount score
         split.append(55)
         # count of mm_bul for ref sequence in case of alternative target
@@ -754,17 +819,14 @@
                     # remove count of tmp_mms
                     target.pop(-2)
                     # save CFD targets
-                    cfd_best.write(
-                        '\t'.join(target)+'\t'+str(0)+'\n')
+                    cfd_best.write("\t".join(target) + "\t" + str(0) + "\n")
                     # save mm-bul targets
-                    mmblg_best.write(
-                        '\t'.join(target)+'\t'+str(0)+'\n')
+                    mmblg_best.write("\t".join(target) + "\t" + str(0) + "\n")
                 if count == 1:  # CRISTA target
                     # remove count of tmp_mms
                     target.pop(-2)
                     # save CRISTA targets
-                    crista_best.write('\t'.join(target)+'\t' +
-                                      str(0)+'\n')
+                    crista_best.write("\t".join(target) + "\t" + str(0) + "\n")
 
         cluster_to_save = list()
@@ -777,14 +839,26 @@
     mmblg_best.close()
     crista_best.close()
     # rewrite header file
-    os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCFD.txt')
-    os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestmmblg.txt')
-    os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCRISTA.txt')
+    os.system(
+        "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "
+        + outputFile
+        + ".bestCFD.txt"
+    )
+    os.system(
+        "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "
+        + outputFile
+        + ".bestmmblg.txt"
+    )
+    os.system(
+        "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "
+        + outputFile
+        + ".bestCRISTA.txt"
+    )
     # cfd dataframe write
     cfd_dataframe = pd.DataFrame.from_dict(cfd_for_graph)
-    cfd_dataframe.to_csv(outputFile + '.CFDGraph.txt', sep='\t', index=False)
+    cfd_dataframe.to_csv(outputFile + ".CFDGraph.txt", sep="\t", index=False)
 
     # print complete and exit with no error
-    print('ANALYSIS COMPLETE IN', time.time() - global_start)
+    print("ANALYSIS COMPLETE IN", time.time() - global_start)
     exit(0)
 
 # process cluster of targets if less then 1mln rows total
@@ -797,28 +871,37 @@
             # remove count of tmp_mms
             target.pop(-2)
             # save CFD targets
-            cfd_best.write(
-                '\t'.join(target)+'\t'+str(0)+'\n')
+            cfd_best.write("\t".join(target) + "\t" + str(0) + "\n")
             # save mm-bul targets
-            mmblg_best.write(
-                '\t'.join(target)+'\t'+str(0)+'\n')
+            mmblg_best.write("\t".join(target) + "\t" + str(0) + "\n")
         if count == 1:  # CRISTA target
             # remove count of tmp_mms
             target.pop(-2)
             # save CRISTA targets
-            crista_best.write('\t'.join(target)+'\t' +
-                              str(0)+'\n')
+            crista_best.write("\t".join(target) + "\t" + str(0) + "\n")
 
 cfd_best.close()
 mmblg_best.close()
 crista_best.close()
 
-os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCFD.txt')
-os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestmmblg.txt')
-os.system("sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "+outputFile + '.bestCRISTA.txt')
+os.system(
+    "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "
+    + outputFile
+    + ".bestCFD.txt"
+)
+os.system(
+    "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "
+    + outputFile
+    + ".bestmmblg.txt"
+)
+os.system(
+    "sed -i '1s/.*/#Bulge_type\tcrRNA\tDNA\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\tReference\tCFD_ref\tCFD\t#Seq_in_cluster/' "
+    + outputFile
+    + ".bestCRISTA.txt"
+)
 
 cfd_dataframe = pd.DataFrame.from_dict(cfd_for_graph)
-cfd_dataframe.to_csv(outputFile + '.CFDGraph.txt', sep='\t', index=False)
+cfd_dataframe.to_csv(outputFile + ".CFDGraph.txt", sep="\t", index=False)
 
-print('ANALYSIS COMPLETE IN', time.time() - global_start)
+print("ANALYSIS COMPLETE IN", time.time() - global_start)
diff --git a/PostProcess/pool_post_analisi_snp.py b/PostProcess/pool_post_analisi_snp.py
index 36f2e51..71596c9 100755
--- a/PostProcess/pool_post_analisi_snp.py
+++ b/PostProcess/pool_post_analisi_snp.py
@@ -22,18 +22,16 @@
 def start_analysis(f):
     # splitted = f.split('.')
-    chrom = str(f).replace('.fa', '')
+    chrom = str(f).replace(".fa", "")
     # for elem in splitted:
     #     if "chr" in elem:
     #         chrom = elem
     os.system(
-        f"./post_analisi_snp.sh
\"{output_folder}\" \"{ref_folder}\" \"{vcf_name}\" \"{guide_file}\" \"{mm}\" \"{bDNA}\" \"{bRNA}\" {annotation_file} {pam_file} {dict_folder} {final_res} {final_res_alt} {chrom}") + f'./post_analisi_snp.sh "{output_folder}" "{ref_folder}" "{vcf_name}" "{guide_file}" "{mm}" "{bDNA}" "{bRNA}" {annotation_file} {pam_file} {dict_folder} {final_res} {final_res_alt} {chrom}' + ) -chrs = [] -for f in os.listdir(ref_folder): - if '.fa' in f and '.fai' not in f: - chrs.append(f) +chrs = [f for f in os.listdir(ref_folder) if ".fa" in f and ".fai" not in f] # t = 6 # if ncpus < 6: diff --git a/PostProcess/post_analisi_indel.sh b/PostProcess/post_analisi_indel.sh index 16aaf9a..d117a12 100755 --- a/PostProcess/post_analisi_indel.sh +++ b/PostProcess/post_analisi_indel.sh @@ -1,14 +1,5 @@ #!/bin/bash -# set -e - -output_folder=$1 -ref_folder=$2 -ref_name=$(basename $2) -vcf_folder=$3 #!/bin/bash - -set -e - output_folder=$1 ref_folder=$2 ref_name=$(basename $2) @@ -40,44 +31,41 @@ awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_ta header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -vcf_name=$(basename $3) -guide_file=$4 -guide_name=$(basename $4) -mm=$5 -bDNA=$6 -bRNA=$7 -annotation_file=$8 -annotation_name=$(basename $8) -pam_file=$9 -pam_name=$(basename $9) -# sampleID=${10} -dict_folder=${10} +# vcf_name=$(basename $3) +# guide_file=$4 +# guide_name=$(basename $4) +# mm=$5 +# bDNA=$6 +# bRNA=$7 +# annotation_file=$8 +# annotation_name=$(basename $8) +# pam_file=$9 +# pam_name=$(basename $9) +# # sampleID=${10} +# dict_folder=${10} -final_res=${11} -final_res_alt=${12} +# final_res=${11} +# final_res_alt=${12} -key=${13} +# key=${13} -echo "Processing INDELs results for $key, starting post-analysis" -true_chr=$key -fake_chr="fake$true_chr" +# echo "Processing INDELs results for $key, starting post-analysis" +# true_chr=$key +# fake_chr="fake$true_chr" -# create 
file to prevent void grep failing -touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -# create file to prevent void grep failing -touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) -sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# # create file to prevent void grep failing +# touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# # create file to prevent void grep failing +# touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" +# header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt) +# sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" -./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +# ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { +# echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 +# exit 1 +# } diff --git a/PostProcess/post_analisi_snp.sh b/PostProcess/post_analisi_snp.sh index cfb2d28..75e328f 100755 --- a/PostProcess/post_analisi_snp.sh +++ b/PostProcess/post_analisi_snp.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all command failures - output_folder=$1 ref_folder=$2 ref_name=$(basename $2) @@ -30,10 +28,7 @@ if ! 
[ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_ else awk -v key="$key" '$0 ~ key' "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" fi -./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "${dict_folder}/my_dict_${key}.json" "${ref_folder}/${key}.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.${key}" "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "${dict_folder}/my_dict_${key}.json" "${ref_folder}/${key}.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" rm "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" # header=$(head -1 "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestMerge.txt") diff --git a/PostProcess/post_analysis_only.sh b/PostProcess/post_analysis_only.sh index 1d2c6e8..0dd9053 100755 --- a/PostProcess/post_analysis_only.sh +++ b/PostProcess/post_analysis_only.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all failures - #file for automated search of guide+pam in reference and variant genomes ref_folder=$(realpath $1) @@ -121,10 +119,7 @@ if [ "$vcf_name" != "_" ]; then exit fi - ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID $dict_folder $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: indels postprocessing failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID $dict_folder $final_res $final_res_alt $ncpus echo "Post-analysis SNPs End: "$(date +%F-%T) >>$output_folder/$log @@ -155,10 +150,7 @@ else echo "Processing $key" awk -v key="$key" '$0 ~ key { print }' "$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/${ref_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" touch "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" - ./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" 
"${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "_" "$ref_folder" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$sampleID" "$output_folder" || { - echo "CRISPRme ERROR: analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ./scriptAnalisiNNN_v3.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" "${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key" "$annotation_file" "_" "$ref_folder" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$sampleID" "$output_folder" rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" rm "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$key" header=$(head -1 "$output_folder/${ref_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestMerge.txt") @@ -180,10 +172,7 @@ if [ "$vcf_name" != "_" ]; then cd "$starting_dir" echo "Post-analysis INDELs Start: "$(date +%F-%T) >>$output_folder/$log - ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID "$output_folder/log_indels_$vcf_name" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR:indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $sampleID "$output_folder/log_indels_$vcf_name" $final_res $final_res_alt $ncpus echo "Post-analysis INDELs End: "$(date +%F-%T) >>$output_folder/$log for key in "${array_fake_chroms[@]}"; do tail -n +2 "$output_folder/${key}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestMerge.txt" >>"$final_res" #"$output_folder/${fake_chr}_${guide_name}_${mm}_${bDNA}_${bRNA}.bestCFD.txt.tmp" @@ -197,10 +186,7 @@ fi cd "$starting_dir" echo "Merging Close Targets Start: "$(date +%F-%T) >>$output_folder/$log -./merge_close_targets_cfd.sh $final_res $final_res.trimmed $merge_t || { - echo "CRISPRme ERROR: CFD targets merge failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./merge_close_targets_cfd.sh $final_res $final_res.trimmed $merge_t mv $final_res.trimmed $final_res mv $final_res.trimmed.discarded_samples $final_res_alt @@ -209,10 +195,7 @@ mv $final_res.trimmed.discarded_samples $final_res_alt echo "Merging Close Targets End: "$(date +%F-%T) >>$output_folder/$log echo "Merging Alternative Chromosomes Start: "$(date +%F-%T) >>$output_folder/$log -./merge_alt_chr.sh $final_res $final_res.chr_merged || { - echo "CRISPRme ERROR: alternative targets merge failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./merge_alt_chr.sh $final_res $final_res.chr_merged #rm $final_res.trimmed #./merge_alt_chr.sh $final_res_alt.trimmed $final_res_alt.trimmed.chr_merged @@ -230,10 +213,7 @@ echo "Cleaning directory" if ! 
[ -d "$output_folder/cfd_graphs" ]; then mkdir $output_folder/cfd_graphs fi -./assemble_cfd_graphs.py $output_folder || { - echo "CRISPRme ERROR: CFD graph creation failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./assemble_cfd_graphs.py $output_folder mv $output_folder/snps.CFDGraph.txt $output_folder/cfd_graphs mv $output_folder/indels.CFDGraph.txt $output_folder/cfd_graphs diff --git a/PostProcess/post_process.sh b/PostProcess/post_process.sh index 79fec33..70f978d 100755 --- a/PostProcess/post_process.sh +++ b/PostProcess/post_process.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e - #$1 is CFD targets file #$2 is genecode annotation #$3 is empirical data for the guide @@ -42,10 +40,7 @@ echo 'Sorting final annotation results using original NR to correspond with orig # LC_ALL=C sort -T $dir -k4,4rg $1.found.bed -o $1.found.bed LC_ALL=C sort -T $dir -k4,4n $1.found.bed -o $1.found.bed echo 'Starting integration with empirical data (this may take a while)' -"$starting_dir"./resultIntegrator.py $1 $3 $1.found.bed $4 $6/ true $5 $7 $9 ${10} ${11} || { - echo "CRISPRme ERROR: result integration failed (script: ${0} line $((LINENO-1)))" >&2 - exit 1 -} +"$starting_dir"./resultIntegrator.py $1 $3 $1.found.bed $4 $6/ true $5 $7 $9 ${10} ${11} echo 'Removing unnecessary files' rm -f $1.bed $1.found.bed $1.redirectFile.out $1.temp.bed # sed -i 1i"#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score" "$1" diff --git a/PostProcess/remove_contiguous_samples_cfd.py b/PostProcess/remove_contiguous_samples_cfd.py old mode 100755 new mode 100644 index ca80ac6..20739de --- a/PostProcess/remove_contiguous_samples_cfd.py +++ b/PostProcess/remove_contiguous_samples_cfd.py @@ -1,680 +1,219 @@ +#!/usr/bin/env python """ -This script parses the input arguments and removes contiguous samples from a file based on specified criteria. +Created on Fri Aug 28 15:58:04 2020 -The script expects the following input arguments: -- infname (str): The input file name. -- outfname (str): The output file name. -- rangebp (int): The maximum range in base pairs for samples to be considered contiguous. -- chrom_idx (int): The index of the chromosome column in the input file. -- position_idx (int): The index of the position column in the input file. -- mm_bul_count_idx (int): The index of the mismatch and bulge count column in the input file. -- guide_idx (int): The index of the guide column in the input file. -- snp_info_idx (int): The index of the SNP info column in the input file. -- score_idx (int): The index of the score column in the input file. -- sort_criterion (str): The sorting criterion for the output file. - -The script reads the input file, parses the specified columns, and removes contiguous samples based on the given range. The resulting data is written to the output file. 
- -Examples: - $ python remove_contiguous_samples_cfd.py input.txt output.txt 10 2 3 4 5 6 7 8 score +@author: francesco """ - -from typing import List, Tuple, Dict, Callable, Union -from io import TextIOWrapper -from time import time - import sys -import os - -INPUT_ARG_COUNT = 12 -FLAG_ALT_ONLY = 12 -SORTING_PIVOTS = ["score", "total"] -SORTING_CRITERIA = {"mm+bulges": 0, "mm": 2, "bulges": 1} - - -class MergeTargets: - """ - Represents a class for merging targets with specified criteria for sorting - and indexing. - - Args: - args (List[str]): List of arguments containing input parameters for - target merging. +import time - Returns: - None - Raises: - FileNotFoundError: If the input targets file is not found. - ValueError: If there are issues with the specified merge range, column - indices, or sorting criteria. - """ - - def __init__(self, args: List[str]) -> None: - self._infname = args[0] # input targets - if not os.path.exists(self._infname) or not os.path.isfile(self._infname): - raise FileNotFoundError(f"{self._infname} not found") - self._outfname = args[1] # output merged targets - self._rangebp = int(args[2]) # merge bp range - if self._rangebp <= 0: - raise ValueError(f"Invalid merge range ({self._rangebp})") - self._chromidx = int(args[3]) - 1 # chromosome index - if self._chromidx != 4: - raise ValueError( - f"Chromosome data is expected on column 5, got {self._chromidx}" - ) - self._posidx = int(args[4]) - 1 # position index - if self._posidx != 6: - raise ValueError( - f"Position data is expected on column 7, got {self._posidx}" +def get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info): + # avoid crash when cluster is empty in the first call + if not cluster: + return + list_ref = list() + dict_var = dict() + for ele in cluster: + if ele[snp_info] == "n": + list_ref.append(ele) + else: + # merge samples of identical targets (coming from different VCF datasets) + if (ele[pos], ele[snp_info]) in dict_var.keys(): + dict_var[(ele[pos], ele[snp_info])][0][true_guide - 2] = ( + dict_var[(ele[pos], ele[snp_info])][0][true_guide - 2] + + "," + + ele[true_guide - 2] + ) # true_guide - 2 points to samples column + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 2] = ( + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 2] + + "," + + ele[snp_info - 2] + ) # snp_info - 2 points to rsID column + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 1] = ( + dict_var[(ele[pos], ele[snp_info])][0][snp_info - 1] + + "," + + ele[snp_info - 1] + ) # snp_info - 1 points to AF column + else: + dict_var[(ele[pos], ele[snp_info])] = [ele] + + final_list_best_ref = list() + var_only = False + for target in list_ref: + final_list_best_ref.append(target) + if not final_list_best_ref: + var_only = True + + final_list_best_var = list() + # for each snp_info in dict, extract the targets + for key in dict_var.keys(): + list_var = dict_var[key] + # copy the targets into the variant list, marking them as variant-unique if no ref target is found + for target in list_var: + if var_only: + target[12] = "y" + final_list_best_var.append(target) + + temp_final_list_best_var = list() + # for target in final_list_best_var: + for target in final_list_best_var: + # remove duplicates from the snp info col + target[snp_info] = ",".join(set(target[snp_info].split(","))) + # remove duplicates from the rsID col + target[snp_info - 2] = ",".join(set(target[snp_info - 2].split(","))) + # remove duplicates from the AF col + target[snp_info - 1] = ",".join(set(target[snp_info - 1].split(","))) + # remove duplicates from the samples col +
target[true_guide - 2] = ",".join(set(target[true_guide - 2].split(","))) + # append to temp list + temp_final_list_best_var.append(target) + + # final list with polished targets (no duplicates in snp data) + final_list_best_var = temp_final_list_best_var + + # check if lists are empty + validity_check_ref = False + validity_check_var = False + if final_list_best_ref: + validity_check_ref = True + if final_list_best_var: + validity_check_var = True + + # extract the best target for each criterion + if sort_order == "score": + # sort per score (CFD or CRISTA) + if validity_check_ref: + final_list_best_ref = sorted( + final_list_best_ref, key=lambda x: (-float(x[cfd]), int(x[total])) ) - self._mmb_idx = int(args[5]) - 1 # mm+bulges index - if self._mmb_idx != 10: - raise ValueError( - f"MM+Bulges data is expected on column 11, got {self._mmb_idx}" + if validity_check_var: + final_list_best_var = sorted( + final_list_best_var, key=lambda x: (-float(x[cfd]), int(x[total])) ) - self._guide_idx = int(args[6]) - 1 # guide index - if self._guide_idx != 15: - raise ValueError( - f"Guide data is expected on column 16, got {self._guide_idx}" + if var_only: # no ref found + # count the residual targets in the list + final_list_best_var[0][cfd - 1] = str(len(final_list_best_var) - 1) + # append the best target to best_file + fileOut.write("\t".join(final_list_best_var[0])) + # pop the best target from the list + bestTarget = final_list_best_var.pop(0) + elif validity_check_ref and validity_check_var: # ref and var targets found + if float(final_list_best_ref[0][cfd]) >= float(final_list_best_var[0][cfd]): + final_list_best_ref[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + else: + final_list_best_var[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut.write("\t".join(final_list_best_var[0])) + bestTarget = final_list_best_var.pop(0) + else: # only ref + final_list_best_ref[0][cfd - 1] = str(len(final_list_best_ref) - 1) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + # write all the remaining targets in the alt file + for count, elem in enumerate(final_list_best_ref): + final_list_best_ref[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 ) - self._snp_idx = int(args[7]) - 1 # snp info index - if self._snp_idx != 18: - raise ValueError(f"SNP data is expected on column 19, got {self._snp_idx}") - self._score_idx = int(args[8]) - 1 # score index - if self._score_idx != 20: - raise ValueError( - f"Score data is expected on column 21, got {self._score_idx}" + fileOut_disc.write(("\t".join(elem))) + for count, elem in enumerate(final_list_best_var): + final_list_best_var[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 ) - self._sort_criterion = args[9] # sorting criterion (pivot) - if self._sort_criterion not in SORTING_PIVOTS: - raise ValueError( - f"Allowed sort pivots: {SORTING_PIVOTS}, got {self._sort_criterion}" + fileOut_disc.write(("\t".join(elem))) + else: + # sort for total (mm+bul) in target + if validity_check_ref: + final_list_best_ref = sorted( + final_list_best_ref, + key=lambda x: (int(x[total - 2]), int(x[total - 1])), ) - self._sorting_criteria_scoring = args[10] # sorting criteria (score is pivot) - self._sorting_criteria = args[11] # sorting criteria (mm+bulges is pivot) - - @property - def targets_fname(self) -> str:
return self._infname - - @property - def targets_fname_merged(self) -> str: - return self._outfname - - @property - def targets_fname_discarded(self) -> str: - return f"{self._outfname}.discarded_samples" - - @property - def rangebp(self) -> int: - return self._rangebp - - @property - def chromidx(self) -> int: - return self._chromidx - - @property - def posidx(self) -> int: - return self._posidx - - @property - def mmbidx(self) -> int: - return self._mmb_idx - - @property - def guideidx(self) -> int: - return self._guide_idx - - @property - def snpidx(self) -> int: - return self._snp_idx - - @property - def scoreidx(self) -> int: - return self._score_idx - - @property - def sort_pivot(self) -> str: - return self._sort_criterion - - @property - def sorting_criteria_scoring(self) -> str: - return self._sorting_criteria_scoring - - @property - def sorting_criteria(self) -> str: - return self._sorting_criteria - - -def parse_input_args(args: List[str]) -> MergeTargets: - """ - Parses the input arguments to create an instance of MergeTargets for handling - target merging based on the provided arguments. - - Args: - args (List[str]): List of input arguments to be parsed for target merging. - - Returns: - MergeTargets: An instance of MergeTargets class for further processing. - """ - - if not args: - raise ValueError("No input argument provided") - if len(args) != INPUT_ARG_COUNT: - raise ValueError(f"Expected {INPUT_ARG_COUNT} arguments, got {len(args)}") - return MergeTargets(args) # parse input arguments - - -def initialize_targets_cluster(rangebp: int) -> Tuple[int, str, str, str, List[str]]: - """ - Initializes the targets cluster by setting the previous cluster position, - guide, chromosome, SNP, and an open starting cluster. - - Args: - rangebp (int): The range of base pairs for the cluster. - - Returns: - Tuple[int, str, str, str, List[str]]: Tuple containing the initialized - cluster parameters. - """ - - pos_prev = -(rangebp + 1) # previous cluster position - guide_prev = "" # previous cluster's guide - chrom_prev = "" # previous cluster's chromosome - snp_prev = "" # previous cluster snp - cluster = [] # open starting cluster - return pos_prev, guide_prev, chrom_prev, snp_prev, cluster - - -def parse_target(target: str) -> List[str]: - """ - Parses the fields of a target string and returns a list of the parsed fields. - - Args: - target (str): The target string to be parsed. - - Returns: - List[str]: List of parsed fields extracted from the target string. - """ - - # parse target's fields - return target.strip().split() - - -def open_targets_cluster( - guide: str, - guide_prev: str, - chrom: str, - chrom_prev: str, - pos: int, - pos_prev: int, - rangebp: int, -) -> bool: - """ - Determines whether a new targets cluster should be opened based on changes - in guide, chromosome, and position within a specified range of base pairs. - - Args: - guide (str): Current guide. - guide_prev (str): Previous guide. - chrom (str): Current chromosome. - chrom_prev (str): Previous chromosome. - pos (int): Current position. - pos_prev (int): Previous position. - rangebp (int): Range of base pairs for cluster merging. - - Returns: - bool: True if a new targets cluster should be opened, False otherwise. 
- """ - - return guide != guide_prev or chrom != chrom_prev or pos - pos_prev > rangebp - - -def cluster_targets_by_pos_snp( - cluster: List[List[str]], guideidx: int, snpidx: int, posidx: int -) -> Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]]: - """ - Clusters targets based on position and SNP information, separating them into - reference and alternative targets. - - Args: - cluster (List[List[str]]): List of target clusters. - guideidx (int): Index of the guide. - snpidx (int): Index of the SNP information. - posidx (int): Index of the position. - - Returns: - Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]: A tuple - containing the reference targets list and a dictionary of alternative - targets grouped by position and SNP information. - """ - - # initialize reference and alternative targets lists - reference_targets, alternative_targets = [], {} - samplesidx, snpididx, afidx = guideidx - 2, snpidx - 2, snpidx - 1 - for target in cluster: - if target[snpidx] == "n": # reference target - reference_targets.append(target) - continue - # alternative target - pos_t, snp_info_t = target[posidx], target[snpidx] - if (pos_t, snp_info_t) in alternative_targets: - alternative_targets[(pos_t, snp_info_t)][0][ - samplesidx - ] += f",{target[samplesidx]}" # add samples - alternative_targets[(pos_t, snp_info_t)][0][ - snpididx - ] += f",{target[snpididx]}" # add snp ids - alternative_targets[(pos_t, snp_info_t)][0][ - afidx - ] += f",{target[afidx]}" # add allele frequencies - else: - alternative_targets[(pos_t, snp_info_t)] = [target] - return reference_targets, alternative_targets - - -def recover_alternative_targets_list( - alternative_targets: Dict[Tuple[int, str], List[List[str]]], noref: bool -) -> List[List[str]]: - """ - Recovers the list of alternative targets from a dictionary of alternative - targets grouped by position and SNP information. - - Args: - alternative_targets (Dict[Tuple[int, str], List[List[str]]): Dictionary - of alternative targets grouped by position and SNP information. - noref (bool): Flag indicating whether to include reference targets. - - Returns: - List[List[str]]: List of alternative targets with updated flags. - """ - - alternative_targets_list = [] # initialize the list - for v in alternative_targets.values(): - for target in v: - # check whether the target is uniquely found on alternative sequences - target[FLAG_ALT_ONLY] = "y" if noref else target[FLAG_ALT_ONLY] - alternative_targets_list.append(target) - return alternative_targets_list - - -def remove_duplicates(target: List[str], idx: int) -> str: - """ - Removes duplicates from a target list at the specified index and returns the - unique values as a comma-separated string. - - Args: - target (List[str]): List of target values. - idx (int): Index of the target list to remove duplicates from. - - Returns: - str: Comma-separated string of unique values after removing duplicates. - """ - - return ",".join(set(target[idx].split(","))) - - -def unique_values( - targets: List[List[str]], snpidx: int, guideidx: int -) -> List[List[str]]: - """ - Returns a list of targets with unique values for SNP information, SNP ID, - allele frequency, and samples. - - Args: - targets (List[List[str]]): List of target clusters. - snpidx (int): Index of the SNP information. - guideidx (int): Index of the guide. - - Returns: - List[List[str]]: List of targets with unique values for specified fields. 
- """ - - snpididx, afidx, samplesidx = snpidx - 2, snpidx - 1, guideidx - 2 - targets_filt = [] - # remove duplicate values - for target in targets: - target[snpidx] = remove_duplicates(target, snpidx) # snp info - target[snpididx] = remove_duplicates(target, snpididx) # snp id - target[afidx] = remove_duplicates(target, afidx) # af - target[samplesidx] = remove_duplicates(target, samplesidx) # samples - targets_filt.append(target) # update target - return targets_filt - - -def construct_cluster( - cluster: List[List[str]], - guideidx: int, - snpidx: int, - posidx: int, - scoreidx: int, - mmbidx: int, - sort_pivot: str, - sorting_criteria: str, - sorting_criteria_scoring: str, - outfile: TextIOWrapper, - outfiledisc: TextIOWrapper, -) -> None: - """ - Constructs target clusters by processing and sorting reference and - alternative targets based on specified criteria, and writes the results to - output files. - - Args: - cluster (List[List[str]]): List of target clusters. - guideidx (int): Index of the guide. - snpidx (int): Index of the SNP information. - posidx (int): Index of the position. - scoreidx (int): Index of the score. - mmbidx (int): Index of the MM+Bulges. - sort_pivot (str): Sorting pivot. - sorting_criteria (str): Sorting criteria. - sorting_criteria_scoring (str): Sorting criteria for scoring. - outfile (TextIOWrapper): Output file for reported alignments. - outfiledisc (TextIOWrapper): Output file for alternative alignments. - - Returns: - None - """ - - if not cluster: # avoid crashes when cluster is empty - return - # recover reference and alternative targets - reference_targets, alternative_targets = cluster_targets_by_pos_snp( - cluster, guideidx, snpidx, posidx - ) - noref = ( - not reference_targets - ) # check whether only alternative targets have been found - # recover alternative targets list - alternative_targets = recover_alternative_targets_list(alternative_targets, noref) - # remove duplicates values on snp info, snp id, af, and samples columns - alternative_targets = unique_values(alternative_targets, snpidx, guideidx) - # sort targets - score = sort_pivot == SORTING_PIVOTS[0] - sorting_criteria = sorting_criteria_scoring if score else sorting_criteria - criteria = initialize_sorting_criteria(sorting_criteria, score, scoreidx, mmbidx) - if reference_targets: - reference_targets = sort_targets(reference_targets, criteria) - if alternative_targets: - alternative_targets = sort_targets(alternative_targets, criteria) - # write targets to reported or alternative alignments files - write_best_targets( - reference_targets, - alternative_targets, - noref, - score, - scoreidx, - mmbidx, - outfile, - outfiledisc, - ) - - -def write_best_targets( - reference_targets: List[List[str]], - alternative_targets: List[List[str]], - noref: bool, - score: bool, - scoreidx: int, - mmbidx: int, - outfile: TextIOWrapper, - outfiledisc: TextIOWrapper, -) -> None: - """ - Writes the best targets to the reported alignments file and alternative - alignments file based on specified criteria. - - Args: - reference_targets (List[List[str]]): List of reference targets. - alternative_targets (List[List[str]]): List of alternative targets. - noref (bool): Flag indicating if no reference target is found. - score (bool): Flag indicating if scoring is used for comparison. - scoreidx (int): Index of the score. - mmbidx (int): Index of the MM+Bulges. - outfile (TextIOWrapper): Output file for reported alignments. - outfiledisc (TextIOWrapper): Output file for alternative alignments. 
- - Returns: - None - """ - - if noref: # no reference target found - target = alternative_targets.pop(0) # pop best target - target[scoreidx - 1] = str(len(alternative_targets)) - elif reference_targets and alternative_targets: # targets found in both - if score: - target = ( - alternative_targets.pop(0) - if float(reference_targets[0][scoreidx]) - < float(alternative_targets[0][scoreidx]) - else reference_targets.pop(0) + if validity_check_var: + final_list_best_var = sorted( + final_list_best_var, + key=lambda x: (int(x[total - 2]), int(x[total - 1])), ) - elif int(alternative_targets[0][mmbidx]) < int(reference_targets[0][mmbidx]): - target = alternative_targets.pop(0) - else: - target = reference_targets.pop(0) - target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) - else: # no alternative target found - target = reference_targets.pop(0) - target[scoreidx - 1] = str(len(reference_targets)) - target = "\t".join(target) - outfile.write(f"{target}\n") - # write the alternative alignments targets - for target in reference_targets + alternative_targets: - target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets)) - target = "\t".join(target) - outfiledisc.write(f"{target}\n") - - -def sort_targets(targets: List[List[str]], criteria: Callable) -> List[List[str]]: - """ - Sorts the list of targets based on the specified criteria function. - - Args: - targets (List[List[str]]): List of targets to be sorted. - criteria (Callable): Sorting criteria function. - - Returns: - List[List[str]]: Sorted list of targets based on the provided criteria. - """ - - return sorted(targets, key=criteria) - - -def sorting_score(criteria: List[str], score_idx: int, mmb_idx: int) -> Callable: - """ - Returns a sorting function based on the specified criteria for scoring, - MM+Bulges index, and multiple criteria. - - Args: - criteria (List[str]): List of sorting criteria. - score_idx (int): Index of the score. - mmb_idx (int): Index of the MM+Bulges. - - Returns: - Callable: Sorting function based on the provided criteria. - """ - - if len(criteria) == 1: # single criterion - return lambda x: ( - -float(x[score_idx]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - ) - elif len(criteria) == 2: - return lambda x: ( - -float(x[score_idx]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - ) - # base case (all three ) - return lambda x: ( - -float(x[score_idx]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), - ) - - -def sorting_fewest(criteria: List[str], mmb_idx: int) -> Callable: - """ - Returns a sorting function based on the specified criteria for the fewest - MM+Bulges index. - - Args: - criteria (List[str]): List of sorting criteria. - mmb_idx (int): Index of the MM+Bulges. - - Returns: - Callable: Sorting function based on the provided criteria for the fewest - MM+Bulges index. 
- """ - - if len(criteria) == 1: # one criterion - return lambda x: (int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]])) - elif len(criteria) == 2: - return lambda x: ( - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - ) - # base case (all three ) - return lambda x: ( - int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), - int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), - ) - - -def initialize_sorting_criteria( - sorting_criteria: str, score: bool, score_idx: int, mmb_idx: int -) -> Callable: - """ - Initializes the sorting criteria function based on the specified sorting - criteria, score flag, score index, and MM+Bulges index. - - Args: - sorting_criteria (str): Comma-separated string of sorting criteria. - score (bool): Flag indicating if scoring is used for sorting. - score_idx (int): Index of the score. - mmb_idx (int): Index of the MM+Bulges. - - Returns: - Callable: Sorting criteria function based on the provided parameters. - """ - - criteria = sorting_criteria.split(",") - if len(criteria) > 3: - raise ValueError("Mismatching sorting criteria selected") - if any(c not in SORTING_CRITERIA for c in criteria): - raise ValueError("Unknown sorting criteria") - if score: - return sorting_score(criteria, score_idx, mmb_idx) - return sorting_fewest(criteria, mmb_idx) - - -def split_targets(input_args: MergeTargets) -> None: - """ - Splits and processes the input targets file into best and alternative targets - files based on specified criteria. - - Args: - input_args (MergeTargets): Object containing input arguments for target - splitting and processing. - - Returns: - None - - Raises: - OSError: If an error occurs during the process of splitting and processing - the targets. 
- """ - - # open best, and alternative targets files - try: - with open(input_args.targets_fname, mode="r") as infile, open( - input_args.targets_fname_merged, mode="w" - ) as outfile, open(input_args.targets_fname_discarded, mode="w") as discfile: - # write header to outfiles - header = infile.readline().strip() - outfile.write(f"{header}\n") - discfile.write(f"{header}\n") - # begin dividing targets - ( - pos_prev, - guide_prev, - chrom_prev, - snp_prev, - cluster, - ) = initialize_targets_cluster(input_args.rangebp) - for line in infile: - target = parse_target(line) - guide, chrom, pos, snp = ( - target[input_args.guideidx], - target[input_args.chromidx], - int(target[input_args.posidx]), - target[input_args.snpidx], + if var_only: # no ref found + # count the residual targets in the list + final_list_best_var[0][cfd - 1] = str(len(final_list_best_var) - 1) + # append the best target to best_file + fileOut.write("\t".join(final_list_best_var[0])) + # pop the best target from the list + bestTarget = final_list_best_var.pop(0) + elif validity_check_ref and validity_check_var: # ref and var targets found + if int(final_list_best_ref[0][total]) <= int(final_list_best_var[0][total]): + final_list_best_ref[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + else: + final_list_best_var[0][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 ) - if open_targets_cluster( - guide, - guide_prev, - chrom, - chrom_prev, - pos, - pos_prev, - input_args.rangebp, + fileOut.write("\t".join(final_list_best_var[0])) + bestTarget = final_list_best_var.pop(0) + else: # only ref + final_list_best_ref[0][cfd - 1] = str(len(final_list_best_ref) - 1) + fileOut.write("\t".join(final_list_best_ref[0])) + bestTarget = final_list_best_ref.pop(0) + # write all the remaining targets in the alt file + for count, elem in enumerate(final_list_best_ref): + final_list_best_ref[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut_disc.write(("\t".join(elem))) + for count, elem in enumerate(final_list_best_var): + final_list_best_var[count][cfd - 1] = str( + len(final_list_best_ref) + len(final_list_best_var) - 1 + ) + fileOut_disc.write(("\t".join(elem))) + + +tau = int(sys.argv[3]) # range in bp to merge targets +chrom = int(sys.argv[4]) - 1 # chromosome +pos = int(sys.argv[5]) - 1 # position of target +total = int(sys.argv[6]) - 1 # mm+bul value +true_guide = int(sys.argv[7]) - 1 # real guide used in the search +snp_info = int(sys.argv[8]) - 1 # snp_info (ref_alt_allele) +cfd = int(sys.argv[9]) - 1 # CFD score +sort_order = str(sys.argv[10]) +# the -1 converts the 1-based column numbers passed by the bash script to 0-based Python indices + +start = time.time() +with open(sys.argv[1], "r") as fileIn: + header = fileIn.readline() + with open(sys.argv[2], "w") as fileOut: + with open(sys.argv[2] + ".discarded_samples", "w") as fileOut_disc: + fileOut.write(header) + fileOut_disc.write(header) + prev_pos = -(tau + 1) + best_row = "" + prev_guide = "" + prev_chr = "" + prev_snp = "" + cluster = [] + for line in fileIn: + splitted = line.split("\t") + if ( + prev_guide != splitted[true_guide] + or prev_chr != splitted[chrom] + or int(splitted[pos]) - prev_pos > tau + ):
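+                        # cluster boundary: the guide or the chromosome changed, or the next target starts more than tau bp after the previous one (e.g., with tau = 3, positions 100, 102 and 105 stay in one cluster, while 109 opens a new one)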
input_args.sorting_criteria_scoring, - outfile, - discfile, - ) - cluster = [target] + get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info) + cluster = [splitted] else: - cluster.append(target) - guide_prev = guide - pos_prev = pos - chrom_prev = chrom - snp_prev = snp - construct_cluster( - cluster, - input_args.guideidx, - input_args.snpidx, - input_args.posidx, - input_args.scoreidx, - input_args.mmbidx, - input_args.sort_pivot, - input_args.sorting_criteria, - input_args.sorting_criteria_scoring, - outfile, - discfile, - ) - except IOError as e: - raise OSError("An error occurred while merging contiguous targets") from e - - -def merge_targets() -> None: - """ - Merges targets by parsing input arguments, splitting the targets, and - displaying the completion time. - - Returns: - None - """ - - start = time() # start time point - input_args = parse_input_args(sys.argv[1:]) - split_targets(input_args) - sys.stdout.write(f"Merge completed in {(time() - start)}s\n") + cluster.append(splitted) + prev_guide = splitted[true_guide] + prev_pos = int(splitted[pos]) + prev_chr = splitted[chrom] + prev_snp = splitted[snp_info] + get_best_targets(cluster, fileOut, fileOut_disc, cfd, snp_info) -if __name__ == "__main__": - merge_targets() +print("Merging done in: " + str(time.time() - start)) diff --git a/PostProcess/remove_contiguous_samples_cfd_new.py b/PostProcess/remove_contiguous_samples_cfd_new.py new file mode 100755 index 0000000..ca80ac6 --- /dev/null +++ b/PostProcess/remove_contiguous_samples_cfd_new.py @@ -0,0 +1,680 @@ +""" +This script parses the input arguments and removes contiguous samples from a file based on specified criteria. + +The script expects the following input arguments: +- infname (str): The input file name. +- outfname (str): The output file name. +- rangebp (int): The maximum range in base pairs for samples to be considered contiguous. +- chrom_idx (int): The index of the chromosome column in the input file. +- position_idx (int): The index of the position column in the input file. +- mm_bul_count_idx (int): The index of the mismatch and bulge count column in the input file. +- guide_idx (int): The index of the guide column in the input file. +- snp_info_idx (int): The index of the SNP info column in the input file. +- score_idx (int): The index of the score column in the input file. +- sort_criterion (str): The sorting criterion for the output file. + +The script reads the input file, parses the specified columns, and removes contiguous samples based on the given range. The resulting data is written to the output file. + +Examples: + $ python remove_contiguous_samples_cfd.py input.txt output.txt 10 2 3 4 5 6 7 8 score +""" + +from typing import List, Tuple, Dict, Callable, Union +from io import TextIOWrapper +from time import time + +import sys +import os + +INPUT_ARG_COUNT = 12 +FLAG_ALT_ONLY = 12 +SORTING_PIVOTS = ["score", "total"] +SORTING_CRITERIA = {"mm+bulges": 0, "mm": 2, "bulges": 1} + + +class MergeTargets: + """ + Represents a class for merging targets with specified criteria for sorting + and indexing. + + Args: + args (List[str]): List of arguments containing input parameters for + target merging. + + Returns: + None + + Raises: + FileNotFoundError: If the input targets file is not found. + ValueError: If there are issues with the specified merge range, column + indices, or sorting criteria.
+ """ + + def __init__(self, args: List[str]) -> None: + self._infname = args[0] # input targets + if not os.path.exists(self._infname) or not os.path.isfile(self._infname): + raise FileNotFoundError(f"{self._infname} not found") + self._outfname = args[1] # output merged targets + self._rangebp = int(args[2]) # merge bp range + if self._rangebp <= 0: + raise ValueError(f"Invalid merge range ({self._rangebp})") + self._chromidx = int(args[3]) - 1 # chromosome index + if self._chromidx != 4: + raise ValueError( + f"Chromosome data is expected on column 5, got {self._chromidx}" + ) + self._posidx = int(args[4]) - 1 # position index + if self._posidx != 6: + raise ValueError( + f"Position data is expected on column 7, got {self._posidx}" + ) + self._mmb_idx = int(args[5]) - 1 # mm+bulges index + if self._mmb_idx != 10: + raise ValueError( + f"MM+Bulges data is expected on column 11, got {self._mmb_idx}" + ) + self._guide_idx = int(args[6]) - 1 # guide index + if self._guide_idx != 15: + raise ValueError( + f"Guide data is expected on column 16, got {self._guide_idx}" + ) + self._snp_idx = int(args[7]) - 1 # snp info index + if self._snp_idx != 18: + raise ValueError(f"SNP data is expected on column 19, got {self._snp_idx}") + self._score_idx = int(args[8]) - 1 # score index + if self._score_idx != 20: + raise ValueError( + f"Score data is expected on column 21, got {self._score_idx}" + ) + self._sort_criterion = args[9] # sorting criterion (pivot) + if self._sort_criterion not in SORTING_PIVOTS: + raise ValueError( + f"Allowed sort pivots: {SORTING_PIVOTS}, got {self._sort_criterion}" + ) + self._sorting_criteria_scoring = args[10] # sorting criteria (score is pivot) + self._sorting_criteria = args[11] # sorting criteria (mm+bulges is pivot) + + @property + def targets_fname(self) -> str: + return self._infname + + @property + def targets_fname_merged(self) -> str: + return self._outfname + + @property + def targets_fname_discarded(self) -> str: + return f"{self._outfname}.discarded_samples" + + @property + def rangebp(self) -> int: + return self._rangebp + + @property + def chromidx(self) -> int: + return self._chromidx + + @property + def posidx(self) -> int: + return self._posidx + + @property + def mmbidx(self) -> int: + return self._mmb_idx + + @property + def guideidx(self) -> int: + return self._guide_idx + + @property + def snpidx(self) -> int: + return self._snp_idx + + @property + def scoreidx(self) -> int: + return self._score_idx + + @property + def sort_pivot(self) -> str: + return self._sort_criterion + + @property + def sorting_criteria_scoring(self) -> str: + return self._sorting_criteria_scoring + + @property + def sorting_criteria(self) -> str: + return self._sorting_criteria + + +def parse_input_args(args: List[str]) -> MergeTargets: + """ + Parses the input arguments to create an instance of MergeTargets for handling + target merging based on the provided arguments. + + Args: + args (List[str]): List of input arguments to be parsed for target merging. + + Returns: + MergeTargets: An instance of MergeTargets class for further processing. 
+ """ + + if not args: + raise ValueError("No input argument provided") + if len(args) != INPUT_ARG_COUNT: + raise ValueError(f"Expected {INPUT_ARG_COUNT} arguments, got {len(args)}") + return MergeTargets(args) # parse input arguments + + +def initialize_targets_cluster(rangebp: int) -> Tuple[int, str, str, str, List[str]]: + """ + Initializes the targets cluster by setting the previous cluster position, + guide, chromosome, SNP, and an open starting cluster. + + Args: + rangebp (int): The range of base pairs for the cluster. + + Returns: + Tuple[int, str, str, str, List[str]]: Tuple containing the initialized + cluster parameters. + """ + + pos_prev = -(rangebp + 1) # previous cluster position + guide_prev = "" # previous cluster's guide + chrom_prev = "" # previous cluster's chromosome + snp_prev = "" # previous cluster snp + cluster = [] # open starting cluster + return pos_prev, guide_prev, chrom_prev, snp_prev, cluster + + +def parse_target(target: str) -> List[str]: + """ + Parses the fields of a target string and returns a list of the parsed fields. + + Args: + target (str): The target string to be parsed. + + Returns: + List[str]: List of parsed fields extracted from the target string. + """ + + # parse target's fields + return target.strip().split() + + +def open_targets_cluster( + guide: str, + guide_prev: str, + chrom: str, + chrom_prev: str, + pos: int, + pos_prev: int, + rangebp: int, +) -> bool: + """ + Determines whether a new targets cluster should be opened based on changes + in guide, chromosome, and position within a specified range of base pairs. + + Args: + guide (str): Current guide. + guide_prev (str): Previous guide. + chrom (str): Current chromosome. + chrom_prev (str): Previous chromosome. + pos (int): Current position. + pos_prev (int): Previous position. + rangebp (int): Range of base pairs for cluster merging. + + Returns: + bool: True if a new targets cluster should be opened, False otherwise. + """ + + return guide != guide_prev or chrom != chrom_prev or pos - pos_prev > rangebp + + +def cluster_targets_by_pos_snp( + cluster: List[List[str]], guideidx: int, snpidx: int, posidx: int +) -> Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]]: + """ + Clusters targets based on position and SNP information, separating them into + reference and alternative targets. + + Args: + cluster (List[List[str]]): List of target clusters. + guideidx (int): Index of the guide. + snpidx (int): Index of the SNP information. + posidx (int): Index of the position. + + Returns: + Tuple[List[List[str]], Dict[Tuple[int, str], List[List[str]]]: A tuple + containing the reference targets list and a dictionary of alternative + targets grouped by position and SNP information. 
+ """ + + # initialize reference and alternative targets lists + reference_targets, alternative_targets = [], {} + samplesidx, snpididx, afidx = guideidx - 2, snpidx - 2, snpidx - 1 + for target in cluster: + if target[snpidx] == "n": # reference target + reference_targets.append(target) + continue + # alternative target + pos_t, snp_info_t = target[posidx], target[snpidx] + if (pos_t, snp_info_t) in alternative_targets: + alternative_targets[(pos_t, snp_info_t)][0][ + samplesidx + ] += f",{target[samplesidx]}" # add samples + alternative_targets[(pos_t, snp_info_t)][0][ + snpididx + ] += f",{target[snpididx]}" # add snp ids + alternative_targets[(pos_t, snp_info_t)][0][ + afidx + ] += f",{target[afidx]}" # add allele frequencies + else: + alternative_targets[(pos_t, snp_info_t)] = [target] + return reference_targets, alternative_targets + + +def recover_alternative_targets_list( + alternative_targets: Dict[Tuple[int, str], List[List[str]]], noref: bool +) -> List[List[str]]: + """ + Recovers the list of alternative targets from a dictionary of alternative + targets grouped by position and SNP information. + + Args: + alternative_targets (Dict[Tuple[int, str], List[List[str]]): Dictionary + of alternative targets grouped by position and SNP information. + noref (bool): Flag indicating whether to include reference targets. + + Returns: + List[List[str]]: List of alternative targets with updated flags. + """ + + alternative_targets_list = [] # initialize the list + for v in alternative_targets.values(): + for target in v: + # check whether the target is uniquely found on alternative sequences + target[FLAG_ALT_ONLY] = "y" if noref else target[FLAG_ALT_ONLY] + alternative_targets_list.append(target) + return alternative_targets_list + + +def remove_duplicates(target: List[str], idx: int) -> str: + """ + Removes duplicates from a target list at the specified index and returns the + unique values as a comma-separated string. + + Args: + target (List[str]): List of target values. + idx (int): Index of the target list to remove duplicates from. + + Returns: + str: Comma-separated string of unique values after removing duplicates. + """ + + return ",".join(set(target[idx].split(","))) + + +def unique_values( + targets: List[List[str]], snpidx: int, guideidx: int +) -> List[List[str]]: + """ + Returns a list of targets with unique values for SNP information, SNP ID, + allele frequency, and samples. + + Args: + targets (List[List[str]]): List of target clusters. + snpidx (int): Index of the SNP information. + guideidx (int): Index of the guide. + + Returns: + List[List[str]]: List of targets with unique values for specified fields. 
+ """ + + snpididx, afidx, samplesidx = snpidx - 2, snpidx - 1, guideidx - 2 + targets_filt = [] + # remove duplicate values + for target in targets: + target[snpidx] = remove_duplicates(target, snpidx) # snp info + target[snpididx] = remove_duplicates(target, snpididx) # snp id + target[afidx] = remove_duplicates(target, afidx) # af + target[samplesidx] = remove_duplicates(target, samplesidx) # samples + targets_filt.append(target) # update target + return targets_filt + + +def construct_cluster( + cluster: List[List[str]], + guideidx: int, + snpidx: int, + posidx: int, + scoreidx: int, + mmbidx: int, + sort_pivot: str, + sorting_criteria: str, + sorting_criteria_scoring: str, + outfile: TextIOWrapper, + outfiledisc: TextIOWrapper, +) -> None: + """ + Constructs target clusters by processing and sorting reference and + alternative targets based on specified criteria, and writes the results to + output files. + + Args: + cluster (List[List[str]]): List of target clusters. + guideidx (int): Index of the guide. + snpidx (int): Index of the SNP information. + posidx (int): Index of the position. + scoreidx (int): Index of the score. + mmbidx (int): Index of the MM+Bulges. + sort_pivot (str): Sorting pivot. + sorting_criteria (str): Sorting criteria. + sorting_criteria_scoring (str): Sorting criteria for scoring. + outfile (TextIOWrapper): Output file for reported alignments. + outfiledisc (TextIOWrapper): Output file for alternative alignments. + + Returns: + None + """ + + if not cluster: # avoid crashes when cluster is empty + return + # recover reference and alternative targets + reference_targets, alternative_targets = cluster_targets_by_pos_snp( + cluster, guideidx, snpidx, posidx + ) + noref = ( + not reference_targets + ) # check whether only alternative targets have been found + # recover alternative targets list + alternative_targets = recover_alternative_targets_list(alternative_targets, noref) + # remove duplicates values on snp info, snp id, af, and samples columns + alternative_targets = unique_values(alternative_targets, snpidx, guideidx) + # sort targets + score = sort_pivot == SORTING_PIVOTS[0] + sorting_criteria = sorting_criteria_scoring if score else sorting_criteria + criteria = initialize_sorting_criteria(sorting_criteria, score, scoreidx, mmbidx) + if reference_targets: + reference_targets = sort_targets(reference_targets, criteria) + if alternative_targets: + alternative_targets = sort_targets(alternative_targets, criteria) + # write targets to reported or alternative alignments files + write_best_targets( + reference_targets, + alternative_targets, + noref, + score, + scoreidx, + mmbidx, + outfile, + outfiledisc, + ) + + +def write_best_targets( + reference_targets: List[List[str]], + alternative_targets: List[List[str]], + noref: bool, + score: bool, + scoreidx: int, + mmbidx: int, + outfile: TextIOWrapper, + outfiledisc: TextIOWrapper, +) -> None: + """ + Writes the best targets to the reported alignments file and alternative + alignments file based on specified criteria. + + Args: + reference_targets (List[List[str]]): List of reference targets. + alternative_targets (List[List[str]]): List of alternative targets. + noref (bool): Flag indicating if no reference target is found. + score (bool): Flag indicating if scoring is used for comparison. + scoreidx (int): Index of the score. + mmbidx (int): Index of the MM+Bulges. + outfile (TextIOWrapper): Output file for reported alignments. + outfiledisc (TextIOWrapper): Output file for alternative alignments. 
+
+    Returns:
+        None
+    """
+
+    if noref:  # no reference target found
+        target = alternative_targets.pop(0)  # pop best target
+        target[scoreidx - 1] = str(len(alternative_targets))
+    elif reference_targets and alternative_targets:  # targets found in both
+        if score:
+            target = (
+                alternative_targets.pop(0)
+                if float(reference_targets[0][scoreidx])
+                < float(alternative_targets[0][scoreidx])
+                else reference_targets.pop(0)
+            )
+        elif int(alternative_targets[0][mmbidx]) < int(reference_targets[0][mmbidx]):
+            target = alternative_targets.pop(0)
+        else:
+            target = reference_targets.pop(0)
+        target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets))
+    else:  # no alternative target found
+        target = reference_targets.pop(0)
+        target[scoreidx - 1] = str(len(reference_targets))
+    target = "\t".join(target)
+    outfile.write(f"{target}\n")
+    # write the alternative alignments targets
+    for target in reference_targets + alternative_targets:
+        target[scoreidx - 1] = str(len(reference_targets) + len(alternative_targets))
+        target = "\t".join(target)
+        outfiledisc.write(f"{target}\n")
+
+
+def sort_targets(targets: List[List[str]], criteria: Callable) -> List[List[str]]:
+    """
+    Sorts the list of targets based on the specified criteria function.
+
+    Args:
+        targets (List[List[str]]): List of targets to be sorted.
+        criteria (Callable): Sorting criteria function.
+
+    Returns:
+        List[List[str]]: Sorted list of targets based on the provided criteria.
+    """
+
+    return sorted(targets, key=criteria)
+
+
+def sorting_score(criteria: List[str], score_idx: int, mmb_idx: int) -> Callable:
+    """
+    Returns a sorting key function that ranks targets by descending score,
+    breaking ties with the specified MM+Bulges-derived criteria.
+
+    Args:
+        criteria (List[str]): List of sorting criteria.
+        score_idx (int): Index of the score.
+        mmb_idx (int): Index of the MM+Bulges.
+
+    Returns:
+        Callable: Sorting function based on the provided criteria.
+    """
+
+    if len(criteria) == 1:  # single criterion
+        return lambda x: (
+            -float(x[score_idx]),
+            int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]),
+        )
+    elif len(criteria) == 2:
+        return lambda x: (
+            -float(x[score_idx]),
+            int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]),
+            int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]),
+        )
+    # base case (all three criteria)
+    return lambda x: (
+        -float(x[score_idx]),
+        int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]),
+        int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]),
+        int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]),
+    )
+
+
+def sorting_fewest(criteria: List[str], mmb_idx: int) -> Callable:
+    """
+    Returns a sorting key function that ranks targets by ascending
+    MM+Bulges-derived criteria (fewest first).
+
+    Args:
+        criteria (List[str]): List of sorting criteria.
+        mmb_idx (int): Index of the MM+Bulges.
+
+    Returns:
+        Callable: Sorting function based on the provided criteria for the fewest
+        MM+Bulges index.
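+
+    Example (illustrative sketch; criterion names and their offsets live in
+    the module-level SORTING_CRITERIA mapping, so the key below is assumed,
+    not guaranteed, to exist):
+        >>> key = sorting_fewest(["mm"], 10)  # doctest: +SKIP
+        >>> sort_targets(targets, key)  # fewest first  # doctest: +SKIP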
+ """ + + if len(criteria) == 1: # one criterion + return lambda x: (int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]])) + elif len(criteria) == 2: + return lambda x: ( + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + ) + # base case (all three ) + return lambda x: ( + int(x[mmb_idx - SORTING_CRITERIA[criteria[0]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[1]]]), + int(x[mmb_idx - SORTING_CRITERIA[criteria[2]]]), + ) + + +def initialize_sorting_criteria( + sorting_criteria: str, score: bool, score_idx: int, mmb_idx: int +) -> Callable: + """ + Initializes the sorting criteria function based on the specified sorting + criteria, score flag, score index, and MM+Bulges index. + + Args: + sorting_criteria (str): Comma-separated string of sorting criteria. + score (bool): Flag indicating if scoring is used for sorting. + score_idx (int): Index of the score. + mmb_idx (int): Index of the MM+Bulges. + + Returns: + Callable: Sorting criteria function based on the provided parameters. + """ + + criteria = sorting_criteria.split(",") + if len(criteria) > 3: + raise ValueError("Mismatching sorting criteria selected") + if any(c not in SORTING_CRITERIA for c in criteria): + raise ValueError("Unknown sorting criteria") + if score: + return sorting_score(criteria, score_idx, mmb_idx) + return sorting_fewest(criteria, mmb_idx) + + +def split_targets(input_args: MergeTargets) -> None: + """ + Splits and processes the input targets file into best and alternative targets + files based on specified criteria. + + Args: + input_args (MergeTargets): Object containing input arguments for target + splitting and processing. + + Returns: + None + + Raises: + OSError: If an error occurs during the process of splitting and processing + the targets. 
+ """ + + # open best, and alternative targets files + try: + with open(input_args.targets_fname, mode="r") as infile, open( + input_args.targets_fname_merged, mode="w" + ) as outfile, open(input_args.targets_fname_discarded, mode="w") as discfile: + # write header to outfiles + header = infile.readline().strip() + outfile.write(f"{header}\n") + discfile.write(f"{header}\n") + # begin dividing targets + ( + pos_prev, + guide_prev, + chrom_prev, + snp_prev, + cluster, + ) = initialize_targets_cluster(input_args.rangebp) + for line in infile: + target = parse_target(line) + guide, chrom, pos, snp = ( + target[input_args.guideidx], + target[input_args.chromidx], + int(target[input_args.posidx]), + target[input_args.snpidx], + ) + if open_targets_cluster( + guide, + guide_prev, + chrom, + chrom_prev, + pos, + pos_prev, + input_args.rangebp, + ): + construct_cluster( + cluster, + input_args.guideidx, + input_args.snpidx, + input_args.posidx, + input_args.scoreidx, + input_args.mmbidx, + input_args.sort_pivot, + input_args.sorting_criteria, + input_args.sorting_criteria_scoring, + outfile, + discfile, + ) + cluster = [target] + else: + cluster.append(target) + guide_prev = guide + pos_prev = pos + chrom_prev = chrom + snp_prev = snp + construct_cluster( + cluster, + input_args.guideidx, + input_args.snpidx, + input_args.posidx, + input_args.scoreidx, + input_args.mmbidx, + input_args.sort_pivot, + input_args.sorting_criteria, + input_args.sorting_criteria_scoring, + outfile, + discfile, + ) + except IOError as e: + raise OSError("An error occurred while merging contiguous targets") from e + + +def merge_targets() -> None: + """ + Merges targets by parsing input arguments, splitting the targets, and + displaying the completion time. + + Returns: + None + """ + + start = time() # start time point + input_args = parse_input_args(sys.argv[1:]) + split_targets(input_args) + sys.stdout.write(f"Merge completed in {(time() - start)}s\n") + + +if __name__ == "__main__": + merge_targets() diff --git a/PostProcess/scriptAnalisiNNN_v3.sh b/PostProcess/scriptAnalisiNNN_v3.sh index afc1f86..d16119d 100755 --- a/PostProcess/scriptAnalisiNNN_v3.sh +++ b/PostProcess/scriptAnalisiNNN_v3.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -e # trace all commands failures - # Script per l'analisi dei targets della ricerca REF e ENR con PAM NNN # Il file dei targets della ricerca sul genoma reference si chiama $REFtargets -> INPUT $1 # Il file dei targets della ricerca sul genoma enriched si chiama $ENRtargets -> INPUT $2 @@ -41,10 +39,7 @@ output_folder=${12} echo $jobid # 1) Rimozione duplicati, estrazione semicommon e unique e creazione file total #echo 'Creazione file .total.txt' -./extraction.sh $REFtargets $ENRtargets $jobid || { - echo "CRISPRme ERROR: targets extraction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} # OUTPUT $jobid.common_targets.txt -> Non usato +./extraction.sh $REFtargets $ENRtargets $jobid # OUTPUT $jobid.common_targets.txt -> Non usato # $jobid.semi_common_targets.txt # $jobid.unique_targets.txt rm $jobid.common_targets.txt @@ -62,10 +57,7 @@ rm $jobid.semi_common_targets.minmaxdisr.txt #echo 'Creazione cluster del file .total.txt' # 3) Clustering -./cluster.dict.py $jobid.total.txt 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' || { - echo "CRISPRme ERROR: targets clustering failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} # OUTPUT $jobid.total.cluster.txt +./cluster.dict.py $jobid.total.txt 'no' 'True' 'True' "$guide_file" 'total' 'orderChr' # OUTPUT 
$jobid.total.cluster.txt rm $jobid.total.txt #sed -i ':a;N;$!ba;s/\n/\tn\tn\tn\n/g' $jobid.total.cluster.txt @@ -94,10 +86,7 @@ rm $jobid.total.txt #echo 'Estrazione sample dal file .total.cluster.txt' # ./simpleAnalysis_v3.py "$annotationfile" "$jobid.total.cluster.txt" "$jobid" "$dictionaries" "$pam_file" $mismatch "$referencegenome" "$guide_file" $bulgesDNA $bulgesRNA -./new_simple_analysis.py "$referencegenome" "$dictionaries" "$jobid.total.cluster.txt" "${pam_file}" "$jobid" "$mismatch" || { - echo "CRISPRme ERROR: annotation analysis failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./new_simple_analysis.py "$referencegenome" "$dictionaries" "$jobid.total.cluster.txt" "${pam_file}" "$jobid" "$mismatch" # cp $jobid.bestCFD.txt $jobid.bestCFD.txt.check_analysis # cp $jobid.bestmmblg.txt $jobid.bestmmblg.txt.check_analysis # cp $jobid.bestCRISTA.txt $jobid.bestCRISTA.txt.check_analysis @@ -129,18 +118,9 @@ echo 'Sorting and adjusting results' # cp $jobid.bestCRISTA.txt $jobid.bestCRISTA.txt.after_sort #adjustin columns to have the correct order and remove uncessary ones -./adjust_cols.py $jobid.bestCFD.txt || { - echo "CRISPRme ERROR: CFD report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} -./adjust_cols.py $jobid.bestmmblg.txt || { - echo "CRISPRme ERROR: mismatch+bulges report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} -./adjust_cols.py $jobid.bestCRISTA.txt || { - echo "CRISPRme ERROR: CRISTA report cleaning failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +./adjust_cols.py $jobid.bestCFD.txt +./adjust_cols.py $jobid.bestmmblg.txt +./adjust_cols.py $jobid.bestCRISTA.txt # ./adjust_cols.py $jobid.altmmblg.txt # sed -i 1i"#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\t_#Bulge_type\t_crRNA\t_DNA\t_Reference\t_Chromosome\t_Position\t_Cluster_Position\t_Direction\t_Mismatches\t_Bulge_Size\t_Total\t_PAM_gen\t_Var_uniq\t_Samples\t_Annotation_Type\t_Real_Guide\t_rsID\t_AF\t_SNP\t_#Seq_in_cluster\t_CFD\t_CFD_ref\tCRISTA_#Bulge_type\tCRISTA_crRNA\tCRISTA_DNA\tCRISTA_Reference\tCRISTA_Chromosome\tCRISTA_Position\tCRISTA_Cluster_Position\tCRISTA_Direction\tCRISTA_Mismatches\tCRISTA_Bulge_Size\tCRISTA_Total\tCRISTA_PAM_gen\tCRISTA_Var_uniq\tCRISTA_Samples\tCRISTA_Annotation_Type\tCRISTA_Real_Guide\tCRISTA_rsID\tCRISTA_AF\tCRISTA_SNP\tCRISTA_#Seq_in_cluster\tCRISTA_CFD\tCRISTA_CFD_ref" "$final_res" diff --git a/PostProcess/submit_job_automated_new_multiple_vcfs.sh b/PostProcess/submit_job_automated_new_multiple_vcfs.sh index c68621d..fc37f96 100755 --- a/PostProcess/submit_job_automated_new_multiple_vcfs.sh +++ b/PostProcess/submit_job_automated_new_multiple_vcfs.sh @@ -1,33 +1,54 @@ #!/bin/bash -set -e # capture any failure +# This script automates the search for guide RNA and PAM sequences in reference +# and variant genomes. It processes input files, manages directories, and executes +# various analyses to identify potential CRISPR targets. +# The script handles both reference and variant genomes, performs indexing, +# searching, and post-analysis, and generates results in specified output formats. +# +# Args: +# $1: Reference genome folder path. +# $2: List of VCF files. +# $3: Guide RNA file path. +# $4: PAM file path. +# $5: Annotation file path. +# $6: Sample ID file path. +# $7: Maximum bulge size. +# $8: Mismatches allowed. +# $9: Bulge DNA size. 
+# ${10}: Bulge RNA size.
+# ${11}: Merge threshold.
+# ${12}: Output folder path.
+# ${13}: Starting directory path.
+# ${14}: Number of CPUs to use.
+# ${15}: Current working directory path.
+# ${16}: Gene proximity file path.
+# ${17}: Email address for notifications.
+# ${18}: Base check start.
+# ${19}: Base check end.
+# ${20}: Base check set.
+# ${21}: Sorting criteria (score pivot).
+# ${22}: Sorting criteria (mm+bulges pivot).
+#
+# Returns:
+#   Generates various output files including target lists, merged results, and a database.
+#   Sends an email notification upon completion if an email address is provided.

 #file for automated search of guide+pam in reference and variant genomes

-ref_folder=$(realpath $1)
-vcf_list=$(realpath $2)
-# IFS=',' read -ra vcf_list <<< $2
-guide_file=$(realpath $3)
-pam_file=$(realpath $4)
-annotation_file=$(realpath $5)
-sampleID=$(realpath $6)
+ref_folder=$(realpath $1) # reference genome folder
+vcf_list=$(realpath $2) # vcf folders list
+guide_file=$(realpath $3) # guide
+pam_file=$(realpath $4) # pam
+annotation_file=$(realpath $5) # annotation bed
+sampleID=$(realpath $6) # sample ids
+bMax=$7 # max number of bulges
+mm=$8 # mismatches
+bDNA=$9 # dna bulges
+bRNA=${10} # rna bulges
+merge_t=${11} # targets merge threshold (bp)
+output_folder=$(realpath ${12}) # output folder
+starting_dir=$(realpath ${13}) # root dir
+ncpus=${14} # number of threads
+current_working_directory=$(realpath ${15}) # current working directory
+gene_proximity=$(realpath ${16}) # gene annotation bed
+email=${17} # email address (website only)

-bMax=$7
-mm=$8
-bDNA=$9
-bRNA=${10}
-
-merge_t=${11}
-
-output_folder=$(realpath ${12})
-
-starting_dir=$(realpath ${13})
-ncpus=${14}
-current_working_directory=$(realpath ${15})
-
-gene_proximity=$(realpath ${16})
-
-email=${17}

 echo -e "MAIL: $email"
 echo -e "CPU used: $ncpus"
@@ -40,20 +61,22 @@ base_check_set=${20}
 sorting_criteria_scoring=${21}
 sorting_criteria=${22}

-log="$output_folder/log.txt"
-touch $log
+# create log files
+log="${output_folder}/log.txt"
+touch $log
+logerror="${output_folder}/log_error.txt" # error log; steps below abort when this file is non-empty

 #echo -e 'Job\tStart\t'$(date) > $log
 start_time='Job\tStart\t'$(date)

 # output=$output_folder/output.txt
 # touch $output
-##CREATE DUMMY FILE WITH ONE LINE
+## CREATE DUMMY FILE WITH ONE LINE
 echo -e "dummy_file" >"${output_folder}/.dummy.txt"
 dummy_file="${output_folder}/.dummy.txt"
-##CREATE EMPTY FILE
+## CREATE EMPTY FILE
 touch "${output_folder}/.empty.txt"
 empty_file="${output_folder}/.empty.txt"
-##CREATE EMPTY DIR
+## CREATE EMPTY DIR
 mkdir -p "${output_folder}/.empty"
 empty_dir="${output_folder}/.empty"

@@ -68,6 +91,7 @@
 if [ $6 != "_" ]; then
     echo >>$6
 fi

+# perform target search and processing for each variants dataset
 while read vcf_f; do
     if [ -z "$vcf_f" ]; then
         continue
@@ -149,6 +173,8 @@
         mkdir "$output_folder/crispritz_targets"
     fi

+    # STEP 1: Enrich genome adding SNPs and INDELs from the input VCF files
+    # track haplotypes if input VCF is phased
     if [ "$vcf_name" != "_" ]; then
         cd "$current_working_directory/Genomes"
@@ -156,35 +182,53 @@
         echo -e 'Add-variants\tStart\t'$(date) >>$log
         # echo -e 'Add-variants\tStart\t'$(date) >&2
         echo -e "Adding variants"
-        crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true" || {
-            echo "CRISPRme ERROR: genome enrichment failed (script: ${0} line $((LINENO-1)))" >&2
+        crispritz.py add-variants "$vcf_folder/" "$ref_folder/" "true"
+        # check for add-variants failures
+        if [ -s $logerror ]; then
+            printf "ERROR: Genome enrichment failed!\n" >&2
             exit 1
-        }
-        #if ! 
[ -d "${ref_name}+${vcf_name}" ]; then - # mkdir "${ref_name}+${vcf_name}" - #fi + fi mv "$current_working_directory/Genomes/variants_genome/SNPs_genome/${ref_name}_enriched/" "./${ref_name}+${vcf_name}/" if ! [ -d "$current_working_directory/Dictionaries/dictionaries_${vcf_name}/" ]; then mkdir "$current_working_directory/Dictionaries/dictionaries_${vcf_name}/" fi + # check for snp dictionary failures + if [ -s $logerror ]; then + printf "ERROR: SNP dictionary construction failed!\n" >&2 + exit 1 + fi if ! [ -d "$current_working_directory/Dictionaries/log_indels_${vcf_name}/" ]; then mkdir "$current_working_directory/Dictionaries/log_indels_${vcf_name}/" fi + # check for indel dictionary failures + if [ -s $logerror ]; then + printf "ERROR: Indel dictionary construction failed!\n" >&2 + exit 1 + fi mv $current_working_directory/Genomes/variants_genome/SNPs_genome/*.json $current_working_directory/Dictionaries/dictionaries_${vcf_name}/ mv $current_working_directory/Genomes/variants_genome/SNPs_genome/log*.txt $current_working_directory/Dictionaries/log_indels_${vcf_name}/ cd "$current_working_directory/" if ! [ -d "genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" ]; then mkdir "genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" fi + # check for genome library failures + if [ -s $logerror ]; then + printf "ERROR: Genome library construction failed!\n" >&2 + exit 1 + fi echo -e 'Add-variants\tEnd\t'$(date) >>$log # echo -e 'Add-variants\tEnd\t'$(date) >&2 + + # STEP 2: indels indexing echo -e 'Indexing Indels\tStart\t'$(date) >>$log # echo -e 'Indexing Indels\tStart\t'$(date) >&2 - ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/variants_genome/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus || { - echo "CRISPRme ERROR: indels indexing failed (script: ${0} line $((LINENO - 1)))" >&2 + ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/variants_genome/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus + # check for indels indexing failures + if [ -s $logerror ]; then + printf "ERROR: Indels indexing failed!\n" >&2 exit 1 - } + fi echo -e 'Indexing Indels\tEnd\t'$(date) >>$log # echo -e 'Indexing Indels\tEnd\t'$(date) >&2 if ! [ -d $current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS ]; then @@ -204,12 +248,14 @@ while read vcf_f; do if ! [ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS" ]; then echo -e 'Indexing Indels\tStart\t'$(date) >>$log # echo -e 'Indexing Indels\tStart\t'$(date) >&2 - ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus || { - echo "CRISPRme ERROR: indels indexing failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + ${starting_dir}/./pool_index_indels.py "$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS/" "$pam_file" $true_pam $ref_name $vcf_name $ncpus echo -e 'Indexing Indels\tEnd\t'$(date) >>$log # echo -e 'Indexing Indels\tEnd\t'$(date) >&2 + # check for indels indexing failures + if [ -s $logerror ]; then + printf "ERROR: Indels indexing failed!\n" >&2 + exit 1 + fi fi fi @@ -217,6 +263,7 @@ while read vcf_f; do rm -r "$current_working_directory/Dictionaries/fake_chrom_$vcf_name" fi + # STEP 3: index reference genome cd "$current_working_directory/" if ! [ -d "$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" ]; then if ! 
[ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}" ]; then @@ -226,11 +273,13 @@ while read vcf_f; do # echo -e 'Index-genome Reference\tStart\t'$(date) >&2 # echo -e 'Indexing_Reference' > $output echo -e "Indexing reference genome" - crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! + # check for reference genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log # echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" @@ -243,11 +292,13 @@ while read vcf_f; do # echo -e 'Index-genome Reference\tStart\t'$(date) >&2 # echo -e 'Indexing_Reference' > $output echo -e "Indexing reference genome" - crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "$ref_name" "$ref_folder/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! + # check for reference genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Reference\tEnd\t'$(date) >>$log # echo -e 'Index-genome Reference\tEnd\t'$(date) >&2 idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" @@ -265,6 +316,7 @@ while read vcf_f; do idx_ref="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}" fi + # STEP 4: index variant genome if [ "$vcf_name" != "_" ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" ]; then if ! [ -d "$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}" ]; then @@ -274,11 +326,13 @@ while read vcf_f; do # echo -e 'Index-genome Variant\tStart\t'$(date) >&2 # echo -e 'Indexing_Enriched' > $output echo -e "Indexing variant genome" - crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus pid_index_var=$! 
+ # check for variant genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Variant\tEnd\t'$(date) >>$log # echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" @@ -291,11 +345,13 @@ while read vcf_f; do # echo -e 'Index-genome Variant\tStart\t'$(date) >&2 # echo -e 'Indexing_Enriched' > $output echo -e "Indexing variant genome" - crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus || { - echo "CRISPRme ERROR: TST-index construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py index-genome "${ref_name}+${vcf_name}" "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" -bMax $bMax -th $ncpus pid_index_ref=$! + # check for variant genome indexing failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome indexing failed!\n" >&2 + exit 1 + fi echo -e 'Index-genome Variant\tEnd\t'$(date) >>$log # echo -e 'Index-genome Variant\tEnd\t'$(date) >&2 idx_var="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}" @@ -321,6 +377,7 @@ while read vcf_f; do ceiling_result=1 fi + # STEP 5: reference genome search #start searches cd "$output_folder" echo $idx_ref @@ -330,43 +387,50 @@ while read vcf_f; do # echo -e 'Search Reference\tStart\t'$(date) >&2 # echo -e 'Search Reference' > $output if [ "$bDNA" -ne 0 ] || [ "$bRNA" -ne 0 ]; then - crispritz.py search $idx_ref "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result & - wait || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } # TODO: + crispritz.py search $idx_ref "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result & pid_search_ref=$! + # check for reference genome search (brute-force) failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome search (brute-force) failed!\n" >&2 + exit 1 + fi + echo -e 'Search Reference\tEnd\t'$(date) >>$log else crispritz.py search "$current_working_directory/Genomes/${ref_name}/" "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -mm $mm -r -th $ceiling_result & - wait || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } pid_search_ref=$! + # check for reference genome search (TST) failures + if [ -s $logerror ]; then + printf "ERROR: Reference genome search (TST) failed!\n" >&2 + exit 1 + fi + echo -e 'Search Reference\tEnd\t'$(date) >>$log fi else echo -e "Search for reference already done" fi + # STEP 6: variant genome search if [ "$vcf_name" != "_" ]; then - #TODO RICERCA ALTERNATIVE PARALLELA A REF if ! 
[ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e 'Search Variant\tStart\t'$(date) >>$log # echo -e 'Search Variant\tStart\t'$(date) >&2 # echo -e 'Search Variant' > $output if [ "$bDNA" -ne 0 ] || [ "$bRNA" -ne 0 ]; then - crispritz.py search "$idx_var" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result -var || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + crispritz.py search "$idx_var" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result -var # mv "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" echo -e 'Search Variant\tEnd\t'$(date) >>$log + # check for variant genome search (brute-force) failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome search (brute-force) failed!\n" >&2 + exit 1 + fi else crispritz.py search "$current_working_directory/Genomes/${ref_name}+${vcf_name}/" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -mm $mm -r -th $ceiling_result & - wait || { - echo "CRISPRme ERROR: off-targets search failed (script: ${0} line $((LINENO - 1)))" >&2 + # check for variant genome search (TST) failures + if [ -s $logerror ]; then + printf "ERROR: Variant genome search (TST) failed!\n" >&2 exit 1 - } + fi echo -e 'Search Variant\tEnd\t'$(date) >>$log # mv "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" fi @@ -374,6 +438,7 @@ while read vcf_f; do echo -e "Search for variant already done" fi + # STEP 7: search on indels if ! 
[ -f "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then echo -e "Search INDELs Start" echo -e 'Search INDELs\tStart\t'$(date) >>$log @@ -381,15 +446,14 @@ while read vcf_f; do cd $starting_dir #commented to avoid indels search #TODO REMOVE POOL SCRIPT FROM PROCESSING - ./pool_search_indels.py "$ref_folder" "$vcf_folder" "$vcf_name" "$guide_file" "$pam_file" $bMax $mm $bDNA $bRNA "$output_folder" $true_pam "$current_working_directory/" "$ncpus" || { - echo "CRISPRme ERROR: off-targets search on indels failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_search_indels.py "$ref_folder" "$vcf_folder" "$vcf_name" "$guide_file" "$pam_file" $bMax $mm $bDNA $bRNA "$output_folder" $true_pam "$current_working_directory/" "$ncpus" + # check for indels genome search failures + if [ -s $logerror ]; then + printf "ERROR: Indels genome search failed!\n" >&2 exit 1 - } + fi # mv "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" "$output_folder/crispritz_targets" - awk '($3 !~ "n") {print $0}' "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" || { - echo "CRISPRme ERROR: off-targets report construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 - } + awk '($3 !~ "n") {print $0}' "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" mv "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" echo -e "Search INDELs End" echo -e 'Search INDELs\tEnd\t'$(date) >>$log @@ -400,22 +464,33 @@ while read vcf_f; do fi while kill "-0" $pid_search_ref &>/dev/null; do - echo -e "Waiting for search genome reference" + echo -e "Waiting for search genome" sleep 100 done - echo -e 'Search Reference\tEnd\t'$(date) >>$log + echo -e 'Search\tEnd\t'$(date) >>$log # echo -e 'Search Reference\tEnd\t'$(date) >&2 # move all targets into targets directory mv $output_folder/*.targets.txt $output_folder/crispritz_targets + # check for targets folder creation failures + if [ -s $logerror ]; then + printf "ERROR: Targets folder creation failed!\n" >&2 + exit 1 + fi if ! 
[ -d "$output_folder/crispritz_prof" ]; then mkdir $output_folder/crispritz_prof fi mv $output_folder/*profile* $output_folder/crispritz_prof/ &>/dev/null + # check for profile folder creation failures + if [ -s $logerror ]; then + printf "ERROR: Profile folder creation failed!\n" >&2 + exit 1 + fi cd "$starting_dir" + # STEP 8: snp analysis echo -e "Start post-analysis" # echo -e 'Post analysis' > $output @@ -432,11 +507,12 @@ while read vcf_f; do fi #TODO ANALISI DEGLI SNP IN PARALLELO - ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $dict_folder $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_post_analisi_snp.py $output_folder $ref_folder $vcf_name $guide_file $mm $bDNA $bRNA $annotation_file $pam_file $dict_folder $final_res $final_res_alt $ncpus + # check for snp analysis failures + if [ -s $logerror ]; then + printf "ERROR: SNP analysis failed!\n" >&2 exit 1 - } - + fi #CONCATENATE REF&VAR RESULTS for key in "${real_chroms[@]}"; do echo "Concatenating $key" @@ -452,6 +528,11 @@ while read vcf_f; do rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestmmblg.txt" rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestCRISTA.txt" done + # check for reports creation failures + if [ -s $logerror ]; then + printf "ERROR: Temporary reports (snp) creation failed!\n" >&2 + exit 1 + fi echo -e 'Post-analysis SNPs\tEnd\t'$(date) >>$log @@ -467,10 +548,12 @@ while read vcf_f; do touch "$final_res_alt" fi - ./pool_post_analisi_snp.py $output_folder $ref_folder "_" $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "_" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: SNP analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_post_analisi_snp.py $output_folder $ref_folder "_" $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "_" $final_res $final_res_alt $ncpus + # check for targets analysis failures + if [ -s $logerror ]; then + printf "ERROR: Targets analysis failed!\n" >&2 exit 1 - } + fi #CONCATENATE REF&VAR RESULTS for key in "${real_chroms[@]}"; do @@ -487,10 +570,15 @@ while read vcf_f; do rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestmmblg.txt" rm "$output_folder/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}_$key.bestCRISTA.txt" done + # check for reports creation failures + if [ -s $logerror ]; then + printf "ERROR: Temporary reports (reference) creation failed!\n" >&2 + exit 1 + fi echo -e 'Post-analysis\tEnd\t'$(date) >>$log - fi + # STEP 9: indels analysis if [ "$vcf_name" != "_" ]; then echo -e "SNPs analysis ended. 
Starting INDELs analysis" cd "$starting_dir" @@ -499,11 +587,12 @@ while read vcf_f; do #SKIP INDELS ANALYSIS IF NO RESULTS FOUND if [ $(wc -l <"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt") -gt 1 ]; then - ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "$current_working_directory/Dictionaries/" $final_res $final_res_alt $ncpus || { - echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2 + ./pool_post_analisi_indel.py $output_folder $ref_folder $vcf_folder $guide_file $mm $bDNA $bRNA $annotation_file $pam_file "$current_working_directory/Dictionaries/" $final_res $final_res_alt $ncpus + # check for indels analysis failures + if [ -s $logerror ]; then + printf "ERROR: Indels analysis failed!\n" >&2 exit 1 - } - + fi #CONCATENATE INDELS RESULTS for key in "${array_fake_chroms[@]}"; do echo "Concatenating $key" @@ -520,6 +609,11 @@ while read vcf_f; do rm -f "$output_folder/${key}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestCRISTA_INDEL.txt" rm -f "$output_folder/${key}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}.bestmmblg_INDEL.txt" done + # check for reports creation failures + if [ -s $logerror ]; then + printf "ERROR: Temporary reports (indels) creation failed!\n" >&2 + exit 1 + fi fi echo -e 'Post-analysis INDELs\tEnd\t'$(date) >>$log @@ -534,13 +628,15 @@ while read samples; do fi awk '!/^#/ { print }' "${current_working_directory}/samplesIDs/$samples" >>"$output_folder/.sampleID.txt" done <"$sampleID" +# check for samples ids reading failures +if [ -s $logerror ]; then + printf "ERROR: Reading sample IDs failed!\n" >&2 + exit 1 +fi # done <$sampleID # if [ "$vcf_name" != "_" ]; then touch "$output_folder/.sampleID.txt" -sed -i 1i"#SAMPLE_ID\tPOPULATION_ID\tSUPERPOPULATION_ID\tSEX" "$output_folder/.sampleID.txt" || { - echo "CRISPRme ERROR: Samples report construction failed (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +sed -i 1i"#SAMPLE_ID\tPOPULATION_ID\tSUPERPOPULATION_ID\tSEX" "$output_folder/.sampleID.txt" # fi sampleID=$output_folder/.sampleID.txt @@ -557,28 +653,64 @@ sed -i '1 i\#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Po printf $header >$final_res_alt.bestCFD.txt printf $header >$final_res_alt.bestmmblg.txt printf $header >$final_res_alt.bestCRISTA.txt +# check for reports creation failures +if [ -s $logerror ]; then + printf "ERROR: Report files creation failed!\n" >&2 + exit 1 +fi +# STEP 10: Merging contiguous targets echo -e 'Merging Targets\tStart\t'$(date) >>$log #SORT FILE TO HAVE CHR AND POS IN PROXIMITY TO MERGE THEM #sort using guide_seq,chr,cluster_pos,score,total(mm+bul) head -1 $final_res.bestCFD.txt >$final_res.tmp tail -n +2 $final_res.bestCFD.txt | LC_ALL=C sort -k16,16 -k5,5 -k7,7n -k21,21rg -k11,11n -T $output_folder >>$final_res.tmp && mv $final_res.tmp $final_res.bestCFD.txt +# check for CFD report sorting failures +if [ -s $logerror ]; then + printf "ERROR: Sorting CFD report failed!\n" >&2 + exit 1 +fi #sort using guide_seq,chr,cluster_pos,score,total(mm+bul) head -1 $final_res.bestCRISTA.txt >$final_res.tmp tail -n +2 $final_res.bestCRISTA.txt | LC_ALL=C sort -k16,16 -k5,5 -k7,7n -k21,21rg -k11,11n -T $output_folder >>$final_res.tmp && mv $final_res.tmp $final_res.bestCRISTA.txt +# check for CRISTA report sorting failures +if [ -s $logerror ]; then + printf "ERROR: Sorting CRISTA report 
failed!\n" >&2 + exit 1 +fi #sort using guide_seq,chr,cluster_pos,total(mm+bul) head -1 $final_res.bestmmblg.txt >$final_res.tmp tail -n +2 $final_res.bestmmblg.txt | LC_ALL=C sort -k16,16 -k5,5 -k7,7n -k11,11n -T $output_folder >>$final_res.tmp && mv $final_res.tmp $final_res.bestmmblg.txt +# check for mm+bulges report sorting failures +if [ -s $logerror ]; then + printf "ERROR: Sorting mm+bulges report failed!\n" >&2 + exit 1 +fi # cp $final_res.bestCFD.txt $final_res.sorted.bestCFD.txt #MERGE BEST FILES TARGETS TO REMOVE CONTIGOUS #TODO CHECK MERGE #SCORE CFD ./merge_close_targets_cfd.sh $final_res.bestCFD.txt $final_res.bestCFD.txt.trimmed $merge_t 'score' $sorting_criteria_scoring $sorting_criteria & +# check for targets merge on CFD failures +if [ -s $logerror ]; then + printf "ERROR: merging targets in CFD report failed!\n" >&2 + exit 1 +fi #TOTAL (MM+BUL) ./merge_close_targets_cfd.sh $final_res.bestmmblg.txt $final_res.bestmmblg.txt.trimmed $merge_t 'total' $sorting_criteria_scoring $sorting_criteria & +# check for targets merge on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: merging targets in mm+bulges report failed!\n" >&2 + exit 1 +fi #SCORE CRISTA ./merge_close_targets_cfd.sh $final_res.bestCRISTA.txt $final_res.bestCRISTA.txt.trimmed $merge_t 'score' $sorting_criteria_scoring $sorting_criteria & +# check for targets merge on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: merging targets in CRISTA report failed!\n" >&2 + exit 1 +fi wait #CHANGE NAME TO BEST AND ALT FILES mv $final_res.bestCFD.txt.trimmed $final_res.bestCFD.txt @@ -598,133 +730,148 @@ echo -e 'Merging Targets\tEnd\t'$(date) >>$log echo -e 'Annotating results\tStart\t'$(date) >>$log +# STEP 11: targets annotation #ANNOTATE BEST TARGETS #TODO SISTEMARE ANNOTAZIONE (DIVISIONE INTERVAL TREE / PARALLEL SEARCH) ./annotate_final_results.py $final_res.bestCFD.txt $annotation_file $final_res.bestCFD.txt.annotated & -wait || { - echo "CRISPRme ERROR: CFD annotation failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CFD report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res.bestmmblg.txt $annotation_file $final_res.bestmmblg.txt.annotated & -wait || { - echo "CRISPRme ERROR: CRISTA annotation failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in mm+bulges report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res.bestCRISTA.txt $annotation_file $final_res.bestCRISTA.txt.annotated & -wait || { - echo "CRISPRme ERROR: mismatch+bulges annotation failed - reference(script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CRISTA report failed!\n" >&2 exit 1 -} +fi wait mv $final_res.bestCFD.txt.annotated $final_res.bestCFD.txt mv $final_res.bestmmblg.txt.annotated $final_res.bestmmblg.txt mv $final_res.bestCRISTA.txt.annotated $final_res.bestCRISTA.txt #ANNOTATE ALT TARGETS ./annotate_final_results.py $final_res_alt.bestCFD.txt $annotation_file $final_res_alt.bestCFD.txt.annotated & -wait || { - echo "CRISPRme ERROR: CFD annotation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CFD alternative report failed!\n" >&2 exit 1 
-} +fi ./annotate_final_results.py $final_res_alt.bestmmblg.txt $annotation_file $final_res_alt.bestmmblg.txt.annotated & -wait || { - echo "CRISPRme ERROR: CRISTA annotation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in mm+bulges alternative report failed!\n" >&2 exit 1 -} +fi ./annotate_final_results.py $final_res_alt.bestCRISTA.txt $annotation_file $final_res_alt.bestCRISTA.txt.annotated & -wait || { - echo "CRISPRme ERROR: mismatch+bulges annotation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for annotation on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Targets annotation in CRISTA alternative report failed!\n" >&2 exit 1 -} +fi wait mv $final_res_alt.bestCFD.txt.annotated $final_res_alt.bestCFD.txt mv $final_res_alt.bestmmblg.txt.annotated $final_res_alt.bestmmblg.txt mv $final_res_alt.bestCRISTA.txt.annotated $final_res_alt.bestCRISTA.txt +# STEP 12: compute risk scores #SCORING BEST RESULTS ./add_risk_score.py $final_res.bestCFD.txt $final_res.bestCFD.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CFD risk score analysis failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CFD report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res.bestmmblg.txt $final_res.bestmmblg.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CRISTA risk score analysis failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in mm+bulges report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res.bestCRISTA.txt $final_res.bestCRISTA.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: mismatch+bulges risk score analysis failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CRISTA report failed!\n" >&2 exit 1 -} +fi wait mv $final_res.bestCFD.txt.risk $final_res.bestCFD.txt mv $final_res.bestmmblg.txt.risk $final_res.bestmmblg.txt mv $final_res.bestCRISTA.txt.risk $final_res.bestCRISTA.txt #SCORING ALT RESULTS ./add_risk_score.py $final_res_alt.bestCFD.txt $final_res_alt.bestCFD.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CFD risk score analysis failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CFD failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CFD alternative report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res_alt.bestmmblg.txt $final_res_alt.bestmmblg.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: CRISTA risk score analysis failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on mm_bulges failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in mm+bulges alternative report failed!\n" >&2 exit 1 -} +fi ./add_risk_score.py $final_res_alt.bestCRISTA.txt $final_res_alt.bestCRISTA.txt.risk "False" & -wait || { - echo "CRISPRme ERROR: mismatch+bulges risk score analysis failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for risk score computing on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: Risk score in CRISTA alternative report failed!\n" >&2 exit 1 -} +fi wait mv 
$final_res_alt.bestCFD.txt.risk $final_res_alt.bestCFD.txt mv $final_res_alt.bestmmblg.txt.risk $final_res_alt.bestmmblg.txt mv $final_res_alt.bestCRISTA.txt.risk $final_res_alt.bestCRISTA.txt +# STEP 13: clean reports from dots and NaN values #remove N's and dots from rsID from BEST FILES python remove_n_and_dots.py $final_res.bestCFD.txt & -wait || { - echo "CRISPRme ERROR: CFD reports cleaning failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CFD failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CFD report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res.bestmmblg.txt & -wait || { - echo "CRISPRme ERROR: CRISTA reports cleaning failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in mm+bulges report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res.bestCRISTA.txt & -wait || { - echo "CRISPRme ERROR: mismatch+bulges reports cleaning failed - reference (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CRISTA report failed!\n" >&2 exit 1 -} +fi wait #remove N's and dots from rsID from ALT FILES python remove_n_and_dots.py $final_res_alt.bestCFD.txt & -wait || { - echo "CRISPRme ERROR: CFD reports cleaning failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CFD failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CFD alternative report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res_alt.bestmmblg.txt & -wait || { - echo "CRISPRme ERROR: CRISTA reports cleaning failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on mm+bulges failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in mm+bulges alternative report failed!\n" >&2 exit 1 -} +fi python remove_n_and_dots.py $final_res_alt.bestCRISTA.txt & -wait || { - echo "CRISPRme ERROR: mismatch+bulges reports cleaning failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 +# check for NaN values cleaning on CRISTA failures +if [ -s $logerror ]; then + printf "ERROR: NaN values cleaning in CRISTA alternative report failed!\n" >&2 exit 1 -} +fi wait #join targets by columns for BEST and ALT files -pr -m -t -J $final_res.bestCFD.txt $final_res.bestmmblg.txt $final_res.bestCRISTA.txt >$final_res || { - echo "CRISPRme ERROR: final report generation failed - reference (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} -pr -m -t -J $final_res_alt.bestCFD.txt $final_res_alt.bestmmblg.txt $final_res_alt.bestCRISTA.txt >$final_res_alt || { - echo "CRISPRme ERROR: final report generation failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 - exit 1 -} +pr -m -t -J $final_res.bestCFD.txt $final_res.bestmmblg.txt $final_res.bestCRISTA.txt >$final_res +pr -m -t -J $final_res_alt.bestCFD.txt $final_res_alt.bestmmblg.txt $final_res_alt.bestCRISTA.txt >$final_res_alt #MERGE ALTERNATIVE CHR IF SAME SEQUENCE OF ALIGNED CHR # ./merge_alt_chr.sh $final_res $final_res.chr_merged @@ -733,9 +880,15 @@ pr -m -t -J $final_res_alt.bestCFD.txt $final_res_alt.bestmmblg.txt $final_res_a #update header for final_res and final_res_alt sed -i '1 
s/^.*$/#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score\tCRISTA_#Bulge_type\tCRISTA_crRNA\tCRISTA_DNA\tCRISTA_Reference\tCRISTA_Chromosome\tCRISTA_Position\tCRISTA_Cluster_Position\tCRISTA_Direction\tCRISTA_Mismatches\tCRISTA_Bulge_Size\tCRISTA_Total\tCRISTA_PAM_gen\tCRISTA_Var_uniq\tCRISTA_Samples\tCRISTA_Annotation_Type\tCRISTA_Real_Guide\tCRISTA_rsID\tCRISTA_AF\tCRISTA_SNP\tCRISTA_#Seq_in_cluster\tCRISTA_CFD\tCRISTA_CFD_ref\tCRISTA_CFD_Risk_Score\tCRISTA_CFD_Absolute_Risk_Score/' "$final_res" sed -i '1 s/^.*$/#Bulge_type\tcrRNA\tDNA\tReference\tChromosome\tPosition\tCluster_Position\tDirection\tMismatches\tBulge_Size\tTotal\tPAM_gen\tVar_uniq\tSamples\tAnnotation_Type\tReal_Guide\trsID\tAF\tSNP\t#Seq_in_cluster\tCFD\tCFD_ref\tHighest_CFD_Risk_Score\tHighest_CFD_Absolute_Risk_Score\tMMBLG_#Bulge_type\tMMBLG_crRNA\tMMBLG_DNA\tMMBLG_Reference\tMMBLG_Chromosome\tMMBLG_Position\tMMBLG_Cluster_Position\tMMBLG_Direction\tMMBLG_Mismatches\tMMBLG_Bulge_Size\tMMBLG_Total\tMMBLG_PAM_gen\tMMBLG_Var_uniq\tMMBLG_Samples\tMMBLG_Annotation_Type\tMMBLG_Real_Guide\tMMBLG_rsID\tMMBLG_AF\tMMBLG_SNP\tMMBLG_#Seq_in_cluster\tMMBLG_CFD\tMMBLG_CFD_ref\tMMBLG_CFD_Risk_Score\tMMBLG_CFD_Absolute_Risk_Score\tCRISTA_#Bulge_type\tCRISTA_crRNA\tCRISTA_DNA\tCRISTA_Reference\tCRISTA_Chromosome\tCRISTA_Position\tCRISTA_Cluster_Position\tCRISTA_Direction\tCRISTA_Mismatches\tCRISTA_Bulge_Size\tCRISTA_Total\tCRISTA_PAM_gen\tCRISTA_Var_uniq\tCRISTA_Samples\tCRISTA_Annotation_Type\tCRISTA_Real_Guide\tCRISTA_rsID\tCRISTA_AF\tCRISTA_SNP\tCRISTA_#Seq_in_cluster\tCRISTA_CFD\tCRISTA_CFD_ref\tCRISTA_CFD_Risk_Score\tCRISTA_CFD_Absolute_Risk_Score/' "$final_res_alt" +# check for report headers update failures +if [ -s $logerror ]; then + printf "ERROR: Updating report headers failed!\n" >&2 + exit 1 +fi echo -e 'Annotating results\tEnd\t'$(date) >>$log +# STEP 14: figures creation # echo -e 'Creating images' > $output echo -e 'Creating images\tStart\t'$(date) >>$log @@ -753,32 +906,44 @@ mv $final_res_alt "${output_folder}/$(basename ${output_folder}).altMerge.txt" cd $starting_dir if [ "$vcf_name" != "_" ]; then # ./process_summaries.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $guide_file $sampleID $mm $bMax "${output_folder}" "var" - ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CFD" || { - echo "CRISPRme ERROR: CFD report summary failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CFD" + # check for variant summary processing on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Variant summary process on CFD report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges report 
summary failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "fewest" + # check for summary processing on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Variant summary process on mm+bulges report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CRISTA" || { - echo "CRISPRme ERROR: CRISTA report summary failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "var" "CRISTA" + # check for summary processing on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Variant summary process on CRISTA report failed!\n" >&2 exit 1 - } + fi else # ./process_summaries.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $guide_file $sampleID $mm $bMax "${output_folder}" "ref" - ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CFD" || { - echo "CRISPRme ERROR: CFD report summary failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + ./process_summaries.py $final_res.bestCFD.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CFD" + # check for reference summary processing on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Reference summary process on CFD report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges report summary failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestmmblg.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "fewest" + # check for reference summary processing on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Reference summary process on mm+bulges report failed!\n" >&2 exit 1 - } - ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CRISTA" || { - echo "CRISPRme ERROR: CRISTA report summary failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./process_summaries.py $final_res.bestCRISTA.txt $guide_file $sampleID $mm $bMax "${output_folder}" "ref" "CRISTA" + # check for reference summary processing on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Reference summary process on CRISTA report failed!\n" >&2 exit 1 - } + fi fi if ! 
[ -d "$output_folder/imgs" ]; then @@ -790,18 +955,24 @@ if [ "$vcf_name" != "_" ]; then while IFS= read -r line || [ -n "$line" ]; do for total in $(seq 0 $(expr $mm + $bMax)); do # python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution.txt" $total $line - python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CFD.txt" $total $line "CFD" || { - echo "CRISPRme ERROR: CFD population report failed (script: ${0} line $((LINENO - 1)))" >&2 + python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CFD.txt" $total $line "CFD" + # check for population distribution on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Population distribution on CFD report failed!\n" >&2 exit 1 - } - python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CRISTA.txt" $total $line "CRISTA" || { - echo "CRISPRme ERROR: CRISTA population report failed (script: ${0} line $((LINENO - 1)))" >&2 + fi + python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_CRISTA.txt" $total $line "CRISTA" + # check for population distribution on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Population distribution on CRISTA report failed!\n" >&2 exit 1 - } - python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_fewest.txt" $total $line "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges population report failed (script: ${0} line $((LINENO - 1)))" >&2 + fi + python $starting_dir/populations_distribution.py "${output_folder}/.$(basename ${output_folder}).PopulationDistribution_fewest.txt" $total $line "fewest" + # check for population distribution on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Population distribution on mm+bulges report failed!\n" >&2 exit 1 - } + fi done done <$guide_file @@ -810,34 +981,47 @@ fi cd $starting_dir if [ "$vcf_name" != "_" ]; then # ./radar_chart_dict_generator.py $guide_file "${output_folder}/$(basename ${output_folder}).bestMerge.txt" $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax - ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" || { - echo "CRISPRme ERROR: CFD radar chart report failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" + # check for radar chart generation on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on variant CFD report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" || { - echo "CRISPRme ERROR: CRISTA radar chart report failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" + # check for radar chart generation on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on variant CRISTA report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt 
$sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges radar chart report failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $sampleID $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" + # check for radar chart generation on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on variant mm+bulges report failed!\n" >&2 exit 1 - } + fi else - ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" || { - echo "CRISPRme ERROR: CFD radar chart report failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + ./radar_chart_dict_generator.py $guide_file $final_res.bestCFD.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CFD" + # check for radar chart generation on CFD failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on reference CFD report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" || { - echo "CRISPRme ERROR: CRISTA radar chart report failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestCRISTA.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "CRISTA" + # check for radar chart generation on CRISTA failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on reference CRISTA report failed!\n" >&2 exit 1 - } - ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" || { - echo "CRISPRme ERROR: mismatch+bulges radar chart report failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + ./radar_chart_dict_generator.py $guide_file $final_res.bestmmblg.txt $empty_file $annotation_file "$output_folder" $ncpus $mm $bMax "fewest" + # check for radar chart generation on mm+bulges failures + if [ -s $logerror ]; then + printf "ERROR: Radar chart generation on reference mm+bulges report failed!\n" >&2 exit 1 - } + fi fi echo -e 'Creating images\tEnd\t'$(date) >>$log +# STEP 15: targets gene annotation echo $gene_proximity echo -e 'Integrating results\tStart\t'$(date) >>$log echo >>$guide_file @@ -845,18 +1029,24 @@ echo >>$guide_file if [ $gene_proximity != "_" ]; then genome_version=$(echo ${ref_name} | sed 's/_ref//' | sed -e 's/\n//') #${output_folder}/Params.txt | awk '{print $2}' | sed 's/_ref//' | sed -e 's/\n//') echo $genome_version - bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set || { - echo "CRISPRme ERROR: postprocessing failed - reference (script: ${0} line $((LINENO - 1)))" >&2 + bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set + # check for gene annotation of primary targets failures + if [ -s $logerror ]; then + printf "ERROR: Gene annotation on primary targets failed!\n" >&2 exit 1 - } - bash $starting_dir/post_process.sh 
"${output_folder}/$(basename ${output_folder}).altMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set || { - echo "CRISPRme ERROR: postprocessing failed - alternative (script: ${0} line $((LINENO - 1)))" >&2 + fi + bash $starting_dir/post_process.sh "${output_folder}/$(basename ${output_folder}).altMerge.txt" "${gene_proximity}" $empty_file "${guide_file}" $genome_version "${output_folder}" $empty_dir $starting_dir/ $base_check_start $base_check_end $base_check_set + # check for gene annotation of alternative targets failures + if [ -s $logerror ]; then + printf "ERROR: Gene annotation on alternative targets failed!\n" >&2 exit 1 - } - python $starting_dir/CRISPRme_plots.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/imgs/" &>"${output_folder}/warnings.txt" || { - echo "CRISPRme ERROR: plots generation failed (script: ${0} line $((LINENO - 1)))" >&2 + fi + python $starting_dir/CRISPRme_plots.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/imgs/" &>"${output_folder}/warnings.txt" + # check for plot failures + if [ -s $logerror ]; then + printf "ERROR: Plots generation failed!\n" >&2 exit 1 - } + fi rm -f "${output_folder}/warnings.txt" #delete warnings file fi @@ -874,10 +1064,12 @@ if [ -f "${output_folder}/$(basename ${output_folder}).db" ]; then rm -f "${output_folder}/$(basename ${output_folder}).db" fi #python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt" "${output_folder}/$(basename ${output_folder})" -python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/.$(basename ${output_folder})" || { - echo "CRISPRme ERROR: database creation failed (script: ${0} line $((LINENO - 1)))" >&2 +python $starting_dir/db_creation.py "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/.$(basename ${output_folder})" +# check for database generation failures +if [ -s $logerror ]; then + printf "ERROR: Database generation failed!\n" >&2 exit 1 -} +fi echo -e 'Creating database\tEnd\t'$(date) >>$log # echo -e 'Creating database\tEnd\t'$(date) >&2 @@ -899,14 +1091,34 @@ if [ $(wc -l <"$guide_file") -gt 1 ]; then mv "${output_folder}/$(basename ${output_folder}).altMerge.txt.integrated_results.tsv" "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" #generate zipped version for file zip -j "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.zip" "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.tsv" + # check for compression on multiguide integrated results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for multiguide primary targets report failed!\n" >&2 + exit 1 + fi zip -j "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.zip" "${output_folder}/Multiple_spacers+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" + # check for compression on multiguide alternative results failures + if [ -s $logerror ]; then + 
printf "ERROR: File compression for multiguide alternative targets report failed!\n" >&2 + exit 1 + fi else guide_elem=$(head -1 $guide_file) mv "${output_folder}/$(basename ${output_folder}).bestMerge.txt.integrated_results.tsv" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.tsv" mv "${output_folder}/$(basename ${output_folder}).altMerge.txt.integrated_results.tsv" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" #generate zipped version for file zip -j "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.zip" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_integrated_results.tsv" + # check for compression on single guide integrated results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for single guide primary targets report failed!\n" >&2 + exit 1 + fi zip -j "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.zip" "${output_folder}/${guide_elem}+${true_pam}_$(basename ${ref_folder})+${vcf_name}_${mm}+${bMax}_all_results_with_alternative_alignments.tsv" + # check for compression on single alternative results failures + if [ -s $logerror ]; then + printf "ERROR: File compression for single guide alternative targets report failed!\n" >&2 + exit 1 + fi fi echo -e "JOB END" @@ -915,7 +1127,7 @@ if [ "$email" != "_" ]; then fi #keep log_error but no block visualization -mv $output_folder/log_error.txt $output_folder/log_error_no_check.txt +mv $logerror $output_folder/log_error_no_check.txt #removing single best files after use and clean merged file to save space #keep the two integrated files with all the targets #save these files to test diff --git a/README.md b/README.md index 39de407..8c42a86 100755 --- a/README.md +++ b/README.md @@ -1,255 +1,1390 @@ -# CRISPRme - [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/crisprme/README.html) ![GitHub release (latest by date)](https://img.shields.io/github/v/release/pinellolab/crisprme) ![Conda](https://img.shields.io/conda/dn/bioconda/crisprme) ![license](https://img.shields.io/badge/license-AGPL--3.0-lightgrey) -CRISPRme is a tool for comprehensive off-target assessment available as a web application [online](http://crisprme.di.univr.it/), offline, and command line. It integrates human genetic variant datasets with orthogonal genomic annotations to predict and prioritize CRISPR-Cas off-target sites at scale. The method considers both single-nucleotide variants (SNVs) and indels, accounts for bona fide haplotypes, accepts spacer:protospacer mismatches and bulges, and is suitable for population and personal genome analyses. CRISPRme takes care of all steps in the process including data download, executing the complete search, and presents an exhaustive report with tables and figures within interactive web-based GUI. +
+![CRISPRme logo](crisprme-logo.png)
-The software has the following main functionalities: +# CRISPRme -- ```complete-search``` performs a search from scratch with the given inputs (including gRNA, reference genome, and genetic variants). -- ```targets-integration``` integrates the search results with GENCODE data to identify genes close to the candidate off-targets and collect the top ranking candidates in term of CFD score, CRISTA score, or number of mismatches/bulges. -- ```web-interface``` starts a local instance of the web interface accessible from any browser. +CRISPRme is a comprehensive tool designed for thorough off-target assessment in +CRISPR-Cas systems. Available as a web application +([http://crisprme.di.univr.it/](http://crisprme.di.univr.it/)), offline tool, and +command-line interface, it integrates human genetic variant datasets with orthogonal +genomic annotations to predict and prioritize potential off-target sites at scale. +CRISPRme accounts for single-nucleotide variants (SNVs) and indels, considers +*bona fide* haplotypes, and allows for spacer:protospacer mismatches and bulges, +making it well-suited for both population-wide and personal genome analyses. CRISPRme +automates the entire workflow, from data download to executing the search, and delivers +detailed reports complete with tables and figures through an interactive web-based +interface. -## Installation +## Table Of Contents -CRISPRme can be installed both via **Conda** (only Linux users) and **Docker** (all operating systems, including OSX and Windows). +0 [System Requirements](#0-system-requirements) +
1 [Installation](#1-installation) +
  1.1 [Install CRISPRme via Conda/Mamba](#11-install-crisprme-via-condamamba) +
    1.1.1 [Installing Conda or Mamba](#111-installing-conda-or-mamba) +
    1.1.2 [Installing CRISPRme](#112-installing-crisprme) +
    1.1.3 [Updating CRISPRme](#113-updating-crisprme) +
  1.2 [Install CRISPRme via Docker](#12-install-crisprme-via-docker) +
    1.2.1 [Installing Docker](#121-installing-docker) +
    1.2.2 [Building and Pulling CRISPRme Docker Image](#122-building-and-pulling-crisprme-docker-image) +
2 [Usage](#2-usage) +
  2.1 [Directory Structure](#21-directory-structure) +
  2.2 [CRISPRme Functions](#22-crisprme-functions) +
     2.2.1 [Complete Search](#221-complete-search) +
     2.2.2 [Complete Test](#222-complete-test) +
     2.2.3 [Targets Integration](#223-targets-integration) +
     2.2.4 [GNOMAD Converter](#224-gnomad-converter) +
     2.2.5 [Generate Personal Card](#225-generate-personal-card) +
     2.2.6 [Web Interface](#226-web-interface) +
3 [Test](#3-test) +
  3.1 [Quick Test](#31-quick-test) +
  3.2 [Detailed Test](#32-detailed-test) +
     3.2.1 [Single Chromosome Test](#321-single-chromosome-test) +
     3.2.2 [Full Genome Test](#322-full-genome-test) +
4 [Citation](#4-citation) +
5 [Contacts](#5-contacts) +
6 [License](#6-license) -### Installation via Conda +## 0 System Requirements -If conda is not already available on your machine, the next section will describe how to obtain a fresh ```conda``` distribution. If ```conda``` is already available on your machine you can skip the next section and go to **Create CRISPRme conda environment** section. +To ensure optimal performance, CRISPRme requires the following: -#### Obtaining a fresh conda distribution -If conda is not already available in your environment you can get a fresh ```miniconda``` distribution. To obtain a fresh ```miniconda``` distribution, open a new terminal window and type: -``` -curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64 -bash Miniconda3-latest-Linux-x86_64.sh -``` -Press ENTER when requested and answer ```yes``` when required. Conda will set all the directories in your ```HOME``` path for an easy use. +- **Minimum Memory (RAM)**: 32 GB +
Suitable for typical use cases and smaller datasets. -Close the current terminal window and reopen it to allow the system to start ```conda```. If you see in the new window something similar to -``` -(base) user@nameofPC:~$ -``` -```conda``` was correctly installed and it is ready to run. +- **Recommended Memory for Large Analyses**: 64 GB or more +
Necessary for intensive operations such as whole-genome searches and
+  processing large variant datasets.
-The next step, will be a one-time set up of ```conda``` channels. To set up the channels type on your terminal:
-```
-conda config --add channels defaults
-conda config --add channels bioconda
-conda config --add channels conda-forge
-```
+
+For best results, confirm that your system meets or exceeds these specifications
+before running CRISPRme.
-#### Create CRISPRme conda environment
-To create the ```conda``` environment for CRISPRme, it is suggested to use ```mamba```. ```mamba``` is a drop-in replacement for conda that uses a faster dependency solving library and parts reimplemented in C++ for speed. To install ```mamba```, in your terminal window type:
-```
-conda install mamba -n base -c conda-forge
-```
+
+## 1 Installation
-Once installed ```mamba```, you are ready to build the CRISPRme environmet. To build the environment, type:
-```
-mamba create -n crisprme python=3.9.19 crisprme -y
-```
+
+This section outlines the steps to install CRISPRme, tailored to suit different
+operating systems. Select the method that best matches your setup:
-To activate the environmment, type:
-```
-conda activate crisprme
-```
+
+- [Install CRISPRme via Conda/Mamba (for Linux users)](#11-install-crisprme-via-condamamba)
+
+- [Install CRISPRme via Docker (compatible with all operating systems)](#12-install-crisprme-via-docker)
-To test the installation, type in your terminal window:
-```
-crisprme.py
-```
+
+Each method ensures a streamlined and efficient installation, enabling you to use
+CRISPRme with minimal effort. Follow the detailed instructions provided in the
+respective sections below.
+
+### 1.1 Install CRISPRme via Conda/Mamba
+---
+
+This section is organized into three subsections to guide you through the installation
+and maintenance of CRISPRme:
-If you see all CRISPRme's functionalities listed, you succesfully installed CRISPRme on your machine, and it is ready to be used on your machine.
+
+- [Installing Conda or Mamba](#111-installing-conda-or-mamba): +
This subsection provides step-by-step instructions to install either + Conda or Mamba. Begin here if you do not have these package managers installed + on your machine. -#### Updating CRISPRme conda installation -If you want to update an older CRISPRme installation to the latest version, we suggest updating as: +- [Installing CRISPRme](#112-installing-crisprme): +
Once you have Conda or Mamba set up, proceed to this subsection for detailed + instructions on creating the CRISPRme environment and installing the necessary + dependencies. + +- [Updating CRISPRme](#113-updating-crisprme): +
Learn how to update an existing CRISPRme installation to the latest version,
+  ensuring access to new features and bug fixes.
+
+#### 1.1.1 Installing Conda or Mamba
+---
+
+Before installing CRISPRme, ensure you have either Conda or Mamba installed on
+your machine. Based on recommendations from the Bioconda community, we highly
+recommend using Mamba over Conda. Mamba is a faster, more efficient drop-in replacement
+for Conda, leveraging a high-performance dependency solver and components
+optimized in C++.
+
+**Step 1: Install `Conda` or `Mamba`**
+
+- To install `Conda`, refer to the official installation guide: +
[Conda Installation Guide](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) + +- To install `Mamba`, refer to the official installation guide: +
[Mamba Installation Guide](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html)
+
+**Step 2: Configure Bioconda Channels**
+
+Once `Mamba` is installed, configure it to use Bioconda and related channels by
+running the following one-time setup commands:
+```bash
+mamba config --add channels defaults
+mamba config --add channels bioconda
+mamba config --add channels conda-forge
+mamba config --set channel_priority strict
+```
+
+> **Note:** If you prefer to use `Conda`, replace `mamba` with `conda` in the
+> commands above.
+
+By completing these steps, your system will be fully prepared for installing CRISPRme.
+
+#### 1.1.2 Installing CRISPRme
+---
+
+We strongly recommend using **`Mamba`** to create CRISPRme's `conda` environment
+due to its superior speed and reliability in dependency management. However, if
+you prefer `Conda`, you can replace `mamba` with `conda` in all the commands below.
+
+**Step 1: Create CRISPRme's Environment**
+
+Open a terminal and execute the following command:
+
+```bash
+mamba create -n crisprme python=3.9 crisprme -y  # Install CRISPRme and its dependencies
+```
+
+This command sets up a dedicated `conda` environment named `crisprme`, installing
+CRISPRme along with all required dependencies.
+
+**Step 2: Activate the Environment**
+
+To activate the newly created CRISPRme environment, type:
+
+```bash
+mamba activate crisprme  # Enable the CRISPRme environment
+```
+
+**Step 3: Test the Installation**
+
+To verify that CRISPRme is correctly installed, run the following commands in your terminal:
+
+```bash
+crisprme.py --version  # Display the installed CRISPRme version
+crisprme.py  # List CRISPRme functionalities
+```
+
+- The first command will output the version of CRISPRme (e.g., `2.1.6`).
+- The second command should display CRISPRme's functionalities.
+
+If both commands execute successfully, your installation is complete, and
+CRISPRme is ready to use.
+
+#### 1.1.3 Updating CRISPRme
+---
+
+To update an existing CRISPRme installation using `Mamba` or `Conda`, follow the
+steps below:
+
+**Step 1: Check the Latest Version**
+
+Visit the CRISPRme README to identify the latest version of the tool.
+
+**Step 2: Update CRISPRme**
+
+Run the following command in your terminal, replacing `<version>` with the
+desired version number:
+```bash
+mamba install crisprme=<version>  # Update CRISPRme to the specified version
+```
-For example:
-```
-mamba install crisprme==2.1.5
-```
+
+For example, to update CRISPRme to version `2.1.6`, execute:
+```bash
+mamba install crisprme=2.1.6
+```
+If you're using `Conda`, replace `mamba` with `conda` in the commands above.
+
+**Step 3: Verify the Update**
+
+After the update completes, ensure the installation was successful by checking the
+version:
+```bash
+crisprme.py --version  # Confirm the installed version
+```
-You can find the latest release indicated at the top of our [README](https://github.com/pinellolab/CRISPRme#crisprme).
-## Installation via Docker
-For OSX and Windows users is suggested to run CRISPRme via [Docker](https://www.docker.com/get-started). Follow the following links to install Docker on [OSX](https://docs.docker.com/docker-for-mac/install/) or [Windows](https://docs.docker.com/docker-for-windows/install/), and follow the on-screen instructions.
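+> **Tip:** As a quick alternative to checking the README for the latest
+> release, you can query the Bioconda channel directly. A minimal sketch,
+> assuming `conda` (or `mamba`) is available on your `PATH`:
+
+```bash
+# List the CRISPRme builds published on Bioconda; the highest version shown
+# is the latest release you can pin in the update command above.
+conda search -c bioconda crisprme
+```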
+### 1.2 Install CRISPRme via Docker +--- -If you plan to use CRISPRme via Docker on a Linux-based OS read and follow the instructions listed in the next section, skip it otherwise. +This section is organized into two subsections to guide you through the setup of +**CRISPRme** using Docker: -### Install CRISPRme using Docker -We assume you have already installed Docker on your system. Open a new terminal window and type: -``` -docker pull pinellolab/crisprme +- [Installing Docker](#121-installing-docker): +
Provides step-by-step instructions for installing Docker on your system, +ensuring compatibility with all operating systems, including Linux, macOS, and Windows. + +- [Building and Pulling CRISPRme Docker Image](#122-building-and-pulling-crisprme-docker-image): +
Explains how to create or download the CRISPRme Docker image to set up a + containerized environment for seamless execution. + +Follow the subsections in order if Docker is not yet installed on your machine. +If Docker is already installed, skip to the second subsection. + +#### 1.2.1 Installing Docker +--- + +MacOS and Windows users are encouraged to install [Docker](https://www.docker.com/get-started) +to use CRISPRme. Linux users may also choose Docker for convenience and compatibility. + +Docker provides tailored distributions for different operating systems. Follow the +official Docker installation guide specific to your OS: + +- [MacOS Installation Guide](https://docs.docker.com/docker-for-mac/install/) +- [Windows Installation Guide](https://docs.docker.com/docker-for-windows/install/) +- [Linux Installation Guide](https://docs.docker.com/engine/install/ubuntu/) + +**Linux-Specific Post-Installation Steps** + +If you're using Linux, additional configuration steps are required: +1. Create the Docker Group: + ```bash + sudo groupadd docker + ``` + +2. Add Your User to the Docker Group: + ```bash + sudo usermod -aG docker $USER + ``` + Repeat this command for any additional users you want to include in the Docker + Group. + +3. Restart Your Machine +
Log out and log back in, or restart your machine to apply the changes. + +**Testing Docker Installation** + +Once Docker is installed, verify the setup by opening a terminal window and typing: +```bash +docker run hello-world ``` -This command will download and install CRISPRme Docker image on your machine. -## Test CRISPRme -To test your CRISPRme installation, open a new terminal window and type: +If Docker is installed correctly, you should see output like this: ``` -mkdir crisprme_test && cd crisprme_test -wget https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/main/crisprme_auto_test_conda.sh - +Hello from Docker! +This message shows that your installation appears to be working correctly. + +To generate this message, Docker took the following steps: + 1. The Docker client contacted the Docker daemon. + 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. + 3. The Docker daemon created a new container from that image, which runs the executable that produces this output. + 4. The Docker daemon streamed this output to the Docker client, which displayed it on your terminal. + +For more examples and ideas, visit: + https://docs.docker.com/get-started/ ``` -This will download a script to download all the data necessary to run a full test using a single gRNA. -Once downloaded, enter the folder by typing: +#### 1.2.2 Building and Pulling CRISPRme Docker Image +--- -If you installed CRISPRme via ```conda```, test your conda installation by typing: +After installing Docker, you can download and build the CRISPRme Docker image by +running the following command in a terminal: +```bash +docker pull pinellolab/crisprme ``` -bash crisprme_auto_test_conda.sh + +This command retrieves the latest pre-built CRISPRme image from Docker Hub and sets +it up on your system, ensuring all required dependencies and configurations are +included. + +Once the download is complete, the CRISPRme Docker image will be ready for use. +To confirm the image is successfully installed, you can list all available Docker +images by typing: +```bash +docker images ``` -Otherwise, if you installed CRISPRme via Docker, test your Docker installation by typing: +Look for an entry similar to the following: ``` -bash crisprme_auto_test_docker.sh +REPOSITORY TAG IMAGE ID CREATED SIZE +pinellolab/crisprme latest ``` -After starting, the tests will download the required test data, then CRISPRme will start its analysis. -**NB** Depending on your hardware the test may take very different time to complete. +You are now ready to run CRISPRme using Docker. + +## 2 Usage + +CRISPRme is a tool designed for variant- and haplotype-aware CRISPR off-target +analysis. It integrates robust functionalities for off-target detection, +variant-aware search, and result analysis. The tool also includes a +user-friendly graphical interface, which can be deployed locally to streamline +its usage. + + +### 2.1 Directory Structure +--- + +CRISPRme operates within a specific directory structure to manage input data and +outputs efficiently. To ensure proper functionality, your working directory must +include the following main subdirectories: + +- **Genomes** + - **Purpose**: Stores reference genomes. + - **Structure**: Each reference genome resides in its own subdirectory. + - **Requirements**: The genome must be split into separate files, each + representing a single chromosome. + +- **VCFs** + - **Purpose**: Contains variant data in VCF format. 
+  - **Structure**: Similar to the Genomes directory, each dataset has a
+    dedicated subdirectory with VCF files split by chromosome.
+  - **Requirements**: Files must be compressed using `bgzip` (with a `.gz`
+    extension).
+
+- **samplesIDs**
+  - **Purpose**: Lists the sample identifiers corresponding to the VCF
+    datasets.
+  - **Structure**: Tab-separated files, one for each VCF dataset, specifying
+    the sample IDs.
+
+- **Annotations**
+  - **Purpose**: Provides genome annotation data.
+  - **Format**: Annotation files must be in BED format.
+
+- **PAMs**
+  - **Purpose**: Specifies the Protospacer Adjacent Motif (PAM) sequences for
+    off-target search.
+  - **Format**: Text files containing PAM sequences.
+
+The directory organization required by CRISPRme is illustrated below:
+![CRISPRme directory structure](crisprme_dirtree.png)
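+As a reference, the skeleton can be created from the command line. Below is a
+minimal sketch; the genome and VCF dataset names (`hg38`, `hg38_1000G`) are
+illustrative, not prescriptive:
+
+```bash
+# Create the top-level folders CRISPRme expects in its working directory
+mkdir -p Genomes/hg38 VCFs/hg38_1000G samplesIDs Annotations PAMs
+
+# Reference genome: one FASTA file per chromosome, e.g.
+#   Genomes/hg38/chr1.fa ... Genomes/hg38/chrX.fa
+
+# Variant datasets: one bgzip-compressed VCF per chromosome, e.g.
+#   VCFs/hg38_1000G/chr1.vcf.gz ... VCFs/hg38_1000G/chr22.vcf.gz
+```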
+ +### 2.2 CRISPRme Functions +--- + +This section provides a comprehensive overview of CRISPRme's core functions, +detailing each feature, the required input data and formats, and the resulting +outputs. The following is a summary of CRISPRme's key features: + +- [**Complete Search**](#221-complete-search) (`complete-search`) +
Executes a genome-wide off-target search across both reference and variant
+  datasets (if specified), conducts Cutting Frequency Determination (CFD) and
+  CRISTA analyses (if applicable), and identifies candidate targets.
+
+- [**Complete Test**](#222-complete-test) (`complete-test`) +
Tests the CRISPRme pipeline on a small input dataset or the full genome,
+  enabling users to validate the tool's functionality before performing
+  large-scale analyses.
+
+- [**Targets Integration**](#223-targets-integration) (`targets-integration`) +
Combines *in silico* predicted targets with experimental data to create a + finalized target panel. + +- [**GNOMAD Converter**](#224-gnomad-converter) (`gnomAD-converter`) +
Transforms GNOMAD VCFs (`vcf.bgz` format) into a format compatible with + CRISPRme. The function supports VCFs from GNOMAD v3.1, v4.0, and v4.1, + including *joint* VCFs. + +- [**Generate Personal Card**](#225-generate-personal-card) (`generate-personal-card`) +
Generates a personalized summary for a specific sample, identifying all + private off-targets unique to that individual. + +- [**Web Interface**](#226-web-interface) (`web-interface`) +
Launches CRISPRme's interactive web interface, allowing users to manage + and execute tasks directly via a local browser. + +#### 2.2.1 Complete Search +--- + +The **Complete Search** function performs an exhaustive variant- and +haplotype-aware off-target analysis, leveraging the provided reference genome +and variant datasets to deliver comprehensive results. This feature integrates +all critical stages of the CRISPRme pipeline, encompassing off-target +identification, functional annotation, and detailed reporting. + +Key highlights of the Complete Search functionality include: + +- **Variant- and Haplotype-Awareness** +
Accurately incorporates genetic variation, including population- and
+  sample-specific variants and haplotype data, to identify off-targets that
+  reflect real-world genomic diversity.
+
+- **Comprehensive Off-Target Discovery** +
Searches both the reference genome and user-specified variant datasets
+  for potential off-targets, including those containing mismatches and bulges.
+
+- **Functional Annotation** +
Annotates off-targets with relevant genomic features, such as + coding/non-coding regions, regulatory elements, and gene proximity. + +- **Detailed Reporting** +
Generates population-specific and sample-specific off-target summaries,
+  highlighting variations that may impact specificity or introduce novel PAM
+  sites. Provides CFD (Cutting Frequency Determination) and CRISTA scores, and
+  mismatch and bulge counts to rank off-targets based on their potential
+  impact. Includes graphical representations of findings to facilitate result
+  interpretation.
+
+- **Output Formats** +
Produces user-friendly output files, including text-based tables and + visualization-ready graphical summaries. + +Usage Example for the Complete Search function: +- **Via Conda/Mamba** + ```bash + crisprme.py complete-search \ + --genome Genomes/hg38 \ # reference genome directory + --vcf vcf_config.1000G.HGDP.txt \ # config file declaring usage of 1000G and HGDP variant datasets + --guide sg1617.txt \ # guide + --pam PAMs/20bp-NGG-spCas9.txt \ # NGG PAM file + --annotation Annotations/dhs+gencode+encode.hg38.bed \ # annotation BED + --gene_annotation Annotations/gencode.protein_coding.bed \ # gene proximity annotation BED + --samplesID samplesIDs.1000G.HGDP.txt \ # config file declaring usage of 1000G and HGDP samples + --be-window 4,8 \ # base editing window start and stop positions within off-targets + --be-base A,G \ # nucleotide to test base editing potential (A>G) + --mm 6 \ # number of max mismatches + --bDNA 2 \ # number of max DNA bulges + --bRNA 2 \ # number of max RNA bulges + --merge 3 \ # merge off-targets mapped within 3 bp in clusters + --sorting-criteria-scoring mm+bulges \ # prioritize within each cluster off-targets with highest score and lowest mm+bulges (CFD and CRISTA reports only) + --sorting-criteria mm,bulges \ # prioritize within each cluster off-targets with lowest mm and bulges counts + --output sg1617-NGG-1000G-HGDP \ # output directory name + --thread 8 # number of threads + ``` + +- **Via Docker** + ```bash + docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \ + crisprme.py complete-search \ + --genome Genomes/hg38 \ # reference genome directory + --vcf vcf_config.1000G.HGDP.txt \ # config file declaring usage of 1000G and HGDP variant datasets + --guide sg1617.txt \ # guide + --pam PAMs/20bp-NGG-spCas9.txt \ # NGG PAM file + --annotation Annotations/dhs+gencode+encode.hg38.bed \ # annotation BED + --gene_annotation Annotations/gencode.protein_coding.bed \ # gene proximity annotation BED + --samplesID samplesIDs.1000G.HGDP.txt \ # config file declaring usage of 1000G and HGDP samples + --be-window 4,8 \ # base editing window start and stop positions within off-targets + --be-base A,G \ # nucleotide to test base editing potential (A>G) + --mm 6 \ # number of max mismatches + --bDNA 2 \ # number of max DNA bulges + --bRNA 2 \ # number of max RNA bulges + --merge 3 \ # merge off-targets mapped within 3 bp in clusters + --sorting-criteria-scoring mm+bulges \ # prioritize within each cluster off-targets with highest score and lowest mm+bulges (CFD and CRISTA reports only) + --sorting-criteria mm,bulges \ # prioritize within each cluster off-targets with lowest mm and bulges counts + --output sg1617-NGG-1000G-HGDP \ # output directory name + --thread 8 # number of threads + ``` + +##### Input Arguments +--- + +Below is a detailed list of the input arguments required or optionally used by +the Complete Search function. Each parameter is explained to ensure clarity in +its purpose and usage: + +**General Parameters** +- `--help` +
Displays the help message with usage details and exits. Useful for quickly + referencing all available options. + +- `--output` (*Required*) +
Specifies the name of the output directory where all results from the + analysis will be saved. This directory will be created within the `Results` + directory. + +- `--thread` (*Optional - Default: 4*) +
Defines the number of CPU threads to use for parallel computation. + Increasing the number of threads can speed up analysis on systems with + multiple cores. + +- `--debug` (*Optional*) +
Runs the tool in debug mode. + +**Input Data Parameters** + +- `--genome` (*Required*) +
Path to the directory containing the reference genome in FASTA format. + Each chromosome must be in a separate file (e.g., `chr1.fa`, `chr2.fa`, etc.). + +- `--vcf` (*Optional*) +
Path to a text config file listing the directories containing VCF files to
+  be integrated into the analysis. When provided, CRISPRme conducts variant-
+  and haplotype-aware searches. If not specified, the tool searches only on the
+  reference genome.
+
+- `--guide` +
Path to a text file containing one or more guide RNA sequences (one per + line) to search for in the input genome and variants. This argument cannot be + used together with `--sequence`. + +- `--sequence` +
Path to a FASTA file listing guide RNA sequences. This argument is an
+  alternative to `--guide`; the two cannot be used simultaneously.
+
+- `--pam` (*Required*) +
Path to a text file specifying the PAM sequence(s) required for the + search. The file should define the PAM format (e.g., `NGG` for SpCas9). + +**Annotation Parameters (*Optional*)** + +- `--annotation` +
Path to a BED file containing genomic annotations, such as regulatory + regions (e.g., DNase hypersensitive sites, enhancers, promoters). These + annotations provide functional context for identified off-targets. + +- `--gene_annotation` +
Path to a BED file containing gene information, such as Gencode + protein-coding annotations. This is used to calculate the proximity of + off-targets to genes for downstream analyses. + +**Base Editing Parameters (*Optional*)** + +- `--be-window` +
Specifies the editing window for base editors, defined as start and stop + positions relative to the guide RNA (1-based, comma-separated). This defines + the region of interest for base-editing analysis. + +- `--be-base` +
Defines the target nucleotide(s) for base editing. This is only used when + base editing functionality is needed. + +**Sample-Specific Parameters (*Optional*)** + +- `--samplesID` +
Path to a text config file listing sample identifiers (one per line) + corresponding to VCF datasets. This enables sample-specific off-target + analyses. Mandatory if `--vcf` is specified. + +**Search and Merging Parameters** + +- `--mm` (*Required*) +
Maximum number of mismatches allowed during off-target identification. + +- `--bDNA` (*Required*) +
Maximum allowable DNA bulge size. + +- `--bRNA` (*Required*) +
Maximum allowable RNA bulge size. + +- `--merge` (*Optional - Default: 3*) +
Defines the window size (in base pairs) used to merge closely spaced
+  off-targets. Pivot targets are selected based on the highest score (e.g.,
+  CFD, CRISTA) or on the criteria defined by `--sorting-criteria`.
+
+- `--sorting-criteria-scoring` (*Optional - Default: `mm+bulges`*) +
Specifies sorting criteria for merging when using CFD/CRISTA scores. + Options include: + - mm: Number of mismatches. + - bulges: Total bulge size. + - mm+bulges: Combined mismatches and bulges. + +- `--sorting-criteria` (*Optional - Default: `mm+bulges,mm`*) +
Sorting criteria used when CFD/CRISTA scores are unavailable. Options are + similar to `--sorting-criteria-scoring` but tailored for simpler analyses. + +**Note 1**: Ensure compatibility between input files and genome builds (e.g., +hg38 or hg19) to avoid alignment issues. + +**Note 2**: Optional arguments can be omitted when not applicable, but required +arguments must always be specified. + +##### Output Data Overview +--- + +The Complete Search function generates a comprehensive suite of reports, +detailing the identified and prioritized targets, along with statistical and +graphical summaries. These outputs are essential for interpreting the search +results and understanding the impact of genetic diversity on CRISPR off-target. + +**Output Off-targets Files Description** + +1. `*.integrated_results.tsv` + + - **Contents**: A detailed file containing the top targets (`*.bestMerge.txt`) + enriched with annotations from the input files. Includes: + - Gene proximity of off-targets + - Overlaps with regulatory elements + + - **Purpose**: Integrates functional genomic context into the prioritization + of off-targets. + +2. `*.all_results_with_alternative_alignments.tsv` + + - **Contents**: Comprehensive listing of all identified targets, including + alternative alignments. Annotated with: + - Gene proximity + - Overlaps with regulatory elements + + - **Purpose**: Facilitates a full exploration of CRISPRme's off-target + predictions and their functional relevance. + +**Guide-Specific Summary Files** + +These files summarize off-target statistics per guide sequence based on +different sorting criteria. + +3. `*.summary_by_guide._CFD.txt` + + - **Contents**: Summarizes off-target counts per guide using the CFD score + as the primary sorting criterion (data derived from + `*.integrated_results.tsv`). Includes counts of: + - Targets by bulge type (DNA, RNA). + - Mismatch number and bulge size. + - Targets in the reference genome, variant genome, and those caused by + PAM creation due to variants. + + - **Purpose**: Provides insight into the distribution and characteristics of + off-targets prioritized by CFD score. + +4. `*.summary_by_guide._CRISTA.txt` + + - **Contents**: Summarizes off-target counts per guide using the CRISTA score + as the primary sorting criterion (data derived from + `*.integrated_results.tsv`). Includes counts of: + - Targets by bulge type (DNA, RNA). + - Mismatch number and bulge size. + - Targets in the reference genome, variant genome, and those caused by + PAM creation due to variants. + + - **Purpose**: Provides insight into the distribution and characteristics of + off-targets prioritized by CRISTA score. + +5. `*.summary_by_guide._fewest.txt` + + - **Contents**: Summarizes off-target counts per guide using the fewest + mismatches and bulges as the sorting criterion. + + - **Purpose**: Highlights off-targets that are closest to perfect matches, + providing an alternative prioritization method. + +**Sample-Specific Summary Files** + +These files focus on off-targets unique to individual samples and their +populations. + +6. `*.summary_by_samples._CFD.txt` + + - **Contents**: Counts of private off-targets per sample, sorted by CFD + score. Reports targets: + - Private to the sample. + - Found in the population or superpopulation. + - Resulting from PAM creation due to a variant. + + - **Purpose**: Quantifies sample-specific off-targets and their broader + population impact. + +7. 
`*.summary_by_samples._CRISTA.txt` + + - **Contents**: Similar to the CFD-based sample summary but uses CRISTA + score for sorting. + +8. `*.summary_by_samples._fewest.txt` + - **Contents**: Summarizes private off-targets using the fewest mismatches + and bulges as the sorting criterion. -Once downloaded and untared the folder, you will have a ready to use CRISPRme directory tree. -**NB DO NOT CHANGE ANY FOLDER NAME** to avoid losing data or forcing to recompute indexes and dictionaries. **YOU MUST USE THE DEFAULT FOLDERS TO STORE THE DATA** since the software have been designed to recognize only files and folders in its own folder tree (see **Usage** section). +**Graphical Output** +9. `imgs` directory -## Usage -CRISPRme is designed to work and recognize its specific directories tree structure. See the following image for a detailed explanantion of CRISPRme's folders structure -![fig2](./docs/readme/directory_tree.png) + - **Contents**: Contains visual representations of the top 1000 targets + based on CFD score, CRISTA score, and Fewest mismatches and bulges. Images + include: + - Bar plots showing the distribution of targets across populations and + bulge types. + - Graphical summaries illustrating the impact of genetic variants on + mismatches, bulge size, and scores. -
**CAVEAT.** Before running CRISPRme make sure that your system has **>= 64 GB of memory** available.
+
+  - **Purpose**: Facilitates easy interpretation and presentation of CRISPRme
+    results.
-The following sections will describe the main functionalities of CRISPRme, listing their input data, and the expected output.
+
+#### 2.2.2 Complete Test
+---
-#### Complete-search function
-```complete-search``` performs a complete search from scratch returing all the results and post-analysis data.
+
+The **Complete Test** module provides an automated pipeline for verifying the
+correct installation and functionality of CRISPRme. This feature is designed to
+simplify the validation process by automatically setting up the required
+directory structure, downloading essential files, and offering flexible testing
+options tailored to different user needs.
-**Input**:
-- Directory containing a reference genome (FASTA format). The reference genome must be separated into single chromosome files (e.g. chr1.fa, chr2.fa, etc.).
-- Text file storing path to the VCF directories [OPTIONAL]
-- Text file with a list of guides (1 to N)
-- Text file with a single PAM sequence
-- BED file with annotations, containing a list of genetic regions with a function associated
-- Text file containing a list of path to a samplesID file (1 to N) equal to the number of VCF dataset used [OPTIONAL]
-- Base editor window, used to specify the window to search for susceptibilty to certain base editor [OPTIONAL]
-- Base editor nucleotide(s), used to specify the base(s) to check for the choosen editor [OPTIONAL]
-- BED file extracted from Gencode data to find gene proximity of targets
-- Maximal number of allowed bulges of any kind to compute in the index genome
-- Threshold of mismatches allowed
-- Size of DNA bulges allowed
-- Size of RNA bulges allowed
-- Merge range, necessary to reduce the inflation of targets due to bulges, it's the window of bp necessary to merge one target into another maintaining the highest scoring one
-- Sorting criteria to use while merging targets based on CFD/CRISTA scores (scores have highest priority)
-- Sorting criteria to use while merging targets based on fewest mismatches+bulges
-- Output directory, in which all the data will be produced
-- Number of threads to use in computation
+
+This function automatically creates the CRISPRme directory structure required
+for the tool to function properly. Furthermore, it downloads and prepares all
+necessary files for testing, ensuring that users do not need to manually manage
+dependencies. The Complete Test module allows users to perform a test limited to a
+specific chromosome (e.g., chromosome 22), significantly reducing runtime and
+resource usage. However, the option to test on the entire genome is also
+available, ensuring all components of CRISPRme work correctly across large-scale
+datasets. This function supports testing with the 1000 Genomes Phase 3 dataset and
+the Human Genome Diversity Project (HGDP) dataset. Testing parameters, such as the
+genome dataset and test type, can be customized via command-line arguments,
+allowing users to tailor the testing process to their system capabilities and
+goals.
-**Output** +This module is suited for: -As a given genomic region may have multiple alignments, CRISPRme outputs two lists of candidate off-target sites: -- The "integrated_results" file reports a single candidate off-target per genomic region, merging all possible off-targets within 3 bp (by default, adjustable parameter), and integrates annotation information if provided. The nominated off-targets are selected and sorted based on highest CFD score (by default, users can select other criteria), and when the CFD score is identical, the reference alignment is favored over alternative alignments. CRISPRme only reports candidate off-targets corresponding to observed haplotypes. A more extensive description and explanation of all the columns in this main CRISPRme results file can be found in [Supplementary Table 1 of our CRISPRme paper](https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01257-y/MediaObjects/41588_2022_1257_MOESM1_ESM.pdf). -- The “all_results_with_alternative_alignments” file contains all the candiate off-targets not included in the first file. This file preserves alternative alignments as well as those containing other variants with lower CFD scores. +- **Initial Installation Verification**: Test whether CRISPRme has been +installed correctly and is functioning as expected before running actual +analyses. -Other outputs: -- Parameters data file, containing all the parameters used in the search -- Count and distribution files, containing all the data count file useful in the web-tool representation to generate main tables and view -- Integrated results and database, containing all the tabulated data with genes proximity analysis and database representation to rapid querying on the web-tool GUI -- Directory with raw targets, containing the un-processed results from the search, useful to recover any possible target found during the search -- Directory with images, containing all the images generated to be used with the web-tool +- **System Configuration Testing**: Validate the compatibility of CRISPRme with +different system configurations (e.g., varying thread counts, datasets). -**Example** -- via ```conda```: +- **Dataset Evaluation**: Test specific datasets (1000 Genomes Phase 3 or HGDP) +to confirm their suitability for the user’s research needs. + +Usage Example for the Complete Test function: +- **Via Conda/Mamba** + ```bash + crisprme.py complete-test \ + --chrom chr22 \ # test on chromosome 22 data only + --vcf_dataset 1000G # test using 1000G variants ``` - crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt/ --guide sg1617.txt --pam PAMs/20bp-NGG-spCas9.txt --annotation Annotations/gencode_encode.hg38.bed --samplesID list_samplesID.txt --be-window 4,8 --be-base A --gene_annotation Gencode/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --sorting-criteria-scoring mm+bulges --sorting-criteria mm+bulges,mm --output sg1617/ --thread 4 + +- **Via Docker** + ```bash + docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \ + crisprme.py complete-test \ + --chrom chr22 \ # test on chromosome 22 data only + --vcf_dataset 1000G # test using 1000G variants ``` -- via Docker: + +##### Input Arguments +--- + +Below is a detailed list of the input arguments required by the Complete Test +function, including detailed explanations and default behaviors: + +**General Parameters** + +- `--help` +
Displays the help message with details about the available options and + exits the program. + +- `--thread` (*Optional - Default: 4*) +
Specifies the number of threads to use for the computation, allowing for + parallel processing. + +- `--debug` (*Optional*) +
Runs the tool in debug mode. + +**Input Data Parameters (*Optional*)** + +- `--chrom` +
Specifies the chromosome to be used in the CRISPRme test. The chromosome + identifier must be prefixed with chr (e.g., chr22). When this argument is + provided, the test will be limited to the specified chromosome, enabling + faster execution for targeted validation. *Default behavior*: Run the test + across the entire genome. + +- `--vcf_dataset` +
Defines the variant dataset to be utilized for testing CRISPRme.
+  Available options include:
+
+  - `1000G`: Uses the 1000 Genomes Phase 3 dataset.
+
+  - `HGDP`: Uses the Human Genome Diversity Project dataset.
+
+  *Default behavior*: Use 1000 Genomes variant data.
+
+##### Output Data Overview
+---
+
+The results generated by the Complete Test function are saved in a subdirectory
+within the Results directory, named `crisprme-test-*` (with `*` replaced by a
+unique identifier).
+
+The output folder contains the same set of files produced by the
+Complete Search functionality, including:
+
+- Detailed target reports that prioritize off-target candidates based on
+various criteria.
+
+- Summaries categorized by guides and samples.
+
+- Graphical representations of results, such as bar plots and impact
+assessments of genetic variants on target metrics.
+
+For a detailed description of the contents of these files, please refer to the
+corresponding subsection in [Complete Search](#221-complete-search).
+
+#### 2.2.3 Targets Integration
+---
+
+The **Targets Integration** function enables the seamless combination of
+computationally predicted off-targets identified by CRISPRme with
+experimentally validated off-targets, such as those obtained from GUIDE-seq,
+CIRCLE-seq, or other high-throughput methods. This integration enhances the
+interpretability and reliability of CRISPRme's results by merging empirical
+data with *in silico* predictions.
+
+This module supports integration with user-provided datasets in `BED` format,
+enabling flexibility in validation sources.
+
+Usage Example for the Targets Integration function:
+- **Via Conda/Mamba**
+  ```bash
+  crisprme.py targets-integration \
+    --targets results.integrated_results.tsv \ # search results
+    --empirical_data empirical_data.bed \ # empirical data BED
+    --output integrated_targets_dir # output directory
+  ```
+
+- **Via Docker**
+  ```bash
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py targets-integration \
+    --targets results.integrated_results.tsv \ # search results
+    --empirical_data empirical_data.bed \ # empirical data BED
+    --output integrated_targets_dir # output directory
+  ```
-#### Targets-integration function
-```targets-integration``` returns an ```integrated_result``` file with paired empirical targets from an ```integrated_results``` file.
+
+##### Input Arguments
+---
+
+Below is a detailed list of the input arguments required by the Targets
+Integration function, including detailed explanations and default behaviors:
+
+**General Parameters**
+
+- `--help` +
Displays the help message with details about the available options and + exits the program. + +- `--debug` (*Optional*) +
Runs the tool in debug mode. + +**Input Data Parameters** + +- `--targets` (*Required*) +
Specifies the file containing targets identified and processed from a + CRISPRme search. This file should include predicted off-target data such as + mismatch counts, bulge sizes, and scores (e.g., CFD, CRISTA). + +- `--empirical_data` (*Required*) +
Path to a `BED` file containing empirically validated off-targets. This + file typically includes genomic coordinates and additional metadata from + experiments such as GUIDE-seq or CIRCLE-seq. Ensure that the `BED` file + format adheres to standard conventions for compatibility. + +- `--output` (*Required*) +
Name of the directory where the resulting integrated targets file will be + saved. + +##### Output Data Overview +--- + +The Targets Integration module generates output files that merge +computationally predicted targets from CRISPRme with experimentally validated +off-targets provided in the BED file. The integrated data is stored in the +specified output directory and includes the `integrated_results.tsv`. This +tab-separated file contains the combined list of off-target sites. Each entry +represents a merged record from CRISPRme predictions and empirical data, where +overlaps are identified based on genomic coordinates. + +#### 2.2.4 GNOMAD Converter +--- + +The **GNOMAD Converter** function is a utility designed to preprocess and +convert GNOMAD VCF files into a format compatible with CRISPRme for off-target +analysis. This tool facilitates the inclusion of population-level genetic +variation data from GNOMAD into CRISPRme. + +The converter currently supports VCF files from the following GNOMAD versions: +- **v3.1** +- **v4.0** +- **v4.1**, including joint VCF files (exomes + genomes). + +Since individual sample data are not available in GNOMAD VCFs, the tool relies +on population-level groupings. Populations are treated as individual "samples" +for the purpose of conversion. This approach supports population-based +statistical analyses in CRISPRme, such as identifying population-specific +off-targets. + +**Note 1**: For studies requiring sample-specific statistics, GNOMAD is not +recommended due to its population-level nature. -**Input** -- Integrated results from a search, containing the processed targets -- BED file containing empirical verified OT, like via GUIDE-seq, CIRCLE-seq and other sequencing protocols -- Output directory, in which the integrated result file with empirical data will be created +**Note 2**: The GNOMAD Converter function is particularly useful for creating +GNOMAD-compatible datasets formatted for CRISPRme's population-aware off-target +analysis. -**Output** -- Directory containing the integrated result with each target pair with an existing empirical target (if found) +**Note 3**: Since GNOMAD provides population-level data rather than +individual-level data, CRISPRme interprets populations as pseudo-individuals. +This approach allows meaningful population-level statistics but is not suitable +for applications requiring individual-level granularity. -**Example** -- via ```conda```: +To ensure a smooth conversion process, sample ID files compatible with GNOMAD +VCFs must be provided. These files are available for download from the CRISPRme +GitHub repository: + +- [Sample IDs file for GNOMAD v3.1 and v4.0](https://github.com/pinellolab/CRISPRme/blob/v216/test/data/samplesIDs.gnomad.v40.txt) + +- [Sample IDs file for GNOMAD v4.1](https://github.com/pinellolab/CRISPRme/blob/v216/test/data/samplesIDs.gnomad.v41.txt) + +The conversion process preserves all variant information necessary for CRISPRme +analyses, including allele frequencies and genotypes (if applicable). 
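+As an illustrative post-conversion sanity check, you can confirm that GNOMAD
+populations now appear as pseudo-sample columns in a converted VCF. A minimal
+sketch, assuming a converted file named `gnomad.chr22.biallelic.vcf.gz`
+(hypothetical name, following the conventions described below):
+
+```bash
+# Print the #CHROM header line of a converted VCF, one field per line: the
+# fields after FORMAT should list the populations used as pseudo-samples.
+zcat gnomad_vcf_dir/gnomad.chr22.biallelic.vcf.gz | grep -m 1 "^#CHROM" | tr '\t' '\n'
+```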
+
+Usage Example for the GNOMAD Converter function:
+- **Via Conda/Mamba**
+  ```bash
+  # --gnomAD_VCFdir: directory containing GNOMAD VCFs
+  # --samplesID: GNOMAD v4.1 samples file
+  # --keep: keep variants with FILTER value different from PASS
+  # --thread: number of threads
+  crisprme.py gnomAD-converter \
+    --gnomAD_VCFdir gnomad_vcf_dir \
+    --samplesID samplesIDs.gnomad.v41.txt \
+    --keep \
+    --thread 4
+  ```
- crisprme.py targets-integration --targets *integrated_results.tsv --empirical_data empirical_data.tsv --output dir/
+
+- **Via Docker**
+  ```bash
+  # --gnomAD_VCFdir: directory containing GNOMAD VCFs
+  # --samplesID: GNOMAD v4.1 samples file
+  # --keep: keep variants with FILTER value different from PASS
+  # --thread: number of threads
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py gnomAD-converter \
+    --gnomAD_VCFdir gnomad_vcf_dir \
+    --samplesID samplesIDs.gnomad.v41.txt \
+    --keep \
+    --thread 4
+  ```
-- via Docker:
+
+##### Input Arguments
+---
+
+Below is a detailed list of the input arguments accepted by the GNOMAD
+Converter function, with explanations and default behaviors:
+
+**General Parameters**
+
+- `--help`
+
Displays the help message with details about the available options and + exits the program. + +- `--thread` (*Optional - Default: 4*) +
Specifies the number of threads to use for the computation, allowing for + parallel processing. + +- `--debug` (*Optional*) +
Runs the tool in debug mode. + +**Input Data Parameters** + +- `--gnomAD_VCFdir` (*Required*) +
Specifies the directory containing the gnomAD VCF files to be processed
+ and converted into a format compatible with CRISPRme. As for all VCFs used
+ by CRISPRme, the files must be bgzip-compressed and split by chromosome; an
+ illustrative layout is shown below.
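+
+ A purely illustrative layout for `gnomad_vcf_dir` (the file names follow
+ GNOMAD's public v4.1 release naming and are an example, not a requirement):
+
+ ```
+ gnomad_vcf_dir/
+ ├── gnomad.genomes.v4.1.sites.chr1.vcf.bgz
+ ├── gnomad.genomes.v4.1.sites.chr2.vcf.bgz
+ ├── ...
+ └── gnomad.genomes.v4.1.sites.chr22.vcf.bgz
+ ```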
+- `--samplesID` (*Required*)
+ Path to a text file containing the sample IDs used during the conversion
+ process. In this file, GNOMAD populations are treated as pseudo-individual
+ samples to create population-based VCFs for CRISPRme.
+
+- `--joint` (*Optional*)
+
Use this flag if the GNOMAD VCFs being processed are joint VCFs, such as
+ GNOMAD v4.1 joint variant files. *Default behavior*: Assumes the input VCFs
+ are not joint.
+
+- `--keep` (*Optional*)
+
Use this flag to retain all variants during the conversion, regardless of + the FILTER field value. *Default behavior*: Excludes variants that do not have + a `PASS` value in the FILTER field. + +- `--multiallelic` (*Optional*) +
Indicates whether to merge variants at the same genomic position into a + single multiallelic record. *Default behavior*: Keeps variants as biallelic + and does not merge them. + +##### Output Data Overview +--- + +The output of the GNOMAD Converter function consists of converted VCF files +formatted to be compatible with CRISPRme. These files are stored in the same +directory as the input VCF files. The conversion process ensures that the output +adheres to CRISPRme's input specifications for population-level analysis. Below +are details about the generated data: + +**File Naming Conventions** + +- **Multiallelic Variant Merging** (`--multiallelic`) +
If multiallelic entries were generated by merging variants at the same
+ position, the filenames will include the tag `*.multiallelic.*`; otherwise,
+ they will include `*.biallelic.*`.
+
+- **Joint Variant Files** (`--joint`)
+
If joint VCFs were processed, the filenames will be labeled accordingly, + for example, `*.joint.*`. + +**Content of the Converted VCFs** + +- **Population Representation** +
Each output VCF treats GNOMAD populations as pseudo-individual samples, + enabling CRISPRme to perform population-based statistical analysis. This + structure is reflected in the sample columns of the output VCFs. + +- **Variant Quality** +
If the `--keep` flag is used, all variants from the input VCF are included,
+ regardless of their quality as indicated in the FILTER field. Without the
+ `--keep` flag, only variants with a `PASS` in the FILTER field are retained,
+ as sketched below.
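+
+ For intuition, a sketch of how the FILTER field drives this behavior (the
+ non-PASS labels are examples of GNOMAD filter annotations; the records are
+ hypothetical):
+
+ ```
+ #CHROM  POS       REF  ALT  FILTER
+ chr22   10515882  A    G    PASS     <- always retained
+ chr22   10515901  C    T    AC0      <- retained only with --keep
+ chr22   10516004  G    A    AS_VQSR  <- retained only with --keep
+ ```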
+- **Allele Representation**
+ By default, the converter preserves biallelic representation, creating one
+ row per variant. If the `--multiallelic` flag is used, variants at the same
+ position are merged into multiallelic entries, as illustrated below.
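+
+ A minimal illustration of the two representations (hypothetical records):
+
+ ```
+ # biallelic (default): one row per alternate allele
+ chr22  10515882  A  G
+ chr22  10515882  A  T
+
+ # multiallelic (--multiallelic): alternate alleles merged at the same position
+ chr22  10515882  A  G,T
+ ```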
+- **Compatible Structure**
+ The output files are structured to align with CRISPRme's population-aware
+ off-target analysis, ensuring seamless integration into the tool's pipeline.
+
+
+#### 2.2.5 Generate Personal Card
+---
+
+The **Generate Personal Card** functionality creates a **sample-specific**
+report, referred to as a **personal card**. This report details all off-targets
+identified by CRISPRme for a given sample's unique genomic sequence. The report
+accounts for private variants (genetic differences specific to the sample not
+shared with other populations or individuals) and their potential impact on
+off-target editing outcomes.
+
+This feature is particularly useful in scenarios where personalized
+gene-editing strategies are required, such as analyzing how private genetic
+variations influence the efficacy and safety of CRISPR-based interventions. The
+personal card provides the following key insights:
+
+- Identification of off-target sequences present exclusively in the sample due
+to private variants. This allows researchers to evaluate potential risks or
+opportunities unique to the individual’s genome.
+
+- Detailed information for each input guide, showing how private variants
+affect off-target sequences.
+
+This functionality is a critical tool for advancing personalized medicine and
+precision genome editing, enabling researchers and clinicians to tailor
+CRISPR-based solutions to an individual’s unique genetic makeup.
+
+Usage Example for the Generate Personal Card function:
+- **Via Conda/Mamba**
+  ```bash
+  # --result_dir: results directory from a previous search
+  # --guide_seq: guide sequence
+  # --sample_id: sample ID
+  crisprme.py generate-personal-card \
+    --result_dir Results/sg1617.6.2.2 \
+    --guide_seq CTAACAGTTGCTTTTATCACNNN \
+    --sample_id NA21129
+  ```
- docker run -v ${PWD}:/DATA -w /DATA -i i pinellolab/crisprme crisprme.py targets-integration --targets *integrated_results.tsv --empirical_data empirical_data.tsv --output dir/
+
+- **Via Docker**
+  ```bash
+  # --result_dir: results directory from a previous search
+  # --guide_seq: guide sequence
+  # --sample_id: sample ID
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py generate-personal-card \
+    --result_dir Results/sg1617.6.2.2 \
+    --guide_seq CTAACAGTTGCTTTTATCACNNN \
+    --sample_id NA21129
+  ```
-#### gnomAD-converter function
-```gnomAD-converter``` converts a set of gnomADv3.1 VCFs into compatible VCFs.
+##### Input Arguments
+---
+
+Below is a detailed list of the input arguments accepted by the Generate
+Personal Card function, with explanations and default behaviors:
+
+**General Parameters**
-**Input**
-- gnomAD_VCFdir, used to specify the directory containing gnomADv3.1 original VCFs
-- samplesID, used to specify the pre-generated samplesID file necessary to introduce samples into gnomAD variant
-- thread, the number of threads used in the process (default is ALL available minus 2)
+- `--help`
+
Displays the help message with details about the available options and + exits the program. -**Output** -- original gnomAD directory with the full set of gnomAD VCFs converted to compatible format +- `--debug` (*Optional*) +
Runs the tool in debug mode.
+
+**Input Data Parameters**
+
+- `--result_dir` (*Required*)
+
Specifies the directory containing the CRISPRme search results. The tool + extracts the relevant targets from the reports available in this directory. + Ensure this path includes all necessary output files generated by CRISPRme + during the analysis (`*.integrated_results.*`). + +- `--guide_seq` (*Required*) +
The sgRNA sequence for which the sample-specific targets will be + extracted. This argument ensures that the generated personal card is tailored + to a specific guide of interest, enabling targeted analysis. + +- `--sample_id` (*Required*) +
The identifier of the sample for which the personal card will be created.
+ This ID corresponds to the unique genetic profile being analyzed. Ensure the
+ sample ID matches the format used in the input data to avoid discrepancies.
+
+##### Output Data Overview
+---
+
+The Generate Personal Card functionality produces sample-specific outputs,
+allowing researchers to assess how private genetic variants influence CRISPR
+off-target activity.
+
+**Note 1**: All output files include the sample ID in their file names for easy
+identification and traceability (e.g., `*.<sample_id>.*`).
+
+**Note 2**: Outputs are tailored to the specified guide sequence and sample ID,
+ensuring precise and personalized reporting.
+
+**Output Off-Target Files Description**
+
+- `*.<guide>.<sample_id>.private_targets.tsv`
+
A detailed table reporting all off-target sequences specific to the
+ sample, extracted from the `*.integrated_results.*` file within the input
+ directory. The report is generated in the results directory specified via
+ `--result_dir` and can be previewed as sketched below.
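+
+ As a quick sanity check, the tab-separated report can be previewed directly
+ from the shell (the glob assumes the naming convention above; `NA21129` is
+ the sample used in the usage example):
+
+ ```bash
+ # pretty-print the first rows of the sample-specific report
+ head -n 5 *NA21129.private_targets.tsv | column -t -s $'\t'
+ ```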
+**Graphical Output**
+
+- `imgs` directory
+ The function generates plots illustrating the effect of private genetic
+ variants on the sample-specific targets. These plots display changes in the
+ CFD and CRISTA scores, as well as in the number of mismatches and bulges,
+ highlighting how genetic variants influence off-target risk.
+
+#### 2.2.6 Web Interface
+---
+
+The **Web Interface** module offers a user-friendly, locally hosted graphical
+user interface (GUI) for CRISPRme. This feature replicates the functionality of
+CRISPRme's online platform, enabling users to execute CRISPRme workflows and
+explore search results interactively without requiring an internet connection.
+
+The GUI allows users to submit CRISPRme jobs directly through the web interface.
+Users can upload input files, configure parameters, and monitor progress with
+ease.
+
+The interface provides an intuitive way to explore the output files generated by
+CRISPRme. Users can filter targets by criteria such as mismatch count, bulge
+size, or scores (e.g., CFD or CRISTA). The interface includes dynamic plots and
+charts. Results are presented in a structured, easy-to-navigate format, linking
+data to relevant genomic annotations. The web interface runs as a local server,
+ensuring data privacy and fast response times. Users can access it via their
+preferred web browser (for local installations the server listens on port 8080,
+per the `PORTLOCAL` setting in `index.py`), with compatibility confirmed for
+Google Chrome, Mozilla Firefox, and Safari. All functionalities are
+self-contained, eliminating the need for an internet connection. This is
+particularly useful for secure environments or systems without reliable
+internet access.
+
+Usage example for the Web Interface function:
+
+- **Via Conda/Mamba**
+  ```bash
+  crisprme.py web-interface # Starts the local server and launches the web interface
+  ```
- crisprme.py gnomAD-converter --gnomAD_VCFdir gnomad_dir/ --samplesID samplesIDs/hg38_gnomAD.samplesID.txt -thread 4
+
+- **Via Docker**
+  ```bash
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py web-interface # Starts the local server and launches the web interface
+  ```
-- via Docker:
+
+##### Input Arguments
+---
+
+Below is a detailed list of the input arguments required by the Web Interface
+function:
+
+- `--help`
+
Displays the help message and exits. + +- `--debug` (*Optional*) +
Launches the local server in debug mode. This mode enables verbose logging,
+ which is useful for troubleshooting and diagnosing issues during setup or
+ operation. It provides detailed error messages and runtime information in the
+ console.
+
+##### Output Data Overview
+---
+
+This function does not generate output data in the form of files or reports.
+Instead, it serves to launch a local graphical user interface, which allows
+users to interactively explore and run CRISPRme analyses. All results are
+displayed dynamically within the web interface itself, offering an interactive
+experience for viewing CRISPRme data and outputs (see [Section 2.2.1](#221-complete-search)
+for details).
+
+## 3 Test
+
+This section describes how to test CRISPRme, ensuring that the software is
+correctly installed, configured, and ready for use. Two testing options are
+available, tailored to user needs:
+
+- [Quick Test](#31-quick-test)
+
A lightweight test to verify the installation and basic functionality. + +- [Detailed Test](#32-detailed-test) +
A comprehensive pipeline test, replicating a full-scale CRISPRme analysis.
+
+For persistent issues, refer to the
+[CRISPRme GitHub Issues Page](https://github.com/pinellolab/CRISPRme/issues)
+or contact the maintainers.
+
+### 3.1 Quick Test
+---
+
+The Quick Test provides a simple and efficient way to confirm that CRISPRme is
+correctly installed and operational on your system. It ensures that the
+software is accessible, dependencies are properly configured, and the version
+matches the expected release.
+
+**Step 1: Verify CRISPRme Installation**
+
+Open a terminal and execute the following command to check the software version:
+
+- **Via Conda/Mamba**
+  ```bash
+  crisprme.py --version # Expected output: v2.1.6
+  ```
- docker run -v ${PWD}:/DATA -w /DATA -i i pinellolab/crisprme crisprme.py gnomAD-converter --gnomAD_VCFdir gnomad_dir/ --samplesID samplesIDs/hg38_gnomAD.samplesID.txt -thread 4
+
+- **Via Docker**
+  ```bash
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py --version # Expected output: v2.1.6
+  ```
-#### Generate-personal-card function
-```generate-personal-card``` generates a personal card for a specified input sample.
+If the output displays the correct software version (e.g., `v2.1.6`), CRISPRme
+is successfully installed and ready for use.
-**Input**
-- result_dir, directory containing the result from which extract the targets to generate the card
-- guide_seq, sequence of the guide to use in order to exctract the targets
-- sample_id, ID of the sample to use in order to generate the card
+**Step 2: Access CRISPRme Help Menu**
-**Output**
-- Set of plots generated with personal and private targets containing the variant CFD score and the reference CFD score
-- Filtered file with private targets of the sample directly extracted from integrated file
+To explore the functionalities and input parameters of CRISPRme, use the
+`--help` flag:
-**Example**
-- via ```conda```:
+- **Via Conda/Mamba**
+  ```bash
+  crisprme.py --help
+  ```
- crisprme.py generate-personal-card --result_dir Results/sg1617.6.2.2/ --guide_seq CTAACAGTTGCTTTTATCACNNN --sample_id NA21129
+- **Via Docker**
+  ```bash
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py --help
+  ```
-- via Docker
+
+The help menu provides detailed descriptions of CRISPRme's features and usage
+instructions, enabling users to familiarize themselves with available options
+and workflows.
+
+**Why Perform the Quick Test?**
+
+The Quick Test is highly recommended:
+
+1. Immediately after installation to ensure the setup is correct.
+
+2. Before running full-scale analyses to verify dependencies and environment
+configurations.
+
+If the Quick Test completes without errors, you can confidently proceed to
+detailed analyses using CRISPRme’s powerful tools and features.
+
+### 3.2 Detailed Test
+---
+
+For a more comprehensive validation of CRISPRme’s functionality, the Detailed
+Test exercises the main CRISPRme pipeline, specifically its Complete Search
+module. Two options are available: a quicker test focused on a single
+chromosome, and an exhaustive test across the entire genome:
+
+- **Single Chromosome Test**
+
This test runs a search for potential off-targets on a single chromosome, + such as chromosome 22, enriched with variants from the 1000 Genomes or Human + Genome Diversity Project (HGDP) datasets. This provides a fast way to check + CRISPRme's ability to process variant data and identify off-targets for a + specific chromosome. + +- **Full Genome Test** +
This test checks CRISPRme's ability to search for potential off-targets
+ across the entire human genome. It runs a search using the sg1617 guide RNA
+ with an NGG PAM site, incorporating both Gencode and ENCODE annotations to
+ ensure comprehensive results.
+
+**Why Perform the Detailed Test?**
+
+The Detailed Test is ideal for users who wish to:
+
+- Fully assess CRISPRme’s performance across the genome.
+- Perform a more comprehensive check on CRISPRme’s ability to handle
+large-scale data and complex analyses.
+
+Successful completion of the detailed test confirms the full functionality of
+CRISPRme, ensuring it is ready to handle large datasets and complex genetic
+analysis tasks.
+
+#### 3.2.1 Single Chromosome Test
+---
+
+To run the quicker test on chromosome 22 using the 1000 Genomes dataset,
+execute the following commands:
+
+- **Via Conda/Mamba**
+  ```bash
+  crisprme.py complete-test \
+    --chrom chr22 \
+    --vcf_dataset 1000G # to test on HGDP replace '1000G' with 'HGDP'
+  ```
- docker run -v ${PWD}:/DATA -w /DATA -i i pinellolab/crisprme crisprme.py generate-personal-card --result_dir Results/sg1617.6.2.2/ --guide_seq CTAACAGTTGCTTTTATCACNNN --sample_id NA21129
+
+- **Via Docker**
+  ```bash
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py complete-test \
+    --chrom chr22 \
+    --vcf_dataset 1000G # to test on HGDP replace '1000G' with 'HGDP'
+  ```
-#### Web-interface function (only via conda)
-```web-interface``` starts a local server to use CRISPRme's web interface.
+#### 3.2.2 Full Genome Test
+---
-**Example**
-- via ```conda```
+To run the detailed test across the entire genome using the 1000 Genomes
+variants, execute the following commands:
+
+- **Via Conda/Mamba**
+  ```bash
+  crisprme.py complete-test \
+    --vcf_dataset 1000G # to test on HGDP replace '1000G' with 'HGDP'
+  ```
- crisprme.py web-interface
+
+- **Via Docker**
+  ```bash
+  docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme \
+    crisprme.py complete-test \
+    --vcf_dataset 1000G # to test on HGDP replace '1000G' with 'HGDP'
+  ```
-## Citation
-If you use CRISPRme in your research, please cite our paper [(shareable link to full text)](https://rdcu.be/c1GYQ):
-Cancellieri S, Zeng J, Lin LY, Tognon M, Nguyen MA, Lin J, ... Giugno R, Bauer DE, Pinello L. Human genetic diversity alters off-target outcomes of therapeutic gene editing. Nature Genetics, 55, 34–43 (2023). [https://doi.org/10.1038/s41588-022-01257-y.](https://doi.org/10.1038/s41588-022-01257-y) PMID: 36522432.
+
+## 4 Citation
+If you use CRISPRme in your research, please cite our [paper](https://rdcu.be/c1GYQ):
+
+Cancellieri S, Zeng J, Lin LY, Tognon M, Nguyen MA, Lin J, Bombieri N, Maitland
+SA, Ciuculescu MF, Katta V, Tsai SQ, Armant M, Wolfe SA, Giugno R, Bauer DE,
+Pinello L.
Human genetic diversity alters off-target outcomes of therapeutic +gene editing. Nat Genet. 2023 Jan;55(1):34-43. +doi: [10.1038/s41588-022-01257-y](https://doi.org/10.1038/s41588-022-01257-y). +Epub 2022 Dec 15. PMID: 36522432; PMCID: PMC10272994. + +## 5 Contacts + +- Luca Pinello +
lpinello@mgh.harvard.edu + +- Rosalba Giugno +
rosalba.giugno@univr.it + +- Daniel Bauer +
bauer@bloodgroup.tch.harvard.edu + +## 6 License -## License -AGPL-3.0 (academic research only). +CRISPRme is licensed under the **AGPL-3.0** license, which permits its use for +**academic research purposes only**. -For-profit institutions must purchase a license before using CRISPRme. -Contact lpinello@mgh.harvard.edu for further details. +For any commercial or for-profit use, a separate license must be obtained. For +further information regarding licensing for commercial purposes, please contact +**Luca Pinello** at lpinello@mgh.harvard.edu. \ No newline at end of file diff --git a/TODO.txt b/TODO.txt deleted file mode 100755 index d490904..0000000 --- a/TODO.txt +++ /dev/null @@ -1 +0,0 @@ -TEST WITH AWK ON BIGGER DATA FOR EXTRACTION, SELECTION AND FILTERING (MAYBE ISSUE WITH TIME EXEC??) \ No newline at end of file diff --git a/app.py b/app.py index 3bec32e..7c83465 100755 --- a/app.py +++ b/app.py @@ -82,7 +82,7 @@ def _start_message() -> None: ["eq ", "="], ["contains "], ] -ONLINE = True # NOTE change to True for online version, False for offline +ONLINE = False # NOTE change to True for online version, False for offline DISPLAY_OFFLINE = "" DISPLAY_ONLINE = "" if ONLINE: diff --git a/assets/readme/crisprme-logo.png b/assets/readme/crisprme-logo.png new file mode 100644 index 0000000..004421a Binary files /dev/null and b/assets/readme/crisprme-logo.png differ diff --git a/assets/readme/crisprme_dirtree.png b/assets/readme/crisprme_dirtree.png new file mode 100644 index 0000000..d517fec Binary files /dev/null and b/assets/readme/crisprme_dirtree.png differ diff --git a/clean_all.sh b/clean_all.sh deleted file mode 100755 index 331bf7f..0000000 --- a/clean_all.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -#clean all directories to state 0 - -#clean genomes from variant genome and pending data -rm -rf Genomes/hg38+* -rm -rf Genomes/variants* - -#clean results -rm -rf Results/* - -#clean dictionaries -rm -rf Dictionaries/* - -#remove genome_library -rm -rf genome_library/ - diff --git a/crisprme.py b/crisprme.py index 666e653..05d0b24 100755 --- a/crisprme.py +++ b/crisprme.py @@ -9,7 +9,7 @@ import re -version = "2.1.5" # CRISPRme version; TODO: update when required +version = "2.1.6" # CRISPRme version; TODO: update when required __version__ = version script_path = os.path.dirname(os.path.abspath(__file__)) @@ -1011,6 +1011,8 @@ def print_help_gnomad_converter(): "\t--samplesID, specifies the precomputed sample IDs file necessary " "for incorporating population-specific information into the output " "VCFs\n" + "\t--joint, optional flag to specify that the input gnomAD VCFs contain " + "joint allele frequencies\n" "\t--keep, optional flag to retain all variants, regardless of their " "filter flag.
By default, variants with a filter flag different from " "PASS are discarded\n" @@ -1073,6 +1075,8 @@ def gnomAD_converter(): raise FileNotFoundError(f"Unable to locate {samples_ids}") except IndexError as e: raise ValueError("Please input some parameter for flag --samplesID") from e + # read joint gnomad vcf files + joint = "--joint" in args # read keep arg keep = "--keep" in args # keep all variants regardless of filter label # read multiallelic arg @@ -1091,8 +1095,8 @@ def gnomAD_converter(): script_path.replace("PostProcess", "src"), "convert_gnomAD_vcfs.py" ) cmd = ( - f"python {gnomad_converter_script} {gnomad_dir} {samples_ids} {keep} " - f"{multiallelic} {threads}" + f"python {gnomad_converter_script} {gnomad_dir} {samples_ids} {joint} " + f"{keep} {multiallelic} {threads}" ) code = subprocess.call(cmd, shell=True) if code != 0: @@ -1207,8 +1211,10 @@ def print_help_complete_test(): "chromosomes\n" "\t--vcf_dataset, VCFs dataset to be used during CRISPRme testing. " "Available options include 1000 Genomes (1000G) and Human Genome " - "Diversity Project (HGDP). The default dataset is 1000 Genomes.\n" - "\t--debug, debug mode\n" + "Diversity Project (HGDP). To use the combined dataset, type '1000G+HGDP'. " + "The default dataset is 1000 Genomes.\n" + "\t--thread, number of threads.\n" + "\t--debug, debug mode.\n" ) sys.exit(1) @@ -1246,13 +1252,23 @@ def complete_test_crisprme(): except IndexError: sys.stderr.write("Please input some parameter for flag --vcf_dataset\n") sys.exit(1) + threads = 4 + if "--thread" in input_args: # number of threads to use during test + try: + threads = input_args[input_args.index("--thread") + 1] + if threads.startswith("--"): + sys.stderr.write("Please input some parameter for flag --thread\n") + sys.exit(1) + except IndexError: + sys.stderr.write("Please input some value for flag --thread\n") + sys.exit(1) debug = "--debug" in input_args # run local or via conda/Docker # begin crisprme test script_test = os.path.join( - script_path.replace("PostProcess", "src"), "crisprme_test.py" + script_path.replace("PostProcess", "src"), "complete_test.py" ) # the script is located within src -- TODO: start migration to src code = subprocess.call( - f"python {script_test} {chrom} {vcf_dataset} {debug}", shell=True + f"python {script_test} {chrom} {vcf_dataset} {threads} {debug}", shell=True ) if code != 0: raise OSError( @@ -1262,18 +1278,36 @@ # HELP FUNCTION def callHelp(): - print( - "help:\n", - "\nALL FASTA FILEs USED BY THE SOFTWARE MUST BE UNZIPPED AND CHROMOSOME SEPARATED, ALL VCFs USED BY THE SOFTWARE MUST BE ZIPPED AND CHROMOSOME SEPARATED\n", - "\ncrisprme.py complete-search FUNCTION SEARCHING THE WHOLE GENOME (REFERENCE AND VARIANT IF REQUESTED) AND PERFORM CFD ANALYSIS AND TARGET SELECTION", - "\ncrisprme.py complete-test FUNCTION TO TEST THE COMPLETE PIPELINE OF CRISPRme with a small input", - "\ncrisprme.py targets-integration FUNCTION THAT INTEGRATES IN-SILICO TARGETS WITH EMPIRICAL DATA GENERATING A USABLE PANEL", - "\ncrisprme.py gnomAD-converter FUNCTION THAT CONVERTS ALL gnomADv3.1 vcf.bgz FILES INTO COMPATIBLE VCFs", - "\ncrisprme.py generate-personal-card FUNCTION TO GENERATE PERSONAL CARD FOR A SPECIFIC SAMPLE EXTRACTING ALL THE PRIVATE TARGETS", - "\ncrisprme.py web-interface FUNCTION TO ACTIVATE WEB INTERFACE OF CRISPRme TO USE WITH A BROWSER LOCALLY", - "\ncrisprme.py --version PRINT CRISPRME VERSION TO STDOUT AND EXIT", - "\n\nADD help TO ANY FUNCTION TO VISUALIZE A BRIEF HELP PAGE (example: crisprme.py
complete-search --help)\n", + # print general help, listing all available functions with a brief + # description of their purpose + sys.stderr.write( + "Help:\n\n" + "- ALL FASTA FILEs USED BY THE SOFTWARE MUST BE UNZIPPED AND SEPARATED BY CHROMOSOME\n" + "- ALL VCFs USED BY THE SOFTWARE MUST BE ZIPPED (WITH BGZIP) AND SEPARATED BY CHROMOSOME\n\n" + "Functionalities:\n\n" + "crisprme.py complete-search\n" + "\tPerforms genome-wide off-targets search (reference and variant, if " + "specified), including CFD and CRISTA analysis, and target selection\n\n" + "crisprme.py complete-test\n" + "\tTests the complete CRISPRme pipeline on single chromosomes or complete " + "genomes\n\n" + "crisprme.py targets-integration\n" + "\tIntegrates in-silico targets with empirical data to generate a usable " + "panel\n\n" + "crisprme.py gnomAD-converter\n" + "\tConverts gnomAD VCF files into CRISPRme-compatible VCFs (supports " + "gnomAD >= v3.1)\n\n" + "crisprme.py generate-personal-card\n" + "\tGenerates a personal card for specific samples by extracting all " + "private targets\n\n" + "crisprme.py web-interface\n" + "\tActivates CRISPRme's web interface for local browser use\n\n" + "crisprme.py --version\n" + "\tPrints CRISPRme version to stdout and exits\n\n" + "For additional information on each CRISPRme functionality, type " + "--help (e.g. 'crisprme.py complete-search --help')\n" ) + sys.exit(1) # stop execution if len(sys.argv) < 2: diff --git a/crisprme_auto_test_conda.sh b/crisprme_auto_test_conda.sh deleted file mode 100755 index 41dc82e..0000000 --- a/crisprme_auto_test_conda.sh +++ /dev/null @@ -1,132 +0,0 @@ -# ------------------------------------------------------------------------------ -# TEST CRISPRme conda package - -echo "Download and extract fundamental data..." - -# download hg38 genome assembly -GENOMEDIR="Genomes" -HG38="hg38" -echo "Downloading hg38 genome assembly..." -mkdir -p $GENOMEDIR # create Genomes folder -cd $GENOMEDIR -# download chromosomes FASTA files -original_md5sum="a5aa5da14ccf3d259c4308f7b2c18cb0" # see https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt -while true; do # retry download if caught timeout - wget -T 15 -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz && break -done -chromsfasta="hg38.chromFa.tar.gz" -local_md5sum="$(md5sum $chromsfasta | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz" - exit 1 -fi -echo "Extracting ${chromsfasta}..." -tar -xzf $chromsfasta -mv chroms $HG38 # create hg38 directory -cd .. - -# download 1000G VCFs -echo "Downloading 1000 Genomes VCFs..."
-VCF="VCFs" -mkdir -p $VCF # create VCF folder -cd $VCF -VCF1000G="hg38_1000G" -mkdir -p $VCF1000G # create 1000 genomes folder -cd $VCF1000G -# download 1000 genomes VCF files -for i in $(seq 1 22; echo "X"); -do - original_md5sum="$(curl -sL ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | md5sum | cut -d ' ' -f 1)" # compute original md5sum - while true; do # retry download if caught timeout - wget -T 15 -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz && break - done - sleep 2 # allow disk sync for md5sum check - local_md5sum="$(md5sum ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | cut -d ' ' -f 1)" - if [ "$original_md5sum" != "$local_md5sum" ]; then # check download consistency - echo "ERROR: unexpected failure while downloading ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" - exit 1 - fi -done -cd ../.. - -# initialize VCF config file -VCFCONFIG="vcf_config.1000G.txt" -printf "${VCF1000G}\n" > $VCFCONFIG - -# download 1000G samplesIDs -SAMPLESIDS="samplesIDs" -mkdir -p $SAMPLESIDS # create sample ids dir -cd $SAMPLESIDS -# download 1000G samples IDs -echo "Downloading samples ids for 1000G dataset" -SAMPLES1000G="hg38_1000G.samplesID.txt" -original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G} | md5sum | cut -d ' ' -f 1)" # compute original md5sum -while true; do # retry download if caught timeout - wget -T 15 -c https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G} && break -done -sleep 2 # allow disk sync for md5sum check -local_md5sum="$(md5sum $SAMPLES1000G | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading ${SAMPLES1000G}" - exit 1 -fi -cd .. - -# initialize samples config file -SAMPLESCONFIG="samplesIDs.1000G.txt" -printf "${SAMPLES1000G}\n" > $SAMPLESCONFIG - -# download annotation data -ANNOTATIONDIR="Annotations" -mkdir -p $ANNOTATIONDIR # create annotation folder -cd $ANNOTATIONDIR -echo "Downloading ENCODE+GENCODE annotation data..." -original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)" -DHSENCODE="dhs+encode+gencode.hg38.bed.zip" -while true; do # retry download if caught timeout - wget -T 15 -c -O $DHSENCODE https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz && break -done -sleep 2 # allow disk sync for md5sum check -local_md5sum="$(md5sum $DHSENCODE | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading ${DHSENCODE}" - exit 1 -fi -echo "Extracting ${DHSENCODE}..." -tar -xvf $DHSENCODE -DHSENCODE="dhs+encode+gencode.hg38.bed" - -echo "Downloading GENCODE encoding sequences..." 
-original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz | md5sum | cut -d ' ' -f 1)" -GENCODE="gencode.protein_coding.bed.zip" -while true; do # retry download if caught timeout - wget -T 15 -c -O $GENCODE https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz && break -done -sleep 2 # allow disk sync for md5sum check -local_md5sum="$(md5sum $GENCODE | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading ${GENCODE}" - exit 1 -fi -echo "Extracting ${GENCODE}..." -tar -xvf $GENCODE -GENCODE="gencode.protein_coding.bed" -cd .. - -# create Dictionaries folder -mkdir -p "Dictionaries" - -# create sg1617 guide file -GUIDEFILE="sg1617.txt" -printf "CTAACAGTTGCTTTTATCACNNN\n" > $GUIDEFILE - -# create NGG PAM file -PAM="PAMs" -mkdir -p $PAM -cd $PAM -NGGPAM="20bp-NGG-spCas9.txt" -printf "NNNNNNNNNNNNNNNNNNNNNGG 3\n" > $NGGPAM -cd .. - -echo "Start CRISPRme test..." -crisprme.py complete-search --genome Genomes/$HG38 --vcf $VCFCONFIG --guide $GUIDEFILE --pam PAMs/$NGGPAM --annotation Annotations/$DHSENCODE --samplesID $SAMPLESCONFIG --gene_annotation Annotations/$GENCODE --sorting-criteria-scoring mm+bulges --sorting-criteria mm,bulges --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4 diff --git a/crisprme_auto_test_docker.sh b/crisprme_auto_test_docker.sh deleted file mode 100755 index de015bc..0000000 --- a/crisprme_auto_test_docker.sh +++ /dev/null @@ -1,132 +0,0 @@ -# ------------------------------------------------------------------------------ -# TEST CRISPRme Docker - -echo "Download and extract fundamental data..." - -# download hg38 genome assembly -GENOMEDIR="Genomes" -HG38="hg38" -echo "Downloading hg38 genome assembly..." -mkdir -p $GENOMEDIR # create Genomes folder -cd $GENOMEDIR -# download chromosomes FASTA files -original_md5sum="a5aa5da14ccf3d259c4308f7b2c18cb0" # see https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt -while true; do # retry download if caught timeout - wget -T 15 -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz && break -done -chromsfasta="hg38.chromFa.tar.gz" -local_md5sum="$(md5sum $chromsfasta | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz" - exit 1 -fi -echo "Extracting ${chromsfasta}..." -tar -xzf $chromsfasta -mv chroms $HG38 # create hg38 directory -cd .. - -# download 1000G VCFs -echo "Downloading 1000 Genomes VCFs..." 
-VCF="VCFs" -mkdir -p $VCF # create VCF folder -cd $VCF -VCF1000G="hg38_1000G" -mkdir -p $VCF1000G # create 1000 genomes folder -cd $VCF1000G -# download 1000 genomes VCF files -for i in $(seq 1 22; echo "X"); -do - original_md5sum="$(curl -sL ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | md5sum | cut -d ' ' -f 1)" # compute original md5sum - while true; do # retry download if caught timeout - wget -T 15 -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz && break - done - sleep 2 # allow disk sync for md5sum check - local_md5sum="$(md5sum ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | cut -d ' ' -f 1)" - if [ "$original_md5sum" != "$local_md5sum" ]; then # check download consistency - echo "ERROR: unexpected failure while downloading ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" - exit 1 - fi -done -cd ../.. - -# initialize VCF config file -VCFCONFIG="vcf_config.1000G.txt" -printf "${VCF1000G}\n" > $VCFCONFIG - -# download 1000G samplesIDs -SAMPLESIDS="samplesIDs" -mkdir -p $SAMPLESIDS # create sample ids dir -cd $SAMPLESIDS -# download 1000G samples IDs -echo "Downloading samples ids for 1000G dataset" -SAMPLES1000G="hg38_1000G.samplesID.txt" -original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G} | md5sum | cut -d ' ' -f 1)" # compute original md5sum -while true; do # retry download if caught timeout - wget -T 15 -c https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G} && break -done -sleep 2 # allow disk sync for md5sum check -local_md5sum="$(md5sum $SAMPLES1000G | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading ${SAMPLES1000G}" - exit 1 -fi -cd .. - -# initialize samples config file -SAMPLESCONFIG="samplesIDs.1000G.txt" -printf "${SAMPLES1000G}\n" > $SAMPLESCONFIG - -# download annotation data -ANNOTATIONDIR="Annotations" -mkdir -p $ANNOTATIONDIR # create annotation folder -cd $ANNOTATIONDIR -echo "Downloading ENCODE+GENCODE annotation data..." -original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)" -DHSENCODE="dhs+encode+gencode.hg38.bed.zip" -while true; do # retry download if caught timeout - wget -T 15 -c -O $DHSENCODE https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz && break -done -sleep 2 # allow disk sync for md5sum check -local_md5sum="$(md5sum $DHSENCODE | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading ${DHSENCODE}" - exit 1 -fi -echo "Extracting ${DHSENCODE}..." -tar -xvf $DHSENCODE -DHSENCODE="dhs+encode+gencode.hg38.bed" - -echo "Downloading GENCODE encoding sequences..." 
-original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz | md5sum | cut -d ' ' -f 1)" -GENCODE="gencode.protein_coding.bed.zip" -while true; do # retry download if caught timeout - wget -T 15 -c -O $GENCODE https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz && break -done -sleep 2 # allow disk sync for md5sum check -local_md5sum="$(md5sum $GENCODE | cut -d ' ' -f 1)" -if [ "$original_md5sum" != "$local_md5sum" ]; then - echo "ERROR: unexpected failure while downloading ${GENCODE}" - exit 1 -fi -echo "Extracting ${GENCODE}..." -tar -xvf $GENCODE -GENCODE="gencode.protein_coding.bed" -cd .. - -# create Dictionaries folder -mkdir -p "Dictionaries" - -# create sg1617 guide file -GUIDEFILE="sg1617.txt" -printf "CTAACAGTTGCTTTTATCACNNN\n" > $GUIDEFILE - -# create NGG PAM file -PAM="PAMs" -mkdir -p $PAM -cd $PAM -NGGPAM="20bp-NGG-spCas9.txt" -printf "NNNNNNNNNNNNNNNNNNNNNGG 3\n" > $NGGPAM -cd .. - -echo "Start CRISPRme test..." -docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/$HG38 --vcf $VCFCONFIG --guide $GUIDEFILE --pam PAMs/$NGGPAM --annotation Annotations/$DHSENCODE --samplesID $SAMPLESCONFIG --gene_annotation Annotations/$GENCODE --sorting-criteria-scoring mm+bulges --sorting-criteria mm,bulges --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4 diff --git a/crisprme_auto_test_download_essentials.sh b/crisprme_auto_test_download_essentials.sh deleted file mode 100755 index b639710..0000000 --- a/crisprme_auto_test_download_essentials.sh +++ /dev/null @@ -1,54 +0,0 @@ -#crisprme download and test -echo "starting download and unzip of data" - -echo "unzip gencode+encode annotations" -#unzip annotations -cd Annotations/ -tar -xzf encode+gencode.hg38.tar.gz -rm encode+gencode.hg38.tar.gz -tar -xzf gencode.protein_coding.tar.gz -rm gencode.protein_coding.tar.gz -cd ../ - -#unzip gencode -# echo "unzip gencode protein-coding proximity file" -# cd Gencode/ -# tar -xzf gencode.protein_coding.tar.gz -# rm gencode.protein_coding.tar.gz -# cd ../ - -# echo "start download VCF data and genome (this may take a long time due to connection speed)" -# #download VCFs data -# cd VCFs/ -# #download 1000G -# cd hg38_1000G/ -# echo "download 1000G VCFs" -# for i in {1..22}; do -# wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr$i.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz -# done -# wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz - -#download HGDP -#uncomment these lines if you want to download also HGDP VCFs -# cd ../hg38_HGDP -# echo "download HGDP VCFs" -# for i in {1..22} -# do -# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chr$i.vcf.gz -# done -# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrX.vcf.gz -# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrY.vcf.gz -# cd ../../ - -#download hg38 -cd Genomes/ -echo "download hg38" -wget -c -q https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz -tar -xzf hg38.chromFa.tar.gz -mv chroms hg38 -cd ../ 
- -echo "download complete" - -# echo "start testing" -# crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt/ --guide sg1617.txt --pam PAMs/20bp-NGG-spCas9.txt --annotation Annotations/gencode_encode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Gencode/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4 diff --git a/crisprme_auto_test_no_download.sh b/crisprme_auto_test_no_download.sh deleted file mode 100755 index 1d2fc52..0000000 --- a/crisprme_auto_test_no_download.sh +++ /dev/null @@ -1,16 +0,0 @@ -echo "unzip gencode+encode annotations" -#unzip annotations -cd Annotations/ -tar -xzf gencode_encode.hg38.tar.gz &> error.txt -rm -f gencode_encode.hg38.tar.gz error.txt -cd ../ - -#unzip gencode -#echo "unzip gencode protein-coding proximity file" -#cd Gencode/ -#tar -xzf gencode.protein_coding.tar.gz &> error.txt -#rm -f gencode.protein_coding.tar.gz error.txt -#cd ../ - -echo "start testing" -crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt/ --guide sg1617.txt --pam PAMs/20bp-NGG-SpCas9.txt --annotation Annotations/encode+gencode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Annotations/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 20 diff --git a/crisprme_test_complete_package.sh b/crisprme_test_complete_package.sh deleted file mode 100755 index b33804a..0000000 --- a/crisprme_test_complete_package.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -#md5 value for both pacakages -md5_complete_package=b9111f5e1fc9de77ae6f098e12a6ab4e -md5_VCF_package=7feb40213b329c506d6a5e16b7dc10cf - -echo "download the complete crisprme package and then unzip it" -wget -c -q -O complete_test_package_no_VCFs.tar.gz https://www.dropbox.com/s/pzfeb1k9v9ekyhr/complete_test_package_no_VCFs.tar.gz?dl=1 -wget -c -q -O VCFs.tar.gz https://www.dropbox.com/s/v1fxyhopjek6ib1/VCFs.tar.gz?dl=1 -# axel -a -q -n 1 https://www.dropbox.com/s/pzfeb1k9v9ekyhr/complete_test_package_no_VCFs.tar.gz?dl=1 -# axel -a -q -n 1 https://www.dropbox.com/s/v1fxyhopjek6ib1/VCFs.tar.gz?dl=1 - -#check value for complete package -test_md5=$(md5sum complete_test_package_no_VCFs.tar.gz | awk '{print $1}') -if [ "$md5_complete_package" != "$test_md5" ]; then - echo "error during file download, please rerun this test" - exit 1 -fi - -#check value for VCFs package -test_md5=$(md5sum VCFs.tar.gz | awk '{print $1}') -if [ "$md5_VCF_package" != "$test_md5" ]; then - echo "error during file download, please rerun this test" - exit 1 -fi - -echo "unziping all packages" -mkdir crisprme_complete_test -tar -xzf complete_test_package_no_VCFs.tar.gz --directory crisprme_complete_test/ -tar -xzf VCFs.tar.gz --directory crisprme_complete_test/ -cd crisprme_complete_test/ - -echo "generate input test data" -echo "CTAACAGTTGCTTTTATCACNNN" >sg1617.txt -echo "hg38_1000G/" >list_vcf.txt -echo "hg38_1000G.samplesID.txt" >list_samplesID.txt -echo "crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt/ --guide sg1617.txt --pam PAMs/20bp-NNN-SpCas9.txt --annotation Annotations/encode+gencode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Annotations/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 8" >crisprme_complete_auto_test.sh - -echo "to test the software, launch crisprme_complete_auto_test.sh" diff --git a/docs/readme/directory_tree.png b/docs/readme/directory_tree.png deleted file mode 100755 index 
363a698..0000000 Binary files a/docs/readme/directory_tree.png and /dev/null differ diff --git a/docs/readme/hello-world-docker.png b/docs/readme/hello-world-docker.png deleted file mode 100644 index 6ebfcf3..0000000 Binary files a/docs/readme/hello-world-docker.png and /dev/null differ diff --git a/download_data/encode+gencode.hg38.tar.gz b/download_data/encode+gencode.hg38.tar.gz deleted file mode 100644 index 724c78e..0000000 Binary files a/download_data/encode+gencode.hg38.tar.gz and /dev/null differ diff --git a/index.py b/index.py index ed470e7..e79b583 100755 --- a/index.py +++ b/index.py @@ -34,7 +34,6 @@ current_working_directory, cache, ) -from utils import check_directories from dash.dependencies import Input, Output, State from typing import Tuple @@ -50,6 +49,15 @@ HOST = "0.0.0.0" # server host PORTWEB = 80 # website port PORTLOCAL = 8080 # local server port +CRISPRME_DIRS = [ + "Genomes", + "Results", + "Dictionaries", + "VCFs", + "Annotations", + "PAMs", + "samplesIDs", +] # crisprme directory tree # initialize the webpage server = app.server # start server @@ -65,6 +73,29 @@ ) +def check_directories(basedir: str) -> None: + """Check and create necessary directories in the specified base directory. + + This function verifies if the provided base directory exists and creates + any missing subdirectories defined in the CRISPRME_DIRS list. + + Args: + basedir (str): The base directory to check and create subdirectories in. + + Raises: + TypeError: If basedir is not a string. + FileNotFoundError: If the base directory does not exist. + """ + + if not isinstance(basedir, str): + raise TypeError(f"Expected {str.__name__}, got {type(basedir).__name__}") + if not os.path.exists(basedir): + raise FileNotFoundError(f"Unable to locate {basedir}") + for d in CRISPRME_DIRS: + if not os.path.exists(os.path.join(basedir, d)): + os.makedirs(os.path.join(basedir, d)) + + # switch between the website pages @app.callback( [Output("page-content", "children"), Output("job-link", "children")], diff --git a/pages/main_page.py b/pages/main_page.py index e10119d..cc28d46 100755 --- a/pages/main_page.py +++ b/pages/main_page.py @@ -58,16 +58,8 @@ import os -# Allowed mismatches and bulges -# ONLINE = False # NOTE change to True for online version, False for offline -if ONLINE: - MAX_MMS = 7 # max allowed mismatches - MAX_BULGES = 3 # max allowed bulges -else: - # NOTE modify value for increasing/decreasing max mms or bulges available on - # Dropdown selection - MAX_MMS = 7 # max allowed mismatches - MAX_BULGES = 3 # max allowed bulges +MAX_BULGES = 3 # max allowed bulges +MAX_MMS = 7 # max allowed mismatches # mismatches, bulges and guides values AV_MISMATCHES = [{"label": i, "value": i} for i in range(MAX_MMS)] AV_BULGES = [{"label": i, "value": i} for i in range(MAX_BULGES)] @@ -394,9 +386,7 @@ def change_url( gencode_name = "gencode.protein_coding.bed" annotation_name = ".dummy.bed" # to proceed without annotation file if "EN" in annotation_var: - # annotation_name = "encode+gencode.hg38.bed" - ##use annotation wiht dhs - annotation_name = "dhs+encode+gencode.hg38.bed" + annotation_name = "dhs+encode+gencode.hg38.bed" # use dhs annotation file if "MA" in annotation_var: annotation_name = "".join( [ @@ -961,6 +951,9 @@ def change_url( "log_error.txt" ) ) + # set sorting criteria for score and fewest + sorting_criteria_scoring = "mm+bulges" + sorting_criteria = "mm+bulges,mm" # TODO: use functions rather than calling scripts run_job_sh = os.path.join( app_directory, POSTPROCESS_DIR, 
"submit_job_automated_new_multiple_vcfs.sh" @@ -978,7 +971,7 @@ def change_url( log_error = os.path.join(result_dir, "log_error.txt") assert isinstance(dna, int) assert isinstance(rna, int) - cmd = f"{run_job_sh} {genome} {vcfs} {guides_file} {pam_file} {annotation} {samples_ids} {max(dna, rna)} {mms} {dna} {rna} {merge_default} {result_dir} {postprocess} {4} {current_working_directory} {gencode} {dest_email} {be_start} {be_stop} {be_nt} 1> {log_verbose} 2>{log_error}" + cmd = f"{run_job_sh} {genome} {vcfs} {guides_file} {pam_file} {annotation} {samples_ids} {max(dna, rna)} {mms} {dna} {rna} {merge_default} {result_dir} {postprocess} {4} {current_working_directory} {gencode} {dest_email} {be_start} {be_stop} {be_nt} {sorting_criteria_scoring} {sorting_criteria} 1> {log_verbose} 2>{log_error}" # run job pool_executor.submit(subprocess.run, cmd, shell=True) return ("/load", f"?job={job_id}") @@ -1540,7 +1533,7 @@ def change_variants_checklist_state(genome_value: str) -> List: "disabled": False, }, {"label": " plus HGDP variants", "value": "HGDP", "disabled": False}, - {"label": " plus personal variants*", "value": "PV", "disabled": True}, + {"label": " plus personal variants*", "value": "PV", "disabled": ONLINE}, ] personal_vcf = get_custom_VCF(genome_value) return [checklist_variants_options, personal_vcf] @@ -1859,7 +1852,7 @@ def index_page() -> html.Div: { "label": " Personal annotations*", "value": "MA", - "disabled": True, + "disabled": ONLINE, }, ], id="checklist-annotations", diff --git a/reset_dirs.sh b/reset_dirs.sh deleted file mode 100755 index 9bd965b..0000000 --- a/reset_dirs.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - - -read -p "Do you want to remove all directories with download and user generated data? THE REMOVE IS IRREVERSIBILE (Yes/no)" answer - -echo - -if [[ $answer == "Yes" ]]; then - - echo "REMOVING ALL DIRECTORIES AND USER GENERATED DATA" - rm -rf Genomes/ - rm -rf Results/ - rm -rf Dictionaries/ - rm -rf VCFs/ - rm -rf Annotations/ - rm -rf PAMs/ - rm -rf samplesIDs/ - rm -rf genome_library/ - -fi diff --git a/src/crisprme_test.py b/src/complete_test.py similarity index 54% rename from src/crisprme_test.py rename to src/complete_test.py index 53ad777..592fd2e 100644 --- a/src/crisprme_test.py +++ b/src/complete_test.py @@ -1,4 +1,35 @@ """ +This module provides functionality to execute the CRISPRme test workflow, including +downloading genomic and variant data, preparing input files, and running the CRISPRme +command-line tool. It includes functions for managing directories, downloading data +from various sources, and configuring test parameters. + +Key functions include: +- `ensure_hg38_directory`: Ensures the existence of the 'hg38' directory within the + specified destination. +- `download_genome_data`: Downloads genome data for a specified chromosome to the + destination directory. +- `ensure_vcf_dataset_directory`: Ensures the existence of a directory for a specific + VCF dataset. +- `download_vcf_data`: Downloads VCF data for a specific chromosome and variant dataset. +- `ensure_samplesids_directory`: Ensures the existence of the 'samplesIDs' directory. +- `download_samples_ids_data`: Downloads samples IDs data for a specific variant dataset. +- `ensure_annotation_directory`: Ensures the existence of the 'annotation' directory. +- `download_annotation_data`: Downloads gencode and encode annotation data to the + 'annotation' directory. +- `write_ngg_pamfile`: Writes a test PAM file containing the NGG sequence. 
+- `write_sg1617_guidefile`: Writes a test guide file containing the sg1617 guide + sequence. +- `write_vcf_config`: Writes a test VCF list file for a specific variant dataset. +- `write_samplesids_config`: Writes a test samples ID list file for a specific + variant dataset. +- `run_crisprme_test`: Executes the CRISPRme test workflow for a specified chromosome + and dataset. +- `main`: The entry point of the module that orchestrates the test execution. + +This module is designed to facilitate the testing and validation of the CRISPRme +tool, ensuring that all necessary data and configurations are correctly handled +before running the analysis. """ from utils import ( @@ -7,8 +38,14 @@ gunzip, untar, rename, + compute_md5, CHROMS, CRISPRME_DIRS, + MD5GENOME, + MD51000G, + MD5HGDP, + MD5SAMPLES, + MD5ANNOTATION, ) from typing import Tuple @@ -18,19 +55,15 @@ import os HG38URL = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38" -VCF1000GPURL = ( - "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/" - "release/20190312_biallelic_SNV_and_INDEL/" +VCF1000GSERVER = "ftp.1000genomes.ebi.ac.uk" +VCF1000GURL = ( + "/vol1/ftp/data_collections/1000_genomes_project/release/" + "20190312_biallelic_SNV_and_INDEL/" "ALL.{}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" ) -VCFHGDPURL = ( - "ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/" - "hgdp_wgs.20190516.full.{}.vcf.gz" -) -SAMPLESIDURL = ( - "https://raw.githubusercontent.com/pinellolab/CRISPRme/test-function/download_data" -) -ANNOTATIONURL = "https://github.com/pinellolab/CRISPRme/raw/test-function/download_data" +VCFHGDPSERVER = "ngs.sanger.ac.uk" +VCFHGDPURL = "/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.{}.vcf.gz" +TESTDATAURL = "https://raw.githubusercontent.com/pinellolab/CRISPRme/v216/test/data/" def ensure_hg38_directory(dest: str) -> str: @@ -73,14 +106,18 @@ def download_genome_data(chrom: str, dest: str) -> None: raise ValueError(f"Forbidden input chromosome ({chrom})") if not os.path.isdir(dest): # check dest directory existence raise FileExistsError(f"Unable to locate {dest}") + sys.stderr.write(f"Downloading fasta file for chromosome(s) {chrom}\n") if chrom == "all": - chromstar = download(f"{HG38URL}/bigZips/hg38.chromFa.tar.gz", dest) + chromstar = download(dest, http_url=f"{HG38URL}/bigZips/hg38.chromFa.tar.gz") + # check genome md5 + if MD5GENOME[os.path.basename(chromstar)] != compute_md5(chromstar): + raise ValueError(f"Download for {os.path.basename(chromstar)} failed") chromsdir = untar(chromstar, dest, "chroms") # decompress archive # rename chroms dir to hg38 chromsdir = rename(chromsdir, os.path.join(os.path.dirname(chromsdir), "hg38")) assert os.path.isdir(chromsdir) else: - chromgz = download(f"{HG38URL}/chromosomes/{chrom}.fa.gz", dest) + chromgz = download(dest, http_url=f"{HG38URL}/chromosomes/{chrom}.fa.gz") dest = ensure_hg38_directory(dest) # create hg38 directory chromfa = gunzip( chromgz, @@ -118,7 +155,7 @@ def download_vcf_data(chrom: str, dest: str, dataset: str) -> None: chrom (str): The chromosome identifier in UCSC format. dest (str): The destination directory to save the downloaded VCF data. dataset (str): The name or identifier of the variant dataset (e.g., "1000G", - "HGDP"). + "HGDP", or 1000G+HGDP). 
Returns: None @@ -134,14 +171,25 @@ if not os.path.isdir(dest): # check dest directory existence raise FileExistsError(f"Unable to locate {dest}") # support for 1000 GP and HGDP datasets - if dataset not in ["1000G", "HGDP"]: + if dataset not in ["1000G", "HGDP", "1000G+HGDP"]: raise ValueError(f"Unknown variant dataset ({dataset})") # create VCF dataset directory within VCFs folder - vcf_dataset_dir = ensure_vcf_dataset_directory(dest, dataset) - vcf_url = VCF1000GPURL if dataset == "1000G" else VCFHGDPURL - chroms = CHROMS if chrom == "all" else [chrom] - for c in chroms: - download(vcf_url.format(c), vcf_dataset_dir) + sys.stderr.write(f"Downloading VCF data for chromosome(s) {chrom}\n") + for ds in dataset.split("+"): + vcf_dataset_dir = ensure_vcf_dataset_directory(dest, ds) + ftp_server = VCF1000GSERVER if ds == "1000G" else VCFHGDPSERVER + vcf_url = VCF1000GURL if ds == "1000G" else VCFHGDPURL + chroms = CHROMS if chrom == "all" else [chrom] + for c in chroms: # request FTP connection + vcf = download( + vcf_dataset_dir, + ftp_conn=True, + ftp_server=ftp_server, + ftp_path=vcf_url.format(c), + ) + md5data = MD51000G if ds == "1000G" else MD5HGDP + if md5data[os.path.basename(vcf)] != compute_md5(vcf): + raise ValueError(f"Download for {os.path.basename(vcf)} failed") def ensure_samplesids_directory(dest: str) -> str: @@ -178,15 +226,21 @@ ValueError: If the variant dataset is unknown. """ - if dataset not in ["1000G", "HGDP"]: + if dataset not in ["1000G", "HGDP", "1000G+HGDP"]: raise ValueError(f"Unknown variant dataset ({dataset})") # samples ids folder must be located within current directory # -- see check_crisprme_directory_tree() for details + sys.stderr.write(f"Downloading sample ids for dataset(s) {dataset}\n") samplesids_dir = ensure_samplesids_directory(os.getcwd()) - samplesid_fname = ( - "hg38_1000G.samplesID.txt" if dataset == "1000G" else "hg38_HGDP.samplesID.txt" - ) - download(f"{SAMPLESIDURL}/{samplesid_fname}", samplesids_dir) + for ds in dataset.split("+"): + samplesid_fname = ( + "samplesIDs.1000G.txt" if ds == "1000G" else "samplesIDs.HGDP.txt" + ) + samplesids = download( + samplesids_dir, http_url=f"{TESTDATAURL}/samplesIDs/{samplesid_fname}" + ) + if MD5SAMPLES[os.path.basename(samplesids)] != compute_md5(samplesids): + raise ValueError(f"Download for {os.path.basename(samplesids)} failed") def ensure_annotation_directory(dest: str) -> str: """ @@ -218,20 +272,50 @@ files.
""" + sys.stderr.write("Downloading ENCODE and GENCODE annotation data\n") annotation_dir = ensure_annotation_directory(os.getcwd()) - # download gencode annotation - gencodetar = download( - f"{ANNOTATIONURL}/gencode.protein_coding.tar.gz", annotation_dir + gencode = _retrieve_ann_data( + annotation_dir, + "Annotations/gencode.protein_coding.bed.tar.gz", + "gencode.protein_coding.bed", ) - gencode = os.path.join( - untar(gencodetar, annotation_dir), "gencode.protein_coding.bed" + encode = _retrieve_ann_data( + annotation_dir, + "Annotations/dhs+encode+gencode.hg38.bed.tar.gz", + "dhs+encode+gencode.hg38.bed", ) - # download encode annotation - encodetar = download(f"{ANNOTATIONURL}/encode+gencode.hg38.tar.gz", annotation_dir) - encode = os.path.join(untar(encodetar, annotation_dir), "encode+gencode.hg38.bed") return gencode, encode +def _retrieve_ann_data(annotation_dir: str, url: str, fname: str) -> str: + """Retrieve and validate annotation data from a specified URL. + + This function downloads annotation data from a given URL, verifies the + integrity of the downloaded file using its MD5 checksum, and extracts the + relevant file to the specified annotation directory. It raises an error if + the checksum does not match. + + Args: + annotation_dir (str): The directory where the annotation data will be + stored. + url (str): The URL from which to download the annotation data. + fname (str): The name of the file to retrieve after extraction. + + Returns: + str: The path to the extracted annotation file. + + Raises: + ValueError: If the MD5 checksum of the downloaded file does not match + the expected value. + """ + + # download gencode annotation + annfile_tar = download(annotation_dir, http_url=os.path.join(TESTDATAURL, url)) + if MD5ANNOTATION[os.path.basename(annfile_tar)] != compute_md5(annfile_tar): + raise ValueError(f"Download for {os.path.basename(annfile_tar)} failed") + return os.path.join(untar(annfile_tar, annotation_dir), fname) + + def ensure_pams_directory(dest: str) -> str: """ Ensure the existence of the 'PAMs' directory within the specified destination @@ -260,6 +344,7 @@ def write_ngg_pamfile() -> str: str: The path to the created test PAM file. """ + sys.stderr.write("Creating PAM file\n") pams_dir = ensure_pams_directory( os.getcwd() ) # PAMs directory must be in current working dir @@ -280,6 +365,7 @@ def write_sg1617_guidefile() -> str: str: The path to the created test guide file. """ + sys.stderr.write("Creating guide file\n") guidefile = "sg1617_test_guide.txt" try: with open(guidefile, mode="w") as outfile: @@ -289,7 +375,7 @@ def write_sg1617_guidefile() -> str: return guidefile -def write_vcf_list(dataset: str) -> str: +def write_vcf_config(dataset: str) -> str: """ Write a test VCF list file for a specific variant dataset. 
@@ -302,20 +388,21 @@
     """

     # support for 1000 GP and HGDP datasets
-    if dataset not in ["1000G", "HGDP"]:
+    if dataset not in ["1000G", "HGDP", "1000G+HGDP"]:
         raise ValueError(f"Unknown variant dataset ({dataset})")
-    # select the test vcf list
-    vcflist = f"hg38_{dataset}"
-    vcflistfile = "vcf_list_test.txt"
+    # config vcf list file
+    sys.stderr.write(f"Creating VCF config file for dataset(s) {dataset}\n")
+    vcf_config = "vcf.config.test.txt"
     try:
-        with open(vcflistfile, mode="w") as outfile:
-            outfile.write(f"{vcflist}\n")
+        with open(vcf_config, mode="w") as outfile:
+            for ds in dataset.split("+"):
+                outfile.write(f"hg38_{ds}\n")
     except IOError as e:
         raise IOError("An error occurred while writing the test VCF list") from e
-    return vcflistfile
+    return vcf_config


-def write_samplesids_list(dataset: str) -> str:
+def write_samplesids_config(dataset: str) -> str:
     """
     Write a test samples ID list file for a specific variant dataset.

@@ -328,62 +415,65 @@
     """

     # support for 1000 GP and HGDP datasets
-    if dataset not in ["1000G", "HGDP"]:
+    if dataset not in ["1000G", "HGDP", "1000G+HGDP"]:
         raise ValueError(f"Unknown variant dataset ({dataset})")
-    # select the test vcf list
-    samplesidslist = (
-        "hg38_1000G.samplesID.txt" if dataset == "1000G" else "hg38_HGDP.samplesID.txt"
-    )
-    samplesidslistfile = "samplesID_list_test.txt"
+    # configure samples IDs list
+    sys.stderr.write(f"Creating samples config file for dataset(s) {dataset}\n")
+    samples_config = "samplesIDs.config.test.txt"
     try:
-        with open(samplesidslistfile, mode="w") as outfile:
-            outfile.write(f"{samplesidslist}\n")
+        with open(samples_config, mode="w") as outfile:
+            for ds in dataset.split("+"):
+                samplesidslist = (
+                    "samplesIDs.1000G.txt" if ds == "1000G" else "samplesIDs.HGDP.txt"
+                )
+                outfile.write(f"{samplesidslist}\n")
     except IOError as e:
-        raise IOError("An error occurred while writing the test VCF list") from e
-    return samplesidslistfile
+        raise IOError("An error occurred while writing the test samples IDs list") from e
+    return samples_config


-def run_crisprme_test(chrom: str, dataset: str, debug: bool) -> None:
-    """
-    Run CRISPRme test on specified chromosome and dataset.
+def run_crisprme_test(chrom: str, dataset: str, threads: int, debug: bool) -> None:
+    """Execute the CRISPRme test workflow for a specified chromosome and dataset.

-    Args:
-        chrom (str): The chromosome to run the test on.
-        dataset (str): The dataset to use for the test.
-        debug (bool): Flag to enable debug mode.
+    This function orchestrates the downloading of necessary genomic and VCF data,
+    prepares input files, and runs the CRISPRme command-line tool to perform a
+    complete search.

-    Returns:
-        None
+    Args:
+        chrom (str): The chromosome to be analyzed.
+        dataset (str): The dataset identifier for VCF data.
+        threads (int): The number of threads to use for processing.
+        debug (bool): A flag indicating whether to run in debug mode.

     Raises:
-        None
+        Any exceptions raised by the called functions or subprocess.
""" check_crisprme_directory_tree(os.getcwd()) # check crisprme directory tree download_genome_data(chrom, CRISPRME_DIRS[0]) # download genome data download_vcf_data(chrom, CRISPRME_DIRS[3], dataset) # download vcf data + vcf = write_vcf_config(dataset) # write test vcf list download_samples_ids_data(dataset) # download vcf dataset samples ids - gencode, encode = ( - download_annotation_data() - ) # download gencode and encode annotation data + samplesids = write_samplesids_config(dataset) # write test samples ids list + # download gencode and encode annotation data + gencode, encode = download_annotation_data() pam = write_ngg_pamfile() # write test NGG PAM file guide = write_sg1617_guidefile() # write test sg1617 guide - vcf = write_vcf_list(dataset) # write test vcf list - samplesids = write_samplesids_list(dataset) # write test samples ids list debug_arg = "--debug" if debug else "" crisprme_cmd = ( f"crisprme.py complete-search --genome {CRISPRME_DIRS[0]}/hg38 " f"--thread 4 --bmax 1 --mm 4 --bDNA 1 --bRNA 1 --merge 3 --pam {pam} " f"--guide {guide} --vcf {vcf} --samplesID {samplesids} --annotation {encode} " - f"--gene_annotation {gencode} --output crisprme-test-out {debug_arg}" + f"--gene_annotation {gencode} --output crisprme-test-out --thread {threads} " + f"{debug_arg}" ) subprocess.call(crisprme_cmd, shell=True) # run crisprme test def main(): - chrom, dataset, debug = sys.argv[1:] # read commandline args + chrom, dataset, threads, debug = sys.argv[1:] # read commandline args debug = debug == "True" - run_crisprme_test(chrom, dataset, debug) # run crisprme test + run_crisprme_test(chrom, dataset, int(threads), debug) # run crisprme test if __name__ == "__main__": diff --git a/src/convert_gnomAD_vcfs.py b/src/convert_gnomAD_vcfs.py index efec095..1bd7b6e 100644 --- a/src/convert_gnomAD_vcfs.py +++ b/src/convert_gnomAD_vcfs.py @@ -1,7 +1,30 @@ """ +Module for converting gnomAD VCF files. + +This module provides functionality to parse command line arguments, read sample IDs, +and convert gnomAD VCF files by updating their headers, filtering variants, and merging +alleles. It utilizes multiprocessing to handle multiple VCF files efficiently and +ensures robust error handling throughout the conversion process. + +Key functions include: +- `parse_commandline`: Validates and parses command line arguments. +- `read_samples_ids`: Reads sample IDs from a specified file. +- `tabix_index`: Creates an index for a VCF file using tabix. +- `load_vcf`: Loads a VCF file and indexes it if necessary. +- `update_header`: Updates the VCF header with sample information. +- `variant_observed`: Checks if any allele count indicates a variant is observed. +- `format_variant_record`: Formats a variant record into a string. +- `convert_vcf`: Converts a VCF file by updating its header and filtering variants. +- `bcftools_merge`: Merges alleles in a VCF file using bcftools. +- `run_conversion_pipeline`: Runs the conversion pipeline for a single VCF file. +- `convert_gnomad_vcfs`: Main entry point for converting gnomAD VCF files based + on user parameters. + +This module is designed to facilitate the processing of genomic data for research +and analysis purposes. """ -from utils import remove # type: ignore +from utils import remove from functools import partial from typing import List, Tuple @@ -21,6 +44,47 @@ BCFTOOLSNORM = "bcftools norm" +def parse_commandline(args: List[str]) -> Tuple[str, str, bool, bool, bool, int]: + """Parse and validate command line arguments for gnomAD VCF conversion. 
+ + This function checks the provided arguments for correctness and extracts the + necessary parameters for the gnomAD VCF conversion process. It ensures that + the directory, joint processing flag, and thread count are valid before + returning the parsed values. + + Args: + args (List[str]): A list of command line arguments. + + Returns: + Tuple[str, str, bool, bool, bool, int]: A tuple containing the gnomAD + VCF directory, sample IDs, a flag indicating whether to use joint + VCF processing, a flag indicating whether to keep files, a flag for + multiallelic processing, and the number of threads to use. + + Raises: + ValueError: If the number of arguments is incorrect, if the specified + directory is not valid, or if the number of threads is out of + allowed range. + """ + + if len(args) != 6: + raise ValueError( + "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion" + ) + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = args + if not os.path.isdir(gnomad_vcfs_dir): + raise ValueError( + f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})" + ) + threads = int(threads) + if threads > multiprocessing.cpu_count() or threads < 0: + raise ValueError(f"Forbidden number of threads selected ({threads})") + joint = joint == "True" + keep = keep == "True" + multiallelic = multiallelic == "True" + return gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads + + def read_samples_ids(samples_ids: str): """ Reads sample IDs from a file and returns a list of sample IDs excluding comments. @@ -96,7 +160,7 @@ def load_vcf(vcf_fname: str) -> pysam.VariantFile: return pysam.VariantFile(vcf_fname, index_filename=tbi_index) -def update_header(header: pysam.VariantHeader, samples: List[str]) -> str: +def update_header(header: pysam.VariantHeader, samples: List[str], joint: bool) -> str: """ Updates the header of a VCF file with the specified samples and additional metadata fields. @@ -111,7 +175,8 @@ def update_header(header: pysam.VariantHeader, samples: List[str]) -> str: header.add_line(GTLINE) # add FORMAT metadata field header.add_samples(samples) # add samples to header - return str(header) + header = str(header).replace(" bool: @@ -161,7 +226,7 @@ def format_variant_record(variant: pysam.VariantRecord, genotypes: str) -> str: return "\t".join([f"{e}" if e is not None else "." for e in variant_format]) -def convert_vcf(vcf_fname: str, samples: List[str], keep: bool): +def convert_vcf(vcf_fname: str, samples: List[str], joint: bool, keep: bool): """ Converts a VCF file by updating the header, filtering variants, and creating a new compressed VCF file. 
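With `parse_commandline` factored out, the converter now takes six positional arguments instead of the previous five; the new third argument is the `joint` flag, and the three boolean flags are parsed from the literal strings `True`/`False`. A hypothetical invocation (paths and file names are illustrative; `threads` must not exceed the machine's CPU count):

    # args: gnomad_vcfs_dir samples_ids joint keep multiallelic threads
    python src/convert_gnomAD_vcfs.py VCFs/gnomad_v41 samplesIDs.gnomad.v41.txt True False True 8

When `joint` is `True`, per-population allele counts are read from the `AC_joint_<sample>` INFO keys rather than `AC_<sample>` (e.g., `AC_joint_afr` instead of `AC_afr`), which appears to match the joint allele-frequency field naming used by recent gnomAD releases.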
@@ -186,12 +251,12 @@ def convert_vcf(vcf_fname: str, samples: List[str], keep: bool): except OSError as e: raise OSError(f"An error occurred while loading {vcf_fname}") from e samples_ac = [ - f"AC_{sample}" for sample in samples + f"AC_joint_{sample}" if joint else f"AC_{sample}" for sample in samples ] # recover allele count field for each input sample try: with gzip.open(vcf_outfname, mode="wt") as outfile: # write the upated header to the converted vcf - outfile.write(update_header(vcf.header.copy(), samples)) + outfile.write(update_header(vcf.header.copy(), samples, joint)) for variant in vcf: if not keep and "PASS" not in variant.filter.keys(): continue @@ -249,7 +314,7 @@ def bcftools_merge(vcf_fname: str, multiallelic: bool) -> str: def run_conversion_pipeline( - vcf_fname: str, samples: List[str], keep: bool, multiallelic: bool + vcf_fname: str, samples: List[str], joint: bool, keep: bool, multiallelic: bool ) -> None: """ Runs a conversion pipeline to process a VCF file by adding genotypes and merging @@ -267,16 +332,21 @@ def run_conversion_pipeline( None """ - vcf_genotypes = convert_vcf(vcf_fname, samples, keep) # add genotypes to input VCF + vcf_genotypes = convert_vcf( + vcf_fname, samples, joint, keep + ) # add genotypes to input VCF # merge variants into mutlialleic/biallelic sites vcf_merged = bcftools_merge(vcf_genotypes, multiallelic) assert os.path.isfile(vcf_merged) and os.stat(vcf_merged).st_size > 0 -def convert_gnomad_vcfs(): +def convert_gnomad_vcfs() -> None: """ Converts gnomAD VCF files based on specified parameters and sample data. + Returns: + None + Raises: ValueError: If the number of input arguments is incorrect, the gnomAD VCF directory is invalid, or an invalid number of threads is selected. @@ -284,21 +354,10 @@ def convert_gnomad_vcfs(): OSError: If an error occurs during the gnomAD VCF conversion process. """ - argv = sys.argv[1:] - if len(argv) != 5: - raise ValueError( - "Wrong number of input arguments, cannot proceed with gnomAD VCF conversion" - ) - gnomad_vcfs_dir, samples_ids, keep, multiallelic, threads = argv - if not os.path.isdir(gnomad_vcfs_dir): - raise ValueError( - f"The specified gnomAD VCF directory is not a directory ({gnomad_vcfs_dir})" - ) - threads = int(threads) - if threads > multiprocessing.cpu_count() or threads < 0: - raise ValueError(f"Forbidden number of threads selected ({threads})") - keep = keep == "True" - multiallelic = multiallelic == "True" + # read input arguments + gnomad_vcfs_dir, samples_ids, joint, keep, multiallelic, threads = ( + parse_commandline(sys.argv[1:]) + ) start = time.time() # recover gnomAD file within the specified location (compressed with bgz extension) gnomad_vcfs = glob(os.path.join(gnomad_vcfs_dir, "*.vcf*bgz")) @@ -311,6 +370,7 @@ def convert_gnomad_vcfs(): partial_run_conversion_pipeline = partial( run_conversion_pipeline, samples=samples, + joint=joint, keep=keep, multiallelic=multiallelic, ) diff --git a/src/utils.py b/src/utils.py index 18ff8c2..00a44af 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,9 +1,37 @@ """ +This module provides utility functions for file handling, downloading data from +FTP and HTTP sources, and managing directory structures for the CRISPRme project. +It includes functions for checking directory structures, downloading files, +decompressing archives, and computing file hashes. + +Key functions include: +- `check_crisprme_directory_tree`: Checks and creates the necessary CRISPRme + directory structure. 
+- `ftp_download`: Downloads a file from an FTP server to a specified local destination. +- `http_download`: Downloads a file from an HTTP or HTTPS URL to a specified + local destination. +- `download`: Downloads a file from either an FTP server or an HTTP/HTTPS URL + based on provided parameters. +- `remove`: Removes a file or directory specified by the given path. +- `rename`: Renames a file or directory from the original path to the new + specified name. +- `untar`: Decompresses and extracts the contents of a tar.gz file to a specified + destination. +- `gunzip`: Decompresses a gzip file to the specified output file. +- `compute_md5`: Computes the MD5 hash of a file for integrity verification. + +This module is designed to facilitate data management and processing for genomic +analysis workflows, ensuring that necessary files and directories are correctly +handled and maintained. """ -from typing import Optional +from io import TextIOWrapper +from typing import Optional, Union +from ftplib import FTP import subprocess +import requests +import hashlib import tarfile import gzip import sys @@ -18,7 +46,68 @@ "PAMs", "samplesIDs", ] -CHROMS = [f"chr{i}" for i in list(range(1, 23)) + ["X"]] +CHROMS = [f"chr{i}" for i in list(range(1, 23)) + ["X"]] # canonical chroms +# md5 hashes stored in dictionary, md5s are used to check test files consistency +MD5GENOME = {"hg38.chromFa.tar.gz": "a5aa5da14ccf3d259c4308f7b2c18cb0"} +MD51000G = { + "ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "77f154e53c2b7c36b04d03bab3af8b74", + "ALL.chr2.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "f9d29c4935e591b2b269eed7cd7e35d8", + "ALL.chr3.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "6e59d00235de71562b4199e09b7e5934", + "ALL.chr4.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "70a2c1ede97eceb7baeea06c8e46cf3c", + "ALL.chr5.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "74d5486c0fd29b0e6add24d3740fc3b4", + "ALL.chr6.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "8c5d83c1a9253058120368af39baf0c8", + "ALL.chr7.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "dfaa282712fc1292146173dd2ffeb1d9", + "ALL.chr8.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "ddf7b370fcee63462037c237f12b4444", + "ALL.chr9.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "5ade69521dc50d88ad7c91bf4ec6fcd8", + "ALL.chr10.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "1c409a674426eda2fd29b49078137c5d", + "ALL.chr11.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "65339bffc61bc97f2130832fe9f84d7c", + "ALL.chr12.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "9a1bda389121140d30c768ef6a1b1370", + "ALL.chr13.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "47b0463541be137a8bbfe40f6aade864", + "ALL.chr14.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "241aedf0792c45d5345d421105c782af", + "ALL.chr15.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "b48e7c64e35b727d34786faa76467f94", + "ALL.chr16.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "1ce7d66799cab6718852d78dd2aab765", + "ALL.chr17.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "ecc22783fd1ee7a1c66b053491873192", + "ALL.chr18.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "fdf3e460e91cd955a9e8cebf01b5d815", + 
"ALL.chr19.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "a2f17e4ec552fc07cbd05c1eac0cf7ec", + "ALL.chr20.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "155c3b440d7990630132e4756f7fcc85", + "ALL.chr21.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "52882490028507e5d4e606b0905072b1", + "ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "57a1722e6ed7d9df08cb3c0e42b62d53", + "ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz": "e6a3d41811faee60de177061edcd6fe6", +} +MD5HGDP = { + "hgdp_wgs.20190516.full.chr1.vcf.gz": "70d82ae3ae65cb73858738f547f64e93", + "hgdp_wgs.20190516.full.chr2.vcf.gz": "539d4eb31355b90f0262453fa1349ae6", + "hgdp_wgs.20190516.full.chr3.vcf.gz": "0d37ba60afd5ff092cf1bc75bde3588e", + "hgdp_wgs.20190516.full.chr4.vcf.gz": "68d57e5c2129bbafa1a9dd75f630cf89", + "hgdp_wgs.20190516.full.chr5.vcf.gz": "929eb66a26e9679320bcc26df0bd4116", + "hgdp_wgs.20190516.full.chr6.vcf.gz": "28c9ad734025e7292bde533da908cf68", + "hgdp_wgs.20190516.full.chr7.vcf.gz": "ed7eaf339cd7964b9f1e7581de5bdeb1", + "hgdp_wgs.20190516.full.chr8.vcf.gz": "2d47d60ff6b63e1163d219d964999ee3", + "hgdp_wgs.20190516.full.chr9.vcf.gz": "52f917fc3068eff76f0ba8bde0c59292", + "hgdp_wgs.20190516.full.chr10.vcf.gz": "7131981641e886173da90d215346e857", + "hgdp_wgs.20190516.full.chr11.vcf.gz": "2f127b0006cbc36fb32c66860d4b31d9", + "hgdp_wgs.20190516.full.chr12.vcf.gz": "023d0e2d852c167490d4578f814d043d", + "hgdp_wgs.20190516.full.chr13.vcf.gz": "afcfba8b01258e418f5fb230b14daa02", + "hgdp_wgs.20190516.full.chr14.vcf.gz": "90b0c15b61fd9c47a9751495f2b784ce", + "hgdp_wgs.20190516.full.chr15.vcf.gz": "665e844d7e2e85e226d25827ea8014be", + "hgdp_wgs.20190516.full.chr16.vcf.gz": "0d6f1b6141c78489a2b2e27eeec848dd", + "hgdp_wgs.20190516.full.chr17.vcf.gz": "d53421438b3bc3c5ce5ab51b90578182", + "hgdp_wgs.20190516.full.chr18.vcf.gz": "6351d9b20995cf500ac4b11490ff31c7", + "hgdp_wgs.20190516.full.chr19.vcf.gz": "167ce7a43876b32e586978a75f3b0d39", + "hgdp_wgs.20190516.full.chr20.vcf.gz": "d90130b11620378bed7c2cc43be94b7e", + "hgdp_wgs.20190516.full.chr21.vcf.gz": "8f44e4daa3952cd73751141f66b6e5ae", + "hgdp_wgs.20190516.full.chr22.vcf.gz": "84f4a1d86f54bdc0cd9b19502ff8d2c2", + "hgdp_wgs.20190516.full.chrX.vcf.gz": "8d0e4e178fdfa07db76d0218a9b2ceab", + "hgdp_wgs.20190516.full.chrY.vcf.gz": "54b3aba28600c8d0d8a695c8dcfdc4cd", +} +MD5ANNOTATION = { + "dhs+encode+gencode.hg38.bed.tar.gz": "4f5eb631af903d4091bb2f57558c7b46", + "gencode.protein_coding.bed.tar.gz": "04297ade436db70784733a5b13d42723", +} +MD5SAMPLES = { + "samplesIDs.1000G.txt": "720af666c9a938de74a2808033aa4509", + "samplesIDs.HGDP.txt": "f92e14e5317221486f20597560ca3a31", +} def check_crisprme_directory_tree(basedir: str) -> None: @@ -51,35 +140,152 @@ def check_crisprme_directory_tree(basedir: str) -> None: os.makedirs(os.path.join(basedir, d)) -def download(url: str, dest: str) -> str: - """ - Download a file from the specified URL to the destination directory. +def ftp_download( + ftp_server: Union[str, None], + ftp_path: Union[str, None], + dest: str, + fname: Optional[str] = None, +) -> str: + """Download a file from an FTP server to a specified destination. + + This function connects to the given FTP server and retrieves a file from the + specified path, saving it to the local destination. It handles errors related + to FTP connection and file writing. Args: - url (str): The URL of the file to download. 
- dest (str): The destination directory to save the downloaded file. + ftp_server (Union[str, None]): The address of the FTP server. + ftp_path (Union[str, None]): The path of the file on the FTP server. + dest (str): The local destination directory where the file will be saved. + fname (Optional[str]): The name to save the file as locally. If not provided, + the base name of the FTP path will be used. Returns: - str: The file path of the downloaded file. + str: The path to the downloaded file. Raises: - TypeError: If url is not a string. - subprocess.SubprocessError: If the download process fails. + ValueError: If the FTP server or path is not provided. + TypeError: If the FTP server or path is not a string. + OSError: If an error occurs while saving the file. + FileNotFoundError: If the downloaded file is not created. + """ - Examples: - download('http://example.com/file.txt', '/path/to/destination/directory') + if ftp_server is None or ftp_path is None: + raise ValueError( + "FTP server and path must be provided if FTP connection is requested" + ) + if not isinstance(ftp_server, str): + raise TypeError(f"Expected {str.__name__}, got {type(ftp_server).__name__}") + if not isinstance(ftp_path, str): + raise TypeError(f"Expected {str.__name__}, got {type(ftp_path).__name__}") + fname = os.path.join(dest, fname or os.path.basename(ftp_path)) + try: + with FTP(ftp_server) as ftp: # initialize ftp server + ftp.login() # open connection to server + with open(fname, mode="wb") as outfile: # write binary data + ftp.retrbinary(f"RETR {ftp_path}", outfile.write) + except IOError as e: + raise OSError(f"An error occurred while saving {fname}") from e + if not os.path.isfile(fname): + raise FileNotFoundError(f"{fname} not created") + return fname + + +def http_download( + http_url: Union[str, None], dest: str, fname: Optional[str] = None +) -> str: + """Download a file from an HTTP or HTTPS URL to a specified destination. + + This function retrieves a file from the provided HTTP URL and saves it to the + specified local destination. It ensures that the URL is valid and handles + errors related to the HTTP request and file writing. + + Args: + http_url (Union[str, None]): The URL of the file to download. + dest (str): The local destination directory where the file will be saved. + fname (Optional[str]): The name to save the file as locally. If not provided, + the base name of the URL will be used. + + Returns: + str: The path to the downloaded file. + + Raises: + ValueError: If the HTTP URL is not provided or is invalid. + TypeError: If the HTTP URL is not a string. + FileNotFoundError: If the downloaded file is not created. """ - if not isinstance(url, str): - raise TypeError(f"Expected {str.__name__}, got {type(url).__name__}") - fname = os.path.join(dest, os.path.basename(url)) - code = subprocess.call(f"wget {url} -O {fname}", shell=True) # download using wget - if code != 0: - raise subprocess.SubprocessError(f"Download from {url} failed") - assert os.path.isfile(fname) + if http_url is None: + raise ValueError("HTTP URL must be provided if HTTP connection is requested") + if not isinstance(http_url, str): + raise TypeError(f"Expected {str.__name__}, got {type(http_url).__name__}") + if not (http_url.startswith("http://") or http_url.startswith("https://")): + raise ValueError( + "Invalid HTTP URL. It must start with 'http://' or 'https://'." 
+        )
+    fname = os.path.join(dest, fname or os.path.basename(http_url))
+    response = requests.get(http_url, stream=True)  # download data from http
+    response.raise_for_status()  # ensure the request was successful
+    with open(fname, mode="wb") as outfile:
+        # write downloaded data in fixed size chunks
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                outfile.write(chunk)
+    if not os.path.isfile(fname):
+        raise FileNotFoundError(f"{fname} not created")
     return fname


+def download(
+    dest: str,
+    fname: Optional[str] = None,
+    ftp_conn: Optional[bool] = False,
+    ftp_server: Optional[str] = None,
+    ftp_path: Optional[str] = None,
+    http_url: Optional[str] = None,
+) -> str:
+    """Download a file from either an FTP server or an HTTP/HTTPS URL.
+
+    This function determines the appropriate method to download a file based on the
+    provided parameters, either using FTP or HTTP. It validates the input parameters
+    and ensures that the destination directory exists before proceeding with the
+    download.
+
+    Args:
+        dest (str): The local destination directory where the file will be saved.
+        fname (Optional[str]): The name to save the file as locally. If not provided,
+            the base name of the source will be used.
+        ftp_conn (Optional[bool]): Flag indicating whether to use FTP for the download.
+        ftp_server (Optional[str]): The FTP server address if using FTP.
+        ftp_path (Optional[str]): The path of the file on the FTP server.
+        http_url (Optional[str]): The URL of the file to download via HTTP/HTTPS.
+
+    Returns:
+        str: The path to the downloaded file.
+
+    Raises:
+        TypeError: If dest or fname is not a string.
+        ValueError: If the destination directory does not exist, or if both FTP and
+            HTTP connections are requested simultaneously.
+    """
+
+    if not isinstance(dest, str):
+        raise TypeError(f"Expected {str.__name__}, got {type(dest).__name__}")
+    if not os.path.isdir(dest):
+        raise ValueError(
+            f"Destination directory {dest} does not exist or is not a directory"
+        )
+    if fname is not None and not isinstance(fname, str):
+        raise TypeError(f"Expected {str.__name__}, got {type(fname).__name__}")
+    if ftp_conn and http_url:
+        raise ValueError(
+            "FTP and HTTP connections cannot be requested at the same time"
+        )
+    if ftp_conn:  # ftp connection requested
+        return ftp_download(ftp_server, ftp_path, dest, fname)
+    else:  # http/https connection requested
+        return http_download(http_url, dest, fname)
+
+
 def remove(fname: str) -> None:
     """
     Remove a file or directory specified by the given path.
@@ -141,7 +347,7 @@ def untar(fname_tar_gz: str, dest: str, outdir: Optional[str] = "") -> str:

     try:
         with gzip.open(fname_tar_gz, mode="rb") as fin:
-            with tarfile.open(fileobj=fin, mode="r") as tar:
+            with tarfile.open(fileobj=fin, mode="r") as tar:  # type: ignore
                 tar.extractall(dest)
     except IOError as e:
         raise IOError(f"An error occurred while decompressing {fname_tar_gz}") from e
@@ -175,3 +381,38 @@
     assert os.stat(fname_out).st_size > 0
     remove(fname_gz)  # delete compressed archive
     return fname_out
+
+
+def compute_md5(fname: str) -> str:
+    """Compute the MD5 hash of a file.
+
+    This function reads a file in binary mode and calculates its MD5 hash, returning
+    the hash as a hexadecimal string. It handles file access errors gracefully.
+
+    Args:
+        fname (str): The path to the file for which to compute the MD5 hash.
+
+    Returns:
+        str: The computed MD5 hash of the file as a hexadecimal string.
+ + Raises: + FileNotFoundError: If the specified file cannot be found. + PermissionError: If there is a permission issue accessing the file. + Exception: For any unexpected errors encountered during the process. + """ + + hashmd5 = hashlib.md5() # initialize md5 + try: # open file and compute its md5 + with open(fname, mode="rb") as f: + for chunk in iter(lambda: f.read(4096), b""): # read 4096 bytes per chunk + hashmd5.update(chunk) + return hashmd5.hexdigest() # return md5 as string + except FileNotFoundError as e: + raise FileNotFoundError(f"Unable to locate {fname}") from e + except PermissionError as e: + raise PermissionError(f"Permission denied when accessing {fname}") from e + except Exception as e: + # sourcery skip: raise-specific-error + raise Exception( + f"Unexpected error encountered while computing md5 on {fname}" + ) from e diff --git a/test/data/Annotations/dhs+encode+gencode.hg38.bed.tar.gz b/test/data/Annotations/dhs+encode+gencode.hg38.bed.tar.gz new file mode 100644 index 0000000..c5fbd2b Binary files /dev/null and b/test/data/Annotations/dhs+encode+gencode.hg38.bed.tar.gz differ diff --git a/download_data/gencode.protein_coding.tar.gz b/test/data/Annotations/gencode.protein_coding.bed.tar.gz similarity index 81% rename from download_data/gencode.protein_coding.tar.gz rename to test/data/Annotations/gencode.protein_coding.bed.tar.gz index f5a5bb8..fed8239 100644 Binary files a/download_data/gencode.protein_coding.tar.gz and b/test/data/Annotations/gencode.protein_coding.bed.tar.gz differ diff --git a/test/data/Guides/sg1617_test_guide.txt b/test/data/Guides/sg1617_test_guide.txt new file mode 100644 index 0000000..204eb0a --- /dev/null +++ b/test/data/Guides/sg1617_test_guide.txt @@ -0,0 +1 @@ +CTAACAGTTGCTTTTATCACNNN diff --git a/test/data/PAMs/20bp-NGG-SpCas9.txt b/test/data/PAMs/20bp-NGG-SpCas9.txt new file mode 100644 index 0000000..303387f --- /dev/null +++ b/test/data/PAMs/20bp-NGG-SpCas9.txt @@ -0,0 +1 @@ +NNNNNNNNNNNNNNNNNNNNNGG 3 diff --git a/download_data/hg38_1000G.samplesID.txt b/test/data/samplesIDs/samplesIDs.1000G.txt similarity index 100% rename from download_data/hg38_1000G.samplesID.txt rename to test/data/samplesIDs/samplesIDs.1000G.txt diff --git a/download_data/hg38_HGDP.samplesID.txt b/test/data/samplesIDs/samplesIDs.HGDP.txt similarity index 100% rename from download_data/hg38_HGDP.samplesID.txt rename to test/data/samplesIDs/samplesIDs.HGDP.txt diff --git a/download_data/hg38_gnomAD.samplesID.txt b/test/data/samplesIDs/samplesIDs.gnomad.v40.txt similarity index 100% rename from download_data/hg38_gnomAD.samplesID.txt rename to test/data/samplesIDs/samplesIDs.gnomad.v40.txt diff --git a/test/data/samplesIDs/samplesIDs.gnomad.v41.txt b/test/data/samplesIDs/samplesIDs.gnomad.v41.txt new file mode 100644 index 0000000..c600199 --- /dev/null +++ b/test/data/samplesIDs/samplesIDs.gnomad.v41.txt @@ -0,0 +1,11 @@ +#SAMPLE_ID POPULATION_ID SUPERPOPULATION_ID SEX +afr AFR AFR n +ami AMI AMI n +amr AMR AMR n +asj ASJ ASJ n +eas EAS EAS n +fin FIN FIN n +nfe NFE NFE n +mid MID MID n +sas SAS SAS n +remaining OTH OTH n diff --git a/utils.py b/utils.py deleted file mode 100644 index 833a37a..0000000 --- a/utils.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Define static variables and utilities functions used throughout CRISPRme. 
-""" - -import sys -import os - -CRISPRME_DIRS = [ - "Genomes", - "Results", - "Dictionaries", - "VCFs", - "Annotations", - "PAMs", - "samplesIDs", -] - - -def check_directories(basedir: str) -> None: - """The function checks the consistency of CRISPRme's directory tree. - If a directory is not found in the tree, it will be created. - - ... - - Parameters - ---------- - basedir : str - Base directory - - Returns - ------- - None - """ - - if not isinstance(basedir, str): - raise TypeError(f"Expected {str.__name__}, got {type(basedir).__name__}") - if not os.path.exists(basedir): - raise FileNotFoundError(f"Unable to locate {basedir}") - for d in CRISPRME_DIRS: - if not os.path.exists(os.path.join(basedir, d)): - os.makedirs(os.path.join(basedir, d)) - - -def download_genome(chr: str, directory: str) -> None: - if not isinstance(chr, str): - raise TypeError(f"Expected {str.__name__}, got {type(chr).__name__}") - if not isinstance(directory, str): - raise TypeError(f"Expected {str.__name__}, got {type(directory).__name__}") - if chr == "all": - os.system( - "wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz -O hg38.chromFa.tar.gz" - ) - os.system("mv hg38.chromFa.tar.gz Genomes/") - os.system("tar -xzf Genomes/hg38.chromFa.tar.gz") - os.system(f"mv Genomes/chroms mv Genomes/{directory}") - else: - os.system( - f"wget https://hgdownload2.soe.ucsc.edu/goldenPath/hg38/chromosomes/{chr}.fa.gz -O {chr}.fa.gz" - ) - os.system(f"mv {chr}.fa.gz Genomes/") - os.system(f"gunzip Genomes/{chr}.fa.gz") - os.makedirs(f"Genomes/{directory}", exist_ok=True) - os.system(f"mv Genomes/{chr}.fa Genomes/{directory}/{chr}.fa") - - -def download_vcf(chr: str, origin: str) -> None: - if not isinstance(chr, str): - raise TypeError(f"Expected {str.__name__}, got {type(chr).__name__}") - if not isinstance(origin, str): - raise TypeError(f"Expected {str.__name__}, got {type(origin).__name__}") - if chr == "all" and origin == "1000G": - os.makedirs(f"VCFs/hg38_1000G", exist_ok=True) - for i in range(1, 23): - os.system( - f"wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz -O ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" - ) - os.system( - f"mv ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz VCFs/hg38_1000G/" - ) - elif chr != "all" and origin == "1000G": - os.makedirs(f"VCFs/hg38_1000G", exist_ok=True) - os.system( - f"wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz -O ALL.{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" - ) - os.system( - f"mv ALL.{chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz VCFs/hg38_1000G/" - ) - elif chr == "all" and origin == "HGDP": - os.makedirs(f"VCFs/hg38_HGDP", exist_ok=True) - for i in range(1, 23): - os.system( - f"wget ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chr{i}.vcf.gz -O hgdp_wgs.20190516.full.chr{i}.vcf.gz" - ) - os.system(f"mv hgdp_wgs.20190516.full.chr{i}.vcf.gz VCFs/hg38_HGDP/") - elif chr != "all" and origin == "HGDP": - os.makedirs(f"VCFs/hg38_HGDP", exist_ok=True) - os.system( - f"wget ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.{chr}.vcf.gz -O 
hgdp_wgs.20190516.full.{chr}.vcf.gz" - ) - os.system(f"mv hgdp_wgs.20190516.full.{chr}.vcf.gz VCFs/hg38_HGDP/") - - -def download_samplesID() -> None: - os.system( - "wget https://raw.githubusercontent.com/pinellolab/CRISPRme/test-function/download_data/hg38_1000G.samplesID.txt -O hg38_1000G.samplesID.txt" - ) - os.system( - "wget https://raw.githubusercontent.com/pinellolab/CRISPRme/test-function/download_data/hg38_HGDP.samplesID.txt -O hg38_HGDP.samplesID.txt" - ) - os.system( - "wget https://raw.githubusercontent.com/pinellolab/CRISPRme/test-function/download_data/hg38_gnomAD.samplesID.txt -O hg38_gnomAD.samplesID.txt" - ) - os.system("mv hg38_1000G.samplesID.txt samplesIDs/") - os.system("mv hg38_HGDP.samplesID.txt samplesIDs/") - os.system("mv hg38_gnomAD.samplesID.txt samplesIDs/") - - -def download_annotation() -> None: - os.system( - "wget https://github.com/pinellolab/CRISPRme/raw/test-function/download_data/gencode.protein_coding.tar.gz -O gencode.protein_coding.tar.gz" - ) - os.system("tar -xzf gencode.protein_coding.tar.gz") - os.system("rm gencode.protein_coding.tar.gz") - os.system("mv gencode.protein_coding.bed Annotations/") - os.system( - "wget https://github.com/pinellolab/CRISPRme/raw/test-function/download_data/encode+gencode.hg38.tar.gz -O encode+gencode.hg38.tar.gz" - ) - os.system("tar -xzf encode+gencode.hg38.tar.gz") - os.system("rm encode+gencode.hg38.tar.gz") - os.system("mv encode+gencode.hg38.bed Annotations/")
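Taken together, the refactored test entry point at the top of this diff is driven from the shell: `main` unpacks exactly four positional arguments (`chrom`, `dataset`, `threads`, `debug`). A hypothetical run against chromosome 22 with both variant datasets (the test module's file name is not shown in this diff; `crisprme_auto_test.py` is used purely as a placeholder):

    python crisprme_auto_test.py chr22 1000G+HGDP 8 False

For the combined `1000G+HGDP` dataset, the two config files written before the search list one entry per dataset (file names shown as comments for clarity, not as file content):

    # vcf.config.test.txt
    hg38_1000G
    hg38_HGDP

    # samplesIDs.config.test.txt
    samplesIDs.1000G.txt
    samplesIDs.HGDP.txt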