From 8b99b216a76d63719c36b6783c67ac8d31b194d3 Mon Sep 17 00:00:00 2001 From: jeanrjc Date: Thu, 30 Jul 2015 19:36:23 +0200 Subject: [PATCH] Fix important bug in expand function. I also move the function, so we cannot see the change, See below for bug. Add variable for files, and when re-running check if file exist and do not re-run hmmer or infernal if file already exists. Bug was in search_left: window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON window_end = (window_beg + 200) % SIZE_REPLICON where in the second line window_beg does not have the expected value anymore as we change it the line before. I change this to: window_end = (window_beg + 200) % SIZE_REPLICON window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON And any occurrences of similar problem. I change some file name --- integron_finder | 164 +++++++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 72 deletions(-) diff --git a/integron_finder b/integron_finder index 038a9bf..e77b54c 100755 --- a/integron_finder +++ b/integron_finder @@ -509,7 +509,7 @@ def search_attc(attc_df, keep_palindromes): return attc_array -def find_integron(replicon_name, attc_file, intI_file, phage_int_file): +def find_integron(replicon_name, attc_file, intI_file, phageI_file): """ Function that looks for integrons given rules : - presence of intI - presence of attC @@ -527,7 +527,7 @@ def find_integron(replicon_name, attc_file, intI_file, phage_int_file): intI = read_hmm(replicon_name, intI_file) intI.sort(["Accession_number", "pos_beg", "evalue"], inplace=True) - phageI = read_hmm(replicon_name, phage_int_file) + phageI = read_hmm(replicon_name, phageI_file) phageI.sort(["Accession_number", "pos_beg", "evalue"], inplace=True) tmp = intI[intI.ID_prot.isin(phageI.ID_prot)].copy() @@ -627,7 +627,7 @@ def find_integron(replicon_name, attc_file, intI_file, phage_int_file): intI_ac.pos_end.values[i], id_int, int(intI_ac.strand.values[i]), - intI_ac.evalue.values[i], intI_ac.query_name.values[i]) + intI_ac.evalue.values[i], intI_ac.query_replicon_name.values[i]) if n_attc_array > 0: # after the integrase loop (<=> no more integrases) for attc_array in attc_ac: @@ -671,65 +671,6 @@ def find_integron(replicon_name, attc_file, intI_file, phage_int_file): return integrons -def expand(window_beg, window_end, max_elt, df_max, search_left=False, search_right=False): - # for a given element, we can search on the left hand side (if integrase is on the right for instance) - # or right hand side (opposite situation) or both side (only integrase or only attC sites) - wb = window_beg - we = window_end - if search_right: - - if circular: - window_beg = (window_end - 200) % SIZE_REPLICON # 200 to allow the detection of sites that would overlap 2 consecutive windows - window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON - else: - window_beg = max(0, window_end - 200) # 200 to allow the detection of sites that would overlap 2 consecutive windows - window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD) - - searched_strand = "both" if search_left else "top" # search on both strands if search in both directions - - while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON: - - df_max = local_max(replicon_name, window_beg, window_end, searched_strand) - max_elt = pd.concat([max_elt, df_max]) - - if circular: - window_beg = (window_end - 200) % SIZE_REPLICON - window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON - else: - window_beg = max(0, window_end - 200) - window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD) - - # re-initialize in case we enter search left too. - df_max = max_elt.copy() - window_beg = wb - window_end = we - - if search_left: - if circular: - window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON - window_end = (window_beg + 200) % SIZE_REPLICON - else: - window_beg = max(0, window_beg - DISTANCE_THRESHOLD) - window_end = min(SIZE_REPLICON, window_beg + 200) - - searched_strand = "both" if search_right else "bottom" - - while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON: - - df_max = local_max(replicon_name, window_beg, window_end, searched_strand) - max_elt = pd.concat([max_elt, df_max]) # update of attC list of hits. - - if circular: - window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON - window_end = (window_beg + 200) % SIZE_REPLICON - else: - window_beg = max(0, window_beg - DISTANCE_THRESHOLD) - window_end = min(SIZE_REPLICON, window_beg + 200) - - max_elt.drop_duplicates(inplace=True) - max_elt.index = range(len(max_elt)) - return max_elt - def find_attc_max(integrons, outfile="attC_max_1.res"): """ Look for attC site with cmsearch --max option wich remove all heuristic filters. @@ -864,6 +805,69 @@ def find_attc_max(integrons, outfile="attC_max_1.res"): return max_final +def expand(window_beg, window_end, max_elt, df_max, search_left=False, search_right=False): + # for a given element, we can search on the left hand side (if integrase is on the right for instance) + # or right hand side (opposite situation) or both side (only integrase or only attC sites) + wb = window_beg + we = window_end + + if search_right: + + if circular: + window_beg = (window_end - 200) % SIZE_REPLICON # 200 to allow the detection of sites that would overlap 2 consecutive windows + window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON + else: + window_beg = max(0, window_end - 200) # 200 to allow the detection of sites that would overlap 2 consecutive windows + window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD) + + searched_strand = "both" if search_left else "top" # search on both strands if search in both directions + + while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON: + + df_max = local_max(replicon_name, window_beg, window_end, searched_strand) + max_elt = pd.concat([max_elt, df_max]) + + if circular: + window_beg = (window_end - 200) % SIZE_REPLICON + window_end = (window_end + DISTANCE_THRESHOLD) % SIZE_REPLICON + else: + window_beg = max(0, window_end - 200) + window_end = min(SIZE_REPLICON, window_end + DISTANCE_THRESHOLD) + + # re-initialize in case we enter search left too. + df_max = max_elt.copy() + window_beg = wb + window_end = we + + if search_left: + + if circular: + window_end = (window_beg + 200) % SIZE_REPLICON + window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON + + else: + window_beg = max(0, window_beg - DISTANCE_THRESHOLD) + window_end = min(SIZE_REPLICON, window_beg + 200) + + searched_strand = "both" if search_right else "bottom" + + while len(df_max) > 0 and 0 < (window_beg and window_end) < SIZE_REPLICON: + + df_max = local_max(replicon_name, window_beg, window_end, searched_strand) + max_elt = pd.concat([max_elt, df_max]) # update of attC list of hits. + + if circular: + window_end = (window_beg + 200) % SIZE_REPLICON + window_beg = (window_beg - DISTANCE_THRESHOLD) % SIZE_REPLICON + + else: + window_end = min(SIZE_REPLICON, window_beg + 200) + window_beg = max(0, window_beg - DISTANCE_THRESHOLD) + + max_elt.drop_duplicates(inplace=True) + max_elt.index = range(len(max_elt)) + return max_elt + def local_max(replicon_name, window_beg, window_end, strand_search="both"): @@ -1504,31 +1508,47 @@ Please install prodigal package or setup 'prodigal' binary path with --prodigal ############### Default search ############### + intI_file = os.path.join(out_dir, replicon_name + "_intI.res") + phageI_file = os.path.join(out_dir, replicon_name + "_phage_int.res") + attC_default_file = os.path.join(out_dir, replicon_name + "_attc_table.res") + if args.no_proteins == False: - find_integrase(replicon_path, replicon_name, out_dir) + if (os.path.isfile(intI_file) == 0 or + os.path.isfile(phageI_file) == 0): + + find_integrase(replicon_path, replicon_name, out_dir) - print "\n>>> Starting Default search ... :" - find_attc(replicon_path, replicon_name, out_dir) + print "\n>>> Starting Default search ... :" + if os.path.isfile(attC_default_file) == 0: + find_attc(replicon_path, replicon_name, out_dir) print ">>> Default search done... : \n" integrons = find_integron(replicon_name, - os.path.join(out_dir, replicon_name + "_attc_table.res"), - os.path.join(out_dir, replicon_name + "_intI.res"), - os.path.join(out_dir, replicon_name + "_phage_int.res")) + attC_default_file, + intI_file, + phageI_file) ############### Search with Eagle Eyes ############### if args.eagle_eyes: print "\n>>>>>> Starting search with Eagle's eyes...:" - integron_max = find_attc_max(integrons) + if os.path.isfile(os.path.join(out_dir, "integron_max.pickle")) == 0: + - print ">>>>>> Search with Eagle's eyes done... : \n" + integron_max = find_attc_max(integrons) + integron_max.to_pickle(os.path.join(out_dir, "integron_max.pickle")) + print ">>>>>> Search with Eagle's eyes done... : \n" + + else: + integron_max = pd.read_pickle(os.path.join(out_dir, + "integron_max.pickle")) + print ">>>>>> Search with Eagle's eyes was already done, continue... : \n" integrons = find_integron(replicon_name, integron_max, - os.path.join(out_dir, replicon_name + "_intI.res"), - os.path.join(out_dir, replicon_name + "_phage_int.res")) + intI_file, + phageI_file) ############### Add promoters and attI ###############