From 877950e7a70990448bb5eb0645436d246cb04db2 Mon Sep 17 00:00:00 2001
From: Chien-Chi Lo
Date: Wed, 20 May 2020 22:56:29 -0600
Subject: [PATCH] avoid self hit with buffer length

---
NOTE(review): line breaks and leading indentation in this patch were
reconstructed from a whitespace-collapsed copy. Context-line whitespace may
not match the target file exactly; if it does not apply cleanly, retry with
`git apply --ignore-whitespace`.

 src/get_repeat_coords.pl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/get_repeat_coords.pl b/src/get_repeat_coords.pl
index 7af0011..d32499a 100755
--- a/src/get_repeat_coords.pl
+++ b/src/get_repeat_coords.pl
@@ -6,6 +6,7 @@
 
 my $identity=95;
 my $len_cutoff=0;
+my $buffer=5;
 my $output="repeats_coords.txt";
 my $stats= "repeats_stats.txt";
 
@@ -16,6 +17,7 @@
     'i=i' => \$identity,
     'l=i' => \$len_cutoff,
     'o=s' => \$output,
+    'b=i' => \$buffer,
     's=s' => \$stats,
     'help|?' => sub{Usage()},
 );
@@ -26,6 +28,7 @@ sub Usage
     perl $0 [options]
     -i INT the identity cutoff 0 to 100 (default: 95)
     -l INT the repeat length cutoff (default:0)
+    -b INT the buffer base length to skip self-hits (default:5)
     -o STRING output filename (default: repeats_coords.txt)
     -s STRING output stats filename (default: repeats_stats.txt)
 
@@ -77,6 +80,12 @@ sub get_coords_file
         my $seq_id=$fields[7];
         my $start=$fields[0];
         my $end=$fields[1];
+        ## Skip example
+        #[S1] [E1] [S2] [E2] [LEN 1] [LEN 2] [% IDY] [TAGS]
+        #19871 29943 19871 29944 10073 10074 99.99 EPI_ISL_417419 EPI_ISL_417419
+        if ($start < ($fields[2]+$buffer) && $start > ($fields[2]-$buffer) && $end < ($fields[3]+$buffer) && $end > ($fields[3]-$buffer)){
+            next;
+        }
         for my $pos ($start..$end){
             $hash{$seq_id}->{$pos}=1;
         }