From 8a74fcf5452f17f789b8461d0a0716b0bb7737b1 Mon Sep 17 00:00:00 2001 From: codegodz Date: Sat, 15 Jul 2023 14:01:46 +0200 Subject: [PATCH 1/2] update -x --- graphStats.jl | 29 ++++++++++++++++++ graphite.jl | 83 ++++++++++++++++++++++++++++++++++++-------------- main.jl | 1 + suffixArray.jl | 32 ++++++++++++++----- test.jl | 36 ++++++++++++++++++++++ 5 files changed, 150 insertions(+), 31 deletions(-) create mode 100644 graphStats.jl create mode 100644 test.jl diff --git a/graphStats.jl b/graphStats.jl new file mode 100644 index 0000000..880e1c7 --- /dev/null +++ b/graphStats.jl @@ -0,0 +1,29 @@ + + +function parse_numbers(path::AbstractString) + numbers = Vector{Int32}() + for node in split(path, " ") + number = parse(Int32, view(node, 1:length(node)-1)) + # number = node[end] == "-" ? flipnode(number) : noflip(number) + push!(numbers, number) + end + return numbers +end + +function get_stats_gfa2(f::String) + h = open(f, "r") + unique_nodes = Set{Int}() + edges = 0 + for line in eachline(h) + identifier, path = split(line, "\t") + node_ids = parse_numbers(path) + for node in node_ids + push!(unique_nodes, node) + end + edges += length(node_ids)-1 + end + println("Nodes: ", length(unique_nodes)) + println("Edges: ", edges) +end + +get_stats_gfa2("/net/phage/linuxhome/mgx/people/rickb/Campy/complete_genomees/graph/complete_campy_graph_gfa.gfa2") \ No newline at end of file diff --git a/graphite.jl b/graphite.jl index d3b0ab3..db5e2e2 100644 --- a/graphite.jl +++ b/graphite.jl @@ -1,3 +1,6 @@ +using ProgressMeter +using Dates + const MASK = Int32(1<<30) flipnode(n::Int32) = n ⊻ MASK @@ -18,9 +21,23 @@ struct Color k_size::Int32 end -function update_color!(color::Color, ref_id::Int32, match_start::Int32, match_size::Int32, ca::Vector{Int32}) +# 175378 +# 175378 +function update_color!(color::Color, ref_id::Int32, match_start::Int32, match_size::Int32, ca::Vector{Int32}, ca_coords::UnitRange{Int32}) match_end = match_start+match_size-1 + println("Got match: ", match_start, " -> ", 
match_end) + # Check if this is masked as query + # println( ca_coords[1] != Int32(0)) + # println(match_start >= ca_coords[1]) + # println(match_end <= ca_coords[2], " ", match_end, " and ", ca_coords[2] ) + # println(ca_coords) + if ca_coords[1] != Int32(0) && match_start >= ca_coords.start && match_end <= ca_coords.stop + # println("Same as query skip") + return + end + + # println("Passed as non query") # Get info from pervious matches at_start = color.origin[match_start] @@ -91,7 +108,7 @@ function check_this_point(ca::Vector{Int32}, sa::Vector{Int32}, ref::AbstractVec return match_size end -function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, lcp::Vector{Int32}, point::Int32, forward::Bool, ref_start::Int32, match_size::Int32, ref_id::Int32, color::Color) +function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, lcp::Vector{Int32}, point::Int32, forward::Bool, ref_start::Int32, match_size::Int32, ref_id::Int32, color::Color, ca_coords::UnitRange{Int32}) move_dir = forward ? 1 : -1 lcp_dir = forward ? 
0 : 1 # println("Extending this point") @@ -102,12 +119,12 @@ function extend_from_point!(ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{In start_check_from = Int32(min(lcp[i + lcp_dir], match_size)) # Check the size of this match starting from +1 of the LCP value) match_size = check_this_point(ca, sa, ref, ref_start, Int32(i), start_check_from ) - update_color!(color, ref_id, sa[i], Int32(match_size), ca) + update_color!(color, ref_id, sa[i], Int32(match_size), ca, ca_coords) i += move_dir end end -function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}) +function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}, ca_coords::UnitRange{Int32}) # println(ref_id, " at: ", ref[307]) max_match_index = Int32(0) max_match_size = Int32(0) @@ -121,16 +138,16 @@ function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32} # If we have a match keep using the linked list to extend if max_match_size > 0 while ref_start <= length(ref) - # Check the match size at this point + # Check the match size at this point max_match_size = check_this_point(ca, sa, ref, ref_start, max_match_index, Int32(max_match_size-1)) # skip k-1 # If we don't have any match we don't have to check the flanks max_match_size == 0 && break - update_color!(color, ref_id, sa[max_match_index], Int32(max_match_size), ca) + update_color!(color, ref_id, sa[max_match_index], Int32(max_match_size), ca, ca_coords) # Check up and down in suffix array for other matches - extend_from_point!(ca, sa, ref, lcp, max_match_index, false, ref_start, Int32(max_match_size), ref_id, color) - extend_from_point!(ca, sa, ref, lcp, max_match_index, true, ref_start, Int32(max_match_size), ref_id, color) + extend_from_point!(ca, sa, ref, lcp, max_match_index, false, ref_start, Int32(max_match_size), ref_id, color, ca_coords) + 
extend_from_point!(ca, sa, ref, lcp, max_match_index, true, ref_start, Int32(max_match_size), ref_id, color, ca_coords) # # Move to next location in suffix array for the second around max_match_index = inv_perm_sa[sa[max_match_index]+1] @@ -143,40 +160,60 @@ function align(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32} end end -function align_forward_and_reverse(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}) +function align_forward_and_reverse(ref_id::Int32, color::Color, ca::Vector{Int32}, sa::Vector{Int32}, ref::Vector{Int32}, inv_perm_sa::Vector{Int32}, lcp::Vector{Int32}, ca_coords::UnitRange{Int32}) # First do the forward align - align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp) + align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp, ca_coords) # Flip the nodes and reverse to do the reverse alignment reverse_complement_ref!(ref) - align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp) + align(ref_id, color, ca, sa, ref, inv_perm_sa,lcp, ca_coords) end function run(gfa::String, seq_file::String, query_file::String, k_size::Int32, out_file::String; blacklist::String = "") - + start_time = now() blacklist_ids = !isempty(blacklist) ? 
read_ids_from_file(blacklist) : OrderedSet{String}() - queries, query_ids = processGFA(gfa, query_file) - - ca, sa = create_k_suffix_array(queries, Int32(0)) + println("Reading GFA...") + queries, query_ids, query_ids_numerical = processGFA(gfa, query_file) + + println("Building data structures (SA, LCP, and ISA") + ca, query_offsets, sa = create_k_suffix_array(queries, query_ids_numerical, Int32(0)) inv_sa_perm = inverse_perm_sa(sa) lcp = build_lcp(sa, ca) - + println("Reading node sizes...") size_map = read_node_sizes(seq_file) len = zeros(Int32, length(ca)) ori = [Origin(-1,-1) for i in 1:length(ca)] # Vector{Origin}(undef, length(ca)) color = Color(len, ori, size_map, k_size) + println("Aligning references to the suffix array") + + prog = ProgressUnknown("References read: ") for (ref_id, line) in enumerate(eachline(gfa)) - identifier, path = split(line, "\t") - if !(identifier in query_ids) && !(identifier in blacklist_ids) - path_numbers = parse_numbers(path) - align_forward_and_reverse(Int32(ref_id), color, ca, sa, path_numbers, inv_sa_perm, lcp) - end + identifier, path = split(line, "\t") + # Skip if blacklisted + identifier in blacklist_ids && continue + # Get the mask coordinates in the CA array if this is a query + ca_coords = identifier in query_ids ? 
query_offsets[ref_id] : Int32(0):Int32(0) + #if ca_coords[1] != Int32(0) + #println("Got coords: ", ca_coords, "id: ", identifier, " ref id: ", ref_id) + + path_numbers = parse_numbers(path) + align_forward_and_reverse(Int32(ref_id), color, ca, sa, path_numbers, inv_sa_perm, lcp, ca_coords) + println(ca_coords) + + + # end + + + + ProgressMeter.next!(prog) + end - + ProgressMeter.finish!(prog) + println("Writing output") writeResults(ca, color, query_ids, out_file, size_map) - + println("Elapsed: ", now() - start_time) end # run("test", "test", "test", Int32(31), "test") diff --git a/main.jl b/main.jl index acd37a7..fa2867d 100644 --- a/main.jl +++ b/main.jl @@ -1,5 +1,6 @@ using OrderedCollections using ArgParse +using IntervalTrees include("./graphite.jl") include("./graph_io.jl") diff --git a/suffixArray.jl b/suffixArray.jl index 4894f10..707ba9a 100644 --- a/suffixArray.jl +++ b/suffixArray.jl @@ -2,7 +2,9 @@ const LIBSAIS = "libsais-2.7.1/libsais.so.2" -function concat_with_seperator(vectors::Vector{Vector{Int32}}) + +function concat_with_seperator(vectors::Vector{Vector{Int32}},identifiers::Vector{Int32}) + length(vectors) == length(identifiers) || error("Vectors != identifiers") # Probably a cleaner way to do this :) Like using map # to flip the node ids in slices and copying the slices to the output total_size = sum(map(length, vectors)) @@ -10,16 +12,30 @@ function concat_with_seperator(vectors::Vector{Vector{Int32}}) vect_id = -1 * length(vectors) # to have it decending for sorting # Concat with seperator + store sign in most significant bit i = 1 + identifier_i = 1 + # Sometimes we might want to align back to the other queries just + # not to itself. However the relationship between a sequence and it's suffixes + # is not directly evident as there are multiple suffixes along the SA that + # belong to the same query, likely all scattered. 
+ query_offsets = Dict{Int32, UnitRange{Int32}}() + last_vect_id = Int32(1) + last_start = Int32(1) + @inbounds for v in vectors for node_id in v concat_arr[i] = node_id - i +=1 + i += 1 end + # Get the vector id offsets and store them in the query offsets dict + query_offsets[identifiers[identifier_i]] = last_start:i-1 concat_arr[i] = vect_id - vect_id +=1 - i +=1 + vect_id += 1 + i += 1 + last_vect_id = vect_id + last_start = i + identifier_i += 1 end - return concat_arr + return concat_arr, query_offsets end function create_suffix_array(in_vector::Vector{Int32}, free_space::Int32) @@ -33,10 +49,10 @@ function create_suffix_array(in_vector::Vector{Int32}, free_space::Int32) return out_vector end -function create_k_suffix_array(vectors::Vector{Vector{Int32}}, free_space::Int32) - concat_array = concat_with_seperator(vectors) +function create_k_suffix_array(vectors::Vector{Vector{Int32}}, query_identifiers::Vector{Int32}, free_space::Int32) + concat_array, query_offsets = concat_with_seperator(vectors, query_identifiers) suffix_array = create_suffix_array(concat_array, free_space) - return concat_array, suffix_array + return concat_array, query_offsets, suffix_array end function build_lcp(sa::Vector{Int32}, V::Vector{Int32}, base::Integer=1) diff --git a/test.jl b/test.jl new file mode 100644 index 0000000..838d689 --- /dev/null +++ b/test.jl @@ -0,0 +1,36 @@ +function concat_with_seperator(vectors::Vector{Vector{Int32}},identifiers::Vector{Int32}) + length(vectors) == length(identifiers) || error("Vectors != identifiers") + # Probably a cleaner way to do this :) Like using map + # to flip the node ids in slices and copying the slices to the output + total_size = sum(map(length, vectors)) + concat_arr = zeros(Int32, total_size + length(vectors)) + vect_id = -1 * length(vectors) # to have it decending for sorting + # Concat with seperator + store sign in most significant bit + i = 1 + identifier_i = 1 + # Sometimes we might want to align back to the other queries 
just + # not to itself. However the relationship between a sequence and it's suffixes + # is not directly evident as there are multiple suffixes along the SA that + # belong to the same query, likely all scattered. + query_offsets = Dict{Int32, UnitRange{Int32}}() + last_vect_id = Int32(1) + last_start = Int32(1) + + @inbounds for v in vectors + for node_id in v + concat_arr[i] = node_id + i += 1 + end + # Get the vector id offsets and store them in the query offsets dict + query_offsets[identifiers[identifier_i]] = last_start:i-1 + concat_arr[i] = vect_id + vect_id += 1 + i += 1 + last_vect_id = vect_id + last_start = i + identifier_i += 1 + end + return concat_arr, query_offsets +end + +concat_with_seperator( [Int32[1,2,3,4,5], Int32[100,200,100], Int32[10000,10000] ], [Int32(1), Int32(5), Int32(9)]) \ No newline at end of file From ae3510704d884fae0cd97d9d5c091963b60a113c Mon Sep 17 00:00:00 2001 From: codegodz Date: Sat, 15 Jul 2023 14:02:06 +0200 Subject: [PATCH 2/2] graph io update --- graph_io.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/graph_io.jl b/graph_io.jl index ccc616d..d5fd710 100644 --- a/graph_io.jl +++ b/graph_io.jl @@ -11,15 +11,19 @@ end function read_queries(f::String, query_ids::OrderedSet{String}) queries = Vector{Vector{Int32}}() query_ids_file_order = OrderedSet{String}() # Might find in different order in the file than query_ids + query_ids_numerical = Vector{Int32}() for (i, line) in enumerate(eachline(f)) identifier, path = split(line, "\t") if identifier in query_ids path_numbers = parse_numbers(path) push!(query_ids_file_order, identifier) push!(queries, path_numbers) + push!(query_ids_numerical, i) + length(query_ids) == length(query_ids_file_order) && break end end - return queries, query_ids_file_order + println("Read: ", length(query_ids_file_order), " queries") + return queries, query_ids_file_order, query_ids_numerical end function read_node_sizes(f::String) @@ -44,8 +48,8 @@ end function 
processGFA(gfa::String, query_file::String) # Read the query ids from the file query_ids = read_ids_from_file(query_file) - queries, query_ids = read_queries(gfa, query_ids) - return queries, query_ids + queries, query_ids, query_ids_numerical = read_queries(gfa, query_ids) + return queries, query_ids, query_ids_numerical end function writeResults(ca::Vector{Int32}, color::Color, query_ids::OrderedSet{String}, out_file::String, size_map::Dict{Int32, Int32})