From 8ba98fc126021bab02fd7a91c1c2686b06c4c700 Mon Sep 17 00:00:00 2001
From: anton083 <anton.oresten42@gmail.com>
Date: Thu, 17 Oct 2024 16:25:07 +0200
Subject: [PATCH] Add removeproperties, fix numbering, numbering tests

---
 Project.toml         |  2 +-
 src/ProteinChains.jl |  2 +-
 src/chain.jl         | 28 +++++++++++++----
 src/io/download.jl   | 14 ++++++---
 src/io/read.jl       | 11 +++++--
 src/io/renumber.jl   | 71 ++++++++++++++++----------------------------
 test/runtests.jl     | 50 +++++++++++++++++++++++++++++++
 7 files changed, 120 insertions(+), 58 deletions(-)

diff --git a/Project.toml b/Project.toml
index b2f27d5..2e58203 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ProteinChains"
 uuid = "b8e8f2a5-48d3-44f1-ba0d-c71cb7726ff8"
 authors = ["Anton Oresten <anton.oresten42@gmail.com> and contributors"]
-version = "0.3.1"
+version = "0.3.2"
 
 [deps]
 AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7"
diff --git a/src/ProteinChains.jl b/src/ProteinChains.jl
index 290a330..c57c955 100644
--- a/src/ProteinChains.jl
+++ b/src/ProteinChains.jl
@@ -15,7 +15,7 @@ export Atom
 
 include("properties.jl")
 export PersistentProperty, IndexableProperty
-export addproperties
+export addproperties, removeproperties
 @compat public (AbstractProperty, NamedProperties)
 
 include("chain.jl")
diff --git a/src/chain.jl b/src/chain.jl
index 4aad811..241c168 100644
--- a/src/chain.jl
+++ b/src/chain.jl
@@ -8,7 +8,7 @@ The [`addproperties`](@ref) function can be used to instantiate new chains with
 - `id::String`: Identifier for the protein chain.
 - `atoms::Vector{Vector{Atom{T}}}`: List of atoms in each residue.
 - `sequence::String`: Amino acid sequence of the protein.
-- `numbering::Vector{Int32}`: Residue numbering.
+- `numbering::Vector{Int32}`: Residue numbering (author). See [`renumber`](@ref) for renumbering.
 - `properties::Ps`: Named properties associated with the chain.
 
 See also [`addproperties`](@ref), [`PersistentProperty`](@ref), [`IndexableProperty`](@ref).
@@ -39,8 +39,11 @@ ProteinChain(id, atoms, sequence, numbering) = ProteinChain(id, atoms, sequence,
 Base.convert(::Type{ProteinChain{T}}, chain::ProteinChain) where T =
     ProteinChain(chain.id, convert(Vector{Vector{Atom{T}}}, chain.atoms), chain.sequence, chain.numbering, chain.properties)
 
-Base.:(==)(chain1::ProteinChain, chain2::ProteinChain) = propertynames(chain1) == propertynames(chain2) &&
-    !any(getproperty(chain1, name) != getproperty(chain2, name) for name in propertynames(chain1, false))
+function Base.:(==)(chain1::ProteinChain, chain2::ProteinChain)
+    sorted_names1 = sort!(collect(propertynames(chain1, false)))
+    sorted_names2 = sort!(collect(propertynames(chain2, false)))
+    sorted_names1 == sorted_names2 && !any(getproperty(chain1, name) != getproperty(chain2, name) for name in sorted_names1)
+end
 
 Base.length(chain::ProteinChain) = length(chain.atoms)
 
@@ -54,6 +57,8 @@ function Base.getindex(chain::ProteinChain, i::AbstractVector)
     ProteinChain(chain.id, chain.atoms[i], chain.sequence[i], chain.numbering[i], properties)
 end
 
+setproperties(chain::ProteinChain, ps::NamedProperties) = ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, ps)
+
 """
     addproperties(chain::ProteinChain; properties...)
 
@@ -63,11 +68,24 @@ them with `PersistentProperty` or `IndexableProperty`.
 
 Values get wrapped by `PersistentProperty` by default.
 
-See also [`PersistentProperty`](@ref), [`IndexableProperty`](@ref)
+See also [`removeproperties`](@ref), [`PersistentProperty`](@ref), [`IndexableProperty`](@ref)
 """
 function addproperties(chain::ProteinChain; properties...)
     properties = map(p -> p isa AbstractProperty ? p : PersistentProperty(p), NamedTuple(properties))
-    return ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, merge(chain.properties, properties))
+    setproperties(chain, merge(chain.properties, properties))
+end
+
+"""
+    removeproperties(chain::ProteinChain, names::Symbol...)
+
+Creates a new `ProteinChain` instance with the property names in `names` removed.
+
+See also [`addproperties`](@ref)
+"""
+function removeproperties(chain::ProteinChain, names::Symbol...)
+    new_propertynames = filter(name -> name ∉ names, propertynames(chain.properties))
+    properties = NamedTuple{new_propertynames}(chain.properties)
+    setproperties(chain, properties)
 end
 
 Base.summary(chain::ProteinChain) = "$(length(chain))-residue $(typeof(chain)) ($(chain.id))"
diff --git a/src/io/download.jl b/src/io/download.jl
index c9ab1a8..e6fd830 100644
--- a/src/io/download.jl
+++ b/src/io/download.jl
@@ -4,7 +4,7 @@ const MAX_CACHE_ENTRIES = 4
 const CACHE_DIR = Ref{Union{String,Nothing}}(nothing)
 
 function initialize_cache_dir()
-    if CACHE_DIR[] === nothing
+    if isnothing(CACHE_DIR[])
         CACHE_DIR[] = mktempdir(prefix="pdb_cache_")
         atexit(() -> rm(CACHE_DIR[], recursive=true))
     end
@@ -50,11 +50,17 @@ julia> pdb"1EYE"1 # integer suffix to specify "ba_number" keyword
  256-residue ProteinChain{Float64, @NamedTuple{}} (A-2)
 ```
 """
-function pdbentry(pdbid::AbstractString; format=MMCIFFormat, kwargs...)
+function pdbentry(pdbid::AbstractString; dir=nothing, format=MMCIFFormat, kwargs...)
+    isnothing(dir) && return cached_pdbentry(pdbid; format, kwargs...)
+    path = BioStructures.downloadpdb(pdbid; dir, format, kwargs...)
+    return read(path, ProteinStructure, format)
+end
+
+function cached_pdbentry(pdbid::AbstractString; format=MMCIFFormat, kwargs...)
     initialize_cache_dir()
-    path = BioStructures.downloadpdb(pdbid; dir=CACHE_DIR[], format=format, kwargs...)
+    structure = pdbentry(pdbid; dir=CACHE_DIR[], format, kwargs...)
     manage_cache()
-    return read(path, ProteinStructure, format)
+    return structure
 end
 
 macro pdb_str(pdbid::AbstractString)
diff --git a/src/io/read.jl b/src/io/read.jl
index 392751c..0387797 100644
--- a/src/io/read.jl
+++ b/src/io/read.jl
@@ -30,7 +30,9 @@ function ProteinChain{T}(chain::BioStructures.Chain) where T
     atoms = get_atoms(Atom{T}, residues)
     sequence = get_sequence(residues)
     numbering = map(BioStructures.resnumber, residues)
-    return ProteinChain(id, atoms, sequence, numbering)
+    return addproperties(ProteinChain(id, atoms, sequence, numbering);
+        ins_codes = IndexableProperty(map(Int8 ∘ BioStructures.inscode, residues)),
+    )
 end
 
 function ProteinStructure{T}(struc::BioStructures.MolecularStructure; mmcifdict=nothing) where T
@@ -40,7 +42,12 @@ function ProteinStructure{T}(struc::BioStructures.MolecularStructure; mmcifdict=
         push!(proteinchains, ProteinChain{T}(chain))
     end
     proteinstructure = ProteinStructure(struc.name, atoms, proteinchains)
-    mmcifdict isa BioStructures.MMCIFDict && renumber!(proteinstructure, mmcifdict)
+    if !isnothing(mmcifdict)
+        chainwise_renumbering = renumber(proteinstructure, mmcifdict)
+        for (i, (proteinchain, renumbering)) in enumerate(zip(proteinstructure, chainwise_renumbering))
+            proteinstructure[i] = addproperties(proteinchain; renumbering)
+        end
+    end
     return proteinstructure
 end
 
diff --git a/src/io/renumber.jl b/src/io/renumber.jl
index a4f3029..dcbed3e 100644
--- a/src/io/renumber.jl
+++ b/src/io/renumber.jl
@@ -1,60 +1,41 @@
+const missingvals = Set([".", "?"])
+
 """
-    renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
+    renumber(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
+
+Return residue numbers from "_atom_site.label_seq_ids".
 
-Renumber the residues in a `ProteinStructure` object according to the numbering aligned to a reference sequence in the MMCIF file.
+The `ProteinStructure` will automatically add a `renumbering` property if
+an MMCIFDict is passed (default if the file is an MMCIF).
 """
-function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
+function renumber(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
     label_seq_ids = mmcif_dict["_atom_site.label_seq_id"]
+    pdbx_PDB_ins_codes = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]
     auth_seq_ids = mmcif_dict["_atom_site.auth_seq_id"]
     auth_asym_ids = mmcif_dict["_atom_site.auth_asym_id"]
-    reduced = Int[]
-    current_auth_seq_id = ""
-    for (i, auth_seq_id) in enumerate(auth_seq_ids)
-        current_auth_seq_id == auth_seq_id && continue
-        current_auth_seq_id = auth_seq_id
-        push!(reduced, i)
-    end
 
-    label_seq_ids = label_seq_ids[reduced]
-    auth_seq_ids = auth_seq_ids[reduced]
-    auth_asym_ids = auth_asym_ids[reduced]
-
-    chainwise_auth_to_label_seq_ids = Dict{String,Dict{String,String}}()
+    label_seq_id_dict = Dict{String,String}()
 
     for asym_id in unique(auth_asym_ids)
-        indices = auth_asym_ids .== asym_id
-        chainwise_auth_to_label_seq_ids[asym_id] = Dict(zip(auth_seq_ids[indices], label_seq_ids[indices]))
+        indices = findall(auth_asym_ids .== asym_id)
+        for i in indices
+            ins_code = pdbx_PDB_ins_codes[i] == "?" ? " " : pdbx_PDB_ins_codes[i]
+            key = asym_id*auth_seq_ids[i]*ins_code
+            !haskey(label_seq_id_dict, key) && (label_seq_id_dict[key] = label_seq_ids[i])
+        end
     end
 
+    chainwise_renumbering = Vector{Int32}[]
     for chain in structure
-        renum_dict = chainwise_auth_to_label_seq_ids[chain.id]
-        numbering_str = map(n -> get(renum_dict, n, "?"), string.(chain.numbering))
-        "?" in numbering_str && break
-        chain.numbering .= parse.(Int, numbering_str)
+        if length(chain) == 0
+            push!(chainwise_renumbering, Int32[])
+            continue
+        end
+        renumbering_str = map(
+            (resnum, ins_code) -> parse(Int32, get(label_seq_id_dict, chain.id*string(resnum)*ins_code, "-1")), chain.numbering,
+            hasproperty(chain, :ins_codes) ? Iterators.map(Char, chain.ins_codes) : Iterators.repeated(' '))
+        push!(chainwise_renumbering, renumbering_str)
     end
 
-    return structure
+    return chainwise_renumbering
 end
-
-#=function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
-    asym_ids = mmcif_dict["_pdbx_poly_seq_scheme.asym_id"]
-    auth_seq_nums = map(s -> s == "?" ? -1 : parse(Int, s), mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"])
-    ndb_seq_nums = map(s -> s == "?" ? -1 : parse(Int, s), mmcif_dict["_pdbx_poly_seq_scheme.ndb_seq_num"])
-    amino_acids = map(s -> get(BioStructures.threeletter_to_aa, s, BioStructures.AA_X), mmcif_dict["_pdbx_poly_seq_scheme.mon_id"])
-
-    chainwise_auth_to_ndb = Dict{String,Dict{Int,Int}}()
-    chainwise_sequences = Dict{String,String}()
-
-    for asym_id in unique(asym_ids)
-        indices = asym_ids .== asym_id
-        chainwise_auth_to_ndb[asym_id] = Dict(zip(auth_seq_nums[indices], ndb_seq_nums[indices]))
-        chainwise_sequences[asym_id] = join(amino_acids[indices])
-    end
-
-    for chain in structure
-        chain.numbering = map(n -> chainwise_auth_to_ndb[chain.id][n], chain.numbering)
-        chain.sequence_reference = chainwise_sequences[chain.id]
-    end
-
-    return structure
-end=#
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 3a92eab..c387142 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -49,6 +49,10 @@ using Test
     @testset "chain.jl" begin
         chain = ProteinChain("A", get_atoms(ProteinChains.Backbone(rand(3, 3, 5))), "AMINO", collect(1:5))
         @test length(chain) == 5
+        chain = addproperties(chain, taxid=9606)
+        @test hasproperty(chain, :taxid)
+        chain = removeproperties(chain, :taxid)
+        @test !hasproperty(chain, :taxid)
     end
 
     @testset "structure.jl" begin
@@ -77,6 +81,52 @@ using Test
             @test chains[1].sequence == new_chains[1].sequence
         end
 
+        # See https://proteopedia.org/wiki/index.php/Unusual_sequence_numbering
+        @testset "Unusual numbering" begin
+            
+            @testset "Arbitrary Numbering" begin
+                structure = pdb"1BSZ"
+                @test !allequal(chain -> chain.numbering, structure)
+                @test allequal(chain -> chain.renumbering, structure)
+            end
+
+            @testset "N-Terminal Residues Missing Coordinates" begin
+                chain = pdb"1D66"A
+                @test chain.numbering[begin] == 8
+                @test chain.renumbering[begin] == 8
+            end
+
+            @testset "Starts With Zero Or Negative Numbers" begin
+                chain = pdb"1BXW"A
+                @test chain.numbering[begin] == 0
+                @test chain.renumbering[begin] == 1
+
+                chain = pdb"1D5T"A
+                @test chain.numbering[begin] == -2
+                @test chain.renumbering[begin] == 1
+            end
+
+            @testset "Insertion Codes" begin
+                chain = pdb"1IGY"B
+                @test count(==(82), chain.numbering) == 4
+                @test allunique(chain.renumbering)
+                @test Set(map(Char, unique(chain.ins_codes))) == Set([' ', 'A', 'B', 'C'])
+            end
+
+            @testset "Skipping Sequence Numbers" begin
+                chain = pdb"2ACE"A
+                @test any(!isone, chain.numbering[begin+1:end] - chain.numbering[begin:end-1])
+                @test chain.numbering == chain.renumbering
+            end
+
+            @testset "Not Monotonic" begin
+                chain = pdb"1NSA"A
+                @test issorted(chain.numbering) # BioStructures sorts automatically with `fixlists!`, so residues are in wrong order
+                @test !issorted(chain.renumbering) # the renumbering is not monotonic. it would be if BioStructures didn't sort.
+            end
+
+        end
+
     end
 
     @testset "store" begin