From 8ba98fc126021bab02fd7a91c1c2686b06c4c700 Mon Sep 17 00:00:00 2001 From: anton083 Date: Thu, 17 Oct 2024 16:25:07 +0200 Subject: [PATCH] Add removeproperties, fix numbering, numbering tests --- Project.toml | 2 +- src/ProteinChains.jl | 2 +- src/chain.jl | 28 +++++++++++++---- src/io/download.jl | 14 ++++++--- src/io/read.jl | 11 +++++-- src/io/renumber.jl | 71 ++++++++++++++++---------------------------- test/runtests.jl | 50 +++++++++++++++++++++++++++++++ 7 files changed, 120 insertions(+), 58 deletions(-) diff --git a/Project.toml b/Project.toml index b2f27d5..2e58203 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ProteinChains" uuid = "b8e8f2a5-48d3-44f1-ba0d-c71cb7726ff8" authors = ["Anton Oresten and contributors"] -version = "0.3.1" +version = "0.3.2" [deps] AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7" diff --git a/src/ProteinChains.jl b/src/ProteinChains.jl index 290a330..c57c955 100644 --- a/src/ProteinChains.jl +++ b/src/ProteinChains.jl @@ -15,7 +15,7 @@ export Atom include("properties.jl") export PersistentProperty, IndexableProperty -export addproperties +export addproperties, removeproperties @compat public (AbstractProperty, NamedProperties) include("chain.jl") diff --git a/src/chain.jl b/src/chain.jl index 4aad811..241c168 100644 --- a/src/chain.jl +++ b/src/chain.jl @@ -8,7 +8,7 @@ The [`addproperties`](@ref) function can be used to instantiate new chains with - `id::String`: Identifier for the protein chain. - `atoms::Vector{Vector{Atom{T}}}`: List of atoms in each residue. - `sequence::String`: Amino acid sequence of the protein. -- `numbering::Vector{Int32}`: Residue numbering. +- `numbering::Vector{Int32}`: Residue numbering (author). See [`renumber`](@ref) for renumbering. - `properties::Ps`: Named properties associated with the chain. See also [`addproperties`](@ref), [`PersistentProperty`](@ref), [`IndexableProperty`](@ref). @@ -39,8 +39,11 @@ ProteinChain(id, atoms, sequence, numbering) = ProteinChain(id, atoms, sequence, Base.convert(::Type{ProteinChain{T}}, chain::ProteinChain) where T = ProteinChain(chain.id, convert(Vector{Vector{Atom{T}}}, chain.atoms), chain.sequence, chain.numbering, chain.properties) -Base.:(==)(chain1::ProteinChain, chain2::ProteinChain) = propertynames(chain1) == propertynames(chain2) && - !any(getproperty(chain1, name) != getproperty(chain2, name) for name in propertynames(chain1, false)) +function Base.:(==)(chain1::ProteinChain, chain2::ProteinChain) + sorted_names1 = sort!(collect(propertynames(chain1, false))) + sorted_names2 = sort!(collect(propertynames(chain2, false))) + sorted_names1 == sorted_names2 && !any(getproperty(chain1, name) != getproperty(chain2, name) for name in sorted_names1) +end Base.length(chain::ProteinChain) = length(chain.atoms) @@ -54,6 +57,8 @@ function Base.getindex(chain::ProteinChain, i::AbstractVector) ProteinChain(chain.id, chain.atoms[i], chain.sequence[i], chain.numbering[i], properties) end +setproperties(chain::ProteinChain, ps::NamedProperties) = ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, ps) + """ addproperties(chain::ProteinChain; properties...) @@ -63,11 +68,24 @@ them with `PersistentProperty` or `IndexableProperty`. Values get wrapped by `PersistentProperty` by default. -See also [`PersistentProperty`](@ref), [`IndexableProperty`](@ref) +See also [`removeproperties`](@ref), [`PersistentProperty`](@ref), [`IndexableProperty`](@ref) """ function addproperties(chain::ProteinChain; properties...) properties = map(p -> p isa AbstractProperty ? p : PersistentProperty(p), NamedTuple(properties)) - return ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, merge(chain.properties, properties)) + setproperties(chain, merge(chain.properties, properties)) +end + +""" + removeproperties(chain::ProteinChain, names::Symbol...) + +Creates a new `ProteinChain` instance with the property names in `names` removed. + +See also [`addproperties`](@ref) +""" +function removeproperties(chain::ProteinChain, names::Symbol...) + new_propertynames = filter(name -> name ∉ names, propertynames(chain.properties)) + properties = NamedTuple{new_propertynames}(chain.properties) + setproperties(chain, properties) end Base.summary(chain::ProteinChain) = "$(length(chain))-residue $(typeof(chain)) ($(chain.id))" diff --git a/src/io/download.jl b/src/io/download.jl index c9ab1a8..e6fd830 100644 --- a/src/io/download.jl +++ b/src/io/download.jl @@ -4,7 +4,7 @@ const MAX_CACHE_ENTRIES = 4 const CACHE_DIR = Ref{Union{String,Nothing}}(nothing) function initialize_cache_dir() - if CACHE_DIR[] === nothing + if isnothing(CACHE_DIR[]) CACHE_DIR[] = mktempdir(prefix="pdb_cache_") atexit(() -> rm(CACHE_DIR[], recursive=true)) end @@ -50,11 +50,17 @@ julia> pdb"1EYE"1 # integer suffix to specify "ba_number" keyword 256-residue ProteinChain{Float64, @NamedTuple{}} (A-2) ``` """ -function pdbentry(pdbid::AbstractString; format=MMCIFFormat, kwargs...) +function pdbentry(pdbid::AbstractString; dir=nothing, format=MMCIFFormat, kwargs...) + isnothing(dir) && return cached_pdbentry(pdbid; format, kwargs...) + path = BioStructures.downloadpdb(pdbid; dir, format, kwargs...) + return read(path, ProteinStructure, format) +end + +function cached_pdbentry(pdbid::AbstractString; format=MMCIFFormat, kwargs...) initialize_cache_dir() - path = BioStructures.downloadpdb(pdbid; dir=CACHE_DIR[], format=format, kwargs...) + structure = pdbentry(pdbid; dir=CACHE_DIR[], format, kwargs...) manage_cache() - return read(path, ProteinStructure, format) + return structure end macro pdb_str(pdbid::AbstractString) diff --git a/src/io/read.jl b/src/io/read.jl index 392751c..0387797 100644 --- a/src/io/read.jl +++ b/src/io/read.jl @@ -30,7 +30,9 @@ function ProteinChain{T}(chain::BioStructures.Chain) where T atoms = get_atoms(Atom{T}, residues) sequence = get_sequence(residues) numbering = map(BioStructures.resnumber, residues) - return ProteinChain(id, atoms, sequence, numbering) + return addproperties(ProteinChain(id, atoms, sequence, numbering); + ins_codes = IndexableProperty(map(Int8 ∘ BioStructures.inscode, residues)), + ) end function ProteinStructure{T}(struc::BioStructures.MolecularStructure; mmcifdict=nothing) where T @@ -40,7 +42,12 @@ function ProteinStructure{T}(struc::BioStructures.MolecularStructure; mmcifdict= push!(proteinchains, ProteinChain{T}(chain)) end proteinstructure = ProteinStructure(struc.name, atoms, proteinchains) - mmcifdict isa BioStructures.MMCIFDict && renumber!(proteinstructure, mmcifdict) + if !isnothing(mmcifdict) + chainwise_renumbering = renumber(proteinstructure, mmcifdict) + for (i, (proteinchain, renumbering)) in enumerate(zip(proteinstructure, chainwise_renumbering)) + proteinstructure[i] = addproperties(proteinchain; renumbering) + end + end return proteinstructure end diff --git a/src/io/renumber.jl b/src/io/renumber.jl index a4f3029..dcbed3e 100644 --- a/src/io/renumber.jl +++ b/src/io/renumber.jl @@ -1,60 +1,41 @@ +const missingvals = Set([".", "?"]) + """ - renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict) + renumber(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict) + +Return residue numbers from "_atom_site.label_seq_ids". -Renumber the residues in a `ProteinStructure` object according to the numbering aligned to a reference sequence in the MMCIF file. +The `ProteinStructure` will automatically add a `renumbering` property if +an MMCIFDict is passed (default if the file is an MMCIF). """ -function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict) +function renumber(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict) label_seq_ids = mmcif_dict["_atom_site.label_seq_id"] + pdbx_PDB_ins_codes = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] auth_seq_ids = mmcif_dict["_atom_site.auth_seq_id"] auth_asym_ids = mmcif_dict["_atom_site.auth_asym_id"] - reduced = Int[] - current_auth_seq_id = "" - for (i, auth_seq_id) in enumerate(auth_seq_ids) - current_auth_seq_id == auth_seq_id && continue - current_auth_seq_id = auth_seq_id - push!(reduced, i) - end - label_seq_ids = label_seq_ids[reduced] - auth_seq_ids = auth_seq_ids[reduced] - auth_asym_ids = auth_asym_ids[reduced] - - chainwise_auth_to_label_seq_ids = Dict{String,Dict{String,String}}() + label_seq_id_dict = Dict{String,String}() for asym_id in unique(auth_asym_ids) - indices = auth_asym_ids .== asym_id - chainwise_auth_to_label_seq_ids[asym_id] = Dict(zip(auth_seq_ids[indices], label_seq_ids[indices])) + indices = findall(auth_asym_ids .== asym_id) + for i in indices + ins_code = pdbx_PDB_ins_codes[i] == "?" ? " " : pdbx_PDB_ins_codes[i] + key = asym_id*auth_seq_ids[i]*ins_code + !haskey(label_seq_id_dict, key) && (label_seq_id_dict[key] = label_seq_ids[i]) + end end + chainwise_renumbering = Vector{Int32}[] for chain in structure - renum_dict = chainwise_auth_to_label_seq_ids[chain.id] - numbering_str = map(n -> get(renum_dict, n, "?"), string.(chain.numbering)) - "?" in numbering_str && break - chain.numbering .= parse.(Int, numbering_str) + if length(chain) == 0 + push!(chainwise_renumbering, Int32[]) + continue + end + renumbering_str = map( + (resnum, ins_code) -> parse(Int32, get(label_seq_id_dict, chain.id*string(resnum)*ins_code, "-1")), chain.numbering, + hasproperty(chain, :ins_codes) ? Iterators.map(Char, chain.ins_codes) : Iterators.repeated(' ')) + push!(chainwise_renumbering, renumbering_str) end - return structure + return chainwise_renumbering end - -#=function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict) - asym_ids = mmcif_dict["_pdbx_poly_seq_scheme.asym_id"] - auth_seq_nums = map(s -> s == "?" ? -1 : parse(Int, s), mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"]) - ndb_seq_nums = map(s -> s == "?" ? -1 : parse(Int, s), mmcif_dict["_pdbx_poly_seq_scheme.ndb_seq_num"]) - amino_acids = map(s -> get(BioStructures.threeletter_to_aa, s, BioStructures.AA_X), mmcif_dict["_pdbx_poly_seq_scheme.mon_id"]) - - chainwise_auth_to_ndb = Dict{String,Dict{Int,Int}}() - chainwise_sequences = Dict{String,String}() - - for asym_id in unique(asym_ids) - indices = asym_ids .== asym_id - chainwise_auth_to_ndb[asym_id] = Dict(zip(auth_seq_nums[indices], ndb_seq_nums[indices])) - chainwise_sequences[asym_id] = join(amino_acids[indices]) - end - - for chain in structure - chain.numbering = map(n -> chainwise_auth_to_ndb[chain.id][n], chain.numbering) - chain.sequence_reference = chainwise_sequences[chain.id] - end - - return structure -end=# \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 3a92eab..c387142 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -49,6 +49,10 @@ using Test @testset "chain.jl" begin chain = ProteinChain("A", get_atoms(ProteinChains.Backbone(rand(3, 3, 5))), "AMINO", collect(1:5)) @test length(chain) == 5 + chain = addproperties(chain, taxid=9606) + @test hasproperty(chain, :taxid) + chain = removeproperties(chain, :taxid) + @test !hasproperty(chain, :taxid) end @testset "structure.jl" begin @@ -77,6 +81,52 @@ using Test @test chains[1].sequence == new_chains[1].sequence end + # See https://proteopedia.org/wiki/index.php/Unusual_sequence_numbering + @testset "Unusual numbering" begin + + @testset "Arbitrary Numbering" begin + structure = pdb"1BSZ" + @test !allequal(chain -> chain.numbering, structure) + @test allequal(chain -> chain.renumbering, structure) + end + + @testset "N-Terminal Residues Missing Coordinates" begin + chain = pdb"1D66"A + @test chain.numbering[begin] == 8 + @test chain.renumbering[begin] == 8 + end + + @testset "Starts With Zero Or Negative Numbers" begin + chain = pdb"1BXW"A + @test chain.numbering[begin] == 0 + @test chain.renumbering[begin] == 1 + + chain = pdb"1D5T"A + @test chain.numbering[begin] == -2 + @test chain.renumbering[begin] == 1 + end + + @testset "Insertion Codes" begin + chain = pdb"1IGY"B + @test count(==(82), chain.numbering) == 4 + @test allunique(chain.renumbering) + @test Set(map(Char, unique(chain.ins_codes))) == Set([' ', 'A', 'B', 'C']) + end + + @testset "Skipping Sequence Numbers" begin + chain = pdb"2ACE"A + @test any(!isone, chain.numbering[begin+1:end] - chain.numbering[begin:end-1]) + @test chain.numbering == chain.renumbering + end + + @testset "Not Monotonic" begin + chain = pdb"1NSA"A + @test issorted(chain.numbering) # BioStructures sorts automatically with `fixlists!`, so residues are in wrong order + @test !issorted(chain.renumbering) # the renumbering is not monotonic. it would be if BioStructures didn't sort. + end + + end + end @testset "store" begin