Skip to content

Commit

Permalink
Merge branch 'main' into mmcifutils
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonOresten authored Oct 17, 2024
2 parents c61fc99 + 8ba98fc commit 1ca4a19
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 58 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "ProteinChains"
uuid = "b8e8f2a5-48d3-44f1-ba0d-c71cb7726ff8"
authors = ["Anton Oresten <[email protected]> and contributors"]
version = "0.3.1"
version = "0.3.2"

[deps]
AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7"
Expand Down
2 changes: 1 addition & 1 deletion src/ProteinChains.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export Atom

include("properties.jl")
export PersistentProperty, IndexableProperty
export addproperties
export addproperties, removeproperties
@compat public (AbstractProperty, NamedProperties)

include("chain.jl")
Expand Down
28 changes: 23 additions & 5 deletions src/chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The [`addproperties`](@ref) function can be used to instantiate new chains with
- `id::String`: Identifier for the protein chain.
- `atoms::Vector{Vector{Atom{T}}}`: List of atoms in each residue.
- `sequence::String`: Amino acid sequence of the protein.
- `numbering::Vector{Int32}`: Residue numbering.
- `numbering::Vector{Int32}`: Residue numbering (author). See [`renumber`](@ref) for renumbering.
- `properties::Ps`: Named properties associated with the chain.
See also [`addproperties`](@ref), [`PersistentProperty`](@ref), [`IndexableProperty`](@ref).
Expand Down Expand Up @@ -39,8 +39,11 @@ ProteinChain(id, atoms, sequence, numbering) = ProteinChain(id, atoms, sequence,
Base.convert(::Type{ProteinChain{T}}, chain::ProteinChain) where T =
ProteinChain(chain.id, convert(Vector{Vector{Atom{T}}}, chain.atoms), chain.sequence, chain.numbering, chain.properties)

Base.:(==)(chain1::ProteinChain, chain2::ProteinChain) = propertynames(chain1) == propertynames(chain2) &&
!any(getproperty(chain1, name) != getproperty(chain2, name) for name in propertynames(chain1, false))
function Base.:(==)(chain1::ProteinChain, chain2::ProteinChain)
sorted_names1 = sort!(collect(propertynames(chain1, false)))
sorted_names2 = sort!(collect(propertynames(chain2, false)))
sorted_names1 == sorted_names2 && !any(getproperty(chain1, name) != getproperty(chain2, name) for name in sorted_names1)
end

Base.length(chain::ProteinChain) = length(chain.atoms)

Expand All @@ -54,6 +57,8 @@ function Base.getindex(chain::ProteinChain, i::AbstractVector)
ProteinChain(chain.id, chain.atoms[i], chain.sequence[i], chain.numbering[i], properties)
end

setproperties(chain::ProteinChain, ps::NamedProperties) = ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, ps)

"""
addproperties(chain::ProteinChain; properties...)
Expand All @@ -63,11 +68,24 @@ them with `PersistentProperty` or `IndexableProperty`.
Values get wrapped by `PersistentProperty` by default.
See also [`PersistentProperty`](@ref), [`IndexableProperty`](@ref)
See also [`removeproperties`](@ref), [`PersistentProperty`](@ref), [`IndexableProperty`](@ref)
"""
function addproperties(chain::ProteinChain; properties...)
properties = map(p -> p isa AbstractProperty ? p : PersistentProperty(p), NamedTuple(properties))
return ProteinChain(chain.id, chain.atoms, chain.sequence, chain.numbering, merge(chain.properties, properties))
setproperties(chain, merge(chain.properties, properties))
end

"""
removeproperties(chain::ProteinChain, names::Symbol...)
Creates a new `ProteinChain` instance with the property names in `names` removed.
See also [`addproperties`](@ref)
"""
function removeproperties(chain::ProteinChain, names::Symbol...)
new_propertynames = filter(name -> name names, propertynames(chain.properties))
properties = NamedTuple{new_propertynames}(chain.properties)
setproperties(chain, properties)
end

Base.summary(chain::ProteinChain) = "$(length(chain))-residue $(typeof(chain)) ($(chain.id))"
Expand Down
14 changes: 10 additions & 4 deletions src/io/download.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ const MAX_CACHE_ENTRIES = 4
const CACHE_DIR = Ref{Union{String,Nothing}}(nothing)

function initialize_cache_dir()
if CACHE_DIR[] === nothing
if isnothing(CACHE_DIR[])
CACHE_DIR[] = mktempdir(prefix="pdb_cache_")
atexit(() -> rm(CACHE_DIR[], recursive=true))
end
Expand Down Expand Up @@ -50,11 +50,17 @@ julia> pdb"1EYE"1 # integer suffix to specify "ba_number" keyword
256-residue ProteinChain{Float64, @NamedTuple{}} (A-2)
```
"""
function pdbentry(pdbid::AbstractString; format=MMCIFFormat, kwargs...)
function pdbentry(pdbid::AbstractString; dir=nothing, format=MMCIFFormat, kwargs...)
isnothing(dir) && return cached_pdbentry(pdbid; format, kwargs...)
path = BioStructures.downloadpdb(pdbid; dir, format, kwargs...)
return read(path, ProteinStructure, format)
end

function cached_pdbentry(pdbid::AbstractString; format=MMCIFFormat, kwargs...)
initialize_cache_dir()
path = BioStructures.downloadpdb(pdbid; dir=CACHE_DIR[], format=format, kwargs...)
structure = pdbentry(pdbid; dir=CACHE_DIR[], format, kwargs...)
manage_cache()
return read(path, ProteinStructure, format)
return structure
end

macro pdb_str(pdbid::AbstractString)
Expand Down
11 changes: 9 additions & 2 deletions src/io/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ function ProteinChain{T}(chain::BioStructures.Chain) where T
atoms = get_atoms(Atom{T}, residues)
sequence = get_sequence(residues)
numbering = map(BioStructures.resnumber, residues)
return ProteinChain(id, atoms, sequence, numbering)
return addproperties(ProteinChain(id, atoms, sequence, numbering);
ins_codes = IndexableProperty(map(Int8 BioStructures.inscode, residues)),
)
end

function ProteinStructure{T}(struc::BioStructures.MolecularStructure; mmcifdict=nothing) where T
Expand All @@ -40,7 +42,12 @@ function ProteinStructure{T}(struc::BioStructures.MolecularStructure; mmcifdict=
push!(proteinchains, ProteinChain{T}(chain))
end
proteinstructure = ProteinStructure(struc.name, atoms, proteinchains)
mmcifdict isa BioStructures.MMCIFDict && renumber!(proteinstructure, mmcifdict)
if !isnothing(mmcifdict)
chainwise_renumbering = renumber(proteinstructure, mmcifdict)
for (i, (proteinchain, renumbering)) in enumerate(zip(proteinstructure, chainwise_renumbering))
proteinstructure[i] = addproperties(proteinchain; renumbering)
end
end
return proteinstructure
end

Expand Down
71 changes: 26 additions & 45 deletions src/io/renumber.jl
Original file line number Diff line number Diff line change
@@ -1,60 +1,41 @@
const missingvals = Set([".", "?"])

"""
renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
renumber(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
Return residue numbers from "_atom_site.label_seq_ids".
Renumber the residues in a `ProteinStructure` object according to the numbering aligned to a reference sequence in the MMCIF file.
The `ProteinStructure` will automatically add a `renumbering` property if
an MMCIFDict is passed (default if the file is an MMCIF).
"""
function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
function renumber(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
label_seq_ids = mmcif_dict["_atom_site.label_seq_id"]
pdbx_PDB_ins_codes = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]
auth_seq_ids = mmcif_dict["_atom_site.auth_seq_id"]
auth_asym_ids = mmcif_dict["_atom_site.auth_asym_id"]
reduced = Int[]
current_auth_seq_id = ""
for (i, auth_seq_id) in enumerate(auth_seq_ids)
current_auth_seq_id == auth_seq_id && continue
current_auth_seq_id = auth_seq_id
push!(reduced, i)
end

label_seq_ids = label_seq_ids[reduced]
auth_seq_ids = auth_seq_ids[reduced]
auth_asym_ids = auth_asym_ids[reduced]

chainwise_auth_to_label_seq_ids = Dict{String,Dict{String,String}}()
label_seq_id_dict = Dict{String,String}()

for asym_id in unique(auth_asym_ids)
indices = auth_asym_ids .== asym_id
chainwise_auth_to_label_seq_ids[asym_id] = Dict(zip(auth_seq_ids[indices], label_seq_ids[indices]))
indices = findall(auth_asym_ids .== asym_id)
for i in indices
ins_code = pdbx_PDB_ins_codes[i] == "?" ? " " : pdbx_PDB_ins_codes[i]
key = asym_id*auth_seq_ids[i]*ins_code
!haskey(label_seq_id_dict, key) && (label_seq_id_dict[key] = label_seq_ids[i])
end
end

chainwise_renumbering = Vector{Int32}[]
for chain in structure
renum_dict = chainwise_auth_to_label_seq_ids[chain.id]
numbering_str = map(n -> get(renum_dict, n, "?"), string.(chain.numbering))
"?" in numbering_str && break
chain.numbering .= parse.(Int, numbering_str)
if length(chain) == 0
push!(chainwise_renumbering, Int32[])
continue
end
renumbering_str = map(
(resnum, ins_code) -> parse(Int32, get(label_seq_id_dict, chain.id*string(resnum)*ins_code, "-1")), chain.numbering,
hasproperty(chain, :ins_codes) ? Iterators.map(Char, chain.ins_codes) : Iterators.repeated(' '))
push!(chainwise_renumbering, renumbering_str)
end

return structure
return chainwise_renumbering
end

#=function renumber!(structure::ProteinStructure, mmcif_dict::BioStructures.MMCIFDict)
asym_ids = mmcif_dict["_pdbx_poly_seq_scheme.asym_id"]
auth_seq_nums = map(s -> s == "?" ? -1 : parse(Int, s), mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"])
ndb_seq_nums = map(s -> s == "?" ? -1 : parse(Int, s), mmcif_dict["_pdbx_poly_seq_scheme.ndb_seq_num"])
amino_acids = map(s -> get(BioStructures.threeletter_to_aa, s, BioStructures.AA_X), mmcif_dict["_pdbx_poly_seq_scheme.mon_id"])
chainwise_auth_to_ndb = Dict{String,Dict{Int,Int}}()
chainwise_sequences = Dict{String,String}()
for asym_id in unique(asym_ids)
indices = asym_ids .== asym_id
chainwise_auth_to_ndb[asym_id] = Dict(zip(auth_seq_nums[indices], ndb_seq_nums[indices]))
chainwise_sequences[asym_id] = join(amino_acids[indices])
end
for chain in structure
chain.numbering = map(n -> chainwise_auth_to_ndb[chain.id][n], chain.numbering)
chain.sequence_reference = chainwise_sequences[chain.id]
end
return structure
end=#
50 changes: 50 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ using Test
@testset "chain.jl" begin
chain = ProteinChain("A", get_atoms(ProteinChains.Backbone(rand(3, 3, 5))), "AMINO", collect(1:5))
@test length(chain) == 5
chain = addproperties(chain, taxid=9606)
@test hasproperty(chain, :taxid)
chain = removeproperties(chain, :taxid)
@test !hasproperty(chain, :taxid)
end

@testset "structure.jl" begin
Expand Down Expand Up @@ -85,6 +89,52 @@ using Test
end
end

# See https://proteopedia.org/wiki/index.php/Unusual_sequence_numbering
@testset "Unusual numbering" begin

@testset "Arbitrary Numbering" begin
structure = pdb"1BSZ"
@test !allequal(chain -> chain.numbering, structure)
@test allequal(chain -> chain.renumbering, structure)
end

@testset "N-Terminal Residues Missing Coordinates" begin
chain = pdb"1D66"A
@test chain.numbering[begin] == 8
@test chain.renumbering[begin] == 8
end

@testset "Starts With Zero Or Negative Numbers" begin
chain = pdb"1BXW"A
@test chain.numbering[begin] == 0
@test chain.renumbering[begin] == 1

chain = pdb"1D5T"A
@test chain.numbering[begin] == -2
@test chain.renumbering[begin] == 1
end

@testset "Insertion Codes" begin
chain = pdb"1IGY"B
@test count(==(82), chain.numbering) == 4
@test allunique(chain.renumbering)
@test Set(map(Char, unique(chain.ins_codes))) == Set([' ', 'A', 'B', 'C'])
end

@testset "Skipping Sequence Numbers" begin
chain = pdb"2ACE"A
@test any(!isone, chain.numbering[begin+1:end] - chain.numbering[begin:end-1])
@test chain.numbering == chain.renumbering
end

@testset "Not Monotonic" begin
chain = pdb"1NSA"A
@test issorted(chain.numbering) # BioStructures sorts automatically with `fixlists!`, so residues are in wrong order
@test !issorted(chain.renumbering) # the renumbering is not monotonic. it would be if BioStructures didn't sort.
end

end

end

@testset "store" begin
Expand Down

0 comments on commit 1ca4a19

Please sign in to comment.