-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path21_minimum_distance_pairs.Rmd
132 lines (100 loc) · 4.13 KB
/
21_minimum_distance_pairs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
---
title: "pairwise_residue_distances_from_pdbmodel"
author: "Toni Beltran"
date: "19/07/2024"
output: html_document
---
```{r load pdb and calculate distances}
#' doubledeepms__minimum_interchain_distances_from_PDB
#'
#' Calculate minimum inter-chain (side-chain) heavy atom distances.
#'
#' @param input_file path to PDB file (required)
#' @param chain_query query chain id (default:A)
#' @param chain_target target chain id (default:B)
#'
#' @return data.table with minimum inter-chain (side-chain) heavy atom distances
#' @export
#' @import data.table
#custom stuff#
library(data.table)
library(bio3d)
base_dir="/path/to/your/files"
setwd(base_dir)
minimum_pairwise_distances_from_PDB_models <- function(
input_file,
chain_query = "A",
chain_target = "A"
){
#load PDB structure
# sink(file = "/dev/null")
pdb <- bio3d::read.pdb(input_file, rm.alt = TRUE)
# sink()
### Atom selections
###########################
#Protein atoms
sele_protein <- bio3d::atom.select(pdb, "protein", verbose=FALSE)
#Hydrogen atoms
sele_H <-bio3d::atom.select(pdb, "h", verbose=FALSE)
#Water atoms
sele_water <- bio3d::atom.select(pdb, "water", verbose=FALSE)
#Side chain atoms
sele_sc <- bio3d::atom.select(pdb, "sidechain", verbose=FALSE)
#C-alpha atoms
sele_ca <- bio3d::atom.select(pdb, "calpha", verbose=FALSE)
#Glycine c-alpha atoms
sele_glyca <- bio3d::atom.select(pdb, resid = "GLY", string = "calpha", verbose=FALSE)
### Combine atom selections
###########################
#Heavy atoms
sele_HA <- bio3d::combine.select(sele_protein, sele_H, sele_water, operator = "-", verbose=FALSE)
#Side chain heavy atoms + c-alpha for glycine
sele_prot_sc <- bio3d::combine.select(sele_protein, sele_sc, operator = "AND", verbose=FALSE)
sele_prot_sc_glyca <- bio3d::combine.select(sele_prot_sc, sele_glyca, operator = "OR", verbose=FALSE)
sele_scHA <- bio3d::combine.select(sele_prot_sc_glyca, sele_H, sele_water, operator = "-", verbose=FALSE)
#Side chain heavy atoms + c-alpha for all residues
sele_prot_sc_ca <- bio3d::combine.select(sele_prot_sc, sele_ca, operator = "OR", verbose=FALSE)
sele_scHA_ca <- bio3d::combine.select(sele_prot_sc_ca, sele_H, sele_water, operator = "-", verbose=FALSE)
#List
sele_list <- list(
"HA" = sele_HA,
"scHA" = sele_scHA,
"scHA_ca" = sele_scHA_ca)
### Calculate minimum target chain distances
###########################
result_dt <- data.table()
for(metric in names(sele_list)){
#Distance matrix
pdb_sub <- bio3d::trim.pdb(pdb, sele_list[[metric]])
dist_mat <- bio3d::dm.xyz(pdb_sub$xyz, grpby=apply(pdb_sub$atom[,c("resno", "chain")], 1, paste, collapse = "_"), scut=0,
mask.lower = FALSE)
resno_sub <- unique(pdb_sub$atom[,c("resno", "chain")])
dist_mat<-matrix(dist_mat,ncol=ncol(dist_mat),nrow = nrow(dist_mat))
#change rows and columns to residue numbers
colnames(dist_mat) <- resno_sub[resno_sub[,"chain"]==chain_target,"resno"]
rownames(dist_mat) <- resno_sub[resno_sub[,"chain"]==chain_query,"resno"]
#chain target names
dist_mat.mlt<-data.table(reshape2::melt(dist_mat))
colnames(dist_mat.mlt)<-c("Pos1","Pos2",metric)
if (nrow(result_dt)==0){result_dt<-dist_mat.mlt}
else{result_dt<-merge(result_dt,dist_mat.mlt,by=c("Pos1","Pos2"))}
}
#Return
return(result_dt)
}
domainome_domains<-fread("analysis_files/domain_QC_summary_reproducibility_ranked_sortedPFAMID.txt")[retained=="yes",]
distances_domainome_all<-data.frame()
for (domain in domainome_domains$doms){
input_file_path<-paste("analysis_files/pdb_files/",domain,".pdb",sep="")
pdbfile_dists <- minimum_pairwise_distances_from_PDB_models(input_file = input_file_path,
chain_query = "A",
chain_target = "A")
pdbfile_dists[,dom_ID:=domain]
distances_domainome_all<-rbind(distances_domainome_all,
pdbfile_dists)
}
write.table(distances_domainome_all,
file = "analysis_files/minimum_residuedistances_pairwise.txt",
quote=FALSE,
sep="\t")
```