Skip to content

Commit

Permalink
upgraded to new RCSB webservices
Browse files Browse the repository at this point in the history
  • Loading branch information
pwrose committed Jan 27, 2022
1 parent 43b81e4 commit 2da1eac
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 0 deletions.
69 changes: 69 additions & 0 deletions mmtfPyspark/webfilters/blastCluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python
'''blastCluster.py
This filter passes through representative structures from the RCSB PDB
MMseqs2 clusters. A sequence identity threshold needs to be specified.
The representative for each cluster is the first chain in a cluster.
References
----------
BlastClust cluster field names:
`Field names <https://www.rcsb.org/docs/programmatic-access/file-download-services>`_
Examples
--------
Find representative PDB entries at 90% sequence identity:
>>> sequenceIdentity = 90
>>> pdb = pdb.filter(BlastCluster(sequenceIdentity))
'''
# Module metadata (authorship, contact, version and release status).
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
# NOTE(review): the value below looks like an e-mail obfuscation placeholder
# rather than a real address -- confirm against the repository.
__email__ = "[email protected]"
__version__ = "0.2.0"
__status__ = "Done"

import urllib.request


class BlastCluster(object):
    '''Filter that keeps items whose key appears in an RCSB sequence cluster file.

    On construction the cluster file for the requested sequence identity is
    downloaded and every chain ID it lists is stored, together with the
    first four characters of each chain ID (presumably the PDB entry ID --
    confirm against callers). Instances are callable and intended to be
    used as a predicate, e.g. ``pdb.filter(BlastCluster(90))``.

    NOTE(review): every member of every cluster is added to ``pdbIds``, not
    only the first (representative) chain of each cluster. Confirm whether
    restricting the set to representatives is the intended behaviour.

    Parameters
    ----------
    sequenceIdentity : int
        Sequence identity threshold; one of 30, 40, 50, 70, 90, 95, 100.

    Attributes
    ----------
    pdbIds : set of str
        Chain IDs (with ``_`` replaced by ``.``) and their 4-character
        prefixes, as collected from the cluster file.

    Raises
    ------
    ValueError
        If `sequenceIdentity` is not one of the supported thresholds.
    '''

    # Identity thresholds accepted by get_blast_cluster.
    _VALID_IDENTITIES = (30, 40, 50, 70, 90, 95, 100)
    # Base URL of the cluster files; "bc-<identity>.out" is appended.
    _CORE_URL = "https://cdn.rcsb.org/resources/sequence/clusters/"

    def __init__(self, sequenceIdentity):
        self.pdbIds = set()
        for chain_id in self.get_blast_cluster(sequenceIdentity):
            self.pdbIds.add(chain_id)
            # Also register the 4-character prefix so that items keyed by
            # the shorter ID match as well.
            self.pdbIds.add(chain_id[:4])

    def __call__(self, t):
        '''Return True if the first element of `t` is a known ID.

        Parameters
        ----------
        t : sequence
            Item whose element ``t[0]`` is looked up in ``pdbIds``.

        Returns
        -------
        bool
            Whether ``t[0]`` is contained in ``pdbIds``.
        '''
        return t[0] in self.pdbIds

    def get_blast_cluster(self, sequenceIdentity):
        '''Download the cluster file and return all chain IDs it lists.

        Parameters
        ----------
        sequenceIdentity : int
            Sequence identity threshold; one of 30, 40, 50, 70, 90, 95, 100.

        Returns
        -------
        list of str
            Chain IDs in file order, with ``_`` replaced by ``.``.

        Raises
        ------
        ValueError
            If `sequenceIdentity` is not a supported threshold. The check
            happens before any network access.
        '''
        if sequenceIdentity not in self._VALID_IDENTITIES:
            raise ValueError(
                "Error: representative chains are not availible for "
                f"sequence Identity {sequenceIdentity}.\n Must be in "
                "range [30,40,50,70,90,95,100]"
            )

        url = f"{self._CORE_URL}bc-{sequenceIdentity}.out"
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(url) as inputStream:
            return self._parse_cluster_lines(inputStream)

    @staticmethod
    def _parse_cluster_lines(lines):
        '''Turn raw cluster-file lines into a flat list of chain IDs.

        Parameters
        ----------
        lines : iterable of bytes or str
            Lines of a cluster file; IDs on a line are whitespace separated.

        Returns
        -------
        list of str
            All IDs in input order with ``_`` replaced by ``.``. Blank
            lines contribute nothing.
        '''
        clusters = []
        for line in lines:
            if isinstance(line, bytes):
                # Decode properly instead of slicing the repr of the bytes
                # object, which dropped characters on lines without a
                # trailing newline.
                line = line.decode("utf-8")
            # split() without arguments discards the line terminator and
            # never yields empty tokens.
            clusters.extend(line.replace("_", ".").split())
        return clusters
Empty file.
File renamed without changes.

0 comments on commit 2da1eac

Please sign in to comment.