From 7f6e095422ce0e34f3158d8b8680450f45679075 Mon Sep 17 00:00:00 2001
From: NatureGeorge <414731811@qq.com>
Date: Wed, 24 Feb 2021 09:31:07 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A6v0.2.7a1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pdb_profiling/__init__.py               |   2 +-
 pdb_profiling/commands/__init__.py      |  19 +++++
 pdb_profiling/commands/command.py       |  15 ++--
 pdb_profiling/data.py                   | 106 ++++++++++++++++++++++-
 pdb_profiling/processors/pdbe/record.py | 109 ++++++++++++++++++------
 pdb_profiling/utils.py                  |   3 +-
 pdb_profiling/warnings.py               |   6 +-
 setup.py                                |   2 +-
 8 files changed, 224 insertions(+), 38 deletions(-)

diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py
index 67b26c4..23475f9 100644
--- a/pdb_profiling/__init__.py
+++ b/pdb_profiling/__init__.py
@@ -4,7 +4,7 @@
 # @Author: ZeFeng Zhu
 # @Last Modified: 2020-05-13 08:54:09 pm
 # @Copyright (c) 2020 MinghuiGroup, Soochow University
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 
 def default_config(folder='./'):
diff --git a/pdb_profiling/commands/__init__.py b/pdb_profiling/commands/__init__.py
index d6b5101..35f0033 100644
--- a/pdb_profiling/commands/__init__.py
+++ b/pdb_profiling/commands/__init__.py
@@ -28,7 +28,26 @@ class IDMapping(orm.Model):
             Entry = orm.String(max_length=50, primary_key=True)
             isoform = orm.String(max_length=50, primary_key=True)
             is_canonical = orm.Boolean()
+
+        class ResidueMapping(orm.Model):
+            __tablename__ = 'ResidueMapping'
+            __metadata__ = self.metadata
+            __database__ = self.database
+            UniProt = orm.String(max_length=50, primary_key=True)
+            author_insertion_code = orm.String(max_length=50, allow_null=True, allow_blank=True, default='')
+            author_residue_number = orm.Integer()
+            chain_id = orm.String(max_length=10)
+            struct_asym_id = orm.String(max_length=10, primary_key=True)
+            entity_id = orm.Integer(primary_key=True)
+            pdb_id = orm.String(max_length=4, primary_key=True)
+            residue_number = orm.Integer(primary_key=True)
+            unp_residue_number = orm.Integer(primary_key=True)
+            residue_name = orm.String(max_length=10)
+            observed_ratio = orm.Float()
+            multiple_conformers = orm.JSON(allow_null=True)
+            conflict_code = orm.String(max_length=3, allow_null=True)
 
         self.Mutation = Mutation
         self.IDMapping = IDMapping
+        self.ResidueMapping = ResidueMapping
diff --git a/pdb_profiling/commands/command.py b/pdb_profiling/commands/command.py
index 1205003..33c53ab 100644
--- a/pdb_profiling/commands/command.py
+++ b/pdb_profiling/commands/command.py
@@ -129,7 +129,7 @@ def id_mapping(ctx, input, column, sep, chunksize):
 @click.option('--func', type=str, default='pipe_select_mo')
 @click.option('--kwargs', type=str, default='{}')
 @click.option('--chunksize', type=int, help="the chunksize parameter", default=200)
-@click.option('--entry_filter', type=str, default='(release_date < "20201020") and ((experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or experimental_method == "Solution NMR")')
+@click.option('--entry_filter', type=str, default='(release_date < "20210101") and ((experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or experimental_method == "Solution NMR")')
 @click.option('--chain_filter', type=str, default="UNK_COUNT < SEQRES_COUNT and ca_p_only == False and identity >=0.9 and repeated == False and reversed == False and OBS_COUNT > 20")
 @click.option('--skip_pdbs', type=str, default='1fc2,6wrg,5jm5,6vnn,2i6l,4zai,5jn1,6bj0,6yth,4fc3,7acu,6lsd,6llc,6xoz,6xp0,6xp1,6xp2,6xp3,6xp4,6xp5,6xp6,6xp7,6xp8,6xpa,6zqz,6t5h,6xwd,6xxc')
 @click.option('--omit', type=int, default=0)
@@ -204,9 +204,9 @@ def get_unp_id(args):
 @Interface.command("residue-mapping")
 @click.option('--input', type=click.Path())
 @click.option('--chunksize', type=int, help="the chunksize parameter", default=10000)
-@click.option('--output', type=str)
-def residue_mapping(input, chunksize, output):
-    output = Path(output)
+@click.option('--output', type=str, default=None)
+@click.pass_context
+def residue_mapping(ctx, input, chunksize, output):
     dfs = read_csv(input, sep='\t', keep_default_na=False,
                    na_values=['NULL', 'null'], chunksize=chunksize)
     for df in dfs:
@@ -222,7 +222,12 @@ def residue_mapping(input, chunksize, output):
     with Progress(*progress_bar_args) as p:
         res = ob.run(p.track).result()
     res_mapping_df = concat(res, sort=False, ignore_index=True)
-    res_mapping_df[sorted(res_mapping_df.columns)].to_csv(output, sep='\t', mode='a+', index=False, header=not output.exists())
+    if output is not None:
+        output = Path(output)
+        res_mapping_df[sorted(res_mapping_df.columns)].to_csv(output, sep='\t', mode='a+', index=False, header=not output.exists())
+    else:
+        sqlite_api = ctx.obj['custom_db']
+        sqlite_api.sync_insert(sqlite_api.ResidueMapping, res_mapping_df.to_dict('records'))
     sleep(uniform(0, 1))
diff --git a/pdb_profiling/data.py b/pdb_profiling/data.py
index e097ce3..149c3bc 100644
--- a/pdb_profiling/data.py
+++ b/pdb_profiling/data.py
@@ -4,12 +4,37 @@
 # @Author: ZeFeng Zhu
 # @Last Modified: 2020-10-10 04:06:06 pm
 # @Copyright (c) 2020 MinghuiGroup, Soochow University
+# from copy import deepcopy
+
+'''
+class SwapKeyDict(dict):
+
+    def __missing__(self, key):
+        swap = key[::-1]
+        if swap not in self:
+            if hasattr(self, 'gap') and None in key:
+                return self.gap
+            raise KeyError(key)
+        else:
+            return self[swap]
+
+    def set_gap(self, gap):
+        cur = deepcopy(self)
+        cur.gap = gap
+        return cur
+'''
+
+def store_swap_key(data_from, data_to):
+    for key, value in data_from.items():
+        data_to[key] = value
+        data_to[key[::-1]] = value
+    return data_to
 
 '''
 Matrix Data From: https://github.com/biopython/biopython/blob/master/Bio/SubsMat/MatrixInfo.py
 '''
-blosum62 = {
+blosum62 = store_swap_key({
     ('W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
     ('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
     ('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
@@ -79,7 +104,79 @@
     ('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
     ('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
     ('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4
-}
+}, dict())
+
+blosum95 = store_swap_key({
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): -2, ("V", "T"): -1,
+    ("Q", "Q"): 7, ("N", "A"): -2, ("Z", "Y"): -4, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 9, ("S", "H"): -2,
+    ("H", "D"): -2, ("L", "N"): -5, ("W", "A"): -4, ("Y", "M"): -3,
+    ("G", "R"): -4, ("Y", "I"): -2, ("Y", "E"): -4, ("B", "Y"): -4,
+    ("Y", "A"): -3, ("V", "D"): -5, ("B", "S"): -1, ("Y", "Y"): 8,
+    ("G", "N"): -1, ("E", "C"): -6, ("Y", "Q"): -3, ("Z", "Z"): 4,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): -1, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -3,
+    ("Z", "P"): -2, ("V", "M"): 0, ("T", "F"): -3, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -3, ("I", "H"): -4, ("I", "D"): -5,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -3, ("M", "N"): -3,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2,
-3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2, + ("T", "M"): -1, ("Z", "C"): -5, ("X", "H"): -2, ("D", "R"): -3, + ("B", "W"): -6, ("X", "D"): -2, ("Z", "K"): 0, ("F", "A"): -3, + ("Z", "W"): -4, ("F", "E"): -5, ("D", "N"): 1, ("B", "K"): -1, + ("X", "X"): -2, ("F", "I"): -1, ("B", "G"): -2, ("X", "T"): -1, + ("F", "M"): -1, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3, + ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -5, ("Q", "R"): 0, + ("N", "N"): 7, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -4, + ("S", "C"): -2, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4, + ("W", "Q"): -3, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0, + ("N", "R"): -1, ("H", "C"): -5, ("Y", "N"): -3, ("G", "Q"): -3, + ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 0, ("G", "E"): -3, + ("G", "A"): -1, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3, + ("M", "Q"): -1, ("T", "I"): -2, ("C", "D"): -5, ("V", "F"): -2, + ("T", "A"): 0, ("T", "P"): -2, ("B", "P"): -3, ("T", "E"): -2, + ("V", "N"): -4, ("P", "G"): -4, ("M", "A"): -2, ("K", "H"): -1, + ("V", "R"): -4, ("P", "C"): -5, ("M", "E"): -3, ("K", "L"): -3, + ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -6, + ("P", "K"): -2, ("M", "M"): 7, ("K", "D"): -2, ("I", "C"): -2, + ("Z", "D"): 0, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): -1, + ("X", "G"): -3, ("Z", "L"): -4, ("X", "C"): -3, ("Z", "H"): 0, + ("B", "L"): -5, ("B", "H"): -1, ("F", "F"): 7, ("X", "W"): -4, + ("B", "D"): 4, ("D", "A"): -3, ("S", "L"): -3, ("X", "S"): -1, + ("F", "N"): -4, ("S", "R"): -2, ("W", "D"): -6, ("V", "Y"): -3, + ("W", "L"): -3, ("H", "R"): -1, ("W", "H"): -3, ("H", "N"): 0, + ("W", "T"): -4, ("T", "T"): 6, ("S", "F"): -3, ("W", "P"): -5, + ("L", "D"): -5, ("B", "I"): -5, ("L", "H"): -4, ("S", "N"): 0, + ("B", "T"): -1, ("L", "L"): 5, ("Y", "K"): -3, ("E", "Q"): 2, + ("Y", "G"): -5, ("Z", "S"): -1, ("Y", "C"): -4, ("G", "D"): -2, + ("B", "V"): -5, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6, + ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -2, + ("P", "R"): -3, ("V", "G"): -5, ("T", "L"): -2, ("V", "K"): -3, + ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -2, + ("P", "F"): -5, ("I", "N"): -4, ("K", "I"): -4, ("M", "D"): -5, + ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -3, ("P", "N"): -3, + ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 0, ("Z", "E"): 4, + ("X", "N"): -2, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2, + ("K", "C"): -5, ("B", "Q"): -1, ("X", "B"): -2, ("B", "M"): -4, + ("F", "C"): -3, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -5, + ("B", "E"): 0, ("X", "V"): -2, ("F", "K"): -4, ("B", "A"): -3, + ("X", "R"): -2, ("D", "D"): 7, ("W", "G"): -5, ("Z", "F"): -4, + ("S", "Q"): -1, ("W", "C"): -4, ("W", "K"): -5, ("H", "Q"): 1, + ("L", "C"): -3, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -5, + ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -3, + ("H", "A"): -3, ("S", "M"): -3, ("Y", "L"): -2, ("Y", "H"): 1, + ("Y", "D"): -5, ("E", "R"): -1, ("X", "P"): -3, ("G", "G"): 6, + ("G", "C"): -5, ("E", "N"): -1, ("Y", "T"): -3, ("Y", "P"): -5, + ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -2, + ("V", "H"): -4, ("T", "G"): -3, ("I", "Q"): -4, ("Z", "T"): -2, + ("C", "R"): -5, ("V", "P"): -4, ("P", "E"): -2, ("M", "C"): -3, + ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -4, + ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2, + ("I", "A"): -2, ("P", "I"): -4, ("R", "R"): 7, ("X", "M"): -2, + ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -2, + ("Z", "N"): -1, ("X", "A"): -1, ("B", "R"): 
-2, ("B", "N"): 4, + ("F", "D"): -5, ("X", "Y"): -2, ("Z", "R"): -1, ("F", "H"): -2, + ("B", "F"): -5, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4 +}, dict()) ''' @@ -95,7 +192,8 @@ After transform the distance matrix into a similarity matrix, normalize each value by the highest_similarity. NOTE: now the maximum similarity is 1 ''' -miyata_similarity_matrix = { + +miyata_similarity_matrix = store_swap_key({ ('A', 'A'): 1.0, ('A', 'C'): -0.112, ('A', 'P'): 0.952, @@ -305,4 +403,4 @@ ('Y', 'S'): -1.664, ('Y', 'T'): -0.96, ('Y', 'V'): -0.216, - ('Y', 'Y'): 1.0} + ('Y', 'Y'): 1.0}, dict()) diff --git a/pdb_profiling/processors/pdbe/record.py b/pdb_profiling/processors/pdbe/record.py index e4c92ed..d4fc7ff 100644 --- a/pdb_profiling/processors/pdbe/record.py +++ b/pdb_profiling/processors/pdbe/record.py @@ -5,6 +5,7 @@ # @Last Modified: 2020-08-11 10:48:11 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University from typing import Iterable, Union, Callable, Optional, Hashable, Dict, Coroutine, List, Tuple +from inspect import isawaitable from numpy import array, where as np_where, count_nonzero, nan, dot, exp, square from pathlib import Path from pandas import isna, concat, DataFrame, Series, merge @@ -49,7 +50,7 @@ from pdb_profiling.warnings import (WithoutCifKeyWarning, PISAErrorWarning, ConflictChainIDWarning, PossibleObsoletedUniProtWarning, PossibleObsoletedPDBEntryWarning, SkipAssemblyWarning, - PeptideLinkingWarning, MultiWrittenWarning) + PeptideLinkingWarning, MultiWrittenWarning, WithoutRCSBClusterMembershipWarning) from pdb_profiling.ensure import aio_file_exists_stat from textdistance import sorensen from warnings import warn @@ -137,6 +138,11 @@ def get_db_semaphore(cls): @classmethod def set_folder(cls, folder: Union[Path, str]): + """Set your folder path + + Args: + folder (Union[Path, str]): the path to set + """ folder = Path(folder) assert folder.exists(), "Folder not exist! Please create it or input a valid folder!" cls.folder = folder @@ -154,6 +160,17 @@ def check_folder(cls): raise ValueError(f"Please set folder via {cls.__name__}.set_folder(folder: Union[Path, str])") def fetch_from_pdbe_api(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, json: bool = False, mask_id: str = None) -> Unfuture: + """fetch data from PDBe API + + Args: + api_suffix (str): the suffix of the API that you want to retrieve info. + then_func (Optional[Callable[[Unfuture], Unfuture]], optional): function arg that pass to Unfuture.then(). Defaults to None. + json (bool, optional): whether the data is treated and returned as JSON. Defaults to False. + mask_id (str, optional): Defaults to None. + + Returns: + Unfuture: Unfuture object + """ assert api_suffix in API_SET, f"Invlaid API SUFFIX! 
         identifier = self.get_id() if mask_id is None else mask_id
         task = self.tasks.get((self.__class__.__name__, api_suffix, then_func, json, identifier), None)
@@ -229,7 +246,7 @@ async def to_dataframe_with_kwargs(cls, path, **kwargs):
     @staticmethod
     @unsync
     async def result_set_to_dataframe(data):
-        if isinstance(data, (Unfuture, Coroutine)):
+        if isawaitable(data):
             data = await data
         if data is None:
             return
@@ -384,7 +401,7 @@ def fetch_from_PDBArchive(self, api_suffix: str, then_func: Optional[Callable[[U
     @classmethod
     @unsync
     async def cif2atom_sites_df(cls, path: Union[Unfuture, Coroutine, str, Path]):
-        if isinstance(path, (Unfuture, Coroutine)):
+        if isawaitable(path):
             path = await path
         async with aiofiles_open(path, 'rt') as file_io:
             handle = await file_io.read()
@@ -414,7 +431,7 @@ async def cif2residue_listing(cls, path: Union[Unfuture, Coroutine, str, Path]):
                 'authore_residue_number',
                 'chain_id',
                 'author_insertion_code')
-        if isinstance(path, (Unfuture, Coroutine)):
+        if isawaitable(path):
             path = await path
         with gzip_open(path, 'rt') as handle:
             mmcif_dict = MMCIF2DictPlus(handle, cols)
@@ -490,7 +507,7 @@ def to_rank(rank_dict, assembly_id, struct_asym_id):
             assert var[1] <= var[0]
             return var[1]
 
-        if isinstance(path, (Unfuture, Coroutine)):
+        if isawaitable(path):
             path = await path
         path = Path(path)
         if path.suffix == '.cif':
@@ -1264,7 +1281,7 @@ async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=False, focu
     async def expand_multiple_conformers(dfrm: Union[DataFrame, Unfuture, Coroutine]):
         '''for residue_listing dataframe'''
         '''
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         '''
         pass
@@ -1387,6 +1404,15 @@ async def rcsb_cluster_membership(self, entity_id, identity_cutoff:int=100):
         }
         '''
         dfs = []
+        try:
+            assert res['data']['polymer_entity']['rcsb_cluster_membership'] is not None
+        except Exception as e:
+            info = f"polymer_entity(entry_id: \"{self.pdb_id}\", entity_id: \"{entity_id}\") -> {res}"
+            if isinstance(e, AssertionError):
+                warn(info, WithoutRCSBClusterMembershipWarning)
+                return
+            else:
+                raise ValueError(info)
         for i in res['data']['polymer_entity']['rcsb_cluster_membership']:
             if i['identity'] != identity_cutoff:
                 continue
@@ -1431,7 +1457,7 @@ def __init__(self, pdb_ass_id, pdb_ob: Optional[PDB]=None):
         NOTE: reference:
         '''
         self.interface_filters = {
-            'symmetry_operator': ('isin', ('1_555', '1555'))  # 1555 for api%pisa%asiscomponent%+6e4h%0%interfaces
+            'symmetry_operator': ('isin', ('1_555', '1555', 1555))  # 1555 for api%pisa%asiscomponent%+6e4h%0%interfaces
         }  # 'structure_2.symmetry_id': ('eq', '1_555'),'css': ('ge', 0)
 
     def set_id(self, pdb_ass_id: str):
@@ -1533,7 +1559,7 @@ def to_interface_id(pdb_assembly_id, focus_interface_ids):
         if interfacelist_df is None:
             interfacelist_df, use_au = await self.get_interfacelist_df(
                 'api/pisa/interfacelist/', PDBAssemble.to_interfacelist_df)
-            self.interface_filters['structure_2.symmetry_id'] = ('isin', ('1_555', '1555'))
+            self.interface_filters['structure_2.symmetry_id'] = ('isin', ('1_555', '1555', 1555))
             del self.interface_filters['symmetry_operator']
         else:
             interfacelist_df = interfacelist_df.rename(columns={'complex_formation_score': 'css'})
@@ -1860,8 +1886,19 @@ async def get_interface_res_dict(self, **kwargs):
 
 
 class SIFTS(PDB):
+    '''
+    TODO
+
+    1. Better OligoState
+        * RAW (both from wwPDB and self assigned)
+        * FILTERED
+    2. Define Best Isoform
+    3. UniProt Isoform Interaction
+    4. PDBChain Instance Interaction (Biological Relevance)
+    '''
 
     tasks = LRUCache(maxsize=1024)
+    sa_cache = LRUCache(maxsize=100)
 
     EntityChain = namedtuple('EntityChain', 'pdb_id entity_chain_info entity_count chain_count')
     UniProtEntity = namedtuple('UniProtEntity', 'pdb_id unp_entity_info entity_unp_info entity_with_unp_count min_unp_count')
@@ -1906,7 +1943,7 @@ def fetch_unp_fasta(cls, identifier):
     @classmethod
     @unsync
     async def complete_chains(cls, dfrm: Union[DataFrame, Unfuture, Coroutine]):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         if cls.complete_chains_run_as_completed:
             res = await SIFTSs(dfrm.pdb_id.unique()).fetch('fetch_from_pdbe_api',
@@ -2136,7 +2173,7 @@ async def add_residue_conflict(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coro
         '''
         TODO: optimization
         '''
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         if isinstance(dfrm, Tuple):
             dfrm = dfrm[0]
@@ -2188,40 +2225,54 @@ async def renew_sifts_mapping_from_graph_api(cls, UniProt, pdb_id, entity_id, pd
     @staticmethod
     @unsync
     async def deal_with_identical_entity_seq(dfrm):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         already = set()
         cluster_dfs = []
-        # dfrm = dfrm.copy()
-        # dfrm['pdb_sequence'] = ''
+        dfrm = dfrm.copy()
+        dfrm['pdb_sequence'] = b''
         dfrm_nr = dfrm[['pdb_id', 'entity_id']].drop_duplicates()
         for pdb_id, entity_id in zip(dfrm_nr.pdb_id, dfrm_nr.entity_id):
-            # dfrm.loc[dfrm[dfrm.pdb_sequence.eq('') & dfrm.pdb_id.eq(pdb_id) & dfrm.entity_id.eq(entity_id)].index, 'pdb_sequence'] = await PDB(pdb_id).get_sequence(entity_id=entity_id, mode='raw_pdb_seq')
+            dfrm.loc[dfrm[dfrm.pdb_sequence.eq(b'') & dfrm.pdb_id.eq(pdb_id) & dfrm.entity_id.eq(entity_id)].index, 'pdb_sequence'] = compress(bytes(await PDB(pdb_id).get_sequence(entity_id=entity_id, mode='raw_pdb_seq'), encoding='utf-8'))
             if (pdb_id, entity_id) in already:
                 continue
             cur_cluster_df = await PDB(pdb_id).rcsb_cluster_membership(entity_id=entity_id, identity_cutoff=100)
-            already |= set(zip(cur_cluster_df.pdb_id, cur_cluster_df.entity_id))
+            try:
+                assert cur_cluster_df is not None
+                already |= set(zip(cur_cluster_df.pdb_id, cur_cluster_df.entity_id))
+            except AssertionError:
+                cur_cluster_df = DataFrame([dict(pdb_id=pdb_id, entity_id=entity_id, cluster_id=-1)])
             cluster_dfs.append(cur_cluster_df)
         cluster_df = concat(cluster_dfs, sort=False, ignore_index=True)
         assert not any(cluster_df.duplicated())
         dfrm = dfrm.merge(cluster_df[['pdb_id', 'entity_id', 'cluster_id']], how='left')
         assert not any(dfrm.cluster_id.isnull()), f"{dfrm[dfrm.cluster_id.isnull()]}"
-        return dfrm
+        dfrm['fix_cluster_id'] = dfrm.groupby(['cluster_id', 'pdb_sequence']).ngroup().astype(str) + '_' + dfrm.cluster_id.astype(str)
+        # ignore/override cases like (P00720,2b7x,B vs. P00720,2b7x,A)
+        return dfrm.drop(columns=['pdb_sequence'])
 
     @classmethod
     @unsync
     async def double_check_conflict_and_range(cls, dfrm: Union[DataFrame, Unfuture, Coroutine]):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         focus_part = dfrm[
             dfrm.sifts_range_tag.isin(('Deletion', 'Insertion_Undivided', 'InDel_2', 'InDel_3')) &
             (dfrm.conflict_pdb_index.apply(get_str_dict_len)/dfrm.new_pdb_range.apply(range_len)).ge(0.1)]
         if len(focus_part) == 0:
             return dfrm
-        tasks = tuple(map(cls.renew_sifts_mapping_from_graph_api, focus_part.UniProt, focus_part.pdb_id, focus_part.entity_id, focus_part.pdb_range, focus_part.unp_range, focus_part.range_diff))
-        dfrm.loc[focus_part.index, ['new_unp_range', 'new_pdb_range']] = [await task for task in tasks]
-        res = await cls.add_residue_conflict(dfrm.loc[focus_part.index].drop(columns=['conflict_pdb_index', 'raw_pdb_index', 'conflict_pdb_range', 'conflict_unp_range']))
+        focus_part_iden = await cls.deal_with_identical_entity_seq(focus_part)
+        focus_part_iden_dd = focus_part_iden.drop_duplicates(subset=['UniProt', 'fix_cluster_id']).copy()
+        tasks = tuple(map(cls.renew_sifts_mapping_from_graph_api, focus_part_iden_dd.UniProt, focus_part_iden_dd.pdb_id, focus_part_iden_dd.entity_id, focus_part_iden_dd.pdb_range, focus_part_iden_dd.unp_range, focus_part_iden_dd.range_diff))
+        focus_part_iden_dd[['new_unp_range', 'new_pdb_range']] = [await task for task in tasks]
+        focus_part_iden_dd = await cls.add_residue_conflict(focus_part_iden_dd.drop(columns=['conflict_pdb_index', 'raw_pdb_index', 'conflict_pdb_range', 'conflict_unp_range']))
+        focus_cols = ['UniProt', 'fix_cluster_id', 'new_unp_range', 'new_pdb_range',
+                      'conflict_pdb_index', 'raw_pdb_index', 'conflict_pdb_range', 'conflict_unp_range']
+        res = focus_part_iden.drop(columns=focus_cols[2:]).merge(focus_part_iden_dd[focus_cols], how='left')
+        assert res.isnull().sum().sum() == 0
+        res = res.drop(columns=['fix_cluster_id', 'cluster_id'])
+        assert res.shape == focus_part.shape, f"{res.shape}, {focus_part.shape}"
         res.index = focus_part.index
         dfrm.loc[focus_part.index] = res
         return dfrm
@@ -2297,7 +2348,7 @@ def check_range_tail(new_pdb_range, new_unp_range, pdb_range):
     @classmethod
     @unsync
     async def fix_range(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coroutine]):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         if isinstance(dfrm, Tuple):
             dfrm = dfrm[0]
@@ -2325,9 +2376,17 @@ async def fix_range(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coroutine]):
 
     @staticmethod
     def sliding_alignment_score(range_diff, pdb_seq, pdb_range, unp_seq, unp_range, **kwargs):
+        '''
+        TODO: improve code
+        '''
+        def generate_seq_item(seq, gap_index, gap_num):
+            yield from seq[:gap_index]
+            for _ in range(gap_num):
+                yield '-'
+            yield from seq[gap_index:]
+
         def get_optimal_range(abs_diff, seg_to_add, seg_to_ori, lstart, lend, rstart, rend, on_left):
-            gap_seg = '-' * abs_diff
-            res = tuple(sum(blosum62.get((l, r), blosum62.get((r, l), 0)) for l, r in zip(seg_to_add[:i] + gap_seg + seg_to_add[i:], seg_to_ori)) for i in range(len(seg_to_add)+1))
+            res = tuple(sum(blosum62.get((l, r), 0) for l, r in zip(generate_seq_item(seg_to_add, i, abs_diff), seg_to_ori)) for i in range(len(seg_to_add)+1))
             max_val = max(res)
             index = res.index(max_val)
             assert index >= 0  # ???
@@ -2431,7 +2490,7 @@ def bs_score_aligned_part(new_pdb_range, conflict_pdb_range, conflict_pdb_index,
             pdb_aa = raw_pdb_index.get(i, None)
             if (i not in non_set) and (unp_aa is not None) and (unp_aa != pdb_aa):
                 # NOT Modified & Conflict Residues are fall into here
-                theta = miyata_similarity_matrix.get((unp_aa, pdb_aa), miyata_similarity_matrix.get((pdb_aa, unp_aa), -3.104))
+                theta = miyata_similarity_matrix.get((unp_aa, pdb_aa), -3.104)
             else:
                 # UNK | Modified Residue
                 theta = -3.104
@@ -2724,7 +2783,7 @@ def parallel_interact_df(sifts_df, i3d_df, common_cols=('revision_date', 'deposi
         rename_dict['pdb_id_1'] = 'pdb_id'
     sifts_df_ = sifts_df.add_suffix('_1').rename(columns=rename_dict)
     i3d_df = i3d_df.merge(sifts_df_)
-    sifts_df_ = sifts_df.drop(columns=sifts_df.columns & set(common_cols)).add_suffix('_2').rename(columns={'pdb_id_2': 'pdb_id'})
+    sifts_df_ = sifts_df.drop(columns=sifts_df.columns.intersection(common_cols)).add_suffix('_2').rename(columns={'pdb_id_2': 'pdb_id'})
    i3d_df = i3d_df.merge(sifts_df_)
     swap_index = i3d_df[
         (i3d_df.struct_asym_id_1 > i3d_df.struct_asym_id_2) |
diff --git a/pdb_profiling/utils.py b/pdb_profiling/utils.py
index 9186b8e..bcba34d 100644
--- a/pdb_profiling/utils.py
+++ b/pdb_profiling/utils.py
@@ -7,6 +7,7 @@
 import os
 import gzip
 import shutil
+from inspect import isawaitable
 from typing import Optional, Union, Dict, Tuple, Iterable, Iterator, List, Coroutine, NamedTuple, Callable, Generator
 from logging import Logger
 from pandas import read_csv, DataFrame, isna, Series, concat
@@ -578,7 +579,7 @@ async def get_seqs_from_parser(res, identifiers:Optional[Iterable[str]]=None):
 
 
 async def a_seq_parser(path: Union[Unfuture, Coroutine, Path, str]):
-    if isinstance(path, (Unfuture, Coroutine)):
+    if isawaitable(path):
         path = await path
     async with aiofiles_open(path, 'rt') as handle:
         header, content = None, ''
diff --git a/pdb_profiling/warnings.py b/pdb_profiling/warnings.py
index c3fcc1f..358f4c3 100644
--- a/pdb_profiling/warnings.py
+++ b/pdb_profiling/warnings.py
@@ -76,4 +76,8 @@ class FileExistsWarning(UserWarning):
 
 
 class InvalidFileContentWarning(UserWarning):
-    pass
\ No newline at end of file
+    pass
+
+
+class WithoutRCSBClusterMembershipWarning(UserWarning):
+    pass
diff --git a/setup.py b/setup.py
index 118c640..b2f9bc3 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@
         'tenacity>=6.3.0',
         'orjson>=3.0.2',
         'pyexcel>=0.6.4',
-        'pandas>=1.0.3',
+        'pandas>=1.2.2',
         'numpy>=1.18.1',
         'textdistance>=4.1.5',
         'databases>=0.3.2',
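
Notes:

The data.py change above replaces plain dict literals with store_swap_key, which eagerly writes each (a, b) key and its reversed (b, a) twin into the target dict. That is what lets get_optimal_range drop the nested blosum62.get((r, l), 0) fallback and bs_score_aligned_part drop the nested miyata_similarity_matrix.get((pdb_aa, unp_aa), -3.104) fallback later in the patch. A minimal doctest-style sketch of the intended behaviour (restated from the patch, not part of the diff):

    >>> def store_swap_key(data_from, data_to):
    ...     for key, value in data_from.items():
    ...         data_to[key] = value
    ...         data_to[key[::-1]] = value
    ...     return data_to
    >>> m = store_swap_key({('W', 'F'): 1}, dict())
    >>> m[('F', 'W')]  # the swapped key is now a first-class entry
    1

Each matrix roughly doubles in size (symmetric self-pairs aside), but every lookup becomes a single dict hit instead of a chained .get() fallback.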
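In the same spirit, the new generate_seq_item helper in sliding_alignment_score yields the gap-padded candidate sequence lazily instead of building seg_to_add[:i] + '-' * abs_diff + seg_to_add[i:] as a fresh string for every candidate gap position. A standalone copy of the inner helper, restated here only for illustration (in the patch it is nested inside sliding_alignment_score):

    def generate_seq_item(seq, gap_index, gap_num):
        # characters before the gap, then gap_num dashes, then the rest
        yield from seq[:gap_index]
        for _ in range(gap_num):
            yield '-'
        yield from seq[gap_index:]

    assert ''.join(generate_seq_item('ACDE', 2, 3)) == 'AC---DE'

Paired with the swap-key matrices above, the per-position score reduces to sum(blosum62.get((l, r), 0) for l, r in zip(...)) with no intermediate string allocations.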