string_PPI.py

# Uniprot to string mapping https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz
# String PPI dataset https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.0.txt.gz
# Columns definition http://www.string-db.org/help/faq/#what-does-the-columns-in-proteinsactions-file-mean

import pandas as pd
import wget
import os
import sys
import metadata
import datetime

source = "https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.0.txt.gz"
mapping = "https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz"

def evaLink(term1, term2, predicate, link_type="ListLink",stv="", ppi=True):
    if not (str(term1) == "nan" or str(term2) == 'nan'):
        if ppi:
            return("(EvaluationLink" + stv + "\n" +
                "\t (PredicateNode \""+ predicate + "\")\n" +
                "\t (" + link_type + " \n" +
                "\t\t (MoleculeNode" + " \"Uniprot:" + term1 + "\")\n" +
                "\t\t (MoleculeNode" + " \"Uniprot:" + term2 + "\")))\n" )
        else:
            return("(EvaluationLink" + stv + "\n" +
            "\t (PredicateNode \""+ predicate + "\")\n" +
            "\t (" + link_type + " \n" +
            "\t\t (GeneNode" + " \"" + term1.upper() + "\")\n" +
            "\t\t (GeneNode" + " \"" + term2.upper() + "\")))\n" )
    else:
        return ""
def import_string():
    print("started at " + str(datetime.datetime.now()))
    if not os.path.exists('raw_data/9606.protein.actions.v11.0.txt.gz'):
        wget.download(source,"raw_data/")
    if not os.path.exists('raw_data/human.uniprot_2_string.2018.tsv.gz'):
        wget.download(mapping,"raw_data/")
    
    df_data = pd.read_csv("raw_data/9606.protein.actions.v11.0.txt.gz", dtype=str, sep="\t")
    df_data_symmetric = df_data[df_data['is_directional'] == "f"]
    df_data_asymmetric = df_data[df_data['is_directional'] == "t"]
    df_mapping = pd.read_csv("raw_data/human.uniprot_2_string.2018.tsv.gz", dtype=str, sep="\t", names=["code", "uniprot", "ensembl","num1","num2"])
   
    # create a mapping dictionary
    mapping_dict = {} 
    for e in df_mapping["ensembl"]:
        if not e in mapping_dict.keys():
            mapping_dict[e] = df_mapping[df_mapping["ensembl"] == e]["uniprot"].values[0]
    print("Done with the Dict, importing into atomese")
    print(len(df_data))
    notmapped = []

    """
        If the directionality of the interaction is true and a is acting, use ListLink and keep the order. Otherwise use SetLink
        * is_directional - describes if the diractionality of the particular interaction is known.
        * a_is_acting - the directionality of the action if applicable ('t' gives that item_id_a is acting upon item_id_b)
        Example:
        item_id_a   item_id_b   mode    is_directional  a_is_acting
        ENSP00000000233	ENSP00000216366 reaction    f   f   
        <=> EvaluationLink 
                PredicateNode "reaction"
                SetLink ENSP00000000233 ENSP00000216366

        ENSP00000000233	ENSP00000216366 reaction    t   f
        <=> EvaluationLink 
                PredicateNode "reaction"
                ListLink ENSP00000216366 ENSP00000000233
        ENSP00000000233	ENSP00000216366	reaction    t   t
        <=> EvaluationLink 
                PredicateNode "reaction"
                ListLink ENSP00000000233 ENSP00000216366
    
        Keep symmetric relations and ignore if the same relation happens to be asymmetric
    """
    symmetric = {}
    if not os.path.exists(os.path.join(os.getcwd(), 'string_dataset')):
        os.makedirs('string_dataset')
    with open("string_dataset/string_ppi_{}.scm".format(str(datetime.date.today())), "w") as f, open('string_dataset/string_ggi_{}.scm'.format(str(datetime.date.today())), 'w') as g:
        for i in range(len(df_data_symmetric)):
            try:
                prot1 = df_data_symmetric.iloc[i]['item_id_a']
                prot2 = df_data_symmetric.iloc[i]['item_id_b']
                mode = df_data_symmetric.iloc[i]['mode']
                score = int(df_data_symmetric.iloc[i]['score'])               

                if prot1 in mapping_dict.keys() and prot2 in mapping_dict.keys():
                    prot1 = mapping_dict[prot1]
                    prot2 = mapping_dict[prot2]
                else:
                    if not prot1 in mapping_dict.keys():
                        notmapped.append(prot1)
                    else:
                        notmapped.append(prot2)
                    continue
                
                protein1 = prot1.split("|")[0]
                gene1 = prot1.split("|")[1].split("_")[0] 
                protein2 = prot2.split("|")[0]
                gene2 = prot2.split("|")[1].split("_")[0]

                f.write(evaLink(protein1, protein2, mode, stv="(stv {} {})".format(1.0, score/1000),link_type="SetLink"))
                g.write(evaLink(gene1, gene2, mode,stv="(stv {} {})".format(1.0, score/1000), link_type="SetLink", ppi=False))  
                symmetric[gene1 + gene2] = mode

            except Exception as e:
                print(e)
        for i in range(len(df_data_asymmetric)):
            try:
                prot1 = df_data_asymmetric.iloc[i]['item_id_a']
                prot2 = df_data_asymmetric.iloc[i]['item_id_b']
                mode = df_data_asymmetric.iloc[i]['mode']
                a_is_acting = df_data_asymmetric.iloc[i]['a_is_acting'] 
                score = int(df_data_asymmetric.iloc[i]['score'])               

                if prot1 in mapping_dict.keys() and prot2 in mapping_dict.keys():
                    prot1 = mapping_dict[prot1]
                    prot2 = mapping_dict[prot2]
                else:
                    if not prot1 in mapping_dict.keys():
                        notmapped.append(prot1)
                    else:
                        notmapped.append(prot2)
                    continue
                
                protein1 = prot1.split("|")[0]
                gene1 = prot1.split("|")[1].split("_")[0] 
                protein2 = prot2.split("|")[0]
                gene2 = prot2.split("|")[1].split("_")[0]

                if not (gene1+gene2 in symmetric.keys() and symmetric[gene1+gene2] == mode):
                    if a_is_acting is "t": 
                        f.write(evaLink(protein1, protein2, mode, stv="(stv {} {})".format(1.0, score/1000)))
                        g.write(evaLink(gene1, gene2, mode, stv="(stv {} {})".format(1.0, score/1000), ppi=False))
                    else:
                        f.write(evaLink(protein2, protein1, mode, stv="(stv {} {})".format(1.0, score/1000)))
                        g.write(evaLink(gene2, gene1, mode, ppi=False, stv="(stv {} {})".format(1.0, score/1000)))

            except Exception as e:
                print(e)
        
        print("Done " + str(datetime.datetime.now()))
        with open("string_dataset/notmapped_ensembles.txt", "w") as n:
            n.write("\n".join(set(notmapped)))

if __name__ == "__main__":
    import_string()