diff --git a/notebooks/TCR_epitope_prediction_NewEpitope_NewTCR_LOOCV.ipynb b/notebooks/TCR_epitope_prediction_NewEpitope_NewTCR_LOOCV.ipynb new file mode 100644 index 0000000..53a9e8a --- /dev/null +++ b/notebooks/TCR_epitope_prediction_NewEpitope_NewTCR_LOOCV.ipynb @@ -0,0 +1,3173 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "-NSHxGYaV7vc" + }, + "outputs": [], + "source": [ + "%%capture\n", + "import os\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "path = \"/content/drive/MyDrive/COLAB/TCR_projects\"\n", + "os.chdir(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "be5dH_JwZLlu" + }, + "outputs": [], + "source": [ + "import random\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "############################## Negative sampling: random shuffling ##############################\n", + "\n", + "#The random shuffling strategy, also known as \"Random TCR\" or \"Random Epitope\" depending on which element is shuffled, involves creating artificial negative samples by randomly pairing TCRs with epitopes that are not their known binding partners.\n", + "#Separation of Data: First, the positive TCR-epitope pairs are split into training and test sets.\n", + "#Random Pairing: Within each set (train and test), TCRs or epitopes are randomly shuffled to create new pairs that are assumed to be non-binding.\n", + "#Negative Sample Generation: These new random pairs are labeled as negative samples.\n", + "\n", + "# write a script to pair TCRs and epitopes, excluding their true binding partners for mat_train_tab\n", + "# mat_train_tab has columns 
# 'epitope', the TCR column named by `tcr_features`, and 'value'
# (1 = binding, 0 = non-binding).


def _sample_unpaired_negatives(df, tcr_features, epitopes, tcrs):
    """Return every epitope x TCR combination that has no row in `df`.

    Args:
        df: table with an 'epitope' column and a `tcr_features` column.
        tcr_features: name of the TCR column in `df`.
        epitopes: candidate epitopes (outer loop of the cross product).
        tcrs: candidate TCRs (inner loop of the cross product).

    Returns:
        DataFrame with columns ['epitope', tcr_features, 'value'] where
        'value' is always 0. The columns are present even when no pair
        qualifies, so callers can index with `.columns` safely.
    """
    # One pass over df instead of one full DataFrame scan per candidate pair.
    observed_pairs = set(zip(df['epitope'], df[tcr_features]))
    rows = [
        {'epitope': epitope, tcr_features: tcr, 'value': 0}
        for epitope in epitopes
        for tcr in tcrs
        if (epitope, tcr) not in observed_pairs
    ]
    return pd.DataFrame(rows, columns=['epitope', tcr_features, 'value'])


def generate_negative_samples(df, tcr_features, num_epitopes=100, num_tcrs=10):
    """Create negative pairs by crossing randomly drawn epitopes and TCRs.

    Up to `num_epitopes` unique epitopes and up to `num_tcrs` unique TCRs are
    drawn from `df` with `random.sample`; every combination that does not
    already occur as a row of `df` is returned with value 0.

    Args:
        df: table with an 'epitope' column and a `tcr_features` column.
        tcr_features: name of the TCR column in `df`.
        num_epitopes: maximum number of distinct epitopes to draw.
        num_tcrs: maximum number of distinct TCRs to draw.

    Returns:
        DataFrame with columns ['epitope', tcr_features, 'value'].
    """
    unique_epitopes = list(df['epitope'].unique())
    unique_tcrs = list(df[tcr_features].unique())

    # Cap the draw at the population size; int() tolerates numpy/float counts.
    sampled_epitopes = random.sample(
        unique_epitopes, min(int(num_epitopes), len(unique_epitopes)))
    sampled_tcrs = random.sample(
        unique_tcrs, min(int(num_tcrs), len(unique_tcrs)))

    return _sample_unpaired_negatives(df, tcr_features, sampled_epitopes, sampled_tcrs)


def generate_negative_samples_epitope(df, tcr_features, epitope, num_tcrs=10):
    """Create negative pairs for one fixed epitope and randomly drawn TCRs.

    Args:
        df: table with an 'epitope' column and a `tcr_features` column; the
            TCR pool is taken from it.
        tcr_features: name of the TCR column in `df`.
        epitope: the epitope every returned pair is built with.
        num_tcrs: maximum number of distinct TCRs to draw.

    Returns:
        DataFrame with columns ['epitope', tcr_features, 'value'] (value 0),
        excluding pairs that already occur as a row of `df`.
    """
    unique_tcrs = list(df[tcr_features].unique())
    sampled_tcrs = random.sample(
        unique_tcrs, min(int(num_tcrs), len(unique_tcrs)))

    return _sample_unpaired_negatives(df, tcr_features, [epitope], sampled_tcrs)


def preprocess_features(feat, res, train_indices, test_indices):
    """Split features/labels by position and standardise the features.

    The scaler is fitted on the training rows only and then applied to both
    splits, so no test statistics leak into training.

    Args:
        feat: feature DataFrame, indexed positionally via `.iloc`.
        res: labels, indexed with `res[indices]`.
            NOTE(review): presumably a numpy array aligned with `feat` rows;
            a plain list would not accept a list of indices - confirm.
        train_indices: positions of the training rows.
        test_indices: positions of the test rows.

    Returns:
        (x_train, y_train, x_test, y_test) with the x frames scaled and
        keeping their original index and column labels.
    """
    x_train = feat.iloc[train_indices, :]
    y_train = res[train_indices]
    x_test = feat.iloc[test_indices, :]
    y_test = res[test_indices]
    # scale the data
    scaler = StandardScaler().fit(x_train)
    x_train = pd.DataFrame(scaler.transform(x_train), index=x_train.index, columns=x_train.columns)
    x_test = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)
    return x_train, y_train, x_test, y_test


def train (algo, x_train, y_train, x_test):
    """Fit the requested classifier and score both splits.

    Args:
        algo: one of "sklearn_mlp", "sklearn_randomforest", "sklearn_logit".
        x_train: training features.
        y_train: training labels.
        x_test: test features.

    Returns:
        (train_proba, test_proba, model): second `predict_proba` column for
        each split, plus the fitted estimator.

    Raises:
        ValueError: if `algo` is not one of the supported names.
    """
    model = None
    if algo == "sklearn_mlp":
        # NOTE(review): max_iter=10 is very low and will likely stop before
        # convergence - confirm this is intentional.
        model = MLPClassifier(
            hidden_layer_sizes=(100, 50, 10), activation='relu',
            learning_rate_init=0.001, alpha=0.01, max_iter=10,
            early_stopping=False, validation_fraction=0.1
        )
    elif algo == "sklearn_randomforest":
        model = RandomForestClassifier(
            n_estimators=1000, max_depth=10, oob_score=True
        )
    elif algo == "sklearn_logit":
        model = LogisticRegression(
            C=0.5, solver="saga", penalty="elasticnet",
            l1_ratio=0.5, class_weight="balanced"
        )

    # Compare against None explicitly: truth-testing an estimator goes through
    # __len__ for ensembles, which reads `estimators_` and therefore fails on
    # an unfitted random forest.
    if model is None:
        raise ValueError("Specified algorithm is not supported")

    model.fit(x_train, y_train)
    x_train_proba = model.predict_proba(x_train)[:, 1]
    x_test_proba = model.predict_proba(x_test)[:, 1]
    return x_train_proba, x_test_proba, model


def get_feature_importance(model, algo):
    """Return the fitted model's per-feature weights as a numpy array.

    Args:
        model: fitted estimator.
        algo: algorithm name; "sklearn_logit" reads `coef_`,
            "sklearn_randomforest" reads `feature_importances_`.

    Returns:
        ndarray of weights. A single-row array (e.g. a (1, n) `coef_`) is
        transposed, as before. For any other `algo` an empty array is
        returned, so callers can still use ndarray methods on the result.
    """
    if algo == "sklearn_logit":
        weights = np.asarray(model.coef_)
    elif algo == "sklearn_randomforest":
        weights = np.asarray(model.feature_importances_)
    else:
        weights = np.empty(0)
    if len(weights) == 1:
        weights = weights.transpose()
    return weights


def aggregate_feature_importance(imp, f_name):
    """Summarise feature importance across iterations.

    Args:
        imp: DataFrame with columns 'feature', 'iteration', 'importance'.
        f_name: unused; kept so existing calls keep working.

    Returns:
        DataFrame with columns ['feature_name', 'importance', 'std']: the mean
        and standard deviation, over iterations, of each feature's
        per-iteration mean importance.
    """
    # average within (feature, iteration) first, then across iterations
    per_iteration = imp.groupby(['feature', 'iteration'])['importance'].mean().reset_index()
    summary = per_iteration.groupby('feature')['importance'].agg(['mean', 'std']).reset_index()
    summary.columns = ['feature_name', 'importance', 'std']
    return summary


########################################## run ML ###########################################

def _build_feature_table(pairs, epitope_embeddings, tcr_embeddings, tcr_col,
                         feature_set, tcr_onehot, epitope_onehot):
    """Assemble the model input matrix for a table of epitope/TCR pairs.

    Args:
        pairs: table with an 'epitope' column and a `tcr_col` column.
        epitope_embeddings: embedding table looked up by epitope label.
        tcr_embeddings: embedding table looked up by TCR label.
        tcr_col: name of the TCR column in `pairs`.
        feature_set: "ESM3 + VJ genes", "all features", "ESMonly" or
            "withoutESM".
        tcr_onehot: extra TCR features looked up by TCR label.
        epitope_onehot: extra epitope features looked up by epitope label.

    Returns:
        DataFrame with one row per pair, indexed "<epitope>_<tcr>", with
        string column names.

    Raises:
        ValueError: if `feature_set` is not one of the supported names.
    """
    # Label-based lookup for all rows at once (replaces a row-wise apply).
    embedding = pd.DataFrame(
        np.hstack([
            epitope_embeddings.loc[pairs['epitope']].to_numpy(),
            tcr_embeddings.loc[pairs[tcr_col]].to_numpy(),
        ]),
        columns=epitope_embeddings.columns.tolist() + tcr_embeddings.columns.tolist(),
    )
    tcr_part = tcr_onehot.loc[pairs[tcr_col], :].reset_index(drop=True)
    epitope_part = epitope_onehot.loc[pairs['epitope'], :].reset_index(drop=True)

    if feature_set == "ESM3 + VJ genes":
        parts = [embedding, tcr_part]
    elif feature_set == "all features":
        parts = [embedding, tcr_part, epitope_part]
    elif feature_set == "ESMonly":
        parts = [embedding]
    elif feature_set == "withoutESM":
        parts = [tcr_part, epitope_part]
    else:
        raise ValueError(f"Unsupported features_name: {feature_set!r}")

    table = pd.concat(parts, axis=1)
    table.index = (pairs['epitope'] + "_" + pairs[tcr_col]).to_numpy()
    table.columns = table.columns.astype(str)
    return table


def run_ML(epitope,epitope_embeddings,tcr_embeddings):
    """Train and evaluate with `epitope` held out as the test epitope.

    Reads these notebook-level globals: `repetition`, `mat`, `tcr_features`,
    `ratio`, `features_name`, `df_encoded_TCR`, `df_encoded_epitope` and
    `algorithm`.

    For every repetition the pairs of all other epitopes form the training
    set and the pairs of `epitope` (at most 1000, sampled) form the test
    positives. Negatives are added to both by random pairing, and any TCR
    used in the test set is removed from the training set.

    NOTE(review): no random seed is set here, so sampling differs between
    runs unless the caller seeds `random` and numpy.

    Args:
        epitope: column label of `mat` to hold out.
        epitope_embeddings: embedding table looked up by epitope label.
        tcr_embeddings: embedding table looked up by TCR label.

    Returns:
        (probability_all, importance_all): predicted probabilities for the
        train and test rows of every repetition, and the per-feature weights
        of every repetition. `importance_all` is empty for algorithms that
        expose no weights.
    """
    print(epitope)

    max_pos_cases = 1000 # max positive cases for test set

    # The split itself does not depend on the repetition; build it once.
    base_train = mat.drop(epitope, axis=1).stack().reset_index().rename(columns={0:'value'})
    base_test = pd.DataFrame(mat[epitope]).stack().reset_index().rename(columns={0:'value'})
    base_test.columns = base_train.columns

    # Every TCR recorded as binding the held-out epitope. The training table
    # never contains this epitope, so checking candidate negatives against it
    # alone cannot exclude them.
    known_binders = set(base_test.loc[base_test['value'] == 1, tcr_features])

    train_rows = len(base_train)
    epitope_number = len(np.unique(base_train["epitope"]))

    probability_frames = []
    importance_frames = []
    for run_ite in range(1, repetition + 1):

        ############################## Negative sampling: random shuffling ##############################

        # pos / neg ratio
        negative_df = generate_negative_samples(base_train, tcr_features=tcr_features,
                                                num_epitopes=epitope_number,
                                                num_tcrs=ratio*round(train_rows/epitope_number))
        mat_train_tab = pd.concat([base_train[negative_df.columns], negative_df], ignore_index=True)

        mat_test_tab = base_test.sample(n=min(max_pos_cases, len(base_test)))
        mat_test_pos = len(mat_test_tab)

        # LOOCV, so TCRs have to be taken from training set
        negative_df = generate_negative_samples_epitope(mat_train_tab, tcr_features,
                                                        epitope=epitope, num_tcrs=ratio*mat_test_pos)
        # drop cross-reactive TCRs that are true binders of the held-out epitope
        negative_df = negative_df.loc[~negative_df[tcr_features].isin(known_binders)]
        mat_test_tab = pd.concat([mat_test_tab[negative_df.columns], negative_df], ignore_index=True)
        # remove the TCRs of the test set from the training set
        mat_train_tab = mat_train_tab.loc[~mat_train_tab[tcr_features].isin(mat_test_tab[tcr_features]), :]

        print(mat_train_tab.value.value_counts() )
        print(mat_test_tab.value.value_counts() )

        ################# features
        X_train = _build_feature_table(mat_train_tab, epitope_embeddings, tcr_embeddings,
                                       tcr_features, features_name,
                                       df_encoded_TCR, df_encoded_epitope)
        X_test = _build_feature_table(mat_test_tab, epitope_embeddings, tcr_embeddings,
                                      tcr_features, features_name,
                                      df_encoded_TCR, df_encoded_epitope)
        y_train = mat_train_tab.value.tolist()
        y_test = mat_test_tab.value.tolist()

        ############################################ run ML ############################################
        train_prob, test_prob, model = train (algorithm, X_train, y_train, X_test)

        p_test = pd.DataFrame(
            { 'split': "test",
              'epitope': epitope,
              'iteration': [run_ite] * len(y_test),
              'sample': X_test.index,
              'predicted_prob': test_prob,
              'RealClass': y_test
            }
        )

        p_train = pd.DataFrame(
            { 'split': "train",
              'epitope': epitope,
              'iteration': [run_ite] * len(y_train),
              'sample': X_train.index,
              'predicted_prob': train_prob,
              'RealClass': y_train
            }
        )

        probability_frames.extend([p_test, p_train])

        # Get feature importance from the model
        importance_values = np.asarray(
            get_feature_importance(model=model, algo=algorithm)).ravel()
        # Models without exposed weights yield an empty array; skip those.
        if importance_values.size:
            importance_frames.append(pd.DataFrame({
                'iteration': run_ite,
                'feature': X_train.columns,
                'importance': importance_values
            }))

    probability_all = (pd.concat(probability_frames, ignore_index=True)
                       if probability_frames else pd.DataFrame())
    importance_all = (pd.concat(importance_frames, ignore_index=True)
                      if importance_frames else pd.DataFrame())

    return probability_all, importance_all


from sklearn.metrics import roc_curve, auc
from numpy import interp

import seaborn as sns
import matplotlib.pyplot as plt


def plot_ROC_curve(probability):
    """Compute the AUC per epitope and draw a box/swarm plot of the values.

    The epitope of a row is the part of its 'sample' label before the first
    underscore. One AUC is computed per (iteration, epitope) and the values
    are averaged over iterations. The figure is shown and also saved as a PDF
    under "fig/"; the file name is built from the notebook-level globals
    `features_name`, `algorithm`, `N_TCRs`, `split`, `species` and
    `MHC_class`.

    NOTE(review): rows are not filtered by the 'split' column, so training
    rows passed in are scored together with test rows - confirm the caller
    passes test rows only.

    Args:
        probability: DataFrame with columns 'iteration', 'sample',
            'predicted_prob' and 'RealClass'.

    Returns:
        Series named 'AUC', indexed by epitope, holding the mean AUC.

    Raises:
        ValueError: if no (iteration, epitope) group contains both classes.
    """
    # Exact prefix match; a substring/regex match on the sample label can
    # pull in rows that belong to other epitopes.
    sample_epitope = probability["sample"].str.split('_', n=1).str[0]

    records = []
    for (iteration, epitope), pred_run in probability.groupby(
            [probability["iteration"], sample_epitope]):
        # ROC is undefined when only one class is present.
        if pred_run["RealClass"].nunique() < 2:
            print(f"Skipping {epitope} (iteration {iteration}): only one class present")
            continue
        fpr, tpr, thresh = roc_curve(pred_run["RealClass"], pred_run["predicted_prob"])
        records.append({"epitope": epitope, "AUC": auc(fpr, tpr)})

    if not records:
        raise ValueError("No epitope has both classes; cannot compute AUC")

    aucs_all = pd.DataFrame(records, columns=["epitope", "AUC"])
    auc_values = aucs_all.groupby('epitope')["AUC"].mean()
    print(auc_values)

    plt.figure(figsize=(3, 5))
    sns.boxplot(y=auc_values, color='lightblue', width=0.4) # Adjust width here
    sns.swarmplot(y=auc_values, color='darkred', size=6) # Adjust size here
    plt.ylabel('AUC')  # the values are drawn along the y axis
    plt.title('AUC Scores')
    plt.ylim(0, 1) # Set y-axis limits to 0-1
    # save to pdf
    os.makedirs("fig", exist_ok=True)
    plt.savefig(
        f"fig/auc_values_epitope_boxplot_{features_name}_{algorithm}"
        f"_N_TCRs{N_TCRs}_{split}_species_{species}_MHC_{MHC_class}.pdf",
        format='pdf', bbox_inches='tight')
    plt.show()

    print(f"Mean AUC: {np.mean(auc_values)}")
    print(f"Median AUC: {np.median(auc_values)}")

    return auc_values
"data": { + "text/plain": [ + " epitope cdr3_TRA \\\n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF DIYKGVYQFKSV CAGGADRLTF \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF DIYKGVYQFKSV CAASGGSNYNVLYF \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF DIYKGVYQFKSV CAASYNYAQGLTF \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF DIYKGVYQFKSV CAAQTGNYKYVF \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF DIYKGVYQFKSV CAASLTGGYKVVF \n", + "... ... ... \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF GMGPLLATV CAVLNNARLMF \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF GMGPLLATV CATDNDMRF \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF GMGPLLATV CAYRSFNNNDMRF \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF GMGPLLATV CAMTSFQKLVF \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF GMGPLLATV CAVLNNARLMF \n", + "\n", + " cdr3_TRB TRAV \\\n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF CASSPAGNTLYF TRAV14-3 \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF CAWSLWGGPSAETLYF TRAV14N-3 \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF CASRDWGGRQDTQYF TRAV14N-3 \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF CASGDAGTGQDTQYF TRAV14D-3-DV8 \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF CAWRTDNQDTQYF TRAV14N-3 \n", + "... ... ... \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF CASSVDRVADTQYF TRAV12-2 \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF CASSFGPDEQYF NaN \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF CASRSRGGHSPLHF NaN \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF CASSLRGEKNNYGYTF TRAV39 \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF CASSVDRVADTQYF NaN \n", + "\n", + " TRAJ TRBV \\\n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF TRAJ45 TRBV14 \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF TRAJ21 TRBV31 \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF TRAJ26 TRBV13-3 \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF TRAJ40 TRBV12-2+TRBV13-2 \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF TRAJ12 TRBV31 \n", + "... ... ... 
\n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF NaN TRBV27 \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF NaN TRBV13 \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF NaN TRBV13 \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF NaN TRBV13 \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF NaN TRBV27 \n", + "\n", + " TRBJ MHC MHC_class \\\n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF TRBJ1-3 H2-IAb MHCII \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF TRBJ2-3 H2-IAb MHCII \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF TRBJ2-5 H2-IAb MHCII \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF TRBJ2-5 H2-IAb MHCII \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF TRBJ2-5 H2-IAb MHCII \n", + "... ... ... ... \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF NaN HLA-A*02:01 MHCI \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF NaN HLA-A*02:01 MHCI \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF NaN HLA-A*02:01 MHCI \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF NaN HLA-A*02:01 MHCI \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF NaN HLA-A*02:01 MHCI \n", + "\n", + " species \\\n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF MusMusculus \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF MusMusculus \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF MusMusculus \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF MusMusculus \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF MusMusculus \n", + "... ... 
\n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF HomoSapiens \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF HomoSapiens \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF HomoSapiens \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF HomoSapiens \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF HomoSapiens \n", + "\n", + " cdr3 \\\n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF CAGGADRLTFCASSPAGNTLYF \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF CAASGGSNYNVLYFCAWSLWGGPSAETLYF \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF CAASYNYAQGLTFCASRDWGGRQDTQYF \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF CAAQTGNYKYVFCASGDAGTGQDTQYF \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF CAASLTGGYKVVFCAWRTDNQDTQYF \n", + "... ... \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF CAVLNNARLMFCASSVDRVADTQYF \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF CATDNDMRFCASSFGPDEQYF \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF CAYRSFNNNDMRFCASRSRGGHSPLHF \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF CAMTSFQKLVFCASSLRGEKNNYGYTF \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF CAVLNNARLMFCASSVDRVADTQYF \n", + "\n", + " value \n", + "DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF 1 \n", + "DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF 1 \n", + "DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF 1 \n", + "DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF 1 \n", + "DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF 1 \n", + "... ... \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF 1 \n", + "GMGPLLATV_CATDNDMRFCASSFGPDEQYF 1 \n", + "GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF 1 \n", + "GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF 1 \n", + "GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF 1 \n", + "\n", + "[17676 rows x 12 columns]" + ], + "text/html": [ + "\n", + "
\n", + " | epitope | \n", + "cdr3_TRA | \n", + "cdr3_TRB | \n", + "TRAV | \n", + "TRAJ | \n", + "TRBV | \n", + "TRBJ | \n", + "MHC | \n", + "MHC_class | \n", + "species | \n", + "cdr3 | \n", + "value | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
DIYKGVYQFKSV_CAGGADRLTFCASSPAGNTLYF | \n", + "DIYKGVYQFKSV | \n", + "CAGGADRLTF | \n", + "CASSPAGNTLYF | \n", + "TRAV14-3 | \n", + "TRAJ45 | \n", + "TRBV14 | \n", + "TRBJ1-3 | \n", + "H2-IAb | \n", + "MHCII | \n", + "MusMusculus | \n", + "CAGGADRLTFCASSPAGNTLYF | \n", + "1 | \n", + "
DIYKGVYQFKSV_CAASGGSNYNVLYFCAWSLWGGPSAETLYF | \n", + "DIYKGVYQFKSV | \n", + "CAASGGSNYNVLYF | \n", + "CAWSLWGGPSAETLYF | \n", + "TRAV14N-3 | \n", + "TRAJ21 | \n", + "TRBV31 | \n", + "TRBJ2-3 | \n", + "H2-IAb | \n", + "MHCII | \n", + "MusMusculus | \n", + "CAASGGSNYNVLYFCAWSLWGGPSAETLYF | \n", + "1 | \n", + "
DIYKGVYQFKSV_CAASYNYAQGLTFCASRDWGGRQDTQYF | \n", + "DIYKGVYQFKSV | \n", + "CAASYNYAQGLTF | \n", + "CASRDWGGRQDTQYF | \n", + "TRAV14N-3 | \n", + "TRAJ26 | \n", + "TRBV13-3 | \n", + "TRBJ2-5 | \n", + "H2-IAb | \n", + "MHCII | \n", + "MusMusculus | \n", + "CAASYNYAQGLTFCASRDWGGRQDTQYF | \n", + "1 | \n", + "
DIYKGVYQFKSV_CAAQTGNYKYVFCASGDAGTGQDTQYF | \n", + "DIYKGVYQFKSV | \n", + "CAAQTGNYKYVF | \n", + "CASGDAGTGQDTQYF | \n", + "TRAV14D-3-DV8 | \n", + "TRAJ40 | \n", + "TRBV12-2+TRBV13-2 | \n", + "TRBJ2-5 | \n", + "H2-IAb | \n", + "MHCII | \n", + "MusMusculus | \n", + "CAAQTGNYKYVFCASGDAGTGQDTQYF | \n", + "1 | \n", + "
DIYKGVYQFKSV_CAASLTGGYKVVFCAWRTDNQDTQYF | \n", + "DIYKGVYQFKSV | \n", + "CAASLTGGYKVVF | \n", + "CAWRTDNQDTQYF | \n", + "TRAV14N-3 | \n", + "TRAJ12 | \n", + "TRBV31 | \n", + "TRBJ2-5 | \n", + "H2-IAb | \n", + "MHCII | \n", + "MusMusculus | \n", + "CAASLTGGYKVVFCAWRTDNQDTQYF | \n", + "1 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF | \n", + "GMGPLLATV | \n", + "CAVLNNARLMF | \n", + "CASSVDRVADTQYF | \n", + "TRAV12-2 | \n", + "NaN | \n", + "TRBV27 | \n", + "NaN | \n", + "HLA-A*02:01 | \n", + "MHCI | \n", + "HomoSapiens | \n", + "CAVLNNARLMFCASSVDRVADTQYF | \n", + "1 | \n", + "
GMGPLLATV_CATDNDMRFCASSFGPDEQYF | \n", + "GMGPLLATV | \n", + "CATDNDMRF | \n", + "CASSFGPDEQYF | \n", + "NaN | \n", + "NaN | \n", + "TRBV13 | \n", + "NaN | \n", + "HLA-A*02:01 | \n", + "MHCI | \n", + "HomoSapiens | \n", + "CATDNDMRFCASSFGPDEQYF | \n", + "1 | \n", + "
GMGPLLATV_CAYRSFNNNDMRFCASRSRGGHSPLHF | \n", + "GMGPLLATV | \n", + "CAYRSFNNNDMRF | \n", + "CASRSRGGHSPLHF | \n", + "NaN | \n", + "NaN | \n", + "TRBV13 | \n", + "NaN | \n", + "HLA-A*02:01 | \n", + "MHCI | \n", + "HomoSapiens | \n", + "CAYRSFNNNDMRFCASRSRGGHSPLHF | \n", + "1 | \n", + "
GMGPLLATV_CAMTSFQKLVFCASSLRGEKNNYGYTF | \n", + "GMGPLLATV | \n", + "CAMTSFQKLVF | \n", + "CASSLRGEKNNYGYTF | \n", + "TRAV39 | \n", + "NaN | \n", + "TRBV13 | \n", + "NaN | \n", + "HLA-A*02:01 | \n", + "MHCI | \n", + "HomoSapiens | \n", + "CAMTSFQKLVFCASSLRGEKNNYGYTF | \n", + "1 | \n", + "
GMGPLLATV_CAVLNNARLMFCASSVDRVADTQYF | \n", + "GMGPLLATV | \n", + "CAVLNNARLMF | \n", + "CASSVDRVADTQYF | \n", + "NaN | \n", + "NaN | \n", + "TRBV27 | \n", + "NaN | \n", + "HLA-A*02:01 | \n", + "MHCI | \n", + "HomoSapiens | \n", + "CAVLNNARLMFCASSVDRVADTQYF | \n", + "1 | \n", + "
17676 rows × 12 columns
\n", + "\n", + " | MHC_H2-Db | \n", + "MHC_H2-IAb | \n", + "MHC_H2-IEk | \n", + "MHC_H2-Kb | \n", + "MHC_H2-Kd | \n", + "MHC_H2-Ld | \n", + "MHC_HLA-A*02:01 | \n", + "MHC_HLA-A*08:01 | \n", + "MHC_HLA-A*11:01 | \n", + "MHC_HLA-A*24:02 | \n", + "... | \n", + "MHC_HLA-DQA1:02/DQB1*06:02 | \n", + "MHC_HLA-DRA:01 | \n", + "MHC_HLA-DRA:01/DRB1:01 | \n", + "MHC_HLA-DRB1*04:01 | \n", + "MHC_HLA-DRB1*04:05 | \n", + "MHC_HLA-DRB1*07:01 | \n", + "MHC_HLA-DRB1*11:01 | \n", + "MHC_HLA-DRB1:01 | \n", + "MHC_class_MHCII | \n", + "species_MusMusculus | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
epitope | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
DIYKGVYQFKSV | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
GILGFVFTL | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
SSYRRPVGI | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
SSLENFRAYV | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
LLWNGPMAV | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
PQPELPYPQPE | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "
PGVLLKEFTVSGNIL | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "
LLLEWLAMA | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
KGYVYQGL | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "1.0 | \n", + "
GMGPLLATV | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
144 rows × 37 columns
\n", + "\n", + " | TRAV_TCRAV12-1 | \n", + "TRAV_TCRAV17 | \n", + "TRAV_TCRAV19 | \n", + "TRAV_TCRAV21 | \n", + "TRAV_TCRAV23/DV6 | \n", + "TRAV_TCRAV3 | \n", + "TRAV_TCRAV38-1 | \n", + "TRAV_TCRAV38-2/DV8 | \n", + "TRAV_TCRAV41 | \n", + "TRAV_TRAV-2 | \n", + "... | \n", + "TRBJ_TRBJ2-5 | \n", + "TRBJ_TRBJ2-6 | \n", + "TRBJ_TRBJ2-7 | \n", + "TRBJ_TRBJ2-7 | \n", + "TRBJ_TRBJ20-1 | \n", + "TRBJ_TRBJ24-1 | \n", + "TRBJ_TRBJ38-2/DV8 | \n", + "TRBJ_TRBJ5-1 | \n", + "TRBJ_TRBJ5-6 | \n", + "TRBJ_nan | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cdr3 | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
CAGGADRLTFCASSPAGNTLYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
CAASGGSNYNVLYFCAWSLWGGPSAETLYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
CAASYNYAQGLTFCASRDWGGRQDTQYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
CAAQTGNYKYVFCASGDAGTGQDTQYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
CAASLTGGYKVVFCAWRTDNQDTQYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
CAYRSGEYGNKLVFCASSMAGSSYEQYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
CAYRSFNNNDMRFCASRSRGGHSPLHF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
CATDNDMRFCASSFGPDEQYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
CAVLNNARLMFCASSVDRVADTQYF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
CAMTSFQKLVFCASSLRGEKNNYGYTF | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "1.0 | \n", + "
14106 rows × 508 columns
\n", + "\n", + " | count | \n", + "
---|---|
MHC_class | \n", + "\n", + " |
MHCI | \n", + "13248 | \n", + "
MHCII | \n", + "4428 | \n", + "
\n", + " | split | \n", + "epitope | \n", + "iteration | \n", + "sample | \n", + "predicted_prob | \n", + "RealClass | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "test | \n", + "ALSKGVHFV | \n", + "1 | \n", + "ALSKGVHFV_CAVEDGQKLLFCASSPGGTATYEQYF | \n", + "0.998830 | \n", + "1.0 | \n", + "
1 | \n", + "test | \n", + "ALSKGVHFV | \n", + "1 | \n", + "ALSKGVHFV_CALGSDSWGKLQFCASSLAGDSYNEQFF | \n", + "0.877127 | \n", + "1.0 | \n", + "
2 | \n", + "test | \n", + "ALSKGVHFV | \n", + "1 | \n", + "ALSKGVHFV_CALSEGRDDKIIFCASSIVPWDTQYF | \n", + "0.411237 | \n", + "1.0 | \n", + "
3 | \n", + "test | \n", + "ALSKGVHFV | \n", + "1 | \n", + "ALSKGVHFV_CAVAPFGNEKLTFCASSTQSTVNIQYF | \n", + "0.700061 | \n", + "1.0 | \n", + "
4 | \n", + "test | \n", + "ALSKGVHFV | \n", + "1 | \n", + "ALSKGVHFV_CAMRGRTGNQFYFCASSQKLAGDNEQFF | \n", + "0.884197 | \n", + "1.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
83335 | \n", + "train | \n", + "ALSKGVHFV | \n", + "1 | \n", + "CTELKLSDY_CAVTTDSWGKLQFCASSRQPMNTEAFF | \n", + "0.460549 | \n", + "0.0 | \n", + "
83336 | \n", + "train | \n", + "ALSKGVHFV | \n", + "1 | \n", + "CTELKLSDY_CAAKEGYSTLTFCASSEGDRVTEAFF | \n", + "0.230751 | \n", + "0.0 | \n", + "
83337 | \n", + "train | \n", + "ALSKGVHFV | \n", + "1 | \n", + "CTELKLSDY_CAVVYPLTHGSSNTGKLIFCASSLEGQLNEQFF | \n", + "0.478130 | \n", + "0.0 | \n", + "
83338 | \n", + "train | \n", + "ALSKGVHFV | \n", + "1 | \n", + "CTELKLSDY_CAAEAGAGNKLTFCASGDSANSDYTF | \n", + "0.083402 | \n", + "0.0 | \n", + "
83339 | \n", + "train | \n", + "ALSKGVHFV | \n", + "1 | \n", + "CTELKLSDY_CAMRVSGGSNAKLTFCASRGGANTGQLYF | \n", + "0.241574 | \n", + "0.0 | \n", + "
83340 rows × 6 columns
\n", + "