diff --git a/notebooks/openai_embeddings_v2/De_duping_and_Averaging_Status_Embeddings_(20240216).ipynb b/notebooks/openai_embeddings_v2/De_duping_and_Averaging_Status_Embeddings_(20240216).ipynb
new file mode 100644
index 0000000..c1b7718
--- /dev/null
+++ b/notebooks/openai_embeddings_v2/De_duping_and_Averaging_Status_Embeddings_(20240216).ipynb
@@ -0,0 +1,2724 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "In this notebook, we prepare a clean (de-duped) version of the status embeddings. We also re-construct user embeddings by averaging each user's status embeddings.\n",
+ "\n",
+ "This notebook saves both datasets back to drive for further analysis."
+ ],
+ "metadata": {
+ "id": "ec7rVSPxy567"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Google Drive"
+ ],
+ "metadata": {
+ "id": "Rrp_6meLhC7G"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "from google.colab import drive\n",
+ "\n",
+ "drive.mount('/content/drive')\n",
+ "print(os.getcwd(), os.listdir(os.getcwd()))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1_eMFG-JgS0r",
+ "outputId": "476e8c07-952e-4dad-c5ee-6348d744668b"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n",
+ "/content ['.config', 'drive', 'sample_data']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# you might need to create a google drive SHORTCUT that has this same path\n",
+ "# ... or update the path to use your own google drive organization\n",
+ "DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'\n",
+ "\n",
+ "print(DIRPATH)\n",
+ "os.path.isdir(DIRPATH)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "x0kkbd6agWrY",
+ "outputId": "8b801e86-7bd0-43f5-c478-22c781bda63d"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/drive/MyDrive/Research/DS Research Shared 2024\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "DATA_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"data\")\n",
+ "os.path.isdir(DATA_DIRPATH)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1ShYrv5_gYcs",
+ "outputId": "da03ac2a-52b9-4ae8-de9e-d340b3dc12f3"
+ },
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Data Loading"
+ ],
+ "metadata": {
+ "id": "YEJ6uTOjg0Uo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 728
+ },
+ "id": "yXVSfTL6gQ5K",
+ "outputId": "9e8e977b-768e-4308-ad47-65e179965ae4"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(183815, 1541)\n",
+ "Index(['user_id', 'status_id', 'status_text', 'created_at', 'embeds_length',\n",
+ " 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',\n",
+ " ...\n",
+ " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n",
+ " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n",
+ " 'openai_1534', 'openai_1535'],\n",
+ " dtype='object', length=1541)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id status_id \\\n",
+ "0 897845802701377536 1221540755451392001 \n",
+ "1 935739601301458947 1223458629837295619 \n",
+ "2 571774622 1217445781663363072 \n",
+ "3 384679808 1223705594818748416 \n",
+ "4 701264221653217281 1218459840277729281 \n",
+ "\n",
+ " status_text \\\n",
+ "0 Doubt it..It appears they all have gone the wa... \n",
+ "1 RT @Wyn1745: Democrats are ‘setting the stage’... \n",
+ "2 RT @sarahdwire: I’m loathe to insert myself in... \n",
+ "3 RT @RepRatcliffe: We warned them...As Schiff a... \n",
+ "4 RT @chipfranklin: Because \"impeachment\" in the... \n",
+ "\n",
+ " created_at embeds_length openai_0 openai_1 openai_2 \\\n",
+ "0 2020-01-26 21:09:45+00:00 1536 -0.020428 -0.006720 0.007308 \n",
+ "1 2020-02-01 04:10:42+00:00 1536 -0.036689 -0.007481 0.007968 \n",
+ "2 2020-01-15 13:57:48+00:00 1536 -0.033382 -0.006886 -0.003244 \n",
+ "3 2020-02-01 20:32:03+00:00 1536 -0.008477 -0.007364 0.000919 \n",
+ "4 2020-01-18 09:07:18+00:00 1536 -0.009454 0.017376 0.007016 \n",
+ "\n",
+ " openai_3 openai_4 ... openai_1526 openai_1527 openai_1528 \\\n",
+ "0 -0.022157 -0.041841 ... 0.014616 0.004705 0.012661 \n",
+ "1 -0.006632 -0.022805 ... -0.001696 0.002522 0.020397 \n",
+ "2 -0.015834 0.000172 ... 0.001027 0.002464 0.002013 \n",
+ "3 -0.006435 0.008101 ... -0.028269 0.003193 0.015056 \n",
+ "4 -0.020075 -0.023674 ... -0.013590 0.015564 0.005130 \n",
+ "\n",
+ " openai_1529 openai_1530 openai_1531 openai_1532 openai_1533 \\\n",
+ "0 -0.020974 -0.003458 0.045166 0.029871 -0.021186 \n",
+ "1 -0.046374 -0.046611 0.021068 -0.000085 -0.003701 \n",
+ "2 -0.032766 -0.034265 0.006545 0.014804 0.003027 \n",
+ "3 -0.015333 -0.028137 0.032510 0.010327 -0.013621 \n",
+ "4 0.003077 -0.029167 0.015523 0.017914 -0.008789 \n",
+ "\n",
+ " openai_1534 openai_1535 \n",
+ "0 -0.003376 -0.024937 \n",
+ "1 -0.015370 -0.019213 \n",
+ "2 -0.001518 -0.030946 \n",
+ "3 -0.007686 -0.016216 \n",
+ "4 -0.019767 -0.042353 \n",
+ "\n",
+ "[5 rows x 1541 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " status_id | \n",
+ " status_text | \n",
+ " created_at | \n",
+ " embeds_length | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " ... | \n",
+ " openai_1526 | \n",
+ " openai_1527 | \n",
+ " openai_1528 | \n",
+ " openai_1529 | \n",
+ " openai_1530 | \n",
+ " openai_1531 | \n",
+ " openai_1532 | \n",
+ " openai_1533 | \n",
+ " openai_1534 | \n",
+ " openai_1535 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 897845802701377536 | \n",
+ " 1221540755451392001 | \n",
+ " Doubt it..It appears they all have gone the wa... | \n",
+ " 2020-01-26 21:09:45+00:00 | \n",
+ " 1536 | \n",
+ " -0.020428 | \n",
+ " -0.006720 | \n",
+ " 0.007308 | \n",
+ " -0.022157 | \n",
+ " -0.041841 | \n",
+ " ... | \n",
+ " 0.014616 | \n",
+ " 0.004705 | \n",
+ " 0.012661 | \n",
+ " -0.020974 | \n",
+ " -0.003458 | \n",
+ " 0.045166 | \n",
+ " 0.029871 | \n",
+ " -0.021186 | \n",
+ " -0.003376 | \n",
+ " -0.024937 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 935739601301458947 | \n",
+ " 1223458629837295619 | \n",
+ " RT @Wyn1745: Democrats are ‘setting the stage’... | \n",
+ " 2020-02-01 04:10:42+00:00 | \n",
+ " 1536 | \n",
+ " -0.036689 | \n",
+ " -0.007481 | \n",
+ " 0.007968 | \n",
+ " -0.006632 | \n",
+ " -0.022805 | \n",
+ " ... | \n",
+ " -0.001696 | \n",
+ " 0.002522 | \n",
+ " 0.020397 | \n",
+ " -0.046374 | \n",
+ " -0.046611 | \n",
+ " 0.021068 | \n",
+ " -0.000085 | \n",
+ " -0.003701 | \n",
+ " -0.015370 | \n",
+ " -0.019213 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 571774622 | \n",
+ " 1217445781663363072 | \n",
+ " RT @sarahdwire: I’m loathe to insert myself in... | \n",
+ " 2020-01-15 13:57:48+00:00 | \n",
+ " 1536 | \n",
+ " -0.033382 | \n",
+ " -0.006886 | \n",
+ " -0.003244 | \n",
+ " -0.015834 | \n",
+ " 0.000172 | \n",
+ " ... | \n",
+ " 0.001027 | \n",
+ " 0.002464 | \n",
+ " 0.002013 | \n",
+ " -0.032766 | \n",
+ " -0.034265 | \n",
+ " 0.006545 | \n",
+ " 0.014804 | \n",
+ " 0.003027 | \n",
+ " -0.001518 | \n",
+ " -0.030946 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 384679808 | \n",
+ " 1223705594818748416 | \n",
+ " RT @RepRatcliffe: We warned them...As Schiff a... | \n",
+ " 2020-02-01 20:32:03+00:00 | \n",
+ " 1536 | \n",
+ " -0.008477 | \n",
+ " -0.007364 | \n",
+ " 0.000919 | \n",
+ " -0.006435 | \n",
+ " 0.008101 | \n",
+ " ... | \n",
+ " -0.028269 | \n",
+ " 0.003193 | \n",
+ " 0.015056 | \n",
+ " -0.015333 | \n",
+ " -0.028137 | \n",
+ " 0.032510 | \n",
+ " 0.010327 | \n",
+ " -0.013621 | \n",
+ " -0.007686 | \n",
+ " -0.016216 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 701264221653217281 | \n",
+ " 1218459840277729281 | \n",
+ " RT @chipfranklin: Because \"impeachment\" in the... | \n",
+ " 2020-01-18 09:07:18+00:00 | \n",
+ " 1536 | \n",
+ " -0.009454 | \n",
+ " 0.017376 | \n",
+ " 0.007016 | \n",
+ " -0.020075 | \n",
+ " -0.023674 | \n",
+ " ... | \n",
+ " -0.013590 | \n",
+ " 0.015564 | \n",
+ " 0.005130 | \n",
+ " 0.003077 | \n",
+ " -0.029167 | \n",
+ " 0.015523 | \n",
+ " 0.017914 | \n",
+ " -0.008789 | \n",
+ " -0.019767 | \n",
+ " -0.042353 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1541 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "statuses_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ],
+ "source": [
+ "from pandas import read_parquet\n",
+ "\n",
+ "pq_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip\")\n",
+ "statuses_df = read_parquet(pq_filepath)\n",
+ "print(statuses_df.shape)\n",
+ "print(statuses_df.columns)\n",
+ "statuses_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "statuses_df[\"user_id\"].nunique()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NGVktpyCkgJM",
+ "outputId": "b82250ad-387e-462e-87dc-fdcc26fdefbd"
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "7566"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "len(statuses_df)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "H3GDRXLees44",
+ "outputId": "eff5b6a0-37b5-4472-a24e-74d7b6e24bc0"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "183815"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "statuses_df[\"status_id\"].nunique()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "fJ5VxZ7sep0o",
+ "outputId": "4357116c-8b4b-4f76-81ad-b945ad9c29bf"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "183727"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Oh no, the status IDs are not unique? There are 183,815 rows but only 183,727 unique status IDs."
+ ],
+ "metadata": {
+ "id": "exkSsNEUe7ku"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "statuses_df[\"status_id\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NlpK3H50fGYd",
+ "outputId": "73a1f533-5705-4ae0-f6ef-f03b2c7c3f5b"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "1234905353650761728 6\n",
+ "1209143341901737984 3\n",
+ "1209173027772076033 3\n",
+ "1207894148151308289 2\n",
+ "1217603880453718016 2\n",
+ " ..\n",
+ "1216442996260003840 1\n",
+ "1225979782745272325 1\n",
+ "1206336484170702849 1\n",
+ "1239320120071200771 1\n",
+ "1222940911023333376 1\n",
+ "Name: status_id, Length: 183727, dtype: Int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "statuses_df[statuses_df[\"status_id\"].duplicated(keep=False)].sort_values(\"status_id\")"
+ ],
+ "metadata": {
+ "id": "rNIISrzo5Wmo"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The embedding values appear to be the same across the duplicate rows of each status, so we can keep the first row for each status."
+ ],
+ "metadata": {
+ "id": "AtVp6ErB5a78"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## De-Duping"
+ ],
+ "metadata": {
+ "id": "09UuKDRUg4B4"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "After de-duping, we expect 183,727 unique statuses:"
+ ],
+ "metadata": {
+ "id": "XyicPnJXg5ZE"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(statuses_df.shape)\n",
+ "statuses_df.drop_duplicates(subset=[\"status_id\"], inplace=True)\n",
+ "print(statuses_df.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dIdPEknwhLIS",
+ "outputId": "1cb9f0d0-8e57-4927-bb0f-065f7e9d5f31"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(183815, 1541)\n",
+ "(183727, 1541)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Saving to drive:"
+ ],
+ "metadata": {
+ "id": "fX0pkB_5i5y1"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pq_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip\")\n",
+ "\n",
+ "statuses_df.to_parquet(pq_filepath, compression=\"gzip\")"
+ ],
+ "metadata": {
+ "id": "e1RHJx9ni0g6"
+ },
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Averaging Embeddings per User"
+ ],
+ "metadata": {
+ "id": "dYPPWlVfg5y4"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "statuses_df.groupby(\"user_id\")[\"status_id\"].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6QmU71bJg8xW",
+ "outputId": "491a0382-f841-48dc-b079-b272168eb167"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "user_id\n",
+ "2952 6\n",
+ "635553 12\n",
+ "656993 1\n",
+ "761154 4\n",
+ "777554 1\n",
+ " ..\n",
+ "1234200349600288772 50\n",
+ "1234846911028453376 1\n",
+ "1237940420136456192 4\n",
+ "1238854780191195136 1\n",
+ "1240138605726760962 1\n",
+ "Name: status_id, Length: 7566, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "embeddings_cols = [col for col in statuses_df.columns if \"openai\" in col]\n",
+ "print(len(embeddings_cols))\n",
+ "print(embeddings_cols[0], \"...\", embeddings_cols[-1])"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "n_cGPpiKg2Py",
+ "outputId": "4fada281-c3e3-4d99-c8da-22c17f861bf5"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "1536\n",
+ "openai_0 ... openai_1535\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages = statuses_df.groupby(\"user_id\")[embeddings_cols].mean()\n",
+ "print(averages.shape)\n",
+ "averages.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 305
+ },
+ "id": "5befd8PGiJxQ",
+ "outputId": "5f5f977d-1b36-4e4e-ac58-262bbb4f7f95"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(7566, 1536)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 openai_6 \\\n",
+ "user_id \n",
+ "2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 0.000960 \n",
+ "635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 -0.011840 \n",
+ "656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 -0.015786 \n",
+ "761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 -0.007047 \n",
+ "777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 0.019815 \n",
+ "\n",
+ " openai_7 openai_8 openai_9 ... openai_1526 openai_1527 \\\n",
+ "user_id ... \n",
+ "2952 -0.015426 -0.006430 0.001027 ... -0.012285 0.001094 \n",
+ "635553 -0.010581 -0.010859 -0.003771 ... -0.005960 -0.007866 \n",
+ "656993 0.008556 -0.022145 -0.017026 ... -0.018110 0.007116 \n",
+ "761154 -0.024175 0.001368 0.002065 ... 0.013326 -0.020819 \n",
+ "777554 -0.019171 -0.017594 -0.006209 ... 0.007358 0.010696 \n",
+ "\n",
+ " openai_1528 openai_1529 openai_1530 openai_1531 openai_1532 \\\n",
+ "user_id \n",
+ "2952 0.015767 -0.026536 -0.024981 0.015113 0.018588 \n",
+ "635553 0.010948 -0.021376 -0.023424 0.020705 0.005084 \n",
+ "656993 -0.004877 -0.032427 -0.023885 -0.000715 0.003886 \n",
+ "761154 0.007364 -0.016794 -0.049548 0.013037 0.024798 \n",
+ "777554 0.008784 -0.024808 -0.008042 0.011077 0.001996 \n",
+ "\n",
+ " openai_1533 openai_1534 openai_1535 \n",
+ "user_id \n",
+ "2952 -0.002324 -0.003782 -0.028532 \n",
+ "635553 -0.011961 -0.003258 -0.026262 \n",
+ "656993 -0.024242 0.003839 -0.048883 \n",
+ "761154 -0.008543 0.006142 -0.035867 \n",
+ "777554 -0.001104 -0.019460 -0.030301 \n",
+ "\n",
+ "[5 rows x 1536 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " openai_5 | \n",
+ " openai_6 | \n",
+ " openai_7 | \n",
+ " openai_8 | \n",
+ " openai_9 | \n",
+ " ... | \n",
+ " openai_1526 | \n",
+ " openai_1527 | \n",
+ " openai_1528 | \n",
+ " openai_1529 | \n",
+ " openai_1530 | \n",
+ " openai_1531 | \n",
+ " openai_1532 | \n",
+ " openai_1533 | \n",
+ " openai_1534 | \n",
+ " openai_1535 | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2952 | \n",
+ " -0.023816 | \n",
+ " 0.002004 | \n",
+ " 0.004429 | \n",
+ " -0.019361 | \n",
+ " -0.009860 | \n",
+ " 0.004878 | \n",
+ " 0.000960 | \n",
+ " -0.015426 | \n",
+ " -0.006430 | \n",
+ " 0.001027 | \n",
+ " ... | \n",
+ " -0.012285 | \n",
+ " 0.001094 | \n",
+ " 0.015767 | \n",
+ " -0.026536 | \n",
+ " -0.024981 | \n",
+ " 0.015113 | \n",
+ " 0.018588 | \n",
+ " -0.002324 | \n",
+ " -0.003782 | \n",
+ " -0.028532 | \n",
+ "
\n",
+ " \n",
+ " 635553 | \n",
+ " -0.030022 | \n",
+ " -0.006063 | \n",
+ " 0.017259 | \n",
+ " -0.018501 | \n",
+ " -0.008536 | \n",
+ " 0.004416 | \n",
+ " -0.011840 | \n",
+ " -0.010581 | \n",
+ " -0.010859 | \n",
+ " -0.003771 | \n",
+ " ... | \n",
+ " -0.005960 | \n",
+ " -0.007866 | \n",
+ " 0.010948 | \n",
+ " -0.021376 | \n",
+ " -0.023424 | \n",
+ " 0.020705 | \n",
+ " 0.005084 | \n",
+ " -0.011961 | \n",
+ " -0.003258 | \n",
+ " -0.026262 | \n",
+ "
\n",
+ " \n",
+ " 656993 | \n",
+ " -0.010723 | \n",
+ " 0.008235 | \n",
+ " 0.004192 | \n",
+ " -0.040441 | \n",
+ " -0.015172 | \n",
+ " 0.012798 | \n",
+ " -0.015786 | \n",
+ " 0.008556 | \n",
+ " -0.022145 | \n",
+ " -0.017026 | \n",
+ " ... | \n",
+ " -0.018110 | \n",
+ " 0.007116 | \n",
+ " -0.004877 | \n",
+ " -0.032427 | \n",
+ " -0.023885 | \n",
+ " -0.000715 | \n",
+ " 0.003886 | \n",
+ " -0.024242 | \n",
+ " 0.003839 | \n",
+ " -0.048883 | \n",
+ "
\n",
+ " \n",
+ " 761154 | \n",
+ " -0.021389 | \n",
+ " -0.004747 | \n",
+ " 0.006925 | \n",
+ " -0.017395 | \n",
+ " -0.011900 | \n",
+ " 0.018309 | \n",
+ " -0.007047 | \n",
+ " -0.024175 | \n",
+ " 0.001368 | \n",
+ " 0.002065 | \n",
+ " ... | \n",
+ " 0.013326 | \n",
+ " -0.020819 | \n",
+ " 0.007364 | \n",
+ " -0.016794 | \n",
+ " -0.049548 | \n",
+ " 0.013037 | \n",
+ " 0.024798 | \n",
+ " -0.008543 | \n",
+ " 0.006142 | \n",
+ " -0.035867 | \n",
+ "
\n",
+ " \n",
+ " 777554 | \n",
+ " -0.009369 | \n",
+ " -0.009612 | \n",
+ " 0.012470 | \n",
+ " 0.005079 | \n",
+ " -0.019303 | \n",
+ " -0.010459 | \n",
+ " 0.019815 | \n",
+ " -0.019171 | \n",
+ " -0.017594 | \n",
+ " -0.006209 | \n",
+ " ... | \n",
+ " 0.007358 | \n",
+ " 0.010696 | \n",
+ " 0.008784 | \n",
+ " -0.024808 | \n",
+ " -0.008042 | \n",
+ " 0.011077 | \n",
+ " 0.001996 | \n",
+ " -0.001104 | \n",
+ " -0.019460 | \n",
+ " -0.030301 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1536 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "averages"
+ }
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Get user labels from CSV file:"
+ ],
+ "metadata": {
+ "id": "kqcnjUEskVsQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pandas import read_csv\n",
+ "\n",
+ "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\")\n",
+ "users_df = read_csv(csv_filepath, compression=\"gzip\")\n",
+ "print(users_df.shape)\n",
+ "print(users_df.columns)\n",
+ "users_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 416
+ },
+ "id": "QzbQbChmkGxa",
+ "outputId": "87b480ea-a332-42c8-acb6-517e3f196b35"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(7566, 1547)\n",
+ "Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',\n",
+ " 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',\n",
+ " 'bom_astroturf',\n",
+ " ...\n",
+ " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n",
+ " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n",
+ " 'openai_1534', 'openai_1535'],\n",
+ " dtype='object', length=1547)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id created_on status_count rt_count is_bot \\\n",
+ "0 3420436216 2015-08-13 555 540 True \n",
+ "1 108121958 2010-01-24 2 2 False \n",
+ "2 3038308638 2015-02-23 755 665 True \n",
+ "3 332396536 2011-07-09 951 951 True \n",
+ "4 955082522479808512 2018-01-21 570 533 True \n",
+ "\n",
+ " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n",
+ "0 0 False 0.056113 1.983193 0.295 ... \n",
+ "1 0 False 0.456710 NaN 0.580 ... \n",
+ "2 0 False 0.069860 3.401786 0.970 ... \n",
+ "3 1 False 0.044264 2.304511 0.580 ... \n",
+ "4 0 False 0.049325 4.714286 0.355 ... \n",
+ "\n",
+ " openai_1526 openai_1527 openai_1528 openai_1529 openai_1530 \\\n",
+ "0 -0.001867 -0.013167 0.020885 -0.022568 -0.033631 \n",
+ "1 0.017651 -0.009439 0.024375 -0.032553 -0.042185 \n",
+ "2 -0.026273 -0.008139 0.030285 -0.029902 -0.030887 \n",
+ "3 -0.005520 -0.005288 0.017071 -0.033637 -0.040202 \n",
+ "4 0.009959 0.004695 0.005555 -0.012851 -0.032229 \n",
+ "\n",
+ " openai_1531 openai_1532 openai_1533 openai_1534 openai_1535 \n",
+ "0 0.016153 0.024127 -0.017519 0.002636 -0.039838 \n",
+ "1 0.013782 0.011320 -0.014862 -0.010413 -0.020359 \n",
+ "2 0.022481 -0.005476 -0.016279 -0.010138 -0.021454 \n",
+ "3 0.041773 -0.009370 0.003352 0.009391 -0.042671 \n",
+ "4 0.031443 0.008163 -0.018501 -0.008724 -0.042027 \n",
+ "\n",
+ "[5 rows x 1547 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " created_on | \n",
+ " status_count | \n",
+ " rt_count | \n",
+ " is_bot | \n",
+ " opinion_community | \n",
+ " is_q | \n",
+ " avg_toxicity | \n",
+ " avg_fact_score | \n",
+ " bom_astroturf | \n",
+ " ... | \n",
+ " openai_1526 | \n",
+ " openai_1527 | \n",
+ " openai_1528 | \n",
+ " openai_1529 | \n",
+ " openai_1530 | \n",
+ " openai_1531 | \n",
+ " openai_1532 | \n",
+ " openai_1533 | \n",
+ " openai_1534 | \n",
+ " openai_1535 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3420436216 | \n",
+ " 2015-08-13 | \n",
+ " 555 | \n",
+ " 540 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.056113 | \n",
+ " 1.983193 | \n",
+ " 0.295 | \n",
+ " ... | \n",
+ " -0.001867 | \n",
+ " -0.013167 | \n",
+ " 0.020885 | \n",
+ " -0.022568 | \n",
+ " -0.033631 | \n",
+ " 0.016153 | \n",
+ " 0.024127 | \n",
+ " -0.017519 | \n",
+ " 0.002636 | \n",
+ " -0.039838 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 108121958 | \n",
+ " 2010-01-24 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.456710 | \n",
+ " NaN | \n",
+ " 0.580 | \n",
+ " ... | \n",
+ " 0.017651 | \n",
+ " -0.009439 | \n",
+ " 0.024375 | \n",
+ " -0.032553 | \n",
+ " -0.042185 | \n",
+ " 0.013782 | \n",
+ " 0.011320 | \n",
+ " -0.014862 | \n",
+ " -0.010413 | \n",
+ " -0.020359 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3038308638 | \n",
+ " 2015-02-23 | \n",
+ " 755 | \n",
+ " 665 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.069860 | \n",
+ " 3.401786 | \n",
+ " 0.970 | \n",
+ " ... | \n",
+ " -0.026273 | \n",
+ " -0.008139 | \n",
+ " 0.030285 | \n",
+ " -0.029902 | \n",
+ " -0.030887 | \n",
+ " 0.022481 | \n",
+ " -0.005476 | \n",
+ " -0.016279 | \n",
+ " -0.010138 | \n",
+ " -0.021454 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 332396536 | \n",
+ " 2011-07-09 | \n",
+ " 951 | \n",
+ " 951 | \n",
+ " True | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0.044264 | \n",
+ " 2.304511 | \n",
+ " 0.580 | \n",
+ " ... | \n",
+ " -0.005520 | \n",
+ " -0.005288 | \n",
+ " 0.017071 | \n",
+ " -0.033637 | \n",
+ " -0.040202 | \n",
+ " 0.041773 | \n",
+ " -0.009370 | \n",
+ " 0.003352 | \n",
+ " 0.009391 | \n",
+ " -0.042671 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 955082522479808512 | \n",
+ " 2018-01-21 | \n",
+ " 570 | \n",
+ " 533 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.049325 | \n",
+ " 4.714286 | \n",
+ " 0.355 | \n",
+ " ... | \n",
+ " 0.009959 | \n",
+ " 0.004695 | \n",
+ " 0.005555 | \n",
+ " -0.012851 | \n",
+ " -0.032229 | \n",
+ " 0.031443 | \n",
+ " 0.008163 | \n",
+ " -0.018501 | \n",
+ " -0.008724 | \n",
+ " -0.042027 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1547 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "users_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "user_labels = users_df.drop(columns=embeddings_cols)\n",
+ "user_labels.index = user_labels[\"user_id\"]\n",
+ "user_labels.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 258
+ },
+ "id": "6ljFelXnyGP6",
+ "outputId": "8315335f-8bbd-4291-85db-54352bea6b9e"
+ },
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id created_on status_count rt_count \\\n",
+ "user_id \n",
+ "3420436216 3420436216 2015-08-13 555 540 \n",
+ "108121958 108121958 2010-01-24 2 2 \n",
+ "3038308638 3038308638 2015-02-23 755 665 \n",
+ "332396536 332396536 2011-07-09 951 951 \n",
+ "955082522479808512 955082522479808512 2018-01-21 570 533 \n",
+ "\n",
+ " is_bot opinion_community is_q avg_toxicity \\\n",
+ "user_id \n",
+ "3420436216 True 0 False 0.056113 \n",
+ "108121958 False 0 False 0.456710 \n",
+ "3038308638 True 0 False 0.069860 \n",
+ "332396536 True 1 False 0.044264 \n",
+ "955082522479808512 True 0 False 0.049325 \n",
+ "\n",
+ " avg_fact_score bom_astroturf bom_overall \n",
+ "user_id \n",
+ "3420436216 1.983193 0.295 0.190 \n",
+ "108121958 NaN 0.580 0.110 \n",
+ "3038308638 3.401786 0.970 0.970 \n",
+ "332396536 2.304511 0.580 0.750 \n",
+ "955082522479808512 4.714286 0.355 0.225 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " created_on | \n",
+ " status_count | \n",
+ " rt_count | \n",
+ " is_bot | \n",
+ " opinion_community | \n",
+ " is_q | \n",
+ " avg_toxicity | \n",
+ " avg_fact_score | \n",
+ " bom_astroturf | \n",
+ " bom_overall | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3420436216 | \n",
+ " 3420436216 | \n",
+ " 2015-08-13 | \n",
+ " 555 | \n",
+ " 540 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.056113 | \n",
+ " 1.983193 | \n",
+ " 0.295 | \n",
+ " 0.190 | \n",
+ "
\n",
+ " \n",
+ " 108121958 | \n",
+ " 108121958 | \n",
+ " 2010-01-24 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.456710 | \n",
+ " NaN | \n",
+ " 0.580 | \n",
+ " 0.110 | \n",
+ "
\n",
+ " \n",
+ " 3038308638 | \n",
+ " 3038308638 | \n",
+ " 2015-02-23 | \n",
+ " 755 | \n",
+ " 665 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.069860 | \n",
+ " 3.401786 | \n",
+ " 0.970 | \n",
+ " 0.970 | \n",
+ "
\n",
+ " \n",
+ " 332396536 | \n",
+ " 332396536 | \n",
+ " 2011-07-09 | \n",
+ " 951 | \n",
+ " 951 | \n",
+ " True | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0.044264 | \n",
+ " 2.304511 | \n",
+ " 0.580 | \n",
+ " 0.750 | \n",
+ "
\n",
+ " \n",
+ " 955082522479808512 | \n",
+ " 955082522479808512 | \n",
+ " 2018-01-21 | \n",
+ " 570 | \n",
+ " 533 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.049325 | \n",
+ " 4.714286 | \n",
+ " 0.355 | \n",
+ " 0.225 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "user_labels",
+ "repr_error": "0"
+ }
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Merge user labels columns back in:"
+ ],
+ "metadata": {
+ "id": "7c1cNgrUkAiS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages = averages.merge(user_labels, left_index=True, right_index=True)\n",
+ "averages.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 287
+ },
+ "id": "yMcrnkt3yapA",
+ "outputId": "cdc4127c-4036-4ef2-fd2d-9f5a7ce7b022"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 openai_6 \\\n",
+ "user_id \n",
+ "2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 0.000960 \n",
+ "635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 -0.011840 \n",
+ "656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 -0.015786 \n",
+ "761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 -0.007047 \n",
+ "777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 0.019815 \n",
+ "\n",
+ " openai_7 openai_8 openai_9 ... created_on status_count \\\n",
+ "user_id ... \n",
+ "2952 -0.015426 -0.006430 0.001027 ... 2006-07-24 6 \n",
+ "635553 -0.010581 -0.010859 -0.003771 ... 2007-01-15 12 \n",
+ "656993 0.008556 -0.022145 -0.017026 ... 2007-01-17 1 \n",
+ "761154 -0.024175 0.001368 0.002065 ... 2007-02-09 4 \n",
+ "777554 -0.019171 -0.017594 -0.006209 ... 2007-02-17 1 \n",
+ "\n",
+ " rt_count is_bot opinion_community is_q avg_toxicity \\\n",
+ "user_id \n",
+ "2952 6 False 0 False 0.006899 \n",
+ "635553 12 False 0 False 0.077787 \n",
+ "656993 1 False 0 False 0.025031 \n",
+ "761154 0 False 0 False 0.172311 \n",
+ "777554 1 False 0 False 0.001660 \n",
+ "\n",
+ " avg_fact_score bom_astroturf bom_overall \n",
+ "user_id \n",
+ "2952 NaN 0.21 0.20 \n",
+ "635553 NaN 0.24 0.16 \n",
+ "656993 NaN 0.11 0.10 \n",
+ "761154 NaN 0.13 0.72 \n",
+ "777554 NaN 0.15 0.03 \n",
+ "\n",
+ "[5 rows x 1547 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " openai_5 | \n",
+ " openai_6 | \n",
+ " openai_7 | \n",
+ " openai_8 | \n",
+ " openai_9 | \n",
+ " ... | \n",
+ " created_on | \n",
+ " status_count | \n",
+ " rt_count | \n",
+ " is_bot | \n",
+ " opinion_community | \n",
+ " is_q | \n",
+ " avg_toxicity | \n",
+ " avg_fact_score | \n",
+ " bom_astroturf | \n",
+ " bom_overall | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2952 | \n",
+ " -0.023816 | \n",
+ " 0.002004 | \n",
+ " 0.004429 | \n",
+ " -0.019361 | \n",
+ " -0.009860 | \n",
+ " 0.004878 | \n",
+ " 0.000960 | \n",
+ " -0.015426 | \n",
+ " -0.006430 | \n",
+ " 0.001027 | \n",
+ " ... | \n",
+ " 2006-07-24 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.006899 | \n",
+ " NaN | \n",
+ " 0.21 | \n",
+ " 0.20 | \n",
+ "
\n",
+ " \n",
+ " 635553 | \n",
+ " -0.030022 | \n",
+ " -0.006063 | \n",
+ " 0.017259 | \n",
+ " -0.018501 | \n",
+ " -0.008536 | \n",
+ " 0.004416 | \n",
+ " -0.011840 | \n",
+ " -0.010581 | \n",
+ " -0.010859 | \n",
+ " -0.003771 | \n",
+ " ... | \n",
+ " 2007-01-15 | \n",
+ " 12 | \n",
+ " 12 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.077787 | \n",
+ " NaN | \n",
+ " 0.24 | \n",
+ " 0.16 | \n",
+ "
\n",
+ " \n",
+ " 656993 | \n",
+ " -0.010723 | \n",
+ " 0.008235 | \n",
+ " 0.004192 | \n",
+ " -0.040441 | \n",
+ " -0.015172 | \n",
+ " 0.012798 | \n",
+ " -0.015786 | \n",
+ " 0.008556 | \n",
+ " -0.022145 | \n",
+ " -0.017026 | \n",
+ " ... | \n",
+ " 2007-01-17 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.025031 | \n",
+ " NaN | \n",
+ " 0.11 | \n",
+ " 0.10 | \n",
+ "
\n",
+ " \n",
+ " 761154 | \n",
+ " -0.021389 | \n",
+ " -0.004747 | \n",
+ " 0.006925 | \n",
+ " -0.017395 | \n",
+ " -0.011900 | \n",
+ " 0.018309 | \n",
+ " -0.007047 | \n",
+ " -0.024175 | \n",
+ " 0.001368 | \n",
+ " 0.002065 | \n",
+ " ... | \n",
+ " 2007-02-09 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.172311 | \n",
+ " NaN | \n",
+ " 0.13 | \n",
+ " 0.72 | \n",
+ "
\n",
+ " \n",
+ " 777554 | \n",
+ " -0.009369 | \n",
+ " -0.009612 | \n",
+ " 0.012470 | \n",
+ " 0.005079 | \n",
+ " -0.019303 | \n",
+ " -0.010459 | \n",
+ " 0.019815 | \n",
+ " -0.019171 | \n",
+ " -0.017594 | \n",
+ " -0.006209 | \n",
+ " ... | \n",
+ " 2007-02-17 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.001660 | \n",
+ " NaN | \n",
+ " 0.15 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1547 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "averages"
+ }
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Saving to drive:"
+ ],
+ "metadata": {
+ "id": "mXoU2gHRkFCP"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz\")\n",
+ "\n",
+ "averages.to_csv(csv_filepath, compression=\"gzip\")"
+ ],
+ "metadata": {
+ "id": "MJwp84QQiJor"
+ },
+ "execution_count": 21,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/openai_embeddings_v2/README.md b/notebooks/openai_embeddings_v2/README.md
index f3877c9..c1e97a5 100644
--- a/notebooks/openai_embeddings_v2/README.md
+++ b/notebooks/openai_embeddings_v2/README.md
@@ -8,6 +8,12 @@
This supercedes earlier approach to fetch embeddings. In this second attempt we are grabbing user-level as well as tweet-level embeddings, to compare the difference in these approaches.
-The "Exporting Embeddings" notebook takes embeddings stored in BigQuery (see app/openai_embeddings_v2/README.md), and exports them to CSV / parquet files on Google Drive for easier and cheaper access
+ 1. The "Exporting Embeddings" notebook takes embeddings stored in BigQuery (see app/openai_embeddings_v2/README.md), and exports them to CSV / parquet files on Google Drive for easier and cheaper access
-The "Analysis Template" notebook provides an example of how to load the files from drive for further analysis.
+
+ 2. The "De duping and Averaging" notebook de-duplicates the status embeddings, calculates the average status (tweet-level) embedding for each user, and saves both datasets to drive (the de-duplicated statuses as a parquet file, and the per-user averages as a CSV file).
+
+
+ 3. The "Analysis Template" notebook provides an example of how to load the files from drive for further analysis.
+
+ 4. The "User vs Tweet Level Embeddings" notebook performs dimensionality reduction on user embeddings vs tweet embeddings averaged for each user. The results are saved to drive, and then copied to the "results/openai_embeddings_v2" folder in this repo.
diff --git a/notebooks/openai_embeddings_v2/User_vs_Tweet_Level_Embeddings_(Impeachment_2020)_Dimensionality_Reduction_(2024).ipynb b/notebooks/openai_embeddings_v2/User_vs_Tweet_Level_Embeddings_(Impeachment_2020)_Dimensionality_Reduction_(2024).ipynb
new file mode 100644
index 0000000..a92895c
--- /dev/null
+++ b/notebooks/openai_embeddings_v2/User_vs_Tweet_Level_Embeddings_(Impeachment_2020)_Dimensionality_Reduction_(2024).ipynb
@@ -0,0 +1,5606 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "collapsed_sections": [
+ "fbq4scJaCrHN",
+ "B40ykeY2-Nmr",
+ "s0VpVSEB81Tt"
+ ],
+ "machine_shape": "hm"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_-39w0IS18f-"
+ },
+ "source": [
+ "We fetched user-level and tweet-level OpenAI embeddings, stored them in BigQuery (BQ), and copied the data to CSV files on Drive.\n",
+ "\n",
+ "Then we de-duped the status embeddings and calculated the average status embeddings for each user, and saved these CSV files on drive.\n",
+ "\n",
+ "This notebook provides a preliminary analysis of user-level vs tweet-level embeddings, focusing first on dimensionality reduction."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Setup"
+ ],
+ "metadata": {
+ "id": "9SWh9Z5xiGUj"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Package installation:"
+ ],
+ "metadata": {
+ "id": "rwAQK1yTiHaR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%capture\n",
+ "# kaleido is what plotly uses to export static images (fig.write_image)\n",
+ "# pinned to the version this notebook was run with (see the `pip list` output below)\n",
+ "%pip install -q kaleido==0.2.1"
+ ],
+ "metadata": {
+ "id": "ObcKYuAshyYD"
+ },
+ "execution_count": 161,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "You may need to restart the session after installing, before continuing."
+ ],
+ "metadata": {
+ "id": "CalQUk_WiMYd"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip list | grep kaleido"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xf9bm0I1iJQo",
+ "outputId": "0c62227f-6414-4b49-ed83-eec7d081ac55"
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "kaleido 0.2.1\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Google Drive"
+ ],
+ "metadata": {
+ "id": "FF154lGK_1N6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "from google.colab import drive\n",
+ "\n",
+ "drive.mount('/content/drive')\n",
+ "print(os.getcwd(), os.listdir(os.getcwd()))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "i_eMkJ5fpKDp",
+ "outputId": "d00abb25-9536-478b-da96-8684f66b3aa4"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
+ "/content ['.config', 'drive', 'results_pca_2', 'user_results_pca_2', 'sample_data']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5OKjyFQ0owen",
+ "outputId": "9deef9fe-a8a7-4ceb-8e27-2e4e043f030a"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/drive/MyDrive/Research/DS Research Shared 2024\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "source": [
+ "# you might need to create a google drive SHORTCUT that has this same path\n",
+ "# ... or update the path to use your own google drive organization\n",
+ "#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'\n",
+ "#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'\n",
+ "DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'\n",
+ "\n",
+ "print(DIRPATH)\n",
+ "os.path.isdir(DIRPATH)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "New project-based directory structure for 2024:\n",
+ "\n",
+ "https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link"
+ ],
+ "metadata": {
+ "id": "dNCNBPJkg9St"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "DATA_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"data\")\n",
+ "os.path.isdir(DATA_DIRPATH)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "jjkYs5KJ99LX",
+ "outputId": "b4cca1b9-3f29-436a-f4f4-8d593881814a"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "os.listdir(DATA_DIRPATH)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "x9QGLQH_dUGV",
+ "outputId": "c300723f-3451-4011-884f-a2be599a3912"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['botometer_sample_max_50_openai_user_embeddings.csv.gz',\n",
+ " 'botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz',\n",
+ " 'botometer_sample_max_50_openai_status_embeddings_v3.csv.gz',\n",
+ " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip',\n",
+ " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip',\n",
+ " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The \"unpacked\" versions have a column per embedding, and are generally easier to work with.\n",
+ "\n",
+ "The files we will be working with are:\n",
+ " + \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\" (user level embeddings) and\n",
+ " + \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz\" (average status-level embeddings)."
+ ],
+ "metadata": {
+ "id": "JCNrEG7vhOKo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "RESULTS_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"results\")\n",
+ "os.makedirs(RESULTS_DIRPATH, exist_ok=True)\n",
+ "os.path.isdir(RESULTS_DIRPATH)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lHdKnZzCj5UB",
+ "outputId": "d67645aa-be31-4bef-b9b3-0f9465a73575"
+ },
+ "execution_count": 55,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 55
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Colors"
+ ],
+ "metadata": {
+ "id": "fbq4scJaCrHN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# https://github.com/s2t2/openai-embeddings-2023/blob/main/app/colors.py\n",
+ "\n",
+ "#GREY = \"#ccc\"\n",
+ "#PURPLE = \"#7E57C2\"\n",
+ "\n",
+ "# colorbrewer scales\n",
+ "# light --> dark\n",
+ "BLUES = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1', '#6baed6', '#4292c6', '#2171b5', '#08519c', '#08306b']\n",
+ "REDS = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272', '#fb6a4a', '#ef3b2c', '#cb181d', '#a50f15', '#67000d']\n",
+ "PURPLES = ['#fcfbfd', '#efedf5', '#dadaeb', '#bcbddc', '#9e9ac8', '#807dba', '#6a51a3', '#54278f', '#3f007d']\n",
+ "GREYS = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd', '#969696', '#737373', '#525252', '#252525', '#000000']\n",
+ "GREENS = [\"#edf8e9\",\"#c7e9c0\",\"#a1d99b\",\"#74c476\",\"#41ab5d\",\"#238b45\",\"#005a32\"]\n",
+ "ORANGES = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b', '#fd8d3c', '#f16913', '#d94801', '#a63603', '#7f2704']\n",
+ "BROWNS = [\"#C46200\", \"#964B00\"]\n",
+ "RD_PU = [\"#feebe2\",\"#fcc5c0\",\"#fa9fb5\",\"#f768a1\",\"#dd3497\",\"#ae017e\",\"#7a0177\"]\n",
+ "PU_RD = [\"#f1eef6\",\"#d4b9da\",\"#c994c7\",\"#df65b0\",\"#e7298a\",\"#ce1256\",\"#91003f\"]\n",
+ "\n",
+ "OPINION_COLORS_MAP = {\"Anti-Trump\": BLUES[5], \"Pro-Trump\": REDS[5]}\n",
+ "BOT_COLORS_MAP = {\"Human\": GREYS[3], \"Bot\": PURPLES[6]}\n",
+ "Q_COLORS_MAP = {\"Normal\": GREYS[3], \"Q-anon\": REDS[6]}\n",
+ "TOXIC_COLORS_MAP = {\"Toxic\": BROWNS[1], \"Normal\": GREYS[3]}\n",
+ "FACT_COLORS_MAP = {\"High Quality\": GREYS[3], \"Low Quality\": RD_PU[4]}\n",
+ "\n",
+ "FOURWAY_COLORS_MAP = {\n",
+ " \"Anti-Trump Human\": BLUES[3],\n",
+ " \"Anti-Trump Bot\": BLUES[6],\n",
+ "\n",
+ " \"Pro-Trump Human\": REDS[3],\n",
+ " \"Pro-Trump Bot\": REDS[6],\n",
+ "}\n",
+ "SIXWAY_COLORS_MAP = {\n",
+ " \"Anti-Trump Human\": BLUES[3],\n",
+ " \"Anti-Trump Bot\": BLUES[6],\n",
+ "\n",
+ " \"Pro-Trump Human\": REDS[3],\n",
+ " \"Pro-Trump Bot\": REDS[6],\n",
+ "\n",
+ " \"Q-anon Human\": REDS[4], # \"Pro-Trump Q-anon Human\"\n",
+ " \"Q-anon Bot\": REDS[7], # \"Pro-Trump Q-anon Bot\"\n",
+ "}\n",
+ "\n",
+ "\n",
+ "COLORS_MAP = {\n",
+ " \"bot_label\": BOT_COLORS_MAP,\n",
+ " \"opinion_label\": OPINION_COLORS_MAP,\n",
+ " \"q_label\": Q_COLORS_MAP,\n",
+ " \"toxic_label\": TOXIC_COLORS_MAP,\n",
+ " \"factual_label\": FACT_COLORS_MAP,\n",
+ "\n",
+ " \"fourway_label\": FOURWAY_COLORS_MAP,\n",
+ " \"sixway_label\": SIXWAY_COLORS_MAP,\n",
+ " \"bom_overall_label\": BOT_COLORS_MAP,\n",
+ " \"bom_astroturf_label\": BOT_COLORS_MAP,\n",
+ "}\n",
+ "\n",
+ "\n",
+ "BOT_LABEL_ORDER = [\"Human\", \"Bot\"]\n",
+ "CATEGORY_ORDERS = {\n",
+ " \"bot_label\": BOT_LABEL_ORDER,\n",
+ " \"bom_overall_label\": BOT_LABEL_ORDER,\n",
+ " \"bom_astroturf_label\": BOT_LABEL_ORDER,\n",
+ " \"opinion_label\": [\"Anti-Trump\", \"Pro-Trump\"],\n",
+ " \"q_label\": [\"Normal\", \"Q-anon\"],\n",
+ "\n",
+ " \"toxic_label\": [\"Normal\", \"Toxic\"],\n",
+ " \"factual_label\": [\"High Quality\", \"Low Quality\"],\n",
+ "\n",
+ " \"fourway_label\": list(FOURWAY_COLORS_MAP.keys()),\n",
+ " \"sixway_label\": list(SIXWAY_COLORS_MAP.keys()),\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "CStYodOfCtIT"
+ },
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Dimensionality Reduction"
+ ],
+ "metadata": {
+ "id": "2oYZLq_i5lQi"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#import warnings\n",
+ "#warnings.filterwarnings(\"ignore\", message=\".*The 'nopython' keyword.*\") # suppress umap warnings https://github.com/slundberg/shap/issues/2909\n",
+ "#warnings.simplefilter(\"ignore\", DeprecationWarning) # suppress warnings.warn(\"pkg_resources is deprecated as an API\", DeprecationWarning) https://discuss.python.org/t/how-to-silence-pkg-resources-warnings/28629/7"
+ ],
+ "metadata": {
+ "id": "9mJcPOh66Bqj"
+ },
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "from pandas import DataFrame\n",
+ "import plotly.express as px\n",
+ "from sklearn.decomposition import PCA\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "N_COMPONENTS = 2\n",
+ "#REDUCTION_RESULTS_DIRPATH = \"results\"\n",
+ "FIG_SHOW = True\n",
+ "FIG_SAVE = False\n",
+ "\n",
+ "class ReductionPipeline:\n",
+ "    \"\"\"Standardizes a feature matrix, reduces it with PCA, and plots the resulting components.\n",
+ "\n",
+ "    Adapted from: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/reduction/pipeline.py\n",
+ "\n",
+ "    NOTE: `reducer_type` only affects the chart title and the names of saved files;\n",
+ "    `perform` always fits a PCA, whichever reducer type is requested.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, x, labels, target, n_components=N_COMPONENTS, reducer_type=\"PCA\", results_dirpath=None):\n",
+ "        \"\"\"\n",
+ "        Params:\n",
+ "            x (DataFrame): numeric feature columns, one row per observation.\n",
+ "            labels (DataFrame): label columns to merge back in for plotting (assumes the same index as `x`).\n",
+ "            target (str): name of the label column of interest (stored on the instance).\n",
+ "            n_components (int): number of components to keep (the charts support 2 or 3).\n",
+ "            reducer_type (str): \"PCA\", \"T-SNE\" or \"UMAP\" (any other value raises a KeyError).\n",
+ "            results_dirpath (str): where saved figures go, defaults to \"results_{reducer_name}_{n_components}\".\n",
+ "        \"\"\"\n",
+ "        self.x = x.copy()\n",
+ "        self.labels = labels.copy()\n",
+ "        self.target = target\n",
+ "\n",
+ "        self.reducer_type = reducer_type\n",
+ "        self.reducer_name = {\"PCA\": \"pca\", \"T-SNE\": \"tsne\", \"UMAP\": \"umap\"}[self.reducer_type]\n",
+ "\n",
+ "        # z-score each column (done with pandas so the index and column names are kept).\n",
+ "        # a constant column has zero spread, so divide it by one instead of zero,\n",
+ "        # otherwise the resulting NaN values would make the PCA fail\n",
+ "        spread = self.x.std(axis=0)\n",
+ "        self.x_scaled = (self.x - self.x.mean(axis=0)) / spread.where(spread != 0, 1)\n",
+ "\n",
+ "        self.n_components = n_components\n",
+ "        self.component_names = [f\"component_{i+1}\" for i in range(self.n_components)]\n",
+ "\n",
+ "        # read by plot_embeddings when saving figures (the directory gets created at save time)\n",
+ "        self.results_dirpath = results_dirpath or f\"results_{self.reducer_name}_{self.n_components}\"\n",
+ "\n",
+ "\n",
+ "    def perform(self):\n",
+ "        \"\"\"Fits the PCA on the scaled features, and stores `pca`, `embeddings_df` and `loadings_df`.\"\"\"\n",
+ "        self.pca = PCA(n_components=self.n_components, random_state=99)\n",
+ "        print(self.pca)\n",
+ "\n",
+ "        embeddings = self.pca.fit_transform(self.x_scaled)\n",
+ "        print(\"EMBEDDINGS:\", embeddings.shape)\n",
+ "        self.embeddings_df = DataFrame(embeddings, columns=self.component_names, index=self.x.index)\n",
+ "\n",
+ "        print(\"EXPLAINED VARIANCE RATIO:\", self.pca.explained_variance_ratio_)\n",
+ "        print(\"EXPLAINED VARIANCE:\", self.pca.explained_variance_ratio_.sum().round(2))\n",
+ "\n",
+ "        # https://stackoverflow.com/questions/21217710/factor-loadings-using-sklearn/44728692#44728692\n",
+ "        loadings = self.pca.components_.T * np.sqrt(self.pca.explained_variance_)\n",
+ "        print(\"LOADINGS\", loadings.shape)\n",
+ "        # index by the input columns (`feature_names_in_` only exists when every column name is a string)\n",
+ "        self.loadings_df = DataFrame(loadings, columns=self.component_names, index=self.x_scaled.columns)\n",
+ "\n",
+ "\n",
+ "    def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, results_dirpath=None,\n",
+ "                        subtitle=None, text=None, size=None, hover_data=None,\n",
+ "                        color=None, color_map=None, color_scale=None, category_orders=None):\n",
+ "        \"\"\"Plots the components as a scatter chart (2D or 3D), after `perform` has been called.\n",
+ "\n",
+ "        Params:\n",
+ "            height (int): chart height.\n",
+ "            fig_show (bool): whether to display the chart.\n",
+ "            fig_save (bool): whether to write PNG and HTML versions of the chart.\n",
+ "            results_dirpath (str): where to save, overrides the directory set on the instance.\n",
+ "            subtitle (str): optional second line for the chart title.\n",
+ "            text, size, hover_data, color: passed to plotly (columns come from the embeddings and labels).\n",
+ "            color_map (dict): passed to plotly as `color_discrete_map`.\n",
+ "            color_scale: passed to plotly as `color_continuous_scale`.\n",
+ "            category_orders (dict): passed to plotly as `category_orders`.\n",
+ "\n",
+ "        Returns the plotly figure, or None when n_components is neither 2 nor 3.\n",
+ "        \"\"\"\n",
+ "        # ADD LABELS BACK FOR COLOR ETC. (ASSUMES INDEX IS THE SAME)\n",
+ "        chart_df = self.embeddings_df.merge(self.labels, left_index=True, right_index=True)\n",
+ "\n",
+ "        title = f\"Dimensionality Reduction Results ({self.reducer_type} n_components={self.n_components})\"\n",
+ "        if subtitle:\n",
+ "            title += f\"<br><sup>{subtitle}</sup>\"\n",
+ "\n",
+ "        chart_params = dict(x=\"component_1\", y=\"component_2\", title=title, height=height, hover_data=hover_data)\n",
+ "\n",
+ "        # only pass the optional styling params that were actually provided\n",
+ "        optional_params = {\n",
+ "            \"color\": color,\n",
+ "            \"color_discrete_map\": color_map,\n",
+ "            \"color_continuous_scale\": color_scale,\n",
+ "            \"category_orders\": category_orders,\n",
+ "            \"size\": size,\n",
+ "            \"text\": text,\n",
+ "        }\n",
+ "        chart_params.update({name: value for name, value in optional_params.items() if value})\n",
+ "\n",
+ "        if self.n_components == 2:\n",
+ "            fig = px.scatter(chart_df, **chart_params)\n",
+ "        elif self.n_components == 3:\n",
+ "            fig = px.scatter_3d(chart_df, z=\"component_3\", **chart_params)\n",
+ "        else:\n",
+ "            return None\n",
+ "\n",
+ "        if fig_show:\n",
+ "            fig.show()\n",
+ "\n",
+ "        if fig_save:\n",
+ "            results_dirpath = results_dirpath or self.results_dirpath\n",
+ "            os.makedirs(results_dirpath, exist_ok=True)\n",
+ "            filestem = os.path.join(results_dirpath, f\"{self.reducer_name}_{self.n_components}\")\n",
+ "            fig.write_image(f\"{filestem}.png\")\n",
+ "            fig.write_html(f\"{filestem}.html\")\n",
+ "\n",
+ "        return fig\n"
+ ],
+ "metadata": {
+ "id": "hqPtK9_j5nBR"
+ },
+ "execution_count": 58,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## User Embeddings"
+ ],
+ "metadata": {
+ "id": "TJUWWC48HcGk"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "7566 users"
+ ],
+ "metadata": {
+ "id": "CGpJ-kDaHfi5"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Loading"
+ ],
+ "metadata": {
+ "id": "B40ykeY2-Nmr"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Loading CSV from drive:"
+ ],
+ "metadata": {
+ "id": "1TYFGOn7Ow-P"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pandas import read_csv\n",
+ "\n",
+ "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\")\n",
+ "users_df = read_csv(csv_filepath, compression=\"gzip\")\n",
+ "print(users_df.shape)\n",
+ "print(users_df.columns)\n",
+ "users_df.head()"
+ ],
+ "metadata": {
+ "id": "V5m_ZmDFHeLx",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "outputId": "668fab3a-9906-45d5-98a4-e85b1fc467bb"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(7566, 1547)\n",
+ "Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',\n",
+ " 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',\n",
+ " 'bom_astroturf',\n",
+ " ...\n",
+ " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n",
+ " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n",
+ " 'openai_1534', 'openai_1535'],\n",
+ " dtype='object', length=1547)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id created_on status_count rt_count is_bot \\\n",
+ "0 3420436216 2015-08-13 555 540 True \n",
+ "1 108121958 2010-01-24 2 2 False \n",
+ "2 3038308638 2015-02-23 755 665 True \n",
+ "3 332396536 2011-07-09 951 951 True \n",
+ "4 955082522479808512 2018-01-21 570 533 True \n",
+ "\n",
+ " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n",
+ "0 0 False 0.056113 1.983193 0.295 ... \n",
+ "1 0 False 0.456710 NaN 0.580 ... \n",
+ "2 0 False 0.069860 3.401786 0.970 ... \n",
+ "3 1 False 0.044264 2.304511 0.580 ... \n",
+ "4 0 False 0.049325 4.714286 0.355 ... \n",
+ "\n",
+ " openai_1526 openai_1527 openai_1528 openai_1529 openai_1530 \\\n",
+ "0 -0.001867 -0.013167 0.020885 -0.022568 -0.033631 \n",
+ "1 0.017651 -0.009439 0.024375 -0.032553 -0.042185 \n",
+ "2 -0.026273 -0.008139 0.030285 -0.029902 -0.030887 \n",
+ "3 -0.005520 -0.005288 0.017071 -0.033637 -0.040202 \n",
+ "4 0.009959 0.004695 0.005555 -0.012851 -0.032229 \n",
+ "\n",
+ " openai_1531 openai_1532 openai_1533 openai_1534 openai_1535 \n",
+ "0 0.016153 0.024127 -0.017519 0.002636 -0.039838 \n",
+ "1 0.013782 0.011320 -0.014862 -0.010413 -0.020359 \n",
+ "2 0.022481 -0.005476 -0.016279 -0.010138 -0.021454 \n",
+ "3 0.041773 -0.009370 0.003352 0.009391 -0.042671 \n",
+ "4 0.031443 0.008163 -0.018501 -0.008724 -0.042027 \n",
+ "\n",
+ "[5 rows x 1547 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " created_on | \n",
+ " status_count | \n",
+ " rt_count | \n",
+ " is_bot | \n",
+ " opinion_community | \n",
+ " is_q | \n",
+ " avg_toxicity | \n",
+ " avg_fact_score | \n",
+ " bom_astroturf | \n",
+ " ... | \n",
+ " openai_1526 | \n",
+ " openai_1527 | \n",
+ " openai_1528 | \n",
+ " openai_1529 | \n",
+ " openai_1530 | \n",
+ " openai_1531 | \n",
+ " openai_1532 | \n",
+ " openai_1533 | \n",
+ " openai_1534 | \n",
+ " openai_1535 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3420436216 | \n",
+ " 2015-08-13 | \n",
+ " 555 | \n",
+ " 540 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.056113 | \n",
+ " 1.983193 | \n",
+ " 0.295 | \n",
+ " ... | \n",
+ " -0.001867 | \n",
+ " -0.013167 | \n",
+ " 0.020885 | \n",
+ " -0.022568 | \n",
+ " -0.033631 | \n",
+ " 0.016153 | \n",
+ " 0.024127 | \n",
+ " -0.017519 | \n",
+ " 0.002636 | \n",
+ " -0.039838 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 108121958 | \n",
+ " 2010-01-24 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.456710 | \n",
+ " NaN | \n",
+ " 0.580 | \n",
+ " ... | \n",
+ " 0.017651 | \n",
+ " -0.009439 | \n",
+ " 0.024375 | \n",
+ " -0.032553 | \n",
+ " -0.042185 | \n",
+ " 0.013782 | \n",
+ " 0.011320 | \n",
+ " -0.014862 | \n",
+ " -0.010413 | \n",
+ " -0.020359 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3038308638 | \n",
+ " 2015-02-23 | \n",
+ " 755 | \n",
+ " 665 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.069860 | \n",
+ " 3.401786 | \n",
+ " 0.970 | \n",
+ " ... | \n",
+ " -0.026273 | \n",
+ " -0.008139 | \n",
+ " 0.030285 | \n",
+ " -0.029902 | \n",
+ " -0.030887 | \n",
+ " 0.022481 | \n",
+ " -0.005476 | \n",
+ " -0.016279 | \n",
+ " -0.010138 | \n",
+ " -0.021454 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 332396536 | \n",
+ " 2011-07-09 | \n",
+ " 951 | \n",
+ " 951 | \n",
+ " True | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0.044264 | \n",
+ " 2.304511 | \n",
+ " 0.580 | \n",
+ " ... | \n",
+ " -0.005520 | \n",
+ " -0.005288 | \n",
+ " 0.017071 | \n",
+ " -0.033637 | \n",
+ " -0.040202 | \n",
+ " 0.041773 | \n",
+ " -0.009370 | \n",
+ " 0.003352 | \n",
+ " 0.009391 | \n",
+ " -0.042671 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 955082522479808512 | \n",
+ " 2018-01-21 | \n",
+ " 570 | \n",
+ " 533 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.049325 | \n",
+ " 4.714286 | \n",
+ " 0.355 | \n",
+ " ... | \n",
+ " 0.009959 | \n",
+ " 0.004695 | \n",
+ " 0.005555 | \n",
+ " -0.012851 | \n",
+ " -0.032229 | \n",
+ " 0.031443 | \n",
+ " 0.008163 | \n",
+ " -0.018501 | \n",
+ " -0.008724 | \n",
+ " -0.042027 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1547 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "users_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "users_df[\"user_id\"].nunique()"
+ ],
+ "metadata": {
+ "id": "nQGfxCyBHeIi",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "bd8b1f8c-eb53-43b0-e0e9-6804cd7dbc0e"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "7566"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "users_df[\"is_bot\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "JIwbbnB71suN",
+ "outputId": "d7baeb21-341b-4a5f-ff27-94ad7ed64569"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "False 4466\n",
+ "True 3100\n",
+ "Name: is_bot, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "users_df[\"opinion_community\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Yi8Qlxi_1spO",
+ "outputId": "8333d45e-a469-4388-fd5c-3b558bfb5715"
+ },
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 4891\n",
+ "1 2675\n",
+ "Name: opinion_community, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "users_df[\"avg_fact_score\"].info()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dG4-L7nDeQC-",
+ "outputId": "7efd044d-fb1b-4e39-f2de-6019a9bbc6b8"
+ },
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 7566 entries, 0 to 7565\n",
+ "Series name: avg_fact_score\n",
+ "Non-Null Count Dtype \n",
+ "-------------- ----- \n",
+ "3292 non-null float64\n",
+ "dtypes: float64(1)\n",
+ "memory usage: 59.2 KB\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "\n",
+ "from pandas import isnull\n",
+ "\n",
+ "def add_labels(users_df):\n",
+ "    \"\"\"Derives binary flags and display labels from the raw user attributes.\n",
+ "\n",
+ "    Applies the same labels as the original source code:\n",
+ "      https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51\n",
+ "      https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64\n",
+ "\n",
+ "    The new columns are added to the given frame in place, and that same frame is returned.\n",
+ "    \"\"\"\n",
+ "    binary_codes = {True: 1, False: 0}\n",
+ "    bom_names = {1: \"Bot\", 0: \"Human\"}\n",
+ "\n",
+ "    # opinion community and bot status:\n",
+ "    users_df[\"opinion_label\"] = users_df[\"opinion_community\"].map({0: \"Anti-Trump\", 1: \"Pro-Trump\"})\n",
+ "    users_df[\"bot_label\"] = users_df[\"is_bot\"].map({True: \"Bot\", False: \"Human\"})\n",
+ "    users_df[\"fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bot_label\"]\n",
+ "\n",
+ "    # language toxicity scores (0 low - 1 high)\n",
+ "    toxic_threshold = 0.1\n",
+ "    users_df[\"is_toxic\"] = (users_df[\"avg_toxicity\"] >= toxic_threshold).map(binary_codes)\n",
+ "    users_df[\"toxic_label\"] = users_df[\"is_toxic\"].map({1: \"Toxic\", 0: \"Normal\"})\n",
+ "\n",
+ "    # fact check / media quality scores (1 low - 5 high)\n",
+ "    fact_threshold = 3.0\n",
+ "\n",
+ "    def meets_fact_threshold(score):\n",
+ "        # some users have no avg_fact_score, so nulls are passed through as they are\n",
+ "        return score if isnull(score) else score >= fact_threshold\n",
+ "\n",
+ "    users_df[\"is_factual\"] = users_df[\"avg_fact_score\"].apply(meets_fact_threshold).map(binary_codes)\n",
+ "    users_df[\"factual_label\"] = users_df[\"is_factual\"].map({1: \"High Quality\", 0: \"Low Quality\"})\n",
+ "\n",
+ "    # botometer binary and labels (one pass per stage, to keep the column order):\n",
+ "    bom_score_cols = [\"bom_overall\", \"bom_astroturf\"]\n",
+ "    for score_col in bom_score_cols:\n",
+ "        users_df[f\"is_{score_col}\"] = users_df[score_col].round()\n",
+ "    for score_col in bom_score_cols:\n",
+ "        users_df[f\"{score_col}_label\"] = users_df[f\"is_{score_col}\"].map(bom_names)\n",
+ "    for score_col in bom_score_cols:\n",
+ "        users_df[f\"{score_col}_fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[f\"{score_col}_label\"]\n",
+ "\n",
+ "    return users_df\n",
+ "\n",
+ "\n",
+ "users_df = add_labels(users_df)\n",
+ "print(users_df.shape)\n",
+ "print(users_df.columns.tolist())\n",
+ "users_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "jK9I2mpri_ER",
+ "outputId": "da7541d9-8cdf-4517-bd7a-6bb1ca162d08"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(7566, 1560)\n",
+ "['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf', 'bom_overall', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58', 'openai_59', 'openai_60', 'openai_61', 'openai_62', 'openai_63', 'openai_64', 'openai_65', 'openai_66', 'openai_67', 'openai_68', 'openai_69', 'openai_70', 'openai_71', 'openai_72', 'openai_73', 'openai_74', 'openai_75', 'openai_76', 'openai_77', 'openai_78', 'openai_79', 'openai_80', 'openai_81', 'openai_82', 'openai_83', 'openai_84', 'openai_85', 'openai_86', 'openai_87', 'openai_88', 'openai_89', 'openai_90', 'openai_91', 'openai_92', 'openai_93', 'openai_94', 'openai_95', 'openai_96', 'openai_97', 'openai_98', 'openai_99', 'openai_100', 'openai_101', 'openai_102', 'openai_103', 'openai_104', 'openai_105', 'openai_106', 'openai_107', 'openai_108', 'openai_109', 'openai_110', 'openai_111', 'openai_112', 'openai_113', 'openai_114', 'openai_115', 'openai_116', 'openai_117', 'openai_118', 'openai_119', 'openai_120', 'openai_121', 'openai_122', 'openai_123', 'openai_124', 'openai_125', 'openai_126', 'openai_127', 'openai_128', 'openai_129', 'openai_130', 'openai_131', 'openai_132', 'openai_133', 'openai_134', 'openai_135', 'openai_136', 'openai_137', 'openai_138', 
'openai_139', 'openai_140', 'openai_141', 'openai_142', 'openai_143', 'openai_144', 'openai_145', 'openai_146', 'openai_147', 'openai_148', 'openai_149', 'openai_150', 'openai_151', 'openai_152', 'openai_153', 'openai_154', 'openai_155', 'openai_156', 'openai_157', 'openai_158', 'openai_159', 'openai_160', 'openai_161', 'openai_162', 'openai_163', 'openai_164', 'openai_165', 'openai_166', 'openai_167', 'openai_168', 'openai_169', 'openai_170', 'openai_171', 'openai_172', 'openai_173', 'openai_174', 'openai_175', 'openai_176', 'openai_177', 'openai_178', 'openai_179', 'openai_180', 'openai_181', 'openai_182', 'openai_183', 'openai_184', 'openai_185', 'openai_186', 'openai_187', 'openai_188', 'openai_189', 'openai_190', 'openai_191', 'openai_192', 'openai_193', 'openai_194', 'openai_195', 'openai_196', 'openai_197', 'openai_198', 'openai_199', 'openai_200', 'openai_201', 'openai_202', 'openai_203', 'openai_204', 'openai_205', 'openai_206', 'openai_207', 'openai_208', 'openai_209', 'openai_210', 'openai_211', 'openai_212', 'openai_213', 'openai_214', 'openai_215', 'openai_216', 'openai_217', 'openai_218', 'openai_219', 'openai_220', 'openai_221', 'openai_222', 'openai_223', 'openai_224', 'openai_225', 'openai_226', 'openai_227', 'openai_228', 'openai_229', 'openai_230', 'openai_231', 'openai_232', 'openai_233', 'openai_234', 'openai_235', 'openai_236', 'openai_237', 'openai_238', 'openai_239', 'openai_240', 'openai_241', 'openai_242', 'openai_243', 'openai_244', 'openai_245', 'openai_246', 'openai_247', 'openai_248', 'openai_249', 'openai_250', 'openai_251', 'openai_252', 'openai_253', 'openai_254', 'openai_255', 'openai_256', 'openai_257', 'openai_258', 'openai_259', 'openai_260', 'openai_261', 'openai_262', 'openai_263', 'openai_264', 'openai_265', 'openai_266', 'openai_267', 'openai_268', 'openai_269', 'openai_270', 'openai_271', 'openai_272', 'openai_273', 'openai_274', 'openai_275', 'openai_276', 'openai_277', 'openai_278', 'openai_279', 'openai_280', 
'openai_281', 'openai_282', 'openai_283', 'openai_284', 'openai_285', 'openai_286', 'openai_287', 'openai_288', 'openai_289', 'openai_290', 'openai_291', 'openai_292', 'openai_293', 'openai_294', 'openai_295', 'openai_296', 'openai_297', 'openai_298', 'openai_299', 'openai_300', 'openai_301', 'openai_302', 'openai_303', 'openai_304', 'openai_305', 'openai_306', 'openai_307', 'openai_308', 'openai_309', 'openai_310', 'openai_311', 'openai_312', 'openai_313', 'openai_314', 'openai_315', 'openai_316', 'openai_317', 'openai_318', 'openai_319', 'openai_320', 'openai_321', 'openai_322', 'openai_323', 'openai_324', 'openai_325', 'openai_326', 'openai_327', 'openai_328', 'openai_329', 'openai_330', 'openai_331', 'openai_332', 'openai_333', 'openai_334', 'openai_335', 'openai_336', 'openai_337', 'openai_338', 'openai_339', 'openai_340', 'openai_341', 'openai_342', 'openai_343', 'openai_344', 'openai_345', 'openai_346', 'openai_347', 'openai_348', 'openai_349', 'openai_350', 'openai_351', 'openai_352', 'openai_353', 'openai_354', 'openai_355', 'openai_356', 'openai_357', 'openai_358', 'openai_359', 'openai_360', 'openai_361', 'openai_362', 'openai_363', 'openai_364', 'openai_365', 'openai_366', 'openai_367', 'openai_368', 'openai_369', 'openai_370', 'openai_371', 'openai_372', 'openai_373', 'openai_374', 'openai_375', 'openai_376', 'openai_377', 'openai_378', 'openai_379', 'openai_380', 'openai_381', 'openai_382', 'openai_383', 'openai_384', 'openai_385', 'openai_386', 'openai_387', 'openai_388', 'openai_389', 'openai_390', 'openai_391', 'openai_392', 'openai_393', 'openai_394', 'openai_395', 'openai_396', 'openai_397', 'openai_398', 'openai_399', 'openai_400', 'openai_401', 'openai_402', 'openai_403', 'openai_404', 'openai_405', 'openai_406', 'openai_407', 'openai_408', 'openai_409', 'openai_410', 'openai_411', 'openai_412', 'openai_413', 'openai_414', 'openai_415', 'openai_416', 'openai_417', 'openai_418', 'openai_419', 'openai_420', 'openai_421', 'openai_422', 
'openai_423', 'openai_424', 'openai_425', 'openai_426', 'openai_427', 'openai_428', 'openai_429', 'openai_430', 'openai_431', 'openai_432', 'openai_433', 'openai_434', 'openai_435', 'openai_436', 'openai_437', 'openai_438', 'openai_439', 'openai_440', 'openai_441', 'openai_442', 'openai_443', 'openai_444', 'openai_445', 'openai_446', 'openai_447', 'openai_448', 'openai_449', 'openai_450', 'openai_451', 'openai_452', 'openai_453', 'openai_454', 'openai_455', 'openai_456', 'openai_457', 'openai_458', 'openai_459', 'openai_460', 'openai_461', 'openai_462', 'openai_463', 'openai_464', 'openai_465', 'openai_466', 'openai_467', 'openai_468', 'openai_469', 'openai_470', 'openai_471', 'openai_472', 'openai_473', 'openai_474', 'openai_475', 'openai_476', 'openai_477', 'openai_478', 'openai_479', 'openai_480', 'openai_481', 'openai_482', 'openai_483', 'openai_484', 'openai_485', 'openai_486', 'openai_487', 'openai_488', 'openai_489', 'openai_490', 'openai_491', 'openai_492', 'openai_493', 'openai_494', 'openai_495', 'openai_496', 'openai_497', 'openai_498', 'openai_499', 'openai_500', 'openai_501', 'openai_502', 'openai_503', 'openai_504', 'openai_505', 'openai_506', 'openai_507', 'openai_508', 'openai_509', 'openai_510', 'openai_511', 'openai_512', 'openai_513', 'openai_514', 'openai_515', 'openai_516', 'openai_517', 'openai_518', 'openai_519', 'openai_520', 'openai_521', 'openai_522', 'openai_523', 'openai_524', 'openai_525', 'openai_526', 'openai_527', 'openai_528', 'openai_529', 'openai_530', 'openai_531', 'openai_532', 'openai_533', 'openai_534', 'openai_535', 'openai_536', 'openai_537', 'openai_538', 'openai_539', 'openai_540', 'openai_541', 'openai_542', 'openai_543', 'openai_544', 'openai_545', 'openai_546', 'openai_547', 'openai_548', 'openai_549', 'openai_550', 'openai_551', 'openai_552', 'openai_553', 'openai_554', 'openai_555', 'openai_556', 'openai_557', 'openai_558', 'openai_559', 'openai_560', 'openai_561', 'openai_562', 'openai_563', 'openai_564', 
'openai_565', 'openai_566', 'openai_567', 'openai_568', 'openai_569', 'openai_570', 'openai_571', 'openai_572', 'openai_573', 'openai_574', 'openai_575', 'openai_576', 'openai_577', 'openai_578', 'openai_579', 'openai_580', 'openai_581', 'openai_582', 'openai_583', 'openai_584', 'openai_585', 'openai_586', 'openai_587', 'openai_588', 'openai_589', 'openai_590', 'openai_591', 'openai_592', 'openai_593', 'openai_594', 'openai_595', 'openai_596', 'openai_597', 'openai_598', 'openai_599', 'openai_600', 'openai_601', 'openai_602', 'openai_603', 'openai_604', 'openai_605', 'openai_606', 'openai_607', 'openai_608', 'openai_609', 'openai_610', 'openai_611', 'openai_612', 'openai_613', 'openai_614', 'openai_615', 'openai_616', 'openai_617', 'openai_618', 'openai_619', 'openai_620', 'openai_621', 'openai_622', 'openai_623', 'openai_624', 'openai_625', 'openai_626', 'openai_627', 'openai_628', 'openai_629', 'openai_630', 'openai_631', 'openai_632', 'openai_633', 'openai_634', 'openai_635', 'openai_636', 'openai_637', 'openai_638', 'openai_639', 'openai_640', 'openai_641', 'openai_642', 'openai_643', 'openai_644', 'openai_645', 'openai_646', 'openai_647', 'openai_648', 'openai_649', 'openai_650', 'openai_651', 'openai_652', 'openai_653', 'openai_654', 'openai_655', 'openai_656', 'openai_657', 'openai_658', 'openai_659', 'openai_660', 'openai_661', 'openai_662', 'openai_663', 'openai_664', 'openai_665', 'openai_666', 'openai_667', 'openai_668', 'openai_669', 'openai_670', 'openai_671', 'openai_672', 'openai_673', 'openai_674', 'openai_675', 'openai_676', 'openai_677', 'openai_678', 'openai_679', 'openai_680', 'openai_681', 'openai_682', 'openai_683', 'openai_684', 'openai_685', 'openai_686', 'openai_687', 'openai_688', 'openai_689', 'openai_690', 'openai_691', 'openai_692', 'openai_693', 'openai_694', 'openai_695', 'openai_696', 'openai_697', 'openai_698', 'openai_699', 'openai_700', 'openai_701', 'openai_702', 'openai_703', 'openai_704', 'openai_705', 'openai_706', 
'openai_707', 'openai_708', 'openai_709', 'openai_710', 'openai_711', 'openai_712', 'openai_713', 'openai_714', 'openai_715', 'openai_716', 'openai_717', 'openai_718', 'openai_719', 'openai_720', 'openai_721', 'openai_722', 'openai_723', 'openai_724', 'openai_725', 'openai_726', 'openai_727', 'openai_728', 'openai_729', 'openai_730', 'openai_731', 'openai_732', 'openai_733', 'openai_734', 'openai_735', 'openai_736', 'openai_737', 'openai_738', 'openai_739', 'openai_740', 'openai_741', 'openai_742', 'openai_743', 'openai_744', 'openai_745', 'openai_746', 'openai_747', 'openai_748', 'openai_749', 'openai_750', 'openai_751', 'openai_752', 'openai_753', 'openai_754', 'openai_755', 'openai_756', 'openai_757', 'openai_758', 'openai_759', 'openai_760', 'openai_761', 'openai_762', 'openai_763', 'openai_764', 'openai_765', 'openai_766', 'openai_767', 'openai_768', 'openai_769', 'openai_770', 'openai_771', 'openai_772', 'openai_773', 'openai_774', 'openai_775', 'openai_776', 'openai_777', 'openai_778', 'openai_779', 'openai_780', 'openai_781', 'openai_782', 'openai_783', 'openai_784', 'openai_785', 'openai_786', 'openai_787', 'openai_788', 'openai_789', 'openai_790', 'openai_791', 'openai_792', 'openai_793', 'openai_794', 'openai_795', 'openai_796', 'openai_797', 'openai_798', 'openai_799', 'openai_800', 'openai_801', 'openai_802', 'openai_803', 'openai_804', 'openai_805', 'openai_806', 'openai_807', 'openai_808', 'openai_809', 'openai_810', 'openai_811', 'openai_812', 'openai_813', 'openai_814', 'openai_815', 'openai_816', 'openai_817', 'openai_818', 'openai_819', 'openai_820', 'openai_821', 'openai_822', 'openai_823', 'openai_824', 'openai_825', 'openai_826', 'openai_827', 'openai_828', 'openai_829', 'openai_830', 'openai_831', 'openai_832', 'openai_833', 'openai_834', 'openai_835', 'openai_836', 'openai_837', 'openai_838', 'openai_839', 'openai_840', 'openai_841', 'openai_842', 'openai_843', 'openai_844', 'openai_845', 'openai_846', 'openai_847', 'openai_848', 
'openai_849', 'openai_850', 'openai_851', 'openai_852', 'openai_853', 'openai_854', 'openai_855', 'openai_856', 'openai_857', 'openai_858', 'openai_859', 'openai_860', 'openai_861', 'openai_862', 'openai_863', 'openai_864', 'openai_865', 'openai_866', 'openai_867', 'openai_868', 'openai_869', 'openai_870', 'openai_871', 'openai_872', 'openai_873', 'openai_874', 'openai_875', 'openai_876', 'openai_877', 'openai_878', 'openai_879', 'openai_880', 'openai_881', 'openai_882', 'openai_883', 'openai_884', 'openai_885', 'openai_886', 'openai_887', 'openai_888', 'openai_889', 'openai_890', 'openai_891', 'openai_892', 'openai_893', 'openai_894', 'openai_895', 'openai_896', 'openai_897', 'openai_898', 'openai_899', 'openai_900', 'openai_901', 'openai_902', 'openai_903', 'openai_904', 'openai_905', 'openai_906', 'openai_907', 'openai_908', 'openai_909', 'openai_910', 'openai_911', 'openai_912', 'openai_913', 'openai_914', 'openai_915', 'openai_916', 'openai_917', 'openai_918', 'openai_919', 'openai_920', 'openai_921', 'openai_922', 'openai_923', 'openai_924', 'openai_925', 'openai_926', 'openai_927', 'openai_928', 'openai_929', 'openai_930', 'openai_931', 'openai_932', 'openai_933', 'openai_934', 'openai_935', 'openai_936', 'openai_937', 'openai_938', 'openai_939', 'openai_940', 'openai_941', 'openai_942', 'openai_943', 'openai_944', 'openai_945', 'openai_946', 'openai_947', 'openai_948', 'openai_949', 'openai_950', 'openai_951', 'openai_952', 'openai_953', 'openai_954', 'openai_955', 'openai_956', 'openai_957', 'openai_958', 'openai_959', 'openai_960', 'openai_961', 'openai_962', 'openai_963', 'openai_964', 'openai_965', 'openai_966', 'openai_967', 'openai_968', 'openai_969', 'openai_970', 'openai_971', 'openai_972', 'openai_973', 'openai_974', 'openai_975', 'openai_976', 'openai_977', 'openai_978', 'openai_979', 'openai_980', 'openai_981', 'openai_982', 'openai_983', 'openai_984', 'openai_985', 'openai_986', 'openai_987', 'openai_988', 'openai_989', 'openai_990', 
'openai_991', 'openai_992', 'openai_993', 'openai_994', 'openai_995', 'openai_996', 'openai_997', 'openai_998', 'openai_999', 'openai_1000', 'openai_1001', 'openai_1002', 'openai_1003', 'openai_1004', 'openai_1005', 'openai_1006', 'openai_1007', 'openai_1008', 'openai_1009', 'openai_1010', 'openai_1011', 'openai_1012', 'openai_1013', 'openai_1014', 'openai_1015', 'openai_1016', 'openai_1017', 'openai_1018', 'openai_1019', 'openai_1020', 'openai_1021', 'openai_1022', 'openai_1023', 'openai_1024', 'openai_1025', 'openai_1026', 'openai_1027', 'openai_1028', 'openai_1029', 'openai_1030', 'openai_1031', 'openai_1032', 'openai_1033', 'openai_1034', 'openai_1035', 'openai_1036', 'openai_1037', 'openai_1038', 'openai_1039', 'openai_1040', 'openai_1041', 'openai_1042', 'openai_1043', 'openai_1044', 'openai_1045', 'openai_1046', 'openai_1047', 'openai_1048', 'openai_1049', 'openai_1050', 'openai_1051', 'openai_1052', 'openai_1053', 'openai_1054', 'openai_1055', 'openai_1056', 'openai_1057', 'openai_1058', 'openai_1059', 'openai_1060', 'openai_1061', 'openai_1062', 'openai_1063', 'openai_1064', 'openai_1065', 'openai_1066', 'openai_1067', 'openai_1068', 'openai_1069', 'openai_1070', 'openai_1071', 'openai_1072', 'openai_1073', 'openai_1074', 'openai_1075', 'openai_1076', 'openai_1077', 'openai_1078', 'openai_1079', 'openai_1080', 'openai_1081', 'openai_1082', 'openai_1083', 'openai_1084', 'openai_1085', 'openai_1086', 'openai_1087', 'openai_1088', 'openai_1089', 'openai_1090', 'openai_1091', 'openai_1092', 'openai_1093', 'openai_1094', 'openai_1095', 'openai_1096', 'openai_1097', 'openai_1098', 'openai_1099', 'openai_1100', 'openai_1101', 'openai_1102', 'openai_1103', 'openai_1104', 'openai_1105', 'openai_1106', 'openai_1107', 'openai_1108', 'openai_1109', 'openai_1110', 'openai_1111', 'openai_1112', 'openai_1113', 'openai_1114', 'openai_1115', 'openai_1116', 'openai_1117', 'openai_1118', 'openai_1119', 'openai_1120', 'openai_1121', 'openai_1122', 'openai_1123', 
'openai_1124', 'openai_1125', 'openai_1126', 'openai_1127', 'openai_1128', 'openai_1129', 'openai_1130', 'openai_1131', 'openai_1132', 'openai_1133', 'openai_1134', 'openai_1135', 'openai_1136', 'openai_1137', 'openai_1138', 'openai_1139', 'openai_1140', 'openai_1141', 'openai_1142', 'openai_1143', 'openai_1144', 'openai_1145', 'openai_1146', 'openai_1147', 'openai_1148', 'openai_1149', 'openai_1150', 'openai_1151', 'openai_1152', 'openai_1153', 'openai_1154', 'openai_1155', 'openai_1156', 'openai_1157', 'openai_1158', 'openai_1159', 'openai_1160', 'openai_1161', 'openai_1162', 'openai_1163', 'openai_1164', 'openai_1165', 'openai_1166', 'openai_1167', 'openai_1168', 'openai_1169', 'openai_1170', 'openai_1171', 'openai_1172', 'openai_1173', 'openai_1174', 'openai_1175', 'openai_1176', 'openai_1177', 'openai_1178', 'openai_1179', 'openai_1180', 'openai_1181', 'openai_1182', 'openai_1183', 'openai_1184', 'openai_1185', 'openai_1186', 'openai_1187', 'openai_1188', 'openai_1189', 'openai_1190', 'openai_1191', 'openai_1192', 'openai_1193', 'openai_1194', 'openai_1195', 'openai_1196', 'openai_1197', 'openai_1198', 'openai_1199', 'openai_1200', 'openai_1201', 'openai_1202', 'openai_1203', 'openai_1204', 'openai_1205', 'openai_1206', 'openai_1207', 'openai_1208', 'openai_1209', 'openai_1210', 'openai_1211', 'openai_1212', 'openai_1213', 'openai_1214', 'openai_1215', 'openai_1216', 'openai_1217', 'openai_1218', 'openai_1219', 'openai_1220', 'openai_1221', 'openai_1222', 'openai_1223', 'openai_1224', 'openai_1225', 'openai_1226', 'openai_1227', 'openai_1228', 'openai_1229', 'openai_1230', 'openai_1231', 'openai_1232', 'openai_1233', 'openai_1234', 'openai_1235', 'openai_1236', 'openai_1237', 'openai_1238', 'openai_1239', 'openai_1240', 'openai_1241', 'openai_1242', 'openai_1243', 'openai_1244', 'openai_1245', 'openai_1246', 'openai_1247', 'openai_1248', 'openai_1249', 'openai_1250', 'openai_1251', 'openai_1252', 'openai_1253', 'openai_1254', 'openai_1255', 'openai_1256', 
'openai_1257', 'openai_1258', 'openai_1259', 'openai_1260', 'openai_1261', 'openai_1262', 'openai_1263', 'openai_1264', 'openai_1265', 'openai_1266', 'openai_1267', 'openai_1268', 'openai_1269', 'openai_1270', 'openai_1271', 'openai_1272', 'openai_1273', 'openai_1274', 'openai_1275', 'openai_1276', 'openai_1277', 'openai_1278', 'openai_1279', 'openai_1280', 'openai_1281', 'openai_1282', 'openai_1283', 'openai_1284', 'openai_1285', 'openai_1286', 'openai_1287', 'openai_1288', 'openai_1289', 'openai_1290', 'openai_1291', 'openai_1292', 'openai_1293', 'openai_1294', 'openai_1295', 'openai_1296', 'openai_1297', 'openai_1298', 'openai_1299', 'openai_1300', 'openai_1301', 'openai_1302', 'openai_1303', 'openai_1304', 'openai_1305', 'openai_1306', 'openai_1307', 'openai_1308', 'openai_1309', 'openai_1310', 'openai_1311', 'openai_1312', 'openai_1313', 'openai_1314', 'openai_1315', 'openai_1316', 'openai_1317', 'openai_1318', 'openai_1319', 'openai_1320', 'openai_1321', 'openai_1322', 'openai_1323', 'openai_1324', 'openai_1325', 'openai_1326', 'openai_1327', 'openai_1328', 'openai_1329', 'openai_1330', 'openai_1331', 'openai_1332', 'openai_1333', 'openai_1334', 'openai_1335', 'openai_1336', 'openai_1337', 'openai_1338', 'openai_1339', 'openai_1340', 'openai_1341', 'openai_1342', 'openai_1343', 'openai_1344', 'openai_1345', 'openai_1346', 'openai_1347', 'openai_1348', 'openai_1349', 'openai_1350', 'openai_1351', 'openai_1352', 'openai_1353', 'openai_1354', 'openai_1355', 'openai_1356', 'openai_1357', 'openai_1358', 'openai_1359', 'openai_1360', 'openai_1361', 'openai_1362', 'openai_1363', 'openai_1364', 'openai_1365', 'openai_1366', 'openai_1367', 'openai_1368', 'openai_1369', 'openai_1370', 'openai_1371', 'openai_1372', 'openai_1373', 'openai_1374', 'openai_1375', 'openai_1376', 'openai_1377', 'openai_1378', 'openai_1379', 'openai_1380', 'openai_1381', 'openai_1382', 'openai_1383', 'openai_1384', 'openai_1385', 'openai_1386', 'openai_1387', 'openai_1388', 'openai_1389', 
'openai_1390', 'openai_1391', 'openai_1392', 'openai_1393', 'openai_1394', 'openai_1395', 'openai_1396', 'openai_1397', 'openai_1398', 'openai_1399', 'openai_1400', 'openai_1401', 'openai_1402', 'openai_1403', 'openai_1404', 'openai_1405', 'openai_1406', 'openai_1407', 'openai_1408', 'openai_1409', 'openai_1410', 'openai_1411', 'openai_1412', 'openai_1413', 'openai_1414', 'openai_1415', 'openai_1416', 'openai_1417', 'openai_1418', 'openai_1419', 'openai_1420', 'openai_1421', 'openai_1422', 'openai_1423', 'openai_1424', 'openai_1425', 'openai_1426', 'openai_1427', 'openai_1428', 'openai_1429', 'openai_1430', 'openai_1431', 'openai_1432', 'openai_1433', 'openai_1434', 'openai_1435', 'openai_1436', 'openai_1437', 'openai_1438', 'openai_1439', 'openai_1440', 'openai_1441', 'openai_1442', 'openai_1443', 'openai_1444', 'openai_1445', 'openai_1446', 'openai_1447', 'openai_1448', 'openai_1449', 'openai_1450', 'openai_1451', 'openai_1452', 'openai_1453', 'openai_1454', 'openai_1455', 'openai_1456', 'openai_1457', 'openai_1458', 'openai_1459', 'openai_1460', 'openai_1461', 'openai_1462', 'openai_1463', 'openai_1464', 'openai_1465', 'openai_1466', 'openai_1467', 'openai_1468', 'openai_1469', 'openai_1470', 'openai_1471', 'openai_1472', 'openai_1473', 'openai_1474', 'openai_1475', 'openai_1476', 'openai_1477', 'openai_1478', 'openai_1479', 'openai_1480', 'openai_1481', 'openai_1482', 'openai_1483', 'openai_1484', 'openai_1485', 'openai_1486', 'openai_1487', 'openai_1488', 'openai_1489', 'openai_1490', 'openai_1491', 'openai_1492', 'openai_1493', 'openai_1494', 'openai_1495', 'openai_1496', 'openai_1497', 'openai_1498', 'openai_1499', 'openai_1500', 'openai_1501', 'openai_1502', 'openai_1503', 'openai_1504', 'openai_1505', 'openai_1506', 'openai_1507', 'openai_1508', 'openai_1509', 'openai_1510', 'openai_1511', 'openai_1512', 'openai_1513', 'openai_1514', 'openai_1515', 'openai_1516', 'openai_1517', 'openai_1518', 'openai_1519', 'openai_1520', 'openai_1521', 'openai_1522', 
'openai_1523', 'openai_1524', 'openai_1525', 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529', 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533', 'openai_1534', 'openai_1535', 'opinion_label', 'bot_label', 'fourway_label', 'is_toxic', 'toxic_label', 'is_factual', 'factual_label', 'is_bom_overall', 'is_bom_astroturf', 'bom_overall_label', 'bom_astroturf_label', 'bom_overall_fourway_label', 'bom_astroturf_fourway_label']\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id created_on status_count rt_count is_bot \\\n",
+ "0 3420436216 2015-08-13 555 540 True \n",
+ "1 108121958 2010-01-24 2 2 False \n",
+ "2 3038308638 2015-02-23 755 665 True \n",
+ "3 332396536 2011-07-09 951 951 True \n",
+ "4 955082522479808512 2018-01-21 570 533 True \n",
+ "\n",
+ " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n",
+ "0 0 False 0.056113 1.983193 0.295 ... \n",
+ "1 0 False 0.456710 NaN 0.580 ... \n",
+ "2 0 False 0.069860 3.401786 0.970 ... \n",
+ "3 1 False 0.044264 2.304511 0.580 ... \n",
+ "4 0 False 0.049325 4.714286 0.355 ... \n",
+ "\n",
+ " is_toxic toxic_label is_factual factual_label is_bom_overall \\\n",
+ "0 0 Normal 0.0 Low Quality 0.0 \n",
+ "1 1 Toxic NaN NaN 0.0 \n",
+ "2 0 Normal 1.0 High Quality 1.0 \n",
+ "3 0 Normal 0.0 Low Quality 1.0 \n",
+ "4 0 Normal 1.0 High Quality 0.0 \n",
+ "\n",
+ " is_bom_astroturf bom_overall_label bom_astroturf_label \\\n",
+ "0 0.0 Human Human \n",
+ "1 1.0 Human Bot \n",
+ "2 1.0 Bot Bot \n",
+ "3 1.0 Bot Bot \n",
+ "4 0.0 Human Human \n",
+ "\n",
+ " bom_overall_fourway_label bom_astroturf_fourway_label \n",
+ "0 Anti-Trump Human Anti-Trump Human \n",
+ "1 Anti-Trump Human Anti-Trump Bot \n",
+ "2 Anti-Trump Bot Anti-Trump Bot \n",
+ "3 Pro-Trump Bot Pro-Trump Bot \n",
+ "4 Anti-Trump Human Anti-Trump Human \n",
+ "\n",
+ "[5 rows x 1560 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " created_on | \n",
+ " status_count | \n",
+ " rt_count | \n",
+ " is_bot | \n",
+ " opinion_community | \n",
+ " is_q | \n",
+ " avg_toxicity | \n",
+ " avg_fact_score | \n",
+ " bom_astroturf | \n",
+ " ... | \n",
+ " is_toxic | \n",
+ " toxic_label | \n",
+ " is_factual | \n",
+ " factual_label | \n",
+ " is_bom_overall | \n",
+ " is_bom_astroturf | \n",
+ " bom_overall_label | \n",
+ " bom_astroturf_label | \n",
+ " bom_overall_fourway_label | \n",
+ " bom_astroturf_fourway_label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3420436216 | \n",
+ " 2015-08-13 | \n",
+ " 555 | \n",
+ " 540 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.056113 | \n",
+ " 1.983193 | \n",
+ " 0.295 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " 0.0 | \n",
+ " Low Quality | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Human | \n",
+ " Human | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 108121958 | \n",
+ " 2010-01-24 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.456710 | \n",
+ " NaN | \n",
+ " 0.580 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " Toxic | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " Human | \n",
+ " Bot | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Bot | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3038308638 | \n",
+ " 2015-02-23 | \n",
+ " 755 | \n",
+ " 665 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.069860 | \n",
+ " 3.401786 | \n",
+ " 0.970 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " 1.0 | \n",
+ " High Quality | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " Bot | \n",
+ " Bot | \n",
+ " Anti-Trump Bot | \n",
+ " Anti-Trump Bot | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 332396536 | \n",
+ " 2011-07-09 | \n",
+ " 951 | \n",
+ " 951 | \n",
+ " True | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0.044264 | \n",
+ " 2.304511 | \n",
+ " 0.580 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " 0.0 | \n",
+ " Low Quality | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " Bot | \n",
+ " Bot | \n",
+ " Pro-Trump Bot | \n",
+ " Pro-Trump Bot | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 955082522479808512 | \n",
+ " 2018-01-21 | \n",
+ " 570 | \n",
+ " 533 | \n",
+ " True | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.049325 | \n",
+ " 4.714286 | \n",
+ " 0.355 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " 1.0 | \n",
+ " High Quality | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Human | \n",
+ " Human | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1560 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "users_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# value_counts() drops NaN by default, so users whose is_factual value is missing are not counted here.\n",
+ "users_df.loc[:, \"is_factual\"].value_counts(dropna=True)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "CU_qpBVcjFD4",
+ "outputId": "59ade136-f5e2-42ff-e000-9339d7ff2e76"
+ },
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.0 1696\n",
+ "1.0 1596\n",
+ "Name: is_factual, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Label counterpart of is_factual; rows whose label is NaN are left out by the default dropna=True.\n",
+ "users_df.loc[:, \"factual_label\"].value_counts(dropna=True)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PYOPG5Hcgw3M",
+ "outputId": "1962ae13-31b3-4702-9348-94c7fd3e3e50"
+ },
+ "execution_count": 22,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Low Quality 1696\n",
+ "High Quality 1596\n",
+ "Name: factual_label, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 22
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Class balance of the binary toxicity flag.\n",
+ "users_df.loc[:, \"is_toxic\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "pbaa5rTJh5NY",
+ "outputId": "4ef18db3-5815-48ea-e5e9-884d81e1ba16"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 6132\n",
+ "1 1434\n",
+ "Name: is_toxic, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Class balance of the toxicity label (Normal vs. Toxic).\n",
+ "users_df.loc[:, \"toxic_label\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3diIW61Ng1Cy",
+ "outputId": "4c2f628b-ae92-442a-c90c-e924f69a8dfc"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Normal 6132\n",
+ "Toxic 1434\n",
+ "Name: toxic_label, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Class balance of the bot label (Human vs. Bot).\n",
+ "users_df.loc[:, \"bot_label\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "T31nFSuniKdY",
+ "outputId": "435b9e13-4f3b-4416-8805-a62967bdf80a"
+ },
+ "execution_count": 25,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Human 4466\n",
+ "Bot 3100\n",
+ "Name: bot_label, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 25
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Class balance of the opinion label (Anti-Trump vs. Pro-Trump).\n",
+ "users_df.loc[:, \"opinion_label\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4QX5FgjMk3E0",
+ "outputId": "8692026e-ab73-421b-a96e-2f3476245f4f"
+ },
+ "execution_count": 26,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Anti-Trump 4891\n",
+ "Pro-Trump 2675\n",
+ "Name: opinion_label, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 26
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Class balance of the combined opinion x bot label (four classes).\n",
+ "users_df.loc[:, \"fourway_label\"].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wKHKOfGplAv8",
+ "outputId": "c87bac82-4272-42bc-b7e6-f8e0984ad3a4"
+ },
+ "execution_count": 27,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Anti-Trump Human 3010\n",
+ "Anti-Trump Bot 1881\n",
+ "Pro-Trump Human 1456\n",
+ "Pro-Trump Bot 1219\n",
+ "Name: fourway_label, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 27
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Splitting"
+ ],
+ "metadata": {
+ "id": "s0VpVSEB81Tt"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Key the rows by user_id (the user_id column itself is kept) so frames sliced from users_df stay keyed by user.\n",
+ "users_df = users_df.set_index(\"user_id\", drop=False)"
+ ],
+ "metadata": {
+ "id": "sCgftz6i9MCT"
+ },
+ "execution_count": 28,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Embedding features are the columns named exactly `openai_<int>`. Match the whole name rather than\n",
+ "# the substring \"openai\" so that no label or metadata column can slip into the feature set.\n",
+ "embeddings_cols = [\n",
+ "    col for col in users_df.columns\n",
+ "    if col.startswith(\"openai_\") and col.removeprefix(\"openai_\").isdigit()\n",
+ "]\n",
+ "# Fail with a clear message instead of an IndexError on the preview line below.\n",
+ "assert embeddings_cols, \"expected at least one openai_<n> embedding column in users_df\"\n",
+ "print(len(embeddings_cols))\n",
+ "print(embeddings_cols[0], \"...\", embeddings_cols[-1])"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ChDT1h8R83Gz",
+ "outputId": "9e58ef40-f8d7-43f2-89f3-50b925b336ac"
+ },
+ "execution_count": 29,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "1536\n",
+ "openai_0 ... openai_1535\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Feature matrix: only the embedding columns, one row per user_id.\n",
+ "users_x = users_df.loc[:, embeddings_cols]\n",
+ "users_x.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "EigyhIpt9Kd2",
+ "outputId": "197ddc3c-523a-4c2e-f506-b8e41306a594"
+ },
+ "execution_count": 30,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " openai_0 openai_1 openai_2 openai_3 openai_4 \\\n",
+ "user_id \n",
+ "3420436216 -0.018802 -0.007904 0.013753 -0.000709 -0.013829 \n",
+ "108121958 -0.030552 -0.005330 0.014622 -0.015383 0.004529 \n",
+ "3038308638 -0.007297 0.001028 0.002483 -0.004990 -0.021728 \n",
+ "332396536 -0.018347 -0.007322 -0.009216 -0.023215 -0.007946 \n",
+ "955082522479808512 -0.024804 0.007517 0.008514 -0.022979 0.003899 \n",
+ "\n",
+ " openai_5 openai_6 openai_7 openai_8 openai_9 ... \\\n",
+ "user_id ... \n",
+ "3420436216 0.007897 0.018043 -0.015105 -0.006663 -0.000824 ... \n",
+ "108121958 0.022213 -0.004980 -0.008592 -0.004993 -0.007705 ... \n",
+ "3038308638 -0.003700 0.008174 0.004453 0.014321 -0.004018 ... \n",
+ "332396536 0.003921 0.023664 -0.009896 -0.001530 -0.010064 ... \n",
+ "955082522479808512 0.000756 0.024158 -0.016718 -0.003902 -0.008977 ... \n",
+ "\n",
+ " openai_1526 openai_1527 openai_1528 openai_1529 \\\n",
+ "user_id \n",
+ "3420436216 -0.001867 -0.013167 0.020885 -0.022568 \n",
+ "108121958 0.017651 -0.009439 0.024375 -0.032553 \n",
+ "3038308638 -0.026273 -0.008139 0.030285 -0.029902 \n",
+ "332396536 -0.005520 -0.005288 0.017071 -0.033637 \n",
+ "955082522479808512 0.009959 0.004695 0.005555 -0.012851 \n",
+ "\n",
+ " openai_1530 openai_1531 openai_1532 openai_1533 \\\n",
+ "user_id \n",
+ "3420436216 -0.033631 0.016153 0.024127 -0.017519 \n",
+ "108121958 -0.042185 0.013782 0.011320 -0.014862 \n",
+ "3038308638 -0.030887 0.022481 -0.005476 -0.016279 \n",
+ "332396536 -0.040202 0.041773 -0.009370 0.003352 \n",
+ "955082522479808512 -0.032229 0.031443 0.008163 -0.018501 \n",
+ "\n",
+ " openai_1534 openai_1535 \n",
+ "user_id \n",
+ "3420436216 0.002636 -0.039838 \n",
+ "108121958 -0.010413 -0.020359 \n",
+ "3038308638 -0.010138 -0.021454 \n",
+ "332396536 0.009391 -0.042671 \n",
+ "955082522479808512 -0.008724 -0.042027 \n",
+ "\n",
+ "[5 rows x 1536 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " openai_5 | \n",
+ " openai_6 | \n",
+ " openai_7 | \n",
+ " openai_8 | \n",
+ " openai_9 | \n",
+ " ... | \n",
+ " openai_1526 | \n",
+ " openai_1527 | \n",
+ " openai_1528 | \n",
+ " openai_1529 | \n",
+ " openai_1530 | \n",
+ " openai_1531 | \n",
+ " openai_1532 | \n",
+ " openai_1533 | \n",
+ " openai_1534 | \n",
+ " openai_1535 | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3420436216 | \n",
+ " -0.018802 | \n",
+ " -0.007904 | \n",
+ " 0.013753 | \n",
+ " -0.000709 | \n",
+ " -0.013829 | \n",
+ " 0.007897 | \n",
+ " 0.018043 | \n",
+ " -0.015105 | \n",
+ " -0.006663 | \n",
+ " -0.000824 | \n",
+ " ... | \n",
+ " -0.001867 | \n",
+ " -0.013167 | \n",
+ " 0.020885 | \n",
+ " -0.022568 | \n",
+ " -0.033631 | \n",
+ " 0.016153 | \n",
+ " 0.024127 | \n",
+ " -0.017519 | \n",
+ " 0.002636 | \n",
+ " -0.039838 | \n",
+ "
\n",
+ " \n",
+ " 108121958 | \n",
+ " -0.030552 | \n",
+ " -0.005330 | \n",
+ " 0.014622 | \n",
+ " -0.015383 | \n",
+ " 0.004529 | \n",
+ " 0.022213 | \n",
+ " -0.004980 | \n",
+ " -0.008592 | \n",
+ " -0.004993 | \n",
+ " -0.007705 | \n",
+ " ... | \n",
+ " 0.017651 | \n",
+ " -0.009439 | \n",
+ " 0.024375 | \n",
+ " -0.032553 | \n",
+ " -0.042185 | \n",
+ " 0.013782 | \n",
+ " 0.011320 | \n",
+ " -0.014862 | \n",
+ " -0.010413 | \n",
+ " -0.020359 | \n",
+ "
\n",
+ " \n",
+ " 3038308638 | \n",
+ " -0.007297 | \n",
+ " 0.001028 | \n",
+ " 0.002483 | \n",
+ " -0.004990 | \n",
+ " -0.021728 | \n",
+ " -0.003700 | \n",
+ " 0.008174 | \n",
+ " 0.004453 | \n",
+ " 0.014321 | \n",
+ " -0.004018 | \n",
+ " ... | \n",
+ " -0.026273 | \n",
+ " -0.008139 | \n",
+ " 0.030285 | \n",
+ " -0.029902 | \n",
+ " -0.030887 | \n",
+ " 0.022481 | \n",
+ " -0.005476 | \n",
+ " -0.016279 | \n",
+ " -0.010138 | \n",
+ " -0.021454 | \n",
+ "
\n",
+ " \n",
+ " 332396536 | \n",
+ " -0.018347 | \n",
+ " -0.007322 | \n",
+ " -0.009216 | \n",
+ " -0.023215 | \n",
+ " -0.007946 | \n",
+ " 0.003921 | \n",
+ " 0.023664 | \n",
+ " -0.009896 | \n",
+ " -0.001530 | \n",
+ " -0.010064 | \n",
+ " ... | \n",
+ " -0.005520 | \n",
+ " -0.005288 | \n",
+ " 0.017071 | \n",
+ " -0.033637 | \n",
+ " -0.040202 | \n",
+ " 0.041773 | \n",
+ " -0.009370 | \n",
+ " 0.003352 | \n",
+ " 0.009391 | \n",
+ " -0.042671 | \n",
+ "
\n",
+ " \n",
+ " 955082522479808512 | \n",
+ " -0.024804 | \n",
+ " 0.007517 | \n",
+ " 0.008514 | \n",
+ " -0.022979 | \n",
+ " 0.003899 | \n",
+ " 0.000756 | \n",
+ " 0.024158 | \n",
+ " -0.016718 | \n",
+ " -0.003902 | \n",
+ " -0.008977 | \n",
+ " ... | \n",
+ " 0.009959 | \n",
+ " 0.004695 | \n",
+ " 0.005555 | \n",
+ " -0.012851 | \n",
+ " -0.032229 | \n",
+ " 0.031443 | \n",
+ " 0.008163 | \n",
+ " -0.018501 | \n",
+ " -0.008724 | \n",
+ " -0.042027 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1536 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "users_x"
+ }
+ },
+ "metadata": {},
+ "execution_count": 30
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#user_labels = users_df.drop(columns=embeddings_cols)\n",
+ "#print(user_labels.columns.tolist())\n",
+ "#user_labels.head()"
+ ],
+ "metadata": {
+ "id": "g1VfnZf29x2a"
+ },
+ "execution_count": 31,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### PCA 2"
+ ],
+ "metadata": {
+ "id": "R1ATEaSs8n7c"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
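+ "# NOTE: warning observed when plotting (it originates inside plotly express, see the path below):\n",
+ "# /usr/local/lib/python3.10/dist-packages/plotly/express/_core.py:1223:\n",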
+ "# PerformanceWarning: DataFrame is highly fragmented.\n",
+ "# This is usually the result of calling `frame.insert` many times, which has poor performance.\n",
+ "# Consider joining all columns at once using pd.concat(axis=1) instead.\n",
+ "# To get a de-fragmented frame, use `newframe = frame.copy()`\n",
+ "# df_output[col_name] = to_unindexed_series(df_input[argument])\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "ct_LaFXK8zI8"
+ },
+ "execution_count": 124,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Label column used to color the reduced user embeddings.\n",
+ "# The dropdown previously offered \"is_factual\"; every other option, and the per-label loop further down\n",
+ "# (which indexes COLORS_MAP / CATEGORY_ORDERS by \"factual_label\"), uses the \"*_label\" naming, so offer that instead.\n",
+ "target = \"fourway_label\" #@param [\"bot_label\", \"opinion_label\", \"fourway_label\", \"toxic_label\", \"factual_label\"]\n",
+ "user_labels = users_df[target]\n",
+ "user_labels"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VhHH2e9H_rJr",
+ "outputId": "174c4540-1e42-4c74-935a-c60d76e5d25b"
+ },
+ "execution_count": 32,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "user_id\n",
+ "3420436216 Anti-Trump Bot\n",
+ "108121958 Anti-Trump Human\n",
+ "3038308638 Anti-Trump Bot\n",
+ "332396536 Pro-Trump Bot\n",
+ "955082522479808512 Anti-Trump Bot\n",
+ " ... \n",
+ "1620694747 Anti-Trump Bot\n",
+ "1047878200406069248 Anti-Trump Bot\n",
+ "823502850336624640 Anti-Trump Bot\n",
+ "26966663 Anti-Trump Bot\n",
+ "884121768428003329 Anti-Trump Bot\n",
+ "Name: fourway_label, Length: 7566, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 32
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# fit a 2-component reduction on the user embeddings (the printed summary shows PCA)\n",
+ "users_pipeline = ReductionPipeline(\n",
+ "    x=users_x,\n",
+ "    labels=user_labels,\n",
+ "    target=target,\n",
+ "    n_components=2,\n",
+ ")\n",
+ "users_pipeline.perform()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ogD2Z0g98njC",
+ "outputId": "e20eda04-f42f-4635-fc61-a47548dea7c8"
+ },
+ "execution_count": 33,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n",
+ "EXPLAINED VARIANCE: 0.11\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "users_pipeline.embeddings_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 237
+ },
+ "id": "gSHkA1gFAGXs",
+ "outputId": "0a050a7f-77be-49ad-88ba-736ad2bfc7e8"
+ },
+ "execution_count": 34,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " component_1 component_2\n",
+ "user_id \n",
+ "3420436216 -12.864011 -0.976722\n",
+ "108121958 7.074388 -5.434687\n",
+ "3038308638 -10.170547 -0.784753\n",
+ "332396536 -10.812691 5.623136\n",
+ "955082522479808512 -10.050573 -0.195144"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " component_1 | \n",
+ " component_2 | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3420436216 | \n",
+ " -12.864011 | \n",
+ " -0.976722 | \n",
+ "
\n",
+ " \n",
+ " 108121958 | \n",
+ " 7.074388 | \n",
+ " -5.434687 | \n",
+ "
\n",
+ " \n",
+ " 3038308638 | \n",
+ " -10.170547 | \n",
+ " -0.784753 | \n",
+ "
\n",
+ " \n",
+ " 332396536 | \n",
+ " -10.812691 | \n",
+ " 5.623136 | \n",
+ "
\n",
+ " \n",
+ " 955082522479808512 | \n",
+ " -10.050573 | \n",
+ " -0.195144 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"users_pipeline\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"component_1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.150230389844141,\n \"min\": -12.864011415872412,\n \"max\": 7.07438804638496,\n \"samples\": [\n 7.07438804638496,\n -10.050572989026474,\n -10.17054718508993\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"component_2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.9413862514190274,\n \"min\": -5.434686671061165,\n \"max\": 5.623136416263134,\n \"samples\": [\n -5.434686671061165,\n -0.19514374095730225,\n -0.7847534857164845\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 34
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "sL3OppIhFDWi"
+ },
+ "execution_count": 127,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# todo: re-implement colors map and category orders\n",
+ "#users_pipeline.plot_embeddings(fig_show=False, fig_save=False, height=350, )"
+ ],
+ "metadata": {
+ "id": "FP8XEgZO84nP"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# colors and category ordering for the selected target label\n",
+ "color_map = COLORS_MAP[target]\n",
+ "category_orders = {target: CATEGORY_ORDERS[target]}\n",
+ "\n",
+ "users_pipeline.plot_embeddings(\n",
+ "    color=target,\n",
+ "    color_map=color_map,\n",
+ "    category_orders=category_orders,\n",
+ "    height=350,\n",
+ "    fig_show=False,\n",
+ "    fig_save=False,\n",
+ ")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 367
+ },
+ "id": "8_NX8EocCgOe",
+ "outputId": "7826d1e0-cea9-4018-dcad-d05b86100f35"
+ },
+ "execution_count": 35,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# label columns to plot: one reduction + one saved figure per column\n",
+ "groupby_cols = [\n",
+ "    \"bot_label\",\n",
+ "    \"opinion_label\",\n",
+ "    # \"bom_overall_label\", \"bom_astroturf_label\",\n",
+ "    \"toxic_label\",\n",
+ "    \"factual_label\",\n",
+ "    \"fourway_label\",\n",
+ "    #\"sixway_label\",\n",
+ "]\n",
+ "\n",
+ "# NOTE(review): users_x is the same for every label and the printed results are identical on each pass,\n",
+ "# so the fit could probably be shared across labels -- confirm ReductionPipeline does not use the labels when fitting.\n",
+ "for groupby_col in groupby_cols:\n",
+ "    labels = users_df[groupby_col]\n",
+ "    pipeline = ReductionPipeline(\n",
+ "        x=users_x,\n",
+ "        labels=labels,\n",
+ "        target=groupby_col,\n",
+ "        n_components=2,\n",
+ "    )\n",
+ "\n",
+ "    # results/<...>/user_embeddings_<reducer>_<n_components>/<label column>\n",
+ "    results_dirpath = os.path.join(\n",
+ "        RESULTS_DIRPATH,\n",
+ "        \"openai_embeddings_v2\",\n",
+ "        \"text-embedding-ada-002\",\n",
+ "        f\"user_embeddings_{pipeline.reducer_type.lower()}_{pipeline.n_components}\",\n",
+ "        groupby_col,\n",
+ "    )\n",
+ "    os.makedirs(results_dirpath, exist_ok=True)\n",
+ "\n",
+ "    pipeline.perform()\n",
+ "\n",
+ "    # per-label colors and category ordering for the figure\n",
+ "    color_map = COLORS_MAP[groupby_col]\n",
+ "    category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}\n",
+ "    pipeline.plot_embeddings(\n",
+ "        color=groupby_col,\n",
+ "        color_map=color_map,\n",
+ "        category_orders=category_orders,\n",
+ "        #hover_data=[\"user_id\", \"bot_label\"],\n",
+ "        fig_show=True,\n",
+ "        fig_save=True,\n",
+ "        results_dirpath=results_dirpath,\n",
+ "    )"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "aqVam9Mo84jM",
+ "outputId": "79596956-30d1-4863-e00f-3233772f4fd8"
+ },
+ "execution_count": 59,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n",
+ "EXPLAINED VARIANCE: 0.11\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n",
+ "EXPLAINED VARIANCE: 0.11\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n",
+ "EXPLAINED VARIANCE: 0.11\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n",
+ "EXPLAINED VARIANCE: 0.11\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n",
+ "EXPLAINED VARIANCE: 0.11\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4t48ewACjXQy"
+ },
+ "source": [
+ "## Tweet Embeddings (User Averages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "183K statuses, averaged for each user (see prior notebook), resulting in 7,566 rows (one per user)."
+ ],
+ "metadata": {
+ "id": "5sJsvSTWCVVX"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Loading"
+ ],
+ "metadata": {
+ "id": "fjvJbg75dk5r"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pandas import read_csv\n",
+ "\n",
+ "csv_filepath = os.path.join(\n",
+ "    DATA_DIRPATH,\n",
+ "    \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz\",\n",
+ ")\n",
+ "# index the rows by user_id while keeping user_id available as a regular column\n",
+ "averages_df = read_csv(csv_filepath).set_index(\"user_id\", drop=False)\n",
+ "print(averages_df.shape)\n",
+ "print(averages_df.columns)\n",
+ "averages_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 426
+ },
+ "id": "wy-OIPg_eYX-",
+ "outputId": "e69cb40b-04c4-4666-ad4c-1adbd756f594"
+ },
+ "execution_count": 47,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(7566, 1548)\n",
+ "Index(['user_id', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',\n",
+ " 'openai_5', 'openai_6', 'openai_7', 'openai_8',\n",
+ " ...\n",
+ " 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community',\n",
+ " 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf',\n",
+ " 'bom_overall'],\n",
+ " dtype='object', length=1548)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 \\\n",
+ "user_id \n",
+ "2952 2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 \n",
+ "635553 635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 \n",
+ "656993 656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 \n",
+ "761154 761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 \n",
+ "777554 777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 \n",
+ "\n",
+ " openai_6 openai_7 openai_8 ... created_on status_count \\\n",
+ "user_id ... \n",
+ "2952 0.000960 -0.015426 -0.006430 ... 2006-07-24 6 \n",
+ "635553 -0.011840 -0.010581 -0.010859 ... 2007-01-15 12 \n",
+ "656993 -0.015786 0.008556 -0.022145 ... 2007-01-17 1 \n",
+ "761154 -0.007047 -0.024175 0.001368 ... 2007-02-09 4 \n",
+ "777554 0.019815 -0.019171 -0.017594 ... 2007-02-17 1 \n",
+ "\n",
+ " rt_count is_bot opinion_community is_q avg_toxicity \\\n",
+ "user_id \n",
+ "2952 6 False 0 False 0.006899 \n",
+ "635553 12 False 0 False 0.077787 \n",
+ "656993 1 False 0 False 0.025031 \n",
+ "761154 0 False 0 False 0.172311 \n",
+ "777554 1 False 0 False 0.001660 \n",
+ "\n",
+ " avg_fact_score bom_astroturf bom_overall \n",
+ "user_id \n",
+ "2952 NaN 0.21 0.20 \n",
+ "635553 NaN 0.24 0.16 \n",
+ "656993 NaN 0.11 0.10 \n",
+ "761154 NaN 0.13 0.72 \n",
+ "777554 NaN 0.15 0.03 \n",
+ "\n",
+ "[5 rows x 1548 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " openai_5 | \n",
+ " openai_6 | \n",
+ " openai_7 | \n",
+ " openai_8 | \n",
+ " ... | \n",
+ " created_on | \n",
+ " status_count | \n",
+ " rt_count | \n",
+ " is_bot | \n",
+ " opinion_community | \n",
+ " is_q | \n",
+ " avg_toxicity | \n",
+ " avg_fact_score | \n",
+ " bom_astroturf | \n",
+ " bom_overall | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2952 | \n",
+ " 2952 | \n",
+ " -0.023816 | \n",
+ " 0.002004 | \n",
+ " 0.004429 | \n",
+ " -0.019361 | \n",
+ " -0.009860 | \n",
+ " 0.004878 | \n",
+ " 0.000960 | \n",
+ " -0.015426 | \n",
+ " -0.006430 | \n",
+ " ... | \n",
+ " 2006-07-24 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.006899 | \n",
+ " NaN | \n",
+ " 0.21 | \n",
+ " 0.20 | \n",
+ "
\n",
+ " \n",
+ " 635553 | \n",
+ " 635553 | \n",
+ " -0.030022 | \n",
+ " -0.006063 | \n",
+ " 0.017259 | \n",
+ " -0.018501 | \n",
+ " -0.008536 | \n",
+ " 0.004416 | \n",
+ " -0.011840 | \n",
+ " -0.010581 | \n",
+ " -0.010859 | \n",
+ " ... | \n",
+ " 2007-01-15 | \n",
+ " 12 | \n",
+ " 12 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.077787 | \n",
+ " NaN | \n",
+ " 0.24 | \n",
+ " 0.16 | \n",
+ "
\n",
+ " \n",
+ " 656993 | \n",
+ " 656993 | \n",
+ " -0.010723 | \n",
+ " 0.008235 | \n",
+ " 0.004192 | \n",
+ " -0.040441 | \n",
+ " -0.015172 | \n",
+ " 0.012798 | \n",
+ " -0.015786 | \n",
+ " 0.008556 | \n",
+ " -0.022145 | \n",
+ " ... | \n",
+ " 2007-01-17 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.025031 | \n",
+ " NaN | \n",
+ " 0.11 | \n",
+ " 0.10 | \n",
+ "
\n",
+ " \n",
+ " 761154 | \n",
+ " 761154 | \n",
+ " -0.021389 | \n",
+ " -0.004747 | \n",
+ " 0.006925 | \n",
+ " -0.017395 | \n",
+ " -0.011900 | \n",
+ " 0.018309 | \n",
+ " -0.007047 | \n",
+ " -0.024175 | \n",
+ " 0.001368 | \n",
+ " ... | \n",
+ " 2007-02-09 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.172311 | \n",
+ " NaN | \n",
+ " 0.13 | \n",
+ " 0.72 | \n",
+ "
\n",
+ " \n",
+ " 777554 | \n",
+ " 777554 | \n",
+ " -0.009369 | \n",
+ " -0.009612 | \n",
+ " 0.012470 | \n",
+ " 0.005079 | \n",
+ " -0.019303 | \n",
+ " -0.010459 | \n",
+ " 0.019815 | \n",
+ " -0.019171 | \n",
+ " -0.017594 | \n",
+ " ... | \n",
+ " 2007-02-17 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " False | \n",
+ " 0 | \n",
+ " False | \n",
+ " 0.001660 | \n",
+ " NaN | \n",
+ " 0.15 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1548 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "averages_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 47
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages_df[\"user_id\"].nunique()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NGVktpyCkgJM",
+ "outputId": "ee21c406-b820-4948-9bc2-f94dc2043d9c"
+ },
+ "execution_count": 38,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "7566"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 38
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "len(averages_df)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "H3GDRXLees44",
+ "outputId": "2d44a93d-78d2-4a7b-8429-c0ac88c688e7"
+ },
+ "execution_count": 39,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "7566"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 39
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# add_labels appends columns to the frame (the printed shape grows from 1548 to 1561 columns)\n",
+ "# NOTE(review): averages_df is reassigned from itself, so re-running this cell calls add_labels on\n",
+ "# already-labeled data -- confirm that is safe.\n",
+ "averages_df = add_labels(averages_df)\n",
+ "print(averages_df.shape)\n",
+ "print(list(averages_df.columns))\n",
+ "averages_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 321
+ },
+ "id": "fCMNWudr6md7",
+ "outputId": "4e4c219b-76bd-40b7-d982-fa13fe36ca57"
+ },
+ "execution_count": 48,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(7566, 1561)\n",
+ "['user_id', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58', 'openai_59', 'openai_60', 'openai_61', 'openai_62', 'openai_63', 'openai_64', 'openai_65', 'openai_66', 'openai_67', 'openai_68', 'openai_69', 'openai_70', 'openai_71', 'openai_72', 'openai_73', 'openai_74', 'openai_75', 'openai_76', 'openai_77', 'openai_78', 'openai_79', 'openai_80', 'openai_81', 'openai_82', 'openai_83', 'openai_84', 'openai_85', 'openai_86', 'openai_87', 'openai_88', 'openai_89', 'openai_90', 'openai_91', 'openai_92', 'openai_93', 'openai_94', 'openai_95', 'openai_96', 'openai_97', 'openai_98', 'openai_99', 'openai_100', 'openai_101', 'openai_102', 'openai_103', 'openai_104', 'openai_105', 'openai_106', 'openai_107', 'openai_108', 'openai_109', 'openai_110', 'openai_111', 'openai_112', 'openai_113', 'openai_114', 'openai_115', 'openai_116', 'openai_117', 'openai_118', 'openai_119', 'openai_120', 'openai_121', 'openai_122', 'openai_123', 'openai_124', 'openai_125', 'openai_126', 'openai_127', 'openai_128', 'openai_129', 'openai_130', 'openai_131', 'openai_132', 'openai_133', 'openai_134', 'openai_135', 'openai_136', 'openai_137', 'openai_138', 'openai_139', 'openai_140', 'openai_141', 'openai_142', 'openai_143', 'openai_144', 'openai_145', 'openai_146', 'openai_147', 'openai_148', 
'openai_149', 'openai_150', 'openai_151', 'openai_152', 'openai_153', 'openai_154', 'openai_155', 'openai_156', 'openai_157', 'openai_158', 'openai_159', 'openai_160', 'openai_161', 'openai_162', 'openai_163', 'openai_164', 'openai_165', 'openai_166', 'openai_167', 'openai_168', 'openai_169', 'openai_170', 'openai_171', 'openai_172', 'openai_173', 'openai_174', 'openai_175', 'openai_176', 'openai_177', 'openai_178', 'openai_179', 'openai_180', 'openai_181', 'openai_182', 'openai_183', 'openai_184', 'openai_185', 'openai_186', 'openai_187', 'openai_188', 'openai_189', 'openai_190', 'openai_191', 'openai_192', 'openai_193', 'openai_194', 'openai_195', 'openai_196', 'openai_197', 'openai_198', 'openai_199', 'openai_200', 'openai_201', 'openai_202', 'openai_203', 'openai_204', 'openai_205', 'openai_206', 'openai_207', 'openai_208', 'openai_209', 'openai_210', 'openai_211', 'openai_212', 'openai_213', 'openai_214', 'openai_215', 'openai_216', 'openai_217', 'openai_218', 'openai_219', 'openai_220', 'openai_221', 'openai_222', 'openai_223', 'openai_224', 'openai_225', 'openai_226', 'openai_227', 'openai_228', 'openai_229', 'openai_230', 'openai_231', 'openai_232', 'openai_233', 'openai_234', 'openai_235', 'openai_236', 'openai_237', 'openai_238', 'openai_239', 'openai_240', 'openai_241', 'openai_242', 'openai_243', 'openai_244', 'openai_245', 'openai_246', 'openai_247', 'openai_248', 'openai_249', 'openai_250', 'openai_251', 'openai_252', 'openai_253', 'openai_254', 'openai_255', 'openai_256', 'openai_257', 'openai_258', 'openai_259', 'openai_260', 'openai_261', 'openai_262', 'openai_263', 'openai_264', 'openai_265', 'openai_266', 'openai_267', 'openai_268', 'openai_269', 'openai_270', 'openai_271', 'openai_272', 'openai_273', 'openai_274', 'openai_275', 'openai_276', 'openai_277', 'openai_278', 'openai_279', 'openai_280', 'openai_281', 'openai_282', 'openai_283', 'openai_284', 'openai_285', 'openai_286', 'openai_287', 'openai_288', 'openai_289', 'openai_290', 
'openai_291', 'openai_292', 'openai_293', 'openai_294', 'openai_295', 'openai_296', 'openai_297', 'openai_298', 'openai_299', 'openai_300', 'openai_301', 'openai_302', 'openai_303', 'openai_304', 'openai_305', 'openai_306', 'openai_307', 'openai_308', 'openai_309', 'openai_310', 'openai_311', 'openai_312', 'openai_313', 'openai_314', 'openai_315', 'openai_316', 'openai_317', 'openai_318', 'openai_319', 'openai_320', 'openai_321', 'openai_322', 'openai_323', 'openai_324', 'openai_325', 'openai_326', 'openai_327', 'openai_328', 'openai_329', 'openai_330', 'openai_331', 'openai_332', 'openai_333', 'openai_334', 'openai_335', 'openai_336', 'openai_337', 'openai_338', 'openai_339', 'openai_340', 'openai_341', 'openai_342', 'openai_343', 'openai_344', 'openai_345', 'openai_346', 'openai_347', 'openai_348', 'openai_349', 'openai_350', 'openai_351', 'openai_352', 'openai_353', 'openai_354', 'openai_355', 'openai_356', 'openai_357', 'openai_358', 'openai_359', 'openai_360', 'openai_361', 'openai_362', 'openai_363', 'openai_364', 'openai_365', 'openai_366', 'openai_367', 'openai_368', 'openai_369', 'openai_370', 'openai_371', 'openai_372', 'openai_373', 'openai_374', 'openai_375', 'openai_376', 'openai_377', 'openai_378', 'openai_379', 'openai_380', 'openai_381', 'openai_382', 'openai_383', 'openai_384', 'openai_385', 'openai_386', 'openai_387', 'openai_388', 'openai_389', 'openai_390', 'openai_391', 'openai_392', 'openai_393', 'openai_394', 'openai_395', 'openai_396', 'openai_397', 'openai_398', 'openai_399', 'openai_400', 'openai_401', 'openai_402', 'openai_403', 'openai_404', 'openai_405', 'openai_406', 'openai_407', 'openai_408', 'openai_409', 'openai_410', 'openai_411', 'openai_412', 'openai_413', 'openai_414', 'openai_415', 'openai_416', 'openai_417', 'openai_418', 'openai_419', 'openai_420', 'openai_421', 'openai_422', 'openai_423', 'openai_424', 'openai_425', 'openai_426', 'openai_427', 'openai_428', 'openai_429', 'openai_430', 'openai_431', 'openai_432', 
'openai_433', 'openai_434', 'openai_435', 'openai_436', 'openai_437', 'openai_438', 'openai_439', 'openai_440', 'openai_441', 'openai_442', 'openai_443', 'openai_444', 'openai_445', 'openai_446', 'openai_447', 'openai_448', 'openai_449', 'openai_450', 'openai_451', 'openai_452', 'openai_453', 'openai_454', 'openai_455', 'openai_456', 'openai_457', 'openai_458', 'openai_459', 'openai_460', 'openai_461', 'openai_462', 'openai_463', 'openai_464', 'openai_465', 'openai_466', 'openai_467', 'openai_468', 'openai_469', 'openai_470', 'openai_471', 'openai_472', 'openai_473', 'openai_474', 'openai_475', 'openai_476', 'openai_477', 'openai_478', 'openai_479', 'openai_480', 'openai_481', 'openai_482', 'openai_483', 'openai_484', 'openai_485', 'openai_486', 'openai_487', 'openai_488', 'openai_489', 'openai_490', 'openai_491', 'openai_492', 'openai_493', 'openai_494', 'openai_495', 'openai_496', 'openai_497', 'openai_498', 'openai_499', 'openai_500', 'openai_501', 'openai_502', 'openai_503', 'openai_504', 'openai_505', 'openai_506', 'openai_507', 'openai_508', 'openai_509', 'openai_510', 'openai_511', 'openai_512', 'openai_513', 'openai_514', 'openai_515', 'openai_516', 'openai_517', 'openai_518', 'openai_519', 'openai_520', 'openai_521', 'openai_522', 'openai_523', 'openai_524', 'openai_525', 'openai_526', 'openai_527', 'openai_528', 'openai_529', 'openai_530', 'openai_531', 'openai_532', 'openai_533', 'openai_534', 'openai_535', 'openai_536', 'openai_537', 'openai_538', 'openai_539', 'openai_540', 'openai_541', 'openai_542', 'openai_543', 'openai_544', 'openai_545', 'openai_546', 'openai_547', 'openai_548', 'openai_549', 'openai_550', 'openai_551', 'openai_552', 'openai_553', 'openai_554', 'openai_555', 'openai_556', 'openai_557', 'openai_558', 'openai_559', 'openai_560', 'openai_561', 'openai_562', 'openai_563', 'openai_564', 'openai_565', 'openai_566', 'openai_567', 'openai_568', 'openai_569', 'openai_570', 'openai_571', 'openai_572', 'openai_573', 'openai_574', 
'openai_575', 'openai_576', 'openai_577', 'openai_578', 'openai_579', 'openai_580', 'openai_581', 'openai_582', 'openai_583', 'openai_584', 'openai_585', 'openai_586', 'openai_587', 'openai_588', 'openai_589', 'openai_590', 'openai_591', 'openai_592', 'openai_593', 'openai_594', 'openai_595', 'openai_596', 'openai_597', 'openai_598', 'openai_599', 'openai_600', 'openai_601', 'openai_602', 'openai_603', 'openai_604', 'openai_605', 'openai_606', 'openai_607', 'openai_608', 'openai_609', 'openai_610', 'openai_611', 'openai_612', 'openai_613', 'openai_614', 'openai_615', 'openai_616', 'openai_617', 'openai_618', 'openai_619', 'openai_620', 'openai_621', 'openai_622', 'openai_623', 'openai_624', 'openai_625', 'openai_626', 'openai_627', 'openai_628', 'openai_629', 'openai_630', 'openai_631', 'openai_632', 'openai_633', 'openai_634', 'openai_635', 'openai_636', 'openai_637', 'openai_638', 'openai_639', 'openai_640', 'openai_641', 'openai_642', 'openai_643', 'openai_644', 'openai_645', 'openai_646', 'openai_647', 'openai_648', 'openai_649', 'openai_650', 'openai_651', 'openai_652', 'openai_653', 'openai_654', 'openai_655', 'openai_656', 'openai_657', 'openai_658', 'openai_659', 'openai_660', 'openai_661', 'openai_662', 'openai_663', 'openai_664', 'openai_665', 'openai_666', 'openai_667', 'openai_668', 'openai_669', 'openai_670', 'openai_671', 'openai_672', 'openai_673', 'openai_674', 'openai_675', 'openai_676', 'openai_677', 'openai_678', 'openai_679', 'openai_680', 'openai_681', 'openai_682', 'openai_683', 'openai_684', 'openai_685', 'openai_686', 'openai_687', 'openai_688', 'openai_689', 'openai_690', 'openai_691', 'openai_692', 'openai_693', 'openai_694', 'openai_695', 'openai_696', 'openai_697', 'openai_698', 'openai_699', 'openai_700', 'openai_701', 'openai_702', 'openai_703', 'openai_704', 'openai_705', 'openai_706', 'openai_707', 'openai_708', 'openai_709', 'openai_710', 'openai_711', 'openai_712', 'openai_713', 'openai_714', 'openai_715', 'openai_716', 
'openai_717', 'openai_718', 'openai_719', 'openai_720', 'openai_721', 'openai_722', 'openai_723', 'openai_724', 'openai_725', 'openai_726', 'openai_727', 'openai_728', 'openai_729', 'openai_730', 'openai_731', 'openai_732', 'openai_733', 'openai_734', 'openai_735', 'openai_736', 'openai_737', 'openai_738', 'openai_739', 'openai_740', 'openai_741', 'openai_742', 'openai_743', 'openai_744', 'openai_745', 'openai_746', 'openai_747', 'openai_748', 'openai_749', 'openai_750', 'openai_751', 'openai_752', 'openai_753', 'openai_754', 'openai_755', 'openai_756', 'openai_757', 'openai_758', 'openai_759', 'openai_760', 'openai_761', 'openai_762', 'openai_763', 'openai_764', 'openai_765', 'openai_766', 'openai_767', 'openai_768', 'openai_769', 'openai_770', 'openai_771', 'openai_772', 'openai_773', 'openai_774', 'openai_775', 'openai_776', 'openai_777', 'openai_778', 'openai_779', 'openai_780', 'openai_781', 'openai_782', 'openai_783', 'openai_784', 'openai_785', 'openai_786', 'openai_787', 'openai_788', 'openai_789', 'openai_790', 'openai_791', 'openai_792', 'openai_793', 'openai_794', 'openai_795', 'openai_796', 'openai_797', 'openai_798', 'openai_799', 'openai_800', 'openai_801', 'openai_802', 'openai_803', 'openai_804', 'openai_805', 'openai_806', 'openai_807', 'openai_808', 'openai_809', 'openai_810', 'openai_811', 'openai_812', 'openai_813', 'openai_814', 'openai_815', 'openai_816', 'openai_817', 'openai_818', 'openai_819', 'openai_820', 'openai_821', 'openai_822', 'openai_823', 'openai_824', 'openai_825', 'openai_826', 'openai_827', 'openai_828', 'openai_829', 'openai_830', 'openai_831', 'openai_832', 'openai_833', 'openai_834', 'openai_835', 'openai_836', 'openai_837', 'openai_838', 'openai_839', 'openai_840', 'openai_841', 'openai_842', 'openai_843', 'openai_844', 'openai_845', 'openai_846', 'openai_847', 'openai_848', 'openai_849', 'openai_850', 'openai_851', 'openai_852', 'openai_853', 'openai_854', 'openai_855', 'openai_856', 'openai_857', 'openai_858', 
'openai_859', 'openai_860', 'openai_861', 'openai_862', 'openai_863', 'openai_864', 'openai_865', 'openai_866', 'openai_867', 'openai_868', 'openai_869', 'openai_870', 'openai_871', 'openai_872', 'openai_873', 'openai_874', 'openai_875', 'openai_876', 'openai_877', 'openai_878', 'openai_879', 'openai_880', 'openai_881', 'openai_882', 'openai_883', 'openai_884', 'openai_885', 'openai_886', 'openai_887', 'openai_888', 'openai_889', 'openai_890', 'openai_891', 'openai_892', 'openai_893', 'openai_894', 'openai_895', 'openai_896', 'openai_897', 'openai_898', 'openai_899', 'openai_900', 'openai_901', 'openai_902', 'openai_903', 'openai_904', 'openai_905', 'openai_906', 'openai_907', 'openai_908', 'openai_909', 'openai_910', 'openai_911', 'openai_912', 'openai_913', 'openai_914', 'openai_915', 'openai_916', 'openai_917', 'openai_918', 'openai_919', 'openai_920', 'openai_921', 'openai_922', 'openai_923', 'openai_924', 'openai_925', 'openai_926', 'openai_927', 'openai_928', 'openai_929', 'openai_930', 'openai_931', 'openai_932', 'openai_933', 'openai_934', 'openai_935', 'openai_936', 'openai_937', 'openai_938', 'openai_939', 'openai_940', 'openai_941', 'openai_942', 'openai_943', 'openai_944', 'openai_945', 'openai_946', 'openai_947', 'openai_948', 'openai_949', 'openai_950', 'openai_951', 'openai_952', 'openai_953', 'openai_954', 'openai_955', 'openai_956', 'openai_957', 'openai_958', 'openai_959', 'openai_960', 'openai_961', 'openai_962', 'openai_963', 'openai_964', 'openai_965', 'openai_966', 'openai_967', 'openai_968', 'openai_969', 'openai_970', 'openai_971', 'openai_972', 'openai_973', 'openai_974', 'openai_975', 'openai_976', 'openai_977', 'openai_978', 'openai_979', 'openai_980', 'openai_981', 'openai_982', 'openai_983', 'openai_984', 'openai_985', 'openai_986', 'openai_987', 'openai_988', 'openai_989', 'openai_990', 'openai_991', 'openai_992', 'openai_993', 'openai_994', 'openai_995', 'openai_996', 'openai_997', 'openai_998', 'openai_999', 'openai_1000', 
'openai_1001', 'openai_1002', 'openai_1003', 'openai_1004', 'openai_1005', 'openai_1006', 'openai_1007', 'openai_1008', 'openai_1009', 'openai_1010', 'openai_1011', 'openai_1012', 'openai_1013', 'openai_1014', 'openai_1015', 'openai_1016', 'openai_1017', 'openai_1018', 'openai_1019', 'openai_1020', 'openai_1021', 'openai_1022', 'openai_1023', 'openai_1024', 'openai_1025', 'openai_1026', 'openai_1027', 'openai_1028', 'openai_1029', 'openai_1030', 'openai_1031', 'openai_1032', 'openai_1033', 'openai_1034', 'openai_1035', 'openai_1036', 'openai_1037', 'openai_1038', 'openai_1039', 'openai_1040', 'openai_1041', 'openai_1042', 'openai_1043', 'openai_1044', 'openai_1045', 'openai_1046', 'openai_1047', 'openai_1048', 'openai_1049', 'openai_1050', 'openai_1051', 'openai_1052', 'openai_1053', 'openai_1054', 'openai_1055', 'openai_1056', 'openai_1057', 'openai_1058', 'openai_1059', 'openai_1060', 'openai_1061', 'openai_1062', 'openai_1063', 'openai_1064', 'openai_1065', 'openai_1066', 'openai_1067', 'openai_1068', 'openai_1069', 'openai_1070', 'openai_1071', 'openai_1072', 'openai_1073', 'openai_1074', 'openai_1075', 'openai_1076', 'openai_1077', 'openai_1078', 'openai_1079', 'openai_1080', 'openai_1081', 'openai_1082', 'openai_1083', 'openai_1084', 'openai_1085', 'openai_1086', 'openai_1087', 'openai_1088', 'openai_1089', 'openai_1090', 'openai_1091', 'openai_1092', 'openai_1093', 'openai_1094', 'openai_1095', 'openai_1096', 'openai_1097', 'openai_1098', 'openai_1099', 'openai_1100', 'openai_1101', 'openai_1102', 'openai_1103', 'openai_1104', 'openai_1105', 'openai_1106', 'openai_1107', 'openai_1108', 'openai_1109', 'openai_1110', 'openai_1111', 'openai_1112', 'openai_1113', 'openai_1114', 'openai_1115', 'openai_1116', 'openai_1117', 'openai_1118', 'openai_1119', 'openai_1120', 'openai_1121', 'openai_1122', 'openai_1123', 'openai_1124', 'openai_1125', 'openai_1126', 'openai_1127', 'openai_1128', 'openai_1129', 'openai_1130', 'openai_1131', 'openai_1132', 'openai_1133', 
'openai_1134', 'openai_1135', 'openai_1136', 'openai_1137', 'openai_1138', 'openai_1139', 'openai_1140', 'openai_1141', 'openai_1142', 'openai_1143', 'openai_1144', 'openai_1145', 'openai_1146', 'openai_1147', 'openai_1148', 'openai_1149', 'openai_1150', 'openai_1151', 'openai_1152', 'openai_1153', 'openai_1154', 'openai_1155', 'openai_1156', 'openai_1157', 'openai_1158', 'openai_1159', 'openai_1160', 'openai_1161', 'openai_1162', 'openai_1163', 'openai_1164', 'openai_1165', 'openai_1166', 'openai_1167', 'openai_1168', 'openai_1169', 'openai_1170', 'openai_1171', 'openai_1172', 'openai_1173', 'openai_1174', 'openai_1175', 'openai_1176', 'openai_1177', 'openai_1178', 'openai_1179', 'openai_1180', 'openai_1181', 'openai_1182', 'openai_1183', 'openai_1184', 'openai_1185', 'openai_1186', 'openai_1187', 'openai_1188', 'openai_1189', 'openai_1190', 'openai_1191', 'openai_1192', 'openai_1193', 'openai_1194', 'openai_1195', 'openai_1196', 'openai_1197', 'openai_1198', 'openai_1199', 'openai_1200', 'openai_1201', 'openai_1202', 'openai_1203', 'openai_1204', 'openai_1205', 'openai_1206', 'openai_1207', 'openai_1208', 'openai_1209', 'openai_1210', 'openai_1211', 'openai_1212', 'openai_1213', 'openai_1214', 'openai_1215', 'openai_1216', 'openai_1217', 'openai_1218', 'openai_1219', 'openai_1220', 'openai_1221', 'openai_1222', 'openai_1223', 'openai_1224', 'openai_1225', 'openai_1226', 'openai_1227', 'openai_1228', 'openai_1229', 'openai_1230', 'openai_1231', 'openai_1232', 'openai_1233', 'openai_1234', 'openai_1235', 'openai_1236', 'openai_1237', 'openai_1238', 'openai_1239', 'openai_1240', 'openai_1241', 'openai_1242', 'openai_1243', 'openai_1244', 'openai_1245', 'openai_1246', 'openai_1247', 'openai_1248', 'openai_1249', 'openai_1250', 'openai_1251', 'openai_1252', 'openai_1253', 'openai_1254', 'openai_1255', 'openai_1256', 'openai_1257', 'openai_1258', 'openai_1259', 'openai_1260', 'openai_1261', 'openai_1262', 'openai_1263', 'openai_1264', 'openai_1265', 'openai_1266', 
'openai_1267', 'openai_1268', 'openai_1269', 'openai_1270', 'openai_1271', 'openai_1272', 'openai_1273', 'openai_1274', 'openai_1275', 'openai_1276', 'openai_1277', 'openai_1278', 'openai_1279', 'openai_1280', 'openai_1281', 'openai_1282', 'openai_1283', 'openai_1284', 'openai_1285', 'openai_1286', 'openai_1287', 'openai_1288', 'openai_1289', 'openai_1290', 'openai_1291', 'openai_1292', 'openai_1293', 'openai_1294', 'openai_1295', 'openai_1296', 'openai_1297', 'openai_1298', 'openai_1299', 'openai_1300', 'openai_1301', 'openai_1302', 'openai_1303', 'openai_1304', 'openai_1305', 'openai_1306', 'openai_1307', 'openai_1308', 'openai_1309', 'openai_1310', 'openai_1311', 'openai_1312', 'openai_1313', 'openai_1314', 'openai_1315', 'openai_1316', 'openai_1317', 'openai_1318', 'openai_1319', 'openai_1320', 'openai_1321', 'openai_1322', 'openai_1323', 'openai_1324', 'openai_1325', 'openai_1326', 'openai_1327', 'openai_1328', 'openai_1329', 'openai_1330', 'openai_1331', 'openai_1332', 'openai_1333', 'openai_1334', 'openai_1335', 'openai_1336', 'openai_1337', 'openai_1338', 'openai_1339', 'openai_1340', 'openai_1341', 'openai_1342', 'openai_1343', 'openai_1344', 'openai_1345', 'openai_1346', 'openai_1347', 'openai_1348', 'openai_1349', 'openai_1350', 'openai_1351', 'openai_1352', 'openai_1353', 'openai_1354', 'openai_1355', 'openai_1356', 'openai_1357', 'openai_1358', 'openai_1359', 'openai_1360', 'openai_1361', 'openai_1362', 'openai_1363', 'openai_1364', 'openai_1365', 'openai_1366', 'openai_1367', 'openai_1368', 'openai_1369', 'openai_1370', 'openai_1371', 'openai_1372', 'openai_1373', 'openai_1374', 'openai_1375', 'openai_1376', 'openai_1377', 'openai_1378', 'openai_1379', 'openai_1380', 'openai_1381', 'openai_1382', 'openai_1383', 'openai_1384', 'openai_1385', 'openai_1386', 'openai_1387', 'openai_1388', 'openai_1389', 'openai_1390', 'openai_1391', 'openai_1392', 'openai_1393', 'openai_1394', 'openai_1395', 'openai_1396', 'openai_1397', 'openai_1398', 'openai_1399', 
'openai_1400', 'openai_1401', 'openai_1402', 'openai_1403', 'openai_1404', 'openai_1405', 'openai_1406', 'openai_1407', 'openai_1408', 'openai_1409', 'openai_1410', 'openai_1411', 'openai_1412', 'openai_1413', 'openai_1414', 'openai_1415', 'openai_1416', 'openai_1417', 'openai_1418', 'openai_1419', 'openai_1420', 'openai_1421', 'openai_1422', 'openai_1423', 'openai_1424', 'openai_1425', 'openai_1426', 'openai_1427', 'openai_1428', 'openai_1429', 'openai_1430', 'openai_1431', 'openai_1432', 'openai_1433', 'openai_1434', 'openai_1435', 'openai_1436', 'openai_1437', 'openai_1438', 'openai_1439', 'openai_1440', 'openai_1441', 'openai_1442', 'openai_1443', 'openai_1444', 'openai_1445', 'openai_1446', 'openai_1447', 'openai_1448', 'openai_1449', 'openai_1450', 'openai_1451', 'openai_1452', 'openai_1453', 'openai_1454', 'openai_1455', 'openai_1456', 'openai_1457', 'openai_1458', 'openai_1459', 'openai_1460', 'openai_1461', 'openai_1462', 'openai_1463', 'openai_1464', 'openai_1465', 'openai_1466', 'openai_1467', 'openai_1468', 'openai_1469', 'openai_1470', 'openai_1471', 'openai_1472', 'openai_1473', 'openai_1474', 'openai_1475', 'openai_1476', 'openai_1477', 'openai_1478', 'openai_1479', 'openai_1480', 'openai_1481', 'openai_1482', 'openai_1483', 'openai_1484', 'openai_1485', 'openai_1486', 'openai_1487', 'openai_1488', 'openai_1489', 'openai_1490', 'openai_1491', 'openai_1492', 'openai_1493', 'openai_1494', 'openai_1495', 'openai_1496', 'openai_1497', 'openai_1498', 'openai_1499', 'openai_1500', 'openai_1501', 'openai_1502', 'openai_1503', 'openai_1504', 'openai_1505', 'openai_1506', 'openai_1507', 'openai_1508', 'openai_1509', 'openai_1510', 'openai_1511', 'openai_1512', 'openai_1513', 'openai_1514', 'openai_1515', 'openai_1516', 'openai_1517', 'openai_1518', 'openai_1519', 'openai_1520', 'openai_1521', 'openai_1522', 'openai_1523', 'openai_1524', 'openai_1525', 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529', 'openai_1530', 'openai_1531', 'openai_1532', 
'openai_1533', 'openai_1534', 'openai_1535', 'user_id.1', 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf', 'bom_overall', 'opinion_label', 'bot_label', 'fourway_label', 'is_toxic', 'toxic_label', 'is_factual', 'factual_label', 'is_bom_overall', 'is_bom_astroturf', 'bom_overall_label', 'bom_astroturf_label', 'bom_overall_fourway_label', 'bom_astroturf_fourway_label']\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " user_id openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 \\\n",
+ "user_id \n",
+ "2952 2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 \n",
+ "635553 635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 \n",
+ "656993 656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 \n",
+ "761154 761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 \n",
+ "777554 777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 \n",
+ "\n",
+ " openai_6 openai_7 openai_8 ... is_toxic toxic_label is_factual \\\n",
+ "user_id ... \n",
+ "2952 0.000960 -0.015426 -0.006430 ... 0 Normal NaN \n",
+ "635553 -0.011840 -0.010581 -0.010859 ... 0 Normal NaN \n",
+ "656993 -0.015786 0.008556 -0.022145 ... 0 Normal NaN \n",
+ "761154 -0.007047 -0.024175 0.001368 ... 1 Toxic NaN \n",
+ "777554 0.019815 -0.019171 -0.017594 ... 0 Normal NaN \n",
+ "\n",
+ " factual_label is_bom_overall is_bom_astroturf bom_overall_label \\\n",
+ "user_id \n",
+ "2952 NaN 0.0 0.0 Human \n",
+ "635553 NaN 0.0 0.0 Human \n",
+ "656993 NaN 0.0 0.0 Human \n",
+ "761154 NaN 1.0 0.0 Bot \n",
+ "777554 NaN 0.0 0.0 Human \n",
+ "\n",
+ " bom_astroturf_label bom_overall_fourway_label \\\n",
+ "user_id \n",
+ "2952 Human Anti-Trump Human \n",
+ "635553 Human Anti-Trump Human \n",
+ "656993 Human Anti-Trump Human \n",
+ "761154 Human Anti-Trump Bot \n",
+ "777554 Human Anti-Trump Human \n",
+ "\n",
+ " bom_astroturf_fourway_label \n",
+ "user_id \n",
+ "2952 Anti-Trump Human \n",
+ "635553 Anti-Trump Human \n",
+ "656993 Anti-Trump Human \n",
+ "761154 Anti-Trump Human \n",
+ "777554 Anti-Trump Human \n",
+ "\n",
+ "[5 rows x 1561 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " openai_5 | \n",
+ " openai_6 | \n",
+ " openai_7 | \n",
+ " openai_8 | \n",
+ " ... | \n",
+ " is_toxic | \n",
+ " toxic_label | \n",
+ " is_factual | \n",
+ " factual_label | \n",
+ " is_bom_overall | \n",
+ " is_bom_astroturf | \n",
+ " bom_overall_label | \n",
+ " bom_astroturf_label | \n",
+ " bom_overall_fourway_label | \n",
+ " bom_astroturf_fourway_label | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2952 | \n",
+ " 2952 | \n",
+ " -0.023816 | \n",
+ " 0.002004 | \n",
+ " 0.004429 | \n",
+ " -0.019361 | \n",
+ " -0.009860 | \n",
+ " 0.004878 | \n",
+ " 0.000960 | \n",
+ " -0.015426 | \n",
+ " -0.006430 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Human | \n",
+ " Human | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ " 635553 | \n",
+ " 635553 | \n",
+ " -0.030022 | \n",
+ " -0.006063 | \n",
+ " 0.017259 | \n",
+ " -0.018501 | \n",
+ " -0.008536 | \n",
+ " 0.004416 | \n",
+ " -0.011840 | \n",
+ " -0.010581 | \n",
+ " -0.010859 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Human | \n",
+ " Human | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ " 656993 | \n",
+ " 656993 | \n",
+ " -0.010723 | \n",
+ " 0.008235 | \n",
+ " 0.004192 | \n",
+ " -0.040441 | \n",
+ " -0.015172 | \n",
+ " 0.012798 | \n",
+ " -0.015786 | \n",
+ " 0.008556 | \n",
+ " -0.022145 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Human | \n",
+ " Human | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ " 761154 | \n",
+ " 761154 | \n",
+ " -0.021389 | \n",
+ " -0.004747 | \n",
+ " 0.006925 | \n",
+ " -0.017395 | \n",
+ " -0.011900 | \n",
+ " 0.018309 | \n",
+ " -0.007047 | \n",
+ " -0.024175 | \n",
+ " 0.001368 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " Toxic | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " Bot | \n",
+ " Human | \n",
+ " Anti-Trump Bot | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ " 777554 | \n",
+ " 777554 | \n",
+ " -0.009369 | \n",
+ " -0.009612 | \n",
+ " 0.012470 | \n",
+ " 0.005079 | \n",
+ " -0.019303 | \n",
+ " -0.010459 | \n",
+ " 0.019815 | \n",
+ " -0.019171 | \n",
+ " -0.017594 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " Normal | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " Human | \n",
+ " Human | \n",
+ " Anti-Trump Human | \n",
+ " Anti-Trump Human | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1561 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "averages_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 48
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Splitting"
+ ],
+ "metadata": {
+ "id": "slNi31xDeQYI"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages_x = averages_df[embeddings_cols]\n",
+ "averages_x"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 475
+ },
+ "id": "swEMBoBSeP4O",
+ "outputId": "4a1baa37-0684-43bb-8687-3d6742aedb67"
+ },
+ "execution_count": 49,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " openai_0 openai_1 openai_2 openai_3 openai_4 \\\n",
+ "user_id \n",
+ "2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 \n",
+ "635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 \n",
+ "656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 \n",
+ "761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 \n",
+ "777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 \n",
+ "... ... ... ... ... ... \n",
+ "1234200349600288772 -0.024670 -0.007194 0.012253 -0.015047 -0.003037 \n",
+ "1234846911028453376 0.002930 -0.013252 -0.010225 -0.034511 -0.022378 \n",
+ "1237940420136456192 -0.013656 -0.002694 0.007115 -0.019419 -0.001060 \n",
+ "1238854780191195136 -0.041529 -0.024860 0.005283 -0.026827 0.005801 \n",
+ "1240138605726760962 -0.067218 0.000135 -0.009630 -0.002240 0.013352 \n",
+ "\n",
+ " openai_5 openai_6 openai_7 openai_8 openai_9 ... \\\n",
+ "user_id ... \n",
+ "2952 0.004878 0.000960 -0.015426 -0.006430 0.001027 ... \n",
+ "635553 0.004416 -0.011840 -0.010581 -0.010859 -0.003771 ... \n",
+ "656993 0.012798 -0.015786 0.008556 -0.022145 -0.017026 ... \n",
+ "761154 0.018309 -0.007047 -0.024175 0.001368 0.002065 ... \n",
+ "777554 -0.010459 0.019815 -0.019171 -0.017594 -0.006209 ... \n",
+ "... ... ... ... ... ... ... \n",
+ "1234200349600288772 0.004080 -0.004318 -0.005398 0.000077 -0.006043 ... \n",
+ "1234846911028453376 0.011991 0.008446 0.007933 0.007091 -0.018994 ... \n",
+ "1237940420136456192 0.007154 -0.000715 -0.013499 0.001435 -0.011751 ... \n",
+ "1238854780191195136 0.000377 0.011535 -0.013447 0.002424 0.010552 ... \n",
+ "1240138605726760962 0.007857 0.011120 -0.011868 0.010417 0.003038 ... \n",
+ "\n",
+ " openai_1526 openai_1527 openai_1528 openai_1529 \\\n",
+ "user_id \n",
+ "2952 -0.012285 0.001094 0.015767 -0.026536 \n",
+ "635553 -0.005960 -0.007866 0.010948 -0.021376 \n",
+ "656993 -0.018110 0.007116 -0.004877 -0.032427 \n",
+ "761154 0.013326 -0.020819 0.007364 -0.016794 \n",
+ "777554 0.007358 0.010696 0.008784 -0.024808 \n",
+ "... ... ... ... ... \n",
+ "1234200349600288772 0.001035 -0.011842 0.022085 -0.027978 \n",
+ "1234846911028453376 0.009281 -0.010663 0.016082 -0.026220 \n",
+ "1237940420136456192 0.008012 0.011208 0.025522 -0.025476 \n",
+ "1238854780191195136 0.009792 -0.014946 0.028075 -0.031737 \n",
+ "1240138605726760962 0.005619 -0.001152 0.008422 -0.029487 \n",
+ "\n",
+ " openai_1530 openai_1531 openai_1532 openai_1533 \\\n",
+ "user_id \n",
+ "2952 -0.024981 0.015113 0.018588 -0.002324 \n",
+ "635553 -0.023424 0.020705 0.005084 -0.011961 \n",
+ "656993 -0.023885 -0.000715 0.003886 -0.024242 \n",
+ "761154 -0.049548 0.013037 0.024798 -0.008543 \n",
+ "777554 -0.008042 0.011077 0.001996 -0.001104 \n",
+ "... ... ... ... ... \n",
+ "1234200349600288772 -0.030627 0.017037 -0.001254 -0.012667 \n",
+ "1234846911028453376 0.000994 0.016662 0.005803 0.005945 \n",
+ "1237940420136456192 -0.034344 0.023171 -0.007861 -0.006911 \n",
+ "1238854780191195136 -0.047090 0.030326 -0.023545 -0.014824 \n",
+ "1240138605726760962 -0.019286 0.030090 -0.000010 -0.018275 \n",
+ "\n",
+ " openai_1534 openai_1535 \n",
+ "user_id \n",
+ "2952 -0.003782 -0.028532 \n",
+ "635553 -0.003258 -0.026262 \n",
+ "656993 0.003839 -0.048883 \n",
+ "761154 0.006142 -0.035867 \n",
+ "777554 -0.019460 -0.030301 \n",
+ "... ... ... \n",
+ "1234200349600288772 -0.002032 -0.026470 \n",
+ "1234846911028453376 0.001228 -0.041925 \n",
+ "1237940420136456192 -0.005543 -0.026032 \n",
+ "1238854780191195136 0.003257 -0.022161 \n",
+ "1240138605726760962 -0.017999 -0.033294 \n",
+ "\n",
+ "[7566 rows x 1536 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " openai_0 | \n",
+ " openai_1 | \n",
+ " openai_2 | \n",
+ " openai_3 | \n",
+ " openai_4 | \n",
+ " openai_5 | \n",
+ " openai_6 | \n",
+ " openai_7 | \n",
+ " openai_8 | \n",
+ " openai_9 | \n",
+ " ... | \n",
+ " openai_1526 | \n",
+ " openai_1527 | \n",
+ " openai_1528 | \n",
+ " openai_1529 | \n",
+ " openai_1530 | \n",
+ " openai_1531 | \n",
+ " openai_1532 | \n",
+ " openai_1533 | \n",
+ " openai_1534 | \n",
+ " openai_1535 | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2952 | \n",
+ " -0.023816 | \n",
+ " 0.002004 | \n",
+ " 0.004429 | \n",
+ " -0.019361 | \n",
+ " -0.009860 | \n",
+ " 0.004878 | \n",
+ " 0.000960 | \n",
+ " -0.015426 | \n",
+ " -0.006430 | \n",
+ " 0.001027 | \n",
+ " ... | \n",
+ " -0.012285 | \n",
+ " 0.001094 | \n",
+ " 0.015767 | \n",
+ " -0.026536 | \n",
+ " -0.024981 | \n",
+ " 0.015113 | \n",
+ " 0.018588 | \n",
+ " -0.002324 | \n",
+ " -0.003782 | \n",
+ " -0.028532 | \n",
+ "
\n",
+ " \n",
+ " 635553 | \n",
+ " -0.030022 | \n",
+ " -0.006063 | \n",
+ " 0.017259 | \n",
+ " -0.018501 | \n",
+ " -0.008536 | \n",
+ " 0.004416 | \n",
+ " -0.011840 | \n",
+ " -0.010581 | \n",
+ " -0.010859 | \n",
+ " -0.003771 | \n",
+ " ... | \n",
+ " -0.005960 | \n",
+ " -0.007866 | \n",
+ " 0.010948 | \n",
+ " -0.021376 | \n",
+ " -0.023424 | \n",
+ " 0.020705 | \n",
+ " 0.005084 | \n",
+ " -0.011961 | \n",
+ " -0.003258 | \n",
+ " -0.026262 | \n",
+ "
\n",
+ " \n",
+ " 656993 | \n",
+ " -0.010723 | \n",
+ " 0.008235 | \n",
+ " 0.004192 | \n",
+ " -0.040441 | \n",
+ " -0.015172 | \n",
+ " 0.012798 | \n",
+ " -0.015786 | \n",
+ " 0.008556 | \n",
+ " -0.022145 | \n",
+ " -0.017026 | \n",
+ " ... | \n",
+ " -0.018110 | \n",
+ " 0.007116 | \n",
+ " -0.004877 | \n",
+ " -0.032427 | \n",
+ " -0.023885 | \n",
+ " -0.000715 | \n",
+ " 0.003886 | \n",
+ " -0.024242 | \n",
+ " 0.003839 | \n",
+ " -0.048883 | \n",
+ "
\n",
+ " \n",
+ " 761154 | \n",
+ " -0.021389 | \n",
+ " -0.004747 | \n",
+ " 0.006925 | \n",
+ " -0.017395 | \n",
+ " -0.011900 | \n",
+ " 0.018309 | \n",
+ " -0.007047 | \n",
+ " -0.024175 | \n",
+ " 0.001368 | \n",
+ " 0.002065 | \n",
+ " ... | \n",
+ " 0.013326 | \n",
+ " -0.020819 | \n",
+ " 0.007364 | \n",
+ " -0.016794 | \n",
+ " -0.049548 | \n",
+ " 0.013037 | \n",
+ " 0.024798 | \n",
+ " -0.008543 | \n",
+ " 0.006142 | \n",
+ " -0.035867 | \n",
+ "
\n",
+ " \n",
+ " 777554 | \n",
+ " -0.009369 | \n",
+ " -0.009612 | \n",
+ " 0.012470 | \n",
+ " 0.005079 | \n",
+ " -0.019303 | \n",
+ " -0.010459 | \n",
+ " 0.019815 | \n",
+ " -0.019171 | \n",
+ " -0.017594 | \n",
+ " -0.006209 | \n",
+ " ... | \n",
+ " 0.007358 | \n",
+ " 0.010696 | \n",
+ " 0.008784 | \n",
+ " -0.024808 | \n",
+ " -0.008042 | \n",
+ " 0.011077 | \n",
+ " 0.001996 | \n",
+ " -0.001104 | \n",
+ " -0.019460 | \n",
+ " -0.030301 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1234200349600288772 | \n",
+ " -0.024670 | \n",
+ " -0.007194 | \n",
+ " 0.012253 | \n",
+ " -0.015047 | \n",
+ " -0.003037 | \n",
+ " 0.004080 | \n",
+ " -0.004318 | \n",
+ " -0.005398 | \n",
+ " 0.000077 | \n",
+ " -0.006043 | \n",
+ " ... | \n",
+ " 0.001035 | \n",
+ " -0.011842 | \n",
+ " 0.022085 | \n",
+ " -0.027978 | \n",
+ " -0.030627 | \n",
+ " 0.017037 | \n",
+ " -0.001254 | \n",
+ " -0.012667 | \n",
+ " -0.002032 | \n",
+ " -0.026470 | \n",
+ "
\n",
+ " \n",
+ " 1234846911028453376 | \n",
+ " 0.002930 | \n",
+ " -0.013252 | \n",
+ " -0.010225 | \n",
+ " -0.034511 | \n",
+ " -0.022378 | \n",
+ " 0.011991 | \n",
+ " 0.008446 | \n",
+ " 0.007933 | \n",
+ " 0.007091 | \n",
+ " -0.018994 | \n",
+ " ... | \n",
+ " 0.009281 | \n",
+ " -0.010663 | \n",
+ " 0.016082 | \n",
+ " -0.026220 | \n",
+ " 0.000994 | \n",
+ " 0.016662 | \n",
+ " 0.005803 | \n",
+ " 0.005945 | \n",
+ " 0.001228 | \n",
+ " -0.041925 | \n",
+ "
\n",
+ " \n",
+ " 1237940420136456192 | \n",
+ " -0.013656 | \n",
+ " -0.002694 | \n",
+ " 0.007115 | \n",
+ " -0.019419 | \n",
+ " -0.001060 | \n",
+ " 0.007154 | \n",
+ " -0.000715 | \n",
+ " -0.013499 | \n",
+ " 0.001435 | \n",
+ " -0.011751 | \n",
+ " ... | \n",
+ " 0.008012 | \n",
+ " 0.011208 | \n",
+ " 0.025522 | \n",
+ " -0.025476 | \n",
+ " -0.034344 | \n",
+ " 0.023171 | \n",
+ " -0.007861 | \n",
+ " -0.006911 | \n",
+ " -0.005543 | \n",
+ " -0.026032 | \n",
+ "
\n",
+ " \n",
+ " 1238854780191195136 | \n",
+ " -0.041529 | \n",
+ " -0.024860 | \n",
+ " 0.005283 | \n",
+ " -0.026827 | \n",
+ " 0.005801 | \n",
+ " 0.000377 | \n",
+ " 0.011535 | \n",
+ " -0.013447 | \n",
+ " 0.002424 | \n",
+ " 0.010552 | \n",
+ " ... | \n",
+ " 0.009792 | \n",
+ " -0.014946 | \n",
+ " 0.028075 | \n",
+ " -0.031737 | \n",
+ " -0.047090 | \n",
+ " 0.030326 | \n",
+ " -0.023545 | \n",
+ " -0.014824 | \n",
+ " 0.003257 | \n",
+ " -0.022161 | \n",
+ "
\n",
+ " \n",
+ " 1240138605726760962 | \n",
+ " -0.067218 | \n",
+ " 0.000135 | \n",
+ " -0.009630 | \n",
+ " -0.002240 | \n",
+ " 0.013352 | \n",
+ " 0.007857 | \n",
+ " 0.011120 | \n",
+ " -0.011868 | \n",
+ " 0.010417 | \n",
+ " 0.003038 | \n",
+ " ... | \n",
+ " 0.005619 | \n",
+ " -0.001152 | \n",
+ " 0.008422 | \n",
+ " -0.029487 | \n",
+ " -0.019286 | \n",
+ " 0.030090 | \n",
+ " -0.000010 | \n",
+ " -0.018275 | \n",
+ " -0.017999 | \n",
+ " -0.033294 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
7566 rows × 1536 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "averages_x"
+ }
+ },
+ "metadata": {},
+ "execution_count": 49
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### PCA 2"
+ ],
+ "metadata": {
+ "id": "wcqhC-og54YL"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages_labels = averages_df[target]\n",
+ "averages_labels"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ehDYIt976ZWa",
+ "outputId": "04b24fae-ceda-4d16-a50c-f9753fffd349"
+ },
+ "execution_count": 42,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "user_id\n",
+ "2952 Anti-Trump Human\n",
+ "635553 Anti-Trump Human\n",
+ "656993 Anti-Trump Human\n",
+ "761154 Anti-Trump Human\n",
+ "777554 Anti-Trump Human\n",
+ " ... \n",
+ "1234200349600288772 Pro-Trump Human\n",
+ "1234846911028453376 Anti-Trump Human\n",
+ "1237940420136456192 Pro-Trump Human\n",
+ "1238854780191195136 Anti-Trump Human\n",
+ "1240138605726760962 Anti-Trump Human\n",
+ "Name: fourway_label, Length: 7566, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 42
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages_pipeline = ReductionPipeline(x=averages_x, labels=averages_labels, target=target, n_components=2)\n",
+ "\n",
+ "averages_pipeline.perform()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "26515f85-3783-4e74-b8ec-704e28586d0f",
+ "id": "weDB-cX05-jy"
+ },
+ "execution_count": 43,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n",
+ "EXPLAINED VARIANCE: 0.08\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "averages_pipeline.embeddings_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 237
+ },
+ "outputId": "d91e7d4a-d0eb-4a8b-d457-1f8125543f49",
+ "id": "E0aY19Zo5-jz"
+ },
+ "execution_count": 44,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " component_1 component_2\n",
+ "user_id \n",
+ "2952 -6.801425 -1.905464\n",
+ "635553 -2.489854 -7.583170\n",
+ "656993 6.493947 -21.433610\n",
+ "761154 -0.132166 -3.147631\n",
+ "777554 -4.050601 7.515363"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " component_1 | \n",
+ " component_2 | \n",
+ "
\n",
+ " \n",
+ " user_id | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2952 | \n",
+ " -6.801425 | \n",
+ " -1.905464 | \n",
+ "
\n",
+ " \n",
+ " 635553 | \n",
+ " -2.489854 | \n",
+ " -7.583170 | \n",
+ "
\n",
+ " \n",
+ " 656993 | \n",
+ " 6.493947 | \n",
+ " -21.433610 | \n",
+ "
\n",
+ " \n",
+ " 761154 | \n",
+ " -0.132166 | \n",
+ " -3.147631 | \n",
+ "
\n",
+ " \n",
+ " 777554 | \n",
+ " -4.050601 | \n",
+ " 7.515363 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"averages_pipeline\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"component_1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.032658089178554,\n \"min\": -6.801425455794354,\n \"max\": 6.493947126606545,\n \"samples\": [\n -2.489853942301952,\n -4.050601021086857,\n 6.493947126606545\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"component_2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10.55811005682525,\n \"min\": -21.4336102269828,\n \"max\": 7.5153633344561666,\n \"samples\": [\n -7.583170181949627,\n 7.5153633344561666,\n -21.4336102269828\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 44
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# todo: re-implement colors map and category orders\n",
+ "#averages_pipeline.plot_embeddings(fig_show=True, fig_save=False, height=350)"
+ ],
+ "metadata": {
+ "id": "PbFddVU_5-jz"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "groupby_cols = [\n",
+ " \"bot_label\", \"opinion_label\", # \"bom_overall_label\", \"bom_astroturf_label\",\n",
+ " \"toxic_label\", \"factual_label\",\n",
+ " \"fourway_label\", #\"sixway_label\",\n",
+ "]\n",
+ "\n",
+ "for groupby_col in groupby_cols:\n",
+ " color_map = COLORS_MAP[groupby_col]\n",
+ " category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}\n",
+ "\n",
+ " labels = averages_df[groupby_col]\n",
+ " pipeline = ReductionPipeline(x=averages_x, labels=labels, target=groupby_col, n_components=2)\n",
+ "\n",
+ " results_dirpath = os.path.join(RESULTS_DIRPATH, \"openai_embeddings_v2\", \"text-embedding-ada-002\", f\"status_avg_embeddings_{pipeline.reducer_type.lower()}_{pipeline.n_components}\", groupby_col)\n",
+ " os.makedirs(results_dirpath, exist_ok=True)\n",
+ "\n",
+ " pipeline.perform()\n",
+ "\n",
+ " pipeline.plot_embeddings(\n",
+ " color=groupby_col, color_map=color_map, category_orders=category_orders,\n",
+ " #hover_data=[\"user_id\", \"bot_label\"],\n",
+ " fig_show=True, fig_save=True,\n",
+ " results_dirpath=results_dirpath\n",
+ " )"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "51tmPwpfi0er",
+ "outputId": "2fcdf7f5-d912-4cf7-e202-6671ac1087ac"
+ },
+ "execution_count": 61,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n",
+ "EXPLAINED VARIANCE: 0.08\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n",
+ "EXPLAINED VARIANCE: 0.08\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n",
+ "EXPLAINED VARIANCE: 0.08\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n",
+ "EXPLAINED VARIANCE: 0.08\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "PCA(n_components=2, random_state=99)\n",
+ "EMBEDDINGS: (7566, 2)\n",
+ "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n",
+ "EXPLAINED VARIANCE: 0.08\n",
+ "LOADINGS (1536, 2)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/openai_embeddings_v2/de_duping_and_averaging_status_embeddings_(20240216).py b/notebooks/openai_embeddings_v2/de_duping_and_averaging_status_embeddings_(20240216).py
new file mode 100644
index 0000000..cd90bcd
--- /dev/null
+++ b/notebooks/openai_embeddings_v2/de_duping_and_averaging_status_embeddings_(20240216).py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+"""De-duping and Averaging Status Embeddings (20240216)
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+ https://colab.research.google.com/drive/1N-aRJ6GfO72QkOSWetqLeXYx9gq_NA8c
+
+In this notebook, we prepare a clean (de-duped) version of the status embeddings. And we re-construct user embeddings using the average of their status embeddings.
+
+This notebook saves both datasets back to drive for further analysis.
+
+## Google Drive
+"""
+
+import os
+from google.colab import drive
+
+drive.mount('/content/drive')
+print(os.getcwd(), os.listdir(os.getcwd()))
+
+# you might need to create a google drive SHORTCUT that has this same path
+# ... or update the path to use your own google drive organization
+DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'
+
+print(DIRPATH)
+os.path.isdir(DIRPATH)
+
+DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
+os.path.isdir(DATA_DIRPATH)
+
+"""## Data Loading"""
+
+from pandas import read_parquet
+
+pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip")
+statuses_df = read_parquet(pq_filepath)
+print(statuses_df.shape)
+print(statuses_df.columns)
+statuses_df.head()
+
+statuses_df["user_id"].nunique()
+
+len(statuses_df)
+
+statuses_df["status_id"].nunique()
+
+"""Oh no, statuses not unique?"""
+
+statuses_df["status_id"].value_counts()
+
+statuses_df[statuses_df["status_id"].duplicated(keep=False)].sort_values("status_id")
+
+"""The embedding values appear to be identical across the repeated rows of each status, so we can keep the first row for each status.
+
+## De-Duping
+
+183,727 statuses
+"""
+
+print(statuses_df.shape)
+statuses_df.drop_duplicates(subset=["status_id"], inplace=True)
+print(statuses_df.shape)
+
+"""Saving to drive:"""
+
+pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip")
+
+statuses_df.to_parquet(pq_filepath, compression="gzip")
+
+"""## Averaging Embeddings per User"""
+
+statuses_df.groupby("user_id")["status_id"].count()
+
+embeddings_cols = [col for col in statuses_df.columns if "openai" in col]
+print(len(embeddings_cols))
+print(embeddings_cols[0], "...", embeddings_cols[-1])
+
+averages = statuses_df.groupby("user_id")[embeddings_cols].mean()
+print(averages.shape)
+averages.head()
+
+"""Get user labels from CSV file:"""
+
+from pandas import read_csv
+
+csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz")
+users_df = read_csv(csv_filepath, compression="gzip")
+print(users_df.shape)
+print(users_df.columns)
+users_df.head()
+
+user_labels = users_df.drop(columns=embeddings_cols)
+user_labels.index = user_labels["user_id"]
+user_labels.head()
+
+"""Merge user labels columns back in:"""
+
+averages = averages.merge(user_labels, left_index=True, right_index=True)
+averages.head()
+
+"""Saving to drive:"""
+
+csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz")
+
+averages.to_csv(csv_filepath, compression="gzip")
\ No newline at end of file
diff --git a/notebooks/openai_embeddings_v2/user_vs_tweet_level_embeddings_(impeachment_2020)_dimensionality_reduction_(2024).py b/notebooks/openai_embeddings_v2/user_vs_tweet_level_embeddings_(impeachment_2020)_dimensionality_reduction_(2024).py
new file mode 100644
index 0000000..8cdbc8f
--- /dev/null
+++ b/notebooks/openai_embeddings_v2/user_vs_tweet_level_embeddings_(impeachment_2020)_dimensionality_reduction_(2024).py
@@ -0,0 +1,508 @@
+# -*- coding: utf-8 -*-
+"""User vs Tweet Level Embeddings (Impeachment 2020) - Dimensionality Reduction (2024)
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+ https://colab.research.google.com/drive/1UiL5SUTIm5V7_B6lf1ueFH97EHn9w6P0
+
+We fetched user-level and tweet-level OpenAI embeddings, stored them on BQ, and copied the data to CSV files on Drive.
+
+Then we de-duped the status embeddings, calculated the average status embedding for each user, and saved the results as CSV files on Drive.
+
+This notebook provides a preliminary analysis of user-level vs tweet-level embeddings, focusing first on dimensionality reduction.
+
+## Setup
+
+Package installation:
+"""
+
+# Commented out IPython magic to ensure Python compatibility.
+# %%capture
+# !pip install -U kaleido
+
+"""May need to restart session before continuing."""
+
+!pip list | grep kaleido
+
+"""## Google Drive"""
+
+import os
+from google.colab import drive
+
+drive.mount('/content/drive')  # Colab-only: mounts Google Drive at /content/drive
+print(os.getcwd(), os.listdir(os.getcwd()))
+
+# you might need to create a google drive SHORTCUT that has this same path
+# ... or update the path to use your own google drive organization
+#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
+#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'
+DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'
+
+print(DIRPATH)
+os.path.isdir(DIRPATH)  # True only if the shared folder (or shortcut) exists at this path
+
+"""New project-based directory structure for 2024:
+
+https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link
+"""
+
+DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")  # folder the CSV inputs below are read from
+os.path.isdir(DATA_DIRPATH)
+
+os.listdir(DATA_DIRPATH)
+
+"""The "unpacked" versions have a column per embedding, and are generally easier to work with.
+
+The files we will be working with are:
+ + "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz" (user level embeddings) and
+ + "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz" (average status-level embeddings).
+"""
+
+RESULTS_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "results")  # root folder for the saved charts
+os.makedirs(RESULTS_DIRPATH, exist_ok=True)  # create it if it does not exist yet
+os.path.isdir(RESULTS_DIRPATH)
+
+"""## Colors"""
+
+# color constants, copied from: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/colors.py
+
+#GREY = "#ccc"
+#PURPLE = "#7E57C2"
+
+# colorbrewer scales
+# each list is ordered light --> dark, so a higher index means a darker shade
+BLUES = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1', '#6baed6', '#4292c6', '#2171b5', '#08519c', '#08306b']
+REDS = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272', '#fb6a4a', '#ef3b2c', '#cb181d', '#a50f15', '#67000d']
+PURPLES = ['#fcfbfd', '#efedf5', '#dadaeb', '#bcbddc', '#9e9ac8', '#807dba', '#6a51a3', '#54278f', '#3f007d']
+GREYS = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd', '#969696', '#737373', '#525252', '#252525', '#000000']
+GREENS = ["#edf8e9","#c7e9c0","#a1d99b","#74c476","#41ab5d","#238b45","#005a32"] # 7 steps (the scales above have 9)
+ORANGES = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b', '#fd8d3c', '#f16913', '#d94801', '#a63603', '#7f2704']
+BROWNS = ["#C46200", "#964B00"]
+RD_PU = ["#feebe2","#fcc5c0","#fa9fb5","#f768a1","#dd3497","#ae017e","#7a0177"] # 7 steps
+PU_RD = ["#f1eef6","#d4b9da","#c994c7","#df65b0","#e7298a","#ce1256","#91003f"] # 7 steps
+
+# per-label color maps: {label value: hex color}
+OPINION_COLORS_MAP = {"Anti-Trump": BLUES[5], "Pro-Trump": REDS[5]}
+BOT_COLORS_MAP = {"Human": GREYS[3], "Bot": PURPLES[6]}
+Q_COLORS_MAP = {"Normal": GREYS[3], "Q-anon": REDS[6]}
+TOXIC_COLORS_MAP = {"Toxic": BROWNS[1], "Normal": GREYS[3]}
+FACT_COLORS_MAP = {"High Quality": GREYS[3], "Low Quality": RD_PU[4]}
+
+FOURWAY_COLORS_MAP = {
+ "Anti-Trump Human": BLUES[3],
+ "Anti-Trump Bot": BLUES[6],
+
+ "Pro-Trump Human": REDS[3],
+ "Pro-Trump Bot": REDS[6],
+}
+SIXWAY_COLORS_MAP = {
+ "Anti-Trump Human": BLUES[3],
+ "Anti-Trump Bot": BLUES[6],
+
+ "Pro-Trump Human": REDS[3],
+ "Pro-Trump Bot": REDS[6],
+
+ "Q-anon Human": REDS[4], # "Pro-Trump Q-anon Human"
+ "Q-anon Bot": REDS[7], # "Pro-Trump Q-anon Bot"
+}
+
+
+COLORS_MAP = { # label column name --> color map for that column's values
+ "bot_label": BOT_COLORS_MAP,
+ "opinion_label": OPINION_COLORS_MAP,
+ "q_label": Q_COLORS_MAP,
+ "toxic_label": TOXIC_COLORS_MAP,
+ "factual_label": FACT_COLORS_MAP,
+
+ "fourway_label": FOURWAY_COLORS_MAP,
+ "sixway_label": SIXWAY_COLORS_MAP,
+ "bom_overall_label": BOT_COLORS_MAP, # the botometer label columns reuse the bot colors
+ "bom_astroturf_label": BOT_COLORS_MAP,
+}
+
+
+BOT_LABEL_ORDER = ["Human", "Bot"]
+CATEGORY_ORDERS = { # label column name --> ordering of that column's values
+ "bot_label": BOT_LABEL_ORDER,
+ "bom_overall_label": BOT_LABEL_ORDER,
+ "bom_astroturf_label": BOT_LABEL_ORDER,
+ "opinion_label": ["Anti-Trump", "Pro-Trump"],
+ "q_label": ["Normal", "Q-anon"],
+
+ "toxic_label": ["Normal", "Toxic"],
+ "factual_label": ["High Quality", "Low Quality"],
+
+ "fourway_label": list(FOURWAY_COLORS_MAP.keys()), # order follows the color map's insertion order
+ "sixway_label": list(SIXWAY_COLORS_MAP.keys()),
+}
+
+"""## Dimensionality Reduction"""
+
+#import warnings
+#warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") # suppress umap warnings https://github.com/slundberg/shap/issues/2909
+#warnings.simplefilter("ignore", DeprecationWarning) # suppress warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning) https://discuss.python.org/t/how-to-silence-pkg-resources-warnings/28629/7
+
+import os
+
+import numpy as np
+from pandas import DataFrame
+import plotly.express as px
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+N_COMPONENTS = 2 # default n_components for ReductionPipeline
+#REDUCTION_RESULTS_DIRPATH = "results"
+FIG_SHOW = True # default for plot_embeddings(fig_show=...)
+FIG_SAVE = False # default for plot_embeddings(fig_save=...)
+
+class ReductionPipeline:
+ # adapted from: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/reduction/pipeline.py
+
+ def __init__(self, x, labels, target, n_components=N_COMPONENTS, reducer_type="PCA", #results_dirpath=None
+ ):
+ """
+
+ """
+
+ self.x = x.copy()
+ self.labels = labels.copy()
+ self.target = target
+
+ self.reducer_type = reducer_type
+ self.reducer_name = {"PCA": "pca", "T-SNE": "tsne", "UMAP": "umap"}[self.reducer_type]
+
+ self.x_scaled = (self.x - self.x.mean(axis=0)) / self.x.std(axis=0)
+ #scaler = StandardScaler()
+ #self.x_scaled = scaler.fit_transform(self.x)
+
+ self.n_components = n_components
+ self.component_names = [f"component_{i+1}" for i in range(self.n_components)]
+
+ #self.results_dirpath = results_dirpath or f"results_pca_{self.n_components}"
+ #os.makedirs(self.results_dirpath, exist_ok=True)
+
+
+ def perform(self):
+ self.pca = PCA(n_components=self.n_components, random_state=99)
+ print(self.pca)
+
+ embeddings = self.pca.fit_transform(self.x_scaled)
+ print("EMBEDDINGS:", embeddings.shape)
+ self.embeddings_df = DataFrame(embeddings, columns=self.component_names, index=self.x.index)
+
+ print("EXPLAINED VARIANCE RATIO:", self.pca.explained_variance_ratio_)
+ print("EXPLAINED VARIANCE:", self.pca.explained_variance_ratio_.sum().round(2))
+
+ # https://stackoverflow.com/questions/21217710/factor-loadings-using-sklearn/44728692#44728692
+ loadings = self.pca.components_.T * np.sqrt(self.pca.explained_variance_)
+ print("LOADINGS", loadings.shape)
+ self.loadings_df = DataFrame(loadings, columns=self.component_names, index=self.pca.feature_names_in_)
+
+
+ #def plot_embeddings(self, fig_show=True, fig_save=False, height=350, labels=None, hover_data=None):
+ #
+ # labels = labels or self.labels
+ #
+ # chart_df = self.embeddings_df.copy()
+ # chart_df = chart_df.merge(self.labels, left_index=True, right_index=True) # ADD TARGET BACK FOR COLOR (ASSUMES INDEX IS the SAME)
+ # #chart_df = chart_df.merge(self.x, left_index=True, right_index=True) # ADD aLL DATA BACK SO WE CAN INSPECT FEATURES AS WELL
+ # #chart_df.sort_values(by=self.target, inplace=True)
+ #
+ # fig = None
+ # if self.n_components == 2:
+ # fig = px.scatter(chart_df, x="component_1", y="component_2",
+ # color=self.target, height=height,
+ # title="PCA Embeddings (n_components=2)",
+ # #hover_data=self.x.columns.tolist() #["gender", "island", "body_mass_g"]
+ # hover_data=hover_data
+ # )
+ # elif self.n_components == 3:
+ # fig = px.scatter_3d(chart_df, x="component_1", y="component_2", z="component_3",
+ # color=self.target, height=height,
+ # title="PCA Embeddings (n_components=3)",
+ # #hover_data=self.x.columns.tolist() # ["gender", "island", "body_mass_g"]
+ # )
+ #
+ # if fig and fig_show:
+ # fig.show()
+ #
+ # if fig and fig_save:
+ # html_filepath = os.path.join(self.results_filepath, f"features.html")
+ # fig.write_html(html_filepath)
+ #
+ # png_filepath = os.path.join(self.results_filepath, f"features.png")
+ # fig.write_image(png_filepath)
+
+
+
+ def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, results_dirpath=None,
+ subtitle=None, text=None, size=None, hover_data=None,
+ color=None, color_map=None, color_scale=None, category_orders=None):
+
+ chart_df = self.embeddings_df.copy()
+ chart_df = chart_df.merge(self.labels, left_index=True, right_index=True) # ADD TARGET BACK FOR COLOR (ASSUMES INDEX IS the SAME)
+ #chart_df = chart_df.merge(self.x, left_index=True, right_index=True) # ADD aLL DATA BACK SO WE CAN INSPECT FEATURES AS WELL
+ #chart_df.sort_values(by=self.target, inplace=True)
+
+ title = f"Dimensionality Reduction Results ({self.reducer_type} n_components={self.n_components})"
+ if subtitle:
+ title += f"
{subtitle}"
+
+ chart_params = dict(x="component_1", y="component_2",
+ title=title, height=height,
+ #color=color, #"artist_name",
+ hover_data= hover_data #{"index": (self.embeddings_df.index)} #hover_data #["audio_filename", "track_number"]
+ )
+ if color:
+ chart_params["color"] = color
+ if color_map:
+ chart_params["color_discrete_map"] = color_map
+ if color_scale:
+ chart_params["color_continuous_scale"] = color_scale
+ if category_orders:
+ chart_params["category_orders"] = category_orders
+ if hover_data:
+ chart_params["hover_data"] = hover_data
+ if size:
+ chart_params["size"] = size
+ if text:
+ chart_params["text"] = text
+
+ if self.n_components == 2:
+ fig = px.scatter(chart_df, **chart_params)
+ elif self.n_components == 3:
+ chart_params["z"] = "component_3"
+ fig = px.scatter_3d(chart_df, **chart_params)
+ else:
+ return None
+
+ if fig_show:
+ fig.show()
+
+ if fig_save:
+ results_dirpath = results_dirpath or self.results_dirpath
+ filestem = os.path.join(results_dirpath, f"{self.reducer_name}_{self.n_components}")
+ fig.write_image(f"{filestem}.png")
+ fig.write_html(f"{filestem}.html")
+
+ return fig
+
+"""## User Embeddings
+
+7566 users
+
+### Loading
+
+Loading CSV from drive:
+"""
+
+from pandas import read_csv
+
+csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz")
+users_df = read_csv(csv_filepath, compression="gzip")
+print(users_df.shape)
+print(users_df.columns)
+users_df.head()
+
+users_df["user_id"].nunique() # compare with the row count printed above to check for duplicate users
+
+users_df["is_bot"].value_counts()
+
+users_df["opinion_community"].value_counts()
+
+users_df["avg_fact_score"].info() # shows the non-null count; add_labels() below leaves null scores as nulls
+
+from pandas import isnull
+
+def add_labels(users_df):
+ # APPLY SAME LABELS AS THE ORIGINAL SOURCE CODE
+ # https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51
+ # https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64
+
+ # labels:
+ users_df["opinion_label"] = users_df["opinion_community"].map({0:"Anti-Trump", 1:"Pro-Trump"})
+ users_df["bot_label"] = users_df["is_bot"].map({True:"Bot", False:"Human"})
+ users_df["fourway_label"] = users_df["opinion_label"] + " " + users_df["bot_label"]
+
+ # language toxicity scores (0 low - 1 high)
+ toxic_threshold = 0.1
+ users_df["is_toxic"] = users_df["avg_toxicity"] >= toxic_threshold
+ users_df["is_toxic"] = users_df["is_toxic"].map({True: 1, False :0 })
+ users_df["toxic_label"] = users_df["is_toxic"].map({1: "Toxic", 0 :"Normal" })
+
+ # fact check / media quality scores (1 low - 5 high)
+ # there are null avg_fact_score, so we only apply operation if not null, and leave nulls
+ fact_threshold = 3.0
+ users_df["is_factual"] = users_df["avg_fact_score"].apply(lambda score: score if isnull(score) else score >= fact_threshold)
+ users_df["is_factual"] = users_df["is_factual"].map({True: 1, False :0 })
+ users_df["factual_label"] = users_df["is_factual"].map({1: "High Quality", 0 :"Low Quality" })
+
+ # botometer binary and labels:
+ users_df["is_bom_overall"] = users_df["bom_overall"].round()
+ users_df["is_bom_astroturf"] = users_df["bom_astroturf"].round()
+ users_df["bom_overall_label"] = users_df["is_bom_overall"].map({1:"Bot", 0:"Human"})
+ users_df["bom_astroturf_label"] = users_df["is_bom_astroturf"].map({1:"Bot", 0:"Human"})
+ users_df["bom_overall_fourway_label"] = users_df["opinion_label"] + " " + users_df["bom_overall_label"]
+ users_df["bom_astroturf_fourway_label"] = users_df["opinion_label"] + " " + users_df["bom_astroturf_label"]
+
+ return users_df
+
+
+users_df = add_labels(users_df) # add_labels mutates and returns the same frame
+print(users_df.shape)
+print(users_df.columns.tolist())
+users_df.head()
+
+users_df["is_factual"].value_counts() # value_counts() leaves out null values by default
+
+users_df["factual_label"].value_counts()
+
+users_df["is_toxic"].value_counts()
+
+users_df["toxic_label"].value_counts()
+
+users_df["bot_label"].value_counts()
+
+users_df["opinion_label"].value_counts()
+
+users_df["fourway_label"].value_counts()
+
+"""### Splitting"""
+
+users_df.index = users_df["user_id"] # user_id becomes the index and also remains a column
+
+embeddings_cols = [col for col in users_df.columns if "openai" in col] # embedding columns are picked by "openai" appearing in the column name
+print(len(embeddings_cols))
+print(embeddings_cols[0], "...", embeddings_cols[-1])
+
+users_x = users_df[embeddings_cols] # feature matrix: embedding columns only, indexed by user_id
+users_x.head()
+
+#user_labels = users_df.drop(columns=embeddings_cols)
+#print(user_labels.columns.tolist())
+#user_labels.head()
+
+"""### PCA 2"""
+
+# /usr/local/lib/python3.10/dist-packages/plotly/express/_core.py:1223:
+# PerformanceWarning: DataFrame is highly fragmented.
+# This is usually the result of calling `frame.insert` many times, which has poor performance.
+# Consider joining all columns at once using pd.concat(axis=1) instead.
+# To get a de-fragmented frame, use `newframe = frame.copy()`
+# df_output[col_name] = to_unindexed_series(df_input[argument])
+
+target = "fourway_label" #@param ["bot_label", "opinion_label", "fourway_label", "toxic_label", "is_factual"]
+user_labels = users_df[target]
+user_labels
+
+users_pipeline = ReductionPipeline(x=users_x, labels=user_labels, target=target, n_components=2)
+
+users_pipeline.perform()
+
+users_pipeline.embeddings_df.head()
+
+
+
+# todo: re-implement colors map and category orders
+#users_pipeline.plot_embeddings(fig_show=False, fig_save=False, height=350, )
+
+color_map = COLORS_MAP[target]
+category_orders = {target: CATEGORY_ORDERS[target]}
+
+users_pipeline.plot_embeddings(fig_show=False, fig_save=False, height=350,
+ color=target, color_map=color_map, category_orders=category_orders
+)
+
+groupby_cols = [
+ "bot_label", "opinion_label", # "bom_overall_label", "bom_astroturf_label",
+ "toxic_label", "factual_label",
+ "fourway_label", #"sixway_label",
+]
+
+for groupby_col in groupby_cols:
+ color_map = COLORS_MAP[groupby_col]
+ category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}
+
+ labels = users_df[groupby_col]
+ pipeline = ReductionPipeline(x=users_x, labels=labels, target=groupby_col, n_components=2)
+
+ results_dirpath = os.path.join(RESULTS_DIRPATH, "openai_embeddings_v2", "text-embedding-ada-002", f"user_embeddings_{pipeline.reducer_type.lower()}_{pipeline.n_components}", groupby_col) # one output folder per label column
+ os.makedirs(results_dirpath, exist_ok=True)
+
+ pipeline.perform() # the reduction is re-fit on the same users_x every iteration; only the coloring differs
+
+ pipeline.plot_embeddings(
+ color=groupby_col, color_map=color_map, category_orders=category_orders,
+ #hover_data=["user_id", "bot_label"],
+ fig_show=True, fig_save=True,
+ results_dirpath=results_dirpath
+ )
+
+"""## Tweet Embeddings (User Averages)
+
+183K statuses, averaged for each user (see prior notebook), resulting in 7566 rows.
+
+### Loading
+"""
+
+from pandas import read_csv
+
+csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz")
+averages_df = read_csv(csv_filepath) # no compression argument here; pandas infers it from the file extension by default
+print(averages_df.shape)
+print(averages_df.columns)
+averages_df.index = averages_df["user_id"] # user_id becomes the index and also remains a column
+averages_df.head()
+
+averages_df["user_id"].nunique()
+
+len(averages_df) # equals the nunique() above when there is exactly one row per user
+
+averages_df = add_labels(averages_df) # same label columns as applied to users_df
+print(averages_df.shape)
+print(averages_df.columns.tolist())
+averages_df.head()
+
+"""### Splitting"""
+
+averages_x = averages_df[embeddings_cols] # reuses embeddings_cols computed from users_df in the "User Embeddings" section
+averages_x
+
+"""### PCA 2"""
+
+averages_labels = averages_df[target] # reuses the `target` chosen in the "User Embeddings" PCA cell
+averages_labels
+
+averages_pipeline = ReductionPipeline(x=averages_x, labels=averages_labels, target=target, n_components=2)
+
+averages_pipeline.perform()
+
+averages_pipeline.embeddings_df.head()
+
+# todo: re-implement colors map and category orders
+#averages_pipeline.plot_embeddings(fig_show=True, fig_save=False, height=350)
+
+groupby_cols = [
+ "bot_label", "opinion_label", # "bom_overall_label", "bom_astroturf_label",
+ "toxic_label", "factual_label",
+ "fourway_label", #"sixway_label",
+]
+
+for groupby_col in groupby_cols:
+ color_map = COLORS_MAP[groupby_col]
+ category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}
+
+ labels = averages_df[groupby_col]
+ pipeline = ReductionPipeline(x=averages_x, labels=labels, target=groupby_col, n_components=2)
+
+ results_dirpath = os.path.join(RESULTS_DIRPATH, "openai_embeddings_v2", "text-embedding-ada-002", f"status_avg_embeddings_{pipeline.reducer_type.lower()}_{pipeline.n_components}", groupby_col) # one output folder per label column
+ os.makedirs(results_dirpath, exist_ok=True)
+
+ pipeline.perform() # the reduction is re-fit on the same averages_x every iteration; only the coloring differs
+
+ pipeline.plot_embeddings(
+ color=groupby_col, color_map=color_map, category_orders=category_orders,
+ #hover_data=["user_id", "bot_label"],
+ fig_show=True, fig_save=True,
+ results_dirpath=results_dirpath
+ )
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.html
new file mode 100644
index 0000000..6bfc7a8
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.png
new file mode 100644
index 0000000..ce05d91
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.html
new file mode 100644
index 0000000..3e878bb
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.png
new file mode 100644
index 0000000..b2b4821
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.html
new file mode 100644
index 0000000..640a70a
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.png
new file mode 100644
index 0000000..e48e6e3
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.html
new file mode 100644
index 0000000..31a5fef
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.png
new file mode 100644
index 0000000..9bdd6ae
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.html
new file mode 100644
index 0000000..6a04644
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.png
new file mode 100644
index 0000000..3526c27
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.html
new file mode 100644
index 0000000..887d834
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.png
new file mode 100644
index 0000000..2532d9d
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.html
new file mode 100644
index 0000000..abcba28
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.png
new file mode 100644
index 0000000..6bd3ef3
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.html
new file mode 100644
index 0000000..6b8b863
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.png
new file mode 100644
index 0000000..99b0e01
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.html
new file mode 100644
index 0000000..c95c852
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.png
new file mode 100644
index 0000000..cc35780
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.png differ
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.html
new file mode 100644
index 0000000..eb4275a
--- /dev/null
+++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.png
new file mode 100644
index 0000000..453f054
Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.png differ