diff --git a/notebooks/openai_embeddings_v2/De_duping_and_Averaging_Status_Embeddings_(20240216).ipynb b/notebooks/openai_embeddings_v2/De_duping_and_Averaging_Status_Embeddings_(20240216).ipynb new file mode 100644 index 0000000..c1b7718 --- /dev/null +++ b/notebooks/openai_embeddings_v2/De_duping_and_Averaging_Status_Embeddings_(20240216).ipynb @@ -0,0 +1,2724 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "In this notebook, we prepare a clean (de-duped) version of the status embeddings. And we re-construct user embeddings using the average of their status embeddings.\n", + "\n", + "This notebook saves both datasets back to drive for further analysis." + ], + "metadata": { + "id": "ec7rVSPxy567" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Google Drive" + ], + "metadata": { + "id": "Rrp_6meLhC7G" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from google.colab import drive\n", + "\n", + "drive.mount('/content/drive')\n", + "print(os.getcwd(), os.listdir(os.getcwd()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1_eMFG-JgS0r", + "outputId": "476e8c07-952e-4dad-c5ee-6348d744668b" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "/content ['.config', 'drive', 'sample_data']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# you might need to create a google drive SHORTCUT that has this same path\n", + "# ... 
or update the path to use your own google drive organization\n", + "DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'\n", + "\n", + "print(DIRPATH)\n", + "os.path.isdir(DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "x0kkbd6agWrY", + "outputId": "8b801e86-7bd0-43f5-c478-22c781bda63d" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/drive/MyDrive/Research/DS Research Shared 2024\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "DATA_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"data\")\n", + "os.path.isdir(DATA_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1ShYrv5_gYcs", + "outputId": "da03ac2a-52b9-4ae8-de9e-d340b3dc12f3" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Data Loading" + ], + "metadata": { + "id": "YEJ6uTOjg0Uo" + } + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 728 + }, + "id": "yXVSfTL6gQ5K", + "outputId": "9e8e977b-768e-4308-ad47-65e179965ae4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(183815, 1541)\n", + "Index(['user_id', 'status_id', 'status_text', 'created_at', 'embeds_length',\n", + " 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',\n", + " ...\n", + " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n", + " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n", + " 'openai_1534', 'openai_1535'],\n", + " dtype='object', 
length=1541)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id status_id \\\n", + "0 897845802701377536 1221540755451392001 \n", + "1 935739601301458947 1223458629837295619 \n", + "2 571774622 1217445781663363072 \n", + "3 384679808 1223705594818748416 \n", + "4 701264221653217281 1218459840277729281 \n", + "\n", + " status_text \\\n", + "0 Doubt it..It appears they all have gone the wa... \n", + "1 RT @Wyn1745: Democrats are ‘setting the stage’... \n", + "2 RT @sarahdwire: I’m loathe to insert myself in... \n", + "3 RT @RepRatcliffe: We warned them...As Schiff a... \n", + "4 RT @chipfranklin: Because \"impeachment\" in the... \n", + "\n", + " created_at embeds_length openai_0 openai_1 openai_2 \\\n", + "0 2020-01-26 21:09:45+00:00 1536 -0.020428 -0.006720 0.007308 \n", + "1 2020-02-01 04:10:42+00:00 1536 -0.036689 -0.007481 0.007968 \n", + "2 2020-01-15 13:57:48+00:00 1536 -0.033382 -0.006886 -0.003244 \n", + "3 2020-02-01 20:32:03+00:00 1536 -0.008477 -0.007364 0.000919 \n", + "4 2020-01-18 09:07:18+00:00 1536 -0.009454 0.017376 0.007016 \n", + "\n", + " openai_3 openai_4 ... openai_1526 openai_1527 openai_1528 \\\n", + "0 -0.022157 -0.041841 ... 0.014616 0.004705 0.012661 \n", + "1 -0.006632 -0.022805 ... -0.001696 0.002522 0.020397 \n", + "2 -0.015834 0.000172 ... 0.001027 0.002464 0.002013 \n", + "3 -0.006435 0.008101 ... -0.028269 0.003193 0.015056 \n", + "4 -0.020075 -0.023674 ... 
-0.013590 0.015564 0.005130 \n", + "\n", + " openai_1529 openai_1530 openai_1531 openai_1532 openai_1533 \\\n", + "0 -0.020974 -0.003458 0.045166 0.029871 -0.021186 \n", + "1 -0.046374 -0.046611 0.021068 -0.000085 -0.003701 \n", + "2 -0.032766 -0.034265 0.006545 0.014804 0.003027 \n", + "3 -0.015333 -0.028137 0.032510 0.010327 -0.013621 \n", + "4 0.003077 -0.029167 0.015523 0.017914 -0.008789 \n", + "\n", + " openai_1534 openai_1535 \n", + "0 -0.003376 -0.024937 \n", + "1 -0.015370 -0.019213 \n", + "2 -0.001518 -0.030946 \n", + "3 -0.007686 -0.016216 \n", + "4 -0.019767 -0.042353 \n", + "\n", + "[5 rows x 1541 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idstatus_idstatus_textcreated_atembeds_lengthopenai_0openai_1openai_2openai_3openai_4...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
08978458027013775361221540755451392001Doubt it..It appears they all have gone the wa...2020-01-26 21:09:45+00:001536-0.020428-0.0067200.007308-0.022157-0.041841...0.0146160.0047050.012661-0.020974-0.0034580.0451660.029871-0.021186-0.003376-0.024937
19357396013014589471223458629837295619RT @Wyn1745: Democrats are ‘setting the stage’...2020-02-01 04:10:42+00:001536-0.036689-0.0074810.007968-0.006632-0.022805...-0.0016960.0025220.020397-0.046374-0.0466110.021068-0.000085-0.003701-0.015370-0.019213
25717746221217445781663363072RT @sarahdwire: I’m loathe to insert myself in...2020-01-15 13:57:48+00:001536-0.033382-0.006886-0.003244-0.0158340.000172...0.0010270.0024640.002013-0.032766-0.0342650.0065450.0148040.003027-0.001518-0.030946
33846798081223705594818748416RT @RepRatcliffe: We warned them...As Schiff a...2020-02-01 20:32:03+00:001536-0.008477-0.0073640.000919-0.0064350.008101...-0.0282690.0031930.015056-0.015333-0.0281370.0325100.010327-0.013621-0.007686-0.016216
47012642216532172811218459840277729281RT @chipfranklin: Because \"impeachment\" in the...2020-01-18 09:07:18+00:001536-0.0094540.0173760.007016-0.020075-0.023674...-0.0135900.0155640.0051300.003077-0.0291670.0155230.017914-0.008789-0.019767-0.042353
\n", + "

5 rows × 1541 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "statuses_df" + } + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "from pandas import read_parquet\n", + "\n", + "pq_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip\")\n", + "statuses_df = read_parquet(pq_filepath)\n", + "print(statuses_df.shape)\n", + "print(statuses_df.columns)\n", + "statuses_df.head()" + ] + }, + { + "cell_type": "code", + "source": [ + "statuses_df[\"user_id\"].nunique()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NGVktpyCkgJM", + "outputId": "b82250ad-387e-462e-87dc-fdcc26fdefbd" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "7566" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "len(statuses_df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H3GDRXLees44", + "outputId": "eff5b6a0-37b5-4472-a24e-74d7b6e24bc0" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "183815" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "statuses_df[\"status_id\"].nunique()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fJ5VxZ7sep0o", + "outputId": "4357116c-8b4b-4f76-81ad-b945ad9c29bf" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "183727" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Oh no, statuses not unique?" 
+ ], + "metadata": { + "id": "exkSsNEUe7ku" + } + }, + { + "cell_type": "code", + "source": [ + "statuses_df[\"status_id\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NlpK3H50fGYd", + "outputId": "73a1f533-5705-4ae0-f6ef-f03b2c7c3f5b" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1234905353650761728 6\n", + "1209143341901737984 3\n", + "1209173027772076033 3\n", + "1207894148151308289 2\n", + "1217603880453718016 2\n", + " ..\n", + "1216442996260003840 1\n", + "1225979782745272325 1\n", + "1206336484170702849 1\n", + "1239320120071200771 1\n", + "1222940911023333376 1\n", + "Name: status_id, Length: 183727, dtype: Int64" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "statuses_df[statuses_df[\"status_id\"].duplicated(keep=False)].sort_values(\"status_id\")" + ], + "metadata": { + "id": "rNIISrzo5Wmo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The embeddings values appear to be the same for each status, so we can take the first row for each status." 
+ ], + "metadata": { + "id": "AtVp6ErB5a78" + } + }, + { + "cell_type": "markdown", + "source": [ + "## De-Duping" + ], + "metadata": { + "id": "09UuKDRUg4B4" + } + }, + { + "cell_type": "markdown", + "source": [ + "183,727 statuses" + ], + "metadata": { + "id": "XyicPnJXg5ZE" + } + }, + { + "cell_type": "code", + "source": [ + "print(statuses_df.shape)\n", + "statuses_df.drop_duplicates(subset=[\"status_id\"], inplace=True)\n", + "print(statuses_df.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dIdPEknwhLIS", + "outputId": "1cb9f0d0-8e57-4927-bb0f-065f7e9d5f31" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(183815, 1541)\n", + "(183727, 1541)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Saving to drive:" + ], + "metadata": { + "id": "fX0pkB_5i5y1" + } + }, + { + "cell_type": "code", + "source": [ + "pq_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip\")\n", + "\n", + "statuses_df.to_parquet(pq_filepath, compression=\"gzip\")" + ], + "metadata": { + "id": "e1RHJx9ni0g6" + }, + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Averaging Embeddings per User" + ], + "metadata": { + "id": "dYPPWlVfg5y4" + } + }, + { + "cell_type": "code", + "source": [ + "statuses_df.groupby(\"user_id\")[\"status_id\"].count()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6QmU71bJg8xW", + "outputId": "491a0382-f841-48dc-b079-b272168eb167" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "user_id\n", + "2952 6\n", + "635553 12\n", + "656993 1\n", + "761154 4\n", + "777554 1\n", + " ..\n", + "1234200349600288772 50\n", + "1234846911028453376 1\n", + "1237940420136456192 4\n", + "1238854780191195136 1\n", + "1240138605726760962 
1\n", + "Name: status_id, Length: 7566, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "embeddings_cols = [col for col in statuses_df.columns if \"openai\" in col]\n", + "print(len(embeddings_cols))\n", + "print(embeddings_cols[0], \"...\", embeddings_cols[-1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n_cGPpiKg2Py", + "outputId": "4fada281-c3e3-4d99-c8da-22c17f861bf5" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1536\n", + "openai_0 ... openai_1535\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "averages = statuses_df.groupby(\"user_id\")[embeddings_cols].mean()\n", + "print(averages.shape)\n", + "averages.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 305 + }, + "id": "5befd8PGiJxQ", + "outputId": "5f5f977d-1b36-4e4e-ac58-262bbb4f7f95" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1536)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 openai_6 \\\n", + "user_id \n", + "2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 0.000960 \n", + "635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 -0.011840 \n", + "656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 -0.015786 \n", + "761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 -0.007047 \n", + "777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 0.019815 \n", + "\n", + " openai_7 openai_8 openai_9 ... openai_1526 openai_1527 \\\n", + "user_id ... \n", + "2952 -0.015426 -0.006430 0.001027 ... -0.012285 0.001094 \n", + "635553 -0.010581 -0.010859 -0.003771 ... -0.005960 -0.007866 \n", + "656993 0.008556 -0.022145 -0.017026 ... 
-0.018110 0.007116 \n", + "761154 -0.024175 0.001368 0.002065 ... 0.013326 -0.020819 \n", + "777554 -0.019171 -0.017594 -0.006209 ... 0.007358 0.010696 \n", + "\n", + " openai_1528 openai_1529 openai_1530 openai_1531 openai_1532 \\\n", + "user_id \n", + "2952 0.015767 -0.026536 -0.024981 0.015113 0.018588 \n", + "635553 0.010948 -0.021376 -0.023424 0.020705 0.005084 \n", + "656993 -0.004877 -0.032427 -0.023885 -0.000715 0.003886 \n", + "761154 0.007364 -0.016794 -0.049548 0.013037 0.024798 \n", + "777554 0.008784 -0.024808 -0.008042 0.011077 0.001996 \n", + "\n", + " openai_1533 openai_1534 openai_1535 \n", + "user_id \n", + "2952 -0.002324 -0.003782 -0.028532 \n", + "635553 -0.011961 -0.003258 -0.026262 \n", + "656993 -0.024242 0.003839 -0.048883 \n", + "761154 -0.008543 0.006142 -0.035867 \n", + "777554 -0.001104 -0.019460 -0.030301 \n", + "\n", + "[5 rows x 1536 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openai_0openai_1openai_2openai_3openai_4openai_5openai_6openai_7openai_8openai_9...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
user_id
2952-0.0238160.0020040.004429-0.019361-0.0098600.0048780.000960-0.015426-0.0064300.001027...-0.0122850.0010940.015767-0.026536-0.0249810.0151130.018588-0.002324-0.003782-0.028532
635553-0.030022-0.0060630.017259-0.018501-0.0085360.004416-0.011840-0.010581-0.010859-0.003771...-0.005960-0.0078660.010948-0.021376-0.0234240.0207050.005084-0.011961-0.003258-0.026262
656993-0.0107230.0082350.004192-0.040441-0.0151720.012798-0.0157860.008556-0.022145-0.017026...-0.0181100.007116-0.004877-0.032427-0.023885-0.0007150.003886-0.0242420.003839-0.048883
761154-0.021389-0.0047470.006925-0.017395-0.0119000.018309-0.007047-0.0241750.0013680.002065...0.013326-0.0208190.007364-0.016794-0.0495480.0130370.024798-0.0085430.006142-0.035867
777554-0.009369-0.0096120.0124700.005079-0.019303-0.0104590.019815-0.019171-0.017594-0.006209...0.0073580.0106960.008784-0.024808-0.0080420.0110770.001996-0.001104-0.019460-0.030301
\n", + "

5 rows × 1536 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "averages" + } + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Get user labels from CSV file:" + ], + "metadata": { + "id": "kqcnjUEskVsQ" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\")\n", + "users_df = read_csv(csv_filepath, compression=\"gzip\")\n", + "print(users_df.shape)\n", + "print(users_df.columns)\n", + "users_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 416 + }, + "id": "QzbQbChmkGxa", + "outputId": "87b480ea-a332-42c8-acb6-517e3f196b35" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1547)\n", + "Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',\n", + " 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',\n", + " 'bom_astroturf',\n", + " ...\n", + " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n", + " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n", + " 'openai_1534', 'openai_1535'],\n", + " dtype='object', length=1547)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n", + "0 0 False 0.056113 1.983193 0.295 ... \n", + "1 0 False 0.456710 NaN 0.580 ... \n", + "2 0 False 0.069860 3.401786 0.970 ... \n", + "3 1 False 0.044264 2.304511 0.580 ... 
\n", + "4 0 False 0.049325 4.714286 0.355 ... \n", + "\n", + " openai_1526 openai_1527 openai_1528 openai_1529 openai_1530 \\\n", + "0 -0.001867 -0.013167 0.020885 -0.022568 -0.033631 \n", + "1 0.017651 -0.009439 0.024375 -0.032553 -0.042185 \n", + "2 -0.026273 -0.008139 0.030285 -0.029902 -0.030887 \n", + "3 -0.005520 -0.005288 0.017071 -0.033637 -0.040202 \n", + "4 0.009959 0.004695 0.005555 -0.012851 -0.032229 \n", + "\n", + " openai_1531 openai_1532 openai_1533 openai_1534 openai_1535 \n", + "0 0.016153 0.024127 -0.017519 0.002636 -0.039838 \n", + "1 0.013782 0.011320 -0.014862 -0.010413 -0.020359 \n", + "2 0.022481 -0.005476 -0.016279 -0.010138 -0.021454 \n", + "3 0.041773 -0.009370 0.003352 0.009391 -0.042671 \n", + "4 0.031443 0.008163 -0.018501 -0.008724 -0.042027 \n", + "\n", + "[5 rows x 1547 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturf...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
034204362162015-08-13555540True0False0.0561131.9831930.295...-0.001867-0.0131670.020885-0.022568-0.0336310.0161530.024127-0.0175190.002636-0.039838
11081219582010-01-2422False0False0.456710NaN0.580...0.017651-0.0094390.024375-0.032553-0.0421850.0137820.011320-0.014862-0.010413-0.020359
230383086382015-02-23755665True0False0.0698603.4017860.970...-0.026273-0.0081390.030285-0.029902-0.0308870.022481-0.005476-0.016279-0.010138-0.021454
33323965362011-07-09951951True1False0.0442642.3045110.580...-0.005520-0.0052880.017071-0.033637-0.0402020.041773-0.0093700.0033520.009391-0.042671
49550825224798085122018-01-21570533True0False0.0493254.7142860.355...0.0099590.0046950.005555-0.012851-0.0322290.0314430.008163-0.018501-0.008724-0.042027
\n", + "

5 rows × 1547 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "users_df" + } + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "user_labels = users_df.drop(columns=embeddings_cols)\n", + "user_labels.index = user_labels[\"user_id\"]\n", + "user_labels.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + }, + "id": "6ljFelXnyGP6", + "outputId": "8315335f-8bbd-4291-85db-54352bea6b9e" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count \\\n", + "user_id \n", + "3420436216 3420436216 2015-08-13 555 540 \n", + "108121958 108121958 2010-01-24 2 2 \n", + "3038308638 3038308638 2015-02-23 755 665 \n", + "332396536 332396536 2011-07-09 951 951 \n", + "955082522479808512 955082522479808512 2018-01-21 570 533 \n", + "\n", + " is_bot opinion_community is_q avg_toxicity \\\n", + "user_id \n", + "3420436216 True 0 False 0.056113 \n", + "108121958 False 0 False 0.456710 \n", + "3038308638 True 0 False 0.069860 \n", + "332396536 True 1 False 0.044264 \n", + "955082522479808512 True 0 False 0.049325 \n", + "\n", + " avg_fact_score bom_astroturf bom_overall \n", + "user_id \n", + "3420436216 1.983193 0.295 0.190 \n", + "108121958 NaN 0.580 0.110 \n", + "3038308638 3.401786 0.970 0.970 \n", + "332396536 2.304511 0.580 0.750 \n", + "955082522479808512 4.714286 0.355 0.225 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturfbom_overall
user_id
342043621634204362162015-08-13555540True0False0.0561131.9831930.2950.190
1081219581081219582010-01-2422False0False0.456710NaN0.5800.110
303830863830383086382015-02-23755665True0False0.0698603.4017860.9700.970
3323965363323965362011-07-09951951True1False0.0442642.3045110.5800.750
9550825224798085129550825224798085122018-01-21570533True0False0.0493254.7142860.3550.225
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "user_labels", + "repr_error": "0" + } + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Merge user labels columns back in:" + ], + "metadata": { + "id": "7c1cNgrUkAiS" + } + }, + { + "cell_type": "code", + "source": [ + "averages = averages.merge(user_labels, left_index=True, right_index=True)\n", + "averages.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 287 + }, + "id": "yMcrnkt3yapA", + "outputId": "cdc4127c-4036-4ef2-fd2d-9f5a7ce7b022" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 openai_6 \\\n", + "user_id \n", + "2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 0.000960 \n", + "635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 -0.011840 \n", + "656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 -0.015786 \n", + "761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 -0.007047 \n", + "777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 0.019815 \n", + "\n", + " openai_7 openai_8 openai_9 ... created_on status_count \\\n", + "user_id ... \n", + "2952 -0.015426 -0.006430 0.001027 ... 2006-07-24 6 \n", + "635553 -0.010581 -0.010859 -0.003771 ... 2007-01-15 12 \n", + "656993 0.008556 -0.022145 -0.017026 ... 2007-01-17 1 \n", + "761154 -0.024175 0.001368 0.002065 ... 2007-02-09 4 \n", + "777554 -0.019171 -0.017594 -0.006209 ... 
2007-02-17 1 \n", + "\n", + " rt_count is_bot opinion_community is_q avg_toxicity \\\n", + "user_id \n", + "2952 6 False 0 False 0.006899 \n", + "635553 12 False 0 False 0.077787 \n", + "656993 1 False 0 False 0.025031 \n", + "761154 0 False 0 False 0.172311 \n", + "777554 1 False 0 False 0.001660 \n", + "\n", + " avg_fact_score bom_astroturf bom_overall \n", + "user_id \n", + "2952 NaN 0.21 0.20 \n", + "635553 NaN 0.24 0.16 \n", + "656993 NaN 0.11 0.10 \n", + "761154 NaN 0.13 0.72 \n", + "777554 NaN 0.15 0.03 \n", + "\n", + "[5 rows x 1547 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openai_0openai_1openai_2openai_3openai_4openai_5openai_6openai_7openai_8openai_9...created_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturfbom_overall
user_id
2952-0.0238160.0020040.004429-0.019361-0.0098600.0048780.000960-0.015426-0.0064300.001027...2006-07-2466False0False0.006899NaN0.210.20
635553-0.030022-0.0060630.017259-0.018501-0.0085360.004416-0.011840-0.010581-0.010859-0.003771...2007-01-151212False0False0.077787NaN0.240.16
656993-0.0107230.0082350.004192-0.040441-0.0151720.012798-0.0157860.008556-0.022145-0.017026...2007-01-1711False0False0.025031NaN0.110.10
761154-0.021389-0.0047470.006925-0.017395-0.0119000.018309-0.007047-0.0241750.0013680.002065...2007-02-0940False0False0.172311NaN0.130.72
777554-0.009369-0.0096120.0124700.005079-0.019303-0.0104590.019815-0.019171-0.017594-0.006209...2007-02-1711False0False0.001660NaN0.150.03
\n", + "

5 rows × 1547 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "averages" + } + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Saving to drive:" + ], + "metadata": { + "id": "mXoU2gHRkFCP" + } + }, + { + "cell_type": "code", + "source": [ + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz\")\n", + "\n", + "averages.to_csv(csv_filepath, compression=\"gzip\")" + ], + "metadata": { + "id": "MJwp84QQiJor" + }, + "execution_count": 21, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/openai_embeddings_v2/README.md b/notebooks/openai_embeddings_v2/README.md index f3877c9..c1e97a5 100644 --- a/notebooks/openai_embeddings_v2/README.md +++ b/notebooks/openai_embeddings_v2/README.md @@ -8,6 +8,12 @@ This supercedes earlier approach to fetch embeddings. In this second attempt we are grabbing user-level as well as tweet-level embeddings, to compare the difference in these approaches. -The "Exporting Embeddings" notebook takes embeddings stored in BigQuery (see app/openai_embeddings_v2/README.md), and exports them to CSV / parquet files on Google Drive for easier and cheaper access + 1. The "Exporting Embeddings" notebook takes embeddings stored in BigQuery (see app/openai_embeddings_v2/README.md), and exports them to CSV / parquet files on Google Drive for easier and cheaper access -The "Analysis Template" notebook provides an example of how to load the files from drive for further analysis. + + 2. The "De duping and Averaging" notebook de-duplicates status embeddings, and also calculates average tweet-level embeddings per user, and saves these CSV files to drive. + + + 3. The "Analysis Template" notebook provides an example of how to load the files from drive for further analysis. + + 4. 
The "User vs Tweet Level Embeddings" notebook performs dimensionality reduction on user embeddings vs tweet embeddings averaged for each user. The results are saved to drive, and then copied to the "results/openai_embeddings_v2" folder in this repo. diff --git a/notebooks/openai_embeddings_v2/User_vs_Tweet_Level_Embeddings_(Impeachment_2020)_Dimensionality_Reduction_(2024).ipynb b/notebooks/openai_embeddings_v2/User_vs_Tweet_Level_Embeddings_(Impeachment_2020)_Dimensionality_Reduction_(2024).ipynb new file mode 100644 index 0000000..a92895c --- /dev/null +++ b/notebooks/openai_embeddings_v2/User_vs_Tweet_Level_Embeddings_(Impeachment_2020)_Dimensionality_Reduction_(2024).ipynb @@ -0,0 +1,5606 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "fbq4scJaCrHN", + "B40ykeY2-Nmr", + "s0VpVSEB81Tt" + ], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_-39w0IS18f-" + }, + "source": [ + "We fetched user-level and tweet-level OpenAI embeddings, stored them in BigQuery (BQ), and copied the data to CSV / parquet files on Drive.\n", + "\n", + "Then we de-duped the status embeddings, calculated the average status embedding for each user, and saved the results to Drive.\n", + "\n", + "This notebook provides a preliminary analysis of user-level vs tweet-level embeddings, focusing first on dimensionality reduction." 
+ ] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup" + ], + "metadata": { + "id": "9SWh9Z5xiGUj" + } + }, + { + "cell_type": "markdown", + "source": [ + "Package installation:" + ], + "metadata": { + "id": "rwAQK1yTiHaR" + } + }, + { + "cell_type": "code", + "source": [ + "%%capture\n", + "!pip install -U kaleido" + ], + "metadata": { + "id": "ObcKYuAshyYD" + }, + "execution_count": 161, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "May need to restart session before continuing." + ], + "metadata": { + "id": "CalQUk_WiMYd" + } + }, + { + "cell_type": "code", + "source": [ + "!pip list | grep kaleido" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xf9bm0I1iJQo", + "outputId": "0c62227f-6414-4b49-ed83-eec7d081ac55" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "kaleido 0.2.1\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Google Drive" + ], + "metadata": { + "id": "FF154lGK_1N6" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from google.colab import drive\n", + "\n", + "drive.mount('/content/drive')\n", + "print(os.getcwd(), os.listdir(os.getcwd()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i_eMkJ5fpKDp", + "outputId": "d00abb25-9536-478b-da96-8684f66b3aa4" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + "/content ['.config', 'drive', 'results_pca_2', 'user_results_pca_2', 'sample_data']\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5OKjyFQ0owen", + "outputId": "9deef9fe-a8a7-4ceb-8e27-2e4e043f030a" + }, + "outputs": [ + { + 
"output_type": "stream", + "name": "stdout", + "text": [ + "/content/drive/MyDrive/Research/DS Research Shared 2024\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "# you might need to create a google drive SHORTCUT that has this same path\n", + "# ... or update the path to use your own google drive organization\n", + "#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'\n", + "#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'\n", + "DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'\n", + "\n", + "print(DIRPATH)\n", + "os.path.isdir(DIRPATH)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "New project-based directory structure for 2024:\n", + "\n", + "https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link" + ], + "metadata": { + "id": "dNCNBPJkg9St" + } + }, + { + "cell_type": "code", + "source": [ + "DATA_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"data\")\n", + "os.path.isdir(DATA_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jjkYs5KJ99LX", + "outputId": "b4cca1b9-3f29-436a-f4f4-8d593881814a" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "os.listdir(DATA_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "x9QGLQH_dUGV", + "outputId": "c300723f-3451-4011-884f-a2be599a3912" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['botometer_sample_max_50_openai_user_embeddings.csv.gz',\n", + " 'botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz',\n", + " 
'botometer_sample_max_50_openai_status_embeddings_v3.csv.gz',\n", + " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip',\n", + " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip',\n", + " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz']" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The \"unpacked\" versions have one column per embedding dimension (openai_0 through openai_1535), and are generally easier to work with.\n", + "\n", + "The files we will be working with are:\n", + " + \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\" (user-level embeddings) and\n", + " + \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz\" (average status-level embeddings)." + ], + "metadata": { + "id": "JCNrEG7vhOKo" + } + }, + { + "cell_type": "code", + "source": [ + "RESULTS_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"results\")\n", + "os.makedirs(RESULTS_DIRPATH, exist_ok=True)\n", + "os.path.isdir(RESULTS_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lHdKnZzCj5UB", + "outputId": "d67645aa-be31-4bef-b9b3-0f9465a73575" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 55 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Colors" + ], + "metadata": { + "id": "fbq4scJaCrHN" + } + }, + { + "cell_type": "code", + "source": [ + "# https://github.com/s2t2/openai-embeddings-2023/blob/main/app/colors.py\n", + "\n", + "#GREY = \"#ccc\"\n", + "#PURPLE = \"#7E57C2\"\n", + "\n", + "# colorbrewer scales\n", + "# light --> dark\n", + "BLUES = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1', '#6baed6', '#4292c6', '#2171b5', '#08519c', '#08306b']\n", + "REDS = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272', 
'#fb6a4a', '#ef3b2c', '#cb181d', '#a50f15', '#67000d']\n", + "PURPLES = ['#fcfbfd', '#efedf5', '#dadaeb', '#bcbddc', '#9e9ac8', '#807dba', '#6a51a3', '#54278f', '#3f007d']\n", + "GREYS = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd', '#969696', '#737373', '#525252', '#252525', '#000000']\n", + "GREENS = [\"#edf8e9\",\"#c7e9c0\",\"#a1d99b\",\"#74c476\",\"#41ab5d\",\"#238b45\",\"#005a32\"]\n", + "ORANGES = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b', '#fd8d3c', '#f16913', '#d94801', '#a63603', '#7f2704']\n", + "BROWNS = [\"#C46200\", \"#964B00\"]\n", + "RD_PU = [\"#feebe2\",\"#fcc5c0\",\"#fa9fb5\",\"#f768a1\",\"#dd3497\",\"#ae017e\",\"#7a0177\"]\n", + "PU_RD = [\"#f1eef6\",\"#d4b9da\",\"#c994c7\",\"#df65b0\",\"#e7298a\",\"#ce1256\",\"#91003f\"]\n", + "\n", + "OPINION_COLORS_MAP = {\"Anti-Trump\": BLUES[5], \"Pro-Trump\": REDS[5]}\n", + "BOT_COLORS_MAP = {\"Human\": GREYS[3], \"Bot\": PURPLES[6]}\n", + "Q_COLORS_MAP = {\"Normal\": GREYS[3], \"Q-anon\": REDS[6]}\n", + "TOXIC_COLORS_MAP = {\"Toxic\": BROWNS[1], \"Normal\": GREYS[3]}\n", + "FACT_COLORS_MAP = {\"High Quality\": GREYS[3], \"Low Quality\": RD_PU[4]}\n", + "\n", + "FOURWAY_COLORS_MAP = {\n", + " \"Anti-Trump Human\": BLUES[3],\n", + " \"Anti-Trump Bot\": BLUES[6],\n", + "\n", + " \"Pro-Trump Human\": REDS[3],\n", + " \"Pro-Trump Bot\": REDS[6],\n", + "}\n", + "SIXWAY_COLORS_MAP = {\n", + " \"Anti-Trump Human\": BLUES[3],\n", + " \"Anti-Trump Bot\": BLUES[6],\n", + "\n", + " \"Pro-Trump Human\": REDS[3],\n", + " \"Pro-Trump Bot\": REDS[6],\n", + "\n", + " \"Q-anon Human\": REDS[4], # \"Pro-Trump Q-anon Human\"\n", + " \"Q-anon Bot\": REDS[7], # \"Pro-Trump Q-anon Bot\"\n", + "}\n", + "\n", + "\n", + "COLORS_MAP = {\n", + " \"bot_label\": BOT_COLORS_MAP,\n", + " \"opinion_label\": OPINION_COLORS_MAP,\n", + " \"q_label\": Q_COLORS_MAP,\n", + " \"toxic_label\": TOXIC_COLORS_MAP,\n", + " \"factual_label\": FACT_COLORS_MAP,\n", + "\n", + " \"fourway_label\": FOURWAY_COLORS_MAP,\n", + " \"sixway_label\": 
SIXWAY_COLORS_MAP,\n", + " \"bom_overall_label\": BOT_COLORS_MAP,\n", + " \"bom_astroturf_label\": BOT_COLORS_MAP,\n", + "}\n", + "\n", + "\n", + "BOT_LABEL_ORDER = [\"Human\", \"Bot\"]\n", + "CATEGORY_ORDERS = {\n", + " \"bot_label\": BOT_LABEL_ORDER,\n", + " \"bom_overall_label\": BOT_LABEL_ORDER,\n", + " \"bom_astroturf_label\": BOT_LABEL_ORDER,\n", + " \"opinion_label\": [\"Anti-Trump\", \"Pro-Trump\"],\n", + " \"q_label\": [\"Normal\", \"Q-anon\"],\n", + "\n", + " \"toxic_label\": [\"Normal\", \"Toxic\"],\n", + " \"factual_label\": [\"High Quality\", \"Low Quality\"],\n", + "\n", + " \"fourway_label\": list(FOURWAY_COLORS_MAP.keys()),\n", + " \"sixway_label\": list(SIXWAY_COLORS_MAP.keys()),\n", + "}" + ], + "metadata": { + "id": "CStYodOfCtIT" + }, + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Dimensionality Reduction" + ], + "metadata": { + "id": "2oYZLq_i5lQi" + } + }, + { + "cell_type": "code", + "source": [ + "#import warnings\n", + "#warnings.filterwarnings(\"ignore\", message=\".*The 'nopython' keyword.*\") # suppress umap warnings https://github.com/slundberg/shap/issues/2909\n", + "#warnings.simplefilter(\"ignore\", DeprecationWarning) # suppress warnings.warn(\"pkg_resources is deprecated as an API\", DeprecationWarning) https://discuss.python.org/t/how-to-silence-pkg-resources-warnings/28629/7" + ], + "metadata": { + "id": "9mJcPOh66Bqj" + }, + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "from pandas import DataFrame\n", + "import plotly.express as px\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "N_COMPONENTS = 2\n", + "#REDUCTION_RESULTS_DIRPATH = \"results\"\n", + "FIG_SHOW = True\n", + "FIG_SAVE = False\n", + "\n", + "class ReductionPipeline:\n", + " # adapted from: 
https://github.com/s2t2/openai-embeddings-2023/blob/main/app/reduction/pipeline.py\n", + "\n", + " def __init__(self, x, labels, target, n_components=N_COMPONENTS, reducer_type=\"PCA\", #results_dirpath=None\n", + " ):\n", + " \"\"\"\n", + "\n", + " \"\"\"\n", + "\n", + " self.x = x.copy()\n", + " self.labels = labels.copy()\n", + " self.target = target\n", + "\n", + " self.reducer_type = reducer_type\n", + " self.reducer_name = {\"PCA\": \"pca\", \"T-SNE\": \"tsne\", \"UMAP\": \"umap\"}[self.reducer_type]\n", + "\n", + " self.x_scaled = (self.x - self.x.mean(axis=0)) / self.x.std(axis=0)\n", + " #scaler = StandardScaler()\n", + " #self.x_scaled = scaler.fit_transform(self.x)\n", + "\n", + " self.n_components = n_components\n", + " self.component_names = [f\"component_{i+1}\" for i in range(self.n_components)]\n", + "\n", + " #self.results_dirpath = results_dirpath or f\"results_pca_{self.n_components}\"\n", + " #os.makedirs(self.results_dirpath, exist_ok=True)\n", + "\n", + "\n", + " def perform(self):\n", + " self.pca = PCA(n_components=self.n_components, random_state=99)\n", + " print(self.pca)\n", + "\n", + " embeddings = self.pca.fit_transform(self.x_scaled)\n", + " print(\"EMBEDDINGS:\", embeddings.shape)\n", + " self.embeddings_df = DataFrame(embeddings, columns=self.component_names, index=self.x.index)\n", + "\n", + " print(\"EXPLAINED VARIANCE RATIO:\", self.pca.explained_variance_ratio_)\n", + " print(\"EXPLAINED VARIANCE:\", self.pca.explained_variance_ratio_.sum().round(2))\n", + "\n", + " # https://stackoverflow.com/questions/21217710/factor-loadings-using-sklearn/44728692#44728692\n", + " loadings = self.pca.components_.T * np.sqrt(self.pca.explained_variance_)\n", + " print(\"LOADINGS\", loadings.shape)\n", + " self.loadings_df = DataFrame(loadings, columns=self.component_names, index=self.pca.feature_names_in_)\n", + "\n", + "\n", + " #def plot_embeddings(self, fig_show=True, fig_save=False, height=350, labels=None, hover_data=None):\n", + " #\n", + 
" # labels = labels or self.labels\n", + " #\n", + " # chart_df = self.embeddings_df.copy()\n", + " # chart_df = chart_df.merge(self.labels, left_index=True, right_index=True) # ADD TARGET BACK FOR COLOR (ASSUMES INDEX IS the SAME)\n", + " # #chart_df = chart_df.merge(self.x, left_index=True, right_index=True) # ADD aLL DATA BACK SO WE CAN INSPECT FEATURES AS WELL\n", + " # #chart_df.sort_values(by=self.target, inplace=True)\n", + " #\n", + " # fig = None\n", + " # if self.n_components == 2:\n", + " # fig = px.scatter(chart_df, x=\"component_1\", y=\"component_2\",\n", + " # color=self.target, height=height,\n", + " # title=\"PCA Embeddings (n_components=2)\",\n", + " # #hover_data=self.x.columns.tolist() #[\"gender\", \"island\", \"body_mass_g\"]\n", + " # hover_data=hover_data\n", + " # )\n", + " # elif self.n_components == 3:\n", + " # fig = px.scatter_3d(chart_df, x=\"component_1\", y=\"component_2\", z=\"component_3\",\n", + " # color=self.target, height=height,\n", + " # title=\"PCA Embeddings (n_components=3)\",\n", + " # #hover_data=self.x.columns.tolist() # [\"gender\", \"island\", \"body_mass_g\"]\n", + " # )\n", + " #\n", + " # if fig and fig_show:\n", + " # fig.show()\n", + " #\n", + " # if fig and fig_save:\n", + " # html_filepath = os.path.join(self.results_filepath, f\"features.html\")\n", + " # fig.write_html(html_filepath)\n", + " #\n", + " # png_filepath = os.path.join(self.results_filepath, f\"features.png\")\n", + " # fig.write_image(png_filepath)\n", + "\n", + "\n", + "\n", + " def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, results_dirpath=None,\n", + " subtitle=None, text=None, size=None, hover_data=None,\n", + " color=None, color_map=None, color_scale=None, category_orders=None):\n", + "\n", + " chart_df = self.embeddings_df.copy()\n", + " chart_df = chart_df.merge(self.labels, left_index=True, right_index=True) # ADD TARGET BACK FOR COLOR (ASSUMES INDEX IS the SAME)\n", + " #chart_df = chart_df.merge(self.x, 
left_index=True, right_index=True) # ADD aLL DATA BACK SO WE CAN INSPECT FEATURES AS WELL\n", + " #chart_df.sort_values(by=self.target, inplace=True)\n", + "\n", + " title = f\"Dimensionality Reduction Results ({self.reducer_type} n_components={self.n_components})\"\n", + " if subtitle:\n", + " title += f\"
{subtitle}\"\n", + "\n", + " chart_params = dict(x=\"component_1\", y=\"component_2\",\n", + " title=title, height=height,\n", + " #color=color, #\"artist_name\",\n", + " hover_data= hover_data #{\"index\": (self.embeddings_df.index)} #hover_data #[\"audio_filename\", \"track_number\"]\n", + " )\n", + " if color:\n", + " chart_params[\"color\"] = color\n", + " if color_map:\n", + " chart_params[\"color_discrete_map\"] = color_map\n", + " if color_scale:\n", + " chart_params[\"color_continuous_scale\"] = color_scale\n", + " if category_orders:\n", + " chart_params[\"category_orders\"] = category_orders\n", + " if hover_data:\n", + " chart_params[\"hover_data\"] = hover_data\n", + " if size:\n", + " chart_params[\"size\"] = size\n", + " if text:\n", + " chart_params[\"text\"] = text\n", + "\n", + " if self.n_components == 2:\n", + " fig = px.scatter(chart_df, **chart_params)\n", + " elif self.n_components == 3:\n", + " chart_params[\"z\"] = \"component_3\"\n", + " fig = px.scatter_3d(chart_df, **chart_params)\n", + " else:\n", + " return None\n", + "\n", + " if fig_show:\n", + " fig.show()\n", + "\n", + " if fig_save:\n", + " results_dirpath = results_dirpath or self.results_dirpath\n", + " filestem = os.path.join(results_dirpath, f\"{self.reducer_name}_{self.n_components}\")\n", + " fig.write_image(f\"{filestem}.png\")\n", + " fig.write_html(f\"{filestem}.html\")\n", + "\n", + " return fig\n", + "\n" + ], + "metadata": { + "id": "hqPtK9_j5nBR" + }, + "execution_count": 58, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## User Embeddings" + ], + "metadata": { + "id": "TJUWWC48HcGk" + } + }, + { + "cell_type": "markdown", + "source": [ + "7566 users" + ], + "metadata": { + "id": "CGpJ-kDaHfi5" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading" + ], + "metadata": { + "id": "B40ykeY2-Nmr" + } + }, + { + "cell_type": "markdown", + "source": [ + "Loading CSV from drive:" + ], + "metadata": { + "id": "1TYFGOn7Ow-P" + } + }, + { + 
"cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\")\n", + "users_df = read_csv(csv_filepath, compression=\"gzip\")\n", + "print(users_df.shape)\n", + "print(users_df.columns)\n", + "users_df.head()" + ], + "metadata": { + "id": "V5m_ZmDFHeLx", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "outputId": "668fab3a-9906-45d5-98a4-e85b1fc467bb" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1547)\n", + "Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',\n", + " 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',\n", + " 'bom_astroturf',\n", + " ...\n", + " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n", + " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n", + " 'openai_1534', 'openai_1535'],\n", + " dtype='object', length=1547)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n", + "0 0 False 0.056113 1.983193 0.295 ... \n", + "1 0 False 0.456710 NaN 0.580 ... \n", + "2 0 False 0.069860 3.401786 0.970 ... \n", + "3 1 False 0.044264 2.304511 0.580 ... \n", + "4 0 False 0.049325 4.714286 0.355 ... 
\n", + "\n", + " openai_1526 openai_1527 openai_1528 openai_1529 openai_1530 \\\n", + "0 -0.001867 -0.013167 0.020885 -0.022568 -0.033631 \n", + "1 0.017651 -0.009439 0.024375 -0.032553 -0.042185 \n", + "2 -0.026273 -0.008139 0.030285 -0.029902 -0.030887 \n", + "3 -0.005520 -0.005288 0.017071 -0.033637 -0.040202 \n", + "4 0.009959 0.004695 0.005555 -0.012851 -0.032229 \n", + "\n", + " openai_1531 openai_1532 openai_1533 openai_1534 openai_1535 \n", + "0 0.016153 0.024127 -0.017519 0.002636 -0.039838 \n", + "1 0.013782 0.011320 -0.014862 -0.010413 -0.020359 \n", + "2 0.022481 -0.005476 -0.016279 -0.010138 -0.021454 \n", + "3 0.041773 -0.009370 0.003352 0.009391 -0.042671 \n", + "4 0.031443 0.008163 -0.018501 -0.008724 -0.042027 \n", + "\n", + "[5 rows x 1547 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturf...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
034204362162015-08-13555540True0False0.0561131.9831930.295...-0.001867-0.0131670.020885-0.022568-0.0336310.0161530.024127-0.0175190.002636-0.039838
11081219582010-01-2422False0False0.456710NaN0.580...0.017651-0.0094390.024375-0.032553-0.0421850.0137820.011320-0.014862-0.010413-0.020359
230383086382015-02-23755665True0False0.0698603.4017860.970...-0.026273-0.0081390.030285-0.029902-0.0308870.022481-0.005476-0.016279-0.010138-0.021454
33323965362011-07-09951951True1False0.0442642.3045110.580...-0.005520-0.0052880.017071-0.033637-0.0402020.041773-0.0093700.0033520.009391-0.042671
49550825224798085122018-01-21570533True0False0.0493254.7142860.355...0.0099590.0046950.005555-0.012851-0.0322290.0314430.008163-0.018501-0.008724-0.042027
\n", + "

5 rows × 1547 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "users_df" + } + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"user_id\"].nunique()" + ], + "metadata": { + "id": "nQGfxCyBHeIi", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bd8b1f8c-eb53-43b0-e0e9-6804cd7dbc0e" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "7566" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"is_bot\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JIwbbnB71suN", + "outputId": "d7baeb21-341b-4a5f-ff27-94ad7ed64569" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False 4466\n", + "True 3100\n", + "Name: is_bot, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"opinion_community\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Yi8Qlxi_1spO", + "outputId": "8333d45e-a469-4388-fd5c-3b558bfb5715" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 4891\n", + "1 2675\n", + "Name: opinion_community, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"avg_fact_score\"].info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dG4-L7nDeQC-", + "outputId": "7efd044d-fb1b-4e39-f2de-6019a9bbc6b8" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 7566 entries, 0 to 7565\n", + "Series name: 
avg_fact_score\n", + "Non-Null Count Dtype \n", + "-------------- ----- \n", + "3292 non-null float64\n", + "dtypes: float64(1)\n", + "memory usage: 59.2 KB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "\n", + "from pandas import isnull\n", + "\n", + "def add_labels(users_df):\n", + " # APPLY SAME LABELS AS THE ORIGINAL SOURCE CODE\n", + " # https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51\n", + " # https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64\n", + "\n", + " # labels:\n", + " users_df[\"opinion_label\"] = users_df[\"opinion_community\"].map({0:\"Anti-Trump\", 1:\"Pro-Trump\"})\n", + " users_df[\"bot_label\"] = users_df[\"is_bot\"].map({True:\"Bot\", False:\"Human\"})\n", + " users_df[\"fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bot_label\"]\n", + "\n", + " # language toxicity scores (0 low - 1 high)\n", + " toxic_threshold = 0.1\n", + " users_df[\"is_toxic\"] = users_df[\"avg_toxicity\"] >= toxic_threshold\n", + " users_df[\"is_toxic\"] = users_df[\"is_toxic\"].map({True: 1, False :0 })\n", + " users_df[\"toxic_label\"] = users_df[\"is_toxic\"].map({1: \"Toxic\", 0 :\"Normal\" })\n", + "\n", + " # fact check / media quality scores (1 low - 5 high)\n", + " # there are null avg_fact_score, so we only apply operation if not null, and leave nulls\n", + " fact_threshold = 3.0\n", + " users_df[\"is_factual\"] = users_df[\"avg_fact_score\"].apply(lambda score: score if isnull(score) else score >= fact_threshold)\n", + " users_df[\"is_factual\"] = users_df[\"is_factual\"].map({True: 1, False :0 })\n", + " users_df[\"factual_label\"] = users_df[\"is_factual\"].map({1: \"High Quality\", 0 :\"Low Quality\" })\n", + "\n", + " # botometer binary and labels:\n", + " users_df[\"is_bom_overall\"] = users_df[\"bom_overall\"].round()\n", + " users_df[\"is_bom_astroturf\"] = users_df[\"bom_astroturf\"].round()\n", 
+ " users_df[\"bom_overall_label\"] = users_df[\"is_bom_overall\"].map({1:\"Bot\", 0:\"Human\"})\n", + " users_df[\"bom_astroturf_label\"] = users_df[\"is_bom_astroturf\"].map({1:\"Bot\", 0:\"Human\"})\n", + " users_df[\"bom_overall_fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bom_overall_label\"]\n", + " users_df[\"bom_astroturf_fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bom_astroturf_label\"]\n", + "\n", + " return users_df\n", + "\n", + "\n", + "users_df = add_labels(users_df)\n", + "print(users_df.shape)\n", + "print(users_df.columns.tolist())\n", + "users_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "jK9I2mpri_ER", + "outputId": "da7541d9-8cdf-4517-bd7a-6bb1ca162d08" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1560)\n", + "['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf', 'bom_overall', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58', 'openai_59', 'openai_60', 'openai_61', 'openai_62', 'openai_63', 'openai_64', 'openai_65', 'openai_66', 'openai_67', 'openai_68', 'openai_69', 'openai_70', 
'openai_71', 'openai_72', 'openai_73', 'openai_74', 'openai_75', 'openai_76', 'openai_77', 'openai_78', 'openai_79', 'openai_80', 'openai_81', 'openai_82', 'openai_83', 'openai_84', 'openai_85', 'openai_86', 'openai_87', 'openai_88', 'openai_89', 'openai_90', 'openai_91', 'openai_92', 'openai_93', 'openai_94', 'openai_95', 'openai_96', 'openai_97', 'openai_98', 'openai_99', 'openai_100', 'openai_101', 'openai_102', 'openai_103', 'openai_104', 'openai_105', 'openai_106', 'openai_107', 'openai_108', 'openai_109', 'openai_110', 'openai_111', 'openai_112', 'openai_113', 'openai_114', 'openai_115', 'openai_116', 'openai_117', 'openai_118', 'openai_119', 'openai_120', 'openai_121', 'openai_122', 'openai_123', 'openai_124', 'openai_125', 'openai_126', 'openai_127', 'openai_128', 'openai_129', 'openai_130', 'openai_131', 'openai_132', 'openai_133', 'openai_134', 'openai_135', 'openai_136', 'openai_137', 'openai_138', 'openai_139', 'openai_140', 'openai_141', 'openai_142', 'openai_143', 'openai_144', 'openai_145', 'openai_146', 'openai_147', 'openai_148', 'openai_149', 'openai_150', 'openai_151', 'openai_152', 'openai_153', 'openai_154', 'openai_155', 'openai_156', 'openai_157', 'openai_158', 'openai_159', 'openai_160', 'openai_161', 'openai_162', 'openai_163', 'openai_164', 'openai_165', 'openai_166', 'openai_167', 'openai_168', 'openai_169', 'openai_170', 'openai_171', 'openai_172', 'openai_173', 'openai_174', 'openai_175', 'openai_176', 'openai_177', 'openai_178', 'openai_179', 'openai_180', 'openai_181', 'openai_182', 'openai_183', 'openai_184', 'openai_185', 'openai_186', 'openai_187', 'openai_188', 'openai_189', 'openai_190', 'openai_191', 'openai_192', 'openai_193', 'openai_194', 'openai_195', 'openai_196', 'openai_197', 'openai_198', 'openai_199', 'openai_200', 'openai_201', 'openai_202', 'openai_203', 'openai_204', 'openai_205', 'openai_206', 'openai_207', 'openai_208', 'openai_209', 'openai_210', 'openai_211', 'openai_212', 'openai_213', 'openai_214', 
'openai_215', 'openai_216', 'openai_217', 'openai_218', 'openai_219', 'openai_220', 'openai_221', 'openai_222', 'openai_223', 'openai_224', 'openai_225', 'openai_226', 'openai_227', 'openai_228', 'openai_229', 'openai_230', 'openai_231', 'openai_232', 'openai_233', 'openai_234', 'openai_235', 'openai_236', 'openai_237', 'openai_238', 'openai_239', 'openai_240', 'openai_241', 'openai_242', 'openai_243', 'openai_244', 'openai_245', 'openai_246', 'openai_247', 'openai_248', 'openai_249', 'openai_250', 'openai_251', 'openai_252', 'openai_253', 'openai_254', 'openai_255', 'openai_256', 'openai_257', 'openai_258', 'openai_259', 'openai_260', 'openai_261', 'openai_262', 'openai_263', 'openai_264', 'openai_265', 'openai_266', 'openai_267', 'openai_268', 'openai_269', 'openai_270', 'openai_271', 'openai_272', 'openai_273', 'openai_274', 'openai_275', 'openai_276', 'openai_277', 'openai_278', 'openai_279', 'openai_280', 'openai_281', 'openai_282', 'openai_283', 'openai_284', 'openai_285', 'openai_286', 'openai_287', 'openai_288', 'openai_289', 'openai_290', 'openai_291', 'openai_292', 'openai_293', 'openai_294', 'openai_295', 'openai_296', 'openai_297', 'openai_298', 'openai_299', 'openai_300', 'openai_301', 'openai_302', 'openai_303', 'openai_304', 'openai_305', 'openai_306', 'openai_307', 'openai_308', 'openai_309', 'openai_310', 'openai_311', 'openai_312', 'openai_313', 'openai_314', 'openai_315', 'openai_316', 'openai_317', 'openai_318', 'openai_319', 'openai_320', 'openai_321', 'openai_322', 'openai_323', 'openai_324', 'openai_325', 'openai_326', 'openai_327', 'openai_328', 'openai_329', 'openai_330', 'openai_331', 'openai_332', 'openai_333', 'openai_334', 'openai_335', 'openai_336', 'openai_337', 'openai_338', 'openai_339', 'openai_340', 'openai_341', 'openai_342', 'openai_343', 'openai_344', 'openai_345', 'openai_346', 'openai_347', 'openai_348', 'openai_349', 'openai_350', 'openai_351', 'openai_352', 'openai_353', 'openai_354', 'openai_355', 'openai_356', 
'openai_357', 'openai_358', 'openai_359', 'openai_360', 'openai_361', 'openai_362', 'openai_363', 'openai_364', 'openai_365', 'openai_366', 'openai_367', 'openai_368', 'openai_369', 'openai_370', 'openai_371', 'openai_372', 'openai_373', 'openai_374', 'openai_375', 'openai_376', 'openai_377', 'openai_378', 'openai_379', 'openai_380', 'openai_381', 'openai_382', 'openai_383', 'openai_384', 'openai_385', 'openai_386', 'openai_387', 'openai_388', 'openai_389', 'openai_390', 'openai_391', 'openai_392', 'openai_393', 'openai_394', 'openai_395', 'openai_396', 'openai_397', 'openai_398', 'openai_399', 'openai_400', 'openai_401', 'openai_402', 'openai_403', 'openai_404', 'openai_405', 'openai_406', 'openai_407', 'openai_408', 'openai_409', 'openai_410', 'openai_411', 'openai_412', 'openai_413', 'openai_414', 'openai_415', 'openai_416', 'openai_417', 'openai_418', 'openai_419', 'openai_420', 'openai_421', 'openai_422', 'openai_423', 'openai_424', 'openai_425', 'openai_426', 'openai_427', 'openai_428', 'openai_429', 'openai_430', 'openai_431', 'openai_432', 'openai_433', 'openai_434', 'openai_435', 'openai_436', 'openai_437', 'openai_438', 'openai_439', 'openai_440', 'openai_441', 'openai_442', 'openai_443', 'openai_444', 'openai_445', 'openai_446', 'openai_447', 'openai_448', 'openai_449', 'openai_450', 'openai_451', 'openai_452', 'openai_453', 'openai_454', 'openai_455', 'openai_456', 'openai_457', 'openai_458', 'openai_459', 'openai_460', 'openai_461', 'openai_462', 'openai_463', 'openai_464', 'openai_465', 'openai_466', 'openai_467', 'openai_468', 'openai_469', 'openai_470', 'openai_471', 'openai_472', 'openai_473', 'openai_474', 'openai_475', 'openai_476', 'openai_477', 'openai_478', 'openai_479', 'openai_480', 'openai_481', 'openai_482', 'openai_483', 'openai_484', 'openai_485', 'openai_486', 'openai_487', 'openai_488', 'openai_489', 'openai_490', 'openai_491', 'openai_492', 'openai_493', 'openai_494', 'openai_495', 'openai_496', 'openai_497', 'openai_498', 
'openai_499', 'openai_500', 'openai_501', 'openai_502', 'openai_503', 'openai_504', 'openai_505', 'openai_506', 'openai_507', 'openai_508', 'openai_509', 'openai_510', 'openai_511', 'openai_512', 'openai_513', 'openai_514', 'openai_515', 'openai_516', 'openai_517', 'openai_518', 'openai_519', 'openai_520', 'openai_521', 'openai_522', 'openai_523', 'openai_524', 'openai_525', 'openai_526', 'openai_527', 'openai_528', 'openai_529', 'openai_530', 'openai_531', 'openai_532', 'openai_533', 'openai_534', 'openai_535', 'openai_536', 'openai_537', 'openai_538', 'openai_539', 'openai_540', 'openai_541', 'openai_542', 'openai_543', 'openai_544', 'openai_545', 'openai_546', 'openai_547', 'openai_548', 'openai_549', 'openai_550', 'openai_551', 'openai_552', 'openai_553', 'openai_554', 'openai_555', 'openai_556', 'openai_557', 'openai_558', 'openai_559', 'openai_560', 'openai_561', 'openai_562', 'openai_563', 'openai_564', 'openai_565', 'openai_566', 'openai_567', 'openai_568', 'openai_569', 'openai_570', 'openai_571', 'openai_572', 'openai_573', 'openai_574', 'openai_575', 'openai_576', 'openai_577', 'openai_578', 'openai_579', 'openai_580', 'openai_581', 'openai_582', 'openai_583', 'openai_584', 'openai_585', 'openai_586', 'openai_587', 'openai_588', 'openai_589', 'openai_590', 'openai_591', 'openai_592', 'openai_593', 'openai_594', 'openai_595', 'openai_596', 'openai_597', 'openai_598', 'openai_599', 'openai_600', 'openai_601', 'openai_602', 'openai_603', 'openai_604', 'openai_605', 'openai_606', 'openai_607', 'openai_608', 'openai_609', 'openai_610', 'openai_611', 'openai_612', 'openai_613', 'openai_614', 'openai_615', 'openai_616', 'openai_617', 'openai_618', 'openai_619', 'openai_620', 'openai_621', 'openai_622', 'openai_623', 'openai_624', 'openai_625', 'openai_626', 'openai_627', 'openai_628', 'openai_629', 'openai_630', 'openai_631', 'openai_632', 'openai_633', 'openai_634', 'openai_635', 'openai_636', 'openai_637', 'openai_638', 'openai_639', 'openai_640', 
'openai_641', 'openai_642', 'openai_643', 'openai_644', 'openai_645', 'openai_646', 'openai_647', 'openai_648', 'openai_649', 'openai_650', 'openai_651', 'openai_652', 'openai_653', 'openai_654', 'openai_655', 'openai_656', 'openai_657', 'openai_658', 'openai_659', 'openai_660', 'openai_661', 'openai_662', 'openai_663', 'openai_664', 'openai_665', 'openai_666', 'openai_667', 'openai_668', 'openai_669', 'openai_670', 'openai_671', 'openai_672', 'openai_673', 'openai_674', 'openai_675', 'openai_676', 'openai_677', 'openai_678', 'openai_679', 'openai_680', 'openai_681', 'openai_682', 'openai_683', 'openai_684', 'openai_685', 'openai_686', 'openai_687', 'openai_688', 'openai_689', 'openai_690', 'openai_691', 'openai_692', 'openai_693', 'openai_694', 'openai_695', 'openai_696', 'openai_697', 'openai_698', 'openai_699', 'openai_700', 'openai_701', 'openai_702', 'openai_703', 'openai_704', 'openai_705', 'openai_706', 'openai_707', 'openai_708', 'openai_709', 'openai_710', 'openai_711', 'openai_712', 'openai_713', 'openai_714', 'openai_715', 'openai_716', 'openai_717', 'openai_718', 'openai_719', 'openai_720', 'openai_721', 'openai_722', 'openai_723', 'openai_724', 'openai_725', 'openai_726', 'openai_727', 'openai_728', 'openai_729', 'openai_730', 'openai_731', 'openai_732', 'openai_733', 'openai_734', 'openai_735', 'openai_736', 'openai_737', 'openai_738', 'openai_739', 'openai_740', 'openai_741', 'openai_742', 'openai_743', 'openai_744', 'openai_745', 'openai_746', 'openai_747', 'openai_748', 'openai_749', 'openai_750', 'openai_751', 'openai_752', 'openai_753', 'openai_754', 'openai_755', 'openai_756', 'openai_757', 'openai_758', 'openai_759', 'openai_760', 'openai_761', 'openai_762', 'openai_763', 'openai_764', 'openai_765', 'openai_766', 'openai_767', 'openai_768', 'openai_769', 'openai_770', 'openai_771', 'openai_772', 'openai_773', 'openai_774', 'openai_775', 'openai_776', 'openai_777', 'openai_778', 'openai_779', 'openai_780', 'openai_781', 'openai_782', 
'openai_783', 'openai_784', 'openai_785', 'openai_786', 'openai_787', 'openai_788', 'openai_789', 'openai_790', 'openai_791', 'openai_792', 'openai_793', 'openai_794', 'openai_795', 'openai_796', 'openai_797', 'openai_798', 'openai_799', 'openai_800', 'openai_801', 'openai_802', 'openai_803', 'openai_804', 'openai_805', 'openai_806', 'openai_807', 'openai_808', 'openai_809', 'openai_810', 'openai_811', 'openai_812', 'openai_813', 'openai_814', 'openai_815', 'openai_816', 'openai_817', 'openai_818', 'openai_819', 'openai_820', 'openai_821', 'openai_822', 'openai_823', 'openai_824', 'openai_825', 'openai_826', 'openai_827', 'openai_828', 'openai_829', 'openai_830', 'openai_831', 'openai_832', 'openai_833', 'openai_834', 'openai_835', 'openai_836', 'openai_837', 'openai_838', 'openai_839', 'openai_840', 'openai_841', 'openai_842', 'openai_843', 'openai_844', 'openai_845', 'openai_846', 'openai_847', 'openai_848', 'openai_849', 'openai_850', 'openai_851', 'openai_852', 'openai_853', 'openai_854', 'openai_855', 'openai_856', 'openai_857', 'openai_858', 'openai_859', 'openai_860', 'openai_861', 'openai_862', 'openai_863', 'openai_864', 'openai_865', 'openai_866', 'openai_867', 'openai_868', 'openai_869', 'openai_870', 'openai_871', 'openai_872', 'openai_873', 'openai_874', 'openai_875', 'openai_876', 'openai_877', 'openai_878', 'openai_879', 'openai_880', 'openai_881', 'openai_882', 'openai_883', 'openai_884', 'openai_885', 'openai_886', 'openai_887', 'openai_888', 'openai_889', 'openai_890', 'openai_891', 'openai_892', 'openai_893', 'openai_894', 'openai_895', 'openai_896', 'openai_897', 'openai_898', 'openai_899', 'openai_900', 'openai_901', 'openai_902', 'openai_903', 'openai_904', 'openai_905', 'openai_906', 'openai_907', 'openai_908', 'openai_909', 'openai_910', 'openai_911', 'openai_912', 'openai_913', 'openai_914', 'openai_915', 'openai_916', 'openai_917', 'openai_918', 'openai_919', 'openai_920', 'openai_921', 'openai_922', 'openai_923', 'openai_924', 
'openai_925', 'openai_926', 'openai_927', 'openai_928', 'openai_929', 'openai_930', 'openai_931', 'openai_932', 'openai_933', 'openai_934', 'openai_935', 'openai_936', 'openai_937', 'openai_938', 'openai_939', 'openai_940', 'openai_941', 'openai_942', 'openai_943', 'openai_944', 'openai_945', 'openai_946', 'openai_947', 'openai_948', 'openai_949', 'openai_950', 'openai_951', 'openai_952', 'openai_953', 'openai_954', 'openai_955', 'openai_956', 'openai_957', 'openai_958', 'openai_959', 'openai_960', 'openai_961', 'openai_962', 'openai_963', 'openai_964', 'openai_965', 'openai_966', 'openai_967', 'openai_968', 'openai_969', 'openai_970', 'openai_971', 'openai_972', 'openai_973', 'openai_974', 'openai_975', 'openai_976', 'openai_977', 'openai_978', 'openai_979', 'openai_980', 'openai_981', 'openai_982', 'openai_983', 'openai_984', 'openai_985', 'openai_986', 'openai_987', 'openai_988', 'openai_989', 'openai_990', 'openai_991', 'openai_992', 'openai_993', 'openai_994', 'openai_995', 'openai_996', 'openai_997', 'openai_998', 'openai_999', 'openai_1000', 'openai_1001', 'openai_1002', 'openai_1003', 'openai_1004', 'openai_1005', 'openai_1006', 'openai_1007', 'openai_1008', 'openai_1009', 'openai_1010', 'openai_1011', 'openai_1012', 'openai_1013', 'openai_1014', 'openai_1015', 'openai_1016', 'openai_1017', 'openai_1018', 'openai_1019', 'openai_1020', 'openai_1021', 'openai_1022', 'openai_1023', 'openai_1024', 'openai_1025', 'openai_1026', 'openai_1027', 'openai_1028', 'openai_1029', 'openai_1030', 'openai_1031', 'openai_1032', 'openai_1033', 'openai_1034', 'openai_1035', 'openai_1036', 'openai_1037', 'openai_1038', 'openai_1039', 'openai_1040', 'openai_1041', 'openai_1042', 'openai_1043', 'openai_1044', 'openai_1045', 'openai_1046', 'openai_1047', 'openai_1048', 'openai_1049', 'openai_1050', 'openai_1051', 'openai_1052', 'openai_1053', 'openai_1054', 'openai_1055', 'openai_1056', 'openai_1057', 'openai_1058', 'openai_1059', 'openai_1060', 'openai_1061', 'openai_1062', 
'openai_1063', 'openai_1064', 'openai_1065', 'openai_1066', 'openai_1067', 'openai_1068', 'openai_1069', 'openai_1070', 'openai_1071', 'openai_1072', 'openai_1073', 'openai_1074', 'openai_1075', 'openai_1076', 'openai_1077', 'openai_1078', 'openai_1079', 'openai_1080', 'openai_1081', 'openai_1082', 'openai_1083', 'openai_1084', 'openai_1085', 'openai_1086', 'openai_1087', 'openai_1088', 'openai_1089', 'openai_1090', 'openai_1091', 'openai_1092', 'openai_1093', 'openai_1094', 'openai_1095', 'openai_1096', 'openai_1097', 'openai_1098', 'openai_1099', 'openai_1100', 'openai_1101', 'openai_1102', 'openai_1103', 'openai_1104', 'openai_1105', 'openai_1106', 'openai_1107', 'openai_1108', 'openai_1109', 'openai_1110', 'openai_1111', 'openai_1112', 'openai_1113', 'openai_1114', 'openai_1115', 'openai_1116', 'openai_1117', 'openai_1118', 'openai_1119', 'openai_1120', 'openai_1121', 'openai_1122', 'openai_1123', 'openai_1124', 'openai_1125', 'openai_1126', 'openai_1127', 'openai_1128', 'openai_1129', 'openai_1130', 'openai_1131', 'openai_1132', 'openai_1133', 'openai_1134', 'openai_1135', 'openai_1136', 'openai_1137', 'openai_1138', 'openai_1139', 'openai_1140', 'openai_1141', 'openai_1142', 'openai_1143', 'openai_1144', 'openai_1145', 'openai_1146', 'openai_1147', 'openai_1148', 'openai_1149', 'openai_1150', 'openai_1151', 'openai_1152', 'openai_1153', 'openai_1154', 'openai_1155', 'openai_1156', 'openai_1157', 'openai_1158', 'openai_1159', 'openai_1160', 'openai_1161', 'openai_1162', 'openai_1163', 'openai_1164', 'openai_1165', 'openai_1166', 'openai_1167', 'openai_1168', 'openai_1169', 'openai_1170', 'openai_1171', 'openai_1172', 'openai_1173', 'openai_1174', 'openai_1175', 'openai_1176', 'openai_1177', 'openai_1178', 'openai_1179', 'openai_1180', 'openai_1181', 'openai_1182', 'openai_1183', 'openai_1184', 'openai_1185', 'openai_1186', 'openai_1187', 'openai_1188', 'openai_1189', 'openai_1190', 'openai_1191', 'openai_1192', 'openai_1193', 'openai_1194', 'openai_1195', 
'openai_1196', 'openai_1197', 'openai_1198', 'openai_1199', 'openai_1200', 'openai_1201', 'openai_1202', 'openai_1203', 'openai_1204', 'openai_1205', 'openai_1206', 'openai_1207', 'openai_1208', 'openai_1209', 'openai_1210', 'openai_1211', 'openai_1212', 'openai_1213', 'openai_1214', 'openai_1215', 'openai_1216', 'openai_1217', 'openai_1218', 'openai_1219', 'openai_1220', 'openai_1221', 'openai_1222', 'openai_1223', 'openai_1224', 'openai_1225', 'openai_1226', 'openai_1227', 'openai_1228', 'openai_1229', 'openai_1230', 'openai_1231', 'openai_1232', 'openai_1233', 'openai_1234', 'openai_1235', 'openai_1236', 'openai_1237', 'openai_1238', 'openai_1239', 'openai_1240', 'openai_1241', 'openai_1242', 'openai_1243', 'openai_1244', 'openai_1245', 'openai_1246', 'openai_1247', 'openai_1248', 'openai_1249', 'openai_1250', 'openai_1251', 'openai_1252', 'openai_1253', 'openai_1254', 'openai_1255', 'openai_1256', 'openai_1257', 'openai_1258', 'openai_1259', 'openai_1260', 'openai_1261', 'openai_1262', 'openai_1263', 'openai_1264', 'openai_1265', 'openai_1266', 'openai_1267', 'openai_1268', 'openai_1269', 'openai_1270', 'openai_1271', 'openai_1272', 'openai_1273', 'openai_1274', 'openai_1275', 'openai_1276', 'openai_1277', 'openai_1278', 'openai_1279', 'openai_1280', 'openai_1281', 'openai_1282', 'openai_1283', 'openai_1284', 'openai_1285', 'openai_1286', 'openai_1287', 'openai_1288', 'openai_1289', 'openai_1290', 'openai_1291', 'openai_1292', 'openai_1293', 'openai_1294', 'openai_1295', 'openai_1296', 'openai_1297', 'openai_1298', 'openai_1299', 'openai_1300', 'openai_1301', 'openai_1302', 'openai_1303', 'openai_1304', 'openai_1305', 'openai_1306', 'openai_1307', 'openai_1308', 'openai_1309', 'openai_1310', 'openai_1311', 'openai_1312', 'openai_1313', 'openai_1314', 'openai_1315', 'openai_1316', 'openai_1317', 'openai_1318', 'openai_1319', 'openai_1320', 'openai_1321', 'openai_1322', 'openai_1323', 'openai_1324', 'openai_1325', 'openai_1326', 'openai_1327', 'openai_1328', 
'openai_1329', 'openai_1330', 'openai_1331', 'openai_1332', 'openai_1333', 'openai_1334', 'openai_1335', 'openai_1336', 'openai_1337', 'openai_1338', 'openai_1339', 'openai_1340', 'openai_1341', 'openai_1342', 'openai_1343', 'openai_1344', 'openai_1345', 'openai_1346', 'openai_1347', 'openai_1348', 'openai_1349', 'openai_1350', 'openai_1351', 'openai_1352', 'openai_1353', 'openai_1354', 'openai_1355', 'openai_1356', 'openai_1357', 'openai_1358', 'openai_1359', 'openai_1360', 'openai_1361', 'openai_1362', 'openai_1363', 'openai_1364', 'openai_1365', 'openai_1366', 'openai_1367', 'openai_1368', 'openai_1369', 'openai_1370', 'openai_1371', 'openai_1372', 'openai_1373', 'openai_1374', 'openai_1375', 'openai_1376', 'openai_1377', 'openai_1378', 'openai_1379', 'openai_1380', 'openai_1381', 'openai_1382', 'openai_1383', 'openai_1384', 'openai_1385', 'openai_1386', 'openai_1387', 'openai_1388', 'openai_1389', 'openai_1390', 'openai_1391', 'openai_1392', 'openai_1393', 'openai_1394', 'openai_1395', 'openai_1396', 'openai_1397', 'openai_1398', 'openai_1399', 'openai_1400', 'openai_1401', 'openai_1402', 'openai_1403', 'openai_1404', 'openai_1405', 'openai_1406', 'openai_1407', 'openai_1408', 'openai_1409', 'openai_1410', 'openai_1411', 'openai_1412', 'openai_1413', 'openai_1414', 'openai_1415', 'openai_1416', 'openai_1417', 'openai_1418', 'openai_1419', 'openai_1420', 'openai_1421', 'openai_1422', 'openai_1423', 'openai_1424', 'openai_1425', 'openai_1426', 'openai_1427', 'openai_1428', 'openai_1429', 'openai_1430', 'openai_1431', 'openai_1432', 'openai_1433', 'openai_1434', 'openai_1435', 'openai_1436', 'openai_1437', 'openai_1438', 'openai_1439', 'openai_1440', 'openai_1441', 'openai_1442', 'openai_1443', 'openai_1444', 'openai_1445', 'openai_1446', 'openai_1447', 'openai_1448', 'openai_1449', 'openai_1450', 'openai_1451', 'openai_1452', 'openai_1453', 'openai_1454', 'openai_1455', 'openai_1456', 'openai_1457', 'openai_1458', 'openai_1459', 'openai_1460', 'openai_1461', 
'openai_1462', 'openai_1463', 'openai_1464', 'openai_1465', 'openai_1466', 'openai_1467', 'openai_1468', 'openai_1469', 'openai_1470', 'openai_1471', 'openai_1472', 'openai_1473', 'openai_1474', 'openai_1475', 'openai_1476', 'openai_1477', 'openai_1478', 'openai_1479', 'openai_1480', 'openai_1481', 'openai_1482', 'openai_1483', 'openai_1484', 'openai_1485', 'openai_1486', 'openai_1487', 'openai_1488', 'openai_1489', 'openai_1490', 'openai_1491', 'openai_1492', 'openai_1493', 'openai_1494', 'openai_1495', 'openai_1496', 'openai_1497', 'openai_1498', 'openai_1499', 'openai_1500', 'openai_1501', 'openai_1502', 'openai_1503', 'openai_1504', 'openai_1505', 'openai_1506', 'openai_1507', 'openai_1508', 'openai_1509', 'openai_1510', 'openai_1511', 'openai_1512', 'openai_1513', 'openai_1514', 'openai_1515', 'openai_1516', 'openai_1517', 'openai_1518', 'openai_1519', 'openai_1520', 'openai_1521', 'openai_1522', 'openai_1523', 'openai_1524', 'openai_1525', 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529', 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533', 'openai_1534', 'openai_1535', 'opinion_label', 'bot_label', 'fourway_label', 'is_toxic', 'toxic_label', 'is_factual', 'factual_label', 'is_bom_overall', 'is_bom_astroturf', 'bom_overall_label', 'bom_astroturf_label', 'bom_overall_fourway_label', 'bom_astroturf_fourway_label']\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n", + "0 0 False 0.056113 1.983193 0.295 ... \n", + "1 0 False 0.456710 NaN 0.580 ... \n", + "2 0 False 0.069860 3.401786 0.970 ... \n", + "3 1 False 0.044264 2.304511 0.580 ... 
\n", + "4 0 False 0.049325 4.714286 0.355 ... \n", + "\n", + " is_toxic toxic_label is_factual factual_label is_bom_overall \\\n", + "0 0 Normal 0.0 Low Quality 0.0 \n", + "1 1 Toxic NaN NaN 0.0 \n", + "2 0 Normal 1.0 High Quality 1.0 \n", + "3 0 Normal 0.0 Low Quality 1.0 \n", + "4 0 Normal 1.0 High Quality 0.0 \n", + "\n", + " is_bom_astroturf bom_overall_label bom_astroturf_label \\\n", + "0 0.0 Human Human \n", + "1 1.0 Human Bot \n", + "2 1.0 Bot Bot \n", + "3 1.0 Bot Bot \n", + "4 0.0 Human Human \n", + "\n", + " bom_overall_fourway_label bom_astroturf_fourway_label \n", + "0 Anti-Trump Human Anti-Trump Human \n", + "1 Anti-Trump Human Anti-Trump Bot \n", + "2 Anti-Trump Bot Anti-Trump Bot \n", + "3 Pro-Trump Bot Pro-Trump Bot \n", + "4 Anti-Trump Human Anti-Trump Human \n", + "\n", + "[5 rows x 1560 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturf...is_toxictoxic_labelis_factualfactual_labelis_bom_overallis_bom_astroturfbom_overall_labelbom_astroturf_labelbom_overall_fourway_labelbom_astroturf_fourway_label
034204362162015-08-13555540True0False0.0561131.9831930.295...0Normal0.0Low Quality0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
11081219582010-01-2422False0False0.456710NaN0.580...1ToxicNaNNaN0.01.0HumanBotAnti-Trump HumanAnti-Trump Bot
230383086382015-02-23755665True0False0.0698603.4017860.970...0Normal1.0High Quality1.01.0BotBotAnti-Trump BotAnti-Trump Bot
33323965362011-07-09951951True1False0.0442642.3045110.580...0Normal0.0Low Quality1.01.0BotBotPro-Trump BotPro-Trump Bot
49550825224798085122018-01-21570533True0False0.0493254.7142860.355...0Normal1.0High Quality0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
\n", + "

5 rows × 1560 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "users_df" + } + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"is_factual\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CU_qpBVcjFD4", + "outputId": "59ade136-f5e2-42ff-e000-9339d7ff2e76" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.0 1696\n", + "1.0 1596\n", + "Name: is_factual, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"factual_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PYOPG5Hcgw3M", + "outputId": "1962ae13-31b3-4702-9348-94c7fd3e3e50" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Low Quality 1696\n", + "High Quality 1596\n", + "Name: factual_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"is_toxic\"].value_counts()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pbaa5rTJh5NY", + "outputId": "4ef18db3-5815-48ea-e5e9-884d81e1ba16" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 6132\n", + "1 1434\n", + "Name: is_toxic, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"toxic_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3diIW61Ng1Cy", + "outputId": "4c2f628b-ae92-442a-c90c-e924f69a8dfc" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + 
"text/plain": [ + "Normal 6132\n", + "Toxic 1434\n", + "Name: toxic_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"bot_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "T31nFSuniKdY", + "outputId": "435b9e13-4f3b-4416-8805-a62967bdf80a" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Human 4466\n", + "Bot 3100\n", + "Name: bot_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"opinion_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4QX5FgjMk3E0", + "outputId": "8692026e-ab73-421b-a96e-2f3476245f4f" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Anti-Trump 4891\n", + "Pro-Trump 2675\n", + "Name: opinion_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"fourway_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wKHKOfGplAv8", + "outputId": "c87bac82-4272-42bc-b7e6-f8e0984ad3a4" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Anti-Trump Human 3010\n", + "Anti-Trump Bot 1881\n", + "Pro-Trump Human 1456\n", + "Pro-Trump Bot 1219\n", + "Name: fourway_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Splitting" + ], + "metadata": { + "id": "s0VpVSEB81Tt" + } + }, + { + "cell_type": "code", + "source": [ + "users_df.index = users_df[\"user_id\"]" + ], + "metadata": { + "id": "sCgftz6i9MCT" + }, + "execution_count": 28, + 
"outputs": [] + }, + { + "cell_type": "code", + "source": [ + "embeddings_cols = [col for col in users_df.columns if \"openai\" in col]\n", + "print(len(embeddings_cols))\n", + "print(embeddings_cols[0], \"...\", embeddings_cols[-1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ChDT1h8R83Gz", + "outputId": "9e58ef40-f8d7-43f2-89f3-50b925b336ac" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1536\n", + "openai_0 ... openai_1535\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_x = users_df[embeddings_cols]\n", + "users_x.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "EigyhIpt9Kd2", + "outputId": "197ddc3c-523a-4c2e-f506-b8e41306a594" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " openai_0 openai_1 openai_2 openai_3 openai_4 \\\n", + "user_id \n", + "3420436216 -0.018802 -0.007904 0.013753 -0.000709 -0.013829 \n", + "108121958 -0.030552 -0.005330 0.014622 -0.015383 0.004529 \n", + "3038308638 -0.007297 0.001028 0.002483 -0.004990 -0.021728 \n", + "332396536 -0.018347 -0.007322 -0.009216 -0.023215 -0.007946 \n", + "955082522479808512 -0.024804 0.007517 0.008514 -0.022979 0.003899 \n", + "\n", + " openai_5 openai_6 openai_7 openai_8 openai_9 ... \\\n", + "user_id ... \n", + "3420436216 0.007897 0.018043 -0.015105 -0.006663 -0.000824 ... \n", + "108121958 0.022213 -0.004980 -0.008592 -0.004993 -0.007705 ... \n", + "3038308638 -0.003700 0.008174 0.004453 0.014321 -0.004018 ... \n", + "332396536 0.003921 0.023664 -0.009896 -0.001530 -0.010064 ... \n", + "955082522479808512 0.000756 0.024158 -0.016718 -0.003902 -0.008977 ... 
\n", + "\n", + " openai_1526 openai_1527 openai_1528 openai_1529 \\\n", + "user_id \n", + "3420436216 -0.001867 -0.013167 0.020885 -0.022568 \n", + "108121958 0.017651 -0.009439 0.024375 -0.032553 \n", + "3038308638 -0.026273 -0.008139 0.030285 -0.029902 \n", + "332396536 -0.005520 -0.005288 0.017071 -0.033637 \n", + "955082522479808512 0.009959 0.004695 0.005555 -0.012851 \n", + "\n", + " openai_1530 openai_1531 openai_1532 openai_1533 \\\n", + "user_id \n", + "3420436216 -0.033631 0.016153 0.024127 -0.017519 \n", + "108121958 -0.042185 0.013782 0.011320 -0.014862 \n", + "3038308638 -0.030887 0.022481 -0.005476 -0.016279 \n", + "332396536 -0.040202 0.041773 -0.009370 0.003352 \n", + "955082522479808512 -0.032229 0.031443 0.008163 -0.018501 \n", + "\n", + " openai_1534 openai_1535 \n", + "user_id \n", + "3420436216 0.002636 -0.039838 \n", + "108121958 -0.010413 -0.020359 \n", + "3038308638 -0.010138 -0.021454 \n", + "332396536 0.009391 -0.042671 \n", + "955082522479808512 -0.008724 -0.042027 \n", + "\n", + "[5 rows x 1536 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openai_0openai_1openai_2openai_3openai_4openai_5openai_6openai_7openai_8openai_9...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
user_id
3420436216-0.018802-0.0079040.013753-0.000709-0.0138290.0078970.018043-0.015105-0.006663-0.000824...-0.001867-0.0131670.020885-0.022568-0.0336310.0161530.024127-0.0175190.002636-0.039838
108121958-0.030552-0.0053300.014622-0.0153830.0045290.022213-0.004980-0.008592-0.004993-0.007705...0.017651-0.0094390.024375-0.032553-0.0421850.0137820.011320-0.014862-0.010413-0.020359
3038308638-0.0072970.0010280.002483-0.004990-0.021728-0.0037000.0081740.0044530.014321-0.004018...-0.026273-0.0081390.030285-0.029902-0.0308870.022481-0.005476-0.016279-0.010138-0.021454
332396536-0.018347-0.007322-0.009216-0.023215-0.0079460.0039210.023664-0.009896-0.001530-0.010064...-0.005520-0.0052880.017071-0.033637-0.0402020.041773-0.0093700.0033520.009391-0.042671
955082522479808512-0.0248040.0075170.008514-0.0229790.0038990.0007560.024158-0.016718-0.003902-0.008977...0.0099590.0046950.005555-0.012851-0.0322290.0314430.008163-0.018501-0.008724-0.042027
\n", + "

5 rows × 1536 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "users_x" + } + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "source": [ + "#user_labels = users_df.drop(columns=embeddings_cols)\n", + "#print(user_labels.columns.tolist())\n", + "#user_labels.head()" + ], + "metadata": { + "id": "g1VfnZf29x2a" + }, + "execution_count": 31, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### PCA 2" + ], + "metadata": { + "id": "R1ATEaSs8n7c" + } + }, + { + "cell_type": "code", + "source": [ + "# /usr/local/lib/python3.10/dist-packages/plotly/express/_core.py:1223:\n", + "# PerformanceWarning: DataFrame is highly fragmented.\n", + "# This is usually the result of calling `frame.insert` many times, which has poor performance.\n", + "# Consider joining all columns at once using pd.concat(axis=1) instead.\n", + "# To get a de-fragmented frame, use `newframe = frame.copy()`\n", + "# df_output[col_name] = to_unindexed_series(df_input[argument])\n", + "\n" + ], + "metadata": { + "id": "ct_LaFXK8zI8" + }, + "execution_count": 124, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "target = \"fourway_label\" #@param [\"bot_label\", \"opinion_label\", \"fourway_label\", \"toxic_label\", \"is_factual\"]\n", + "user_labels = users_df[target]\n", + "user_labels" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VhHH2e9H_rJr", + "outputId": "174c4540-1e42-4c74-935a-c60d76e5d25b" + }, + "execution_count": 32, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "user_id\n", + "3420436216 Anti-Trump Bot\n", + "108121958 Anti-Trump Human\n", + "3038308638 Anti-Trump Bot\n", + "332396536 Pro-Trump Bot\n", + "955082522479808512 Anti-Trump Bot\n", + " ... 
\n", + "1620694747 Anti-Trump Bot\n", + "1047878200406069248 Anti-Trump Bot\n", + "823502850336624640 Anti-Trump Bot\n", + "26966663 Anti-Trump Bot\n", + "884121768428003329 Anti-Trump Bot\n", + "Name: fourway_label, Length: 7566, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_pipeline = ReductionPipeline(x=users_x, labels=user_labels, target=target, n_components=2)\n", + "\n", + "users_pipeline.perform()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ogD2Z0g98njC", + "outputId": "e20eda04-f42f-4635-fc61-a47548dea7c8" + }, + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n", + "EXPLAINED VARIANCE: 0.11\n", + "LOADINGS (1536, 2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_pipeline.embeddings_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "gSHkA1gFAGXs", + "outputId": "0a050a7f-77be-49ad-88ba-736ad2bfc7e8" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " component_1 component_2\n", + "user_id \n", + "3420436216 -12.864011 -0.976722\n", + "108121958 7.074388 -5.434687\n", + "3038308638 -10.170547 -0.784753\n", + "332396536 -10.812691 5.623136\n", + "955082522479808512 -10.050573 -0.195144" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
component_1component_2
user_id
3420436216-12.864011-0.976722
1081219587.074388-5.434687
3038308638-10.170547-0.784753
332396536-10.8126915.623136
955082522479808512-10.050573-0.195144
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"users_pipeline\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"component_1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.150230389844141,\n \"min\": -12.864011415872412,\n \"max\": 7.07438804638496,\n \"samples\": [\n 7.07438804638496,\n -10.050572989026474,\n -10.17054718508993\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"component_2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.9413862514190274,\n \"min\": -5.434686671061165,\n \"max\": 5.623136416263134,\n \"samples\": [\n -5.434686671061165,\n -0.19514374095730225,\n -0.7847534857164845\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 34 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "sL3OppIhFDWi" + }, + "execution_count": 127, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# todo: re-implement colors map and category orders\n", + "#users_pipeline.plot_embeddings(fig_show=False, fig_save=False, height=350, )" + ], + "metadata": { + "id": "FP8XEgZO84nP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "color_map = COLORS_MAP[target]\n", + "category_orders = {target: CATEGORY_ORDERS[target]}\n", + "\n", + "users_pipeline.plot_embeddings(fig_show=False, fig_save=False, height=350,\n", + " color=target, color_map=color_map, category_orders=category_orders\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "8_NX8EocCgOe", + "outputId": "7826d1e0-cea9-4018-dcad-d05b86100f35" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "groupby_cols = [\n", + " \"bot_label\", \"opinion_label\", # \"bom_overall_label\", \"bom_astroturf_label\",\n", + " \"toxic_label\", \"factual_label\",\n", + " \"fourway_label\", #\"sixway_label\",\n", + "]\n", + "\n", + "for groupby_col in groupby_cols:\n", + " color_map = COLORS_MAP[groupby_col]\n", + " category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}\n", + "\n", + " labels = users_df[groupby_col]\n", + " pipeline = ReductionPipeline(x=users_x, labels=labels, target=groupby_col, n_components=2)\n", + "\n", + " results_dirpath = os.path.join(RESULTS_DIRPATH, \"openai_embeddings_v2\", \"text-embedding-ada-002\", f\"user_embeddings_{pipeline.reducer_type.lower()}_{pipeline.n_components}\", groupby_col)\n", + " os.makedirs(results_dirpath, exist_ok=True)\n", + "\n", + " pipeline.perform()\n", + "\n", + " pipeline.plot_embeddings(\n", + " color=groupby_col, color_map=color_map, category_orders=category_orders,\n", + " #hover_data=[\"user_id\", \"bot_label\"],\n", + " fig_show=True, fig_save=True,\n", + " results_dirpath=results_dirpath\n", + " )" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "aqVam9Mo84jM", + "outputId": "79596956-30d1-4863-e00f-3233772f4fd8" + }, + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n", + "EXPLAINED VARIANCE: 0.11\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n", + "EXPLAINED VARIANCE: 0.11\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n", + "EXPLAINED VARIANCE: 0.11\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n", + "EXPLAINED VARIANCE: 0.11\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.08211384 0.02338218]\n", + "EXPLAINED VARIANCE: 0.11\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4t48ewACjXQy" + }, + "source": [ + "## Tweet Embeddings (User Averages)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Embeddings for the ~183K statuses, averaged per user (see the prior notebook), resulting in 7,566 rows (one per user)." + ], + "metadata": { + "id": "5sJsvSTWCVVX" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading" + ], + "metadata": { + "id": "fjvJbg75dk5r" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz\")\n", + "averages_df = read_csv(csv_filepath)\n", + "print(averages_df.shape)\n", + "print(averages_df.columns)\n", + "averages_df.index = averages_df[\"user_id\"]\n", + "averages_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 426 + }, + "id": "wy-OIPg_eYX-", + "outputId": "e69cb40b-04c4-4666-ad4c-1adbd756f594" + }, + "execution_count": 47, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1548)\n", + "Index(['user_id', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',\n", + " 'openai_5', 'openai_6', 'openai_7', 'openai_8',\n", + " ...\n", + " 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community',\n", + " 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf',\n", + " 'bom_overall'],\n", + " dtype='object', length=1548)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 \\\n", + "user_id \n", + "2952 2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 \n", + "635553 635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 \n", + "656993 656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 \n", + "761154 761154 -0.021389 -0.004747 
0.006925 -0.017395 -0.011900 0.018309 \n", + "777554 777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 \n", + "\n", + " openai_6 openai_7 openai_8 ... created_on status_count \\\n", + "user_id ... \n", + "2952 0.000960 -0.015426 -0.006430 ... 2006-07-24 6 \n", + "635553 -0.011840 -0.010581 -0.010859 ... 2007-01-15 12 \n", + "656993 -0.015786 0.008556 -0.022145 ... 2007-01-17 1 \n", + "761154 -0.007047 -0.024175 0.001368 ... 2007-02-09 4 \n", + "777554 0.019815 -0.019171 -0.017594 ... 2007-02-17 1 \n", + "\n", + " rt_count is_bot opinion_community is_q avg_toxicity \\\n", + "user_id \n", + "2952 6 False 0 False 0.006899 \n", + "635553 12 False 0 False 0.077787 \n", + "656993 1 False 0 False 0.025031 \n", + "761154 0 False 0 False 0.172311 \n", + "777554 1 False 0 False 0.001660 \n", + "\n", + " avg_fact_score bom_astroturf bom_overall \n", + "user_id \n", + "2952 NaN 0.21 0.20 \n", + "635553 NaN 0.24 0.16 \n", + "656993 NaN 0.11 0.10 \n", + "761154 NaN 0.13 0.72 \n", + "777554 NaN 0.15 0.03 \n", + "\n", + "[5 rows x 1548 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idopenai_0openai_1openai_2openai_3openai_4openai_5openai_6openai_7openai_8...created_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturfbom_overall
user_id
29522952-0.0238160.0020040.004429-0.019361-0.0098600.0048780.000960-0.015426-0.006430...2006-07-2466False0False0.006899NaN0.210.20
635553635553-0.030022-0.0060630.017259-0.018501-0.0085360.004416-0.011840-0.010581-0.010859...2007-01-151212False0False0.077787NaN0.240.16
656993656993-0.0107230.0082350.004192-0.040441-0.0151720.012798-0.0157860.008556-0.022145...2007-01-1711False0False0.025031NaN0.110.10
761154761154-0.021389-0.0047470.006925-0.017395-0.0119000.018309-0.007047-0.0241750.001368...2007-02-0940False0False0.172311NaN0.130.72
777554777554-0.009369-0.0096120.0124700.005079-0.019303-0.0104590.019815-0.019171-0.017594...2007-02-1711False0False0.001660NaN0.150.03
\n", + "

5 rows × 1548 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "averages_df" + } + }, + "metadata": {}, + "execution_count": 47 + } + ] + }, + { + "cell_type": "code", + "source": [ + "averages_df[\"user_id\"].nunique()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NGVktpyCkgJM", + "outputId": "ee21c406-b820-4948-9bc2-f94dc2043d9c" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "7566" + ] + }, + "metadata": {}, + "execution_count": 38 + } + ] + }, + { + "cell_type": "code", + "source": [ + "len(averages_df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H3GDRXLees44", + "outputId": "2d44a93d-78d2-4a7b-8429-c0ac88c688e7" + }, + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "7566" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "source": [ + "averages_df = add_labels(averages_df)\n", + "print(averages_df.shape)\n", + "print(averages_df.columns.tolist())\n", + "averages_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 321 + }, + "id": "fCMNWudr6md7", + "outputId": "4e4c219b-76bd-40b7-d982-fa13fe36ca57" + }, + "execution_count": 48, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1561)\n", + "['user_id', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 
'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58', 'openai_59', 'openai_60', 'openai_61', 'openai_62', 'openai_63', 'openai_64', 'openai_65', 'openai_66', 'openai_67', 'openai_68', 'openai_69', 'openai_70', 'openai_71', 'openai_72', 'openai_73', 'openai_74', 'openai_75', 'openai_76', 'openai_77', 'openai_78', 'openai_79', 'openai_80', 'openai_81', 'openai_82', 'openai_83', 'openai_84', 'openai_85', 'openai_86', 'openai_87', 'openai_88', 'openai_89', 'openai_90', 'openai_91', 'openai_92', 'openai_93', 'openai_94', 'openai_95', 'openai_96', 'openai_97', 'openai_98', 'openai_99', 'openai_100', 'openai_101', 'openai_102', 'openai_103', 'openai_104', 'openai_105', 'openai_106', 'openai_107', 'openai_108', 'openai_109', 'openai_110', 'openai_111', 'openai_112', 'openai_113', 'openai_114', 'openai_115', 'openai_116', 'openai_117', 'openai_118', 'openai_119', 'openai_120', 'openai_121', 'openai_122', 'openai_123', 'openai_124', 'openai_125', 'openai_126', 'openai_127', 'openai_128', 'openai_129', 'openai_130', 'openai_131', 'openai_132', 'openai_133', 'openai_134', 'openai_135', 'openai_136', 'openai_137', 'openai_138', 'openai_139', 'openai_140', 'openai_141', 'openai_142', 'openai_143', 'openai_144', 'openai_145', 'openai_146', 'openai_147', 'openai_148', 'openai_149', 'openai_150', 'openai_151', 'openai_152', 'openai_153', 'openai_154', 'openai_155', 'openai_156', 'openai_157', 'openai_158', 'openai_159', 'openai_160', 'openai_161', 'openai_162', 'openai_163', 'openai_164', 'openai_165', 'openai_166', 'openai_167', 'openai_168', 'openai_169', 'openai_170', 'openai_171', 'openai_172', 'openai_173', 'openai_174', 'openai_175', 'openai_176', 'openai_177', 'openai_178', 'openai_179', 'openai_180', 'openai_181', 'openai_182', 'openai_183', 'openai_184', 
'openai_185', 'openai_186', 'openai_187', 'openai_188', 'openai_189', 'openai_190', 'openai_191', 'openai_192', 'openai_193', 'openai_194', 'openai_195', 'openai_196', 'openai_197', 'openai_198', 'openai_199', 'openai_200', 'openai_201', 'openai_202', 'openai_203', 'openai_204', 'openai_205', 'openai_206', 'openai_207', 'openai_208', 'openai_209', 'openai_210', 'openai_211', 'openai_212', 'openai_213', 'openai_214', 'openai_215', 'openai_216', 'openai_217', 'openai_218', 'openai_219', 'openai_220', 'openai_221', 'openai_222', 'openai_223', 'openai_224', 'openai_225', 'openai_226', 'openai_227', 'openai_228', 'openai_229', 'openai_230', 'openai_231', 'openai_232', 'openai_233', 'openai_234', 'openai_235', 'openai_236', 'openai_237', 'openai_238', 'openai_239', 'openai_240', 'openai_241', 'openai_242', 'openai_243', 'openai_244', 'openai_245', 'openai_246', 'openai_247', 'openai_248', 'openai_249', 'openai_250', 'openai_251', 'openai_252', 'openai_253', 'openai_254', 'openai_255', 'openai_256', 'openai_257', 'openai_258', 'openai_259', 'openai_260', 'openai_261', 'openai_262', 'openai_263', 'openai_264', 'openai_265', 'openai_266', 'openai_267', 'openai_268', 'openai_269', 'openai_270', 'openai_271', 'openai_272', 'openai_273', 'openai_274', 'openai_275', 'openai_276', 'openai_277', 'openai_278', 'openai_279', 'openai_280', 'openai_281', 'openai_282', 'openai_283', 'openai_284', 'openai_285', 'openai_286', 'openai_287', 'openai_288', 'openai_289', 'openai_290', 'openai_291', 'openai_292', 'openai_293', 'openai_294', 'openai_295', 'openai_296', 'openai_297', 'openai_298', 'openai_299', 'openai_300', 'openai_301', 'openai_302', 'openai_303', 'openai_304', 'openai_305', 'openai_306', 'openai_307', 'openai_308', 'openai_309', 'openai_310', 'openai_311', 'openai_312', 'openai_313', 'openai_314', 'openai_315', 'openai_316', 'openai_317', 'openai_318', 'openai_319', 'openai_320', 'openai_321', 'openai_322', 'openai_323', 'openai_324', 'openai_325', 'openai_326', 
'openai_327', 'openai_328', 'openai_329', 'openai_330', 'openai_331', 'openai_332', 'openai_333', 'openai_334', 'openai_335', 'openai_336', 'openai_337', 'openai_338', 'openai_339', 'openai_340', 'openai_341', 'openai_342', 'openai_343', 'openai_344', 'openai_345', 'openai_346', 'openai_347', 'openai_348', 'openai_349', 'openai_350', 'openai_351', 'openai_352', 'openai_353', 'openai_354', 'openai_355', 'openai_356', 'openai_357', 'openai_358', 'openai_359', 'openai_360', 'openai_361', 'openai_362', 'openai_363', 'openai_364', 'openai_365', 'openai_366', 'openai_367', 'openai_368', 'openai_369', 'openai_370', 'openai_371', 'openai_372', 'openai_373', 'openai_374', 'openai_375', 'openai_376', 'openai_377', 'openai_378', 'openai_379', 'openai_380', 'openai_381', 'openai_382', 'openai_383', 'openai_384', 'openai_385', 'openai_386', 'openai_387', 'openai_388', 'openai_389', 'openai_390', 'openai_391', 'openai_392', 'openai_393', 'openai_394', 'openai_395', 'openai_396', 'openai_397', 'openai_398', 'openai_399', 'openai_400', 'openai_401', 'openai_402', 'openai_403', 'openai_404', 'openai_405', 'openai_406', 'openai_407', 'openai_408', 'openai_409', 'openai_410', 'openai_411', 'openai_412', 'openai_413', 'openai_414', 'openai_415', 'openai_416', 'openai_417', 'openai_418', 'openai_419', 'openai_420', 'openai_421', 'openai_422', 'openai_423', 'openai_424', 'openai_425', 'openai_426', 'openai_427', 'openai_428', 'openai_429', 'openai_430', 'openai_431', 'openai_432', 'openai_433', 'openai_434', 'openai_435', 'openai_436', 'openai_437', 'openai_438', 'openai_439', 'openai_440', 'openai_441', 'openai_442', 'openai_443', 'openai_444', 'openai_445', 'openai_446', 'openai_447', 'openai_448', 'openai_449', 'openai_450', 'openai_451', 'openai_452', 'openai_453', 'openai_454', 'openai_455', 'openai_456', 'openai_457', 'openai_458', 'openai_459', 'openai_460', 'openai_461', 'openai_462', 'openai_463', 'openai_464', 'openai_465', 'openai_466', 'openai_467', 'openai_468', 
'openai_469', 'openai_470', 'openai_471', 'openai_472', 'openai_473', 'openai_474', 'openai_475', 'openai_476', 'openai_477', 'openai_478', 'openai_479', 'openai_480', 'openai_481', 'openai_482', 'openai_483', 'openai_484', 'openai_485', 'openai_486', 'openai_487', 'openai_488', 'openai_489', 'openai_490', 'openai_491', 'openai_492', 'openai_493', 'openai_494', 'openai_495', 'openai_496', 'openai_497', 'openai_498', 'openai_499', 'openai_500', 'openai_501', 'openai_502', 'openai_503', 'openai_504', 'openai_505', 'openai_506', 'openai_507', 'openai_508', 'openai_509', 'openai_510', 'openai_511', 'openai_512', 'openai_513', 'openai_514', 'openai_515', 'openai_516', 'openai_517', 'openai_518', 'openai_519', 'openai_520', 'openai_521', 'openai_522', 'openai_523', 'openai_524', 'openai_525', 'openai_526', 'openai_527', 'openai_528', 'openai_529', 'openai_530', 'openai_531', 'openai_532', 'openai_533', 'openai_534', 'openai_535', 'openai_536', 'openai_537', 'openai_538', 'openai_539', 'openai_540', 'openai_541', 'openai_542', 'openai_543', 'openai_544', 'openai_545', 'openai_546', 'openai_547', 'openai_548', 'openai_549', 'openai_550', 'openai_551', 'openai_552', 'openai_553', 'openai_554', 'openai_555', 'openai_556', 'openai_557', 'openai_558', 'openai_559', 'openai_560', 'openai_561', 'openai_562', 'openai_563', 'openai_564', 'openai_565', 'openai_566', 'openai_567', 'openai_568', 'openai_569', 'openai_570', 'openai_571', 'openai_572', 'openai_573', 'openai_574', 'openai_575', 'openai_576', 'openai_577', 'openai_578', 'openai_579', 'openai_580', 'openai_581', 'openai_582', 'openai_583', 'openai_584', 'openai_585', 'openai_586', 'openai_587', 'openai_588', 'openai_589', 'openai_590', 'openai_591', 'openai_592', 'openai_593', 'openai_594', 'openai_595', 'openai_596', 'openai_597', 'openai_598', 'openai_599', 'openai_600', 'openai_601', 'openai_602', 'openai_603', 'openai_604', 'openai_605', 'openai_606', 'openai_607', 'openai_608', 'openai_609', 'openai_610', 
'openai_611', 'openai_612', 'openai_613', 'openai_614', 'openai_615', 'openai_616', 'openai_617', 'openai_618', 'openai_619', 'openai_620', 'openai_621', 'openai_622', 'openai_623', 'openai_624', 'openai_625', 'openai_626', 'openai_627', 'openai_628', 'openai_629', 'openai_630', 'openai_631', 'openai_632', 'openai_633', 'openai_634', 'openai_635', 'openai_636', 'openai_637', 'openai_638', 'openai_639', 'openai_640', 'openai_641', 'openai_642', 'openai_643', 'openai_644', 'openai_645', 'openai_646', 'openai_647', 'openai_648', 'openai_649', 'openai_650', 'openai_651', 'openai_652', 'openai_653', 'openai_654', 'openai_655', 'openai_656', 'openai_657', 'openai_658', 'openai_659', 'openai_660', 'openai_661', 'openai_662', 'openai_663', 'openai_664', 'openai_665', 'openai_666', 'openai_667', 'openai_668', 'openai_669', 'openai_670', 'openai_671', 'openai_672', 'openai_673', 'openai_674', 'openai_675', 'openai_676', 'openai_677', 'openai_678', 'openai_679', 'openai_680', 'openai_681', 'openai_682', 'openai_683', 'openai_684', 'openai_685', 'openai_686', 'openai_687', 'openai_688', 'openai_689', 'openai_690', 'openai_691', 'openai_692', 'openai_693', 'openai_694', 'openai_695', 'openai_696', 'openai_697', 'openai_698', 'openai_699', 'openai_700', 'openai_701', 'openai_702', 'openai_703', 'openai_704', 'openai_705', 'openai_706', 'openai_707', 'openai_708', 'openai_709', 'openai_710', 'openai_711', 'openai_712', 'openai_713', 'openai_714', 'openai_715', 'openai_716', 'openai_717', 'openai_718', 'openai_719', 'openai_720', 'openai_721', 'openai_722', 'openai_723', 'openai_724', 'openai_725', 'openai_726', 'openai_727', 'openai_728', 'openai_729', 'openai_730', 'openai_731', 'openai_732', 'openai_733', 'openai_734', 'openai_735', 'openai_736', 'openai_737', 'openai_738', 'openai_739', 'openai_740', 'openai_741', 'openai_742', 'openai_743', 'openai_744', 'openai_745', 'openai_746', 'openai_747', 'openai_748', 'openai_749', 'openai_750', 'openai_751', 'openai_752', 
'openai_753', 'openai_754', 'openai_755', 'openai_756', 'openai_757', 'openai_758', 'openai_759', 'openai_760', 'openai_761', 'openai_762', 'openai_763', 'openai_764', 'openai_765', 'openai_766', 'openai_767', 'openai_768', 'openai_769', 'openai_770', 'openai_771', 'openai_772', 'openai_773', 'openai_774', 'openai_775', 'openai_776', 'openai_777', 'openai_778', 'openai_779', 'openai_780', 'openai_781', 'openai_782', 'openai_783', 'openai_784', 'openai_785', 'openai_786', 'openai_787', 'openai_788', 'openai_789', 'openai_790', 'openai_791', 'openai_792', 'openai_793', 'openai_794', 'openai_795', 'openai_796', 'openai_797', 'openai_798', 'openai_799', 'openai_800', 'openai_801', 'openai_802', 'openai_803', 'openai_804', 'openai_805', 'openai_806', 'openai_807', 'openai_808', 'openai_809', 'openai_810', 'openai_811', 'openai_812', 'openai_813', 'openai_814', 'openai_815', 'openai_816', 'openai_817', 'openai_818', 'openai_819', 'openai_820', 'openai_821', 'openai_822', 'openai_823', 'openai_824', 'openai_825', 'openai_826', 'openai_827', 'openai_828', 'openai_829', 'openai_830', 'openai_831', 'openai_832', 'openai_833', 'openai_834', 'openai_835', 'openai_836', 'openai_837', 'openai_838', 'openai_839', 'openai_840', 'openai_841', 'openai_842', 'openai_843', 'openai_844', 'openai_845', 'openai_846', 'openai_847', 'openai_848', 'openai_849', 'openai_850', 'openai_851', 'openai_852', 'openai_853', 'openai_854', 'openai_855', 'openai_856', 'openai_857', 'openai_858', 'openai_859', 'openai_860', 'openai_861', 'openai_862', 'openai_863', 'openai_864', 'openai_865', 'openai_866', 'openai_867', 'openai_868', 'openai_869', 'openai_870', 'openai_871', 'openai_872', 'openai_873', 'openai_874', 'openai_875', 'openai_876', 'openai_877', 'openai_878', 'openai_879', 'openai_880', 'openai_881', 'openai_882', 'openai_883', 'openai_884', 'openai_885', 'openai_886', 'openai_887', 'openai_888', 'openai_889', 'openai_890', 'openai_891', 'openai_892', 'openai_893', 'openai_894', 
'openai_895', 'openai_896', 'openai_897', 'openai_898', 'openai_899', 'openai_900', 'openai_901', 'openai_902', 'openai_903', 'openai_904', 'openai_905', 'openai_906', 'openai_907', 'openai_908', 'openai_909', 'openai_910', 'openai_911', 'openai_912', 'openai_913', 'openai_914', 'openai_915', 'openai_916', 'openai_917', 'openai_918', 'openai_919', 'openai_920', 'openai_921', 'openai_922', 'openai_923', 'openai_924', 'openai_925', 'openai_926', 'openai_927', 'openai_928', 'openai_929', 'openai_930', 'openai_931', 'openai_932', 'openai_933', 'openai_934', 'openai_935', 'openai_936', 'openai_937', 'openai_938', 'openai_939', 'openai_940', 'openai_941', 'openai_942', 'openai_943', 'openai_944', 'openai_945', 'openai_946', 'openai_947', 'openai_948', 'openai_949', 'openai_950', 'openai_951', 'openai_952', 'openai_953', 'openai_954', 'openai_955', 'openai_956', 'openai_957', 'openai_958', 'openai_959', 'openai_960', 'openai_961', 'openai_962', 'openai_963', 'openai_964', 'openai_965', 'openai_966', 'openai_967', 'openai_968', 'openai_969', 'openai_970', 'openai_971', 'openai_972', 'openai_973', 'openai_974', 'openai_975', 'openai_976', 'openai_977', 'openai_978', 'openai_979', 'openai_980', 'openai_981', 'openai_982', 'openai_983', 'openai_984', 'openai_985', 'openai_986', 'openai_987', 'openai_988', 'openai_989', 'openai_990', 'openai_991', 'openai_992', 'openai_993', 'openai_994', 'openai_995', 'openai_996', 'openai_997', 'openai_998', 'openai_999', 'openai_1000', 'openai_1001', 'openai_1002', 'openai_1003', 'openai_1004', 'openai_1005', 'openai_1006', 'openai_1007', 'openai_1008', 'openai_1009', 'openai_1010', 'openai_1011', 'openai_1012', 'openai_1013', 'openai_1014', 'openai_1015', 'openai_1016', 'openai_1017', 'openai_1018', 'openai_1019', 'openai_1020', 'openai_1021', 'openai_1022', 'openai_1023', 'openai_1024', 'openai_1025', 'openai_1026', 'openai_1027', 'openai_1028', 'openai_1029', 'openai_1030', 'openai_1031', 'openai_1032', 'openai_1033', 'openai_1034', 
'openai_1035', 'openai_1036', 'openai_1037', 'openai_1038', 'openai_1039', 'openai_1040', 'openai_1041', 'openai_1042', 'openai_1043', 'openai_1044', 'openai_1045', 'openai_1046', 'openai_1047', 'openai_1048', 'openai_1049', 'openai_1050', 'openai_1051', 'openai_1052', 'openai_1053', 'openai_1054', 'openai_1055', 'openai_1056', 'openai_1057', 'openai_1058', 'openai_1059', 'openai_1060', 'openai_1061', 'openai_1062', 'openai_1063', 'openai_1064', 'openai_1065', 'openai_1066', 'openai_1067', 'openai_1068', 'openai_1069', 'openai_1070', 'openai_1071', 'openai_1072', 'openai_1073', 'openai_1074', 'openai_1075', 'openai_1076', 'openai_1077', 'openai_1078', 'openai_1079', 'openai_1080', 'openai_1081', 'openai_1082', 'openai_1083', 'openai_1084', 'openai_1085', 'openai_1086', 'openai_1087', 'openai_1088', 'openai_1089', 'openai_1090', 'openai_1091', 'openai_1092', 'openai_1093', 'openai_1094', 'openai_1095', 'openai_1096', 'openai_1097', 'openai_1098', 'openai_1099', 'openai_1100', 'openai_1101', 'openai_1102', 'openai_1103', 'openai_1104', 'openai_1105', 'openai_1106', 'openai_1107', 'openai_1108', 'openai_1109', 'openai_1110', 'openai_1111', 'openai_1112', 'openai_1113', 'openai_1114', 'openai_1115', 'openai_1116', 'openai_1117', 'openai_1118', 'openai_1119', 'openai_1120', 'openai_1121', 'openai_1122', 'openai_1123', 'openai_1124', 'openai_1125', 'openai_1126', 'openai_1127', 'openai_1128', 'openai_1129', 'openai_1130', 'openai_1131', 'openai_1132', 'openai_1133', 'openai_1134', 'openai_1135', 'openai_1136', 'openai_1137', 'openai_1138', 'openai_1139', 'openai_1140', 'openai_1141', 'openai_1142', 'openai_1143', 'openai_1144', 'openai_1145', 'openai_1146', 'openai_1147', 'openai_1148', 'openai_1149', 'openai_1150', 'openai_1151', 'openai_1152', 'openai_1153', 'openai_1154', 'openai_1155', 'openai_1156', 'openai_1157', 'openai_1158', 'openai_1159', 'openai_1160', 'openai_1161', 'openai_1162', 'openai_1163', 'openai_1164', 'openai_1165', 'openai_1166', 'openai_1167', 
'openai_1168', 'openai_1169', 'openai_1170', 'openai_1171', 'openai_1172', 'openai_1173', 'openai_1174', 'openai_1175', 'openai_1176', 'openai_1177', 'openai_1178', 'openai_1179', 'openai_1180', 'openai_1181', 'openai_1182', 'openai_1183', 'openai_1184', 'openai_1185', 'openai_1186', 'openai_1187', 'openai_1188', 'openai_1189', 'openai_1190', 'openai_1191', 'openai_1192', 'openai_1193', 'openai_1194', 'openai_1195', 'openai_1196', 'openai_1197', 'openai_1198', 'openai_1199', 'openai_1200', 'openai_1201', 'openai_1202', 'openai_1203', 'openai_1204', 'openai_1205', 'openai_1206', 'openai_1207', 'openai_1208', 'openai_1209', 'openai_1210', 'openai_1211', 'openai_1212', 'openai_1213', 'openai_1214', 'openai_1215', 'openai_1216', 'openai_1217', 'openai_1218', 'openai_1219', 'openai_1220', 'openai_1221', 'openai_1222', 'openai_1223', 'openai_1224', 'openai_1225', 'openai_1226', 'openai_1227', 'openai_1228', 'openai_1229', 'openai_1230', 'openai_1231', 'openai_1232', 'openai_1233', 'openai_1234', 'openai_1235', 'openai_1236', 'openai_1237', 'openai_1238', 'openai_1239', 'openai_1240', 'openai_1241', 'openai_1242', 'openai_1243', 'openai_1244', 'openai_1245', 'openai_1246', 'openai_1247', 'openai_1248', 'openai_1249', 'openai_1250', 'openai_1251', 'openai_1252', 'openai_1253', 'openai_1254', 'openai_1255', 'openai_1256', 'openai_1257', 'openai_1258', 'openai_1259', 'openai_1260', 'openai_1261', 'openai_1262', 'openai_1263', 'openai_1264', 'openai_1265', 'openai_1266', 'openai_1267', 'openai_1268', 'openai_1269', 'openai_1270', 'openai_1271', 'openai_1272', 'openai_1273', 'openai_1274', 'openai_1275', 'openai_1276', 'openai_1277', 'openai_1278', 'openai_1279', 'openai_1280', 'openai_1281', 'openai_1282', 'openai_1283', 'openai_1284', 'openai_1285', 'openai_1286', 'openai_1287', 'openai_1288', 'openai_1289', 'openai_1290', 'openai_1291', 'openai_1292', 'openai_1293', 'openai_1294', 'openai_1295', 'openai_1296', 'openai_1297', 'openai_1298', 'openai_1299', 'openai_1300', 
'openai_1301', 'openai_1302', 'openai_1303', 'openai_1304', 'openai_1305', 'openai_1306', 'openai_1307', 'openai_1308', 'openai_1309', 'openai_1310', 'openai_1311', 'openai_1312', 'openai_1313', 'openai_1314', 'openai_1315', 'openai_1316', 'openai_1317', 'openai_1318', 'openai_1319', 'openai_1320', 'openai_1321', 'openai_1322', 'openai_1323', 'openai_1324', 'openai_1325', 'openai_1326', 'openai_1327', 'openai_1328', 'openai_1329', 'openai_1330', 'openai_1331', 'openai_1332', 'openai_1333', 'openai_1334', 'openai_1335', 'openai_1336', 'openai_1337', 'openai_1338', 'openai_1339', 'openai_1340', 'openai_1341', 'openai_1342', 'openai_1343', 'openai_1344', 'openai_1345', 'openai_1346', 'openai_1347', 'openai_1348', 'openai_1349', 'openai_1350', 'openai_1351', 'openai_1352', 'openai_1353', 'openai_1354', 'openai_1355', 'openai_1356', 'openai_1357', 'openai_1358', 'openai_1359', 'openai_1360', 'openai_1361', 'openai_1362', 'openai_1363', 'openai_1364', 'openai_1365', 'openai_1366', 'openai_1367', 'openai_1368', 'openai_1369', 'openai_1370', 'openai_1371', 'openai_1372', 'openai_1373', 'openai_1374', 'openai_1375', 'openai_1376', 'openai_1377', 'openai_1378', 'openai_1379', 'openai_1380', 'openai_1381', 'openai_1382', 'openai_1383', 'openai_1384', 'openai_1385', 'openai_1386', 'openai_1387', 'openai_1388', 'openai_1389', 'openai_1390', 'openai_1391', 'openai_1392', 'openai_1393', 'openai_1394', 'openai_1395', 'openai_1396', 'openai_1397', 'openai_1398', 'openai_1399', 'openai_1400', 'openai_1401', 'openai_1402', 'openai_1403', 'openai_1404', 'openai_1405', 'openai_1406', 'openai_1407', 'openai_1408', 'openai_1409', 'openai_1410', 'openai_1411', 'openai_1412', 'openai_1413', 'openai_1414', 'openai_1415', 'openai_1416', 'openai_1417', 'openai_1418', 'openai_1419', 'openai_1420', 'openai_1421', 'openai_1422', 'openai_1423', 'openai_1424', 'openai_1425', 'openai_1426', 'openai_1427', 'openai_1428', 'openai_1429', 'openai_1430', 'openai_1431', 'openai_1432', 'openai_1433', 
'openai_1434', 'openai_1435', 'openai_1436', 'openai_1437', 'openai_1438', 'openai_1439', 'openai_1440', 'openai_1441', 'openai_1442', 'openai_1443', 'openai_1444', 'openai_1445', 'openai_1446', 'openai_1447', 'openai_1448', 'openai_1449', 'openai_1450', 'openai_1451', 'openai_1452', 'openai_1453', 'openai_1454', 'openai_1455', 'openai_1456', 'openai_1457', 'openai_1458', 'openai_1459', 'openai_1460', 'openai_1461', 'openai_1462', 'openai_1463', 'openai_1464', 'openai_1465', 'openai_1466', 'openai_1467', 'openai_1468', 'openai_1469', 'openai_1470', 'openai_1471', 'openai_1472', 'openai_1473', 'openai_1474', 'openai_1475', 'openai_1476', 'openai_1477', 'openai_1478', 'openai_1479', 'openai_1480', 'openai_1481', 'openai_1482', 'openai_1483', 'openai_1484', 'openai_1485', 'openai_1486', 'openai_1487', 'openai_1488', 'openai_1489', 'openai_1490', 'openai_1491', 'openai_1492', 'openai_1493', 'openai_1494', 'openai_1495', 'openai_1496', 'openai_1497', 'openai_1498', 'openai_1499', 'openai_1500', 'openai_1501', 'openai_1502', 'openai_1503', 'openai_1504', 'openai_1505', 'openai_1506', 'openai_1507', 'openai_1508', 'openai_1509', 'openai_1510', 'openai_1511', 'openai_1512', 'openai_1513', 'openai_1514', 'openai_1515', 'openai_1516', 'openai_1517', 'openai_1518', 'openai_1519', 'openai_1520', 'openai_1521', 'openai_1522', 'openai_1523', 'openai_1524', 'openai_1525', 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529', 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533', 'openai_1534', 'openai_1535', 'user_id.1', 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf', 'bom_overall', 'opinion_label', 'bot_label', 'fourway_label', 'is_toxic', 'toxic_label', 'is_factual', 'factual_label', 'is_bom_overall', 'is_bom_astroturf', 'bom_overall_label', 'bom_astroturf_label', 'bom_overall_fourway_label', 'bom_astroturf_fourway_label']\n" + ] + }, + { + "output_type": "execute_result", + "data": { 
+ "text/plain": [ + " user_id openai_0 openai_1 openai_2 openai_3 openai_4 openai_5 \\\n", + "user_id \n", + "2952 2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 0.004878 \n", + "635553 635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 0.004416 \n", + "656993 656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 0.012798 \n", + "761154 761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 0.018309 \n", + "777554 777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 -0.010459 \n", + "\n", + " openai_6 openai_7 openai_8 ... is_toxic toxic_label is_factual \\\n", + "user_id ... \n", + "2952 0.000960 -0.015426 -0.006430 ... 0 Normal NaN \n", + "635553 -0.011840 -0.010581 -0.010859 ... 0 Normal NaN \n", + "656993 -0.015786 0.008556 -0.022145 ... 0 Normal NaN \n", + "761154 -0.007047 -0.024175 0.001368 ... 1 Toxic NaN \n", + "777554 0.019815 -0.019171 -0.017594 ... 0 Normal NaN \n", + "\n", + " factual_label is_bom_overall is_bom_astroturf bom_overall_label \\\n", + "user_id \n", + "2952 NaN 0.0 0.0 Human \n", + "635553 NaN 0.0 0.0 Human \n", + "656993 NaN 0.0 0.0 Human \n", + "761154 NaN 1.0 0.0 Bot \n", + "777554 NaN 0.0 0.0 Human \n", + "\n", + " bom_astroturf_label bom_overall_fourway_label \\\n", + "user_id \n", + "2952 Human Anti-Trump Human \n", + "635553 Human Anti-Trump Human \n", + "656993 Human Anti-Trump Human \n", + "761154 Human Anti-Trump Bot \n", + "777554 Human Anti-Trump Human \n", + "\n", + " bom_astroturf_fourway_label \n", + "user_id \n", + "2952 Anti-Trump Human \n", + "635553 Anti-Trump Human \n", + "656993 Anti-Trump Human \n", + "761154 Anti-Trump Human \n", + "777554 Anti-Trump Human \n", + "\n", + "[5 rows x 1561 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idopenai_0openai_1openai_2openai_3openai_4openai_5openai_6openai_7openai_8...is_toxictoxic_labelis_factualfactual_labelis_bom_overallis_bom_astroturfbom_overall_labelbom_astroturf_labelbom_overall_fourway_labelbom_astroturf_fourway_label
user_id
29522952-0.0238160.0020040.004429-0.019361-0.0098600.0048780.000960-0.015426-0.006430...0NormalNaNNaN0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
635553635553-0.030022-0.0060630.017259-0.018501-0.0085360.004416-0.011840-0.010581-0.010859...0NormalNaNNaN0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
656993656993-0.0107230.0082350.004192-0.040441-0.0151720.012798-0.0157860.008556-0.022145...0NormalNaNNaN0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
761154761154-0.021389-0.0047470.006925-0.017395-0.0119000.018309-0.007047-0.0241750.001368...1ToxicNaNNaN1.00.0BotHumanAnti-Trump BotAnti-Trump Human
777554777554-0.009369-0.0096120.0124700.005079-0.019303-0.0104590.019815-0.019171-0.017594...0NormalNaNNaN0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
\n", + "

5 rows × 1561 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "averages_df" + } + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Splitting" + ], + "metadata": { + "id": "slNi31xDeQYI" + } + }, + { + "cell_type": "code", + "source": [ + "averages_x = averages_df[embeddings_cols]\n", + "averages_x" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 475 + }, + "id": "swEMBoBSeP4O", + "outputId": "4a1baa37-0684-43bb-8687-3d6742aedb67" + }, + "execution_count": 49, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " openai_0 openai_1 openai_2 openai_3 openai_4 \\\n", + "user_id \n", + "2952 -0.023816 0.002004 0.004429 -0.019361 -0.009860 \n", + "635553 -0.030022 -0.006063 0.017259 -0.018501 -0.008536 \n", + "656993 -0.010723 0.008235 0.004192 -0.040441 -0.015172 \n", + "761154 -0.021389 -0.004747 0.006925 -0.017395 -0.011900 \n", + "777554 -0.009369 -0.009612 0.012470 0.005079 -0.019303 \n", + "... ... ... ... ... ... \n", + "1234200349600288772 -0.024670 -0.007194 0.012253 -0.015047 -0.003037 \n", + "1234846911028453376 0.002930 -0.013252 -0.010225 -0.034511 -0.022378 \n", + "1237940420136456192 -0.013656 -0.002694 0.007115 -0.019419 -0.001060 \n", + "1238854780191195136 -0.041529 -0.024860 0.005283 -0.026827 0.005801 \n", + "1240138605726760962 -0.067218 0.000135 -0.009630 -0.002240 0.013352 \n", + "\n", + " openai_5 openai_6 openai_7 openai_8 openai_9 ... \\\n", + "user_id ... \n", + "2952 0.004878 0.000960 -0.015426 -0.006430 0.001027 ... \n", + "635553 0.004416 -0.011840 -0.010581 -0.010859 -0.003771 ... \n", + "656993 0.012798 -0.015786 0.008556 -0.022145 -0.017026 ... \n", + "761154 0.018309 -0.007047 -0.024175 0.001368 0.002065 ... \n", + "777554 -0.010459 0.019815 -0.019171 -0.017594 -0.006209 ... \n", + "... ... ... ... ... ... ... 
\n", + "1234200349600288772 0.004080 -0.004318 -0.005398 0.000077 -0.006043 ... \n", + "1234846911028453376 0.011991 0.008446 0.007933 0.007091 -0.018994 ... \n", + "1237940420136456192 0.007154 -0.000715 -0.013499 0.001435 -0.011751 ... \n", + "1238854780191195136 0.000377 0.011535 -0.013447 0.002424 0.010552 ... \n", + "1240138605726760962 0.007857 0.011120 -0.011868 0.010417 0.003038 ... \n", + "\n", + " openai_1526 openai_1527 openai_1528 openai_1529 \\\n", + "user_id \n", + "2952 -0.012285 0.001094 0.015767 -0.026536 \n", + "635553 -0.005960 -0.007866 0.010948 -0.021376 \n", + "656993 -0.018110 0.007116 -0.004877 -0.032427 \n", + "761154 0.013326 -0.020819 0.007364 -0.016794 \n", + "777554 0.007358 0.010696 0.008784 -0.024808 \n", + "... ... ... ... ... \n", + "1234200349600288772 0.001035 -0.011842 0.022085 -0.027978 \n", + "1234846911028453376 0.009281 -0.010663 0.016082 -0.026220 \n", + "1237940420136456192 0.008012 0.011208 0.025522 -0.025476 \n", + "1238854780191195136 0.009792 -0.014946 0.028075 -0.031737 \n", + "1240138605726760962 0.005619 -0.001152 0.008422 -0.029487 \n", + "\n", + " openai_1530 openai_1531 openai_1532 openai_1533 \\\n", + "user_id \n", + "2952 -0.024981 0.015113 0.018588 -0.002324 \n", + "635553 -0.023424 0.020705 0.005084 -0.011961 \n", + "656993 -0.023885 -0.000715 0.003886 -0.024242 \n", + "761154 -0.049548 0.013037 0.024798 -0.008543 \n", + "777554 -0.008042 0.011077 0.001996 -0.001104 \n", + "... ... ... ... ... 
\n", + "1234200349600288772 -0.030627 0.017037 -0.001254 -0.012667 \n", + "1234846911028453376 0.000994 0.016662 0.005803 0.005945 \n", + "1237940420136456192 -0.034344 0.023171 -0.007861 -0.006911 \n", + "1238854780191195136 -0.047090 0.030326 -0.023545 -0.014824 \n", + "1240138605726760962 -0.019286 0.030090 -0.000010 -0.018275 \n", + "\n", + " openai_1534 openai_1535 \n", + "user_id \n", + "2952 -0.003782 -0.028532 \n", + "635553 -0.003258 -0.026262 \n", + "656993 0.003839 -0.048883 \n", + "761154 0.006142 -0.035867 \n", + "777554 -0.019460 -0.030301 \n", + "... ... ... \n", + "1234200349600288772 -0.002032 -0.026470 \n", + "1234846911028453376 0.001228 -0.041925 \n", + "1237940420136456192 -0.005543 -0.026032 \n", + "1238854780191195136 0.003257 -0.022161 \n", + "1240138605726760962 -0.017999 -0.033294 \n", + "\n", + "[7566 rows x 1536 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openai_0openai_1openai_2openai_3openai_4openai_5openai_6openai_7openai_8openai_9...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
user_id
2952-0.0238160.0020040.004429-0.019361-0.0098600.0048780.000960-0.015426-0.0064300.001027...-0.0122850.0010940.015767-0.026536-0.0249810.0151130.018588-0.002324-0.003782-0.028532
635553-0.030022-0.0060630.017259-0.018501-0.0085360.004416-0.011840-0.010581-0.010859-0.003771...-0.005960-0.0078660.010948-0.021376-0.0234240.0207050.005084-0.011961-0.003258-0.026262
656993-0.0107230.0082350.004192-0.040441-0.0151720.012798-0.0157860.008556-0.022145-0.017026...-0.0181100.007116-0.004877-0.032427-0.023885-0.0007150.003886-0.0242420.003839-0.048883
761154-0.021389-0.0047470.006925-0.017395-0.0119000.018309-0.007047-0.0241750.0013680.002065...0.013326-0.0208190.007364-0.016794-0.0495480.0130370.024798-0.0085430.006142-0.035867
777554-0.009369-0.0096120.0124700.005079-0.019303-0.0104590.019815-0.019171-0.017594-0.006209...0.0073580.0106960.008784-0.024808-0.0080420.0110770.001996-0.001104-0.019460-0.030301
..................................................................
1234200349600288772-0.024670-0.0071940.012253-0.015047-0.0030370.004080-0.004318-0.0053980.000077-0.006043...0.001035-0.0118420.022085-0.027978-0.0306270.017037-0.001254-0.012667-0.002032-0.026470
12348469110284533760.002930-0.013252-0.010225-0.034511-0.0223780.0119910.0084460.0079330.007091-0.018994...0.009281-0.0106630.016082-0.0262200.0009940.0166620.0058030.0059450.001228-0.041925
1237940420136456192-0.013656-0.0026940.007115-0.019419-0.0010600.007154-0.000715-0.0134990.001435-0.011751...0.0080120.0112080.025522-0.025476-0.0343440.023171-0.007861-0.006911-0.005543-0.026032
1238854780191195136-0.041529-0.0248600.005283-0.0268270.0058010.0003770.011535-0.0134470.0024240.010552...0.009792-0.0149460.028075-0.031737-0.0470900.030326-0.023545-0.0148240.003257-0.022161
1240138605726760962-0.0672180.000135-0.009630-0.0022400.0133520.0078570.011120-0.0118680.0104170.003038...0.005619-0.0011520.008422-0.029487-0.0192860.030090-0.000010-0.018275-0.017999-0.033294
\n", + "

7566 rows × 1536 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "averages_x" + } + }, + "metadata": {}, + "execution_count": 49 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### PCA 2" + ], + "metadata": { + "id": "wcqhC-og54YL" + } + }, + { + "cell_type": "code", + "source": [ + "averages_labels = averages_df[target]\n", + "averages_labels" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ehDYIt976ZWa", + "outputId": "04b24fae-ceda-4d16-a50c-f9753fffd349" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "user_id\n", + "2952 Anti-Trump Human\n", + "635553 Anti-Trump Human\n", + "656993 Anti-Trump Human\n", + "761154 Anti-Trump Human\n", + "777554 Anti-Trump Human\n", + " ... \n", + "1234200349600288772 Pro-Trump Human\n", + "1234846911028453376 Anti-Trump Human\n", + "1237940420136456192 Pro-Trump Human\n", + "1238854780191195136 Anti-Trump Human\n", + "1240138605726760962 Anti-Trump Human\n", + "Name: fourway_label, Length: 7566, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "source": [ + "averages_pipeline = ReductionPipeline(x=averages_x, labels=averages_labels, target=target, n_components=2)\n", + "\n", + "averages_pipeline.perform()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "26515f85-3783-4e74-b8ec-704e28586d0f", + "id": "weDB-cX05-jy" + }, + "execution_count": 43, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n", + "EXPLAINED VARIANCE: 0.08\n", + "LOADINGS (1536, 2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "averages_pipeline.embeddings_df.head()" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 237 + }, + "outputId": "d91e7d4a-d0eb-4a8b-d457-1f8125543f49", + "id": "E0aY19Zo5-jz" + }, + "execution_count": 44, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " component_1 component_2\n", + "user_id \n", + "2952 -6.801425 -1.905464\n", + "635553 -2.489854 -7.583170\n", + "656993 6.493947 -21.433610\n", + "761154 -0.132166 -3.147631\n", + "777554 -4.050601 7.515363" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
component_1component_2
user_id
2952-6.801425-1.905464
635553-2.489854-7.583170
6569936.493947-21.433610
761154-0.132166-3.147631
777554-4.0506017.515363
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"averages_pipeline\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"component_1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.032658089178554,\n \"min\": -6.801425455794354,\n \"max\": 6.493947126606545,\n \"samples\": [\n -2.489853942301952,\n -4.050601021086857,\n 6.493947126606545\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"component_2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10.55811005682525,\n \"min\": -21.4336102269828,\n \"max\": 7.5153633344561666,\n \"samples\": [\n -7.583170181949627,\n 7.5153633344561666,\n -21.4336102269828\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 44 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# todo: re-implement colors map and category orders\n", + "#averages_pipeline.plot_embeddings(fig_show=True, fig_save=False, height=350)" + ], + "metadata": { + "id": "PbFddVU_5-jz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "groupby_cols = [\n", + " \"bot_label\", \"opinion_label\", # \"bom_overall_label\", \"bom_astroturf_label\",\n", + " \"toxic_label\", \"factual_label\",\n", + " \"fourway_label\", #\"sixway_label\",\n", + "]\n", + "\n", + "for groupby_col in groupby_cols:\n", + " color_map = COLORS_MAP[groupby_col]\n", + " category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}\n", + "\n", + " labels = averages_df[groupby_col]\n", + " pipeline = ReductionPipeline(x=averages_x, labels=labels, target=groupby_col, n_components=2)\n", + "\n", + " results_dirpath = os.path.join(RESULTS_DIRPATH, \"openai_embeddings_v2\", \"text-embedding-ada-002\", f\"status_avg_embeddings_{pipeline.reducer_type.lower()}_{pipeline.n_components}\", 
groupby_col)\n", + " os.makedirs(results_dirpath, exist_ok=True)\n", + "\n", + " pipeline.perform()\n", + "\n", + " pipeline.plot_embeddings(\n", + " color=groupby_col, color_map=color_map, category_orders=category_orders,\n", + " #hover_data=[\"user_id\", \"bot_label\"],\n", + " fig_show=True, fig_save=True,\n", + " results_dirpath=results_dirpath\n", + " )" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "51tmPwpfi0er", + "outputId": "2fcdf7f5-d912-4cf7-e202-6671ac1087ac" + }, + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n", + "EXPLAINED VARIANCE: 0.08\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n", + "EXPLAINED VARIANCE: 0.08\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n", + "EXPLAINED VARIANCE: 0.08\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n", + "EXPLAINED VARIANCE: 0.08\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PCA(n_components=2, random_state=99)\n", + "EMBEDDINGS: (7566, 2)\n", + "EXPLAINED VARIANCE RATIO: [0.04725761 0.03663586]\n", + "EXPLAINED VARIANCE: 0.08\n", + "LOADINGS (1536, 2)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/openai_embeddings_v2/de_duping_and_averaging_status_embeddings_(20240216).py b/notebooks/openai_embeddings_v2/de_duping_and_averaging_status_embeddings_(20240216).py new file mode 100644 index 0000000..cd90bcd --- /dev/null +++ b/notebooks/openai_embeddings_v2/de_duping_and_averaging_status_embeddings_(20240216).py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +"""De-duping and Averaging Status Embeddings (20240216) + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1N-aRJ6GfO72QkOSWetqLeXYx9gq_NA8c + +In this notebook, we prepare a clean (de-duped) version of the status embeddings. And we re-construct user embeddings using the average of their status embeddings. + +This notebook saves both datasets back to drive for further analysis. + +## Google Drive +""" + +import os +from google.colab import drive + +drive.mount('/content/drive') +print(os.getcwd(), os.listdir(os.getcwd())) + +# you might need to create a google drive SHORTCUT that has this same path +# ... 
# or update the path to use your own google drive organization

# ---------------------------------------------------------------------------
# Main body of the de-duping / averaging script.
#
# Steps performed here:
#   1. resolve the shared Google Drive data directory,
#   2. load the status-level embeddings from parquet and inspect them,
#   3. keep one row per `status_id` and save the de-duplicated frame,
#   4. average the embedding columns per `user_id`,
#   5. load the user-level CSV that the following cells take labels from.
#
# Relies on `os` having been imported (and Drive mounted) by the cell above.
# ---------------------------------------------------------------------------
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'

print(DIRPATH)
os.path.isdir(DIRPATH)

DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
os.path.isdir(DATA_DIRPATH)

"""## Data Loading"""

# Both pandas readers are imported together so the dependency on pandas is
# visible in one place instead of being scattered through the script.
from pandas import read_csv, read_parquet

pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip")
statuses_df = read_parquet(pq_filepath)
print(statuses_df.shape)
print(statuses_df.columns)
statuses_df.head()

statuses_df["user_id"].nunique()

len(statuses_df)

statuses_df["status_id"].nunique()

"""Oh no, statuses not unique?"""

statuses_df["status_id"].value_counts()

# Show every row that shares a status_id with another row, side by side.
statuses_df[statuses_df["status_id"].duplicated(keep=False)].sort_values("status_id")

"""The embeddings values appear to be the same for each status, so we can take the first row for each status.

## De-Duping

183,727 statuses
"""

print(statuses_df.shape)
# Rebind instead of using `inplace=True`: same resulting frame (first row kept
# per status_id), but the statement reads as an explicit transformation.
statuses_df = statuses_df.drop_duplicates(subset=["status_id"])
print(statuses_df.shape)
# Invariant the rest of the script depends on: exactly one row per status.
assert statuses_df["status_id"].is_unique, "status_id should be unique after de-duping"

"""Saving to drive:"""

pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip")

statuses_df.to_parquet(pq_filepath, compression="gzip")

"""## Averaging Embeddings per User"""

statuses_df.groupby("user_id")["status_id"].count()

# Select the embedding columns by their `openai_` prefix rather than by a
# substring match, so an unrelated column that merely mentions "openai" can
# never be averaged (or later dropped) by accident.
embeddings_cols = [col for col in statuses_df.columns if col.startswith("openai_")]
print(len(embeddings_cols))
print(embeddings_cols[0], "...", embeddings_cols[-1])

# One row per user, indexed by user_id: the mean of each embedding column
# over that user's (de-duplicated) statuses.
averages = statuses_df.groupby("user_id")[embeddings_cols].mean()
print(averages.shape)
averages.head()

"""Get user labels from CSV file:"""

csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz")
users_df = read_csv(csv_filepath, compression="gzip")
print(users_df.shape)
print(users_df.columns)
+users_df.head() + +user_labels = users_df.drop(columns=embeddings_cols) +user_labels.index = user_labels["user_id"] +user_labels.head() + +"""Merge user labels columns back in:""" + +averages = averages.merge(user_labels, left_index=True, right_index=True) +averages.head() + +"""Saving to drive:""" + +csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz") + +averages.to_csv(csv_filepath, compression="gzip") \ No newline at end of file diff --git a/notebooks/openai_embeddings_v2/user_vs_tweet_level_embeddings_(impeachment_2020)_dimensionality_reduction_(2024).py b/notebooks/openai_embeddings_v2/user_vs_tweet_level_embeddings_(impeachment_2020)_dimensionality_reduction_(2024).py new file mode 100644 index 0000000..8cdbc8f --- /dev/null +++ b/notebooks/openai_embeddings_v2/user_vs_tweet_level_embeddings_(impeachment_2020)_dimensionality_reduction_(2024).py @@ -0,0 +1,508 @@ +# -*- coding: utf-8 -*- +"""User vs Tweet Level Embeddings (Impeachment 2020) - Dimensionality Reduction (2024) + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1UiL5SUTIm5V7_B6lf1ueFH97EHn9w6P0 + +We fetched user-level and tweet-level OpenAI embeddings and stored on BQ, and copied the data to CSV files on Drive. + +Then we de-duped the status embeddings and calculated the average status embeddings for each user, and saved these CSV files on drive. + +This notebook provides a preliminary analysis of user-level vs tweet-level embeddings, focusing first on dimensionality reduction. + +## Setup + +Package installation: +""" + +# Commented out IPython magic to ensure Python compatibility. 
+# %%capture +# !pip install -U kaleido + +"""May need to restart session before continuing.""" + +!pip list | grep kaleido + +"""## Google Drive""" + +import os +from google.colab import drive + +drive.mount('/content/drive') +print(os.getcwd(), os.listdir(os.getcwd())) + +# you might need to create a google drive SHORTCUT that has this same path +# ... or update the path to use your own google drive organization +#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022' +#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023' +DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024' + +print(DIRPATH) +os.path.isdir(DIRPATH) + +"""New project-based directory structure for 2024: + +https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link +""" + +DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data") +os.path.isdir(DATA_DIRPATH) + +os.listdir(DATA_DIRPATH) + +"""The "unpacked" versions have a column per embedding, and are generally easier to work with. + +The files we will be working with are: + + "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz" (user level embeddings) and + + "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz" (average status-level embeddings). 
+""" + +RESULTS_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "results") +os.makedirs(RESULTS_DIRPATH, exist_ok=True) +os.path.isdir(RESULTS_DIRPATH) + +"""## Colors""" + +# https://github.com/s2t2/openai-embeddings-2023/blob/main/app/colors.py + +#GREY = "#ccc" +#PURPLE = "#7E57C2" + +# colorbrewer scales +# light --> dark +BLUES = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1', '#6baed6', '#4292c6', '#2171b5', '#08519c', '#08306b'] +REDS = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272', '#fb6a4a', '#ef3b2c', '#cb181d', '#a50f15', '#67000d'] +PURPLES = ['#fcfbfd', '#efedf5', '#dadaeb', '#bcbddc', '#9e9ac8', '#807dba', '#6a51a3', '#54278f', '#3f007d'] +GREYS = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd', '#969696', '#737373', '#525252', '#252525', '#000000'] +GREENS = ["#edf8e9","#c7e9c0","#a1d99b","#74c476","#41ab5d","#238b45","#005a32"] +ORANGES = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b', '#fd8d3c', '#f16913', '#d94801', '#a63603', '#7f2704'] +BROWNS = ["#C46200", "#964B00"] +RD_PU = ["#feebe2","#fcc5c0","#fa9fb5","#f768a1","#dd3497","#ae017e","#7a0177"] +PU_RD = ["#f1eef6","#d4b9da","#c994c7","#df65b0","#e7298a","#ce1256","#91003f"] + +OPINION_COLORS_MAP = {"Anti-Trump": BLUES[5], "Pro-Trump": REDS[5]} +BOT_COLORS_MAP = {"Human": GREYS[3], "Bot": PURPLES[6]} +Q_COLORS_MAP = {"Normal": GREYS[3], "Q-anon": REDS[6]} +TOXIC_COLORS_MAP = {"Toxic": BROWNS[1], "Normal": GREYS[3]} +FACT_COLORS_MAP = {"High Quality": GREYS[3], "Low Quality": RD_PU[4]} + +FOURWAY_COLORS_MAP = { + "Anti-Trump Human": BLUES[3], + "Anti-Trump Bot": BLUES[6], + + "Pro-Trump Human": REDS[3], + "Pro-Trump Bot": REDS[6], +} +SIXWAY_COLORS_MAP = { + "Anti-Trump Human": BLUES[3], + "Anti-Trump Bot": BLUES[6], + + "Pro-Trump Human": REDS[3], + "Pro-Trump Bot": REDS[6], + + "Q-anon Human": REDS[4], # "Pro-Trump Q-anon Human" + "Q-anon Bot": REDS[7], # "Pro-Trump Q-anon Bot" +} + + +COLORS_MAP = { + "bot_label": BOT_COLORS_MAP, + "opinion_label": OPINION_COLORS_MAP, + 
"q_label": Q_COLORS_MAP, + "toxic_label": TOXIC_COLORS_MAP, + "factual_label": FACT_COLORS_MAP, + + "fourway_label": FOURWAY_COLORS_MAP, + "sixway_label": SIXWAY_COLORS_MAP, + "bom_overall_label": BOT_COLORS_MAP, + "bom_astroturf_label": BOT_COLORS_MAP, +} + + +BOT_LABEL_ORDER = ["Human", "Bot"] +CATEGORY_ORDERS = { + "bot_label": BOT_LABEL_ORDER, + "bom_overall_label": BOT_LABEL_ORDER, + "bom_astroturf_label": BOT_LABEL_ORDER, + "opinion_label": ["Anti-Trump", "Pro-Trump"], + "q_label": ["Normal", "Q-anon"], + + "toxic_label": ["Normal", "Toxic"], + "factual_label": ["High Quality", "Low Quality"], + + "fourway_label": list(FOURWAY_COLORS_MAP.keys()), + "sixway_label": list(SIXWAY_COLORS_MAP.keys()), +} + +"""## Dimensionality Reduction""" + +#import warnings +#warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") # suppress umap warnings https://github.com/slundberg/shap/issues/2909 +#warnings.simplefilter("ignore", DeprecationWarning) # suppress warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning) https://discuss.python.org/t/how-to-silence-pkg-resources-warnings/28629/7 + +import os + +import numpy as np +from pandas import DataFrame +import plotly.express as px +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler + +N_COMPONENTS = 2 +#REDUCTION_RESULTS_DIRPATH = "results" +FIG_SHOW = True +FIG_SAVE = False + +class ReductionPipeline: + # adapted from: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/reduction/pipeline.py + + def __init__(self, x, labels, target, n_components=N_COMPONENTS, reducer_type="PCA", #results_dirpath=None + ): + """ + + """ + + self.x = x.copy() + self.labels = labels.copy() + self.target = target + + self.reducer_type = reducer_type + self.reducer_name = {"PCA": "pca", "T-SNE": "tsne", "UMAP": "umap"}[self.reducer_type] + + self.x_scaled = (self.x - self.x.mean(axis=0)) / self.x.std(axis=0) + #scaler = StandardScaler() + #self.x_scaled = 
def perform(self):
    """Fit PCA on the standardized features and store the results on the instance.

    Sets:
        self.pca: the fitted PCA estimator (fixed random_state for repeatable output).
        self.embeddings_df: projected rows, one column per entry of
            self.component_names, indexed like self.x.
        self.loadings_df: component loadings (components scaled by the square root of
            their explained variance), one row per input feature.

    Raises:
        NotImplementedError: if the pipeline was created with a reducer_type other
            than "PCA". Only PCA is implemented in this method.
    """
    if self.reducer_type != "PCA":
        # without this guard the PCA projection would be produced anyway, while
        # plot titles and file stems are built from the requested reducer type/name
        raise NotImplementedError(
            f"reducer_type={self.reducer_type!r} is not implemented; only 'PCA' is supported"
        )

    self.pca = PCA(n_components=self.n_components, random_state=99)
    print(self.pca)

    embeddings = self.pca.fit_transform(self.x_scaled)
    print("EMBEDDINGS:", embeddings.shape)
    self.embeddings_df = DataFrame(embeddings, columns=self.component_names, index=self.x.index)

    print("EXPLAINED VARIANCE RATIO:", self.pca.explained_variance_ratio_)
    print("EXPLAINED VARIANCE:", self.pca.explained_variance_ratio_.sum().round(2))

    # https://stackoverflow.com/questions/21217710/factor-loadings-using-sklearn/44728692#44728692
    loadings = self.pca.components_.T * np.sqrt(self.pca.explained_variance_)
    print("LOADINGS", loadings.shape)

    # feature_names_in_ is only set by scikit-learn when every column name is a string;
    # fall back to the columns of the frame that was actually fitted
    feature_names = getattr(self.pca, "feature_names_in_", self.x_scaled.columns)
    self.loadings_df = DataFrame(loadings, columns=self.component_names, index=feature_names)
def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, results_dirpath=None,
                    subtitle=None, text=None, size=None, hover_data=None,
                    color=None, color_map=None, color_scale=None, category_orders=None):
    """Scatter-plot the reduced embeddings (2D or 3D), optionally showing and saving the figure.

    Must be called after perform(), which creates self.embeddings_df.

    Params:
        height: figure height passed to plotly.
        fig_show: when true, display the figure.
        fig_save: when true, write "<reducer_name>_<n_components>.png" and ".html"
            into the results directory.
        results_dirpath: directory for saved figures. Falls back to
            self.results_dirpath when that attribute exists.
        subtitle: optional second title line.
        text, size, hover_data, color: forwarded to plotly express when provided.
        color_map: forwarded as color_discrete_map.
        color_scale: forwarded as color_continuous_scale.
        category_orders: forwarded as category_orders.

    Returns:
        The plotly figure, or None when n_components is neither 2 nor 3.

    Raises:
        ValueError: if fig_save is true and no results directory can be resolved.
    """
    # labels are joined on the index, so they must share the index of the embeddings
    chart_df = self.embeddings_df.merge(self.labels, left_index=True, right_index=True)

    title = f"Dimensionality Reduction Results ({self.reducer_type} n_components={self.n_components})"
    if subtitle:
        # NOTE(review): the separator markup was lost in the reviewed copy of this file;
        # reconstructed as a plotly line break -- confirm against the notebook
        title += f"<br>{subtitle}"

    chart_params = dict(x="component_1", y="component_2", title=title, height=height, hover_data=hover_data)

    # only forward the optional arguments that were actually supplied
    optional_params = {
        "color": color,
        "color_discrete_map": color_map,
        "color_continuous_scale": color_scale,
        "category_orders": category_orders,
        "size": size,
        "text": text,
    }
    chart_params.update({name: value for name, value in optional_params.items() if value})

    if self.n_components == 2:
        fig = px.scatter(chart_df, **chart_params)
    elif self.n_components == 3:
        chart_params["z"] = "component_3"
        fig = px.scatter_3d(chart_df, **chart_params)
    else:
        return None

    save_dirpath = None
    if fig_save:
        # the constructor does not set self.results_dirpath, so look it up defensively
        # and fail with a clear message (before displaying anything) instead of an AttributeError
        save_dirpath = results_dirpath or getattr(self, "results_dirpath", None)
        if not save_dirpath:
            raise ValueError("fig_save=True requires results_dirpath (or a results_dirpath attribute on the pipeline)")

    if fig_show:
        fig.show()

    if fig_save:
        os.makedirs(save_dirpath, exist_ok=True)
        filestem = os.path.join(save_dirpath, f"{self.reducer_name}_{self.n_components}")
        fig.write_image(f"{filestem}.png")
        fig.write_html(f"{filestem}.html")

    return fig
def add_labels(users_df, toxic_threshold=0.1, fact_threshold=3.0):
    """Return a copy of users_df with the derived label columns added.

    The input frame is left untouched, so re-running a cell that calls this function
    always starts from the same data.

    APPLIES THE SAME LABELS AS THE ORIGINAL SOURCE CODE:
    https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51
    https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64

    Params:
        users_df: frame with the columns "opinion_community", "is_bot", "avg_toxicity",
            "avg_fact_score", "bom_overall" and "bom_astroturf".
        toxic_threshold: minimum "avg_toxicity" for a row to be labeled "Toxic".
        fact_threshold: minimum "avg_fact_score" for a row to be labeled "High Quality".

    Returns:
        A new DataFrame holding every input column plus: opinion_label, bot_label,
        fourway_label, is_toxic, toxic_label, is_factual, factual_label,
        is_bom_overall, is_bom_astroturf, bom_overall_label, bom_astroturf_label,
        bom_overall_fourway_label, bom_astroturf_fourway_label.
    """
    opinion_label = users_df["opinion_community"].map({0: "Anti-Trump", 1: "Pro-Trump"})
    bot_label = users_df["is_bot"].map({True: "Bot", False: "Human"})

    # language toxicity scores (0 low - 1 high)
    # a null score compares as False here, so rows without a toxicity score end up "Normal"
    is_toxic = (users_df["avg_toxicity"] >= toxic_threshold).map({True: 1, False: 0})

    # fact check / media quality scores (1 low - 5 high)
    # there are null avg_fact_score values; those rows stay null instead of being labeled
    fact_score = users_df["avg_fact_score"]
    is_factual = (fact_score >= fact_threshold).map({True: 1, False: 0}).where(fact_score.notnull())

    # botometer scores rounded to a binary flag
    is_bom_overall = users_df["bom_overall"].round()
    is_bom_astroturf = users_df["bom_astroturf"].round()
    bom_overall_label = is_bom_overall.map({1: "Bot", 0: "Human"})
    bom_astroturf_label = is_bom_astroturf.map({1: "Bot", 0: "Human"})

    # assign() works on a copy, which keeps the caller's frame unmodified
    return users_df.assign(
        opinion_label=opinion_label,
        bot_label=bot_label,
        fourway_label=opinion_label + " " + bot_label,
        is_toxic=is_toxic,
        toxic_label=is_toxic.map({1: "Toxic", 0: "Normal"}),
        is_factual=is_factual,
        factual_label=is_factual.map({1: "High Quality", 0: "Low Quality"}),
        is_bom_overall=is_bom_overall,
        is_bom_astroturf=is_bom_astroturf,
        bom_overall_label=bom_overall_label,
        bom_astroturf_label=bom_astroturf_label,
        bom_overall_fourway_label=opinion_label + " " + bom_overall_label,
        bom_astroturf_fourway_label=opinion_label + " " + bom_astroturf_label,
    )
users_df["fourway_label"].value_counts()

"""### Splitting"""

users_df.index = users_df["user_id"]

embeddings_cols = [col for col in users_df.columns if "openai" in col]
print(len(embeddings_cols))
print(embeddings_cols[0], "...", embeddings_cols[-1])

users_x = users_df[embeddings_cols]
users_x.head()

"""### PCA 2"""

# /usr/local/lib/python3.10/dist-packages/plotly/express/_core.py:1223:
# PerformanceWarning: DataFrame is highly fragmented.
# This is usually the result of calling `frame.insert` many times, which has poor performance.
# Consider joining all columns at once using pd.concat(axis=1) instead.
# To get a de-fragmented frame, use `newframe = frame.copy()`
#   df_output[col_name] = to_unindexed_series(df_input[argument])

# every choice offered here must be a key of COLORS_MAP and CATEGORY_ORDERS, which are looked up below
target = "fourway_label" #@param ["bot_label", "opinion_label", "fourway_label", "toxic_label", "factual_label"]
user_labels = users_df[target]
user_labels

users_pipeline = ReductionPipeline(x=users_x, labels=user_labels, target=target, n_components=2)

users_pipeline.perform()

users_pipeline.embeddings_df.head()

color_map = COLORS_MAP[target]
category_orders = {target: CATEGORY_ORDERS[target]}

users_pipeline.plot_embeddings(fig_show=False, fig_save=False, height=350,
    color=target, color_map=color_map, category_orders=category_orders
)

groupby_cols = [
    "bot_label", "opinion_label", # "bom_overall_label", "bom_astroturf_label",
    "toxic_label", "factual_label",
    "fourway_label", #"sixway_label",
]


def plot_pca_by_label(labels_df, x, results_prefix, label_cols=groupby_cols, n_components=2):
    """Run the reduction pipeline once per label column, showing and saving each chart.

    Params:
        labels_df: frame that holds the label columns, indexed like x.
        x: frame of embedding columns to reduce.
        results_prefix: leading part of the results folder name, e.g. "user_embeddings";
            the reducer type and number of components are appended to it.
        label_cols: label columns to color by; each gets its own results subfolder.
        n_components: number of components to reduce to.
    """
    for label_col in label_cols:
        pipeline = ReductionPipeline(x=x, labels=labels_df[label_col], target=label_col, n_components=n_components)

        results_dirpath = os.path.join(
            RESULTS_DIRPATH, "openai_embeddings_v2", "text-embedding-ada-002",
            f"{results_prefix}_{pipeline.reducer_type.lower()}_{pipeline.n_components}", label_col
        )
        os.makedirs(results_dirpath, exist_ok=True)

        pipeline.perform()

        pipeline.plot_embeddings(
            color=label_col,
            color_map=COLORS_MAP[label_col],
            category_orders={label_col: CATEGORY_ORDERS[label_col]},
            fig_show=True, fig_save=True,
            results_dirpath=results_dirpath
        )


plot_pca_by_label(users_df, users_x, results_prefix="user_embeddings")

"""## Tweet Embeddings (User Averages)

183K statuses, averaged for each user (see prior notebook). 7566 rows resulting

### Loading
"""

from pandas import read_csv

csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz")
averages_df = read_csv(csv_filepath)
print(averages_df.shape)
print(averages_df.columns)
averages_df.index = averages_df["user_id"]
averages_df.head()

averages_df["user_id"].nunique()

len(averages_df)

averages_df = add_labels(averages_df)
print(averages_df.shape)
print(averages_df.columns.tolist())
averages_df.head()

"""### Splitting"""

averages_x = averages_df[embeddings_cols]
averages_x

"""### PCA 2"""

averages_labels = averages_df[target]
averages_labels

averages_pipeline = ReductionPipeline(x=averages_x, labels=averages_labels, target=target, n_components=2)

averages_pipeline.perform()

averages_pipeline.embeddings_df.head()

plot_pca_by_label(averages_df, averages_x, results_prefix="status_avg_embeddings")
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.png new file mode 100644 index 0000000..ce05d91 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/bot_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.html new file mode 100644 index 0000000..3e878bb --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.png new file mode 100644 index 0000000..b2b4821 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/factual_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.html new file mode 100644 index 0000000..640a70a --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.png new file mode 100644 index 0000000..e48e6e3 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/fourway_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.html new file mode 100644 index 0000000..31a5fef --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.png new file mode 100644 index 0000000..9bdd6ae Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/opinion_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.html new file mode 100644 index 0000000..6a04644 --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.png new file mode 100644 index 0000000..3526c27 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/status_avg_embeddings_pca_2/toxic_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.html new file mode 100644 index 0000000..887d834 --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.png new file mode 100644 index 0000000..2532d9d Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/bot_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.html new file mode 100644 index 0000000..abcba28 --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.png new file mode 100644 index 0000000..6bd3ef3 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/factual_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.html new file mode 100644 index 0000000..6b8b863 --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.png new file mode 100644 index 0000000..99b0e01 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/fourway_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.html new file mode 100644 index 0000000..c95c852 --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.png new file mode 100644 index 0000000..cc35780 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/opinion_label/pca_2.png differ diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.html b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.html new file mode 100644 index 0000000..eb4275a --- /dev/null +++ b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.html @@ -0,0 +1,14 @@ + + + +
+
+ + \ No newline at end of file diff --git a/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.png b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.png new file mode 100644 index 0000000..453f054 Binary files /dev/null and b/results/openai_embeddings_v2/text-embedding-ada-002/user_embeddings_pca_2/toxic_label/pca_2.png differ