From ea6224548129255f76ad6f6281f55fbdc2d2b6d7 Mon Sep 17 00:00:00 2001 From: zlisto Date: Sat, 8 Feb 2025 16:19:49 -0500 Subject: [PATCH] HW3 --- main/HW3.ipynb | 553 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 main/HW3.ipynb diff --git a/main/HW3.ipynb b/main/HW3.ipynb new file mode 100644 index 0000000..ec95b6c --- /dev/null +++ b/main/HW3.ipynb @@ -0,0 +1,553 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Homework 3\n", + "\n", + "This notebook provides some skeleton code to get you started on the homework. Add in your own code and markdown cells to answer the homework questions. You will submit this notebook as an .ipynb file. \n", + "\n", + "Before starting, select \"Runtime->Factory reset runtime\" to start with your directories and environment in the base state.\n", + "\n", + "If you want to save changes to the notebook, select \"File->Save a copy in Drive\" from the top menu in Colab. This will save the notebook in your Google Drive." 
+ ], + "metadata": { + "id": "vZcOmmvLdZCf" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Clone, Install, Import" + ], + "metadata": { + "id": "xB5kK0YIdfoS" + } + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/zlisto/social_media_genAI\n", + "\n", + "import os\n", + "os.chdir(\"social_media_genAI/main\")" + ], + "metadata": { + "id": "5naKmAgWdm9U", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "ea415bb2-bade-4258-cdf3-b1def8049e1d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'social_media_genAI'...\n", + "remote: Enumerating objects: 1872, done.\u001b[K\n", + "remote: Counting objects: 100% (9/9), done.\u001b[K\n", + "remote: Compressing objects: 100% (8/8), done.\u001b[K\n", + "remote: Total 1872 (delta 1), reused 6 (delta 1), pack-reused 1863 (from 2)\u001b[K\n", + "Receiving objects: 100% (1872/1872), 209.13 MiB | 13.71 MiB/s, done.\n", + "Resolving deltas: 100% (178/178), done.\n", + "Updating files: 100% (1496/1496), done.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install -q -r requirements.txt" + ], + "metadata": { + "id": "ZmXUOZsqdb8v", + "collapsed": true, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "be8ae64d-5c03-43e4-a99e-c31904ddb78a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m548.3/548.3 kB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m52.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.0/85.0 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.3/244.3 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.7/264.7 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m313.6/313.6 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for fpdf (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "nx-cugraph-cu12 24.12.0 requires networkx>=3.2, but you have networkx 3.0 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cufft-cu12 11.2.3.61 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-curand-cu12==10.3.5.147; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-curand-cu12 10.3.6.82 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cusolver-cu12==11.6.1.9; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cusolver-cu12 11.6.3.83 which is incompatible.\n", + "torch 2.5.1+cu124 requires nvidia-cusparse-cu12==12.3.1.170; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cusparse-cu12 12.5.1.3 which is incompatible.\n", + "torch 2.5.1+cu124 requires 
nvidia-nvjitlink-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-nvjitlink-cu12 12.5.82 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import codecs #this let's us display tweets properly (emojis, etc.)\n", + "import textwrap as tr\n", + "from tqdm import tqdm #progress bar for for loops\n", + "from IPython.display import HTML\n", + "import json\n", + "import textwrap\n", + "import plotly.express as px\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from scripts.genai import GenAI\n", + "\n", + "#this option makes it so tweets display nicely in a dataframe\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + "\n", + "#this code sets the font sizes for plots\n", + "plt.rcParams.update({'axes.labelsize': 18,\n", + " 'xtick.labelsize': 14,\n", + " 'ytick.labelsize': 14,\n", + " 'figure.figsize':(8,6),\n", + " 'axes.grid':True})\n", + "\n" + ], + "metadata": { + "id": "K8a9kY4idotl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Connect to ChatGPT" + ], + "metadata": { + "id": "pFLwtwa3efWp" + } + }, + { + "cell_type": "code", + "source": [ + "OPENAI_API_KEY = \"\"" + ], + "metadata": { + "id": "La37OkxcecbH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize GenAI\n", + "jarvis = GenAI(OPENAI_API_KEY)\n", + "\n", + "text = \"Who are you?\"\n", + "response = jarvis.generate_text(text)\n", + "print(f\"Human:{text}\\nAI:{response}\")" + ], + "metadata": { + "id": "nTp3BC-uec5f", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2dda11ac-d110-400e-d903-3f435906679d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ 
+ "Human:Who are you?\n", + "AI:I’m Jarvis, an AI designed to assist you with a wide range of questions and tasks. Whether you need information, advice, or help with problem-solving, I'm here to help! What can I assist you with today?\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Problem 1 (30 points) Generate Tweets in Style of User\n", + "\n", + "We will generate tweets in the style of a user.\n", + "\n" + ], + "metadata": { + "id": "rLtOQT5id0cp" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 1.1 (5 points) Load tweets\n", + "\n", + "Load the tweets into a dataframe and call it `df`. The tweets are in the file `\"data/TwExportly/TwExportly_deepseek_ai_tweets_2025_01_30.csv\"`. Print out the number of rows in `df`." + ], + "metadata": { + "id": "rUka0z9Md1yK" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "-g95BQWbbEMS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 1.2 (5 points) Keep high engagement tweets\n", + "\n", + "Add a column to `df` called `\"engagement\"` which is the favorite count divided by the view count. Create two strings called `tweets_str_high` and `tweets_str_low` which are the text of the tweets whose engagement is in the top 10% and bottom 10%. The string format should be `\"Tweet:text\\n, Tweet:text\\n...\"`." + ], + "metadata": { + "id": "dxgPvnSsgq4c" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "ROq5hyirbFP4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 1.3 (10 points) Analyze engagement with AI\n", + "\n", + "Create `instructions` which tells the AI to return to you an analysis of how the high engagement tweets differ from low engagement tweets. Also have the instructions show you specific examples of high and low engagement tweets. 
Construct your `prompt` so it contains the high and low engagement tweets and distinguishes between them. Have the AI return the `analysis` as an HTML file. Display your `analysis` HTML output with dark mode background using the `display` and `HTML` functions so it is easy to read." + ], + "metadata": { + "id": "G2oVdtFDiw2A" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "J_-EAQk-bGUg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 1.4 (10 points) Enhance tweets with AI\n", + "\n", + "Take the 5 lowest engagement tweets and enhance them with the AI based on your `analysis` from the previous problem. Put the `analysis`, along with the high and low engagement tweet samples in `instructions`. Print out the original tweet and its `engagement` value (so we know you picked the low engagement tweets) and then below it print the enhanced tweet. Make sure you put some kind of demarcation string (like `\"-\"*50`) between each group of tweets, put headers for the tweets like `\"Old Tweet\"` and `\"Enhanced Tweet\"`, and use the `fill` function so it is easier to read. Using formatted strings, `sort_values` and a `for` loop might be handy here." + ], + "metadata": { + "id": "rID114kKj9nG" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "zlnCK8-bbHXh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Problem 2 (30 points) Actor and Critic\n", + "\n", + "We will use the AI as a critic to help us improve our tweets.\n" + ], + "metadata": { + "id": "Mhmpr-TtoK9c" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 2.1 (5 points) Generate a simple tweet.\n", + "\n", + "Choose a `topic` and a `style` and use the AI to generate a tweet and call it `tweet_base`. 
Display it with the `display_tweet` function as was done in class (put a semi-colon after the `display_tweet` function so it won't print out the raw HTML for the tweet)." + ], + "metadata": { + "id": "HVPGH1Whq3Si" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "OfifztxjbIbZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 2.2 (10 points) Critic\n", + "\n", + "Write a Python function called `critic` that takes as input the tweet text and returns the tweet's quality score between 0 and 10, where higher quality means the tweet will get more engagement, the reason why it gave it that score, and ways to improve it. Have it return the answer as an HTML table in dark mode. Add a docstring to the function so people can get `help` for it. Print out the `help(critic)` statement." + ], + "metadata": { + "id": "sNFfMSWtH_Wu" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "ryEPzUDxbJir" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 2.3 (5 points) Enhance the base tweet with critic\n", + "\n", + "Use the `critic` function to enhance your `tweet_base` and call it `tweet_enhanced`. Display the two tweets with `display_tweet` and display the two analyses with the `display` and `HTML` functions. Display the tweet, and then its analysis so it's easy to read." + ], + "metadata": { + "id": "txtqpD2VKi6K" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "zaWs33tbbKh6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 2.4 (10 points) Keep enhancing\n", + "\n", + "Start with `tweet_base` and keep enhancing it with the `critic` function three times. Display the tweet with `display_tweet` and show the HTML of the `analysis`."
+ ], + "metadata": { + "id": "MBgfqI7KRDgc" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "AcUrcO55bLoR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Problem 3 (10 points) Customer Service Tweets\n", + "\n", + "We will use the AI to make a Twitter customer service bot." + ], + "metadata": { + "id": "TJlGSl1vMuZu" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 3.1 (2 points) Load complaint tweets\n", + "\n", + "Load the data in `\"data/tweet_complaints_AmericanAir.csv\"` into a dataframe `df_complaints`. Print the head of the dataframe." + ], + "metadata": { + "id": "IIFuTZIlM5w7" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Q_rnpm6PbM_b" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 3.2 (3 points) Customer service instructions\n", + "\n", + "Create a string `instructions` that tells the AI to address the customer complaints. Create another string `instructions_style` that has information on the identity and style of the AI (name, background, speaking style, etc. ). Please be creative with the AI identity. Add `instructions_style` to `instructions`. Print out `instructions`." + ], + "metadata": { + "id": "Rlcd99DOOMpp" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "YTaUisdybN8k" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 3.3 (5 points) Reply to complaints with AI\n", + "\n", + "Use `instructions` and the AI to generate a tweet to reply to each complaint in `df_complaints`. You can put the complaint in the `prompt`. Use `display_tweet` to show the complaint tweet and the AI's response tweet below it. Set the AI's `screen_name` to be \"AmericanAirlines\". Put some space or new lines between each complaint/response pair so it is easy to read. 
" + ], + "metadata": { + "id": "onsL-3eQTb58" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "kK3FEcu_bO9U" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Problem 4 (30 points) Twitter to Instagram\n", + "\n", + "In this problem we will take content from a Twitter account and convert it into content for Instagram" + ], + "metadata": { + "id": "pp6wFpE5UCa6" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 4.1 (5 points) Load tweets\n", + "\n", + "Load the tweets from the file `\"data/TwExportly/TwExportly_mrbeast_tweets_2024_02_21.csv\"` into a dataframe `df`. Define a variable `screen_name` equal to the screen name of the creator of these tweets. Print `screen_name` and the number of tweets in `df`." + ], + "metadata": { + "id": "Zs5qcHM3Ukhg" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Hjw0WUpXbQHE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 4.2 (5 points) Most favorite tweet\n", + "\n", + "Sort `df` by `favorite_count` from highest to lowest and display ONLY the `favorite_count` and `text` columns of the most favorited tweets." + ], + "metadata": { + "id": "H08Jam0GVRcq" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "LtiVMhTwbRGD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 4.3 (5 points) Why so popular?\n", + "\n", + "Ask the AI to explain why it thinks the top tweet was the most favorite. Display the answer as a nicely formatted HTML in dark mode." 
+ ], + "metadata": { + "id": "H4SeCuUVVw5n" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "KitRUvfWbSJY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Problem 4.4 (15 points) Image idea from top tweets\n", + "\n", + "Take the top 10 most favorited tweets and give them to the AI in a `prompt`. Tell it that these are the most liked tweets of the account and to generate a description of an image this account can post on Instagram that will be liked by its followers. Then generate an image and an Instagram caption from the description.\n", + "\n", + "Display the image and caption as an Instagram post using `display_IG` as was done in class." + ], + "metadata": { + "id": "C7nVPpc4WIE0" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "sTrJPzDrW6Eg" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file