Merge pull request #2 from meenakshi-mittal/main

Added processing notebooks
mihranmiroyan · Sep 25, 2024 · be803e4 · be803e4
2 parents 87e35c8 + 5a223bc
commit be803e4
Show file tree

Hide file tree

Showing 3 changed files with 1,001 additions and 0 deletions.
diff --git a/processing/multiturn_OCR_processing.ipynb b/processing/multiturn_OCR_processing.ipynb
@@ -0,0 +1,251 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8dulGX8CvneH"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -qU --upgrade azure-cognitiveservices-vision-computervision\n",
+        "!pip install -qU pillow\n",
+        "!pip install -qU papermill"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "FHKPkmYLF_qS"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "from google.colab import drive\n",
+        "from azure.cognitiveservices.vision.computervision import ComputerVisionClient\n",
+        "from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes\n",
+        "from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes\n",
+        "from msrest.authentication import CognitiveServicesCredentials\n",
+        "from array import array\n",
+        "import os\n",
+        "from PIL import Image\n",
+        "import sys\n",
+        "import time\n",
+        "import ast\n",
+        "import numpy as np\n",
+        "from xml.etree import ElementTree as ET"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "XqkUhtJv5LiL"
+      },
+      "outputs": [],
+      "source": [
+        "drive.mount('/content/drive')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AGxUkkMt4YBQ"
+      },
+      "source": [
+        "This notebook allows you to run OCR processing on all historical data with one cell call. Please input ALL desired semester names in a list like the one below in order to run it on all semesters at once."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "E0-3sn0tGBOc"
+      },
+      "outputs": [],
+      "source": [
+        "class_names = ['data8_sp24_multiturn','data8_fa23_multiturn']"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zj4dsJAM6AD2"
+      },
+      "source": [
+        "Please replace the subscription key appropriately."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ai-oyHl2GJaj"
+      },
+      "outputs": [],
+      "source": [
+        "subscription_key = <YOUR_KEY>\n",
+        "endpoint = \"https://edllm-ocr-v1.cognitiveservices.azure.com/\"\n",
+        "\n",
+        "computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))\n",
+        "\n",
+        "print(\"===== Read File - remote =====\")\n",
+        "read_image_url = \"https://learn.microsoft.com/azure/ai-services/computer-vision/media/quickstarts/presentation.png\"\n",
+        "\n",
+        "read_response = computervision_client.read(read_image_url,  raw=True)\n",
+        "read_operation_location = read_response.headers[\"Operation-Location\"]\n",
+        "operation_id = read_operation_location.split(\"/\")[-1]\n",
+        "while True:\n",
+        "    read_result = computervision_client.get_read_result(operation_id)\n",
+        "    if read_result.status not in ['notStarted', 'running']:\n",
+        "        break\n",
+        "if read_result.status == OperationStatusCodes.succeeded:\n",
+        "    for text_result in read_result.analyze_result.read_results:\n",
+        "        for line in text_result.lines:\n",
+        "            print(line.text)\n",
+        "            print(line.bounding_box)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "S-pyeo2BGtkh"
+      },
+      "outputs": [],
+      "source": [
+        "memoized_dict = {}"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_Uzl86EgHJK2"
+      },
+      "outputs": [],
+      "source": [
+        "def my_python_tool(question: str, xml: str) -> str:\n",
+        "\n",
+        "        subscription_key = <YOUR_KEY>\n",
+        "        endpoint = 'https://edllm-ocr-v1.cognitiveservices.azure.com/'\n",
+        "        computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))\n",
+        "        question = str(question)\n",
+        "        root = ET.fromstring(xml)\n",
+        "        image_links = [image.get('src') for image in root.iter('image')]\n",
+        "        extracted_text = ''\n",
+        "\n",
+        "        for img_link in image_links:\n",
+        "            if img_link in memoized_dict:\n",
+        "              extracted_text += memoized_dict[img_link]\n",
+        "              continue\n",
+        "\n",
+        "            try:\n",
+        "\n",
+        "              read_response = computervision_client.read(img_link,  raw=True)\n",
+        "\n",
+        "            except:\n",
+        "              print(f\"Failed question: {question}\")\n",
+        "              continue\n",
+        "            read_operation_location = read_response.headers[\"Operation-Location\"]\n",
+        "            operation_id = read_operation_location.split(\"/\")[-1]\n",
+        "            while True:\n",
+        "                read_result = computervision_client.get_read_result(operation_id)\n",
+        "                if read_result.status not in ['notStarted', 'running']:\n",
+        "                    break\n",
+        "            if read_result.status == OperationStatusCodes.succeeded:\n",
+        "                for text_result in read_result.analyze_result.read_results:\n",
+        "                    for line in text_result.lines:\n",
+        "                        extracted_text += str(line.text)\n",
+        "                        memoized_dict[img_link]=extracted_text\n",
+        "\n",
+        "        return question, extracted_text"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "deveF14eHKbm"
+      },
+      "outputs": [],
+      "source": [
+        "def safe_my_python_tool(row):\n",
+        "        try:\n",
+        "            if row.name % 100 == 0:\n",
+        "              print(f'{row.name}/{len(qa_data)}')\n",
+        "            questions = []\n",
+        "            for i in ast.literal_eval(row['memory']):\n",
+        "              question = {}\n",
+        "              q, context = my_python_tool(i['text'],i['document'])\n",
+        "              if (i['user_role'] == 'student' and not i['endorsed']):\n",
+        "                question['role'] = 'Student'\n",
+        "              else:\n",
+        "                question['role'] = 'TA'\n",
+        "              question['text'] = q\n",
+        "              question['image context'] = context\n",
+        "              questions.append(question)\n",
+        "\n",
+        "            question = {}\n",
+        "            q, context = my_python_tool(row.question, row.document_q)\n",
+        "            question['role'] = 'Student'\n",
+        "            question['text'] = q\n",
+        "            question['image context'] = context\n",
+        "            questions.append(question)\n",
+        "\n",
+        "            return questions\n",
+        "        except Exception as e:\n",
+        "            print(f\"Error processing row with index {row.name}: {e}\")\n",
+        "            return []"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6tdfXvG94w__"
+      },
+      "source": [
+        "While the next cell is running, you will see a progress bar for each semester along with any OCR errors. This cell may take a long time to run (several hours)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "NF9kT4HPrNeQ"
+      },
+      "outputs": [],
+      "source": [
+        "for class_name in class_names:\n",
+        "    qa_data = pd.read_csv(f'drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)/{class_name}/qa.csv')\n",
+        "\n",
+        "    print('\\n\\n===============================================================================================')\n",
+        "    print(f'BEGIN PROCESSING OF {class_name}')\n",
+        "    print('===============================================================================================\\n\\n')\n",
+        "    test_ser = qa_data.apply(safe_my_python_tool, axis=1)\n",
+        "    qa_data[\"QuestionOCR\"] = test_ser\n",
+        "    data = qa_data\n",
+        "    data['Metadata'] = 'type:' + data['type'] + ' | category:' + data['category']\n",
+        "    data['Question'] = data['QuestionOCR']\n",
+        "    data['Answer'] = data['answer']\n",
+        "    xls = data[['Question', 'Answer', 'Metadata']]\n",
+        "    xls.to_excel(f\"drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)/{class_name}/excel_simple_ocr.xlsx\", index=False)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}