Skip to content

Commit

Permalink
Merge pull request #2 from meenakshi-mittal/main
Browse files Browse the repository at this point in the history
Added processing notebooks
  • Loading branch information
mihranmiroyan authored Sep 25, 2024
2 parents 87e35c8 + 5a223bc commit be803e4
Show file tree
Hide file tree
Showing 3 changed files with 1,001 additions and 0 deletions.
251 changes: 251 additions & 0 deletions processing/multiturn_OCR_processing.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8dulGX8CvneH"
},
"outputs": [],
"source": [
"!pip install -qU --upgrade azure-cognitiveservices-vision-computervision\n",
"!pip install -qU pillow\n",
"!pip install -qU papermill"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FHKPkmYLF_qS"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from google.colab import drive\n",
"from azure.cognitiveservices.vision.computervision import ComputerVisionClient\n",
"from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes\n",
"from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes\n",
"from msrest.authentication import CognitiveServicesCredentials\n",
"from array import array\n",
"import os\n",
"from PIL import Image\n",
"import sys\n",
"import time\n",
"import ast\n",
"import numpy as np\n",
"from xml.etree import ElementTree as ET"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XqkUhtJv5LiL"
},
"outputs": [],
"source": [
"drive.mount('/content/drive')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AGxUkkMt4YBQ"
},
"source": [
"This notebook allows you to run OCR processing on all historical data with one cell call. Please input ALL desired semester names in a list like the one below in order to run it on all semesters at once."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E0-3sn0tGBOc"
},
"outputs": [],
"source": [
"class_names = ['data8_sp24_multiturn','data8_fa23_multiturn']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zj4dsJAM6AD2"
},
"source": [
"Please replace the subscription key appropriately."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ai-oyHl2GJaj"
},
"outputs": [],
"source": [
"subscription_key = <YOUR_KEY>\n",
"endpoint = \"https://edllm-ocr-v1.cognitiveservices.azure.com/\"\n",
"\n",
"computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))\n",
"\n",
"print(\"===== Read File - remote =====\")\n",
"read_image_url = \"https://learn.microsoft.com/azure/ai-services/computer-vision/media/quickstarts/presentation.png\"\n",
"\n",
"read_response = computervision_client.read(read_image_url, raw=True)\n",
"read_operation_location = read_response.headers[\"Operation-Location\"]\n",
"operation_id = read_operation_location.split(\"/\")[-1]\n",
"while True:\n",
" read_result = computervision_client.get_read_result(operation_id)\n",
" if read_result.status not in ['notStarted', 'running']:\n",
" break\n",
"if read_result.status == OperationStatusCodes.succeeded:\n",
" for text_result in read_result.analyze_result.read_results:\n",
" for line in text_result.lines:\n",
" print(line.text)\n",
" print(line.bounding_box)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "S-pyeo2BGtkh"
},
"outputs": [],
"source": [
"memoized_dict = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_Uzl86EgHJK2"
},
"outputs": [],
"source": [
"def my_python_tool(question: str, xml: str) -> str:\n",
"\n",
" subscription_key = <YOUR_KEY>\n",
" endpoint = 'https://edllm-ocr-v1.cognitiveservices.azure.com/'\n",
" computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))\n",
" question = str(question)\n",
" root = ET.fromstring(xml)\n",
" image_links = [image.get('src') for image in root.iter('image')]\n",
" extracted_text = ''\n",
"\n",
" for img_link in image_links:\n",
" if img_link in memoized_dict:\n",
" extracted_text += memoized_dict[img_link]\n",
" continue\n",
"\n",
" try:\n",
"\n",
" read_response = computervision_client.read(img_link, raw=True)\n",
"\n",
" except:\n",
" print(f\"Failed question: {question}\")\n",
" continue\n",
" read_operation_location = read_response.headers[\"Operation-Location\"]\n",
" operation_id = read_operation_location.split(\"/\")[-1]\n",
" while True:\n",
" read_result = computervision_client.get_read_result(operation_id)\n",
" if read_result.status not in ['notStarted', 'running']:\n",
" break\n",
" if read_result.status == OperationStatusCodes.succeeded:\n",
" for text_result in read_result.analyze_result.read_results:\n",
" for line in text_result.lines:\n",
" extracted_text += str(line.text)\n",
" memoized_dict[img_link]=extracted_text\n",
"\n",
" return question, extracted_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "deveF14eHKbm"
},
"outputs": [],
"source": [
"def safe_my_python_tool(row):\n",
" try:\n",
" if row.name % 100 == 0:\n",
" print(f'{row.name}/{len(qa_data)}')\n",
" questions = []\n",
" for i in ast.literal_eval(row['memory']):\n",
" question = {}\n",
" q, context = my_python_tool(i['text'],i['document'])\n",
" if (i['user_role'] == 'student' and not i['endorsed']):\n",
" question['role'] = 'Student'\n",
" else:\n",
" question['role'] = 'TA'\n",
" question['text'] = q\n",
" question['image context'] = context\n",
" questions.append(question)\n",
"\n",
" question = {}\n",
" q, context = my_python_tool(row.question, row.document_q)\n",
" question['role'] = 'Student'\n",
" question['text'] = q\n",
" question['image context'] = context\n",
" questions.append(question)\n",
"\n",
" return questions\n",
" except Exception as e:\n",
" print(f\"Error processing row with index {row.name}: {e}\")\n",
" return []"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6tdfXvG94w__"
},
"source": [
"While the next cell is running, you will see a progress bar for each semester along with any OCR errors. This cell may take a long time to run (several hours)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NF9kT4HPrNeQ"
},
"outputs": [],
"source": [
"for class_name in class_names:\n",
" qa_data = pd.read_csv(f'drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)/{class_name}/qa.csv')\n",
"\n",
" print('\\n\\n===============================================================================================')\n",
" print(f'BEGIN PROCESSING OF {class_name}')\n",
" print('===============================================================================================\\n\\n')\n",
" test_ser = qa_data.apply(safe_my_python_tool, axis=1)\n",
" qa_data[\"QuestionOCR\"] = test_ser\n",
" data = qa_data\n",
" data['Metadata'] = 'type:' + data['type'] + ' | category:' + data['category']\n",
" data['Question'] = data['QuestionOCR']\n",
" data['Answer'] = data['answer']\n",
" xls = data[['Question', 'Answer', 'Metadata']]\n",
" xls.to_excel(f\"drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)/{class_name}/excel_simple_ocr.xlsx\", index=False)\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading

0 comments on commit be803e4

Please sign in to comment.