-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from meenakshi-mittal/main
Added processing notebooks
- Loading branch information
Showing
3 changed files
with
1,001 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "8dulGX8CvneH" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"!pip install -qU --upgrade azure-cognitiveservices-vision-computervision\n", | ||
"!pip install -qU pillow\n", | ||
"!pip install -qU papermill" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "FHKPkmYLF_qS" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from google.colab import drive\n", | ||
"from azure.cognitiveservices.vision.computervision import ComputerVisionClient\n", | ||
"from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes\n", | ||
"from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes\n", | ||
"from msrest.authentication import CognitiveServicesCredentials\n", | ||
"from array import array\n", | ||
"import os\n", | ||
"from PIL import Image\n", | ||
"import sys\n", | ||
"import time\n", | ||
"import ast\n", | ||
"import numpy as np\n", | ||
"from xml.etree import ElementTree as ET" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "XqkUhtJv5LiL" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"drive.mount('/content/drive')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "AGxUkkMt4YBQ" | ||
}, | ||
"source": [ | ||
"This notebook allows you to run OCR processing on all historical data with one cell call. Please input ALL desired semester names in a list like the one below in order to run it on all semesters at once." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "E0-3sn0tGBOc" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"class_names = ['data8_sp24_multiturn','data8_fa23_multiturn']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "zj4dsJAM6AD2" | ||
}, | ||
"source": [ | ||
"Please replace the subscription key appropriately." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "ai-oyHl2GJaj" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"subscription_key = <YOUR_KEY>\n", | ||
"endpoint = \"https://edllm-ocr-v1.cognitiveservices.azure.com/\"\n", | ||
"\n", | ||
"computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))\n", | ||
"\n", | ||
"print(\"===== Read File - remote =====\")\n", | ||
"read_image_url = \"https://learn.microsoft.com/azure/ai-services/computer-vision/media/quickstarts/presentation.png\"\n", | ||
"\n", | ||
"read_response = computervision_client.read(read_image_url, raw=True)\n", | ||
"read_operation_location = read_response.headers[\"Operation-Location\"]\n", | ||
"operation_id = read_operation_location.split(\"/\")[-1]\n", | ||
"while True:\n", | ||
" read_result = computervision_client.get_read_result(operation_id)\n", | ||
" if read_result.status not in ['notStarted', 'running']:\n", | ||
" break\n", | ||
"if read_result.status == OperationStatusCodes.succeeded:\n", | ||
" for text_result in read_result.analyze_result.read_results:\n", | ||
" for line in text_result.lines:\n", | ||
" print(line.text)\n", | ||
" print(line.bounding_box)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "S-pyeo2BGtkh" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"memoized_dict = {}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "_Uzl86EgHJK2" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def my_python_tool(question: str, xml: str) -> str:\n", | ||
"\n", | ||
" subscription_key = <YOUR_KEY>\n", | ||
" endpoint = 'https://edllm-ocr-v1.cognitiveservices.azure.com/'\n", | ||
" computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))\n", | ||
" question = str(question)\n", | ||
" root = ET.fromstring(xml)\n", | ||
" image_links = [image.get('src') for image in root.iter('image')]\n", | ||
" extracted_text = ''\n", | ||
"\n", | ||
" for img_link in image_links:\n", | ||
" if img_link in memoized_dict:\n", | ||
" extracted_text += memoized_dict[img_link]\n", | ||
" continue\n", | ||
"\n", | ||
" try:\n", | ||
"\n", | ||
" read_response = computervision_client.read(img_link, raw=True)\n", | ||
"\n", | ||
" except:\n", | ||
" print(f\"Failed question: {question}\")\n", | ||
" continue\n", | ||
" read_operation_location = read_response.headers[\"Operation-Location\"]\n", | ||
" operation_id = read_operation_location.split(\"/\")[-1]\n", | ||
" while True:\n", | ||
" read_result = computervision_client.get_read_result(operation_id)\n", | ||
" if read_result.status not in ['notStarted', 'running']:\n", | ||
" break\n", | ||
" if read_result.status == OperationStatusCodes.succeeded:\n", | ||
" for text_result in read_result.analyze_result.read_results:\n", | ||
" for line in text_result.lines:\n", | ||
" extracted_text += str(line.text)\n", | ||
" memoized_dict[img_link]=extracted_text\n", | ||
"\n", | ||
" return question, extracted_text" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "deveF14eHKbm" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def safe_my_python_tool(row):\n", | ||
" try:\n", | ||
" if row.name % 100 == 0:\n", | ||
" print(f'{row.name}/{len(qa_data)}')\n", | ||
" questions = []\n", | ||
" for i in ast.literal_eval(row['memory']):\n", | ||
" question = {}\n", | ||
" q, context = my_python_tool(i['text'],i['document'])\n", | ||
" if (i['user_role'] == 'student' and not i['endorsed']):\n", | ||
" question['role'] = 'Student'\n", | ||
" else:\n", | ||
" question['role'] = 'TA'\n", | ||
" question['text'] = q\n", | ||
" question['image context'] = context\n", | ||
" questions.append(question)\n", | ||
"\n", | ||
" question = {}\n", | ||
" q, context = my_python_tool(row.question, row.document_q)\n", | ||
" question['role'] = 'Student'\n", | ||
" question['text'] = q\n", | ||
" question['image context'] = context\n", | ||
" questions.append(question)\n", | ||
"\n", | ||
" return questions\n", | ||
" except Exception as e:\n", | ||
" print(f\"Error processing row with index {row.name}: {e}\")\n", | ||
" return []" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "6tdfXvG94w__" | ||
}, | ||
"source": [ | ||
"While the next cell is running, you will see a progress bar for each semester along with any OCR errors. This cell may take a long time to run (several hours)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "NF9kT4HPrNeQ" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"for class_name in class_names:\n", | ||
" qa_data = pd.read_csv(f'drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)/{class_name}/qa.csv')\n", | ||
"\n", | ||
" print('\\n\\n===============================================================================================')\n", | ||
" print(f'BEGIN PROCESSING OF {class_name}')\n", | ||
" print('===============================================================================================\\n\\n')\n", | ||
" test_ser = qa_data.apply(safe_my_python_tool, axis=1)\n", | ||
" qa_data[\"QuestionOCR\"] = test_ser\n", | ||
" data = qa_data\n", | ||
" data['Metadata'] = 'type:' + data['type'] + ' | category:' + data['category']\n", | ||
" data['Question'] = data['QuestionOCR']\n", | ||
" data['Answer'] = data['answer']\n", | ||
" xls = data[['Question', 'Answer', 'Metadata']]\n", | ||
" xls.to_excel(f\"drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data/data_(phase_4)/{class_name}/excel_simple_ocr.xlsx\", index=False)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
Oops, something went wrong.