Added notebook for timestamps

ssciwr · Nov 12, 2024 · a6c4f24 · a6c4f24
1 parent 4779ec7
commit a6c4f24
Showing 1 changed file with 192 additions and 0 deletions.
diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mailcom.inout\n",
+    "import mailcom.parse\n",
+    "import pandas as pd\n",
+    "import time\n",
+    "import datetime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create t0 timestamp (start of the measurement, taken before the models are loaded)\n",
+    "t0 = time.time()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+      "- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "# set up the input handler for the test data in ../mailcom/test/data\n",
+    "\n",
+    "io = mailcom.inout.InoutHandler(\"../mailcom/test/data\")\n",
+    "io.list_of_files()\n",
+    "\n",
+    "# create pseudonymization object\n",
+    "ps = mailcom.parse.Pseudonymize()\n",
+    "ps.init_spacy(\"fr\")\n",
+    "ps.init_transformers()\n",
+    "# time stamp after model loading\n",
+    "t_model_loaded = time.time()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n",
+      "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n",
+      "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n",
+      "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n"
+     ]
+    }
+   ],
+   "source": [
+    "# pseudonymize every email and take a timestamp after each processed one\n",
+    "out_list = []\n",
+    "ts_list = []\n",
+    "for email_path in io.email_list:\n",
+    "    print(\"Parsing input file {}\".format(email_path))\n",
+    "    raw_text = io.get_text(email_path)\n",
+    "    # once get_text() has run, io.email_content holds the metadata of this email\n",
+    "    # (entries: content, date, attachment, attachement type [sic]); keep a copy of it\n",
+    "    record = io.email_content.copy()\n",
+    "    plain_text = io.get_html_text(raw_text)\n",
+    "    if plain_text:\n",
+    "        # exercise the Pseudonymize class and store its result with the metadata\n",
+    "        record[\"pseudo_content\"] = ps.pseudonymize(plain_text)\n",
+    "        out_list.append(record)\n",
+    "\n",
+    "        # timestamp after this email; emails without text are skipped entirely\n",
+    "        ts_list.append(time.time())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                             content  \\\n",
+      "0  J'espère que tu vas bien! Je voulais partager ...   \n",
+      "1  <html><head></head><body><div class=\"ydp9a5bdb...   \n",
+      "2  <div style=\"font-size: 10pt; font-family: Verd...   \n",
+      "3  <div style=\"font-size: 10pt; font-family: Verd...   \n",
+      "\n",
+      "                       date  attachment attachement type  \\\n",
+      "0 2024-04-17 15:13:56+00:00           2       [jpg, jpg]   \n",
+      "1 2024-04-17 15:39:49+00:00           1            [png]   \n",
+      "2 1970-01-01 00:00:00+00:00           0               []   \n",
+      "3 1970-01-01 00:00:00+00:00           0               []   \n",
+      "\n",
+      "                                      pseudo_content  \n",
+      "0  J'espère que tu vas bien! Je voulais partager ...  \n",
+      "1  \\n Olá Claude,Espero que este e-mail te encont...  \n",
+      "2  From : [email] : \"Claude\"< [email] : mié. , [n...  \n",
+      "3  From : [email] : \" Claude\"< [email] Dominique\"...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "# collect the pseudonymized emails in a pandas DataFrame and print it\n",
+    "df = pd.DataFrame(out_list)\n",
+    "print(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "T0: 13:51:17\n",
+      "T_model_loaded: 13:51:26\n",
+      "Email 0 finished: 13:51:31\n",
+      "Email 1 finished: 13:51:44\n",
+      "Email 2 finished: 13:51:47\n",
+      "Email 3 finished: 13:51:53\n"
+     ]
+    }
+   ],
+   "source": [
+    "# print all recorded timestamps as local wall-clock times (HH:MM:SS)\n",
+    "def as_clock_time(timestamp):\n",
+    "    \"\"\"Format a time.time() value as a local-time HH:MM:SS string.\"\"\"\n",
+    "    return datetime.datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')\n",
+    "\n",
+    "print(\"T0:\", as_clock_time(t0))\n",
+    "print(\"T_model_loaded:\", as_clock_time(t_model_loaded))\n",
+    "# one line per processed email, numbered in processing order\n",
+    "for index, finished_at in enumerate(ts_list):\n",
+    "    print(\"Email\", index, \"finished:\", as_clock_time(finished_at))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mailcom",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}