Skip to content

Commit

Permalink
Added notebook for timestamps
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Nov 12, 2024
1 parent 4779ec7 commit a6c4f24
Showing 1 changed file with 192 additions and 0 deletions.
192 changes: 192 additions & 0 deletions notebook/performance_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import mailcom.inout\n",
"import mailcom.parse\n",
"import pandas as pd\n",
"import time\n",
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# reference timestamp (seconds since the epoch), taken before files and models are loaded\n",
"t0 = time.time()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
"- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
}
],
"source": [
"# directory holding the sample emails, relative to this notebook\n",
"INPUT_DIR = \"../mailcom/test/data\"\n",
"# language code handed to init_spacy()\n",
"SPACY_LANGUAGE = \"fr\"\n",
"\n",
"# set up the input handler for INPUT_DIR; list_of_files() presumably fills\n",
"# io.email_list, which the processing loop iterates (confirm in mailcom.inout)\n",
"io = mailcom.inout.InoutHandler(INPUT_DIR)\n",
"io.list_of_files()\n",
"\n",
"# create the pseudonymization object and initialise its spaCy and transformers backends\n",
"ps = mailcom.parse.Pseudonymize()\n",
"ps.init_spacy(SPACY_LANGUAGE)\n",
"ps.init_transformers()\n",
"\n",
"# timestamp taken once file listing and model initialisation have finished\n",
"t_model_loaded = time.time()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n",
"Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n",
"Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n",
"Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n"
]
}
],
"source": [
"# Pseudonymize every email and record a completion timestamp for each one.\n",
"# out_list and ts_list are appended together, so they stay index-aligned;\n",
"# emails that yield no text are skipped and appear in neither list.\n",
"out_list = []\n",
"ts_list = []\n",
"for file in io.email_list:\n",
"    print(f\"Parsing input file {file}\")\n",
"    raw_text = io.get_text(file)\n",
"    # after get_text() the email metadata is available via io.email_content;\n",
"    # it is copied here, presumably so later calls cannot alter this record\n",
"    email_dict = io.email_content.copy()\n",
"    text = io.get_html_text(raw_text)\n",
"    if not text:\n",
"        continue\n",
"    # pseudonymize the extracted text and store it next to the metadata\n",
"    output_text = ps.pseudonymize(text)\n",
"    email_dict[\"pseudo_content\"] = output_text\n",
"    out_list.append(email_dict)\n",
"\n",
"    # timestamp after this email\n",
"    ts_list.append(time.time())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" content \\\n",
"0 J'espère que tu vas bien! Je voulais partager ... \n",
"1 <html><head></head><body><div class=\"ydp9a5bdb... \n",
"2 <div style=\"font-size: 10pt; font-family: Verd... \n",
"3 <div style=\"font-size: 10pt; font-family: Verd... \n",
"\n",
" date attachment attachement type \\\n",
"0 2024-04-17 15:13:56+00:00 2 [jpg, jpg] \n",
"1 2024-04-17 15:39:49+00:00 1 [png] \n",
"2 1970-01-01 00:00:00+00:00 0 [] \n",
"3 1970-01-01 00:00:00+00:00 0 [] \n",
"\n",
" pseudo_content \n",
"0 J'espère que tu vas bien! Je voulais partager ... \n",
"1 \\n Olá Claude,Espero que este e-mail te encont... \n",
"2 From : [email] : \"Claude\"< [email] : mié. , [n... \n",
"3 From : [email] : \" Claude\"< [email] Dominique\"... \n"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# collect the per-email records in a DataFrame; as the last expression of the\n",
"# cell it is rendered with Jupyter's rich table display instead of plain text\n",
"df = pd.DataFrame(out_list)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"T0: 13:51:17\n",
"T_model_loaded: 13:51:26\n",
"Email 0 finished: 13:51:31\n",
"Email 1 finished: 13:51:44\n",
"Email 2 finished: 13:51:47\n",
"Email 3 finished: 13:51:53\n"
]
}
],
"source": [
"# report every recorded timestamp as local wall-clock time\n",
"def _clock(epoch_seconds):\n",
"    # epoch seconds -> 'HH:MM:SS' string in the local timezone\n",
"    return datetime.datetime.fromtimestamp(epoch_seconds).strftime('%H:%M:%S')\n",
"\n",
"\n",
"print(\"T0:\", _clock(t0))\n",
"print(\"T_model_loaded:\", _clock(t_model_loaded))\n",
"for index, finished_at in enumerate(ts_list):\n",
"    print(\"Email\", index, \"finished:\", _clock(finished_at))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mailcom",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit a6c4f24

Please sign in to comment.