diff --git a/.env.example b/.env.example
index cad53fc..d2feb09 100644
--- a/.env.example
+++ b/.env.example
@@ -8,6 +8,9 @@ ALIAS = "LLMail"
# This message will be prepended to the message history sent to the LLM as a message from the system role
# Use this for customizing the behavior of the LLM and hence the nature of the responses
SYSTEM_PROMPT=
+# Optionally redact email addresses in logs
+# Set to "true" to redact email addresses in logs; remove the variable or set it to "false" to log them unredacted
+REDACT_EMAIL_ADDRESSES=true
OPENAI_API_KEY=""
# At least when writing this, openrouter.ai has a free tier
diff --git a/llmail/__main__.py b/llmail/__main__.py
index 84c191c..113a939 100644
--- a/llmail/__main__.py
+++ b/llmail/__main__.py
@@ -15,6 +15,7 @@
from llmail.utils.cli_args import argparser
+from llmail.utils.utils import set_primary_logger
class EmailThread:
@@ -48,11 +49,15 @@ def sort_replies(self):
# However, this **significantly** reduces complexity so for now, it's fine
def __repr__(self):
- return f"EmailThread(initial_email={self.initial_email}, replies={self.replies})"
+ return (
+ f"EmailThread(initial_email={self.initial_email}, replies={self.replies})"
+ )
class Email:
- def __init__(self, imap_id, message_id, subject, sender, timestamp, body, references):
+ def __init__(
+ self, imap_id, message_id, subject, sender, timestamp, body, references
+ ):
self.imap_id = imap_id
self.message_id = message_id
self.subject = subject
@@ -90,10 +95,12 @@ def main():
bot_email = args.imap_username
# Set up logging
- set_primary_logger(args.log_level)
- ic(args)
+ set_primary_logger(args.log_level, args.redact_email_addresses)
+ logger.debug(args)
if args.watch_interval:
- logger.info(f"Watching for new emails every {args.watch_interval} seconds")
+ logger.info(
+ f"Watching for new emails every {args.watch_interval} seconds"
+ )
while True:
fetch_and_process_emails(
look_for_subject=args.subject_key,
@@ -123,7 +130,11 @@ def fetch_and_process_emails(
client.login(args.imap_username, args.imap_password)
email_threads = {}
- folders = args.folder if args.folder else [folder[2] for folder in client.list_folders()]
+ folders = (
+ args.folder
+ if args.folder
+ else [folder[2] for folder in client.list_folders()]
+ )
# for folder in client.list_folders():
# Disabling fetching from all folders due it not being inefficient
# Instead, just fetch from INBOX and get the threads later
@@ -136,19 +147,29 @@ def fetch_and_process_emails(
continue
# Might be smart to also search for forwarded emails
messages = client.search(
- ["OR", "SUBJECT", look_for_subject, "SUBJECT", f"Re: {look_for_subject}"]
+ [
+ "OR",
+ "SUBJECT",
+ look_for_subject,
+ "SUBJECT",
+ f"Re: {look_for_subject}",
+ ]
)
for msg_id in messages:
# TODO: It seems this will throw a KeyError if an email is sent while this for loop is running. May have been fixed by emptying email_threads at the end of the while loop? This should be tested again to confirm
- msg_data = client.fetch([msg_id], ["ENVELOPE", "BODY[]", "RFC822.HEADER"])
+ msg_data = client.fetch(
+ [msg_id], ["ENVELOPE", "BODY[]", "RFC822.HEADER"]
+ )
envelope = msg_data[msg_id][b"ENVELOPE"]
subject = envelope.subject.decode()
# Use regex to verify that the subject optionally starts with "Fwd: " or "Re: " and then the intended subject (nothing case-sensitive)
# re.escape is used to escape any special characters in the subject
if not re.match(
- r"^(Fwd: ?|Re: ?)?" + re.escape(look_for_subject) + r"$", subject, re.IGNORECASE
+ r"^(Fwd: ?|Re: ?)*" + re.escape(look_for_subject) + r"$",
+ subject,
+ re.IGNORECASE,
):
- logger.info(
+ logger.warning(
f"Skipping email with subject '{subject}' as it does not match the intended subject"
)
continue
@@ -216,8 +237,7 @@ def fetch_and_process_emails(
f"Created new thread for email {message_id} sent at {timestamp}"
)
- # ic([thread for thread in email_threads.values()])
- ic(email_threads)
+ logger.debug(email_threads)
# Check if there are any emails wherein the last email in the thread is a user email
# If so, send a reply
for message_id, email_thread in email_threads.items():
@@ -228,15 +248,23 @@ def fetch_and_process_emails(
message_id = email_thread.initial_email.message_id
msg_id = email_thread.initial_email.imap_id
references_ids = email_thread.initial_email.references
- elif len(email_thread.replies) > 0 and email_thread.replies[-1].sender != bot_email:
+ elif (
+ len(email_thread.replies) > 0
+ and email_thread.replies[-1].sender != bot_email
+ ):
logger.debug(
f"Last email in thread for email {message_id} is from {email_thread.replies[-1].sender}"
)
message_id = email_thread.replies[-1].message_id
msg_id = email_thread.replies[-1].imap_id
references_ids = email_thread.replies[-1].references
- elif len(email_thread.replies) > 0 and email_thread.replies[-1].sender == bot_email:
- logger.debug(f"Last email in thread for email {message_id} is from the bot")
+ elif (
+ len(email_thread.replies) > 0
+ and email_thread.replies[-1].sender == bot_email
+ ):
+ logger.debug(
+ f"Last email in thread for email {message_id} is from the bot"
+ )
continue
else:
ValueError("Invalid email thread")
@@ -289,7 +317,11 @@ def get_thread_history(
)
for email in message_identifier.replies:
thread_history.append(
- {"sender": email.sender, "content": email.body, "timestamp": email.timestamp}
+ {
+ "sender": email.sender,
+ "content": email.body,
+ "timestamp": email.timestamp,
+ }
)
return thread_history
elif isinstance(message_identifier, int) or isinstance(message_identifier, str):
@@ -335,7 +367,9 @@ def get_thread_history(
{
"sender": get_sender(message)["email"],
"content": get_plain_email_content(message),
- "timestamp": make_tz_aware(parsedate_to_datetime(message.get("Date"))),
+ "timestamp": make_tz_aware(
+ parsedate_to_datetime(message.get("Date"))
+ ),
}
)
message = prev_message
@@ -343,7 +377,9 @@ def get_thread_history(
thread_history = sorted(thread_history, key=lambda x: x["timestamp"])
return thread_history
else:
- raise TypeError("Invalid type for message. Must be an int, str, or EmailThread object.")
+ raise TypeError(
+ "Invalid type for message. Must be an int, str, or EmailThread object."
+ )
def get_sender(message: Message) -> dict:
@@ -365,7 +401,9 @@ def get_top_level_email(client, msg_id, message_id=None):
# Extract the References header and split it into individual message IDs
references_header = headers.get("References", "")
- references_ids = [m_id.strip() for m_id in references_header.split() if m_id.strip()]
+ references_ids = [
+ m_id.strip() for m_id in references_header.split() if m_id.strip()
+ ]
# Extract the first message ID, which represents the top-level email in the thread
# If it doesn't exist, use the current message ID. Not msg_id since msg_id is only for IMAP
@@ -413,14 +451,6 @@ def get_uid_from_message_id(imap_client, message_id):
return None
-def set_primary_logger(log_level):
- """Set up the primary logger with the specified log level. Output to stderr and use the format specified."""
- logger.remove()
- # ^10 is a formatting directive to center with a padding of 10
- logger_format = "{time:YYYY-MM-DD HH:mm:ss} |{level: ^10}| {message}"
- logger.add(stderr, format=logger_format, colorize=True, level=log_level)
-
-
def send_reply(
thread: list[dict],
subject: str,
@@ -440,15 +470,6 @@ def send_reply(
if system_prompt:
thread.insert(0, {"role": "system", "content": system_prompt})
references_ids.append(message_id)
- # thread_from_msg_id = get_thread_history(client, msg_id)
- # logger.debug(f"Thread history (message_identifier): {thread_from_msg_id}")
- # logger.debug(f"Thread history length (message_identifier): {len(thread_from_msg_id)}")
- # thread_from_object = get_thread_history(client, email_threads[list(email_threads.keys())[-1]])
- # logger.debug(f"Thread history (EmailThread object): {thread_from_object}")
- # logger.debug(f"Thread history length (EmailThread object): {len(thread_from_object)}")
- logger.info(f"Sending reply to email {message_id} to {sender}")
- logger.debug(f"Thread history: {thread}")
- logger.debug(f"Thread history length: {len(thread)}")
generated_response = openai.chat.completions.create(
model=model,
messages=thread,
@@ -463,6 +484,7 @@ def send_reply(
)
yag.send(
to=sender,
+ # subject=f"Re: {subject}" if not subject.startswith("Re: ") else subject,
subject=f"Re: {subject}",
headers={"In-Reply-To": message_id, "References": " ".join(references_ids)},
contents=generated_response,
@@ -470,6 +492,15 @@ def send_reply(
domain=args.message_id_domain if args.message_id_domain else "llmail"
),
)
+ # thread_from_msg_id = get_thread_history(client, msg_id)
+ # logger.debug(f"Thread history (message_identifier): {thread_from_msg_id}")
+ # logger.debug(f"Thread history length (message_identifier): {len(thread_from_msg_id)}")
+ # thread_from_object = get_thread_history(client, email_threads[list(email_threads.keys())[-1]])
+ # logger.debug(f"Thread history (EmailThread object): {thread_from_object}")
+ # logger.debug(f"Thread history length (EmailThread object): {len(thread_from_object)}")
+ logger.info(f"Sending reply to email {message_id} to {sender}")
+ logger.debug(f"Thread history: {thread}")
+ logger.debug(f"Thread history length: {len(thread)}")
def get_plain_email_content(message: Message | str) -> str:
@@ -485,15 +516,25 @@ def get_plain_email_content(message: Message | str) -> str:
try:
body = part.get_payload(decode=True)
except UnicodeDecodeError:
- logger.debug("UnicodeDecodeError occurred. Trying to get payload as string.")
+ logger.debug(
+ "UnicodeDecodeError occurred. Trying to get payload as string."
+ )
body = str(part.get_payload())
if content_type == "text/plain":
- markdown = html2text.html2text(str(body.decode("unicode_escape"))).strip()
+ markdown = html2text.html2text(
+ str(body.decode("unicode_escape"))
+ ).strip()
# logger.debug(f"Converted to markdown: {markdown}")
+ # if len(markdown) < 5:
+ # logger.warning(
+ # f"Content is less than 5 characters | Content: {markdown}"
+ # )
return markdown
else:
logger.debug("Message is not multipart. Getting payload as string.")
body = message.get_payload(decode=True).decode()
+ # if len(body) < 5:
+ # logger.warning(f"Content is less than 5 characters | Content: {body}")
return html2text.html2text(str(body.decode("unicode_escape")))
diff --git a/llmail/utils/cli_args.py b/llmail/utils/cli_args.py
index 4cf9644..b5ec7a1 100644
--- a/llmail/utils/cli_args.py
+++ b/llmail/utils/cli_args.py
@@ -37,7 +37,9 @@ def set_argparse():
title="Subcommands",
)
# Subcommand: list-folders
- _ = subparsers.add_parser("list-folders", help="List all folders in the IMAP account and exit")
+ _ = subparsers.add_parser(
+ "list-folders", help="List all folders in the IMAP account and exit"
+ )
# General arguments
argparser.add_argument(
"--log-level",
@@ -45,12 +47,28 @@ def set_argparse():
help="Log level",
default=os.getenv("LOG_LEVEL") if os.getenv("LOG_LEVEL") else "INFO",
)
+ argparser.add_argument(
+ "--redact-email-addresses",
+ help="Replace email addresses with '[redacted]' in logs",
+ action="store_true",
+ default=(
+ True
+ if (
+ os.getenv("REDACT_EMAIL_ADDRESSES")
+ and os.getenv("REDACT_EMAIL_ADDRESSES").lower() == "true"
+ and os.getenv("REDACT_EMAIL_ADDRESSES").lower() != "false"
+ )
+ else False
+ ),
+ )
argparser.add_argument(
"--watch-interval",
"-w",
help="Interval in seconds to check for new emails. If not set, will only check once.",
type=int,
- default=int(os.getenv("WATCH_INTERVAL")) if os.getenv("WATCH_INTERVAL") else None,
+ default=(
+ int(os.getenv("WATCH_INTERVAL")) if os.getenv("WATCH_INTERVAL") else None
+ ),
)
# OpenAI-compatible API arguments
ai_api = argparser.add_argument_group("OpenAI-compatible API")
@@ -65,9 +83,11 @@ def set_argparse():
ai_api.add_argument(
"--openai-model",
help="Model to use for the LLM",
- default=os.getenv("OPENAI_MODEL")
- if os.getenv("OPENAI_MODEL")
- else "mistralai/mistral-7b-instruct:free",
+ default=(
+ os.getenv("OPENAI_MODEL")
+ if os.getenv("OPENAI_MODEL")
+ else "mistralai/mistral-7b-instruct:free"
+ ),
)
ai_api.add_argument(
"--system-prompt",
@@ -88,7 +108,9 @@ def set_argparse():
"--subject-key",
"-s",
help="Emails with this subject will be replied to",
- default=os.getenv("SUBJECT_KEY") if os.getenv("SUBJECT_KEY") else "llmail autoreply",
+ default=(
+ os.getenv("SUBJECT_KEY") if os.getenv("SUBJECT_KEY") else "llmail autoreply"
+ ),
)
email.add_argument(
"--alias",
@@ -96,8 +118,12 @@ def set_argparse():
default=os.getenv("ALIAS") if os.getenv("ALIAS") else "LLMail",
)
imap = email.add_argument_group("IMAP")
- imap.add_argument("--imap-host", help="IMAP server hostname", default=os.getenv("IMAP_HOST"))
- imap.add_argument("--imap-port", help="IMAP server port", default=os.getenv("IMAP_PORT"))
+ imap.add_argument(
+ "--imap-host", help="IMAP server hostname", default=os.getenv("IMAP_HOST")
+ )
+ imap.add_argument(
+ "--imap-port", help="IMAP server port", default=os.getenv("IMAP_PORT")
+ )
imap.add_argument(
"--imap-username",
help="IMAP server username",
@@ -109,8 +135,12 @@ def set_argparse():
default=os.getenv("IMAP_PASSWORD"),
)
smtp = email.add_argument_group("SMTP")
- smtp.add_argument("--smtp-host", help="SMTP server hostname", default=os.getenv("SMTP_HOST"))
- smtp.add_argument("--smtp-port", help="SMTP server port", default=os.getenv("SMTP_PORT"))
+ smtp.add_argument(
+ "--smtp-host", help="SMTP server hostname", default=os.getenv("SMTP_HOST")
+ )
+ smtp.add_argument(
+ "--smtp-port", help="SMTP server port", default=os.getenv("SMTP_PORT")
+ )
smtp.add_argument(
"--smtp-username",
help="SMTP server username",
@@ -124,7 +154,9 @@ def set_argparse():
smtp.add_argument(
"--message-id-domain",
help="Domain to use for Message-ID header",
- default=os.getenv("MESSAGE_ID_DOMAIN") if os.getenv("MESSAGE_ID_DOMAIN") else None,
+ default=(
+ os.getenv("MESSAGE_ID_DOMAIN") if os.getenv("MESSAGE_ID_DOMAIN") else None
+ ),
)
check_required_args(
diff --git a/llmail/utils/utils.py b/llmail/utils/utils.py
new file mode 100644
index 0000000..498f6d5
--- /dev/null
+++ b/llmail/utils/utils.py
@@ -0,0 +1,21 @@
+import re
+from loguru import logger
+from sys import stderr
+
+logging_file = stderr
+
+
+def redact_email_sink(message: str):
+    """Loguru sink that writes ``message`` to ``logging_file`` with every email address replaced by "[redacted]"."""
+    # "|" inside a character class is a literal pipe, not alternation, so the TLD class is just [A-Za-z]
+    redacted_message = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[redacted]", message)
+    print(redacted_message, file=logging_file, end="")  # loguru's formatted record already ends with "\n"
+
+
+def set_primary_logger(log_level, redact_email_addresses):
+    """Replace the existing loguru handlers with one at ``log_level``, routed through ``redact_email_sink`` when ``redact_email_addresses`` is truthy and straight to stderr otherwise."""
+    # "{level: ^10}" centers the level name in a 10-character-wide column
+    record_layout = "{time:YYYY-MM-DD HH:mm:ss} |{level: ^10}| {message}"
+    destination = stderr if not redact_email_addresses else redact_email_sink
+    logger.remove()
+    logger.add(sink=destination, format=record_layout, colorize=True, level=log_level)