Added some extra documentation

This commit is contained in:
2025-01-02 22:45:31 +13:00
parent 06eadf3ffd
commit 2441ca3c26

View File

@@ -8,12 +8,11 @@ It will find any emails with attachments, marking them as read, and saving
the attachment to the 'attachments' directory for later processing.
It will respect the attachment filetype and extension.
Attachments are output named with the current date-time, and a semi-random
uid designed to crudely prevent namespace collisions. In a VERY high traffic
environment where many files may be created per second, this should be
re-implemented to be more robust.
uid based on the file hash in order to prevent namespace collisions.
No file locking is used, however files are written to a temporary directory
first and renamed upon completion, as renaming is an atomic operation at an
OS level.
first, flushed, and renamed upon completion, as renaming is an atomic operation
at an OS level.
Attachments will not be created if they have an identical hash to a
previously downloaded attachment. This is designed to prevent scenarios where
@@ -21,6 +20,15 @@ the same file has been accidentally sent multiple times. Note that this
identification is done based on file content, and the name of the file is
irrelevant.
Once an attachment is saved, we look up the mapping of fields in the attachment
for input, using a custom mapping based on the domain of the email address of
the sender. Different companies may use differing CSV formats, but we work on
the assumption that the XML for input into Logistic will need to be the same and
based on a well defined DTD.
After mapping and conversion to XML, the file is validated against the DTD and
written to the xml directory for injection into Logistic.
Logging is fairly primitive and done to a log file in the same directory as
the script. This could be upgraded to syslog-style logging if required.
@@ -29,7 +37,8 @@ STARTTLS. If manual STARTTLS is required, the MailBox method will need to be
altered to MailBoxTls. If Outlook or Gmail or similar are used, it will be
necessary to implement OAUTH2.
Authentication is configured in a .env file as described below in the code.
Authentication and other user settings are configured in a .env file as
described below in the code.
"""
# Standard libraries
@@ -38,7 +47,6 @@ import sys
import csv
import ssl
import json
import time
import hashlib
import logging
import tempfile
@@ -91,7 +99,7 @@ def save_hashes(hashes_file, hashes):
json.dump(list(hashes), f)
def setup_ssl_context():
"""Set up the SSL context."""
"""Set up the SSL context for mail connection."""
ssl_context = ssl.create_default_context()
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_3
ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3
@@ -140,6 +148,8 @@ def csv_to_xml(csv_file, xml_file, dtd_file, csv_mapping):
logging.info("XML file saved to %s", xml_file)
def get_client_mapping(client_domain):
"""Check custom mapping for the client CSV files, returning the custom
mapping if one is found."""
with open("column_mapping.json", "r", encoding="utf-8") as f:
column_mapping = json.load(f)
mapping = column_mapping.get(client_domain, column_mapping["default"])
@@ -147,7 +157,9 @@ def get_client_mapping(client_domain):
def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
dtd_name):
"""Process and save email attachments."""
"""Process and save email attachments.
For each attachment found, save it, mark the email as read, and save
a unique hash so we never process the same attachment again."""
for msg in mailbox.fetch(AND(seen=False), mark_seen=False):
for att in msg.attachments:
attachment_hash = compute_hash(att.payload)
@@ -156,7 +168,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
filename, file_ext = os.path.splitext(att.filename)
logging.info('new attachment named %s from %s',
filename, msg.from_)
client_domain = msg.from_.split('@')[1]
client_domain = msg.from_.split('@', maxsplit=1)[1]
csv_mapping=get_client_mapping(client_domain)
logging.debug('loaded client %s csv mapping %s',
client_domain, csv_mapping)
@@ -166,7 +178,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
f"{attachment_hash[:8]}"
f"{file_ext}"
)
xml_name = final_name.split('.')[0] + '.xml'
xml_name = final_name.split('.', maxsplit=1)[0] + '.xml'
with tempfile.NamedTemporaryFile(
delete=False, dir="temp") as temp_file:
@@ -186,7 +198,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
mailbox.flag(msg.uid, '\\Seen', True)
def main():
"""Code entrypoint."""
"""Initialise logging, load environment, and get to work!"""
setup_logging()
load_environment_variables()