Added some extra documentation

This commit is contained in:
2025-01-02 22:45:31 +13:00
parent 06eadf3ffd
commit 2441ca3c26

View File

@@ -8,12 +8,11 @@ It will find any emails with attachments, marking them as read, and saving
the attachment to the 'attachments' directory for later processing. the attachment to the 'attachments' directory for later processing.
It will respect the attachment filetype and extension. It will respect the attachment filetype and extension.
Attachments are output named with the current date-time, and a semi-random Attachments are output named with the current date-time, and a semi-random
uid designed to crudely prevent namespace collisions. In a VERY high traffic uid based on the file hash in order to prevent namespace collisions.
environment where many files may be created per second, this should be
re-implemented to be more robust.
No file locking is used, however files are written to a temporary directory No file locking is used, however files are written to a temporary directory
first and renamed upon completion, as renaming is an atomic operation at an first, flushed, and renamed upon completion, as renaming is an atomic operation
OS level. at an OS level.
Attachments will not be created if they have an identical hash to a Attachments will not be created if they have an identical hash to a
previously downloaded attachment. This is designed to prevent scenarios where previously downloaded attachment. This is designed to prevent scenarios where
@@ -21,6 +20,15 @@ the same file has been accidentally sent multiple times. Note that this
identification is done based on file content, and the name of the file is identification is done based on file content, and the name of the file is
irrelevant. irrelevant.
Once an attachment is saved, we look up the mapping of fields in the attachment
for input, using a custom mapping based on the domain of the email address of
the sender. Different companies may use differing CSV formats, but we work on
the assumption that the XML for input into Logistic will need to be the same and
based on a well defined DTD.
After mapping and conversion to XML, the file is validated against the DTD and
written to the xml directory for injection into Logistic.
Logging is fairly primitive and done to a log file in the same directory as Logging is fairly primitive and done to a log file in the same directory as
the script. This could be upgraded to syslog-style logging if required. the script. This could be upgraded to syslog-style logging if required.
@@ -29,7 +37,8 @@ STARTTLS. If manual STARTTLS is required, the MailBox method will need to be
altered to MailBoxTls. If Outlook or Gmail or similar are used, it will be altered to MailBoxTls. If Outlook or Gmail or similar are used, it will be
necessary to implement OAUTH2. necessary to implement OAUTH2.
Authentication is configured in a .env file as described below in the code. Authentication and other user configuration are set in a .env file as
described below in the code.
""" """
# Standard libraries # Standard libraries
@@ -38,7 +47,6 @@ import sys
import csv import csv
import ssl import ssl
import json import json
import time
import hashlib import hashlib
import logging import logging
import tempfile import tempfile
@@ -91,7 +99,7 @@ def save_hashes(hashes_file, hashes):
json.dump(list(hashes), f) json.dump(list(hashes), f)
def setup_ssl_context(): def setup_ssl_context():
"""Set up the SSL context.""" """Set up the SSL context for mail connection."""
ssl_context = ssl.create_default_context() ssl_context = ssl.create_default_context()
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_3 ssl_context.minimum_version = ssl.TLSVersion.TLSv1_3
ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3 ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3
@@ -140,6 +148,8 @@ def csv_to_xml(csv_file, xml_file, dtd_file, csv_mapping):
logging.info("XML file saved to %s", xml_file) logging.info("XML file saved to %s", xml_file)
def get_client_mapping(client_domain): def get_client_mapping(client_domain):
"""Check custom mapping for the client CSV files, returning the custom
mapping if one is found."""
with open("column_mapping.json", "r", encoding="utf-8") as f: with open("column_mapping.json", "r", encoding="utf-8") as f:
column_mapping = json.load(f) column_mapping = json.load(f)
mapping = column_mapping.get(client_domain, column_mapping["default"]) mapping = column_mapping.get(client_domain, column_mapping["default"])
@@ -147,7 +157,9 @@ def get_client_mapping(client_domain):
def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs, def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
dtd_name): dtd_name):
"""Process and save email attachments.""" """Process and save email attachments.
For each attachment found, save it, marking the email as read, and saving
a unique hash so we never process the same attachment again."""
for msg in mailbox.fetch(AND(seen=False), mark_seen=False): for msg in mailbox.fetch(AND(seen=False), mark_seen=False):
for att in msg.attachments: for att in msg.attachments:
attachment_hash = compute_hash(att.payload) attachment_hash = compute_hash(att.payload)
@@ -156,7 +168,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
filename, file_ext = os.path.splitext(att.filename) filename, file_ext = os.path.splitext(att.filename)
logging.info('new attachment named %s from %s', logging.info('new attachment named %s from %s',
filename, msg.from_) filename, msg.from_)
client_domain = msg.from_.split('@')[1] client_domain = msg.from_.split('@', maxsplit=1)[1]
csv_mapping=get_client_mapping(client_domain) csv_mapping=get_client_mapping(client_domain)
logging.debug('loaded client %s csv mapping %s', logging.debug('loaded client %s csv mapping %s',
client_domain, csv_mapping) client_domain, csv_mapping)
@@ -166,7 +178,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
f"{attachment_hash[:8]}" f"{attachment_hash[:8]}"
f"{file_ext}" f"{file_ext}"
) )
xml_name = final_name.split('.')[0] + '.xml' xml_name = final_name.split('.', maxsplit=1)[0] + '.xml'
with tempfile.NamedTemporaryFile( with tempfile.NamedTemporaryFile(
delete=False, dir="temp") as temp_file: delete=False, dir="temp") as temp_file:
@@ -186,7 +198,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
mailbox.flag(msg.uid, '\\Seen', True) mailbox.flag(msg.uid, '\\Seen', True)
def main(): def main():
"""Code entrypoint.""" """Initialise logging, load environment, and get to work!"""
setup_logging() setup_logging()
load_environment_variables() load_environment_variables()