diff --git a/getmail.py b/getmail.py index 8ad220d..5653fc3 100755 --- a/getmail.py +++ b/getmail.py @@ -8,12 +8,11 @@ It will find any emails with attachments, marking them as read, and saving the attachment to the 'attachments' directory for later processing. It will respect the attachment filetype and extension. Attachments are output named with the current date-time, and a semi-random -uid designed to crudely prevent namespace collisions. In a VERY high traffic -environment where many files may be created per second, this should be -re-implemented to be more robust. +uid based on the file hash in order to prevent namespace collisions. + No file locking is used, however files are written to a temporary directory -first and renamed upon completion, as renaming is an atomic operation at an -OS level. +first, flushed, and renamed upon completion, as renaming is an atomic operation +at an OS level. Attachments will not be created if they have an identical hash to a previously downloaded attachment. This is designed to prevent scenarios where @@ -21,6 +20,15 @@ the same file has been accidentally sent multiple times. Note that this identification is done based on file content, and the name of the file is irrelevant. +Once an attachment is saved, we look up the mapping of fields in the attachment +for input, using a custom mapping based on the domain of the email address of +the sender. Different companies may use differing CSV formats, but we work on +the assumption that the XML for input into Logistic will need to be the same and +based on a well defined DTD. + +After mapping and conversion to XML, the file is validated against the DTD and +written to the xml directory for injection into Logistic. + Logging is fairly primitive and done to a log file in the same directory as the script. This could be upgraded to syslog-style logging if required. @@ -29,7 +37,8 @@ STARTTLS. If manual STARTTLS is required, the MailBox method will need to be altered to MailBoxTls. 
If Outlook or Gmail or similar are used, it will be necessary to implement OAUTH2. -Authentication is configured in a .env file as described below in the code. +Authentication and other user configuration is configured in a .env file as +described below in the code. """ # Standard libraries @@ -38,7 +47,6 @@ import sys import csv import ssl import json -import time import hashlib import logging import tempfile @@ -91,7 +99,7 @@ def save_hashes(hashes_file, hashes): json.dump(list(hashes), f) def setup_ssl_context(): - """Set up the SSL context.""" + """Set up the SSL context for mail connection.""" ssl_context = ssl.create_default_context() ssl_context.minimum_version = ssl.TLSVersion.TLSv1_3 ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3 @@ -140,6 +148,8 @@ def csv_to_xml(csv_file, xml_file, dtd_file, csv_mapping): logging.info("XML file saved to %s", xml_file) def get_client_mapping(client_domain): + """Check custom mapping for the client CSV files, returning the custom + mapping if one is found.""" with open("column_mapping.json", "r", encoding="utf-8") as f: column_mapping = json.load(f) mapping = column_mapping.get(client_domain, column_mapping["default"]) @@ -147,7 +157,9 @@ def get_client_mapping(client_domain): def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs, dtd_name): - """Process and save email attachments.""" + """Process and save email attachments. 
+ For each attachment found, save it, marking the email as read, and saving + a unique hash so we never process the same attachment again.""" for msg in mailbox.fetch(AND(seen=False), mark_seen=False): for att in msg.attachments: attachment_hash = compute_hash(att.payload) @@ -156,7 +168,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs, filename, file_ext = os.path.splitext(att.filename) logging.info('new attachment named %s from %s', filename, msg.from_) - client_domain = msg.from_.split('@')[1] + client_domain = msg.from_.split('@', maxsplit=1)[1] csv_mapping=get_client_mapping(client_domain) logging.debug('loaded client %s csv mapping %s', client_domain, csv_mapping) @@ -166,7 +178,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs, f"{attachment_hash[:8]}" f"{file_ext}" ) - xml_name = final_name.split('.')[0] + '.xml' + xml_name = final_name.split('.', maxsplit=1)[0] + '.xml' with tempfile.NamedTemporaryFile( delete=False, dir="temp") as temp_file: @@ -186,7 +198,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs, mailbox.flag(msg.uid, '\\Seen', True) def main(): - """Code entrypoint.""" + """Initialise logging, load environment, and get to work!""" setup_logging() load_environment_variables()