Added some extra documentation
This commit is contained in:
36
getmail.py
36
getmail.py
@@ -8,12 +8,11 @@ It will find any emails with attachments, marking them as read, and saving
|
|||||||
the attachment to the 'attachments' directory for later processing.
|
the attachment to the 'attachments' directory for later processing.
|
||||||
It will respect the attachment filetype and extension.
|
It will respect the attachment filetype and extension.
|
||||||
Attachments are output named with the current date-time, and a semi-random
|
Attachments are output named with the current date-time, and a semi-random
|
||||||
uid designed to crudely prevent namespace collisions. In a VERY high traffic
|
uid based on the file hash in order to prevent namespace collisions.
|
||||||
environment where many files may be created per second, this should be
|
|
||||||
re-implemented to be more robust.
|
|
||||||
No file locking is used, however files are written to a temporary directory
|
No file locking is used, however files are written to a temporary directory
|
||||||
first and renamed upon completion, as renaming is an atomic operation at an
|
first, flushed, and renamed upon completion, as renaming is an atomic operation
|
||||||
OS level.
|
at an OS level.
|
||||||
|
|
||||||
Attachments will not be created if they have an identical hash to a
|
Attachments will not be created if they have an identical hash to a
|
||||||
previously downloaded attachment. This is designed to prevent scenarios where
|
previously downloaded attachment. This is designed to prevent scenarios where
|
||||||
@@ -21,6 +20,15 @@ the same file has been accidentally sent multiple times. Note that this
|
|||||||
identification is done based on file content, and the name of the file is
|
identification is done based on file content, and the name of the file is
|
||||||
irrelevant.
|
irrelevant.
|
||||||
|
|
||||||
|
Once an attachment is saved, we look up the mapping of fields in the attachment
|
||||||
|
for input, using a custom mapping based on the domain of the email address of
|
||||||
|
the sender. Different companies may use differing CSV formats, but we work on
|
||||||
|
the assumption that the XML for input into Logistic will need to be the same and
|
||||||
|
based on a well defined DTD.
|
||||||
|
|
||||||
|
After mapping and conversion to XML, the file is validated against the DTD and
|
||||||
|
written to the xml directory for injection into Logistic.
|
||||||
|
|
||||||
Logging is fairly primitive and done to a log file in the same directory as
|
Logging is fairly primitive and done to a log file in the same directory as
|
||||||
the script. This could be upgraded to syslog-style logging if required.
|
the script. This could be upgraded to syslog-style logging if required.
|
||||||
|
|
||||||
@@ -29,7 +37,8 @@ STARTTLS. If manual STARTTLS is required, the MailBox method will need to be
|
|||||||
altered to MailBoxTls. If Outlook or Gmail or similar are used, it will be
|
altered to MailBoxTls. If Outlook or Gmail or similar are used, it will be
|
||||||
necessary to implement OAUTH2.
|
necessary to implement OAUTH2.
|
||||||
|
|
||||||
Authentication is configured in a .env file as described below in the code.
|
Authentication and other user configuration are set in a .env file as
|
||||||
|
described below in the code.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Standard libraries
|
# Standard libraries
|
||||||
@@ -38,7 +47,6 @@ import sys
|
|||||||
import csv
|
import csv
|
||||||
import ssl
|
import ssl
|
||||||
import json
|
import json
|
||||||
import time
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
@@ -91,7 +99,7 @@ def save_hashes(hashes_file, hashes):
|
|||||||
json.dump(list(hashes), f)
|
json.dump(list(hashes), f)
|
||||||
|
|
||||||
def setup_ssl_context():
|
def setup_ssl_context():
|
||||||
"""Set up the SSL context."""
|
"""Set up the SSL context for mail connection."""
|
||||||
ssl_context = ssl.create_default_context()
|
ssl_context = ssl.create_default_context()
|
||||||
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_3
|
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_3
|
||||||
ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3
|
ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3
|
||||||
@@ -140,6 +148,8 @@ def csv_to_xml(csv_file, xml_file, dtd_file, csv_mapping):
|
|||||||
logging.info("XML file saved to %s", xml_file)
|
logging.info("XML file saved to %s", xml_file)
|
||||||
|
|
||||||
def get_client_mapping(client_domain):
|
def get_client_mapping(client_domain):
|
||||||
|
"""Check custom mapping for the client CSV files, returning the custom
|
||||||
|
mapping if one is found."""
|
||||||
with open("column_mapping.json", "r", encoding="utf-8") as f:
|
with open("column_mapping.json", "r", encoding="utf-8") as f:
|
||||||
column_mapping = json.load(f)
|
column_mapping = json.load(f)
|
||||||
mapping = column_mapping.get(client_domain, column_mapping["default"])
|
mapping = column_mapping.get(client_domain, column_mapping["default"])
|
||||||
@@ -147,7 +157,9 @@ def get_client_mapping(client_domain):
|
|||||||
|
|
||||||
def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
|
def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
|
||||||
dtd_name):
|
dtd_name):
|
||||||
"""Process and save email attachments."""
|
"""Process and save email attachments.
|
||||||
|
For each attachment found, save it, mark the email as read, and save
|
||||||
|
a unique hash so we never process the same attachment again."""
|
||||||
for msg in mailbox.fetch(AND(seen=False), mark_seen=False):
|
for msg in mailbox.fetch(AND(seen=False), mark_seen=False):
|
||||||
for att in msg.attachments:
|
for att in msg.attachments:
|
||||||
attachment_hash = compute_hash(att.payload)
|
attachment_hash = compute_hash(att.payload)
|
||||||
@@ -156,7 +168,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
|
|||||||
filename, file_ext = os.path.splitext(att.filename)
|
filename, file_ext = os.path.splitext(att.filename)
|
||||||
logging.info('new attachment named %s from %s',
|
logging.info('new attachment named %s from %s',
|
||||||
filename, msg.from_)
|
filename, msg.from_)
|
||||||
client_domain = msg.from_.split('@')[1]
|
client_domain = msg.from_.split('@', maxsplit=1)[1]
|
||||||
csv_mapping=get_client_mapping(client_domain)
|
csv_mapping=get_client_mapping(client_domain)
|
||||||
logging.debug('loaded client %s csv mapping %s',
|
logging.debug('loaded client %s csv mapping %s',
|
||||||
client_domain, csv_mapping)
|
client_domain, csv_mapping)
|
||||||
@@ -166,7 +178,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
|
|||||||
f"{attachment_hash[:8]}"
|
f"{attachment_hash[:8]}"
|
||||||
f"{file_ext}"
|
f"{file_ext}"
|
||||||
)
|
)
|
||||||
xml_name = final_name.split('.')[0] + '.xml'
|
xml_name = final_name.split('.', maxsplit=1)[0] + '.xml'
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
delete=False, dir="temp") as temp_file:
|
delete=False, dir="temp") as temp_file:
|
||||||
@@ -186,7 +198,7 @@ def process_attachments(mailbox, attachment_path, saved_hashes, xml_docs,
|
|||||||
mailbox.flag(msg.uid, '\\Seen', True)
|
mailbox.flag(msg.uid, '\\Seen', True)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Code entrypoint."""
|
"""Initialise logging, load environment, and get to work!"""
|
||||||
setup_logging()
|
setup_logging()
|
||||||
load_environment_variables()
|
load_environment_variables()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user