Improve email body extraction and decoding

This commit is contained in:
2019-11-26 15:17:02 +01:00
parent 086a3fc0ce
commit 9e5f51f6f5
2 changed files with 49 additions and 64 deletions

View File

@@ -215,48 +215,47 @@ class EMailNotification(BaseNotification):
img.add_header("Content-ID", "<{}>".format(basename(img_path))) img.add_header("Content-ID", "<{}>".format(basename(img_path)))
self.embedded_imgs.append(img) self.embedded_imgs.append(img)
def get_text(self, queueid, part): def get_decoded_email_body(self, queueid, msg, preferred=_html_text):
"Get the mail text in html form from email part." "Find and decode email body."
mimetype = part.get_content_type() # try to find the body part
self.logger.debug("{}: trying to find email body".format(queueid))
body = None
for part in msg.walk():
content_type = part.get_content_type()
if content_type in [EMailNotification._plain_text,
EMailNotification._html_text]:
body = part
if content_type == preferred:
break
self.logger.debug( if body is not None:
"{}: extracting content of email text part".format(queueid)) # get the character set, fallback to utf-8 if not defined in header
text = part.get_payload(decode=True) charset = body.get_content_charset()
if charset is None:
charset = "utf-8"
if mimetype == EMailNotification._plain_text: # decode content
content = body.get_payload(decode=True).decode(
encoding=charset, errors="replace")
content_type = body.get_content_type()
if content_type == EMailNotification._plain_text:
# convert text/plain to text/html
self.logger.debug( self.logger.debug(
"{}: content mimetype is {}, converting to {}".format( "{}: content type is {}, converting to {}".format(
queueid, mimetype, self._html_text)) queueid, content_type, EMailNotification._html_text))
text = re.sub(r"^(.*)$", r"\1<br/>", content = re.sub(r"^(.*)$", r"\1<br/>",
escape(text.decode()), flags=re.MULTILINE) escape(content), flags=re.MULTILINE)
else: else:
self.logger.debug( self.logger.debug(
"{}: content mimetype is {}".format( "{}: content type is {}".format(
queueid, mimetype)) queueid, content_type))
self.logger.debug( else:
"{}: trying to create BeatufilSoup object with parser lib {}, " self.logger.error(
"text length is {} bytes".format( "{}: unable to find email body".format(queueid))
queueid, self.parser_lib, len(text))) content = "ERROR: unable to find email body"
soup = BeautifulSoup(text, self.parser_lib)
self.logger.debug(
"{}: sucessfully created BeautifulSoup object".format(queueid))
return soup
def get_text_multipart(self, queueid, msg, preferred=_html_text): return content
"Get the mail text of a multipart email in html form."
soup = None
for part in msg.get_payload():
mimetype = part.get_content_type()
if mimetype in [EMailNotification._plain_text,
EMailNotification._html_text]:
soup = self.get_text(queueid, part)
elif mimetype.startswith("multipart"):
soup = self.get_text_multipart(queueid, part, preferred)
if soup is not None and mimetype == preferred:
break
return soup
def sanitize(self, queueid, soup): def sanitize(self, queueid, soup):
"Sanitize mail html text." "Sanitize mail html text."
@@ -293,27 +292,6 @@ class EMailNotification(BaseNotification):
del(element.attrs[attribute]) del(element.attrs[attribute])
return soup return soup
def get_html_text_part(self, queueid, msg):
"Get the mail text of an email in html form."
soup = None
mimetype = msg.get_content_type()
self.logger.debug(
"{}: trying to find text part of email".format(queueid))
if mimetype in [EMailNotification._plain_text,
EMailNotification._html_text]:
soup = self.get_text(queueid, msg)
elif mimetype.startswith("multipart"):
soup = self.get_text_multipart(queueid, msg)
if soup is None:
self.logger.error(
"{}: unable to extract text part of email".format(queueid))
text = "ERROR: unable to extract text from email body"
soup = BeautifulSoup(text, "lxml", "UTF-8")
return soup
def notify(self, queueid, quarantine_id, mailfrom, recipients, headers, fp, def notify(self, queueid, quarantine_id, mailfrom, recipients, headers, fp,
subgroups=None, named_subgroups=None, synchronous=False): subgroups=None, named_subgroups=None, synchronous=False):
"Notify recipients via email." "Notify recipients via email."
@@ -330,12 +308,19 @@ class EMailNotification(BaseNotification):
named_subgroups, named_subgroups,
synchronous) synchronous)
# extract html text from email # extract body from email
self.logger.debug( content = self.get_decoded_email_body(
"{}: extraction email text from original email".format(queueid))
soup = self.get_html_text_part(
queueid, email.message_from_binary_file(fp)) queueid, email.message_from_binary_file(fp))
# create BeautifulSoup object
self.logger.debug(
"{}: trying to create BeatufilSoup object with parser lib {}, "
"text length is {} bytes".format(
queueid, self.parser_lib, len(content)))
soup = BeautifulSoup(content, self.parser_lib)
self.logger.debug(
"{}: sucessfully created BeautifulSoup object".format(queueid))
# replace picture sources # replace picture sources
image_replaced = False image_replaced = False
if self.strip_images: if self.strip_images:

View File

@@ -1 +1 @@
__version__ = "0.0.4" __version__ = "0.0.5"