
633 lines
18 KiB
Raw Normal View History

2021-10-17 15:22:05 +02:00
import os, sys, re
import string
import argparse
import yaml
2021-10-28 21:12:23 +02:00
import textwrap
2021-10-17 15:22:05 +02:00
import json
from urllib import parse
2021-10-17 15:22:05 +02:00
from bs4 import BeautifulSoup
# File extensions (lower-case, with the leading dot) that
# PhishingMailParser.testLinksWithDangerousExtensions treats as executable.
# NOTE(review): the entries of this list were not present in the reviewed
# source; only '.exe' is attested (by the test's description text). The
# remaining entries are a conservative reconstruction - confirm against the
# upstream list before relying on them.
executable_extensions = [
    '.exe',
    '.dll',
    '.scr',
    '.com',
    '.bat',
    '.cmd',
    '.msi',
    '.hta',
    '.lnk',
    '.js',
    '.jse',
    '.vbs',
    '.vbe',
    '.wsf',
    '.ps1',
    '.jar',
]

# Global run-time settings shared by Logger and printOutput().
#   debug / verbose : gate Logger.dbg() and Logger.info() output
#   nocolor         : disable ANSI colouring
#   log             : stream object, file path, or the string 'none'
#   format          : 'text' or 'json' (consumed by printOutput)
options = {
    'debug': False,
    'verbose': False,
    'nocolor' : False,
    'log' : sys.stderr,
    'format' : 'text',
}
class Logger:
    """Small colourising logger that writes to a stream or appends to a file.

    The instance methods (info/text/dbg/err/fatal) read their settings from
    ``self.options`` and delegate the actual writing to the static ``out``.
    Recognised option keys: 'nocolor', 'verbose', 'debug' and 'log' (a
    writable stream, a file path, or the string 'none' to discard output).
    """

    # ANSI SGR codes keyed by colour name.
    colors_map = {
        'red': 31,
        'green': 32,
        'yellow': 33,
        'blue': 34,
        'magenta': 35,
        'cyan': 36,
        'white': 37,
        'grey': 38,
    }

    # Colour of the message body for each log mode ('other' colours the prefix).
    colors_dict = {
        'error': colors_map['red'],
        'trace': colors_map['magenta'],
        'info ': colors_map['green'],
        'debug': colors_map['grey'],
        'other': colors_map['grey'],
    }

    # Class-wide defaults copied into every instance.
    options = {}

    def __init__(self, opts = None):
        """Create a logger; ``opts`` (a dict, optional) overrides the defaults."""
        # Work on a per-instance copy so that configuring one logger never
        # leaks into the class-level defaults.
        # NOTE(review): the body of this branch was missing in the reviewed
        # source; merging ``opts`` into the options is the presumed intent.
        self.options = dict(Logger.options)
        if opts is not None and len(opts) > 0:
            self.options.update(opts)

    def _opt(self, name, default = None):
        """Return option ``name``, or ``default`` when it was never configured."""
        return self.options.get(name, default)

    @staticmethod
    def _mode_color(mode):
        """Return the body colour for ``mode``, ignoring padding in table keys.

        The table key for info is the padded string 'info ' while info() passes
        'info'; comparing stripped names makes both spellings resolve to the
        same colour. Unknown modes fall back to grey. The table is only read,
        never modified.
        """
        wanted = (mode or '').strip()
        for name, code in Logger.colors_dict.items():
            if name.strip() == wanted:
                return code
        return Logger.colors_map['grey']

    @staticmethod
    def with_color(c, s):
        """Wrap ``s`` in the ANSI escape sequence for numeric colour ``c``."""
        return "\x1b[%dm%s\x1b[0m" % (c, s)

    def colored(self, txt, col):
        """Return ``txt`` coloured with the named colour ``col``.

        Returns ``txt`` unchanged when the 'nocolor' option is set.
        Raises KeyError for a colour name that is not in ``colors_map``.
        """
        if self._opt('nocolor', False):
            return txt

        return Logger.with_color(Logger.colors_map[col], txt)

    # Invocation:
    #   def out(txt, mode='info ', fd=None, color=None, noprefix=False, newline=True):
    @staticmethod
    def out(txt, fd, mode='info ', **kwargs):
        """Write one log record.

        txt     -- message; non-strings are converted with str(), tabs become
                   four spaces. ``None`` is silently ignored.
        fd      -- writable stream, a file path (appended to), or 'none' to
                   discard the message.
        mode    -- log level name; rendered upper-cased as a '[MODE] ' prefix.
        kwargs  -- color (name or numeric code), noprefix, newline, nocolor,
                   force_stdout.

        Raises Exception when ``fd`` is None.
        """
        if txt is None or fd == 'none':
            return
        elif fd is None:
            raise Exception('[ERROR] Logging descriptor has not been specified!')

        args = {
            'color': None,
            'noprefix': False,
            'newline': True,
            'nocolor' : False
        }
        args.update(kwargs)

        if not isinstance(txt, str):
            txt = str(txt)

        txt = txt.replace('\t', ' ' * 4)

        # The colour must be resolved before ``mode`` is turned into a prefix.
        col = Logger._mode_color(mode)
        requested = args['color']
        if requested:
            if isinstance(requested, str):
                # Unknown colour names fall back to the mode's colour.
                col = Logger.colors_map.get(requested, col)
            else:
                col = requested

        mode = '[%s] ' % mode if mode else ''

        prefix = ''
        if mode and not args['noprefix']:
            if args['nocolor']:
                prefix = mode.upper()
            else:
                prefix = Logger.with_color(Logger.colors_dict['other'], '%s'
                    % (mode.upper()))

        nl = '\n' if args['newline'] else ''

        if args.get('force_stdout'):
            fd = sys.stdout

        if isinstance(fd, str):
            # File output is never coloured and always carries the prefix.
            with open(fd, 'a') as f:
                prefix2 = ''
                if mode:
                    prefix2 = '%s' % (mode.upper())
                f.write(prefix2 + txt + nl)
        elif args['nocolor']:
            fd.write(prefix + txt + nl)
        else:
            fd.write(prefix + Logger.with_color(col, txt) + nl)

    # Info shall be used as an ordinary logging facility, for every desired output.
    def info(self, txt, forced = False, **kwargs):
        """Log at info level.

        Emitted only when ``forced`` is set, when the 'verbose' or 'debug'
        option is on, or when logging goes to a file path other than 'none'.
        """
        kwargs['nocolor'] = self._opt('nocolor', False)
        log = self._opt('log')
        if forced or (self._opt('verbose', False) or \
            self._opt('debug', False)) \
            or (isinstance(log, str) and log != 'none'):
            Logger.out(txt, log, 'info', **kwargs)

    def text(self, txt, **kwargs):
        """Log ``txt`` verbatim, without any level prefix."""
        # The key has to be spelled 'noprefix' - that is what out() reads.
        kwargs['noprefix'] = True
        kwargs['nocolor'] = self._opt('nocolor', False)
        Logger.out(txt, self._opt('log'), '', **kwargs)

    def dbg(self, txt, **kwargs):
        """Log at debug level; a no-op unless the 'debug' option is on."""
        if self._opt('debug', False):
            kwargs['nocolor'] = self._opt('nocolor', False)
            Logger.out(txt, self._opt('log'), 'debug', **kwargs)

    def err(self, txt, **kwargs):
        """Log at error level (always emitted)."""
        kwargs['nocolor'] = self._opt('nocolor', False)
        Logger.out(txt, self._opt('log'), 'error', **kwargs)

    def fatal(self, txt, **kwargs):
        """Log at error level.

        NOTE(review): in the reviewed source this method is identical to
        err() and does not terminate the process - confirm whether an exit
        call is expected here.
        """
        kwargs['nocolor'] = self._opt('nocolor', False)
        Logger.out(txt, self._opt('log'), 'error', **kwargs)
# Module-wide Logger instance, configured from the global `options` dict;
# PhishingMailParser and printOutput use it to colourise their output.
logger = Logger(options)
2021-10-17 15:22:05 +02:00
class PhishingMailParser:
    """Runs a set of anti-spam heuristics over the HTML body of a message.

    ``parse()`` feeds the markup to BeautifulSoup and calls every ``test*``
    method. Each test returns an empty list when it found nothing, or a dict
    with the keys 'description', 'context' and 'analysis' otherwise.
    """

    # Heuristic used to decide whether a string looks like a URL.
    URL_PATTERN = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')

    # Only this many offending tags are quoted in a finding's context; the
    # remaining ones are still counted in the analysis text.
    CONTEXT_LIMIT = 4

    def __init__(self, options):
        """Store ``options`` (kept for callers; no test reads it) and reset results."""
        self.options = options
        self.results = {}

    def parse(self, html):
        """Run all tests against ``html`` (a str).

        Returns a dict mapping test title -> finding dict, containing only
        the tests that reported something.
        """
        self.html = html
        self.soup = BeautifulSoup(html, features="lxml")

        self.results['Embedded Images'] = self.testEmbeddedImages()
        self.results['Images without ALT'] = self.testImagesNoAlt()
        self.results['Masqueraded Links'] = self.testMaskedLinks()
        self.results['Use of underline tag <u>'] = self.testUnderlineTag()
        self.results['HTML code in <a> link tags'] = self.testLinksWithHtmlCode()
        self.results['<a href="..."> URL contained GET parameter'] = self.testLinksWithGETParams()
        self.results['<a href="..."> URL contained GET parameter with URL'] = self.testLinksWithGETParamsBeingURLs()
        self.results['<a href="..."> URL pointed to an executable file'] = self.testLinksWithDangerousExtensions()

        return {k: v for k, v in self.results.items() if v}

    @staticmethod
    def context(tag):
        """Return ``str(tag)``, shortened to 'first 50...last 50' when 100+ chars long."""
        s = str(tag)
        if len(s) < 100:
            return s

        beg = s[:50]
        end = s[-50:]
        return f'{beg}...{end}'

    @staticmethod
    def _finding(desc, context, result):
        """Build a test's return value: [] when ``result`` is empty, else the finding dict."""
        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    @staticmethod
    def _split_url(href):
        """Return ``urlsplit(href)``, or None when the URL cannot be parsed."""
        try:
            return parse.urlsplit(href)
        except ValueError:
            return None

    @staticmethod
    def _query_params(href):
        """Return the GET parameters of ``href`` as a dict (empty if none or unparsable)."""
        parsed = PhishingMailParser._split_url(href)
        if parsed is None:
            return {}
        return dict(parse.parse_qsl(parsed.query))

    @staticmethod
    def _comparable_url(url):
        """Normalise a URL for comparison: trim, lower-case, drop scheme and trailing '/'."""
        u = url.strip().lower()
        for scheme in ('http://', 'https://'):
            if u.startswith(scheme):
                u = u[len(scheme):]
                break
        return u.rstrip('/')

    def _anchors_with_href(self):
        """Yield ``(tag, href, text)`` for every <a> that carries a non-empty href.

        Anchors without an href (e.g. named anchors) are skipped instead of
        raising KeyError on the attribute lookup.
        """
        for link in self.soup('a'):
            href = link.get('href')
            if not href:
                continue
            yield link, href, link.getText()

    def testUnderlineTag(self):
        """Report the presence of <u> tags."""
        links = self.soup('u')

        if not links or len(links) == 0:
            return []

        desc = 'Underline tags are recognized by anti-spam filters and trigger additional rule (Office365: 67856001), but by their own shouldnt impact spam score.'
        result = f'- Found {len(links)} <u> tags. This is not by itself an indication of spam, but is known to trigger some rules (like Office365: 67856001)\n'

        # At most the first seven tags are quoted.
        context = ''.join(str(tag) + '\n\n' for tag in links[:7])

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithHtmlCode(self):
        """Report <a> tags whose body contains HTML markup rather than plain text."""
        desc = 'Links that contain HTML code within <a> ... </a> may increase Spam score heavily'
        context = ''
        result = ''
        num = 0

        for link in self.soup('a'):
            text = str(link)

            # Body of the link: everything after the opening tag, minus the
            # closing </a>. DOTALL and '.*' make this work for multi-line and
            # for empty anchors as well.
            code = text[text.find('>') + 1:]
            m = re.search(r'(.*)<\s*/\s*a\s*>', code, re.I | re.S)
            if m:
                code = m.group(1)

            # Inspect the body only - the serialised tag itself always
            # contains '<' and '>'.
            suspicious = '<' in code and '>' in code
            if not suspicious:
                continue

            num += 1

            if num <= self.CONTEXT_LIMIT:
                N = 70
                tmp = text[:N]
                if len(text) > N:
                    tmp += ' ... ' + text[-N:]

                context += tmp + '\n'

                code2 = PhishingMailParser.context(code)
                context += f"\n\t- {logger.colored('Code inside of <a> tag:','red')}\n\t\t" + logger.colored(code2, 'yellow') + '\n'

        if num > 0:
            result += f'- Found {num} <a> tags that contained HTML code inside!\n'
            result += '\t Links conveying HTML code within <a> ... </a> may greatly increase message Spam score!\n'

        return self._finding(desc, context, result)

    def testLinksWithGETParams(self):
        """Report <a> tags whose href carries GET parameters."""
        desc = 'Links with URLs containing GET parameters will be noticed by anti-spam filters resulting in another rule triggering on message (Office365: 21615005).'
        context = ''
        result = ''
        num = 0

        for link, href, text in self._anchors_with_href():
            if len(self._query_params(href)) == 0:
                continue

            num += 1

            if num <= self.CONTEXT_LIMIT:
                context += PhishingMailParser.context(link) + '\n'

                hr = href[:90]
                pos = hr.find('?')
                # The '?' may lie beyond the 90-character preview.
                if pos != -1:
                    hr = hr[:pos] + logger.colored(hr[pos:], 'yellow')

                context += f'\thref = "{hr}"\n'
                context += f'\ttext = "{text[:90]}"\n\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs containing GET params.\n'
            result += '\t Links with URLs that contain GET params might trigger anti-spam rule (Office365: 21615005)\n'

        return self._finding(desc, context, result)

    def testLinksWithDangerousExtensions(self):
        """Report <a> tags whose href path ends with an extension from ``executable_extensions``."""
        desc = 'Message contained <a> tags with href="..." links pointing to a file with dangerous extension (such as .exe)'
        context = ''
        result = ''
        num = 0

        for link, href, text in self._anchors_with_href():
            parsed = self._split_url(href)
            if parsed is None:
                continue

            pos = parsed.path.rfind('.')
            if pos == -1:
                continue

            # Slice first, lower-case second: lower() may change the length.
            extension = parsed.path[pos:].lower()
            if extension not in executable_extensions:
                continue

            num += 1

            if num <= self.CONTEXT_LIMIT:
                context += PhishingMailParser.context(link) + '\n'

                hr = href[:90]
                pos1 = hr.lower().find(extension)

                if pos1 == -1:
                    # Extension lies beyond the 90-character preview.
                    hr = logger.colored(hr, 'yellow')
                else:
                    end = pos1 + len(extension)
                    hr = logger.colored(hr[:pos1], 'yellow') + logger.colored(hr[pos1:end], 'red') + logger.colored(hr[end:], 'yellow')

                context += f'\thref = "{hr}"\n'
                context += f'\ttext = "{text[:90]}"\n\n'
                context += f'\tExtension matched: {logger.colored(extension, "red")}\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs pointing to files with dangerous extensions (such as .exe).\n'
            result += '\t Links with URLs that point to potentially executable files might trigger anti-spam rule (Office365: 460985005)\n'

        return self._finding(desc, context, result)

    def testLinksWithGETParamsBeingURLs(self):
        """Report <a> tags whose href has a GET parameter that itself looks like a URL."""
        desc = 'Links with URLs that contain GET parameters pointing to another URL, will trigger two Office365 anti-spam rules (Office365: 45080400002).'
        context = ''
        result = ''
        num = 0

        for link, href, text in self._anchors_with_href():
            params = self._query_params(href)

            # Each link is counted once (the analysis reports a number of
            # tags); the first URL-like parameter value is the one quoted.
            offending = next(
                (v for v in params.values() if self.URL_PATTERN.match(v)),
                None
            )
            if offending is None:
                continue

            num += 1

            if num <= self.CONTEXT_LIMIT:
                context += PhishingMailParser.context(link) + '\n'

                hr = logger.colored(href[:90], 'yellow')

                context += f'\thref = "{hr}"\n'
                context += f'\ttext = "{text[:90]}"\n\n'
                context += f'\thref URL GET parameter contained another URL:\n\t\t' + logger.colored(offending, "red") + '\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs containing GET params containing another URL.\n'
            result += '\t Links with URLs that contain GET params with another URL might trigger anti-spam rule (Office365: 45080400002)\n'

        return self._finding(desc, context, result)

    def testMaskedLinks(self):
        """Report <a> tags that display one URL as their text but point to another."""
        desc = 'Links that masquerade their href= attribute by displaying different link are considered harmful and will increase Spam score.'
        context = ''
        result = ''
        num = 0

        for link, href, text in self._anchors_with_href():
            shown = text.strip()

            m1 = self.URL_PATTERN.match(href)
            m2 = self.URL_PATTERN.match(shown)

            if not (m1 and m2):
                continue

            # A link that simply displays its own target is not masquerading.
            if self._comparable_url(href) == self._comparable_url(shown):
                continue

            num += 1

            if num <= self.CONTEXT_LIMIT:
                context += PhishingMailParser.context(link) + '\n'
                context += f'\thref = "{logger.colored(href[:90],"green")}"\n'
                context += f'\ttext = "{logger.colored(text[:90],"red")}"\n\n'

        if num > 0:
            result += f'- Found {num} <a> tags that masquerade their href="" links with text!\n'
            result += '\t Links that try to hide underyling URL are harmful and will be considered as Spam!\n'

        return self._finding(desc, context, result)

    def testImagesNoAlt(self):
        """Report <img> tags whose alt attribute is missing or empty."""
        desc = 'Images without ALT="value" attribute may increase Spam scorage.'
        context = ''
        result = ''
        num = 0

        for img in self.soup('img'):
            # A missing attribute is treated like an empty one.
            alt = img.get('alt', '')

            if alt == '':
                num += 1

                if num <= self.CONTEXT_LIMIT:
                    context += PhishingMailParser.context(img) + '\n\n'

        if num > 0:
            result += f'- Found {num} <img> tags without ALT="value" attribute.\n'
            result += '\t Images without alternate text set in their attribute may increase Spam score\n'

        return self._finding(desc, context, result)

    def testEmbeddedImages(self):
        """Report <img> tags whose src is an inline ``data:image/...`` blob."""
        desc = 'Embedded images can increase Spam Confidence Level (SCL) in Office365 by 4 points. Embedded images are those with <img src="data:image/png;base64,<BLOB>"/> . They should be avoided.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for img in self.soup('img'):
            # Missing attributes are treated as empty instead of raising.
            src = img.get('src', '')
            alt = img.get('alt', '')

            if not src.lower().startswith('data:image/'):
                continue

            # Remember the beginning of the first blob for the summary line.
            if len(embed) == 0:
                embed = src[:30]

            num += 1

            if num <= self.CONTEXT_LIMIT:
                # NOTE(review): the if/else pairing of the two output forms
                # is presumed - the branch keyword was missing in the
                # reviewed source.
                if len(alt) > 0:
                    context += f'- ALT="{alt}": ' + PhishingMailParser.context(img) + '\n'
                else:
                    ctx = PhishingMailParser.context(img)
                    pos = ctx.find('data:')

                    if pos == -1:
                        # Blob start was cut off by context() truncation.
                        ctx = logger.colored(ctx, 'yellow')
                    else:
                        pos2 = ctx.find('"', pos+1)
                        if pos2 == -1:
                            pos2 = len(ctx)

                        ctx = logger.colored(ctx[:pos], 'yellow') + logger.colored(ctx[pos:pos2], 'red') + logger.colored(ctx[pos2:], 'yellow')

                    context += ctx + '\n'

        if num > 0:
            result += f'- Found {num} <img> tags with embedded image ({embed}).\n'
            result += '\t Embedded images increase Office365 SCL (Spam) level by 4 points!\n'

        return self._finding(desc, context, result)
def printOutput(out):
    """Print the findings returned by PhishingMailParser.parse().

    out -- dict mapping test title -> finding dict with the keys
           'description', 'context' and 'analysis'.

    The layout is selected by the global ``options['format']``: 'text' prints
    a numbered, colourised report; 'json' dumps ``out`` as JSON.
    """
    if options['format'] == 'text':
        for num, (k, v) in enumerate(out.items(), start = 1):
            analysis = v['analysis'].strip()
            context = v['context'].strip()

            # NOTE(review): the wrapped argument was missing in the reviewed
            # source; the finding's description is the presumed input.
            desc = '\n'.join(textwrap.wrap(
                v['description'],
                width = 80,
                initial_indent = '',
                subsequent_indent = ' '
            ))

            # Indent the bullet points of the analysis.
            analysis = analysis.replace('- ', '\t- ')

            # NOTE(review): only the heading lines of this template were
            # present in the reviewed source; printing desc/context/analysis
            # under their headings is the presumed layout.
            print(f'''
({num}) Test: {logger.colored(k, "cyan")}

{logger.colored("DESCRIPTION", "blue")}:

{desc}

{logger.colored("CONTEXT", "blue")}:

{context}

{logger.colored("ANALYSIS", "blue")}:

{analysis}
''')

    elif options['format'] == 'json':
        # NOTE(review): the body of this branch was missing in the reviewed
        # source; dumping the findings as JSON is the presumed behaviour.
        print(json.dumps(out))
def opts(argv):
    """Parse the command line and return the argparse namespace.

    argv -- accepted for the caller's convenience but not consulted:
            ``parse_args()`` is called without arguments and therefore reads
            ``sys.argv[1:]`` itself.
            NOTE(review): confirm what main() is invoked with before passing
            ``argv`` through to the parser.

    The returned namespace has a single attribute, ``file`` (path of the
    input HTML file). argparse exits the process on invalid arguments.
    """
    o = argparse.ArgumentParser(
        # %(prog)s is expanded by argparse to the program name.
        usage = '%(prog)s [options] <file.html>'
    )

    req = o.add_argument_group('Required arguments')
    req.add_argument('file', help = 'Input HTML file')

    args = o.parse_args()
    return args
def main(argv):
    """Program entry point: lint the HTML file named on the command line.

    Prints a banner, reads the input file, runs PhishingMailParser over it
    and prints either the findings or a success message.

    Returns False when no arguments could be parsed, None otherwise.
    """
    args = opts(argv)
    if not args:
        return False

    print('''
    :: Phishing HTML Linter
    Shows you bad smells in your HTML code that will get your mails busted!
    Mariusz Banach / mgeeky
''')

    with open(args.file, 'rb') as f:
        html = f.read()

    p = PhishingMailParser({})

    # Undecodable bytes are replaced rather than aborting the whole run on a
    # message that is not valid UTF-8.
    ret = p.parse(html.decode('utf-8', errors = 'replace'))

    if len(ret) > 0:
        printOutput(ret)
    else:
        print('\n[+] Congrats! Your message does not have any known bad smells that could trigger anti-spam rules.\n')
if __name__ == '__main__':