#!/usr/bin/python3
"""Phishing HTML linter.

Parses an HTML e-mail body with BeautifulSoup and reports constructs that the
checks below consider likely to raise a spam score: embedded ``data:`` images,
images without an ``alt`` attribute, links whose visible text is a different
URL than their ``href``, and underline tags.
"""

import argparse
import json
import os
import re
import string
import sys
import textwrap

import yaml
from bs4 import BeautifulSoup

# Runtime settings shared by the CLI helpers below.
# 'format' selects how printOutput() renders results: 'text' or 'json'.
options = {
    'format': 'text',
}


class PhishingMailParser:
    """Run a fixed set of spam-smell checks against one HTML document.

    Every ``test*`` method returns either an empty list (nothing found) or a
    dict with the keys ``description``, ``context`` and ``analysis``.
    """

    # Loose URL matcher, applied to both the href value and the link text.
    # Compiled once here instead of once per <a> tag.
    URL_PATTERN = re.compile(
        r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
    )

    # Strips a leading http:// or https:// when comparing two URLs.
    SCHEME_PATTERN = re.compile(r'^https?://', re.IGNORECASE)

    # Upper bound on how many underline tags are echoed into the context.
    MAX_UNDERLINE_CONTEXT = 12

    def __init__(self, options):
        """Store the caller-supplied options and start with no results."""
        self.options = options
        self.results = {}

    def parse(self, html):
        """Parse *html* (a str) and run every check.

        Returns a dict mapping test title to that test's result dict,
        containing only the tests that actually found something.
        """
        self.html = html
        self.soup = BeautifulSoup(html, features="lxml")

        self.results['Embedded Images'] = self.testEmbeddedImages()
        self.results['Images without ALT'] = self.testImagesNoAlt()
        self.results['Masqueraded Links'] = self.testMaskedLinks()
        self.results['Use of underline tag '] = self.testUnderlineTag()

        # Tests that found nothing return an empty (falsy) value; drop them.
        return {k: v for k, v in self.results.items() if v}

    @staticmethod
    def context(tag):
        """Return ``str(tag)``, shortened to its first and last 50 characters
        (joined by ``...``) when it is 100 characters or longer."""
        s = str(tag)
        if len(s) < 100:
            return s

        beg = s[:50]
        end = s[-50:]
        return f'{beg}...{end}'

    @classmethod
    def _normalizeUrl(cls, url):
        """Reduce *url* to a form suitable for an equality comparison.

        Surrounding whitespace, a leading http(s) scheme and trailing slashes
        are removed and the result is case-folded, so that
        ``http://Example.com/`` and ``example.com`` compare equal.
        """
        return cls.SCHEME_PATTERN.sub('', url.strip()).rstrip('/').casefold()

    def testUnderlineTag(self):
        """Report the presence of underline tags.

        Returns ``[]`` when the document has none, otherwise a result dict
        whose context lists at most ``MAX_UNDERLINE_CONTEXT`` of the tags.
        """
        tags = self.soup('u')
        if not tags:
            return []

        desc = 'Underline tags are recognized by anti-spam filters and trigger additional rule (Office365: 67856001), but by their own shouldnt impact spam score.'
        result = f'- Found {len(tags)} tags. This is not by itself an indication of spam, but is known to trigger some rules (like Office365: 67856001)\n'

        context = ''.join(
            '\t- ' + str(tag) + '\n'
            for tag in tags[:self.MAX_UNDERLINE_CONTEXT]
        )

        return {
            'description': desc,
            'context': context,
            'analysis': result,
        }

    def testMaskedLinks(self):
        """Report anchors whose visible text is a URL different from the href.

        An anchor is counted only when both its ``href`` and its text match
        ``URL_PATTERN`` and the two differ after normalisation. A link that
        simply displays its own address is not a masquerade and is skipped.
        Anchors without an ``href`` attribute are ignored.

        Returns ``[]`` when nothing is found, otherwise a result dict.
        """
        desc = 'Links that masquerade their href= attribute by displaying different link are considered harmful and will increase Spam score.'
        context = ''
        result = ''
        num = 0

        for link in self.soup('a'):
            href = link.get('href')
            if href is None:
                continue

            # Leading whitespace would defeat the anchored match() below.
            href = href.strip()
            text = link.getText().strip()

            if not (self.URL_PATTERN.match(href) and self.URL_PATTERN.match(text)):
                continue

            if self._normalizeUrl(href) == self._normalizeUrl(text):
                continue

            num += 1
            context += '- ' + PhishingMailParser.context(link) + '\n'
            context += f'\thref = "{href[:64]}"\n'
            context += f'\ttext = "{text[:64]}"\n\n'

        if num > 0:
            result += f'- Found {num} tags that masquerade their href="" links with text!\n'
            result += '\t Links that try to hide underyling URL are harmful and will be considered as Spam!\n'

        if not result:
            return []

        return {
            'description': desc,
            'context': context,
            'analysis': result,
        }

    def testImagesNoAlt(self):
        """Report images whose ``alt`` attribute is missing or empty.

        Returns ``[]`` when every image has alternate text, otherwise a
        result dict.
        """
        desc = 'Images without ALT="value" attribute may increase Spam scorage.'
        context = ''
        result = ''
        num = 0

        for img in self.soup('img'):
            # A missing attribute and an empty one are treated the same way.
            if img.get('alt', '') == '':
                num += 1
                context += '- ' + PhishingMailParser.context(img) + '\n'

        if num > 0:
            result += f'- Found {num} tags without ALT="value" attribute.\n'
            result += '\t Images without alternate text set in their attribute may increase Spam score\n'

        if not result:
            return []

        return {
            'description': desc,
            'context': context,
            'analysis': result,
        }

    def testEmbeddedImages(self):
        """Report images embedded inline through a ``data:image/`` URI.

        Images without a ``src`` attribute are skipped instead of raising.
        Returns ``[]`` when no embedded image is found, otherwise a result
        dict whose analysis quotes the first 30 characters of the first
        embedded ``src``.
        """
        desc = 'Embedded images can increase Spam Confidence Level (SCL) in Office365 by 4 points. Embedded images are those with a src="data:image/..." attribute. They should be avoided.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for img in self.soup('img'):
            src = img.get('src', '')
            alt = img.get('alt', '')

            if not src.lower().startswith('data:image/'):
                continue

            # Keep only the prefix of the first hit as a sample for the report.
            if not embed:
                embed = src[:30]

            num += 1
            if alt:
                context += f'- ALT="{alt}": ' + PhishingMailParser.context(img) + '\n'
            else:
                context += '- ' + PhishingMailParser.context(img) + '\n'

        if num > 0:
            result += f'- Found {num} tags with embedded image ({embed}).\n'
            result += '\t Embedded images increase Office365 SCL (Spam) level by 4 points!\n'

        if not result:
            return []

        return {
            'description': desc,
            'context': context,
            'analysis': result,
        }


def printOutput(out):
    """Print the results returned by ``PhishingMailParser.parse``.

    The layout is chosen by the module-level ``options['format']``: ``'text'``
    prints one numbered section per test, ``'json'`` prints a single JSON
    document. Any other value prints nothing.
    """
    if options['format'] == 'text':
        for num, (k, v) in enumerate(out.items(), start=1):
            context = v['context']

            desc = '\n'.join(textwrap.wrap(
                v['description'],
                width = 80,
                initial_indent = '',
                subsequent_indent = ' '
            ))

            # Indent the bullet points of the analysis one level deeper.
            analysis = v['analysis'].replace('- ', '\t- ')

            print(f'''
------------------------------------------
({num}) Test: {k}

DESCRIPTION:

{desc}

CONTEXT:

{context}

ANALYSIS:

{analysis}
''')

    elif options['format'] == 'json':
        print(json.dumps(out))


def opts(argv):
    """Parse the command line and return the argparse namespace.

    *argv* is the full argument vector including the program name (that is,
    ``sys.argv``); everything after the first element is parsed. The selected
    output format is also stored in the module-level ``options`` dict so that
    ``printOutput`` picks it up.
    """
    o = argparse.ArgumentParser(
        usage = 'phishing-HTML-linter.py [options] <file>'
    )

    req = o.add_argument_group('Required arguments')
    req.add_argument('file', help = 'Input HTML file')

    o.add_argument(
        '-f', '--format',
        choices = ('text', 'json'),
        default = options['format'],
        help = 'Output format. Default: text'
    )

    args = o.parse_args(argv[1:])
    options['format'] = args.format
    return args


def main(argv):
    """Script entry point: lint the HTML file named on the command line.

    Returns ``False`` when no arguments could be obtained and ``True`` after
    the report has been printed.
    """
    args = opts(argv)
    if not args:
        return False

    # The banner is only printed in text mode so JSON output stays parseable.
    if options['format'] == 'text':
        print('''
    :: Phishing HTML Linter
    Shows you bad smells in your HTML code that will get your mails busted!
    Mariusz Banach / mgeeky
''')

    with open(args.file, 'rb') as f:
        html = f.read()

    p = PhishingMailParser(options)

    # Undecodable bytes are replaced rather than aborting the whole run.
    ret = p.parse(html.decode(errors='replace'))

    printOutput(ret)
    return True


if __name__ == '__main__':
    main(sys.argv)