#!/usr/bin/python3

import os, sys, re
import argparse
import textwrap
import json

from urllib import parse
from bs4 import BeautifulSoup

executable_extensions = [
    '.exe', '.dll', '.lnk', '.scr', '.sys', '.ps1', '.bat', '.js', '.jse',
    '.vbs', '.vba', '.vbe', '.wsl', '.cpl',
]

options = {
    'debug': False,
    'verbose': False,
    'nocolor' : False,
    'log' : sys.stderr,
    'format' : 'text',
}


class Logger:
    colors_map = {
        'red': 31, 'green': 32, 'yellow': 33, 'blue': 34,
        'magenta': 35, 'cyan': 36, 'white': 37, 'grey': 38,
    }

    colors_dict = {
        'error': colors_map['red'],
        'trace': colors_map['magenta'],
        'info ': colors_map['green'],
        'debug': colors_map['grey'],
        'other': colors_map['grey'],
    }

    options = {}

    def __init__(self, opts = None):
        self.options.update(Logger.options)
        if opts != None and len(opts) > 0:
            self.options.update(opts)

    @staticmethod
    def with_color(c, s):
        return "\x1b[%dm%s\x1b[0m" % (c, s)

    def colored(self, txt, col):
        if self.options['nocolor']:
            return txt
        return Logger.with_color(Logger.colors_map[col], txt)

    # Invocation:
    #   def out(txt, mode='info ', fd=None, color=None, noprefix=False, newline=True):
    @staticmethod
    def out(txt, fd, mode='info ', **kwargs):
        if txt == None or fd == 'none':
            return
        elif fd == None:
            raise Exception('[ERROR] Logging descriptor has not been specified!')

        args = {
            'color': None,
            'noprefix': False,
            'newline': True,
            'nocolor' : False,
        }
        args.update(kwargs)

        if type(txt) != str:
            txt = str(txt)

        txt = txt.replace('\t', ' ' * 4)

        if args['nocolor']:
            col = ''
        elif args['color']:
            col = args['color']
            if type(col) == str and col in Logger.colors_map.keys():
                col = Logger.colors_map[col]
        else:
            col = Logger.colors_dict.setdefault(mode, Logger.colors_map['grey'])

        prefix = ''
        if mode:
            mode = '[%s] ' % mode

        if not args['noprefix']:
            if args['nocolor']:
                prefix = mode.upper()
            else:
                prefix = Logger.with_color(Logger.colors_dict['other'], '%s' % (mode.upper()))

        nl = ''
        if 'newline' in args:
            if args['newline']:
                nl = '\n'

        if 'force_stdout' in args:
            fd = sys.stdout

        if type(fd) == str:
            with open(fd, 'a') as f:
                prefix2 = ''
                if mode:
                    prefix2 = '%s' % (mode.upper())
                f.write(prefix2 + txt + nl)
                f.flush()
        else:
            if args['nocolor']:
                fd.write(prefix + txt + nl)
            else:
                fd.write(prefix + Logger.with_color(col, txt) + nl)

    # Info shall be used as an ordinary logging facility, for every desired output.
    def info(self, txt, forced = False, **kwargs):
        kwargs['nocolor'] = self.options['nocolor']
        if forced or (self.options['verbose'] or self.options['debug']) \
                or (type(self.options['log']) == str and self.options['log'] != 'none'):
            Logger.out(txt, self.options['log'], 'info', **kwargs)

    def text(self, txt, **kwargs):
        kwargs['noprefix'] = True
        kwargs['nocolor'] = self.options['nocolor']
        Logger.out(txt, self.options['log'], '', **kwargs)

    def dbg(self, txt, **kwargs):
        if self.options['debug']:
            kwargs['nocolor'] = self.options['nocolor']
            Logger.out(txt, self.options['log'], 'debug', **kwargs)

    def err(self, txt, **kwargs):
        kwargs['nocolor'] = self.options['nocolor']
        Logger.out(txt, self.options['log'], 'error', **kwargs)

    def fatal(self, txt, **kwargs):
        kwargs['nocolor'] = self.options['nocolor']
        Logger.out(txt, self.options['log'], 'error', **kwargs)
        os._exit(1)


logger = Logger(options)


class PhishingMailParser:
    def __init__(self, options):
        self.options = options
        self.results = {}

    def parse(self, html):
        self.html = html
        self.soup = BeautifulSoup(html, features="lxml")

        self.results['Embedded Images'] = self.testEmbeddedImages()
        self.results['Images without ALT'] = self.testImagesNoAlt()
        self.results['Masqueraded Links'] = self.testMaskedLinks()
        self.results['Use of underline tag <u>'] = self.testUnderlineTag()
        self.results['HTML code in link tags'] = self.testLinksWithHtmlCode()
        self.results['<a href="..."> URL contained GET parameter'] = self.testLinksWithGETParams()
        self.results['<a href="..."> URL contained GET parameter with URL'] = self.testLinksWithGETParamsBeingURLs()
        self.results['<a href="..."> URL pointed to an executable file'] = self.testLinksWithDangerousExtensions()

        return {k: v for k, v in self.results.items() if v}

    @staticmethod
    def context(tag):
        s = str(tag)
        if len(s) < 100:
            return s

        beg = s[:50]
        end = s[-50:]
        return f'{beg}...{end}'

    def testUnderlineTag(self):
        links = self.soup('u')
        if not links or len(links) == 0:
            return []

        desc = 'Underline <u> tags are recognized by anti-spam filters and trigger an additional rule (Office365: 67856001), but on their own should not impact the spam score.'
        result = f'- Found {len(links)} <u> tags. This is not by itself an indication of spam, but is known to trigger some rules (like Office365: 67856001)\n'

        context = ''
        for i in range(len(links)):
            context += str(links[i]) + '\n\n'
            if i > 5:
                break

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithHtmlCode(self):
        links = self.soup('a')

        desc = 'Links that contain HTML code within <a> ... </a> may increase the Spam score heavily'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            text = str(link)

            pos = text.find('>')
            code = text[pos+1:]

            m = re.search(r'(.+)<\s*/\s*a\s*>', code, re.I)
            if m:
                code = m.group(1)

            # Flag the link only when its inner content itself contains HTML markup.
            suspicious = '<' in code and '>' in code

            if suspicious:
                num += 1
                if num < 5:
                    N = 70
                    tmp = text[:N]
                    if len(text) > N:
                        tmp += ' ... ' + text[-N:]
                    context += tmp + '\n'

                    code2 = PhishingMailParser.context(code)
                    context += f"\n\t- {logger.colored('Code inside of <a> tag:', 'red')}\n\t\t" + logger.colored(code2, 'yellow') + '\n'

        if num > 0:
            result += f'- Found {num} <a> tags that contained HTML code inside!\n'
            result += '\t Links conveying HTML code within <a> ... </a> may greatly increase the message Spam score!\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithGETParams(self):
        links = self.soup('a')

        desc = 'Links with URLs containing GET parameters will be noticed by anti-spam filters, resulting in another rule triggering on the message (Office365: 21615005).'
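
        # Illustrative example (hypothetical URL, not taken from any real message):
        # for href="https://example.com/click?cid=42&user=jdoe",
        # parse.parse_qsl(parse.urlsplit(href).query) yields [('cid', '42'), ('user', 'jdoe')],
        # so the dict built below is non-empty and the link gets reported.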
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            params = dict(parse.parse_qsl(parse.urlsplit(href).query))

            if len(params) > 0:
                num += 1
                if num < 5:
                    context += PhishingMailParser.context(link) + '\n'
                    hr = href[:90]
                    pos = hr.find('?')
                    hr = hr[:pos] + logger.colored(hr[pos:], 'yellow')
                    context += f'\thref = "{hr}"\n'
                    context += f'\ttext = "{text[:90]}"\n\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs containing GET params.\n'
            result += '\t Links with URLs that contain GET params might trigger anti-spam rule (Office365: 21615005)\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithDangerousExtensions(self):
        links = self.soup('a')

        desc = 'Message contained <a> tags with href="..." links pointing to a file with a dangerous extension (such as .exe)'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            parsed = parse.urlsplit(href)

            if '.' not in parsed.path:
                continue

            pos = parsed.path.rfind('.')
            if pos == -1:
                continue

            extension = parsed.path.lower()[pos:]

            if extension in executable_extensions:
                num += 1
                if num < 5:
                    context += PhishingMailParser.context(link) + '\n'
                    hr = href[:90]
                    pos1 = hr.lower().find(extension.lower())
                    hr = logger.colored(hr[:pos1], 'yellow') + logger.colored(hr[pos1:pos1+len(extension)], 'red') + logger.colored(hr[pos1+len(extension):], 'yellow')
                    context += f'\thref = "{hr}"\n'
                    context += f'\ttext = "{text[:90]}"\n\n'
                    context += f'\tExtension matched: {logger.colored(extension, "red")}\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs pointing to files with dangerous extensions (such as .exe).\n'
            result += '\t Links with URLs that point to potentially executable files might trigger anti-spam rule (Office365: 460985005)\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithGETParamsBeingURLs(self):
        links = self.soup('a')

        desc = 'Links with URLs that contain GET parameters pointing to another URL will trigger two Office365 anti-spam rules (Office365: 45080400002).'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            params = dict(parse.parse_qsl(parse.urlsplit(href).query))
            url = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')

            if len(params) > 0:
                for k, v in params.items():
                    m = url.match(v)
                    if m:
                        urlmatched = m.group(1)
                        num += 1
                        if num < 5:
                            context += PhishingMailParser.context(link) + '\n'
                            hr = logger.colored(href[:90], 'yellow')
                            context += f'\thref = "{hr}"\n'
                            context += f'\ttext = "{text[:90]}"\n\n'
                            context += '\thref URL GET parameter contained another URL:\n\t\t' + logger.colored(v, 'red') + '\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs containing GET params containing another URL.\n'
            result += '\t Links with URLs that contain GET params with another URL might trigger anti-spam rule (Office365: 45080400002)\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testMaskedLinks(self):
        links = self.soup('a')

        desc = 'Links that masquerade their href= attribute by displaying a different link are considered harmful and will increase the Spam score.'
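
        # Illustrative example (hypothetical URLs): a masqueraded link such as
        #   <a href="http://tracker.example.net/r?id=1">https://login.example.com</a>
        # has an href and a visible text that both parse as URLs, which is what the
        # regex match on both fields below detects.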
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            url = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')

            m1 = url.match(href)
            m2 = url.match(text)

            if m1 and m2:
                num += 1
                if num < 5:
                    context += PhishingMailParser.context(link) + '\n'
                    context += f'\thref = "{logger.colored(href[:90], "green")}"\n'
                    context += f'\ttext = "{logger.colored(text[:90], "red")}"\n\n'

        if num > 0:
            result += f'- Found {num} <a> tags that masquerade their href="" links with text!\n'
            result += '\t Links that try to hide the underlying URL are harmful and will be considered as Spam!\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testImagesNoAlt(self):
        images = self.soup('img')

        desc = 'Images without an ALT="value" attribute may increase the Spam score.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for img in images:
            alt = ''
            try:
                alt = img['alt']
            except:
                pass

            if alt == '':
                num += 1
                if num < 5:
                    context += PhishingMailParser.context(img) + '\n\n'

        if num > 0:
            result += f'- Found {num} <img> tags without ALT="value" attribute.\n'
            result += '\t Images without alternate text set in their attribute may increase Spam score\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testEmbeddedImages(self):
        images = self.soup('img')

        desc = 'Embedded images can increase Spam Confidence Level (SCL) in Office365 by 4 points. Embedded images are those with src="data:image/...;base64, ...". They should be avoided.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for img in images:
            src = img.get('src', '')
            alt = ''
            try:
                alt = img['alt']
            except:
                pass

            if src.lower().startswith('data:image/'):
                if len(embed) == 0:
                    embed = src[:30]

                num += 1
                if num < 5:
                    if len(alt) > 0:
                        context += f'- ALT="{alt}": ' + PhishingMailParser.context(img) + '\n'
                    else:
                        ctx = PhishingMailParser.context(img)
                        pos = ctx.find('data:')
                        pos2 = ctx.find('"', pos+1)
                        ctx = logger.colored(ctx[:pos], 'yellow') + logger.colored(ctx[pos:pos2], 'red') + logger.colored(ctx[pos2:], 'yellow')
                        context += ctx + '\n'

        if num > 0:
            result += f'- Found {num} <img> tags with embedded image ({embed}).\n'
            result += '\t Embedded images increase Office365 SCL (Spam) level by 4 points!\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }


def printOutput(out):
    if options['format'] == 'text':
        num = 0

        for k, v in out.items():
            num += 1
            analysis = v['analysis'].strip()
            context = v['context'].strip()
            desc = '\n'.join(textwrap.wrap(
                v['description'],
                width = 80,
                initial_indent = '',
                subsequent_indent = '    '
            )).strip()

            analysis = analysis.replace('- ', '\t- ')

            print(f'''
------------------------------------------
({num}) Test: {logger.colored(k, "cyan")}

{logger.colored("DESCRIPTION", "blue")}:

    {desc}

{logger.colored("CONTEXT", "blue")}:

{context}

{logger.colored("ANALYSIS", "blue")}:

{analysis}
''')

    elif options['format'] == 'json':
        print(json.dumps(out))


def opts(argv):
    global options

    o = argparse.ArgumentParser(
        usage = 'phishing-HTML-linter.py [options] <file.html>'
    )

    req = o.add_argument_group('Required arguments')
    req.add_argument('file', help = 'Input HTML file')

    args = o.parse_args()
    options.update(vars(args))
    return args


def main(argv):
    args = opts(argv)
    if not args:
        return False

    print('''
    :: Phishing HTML Linter
    Shows you bad smells in your HTML code that will get your mails busted!
    Mariusz Banach / mgeeky
''')

    html = ''
    with open(args.file, 'rb') as f:
        html = f.read()

    p = PhishingMailParser({})
    ret = p.parse(html.decode())

    if len(ret) > 0:
        printOutput(ret)
    else:
        print('\n[+] Congrats! Your message does not have any known bad smells that could trigger anti-spam rules.\n')


if __name__ == '__main__':
    main(sys.argv)
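
# Example invocation (a minimal sketch; "mail.html" is a placeholder for the HTML
# body of the message you want to lint, and the command assumes this script was
# saved as phishing-HTML-linter.py):
#
#   python3 phishing-HTML-linter.py mail.html
#
# For every triggered test the linter prints its description, the offending HTML
# context and a short analysis; otherwise it reports that no known bad smells
# were found.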