mgeeky-Penetration-Testing-.../phishing/phishing-HTML-linter.py

#!/usr/bin/python3

import os, sys, re
import string
import argparse
import yaml
import textwrap
import json
from urllib import parse
from bs4 import BeautifulSoup

options = {
    'format' : 'text',
}

executable_extensions = [
    '.exe',
    '.dll',
    '.lnk',
    '.scr',
    '.sys',
    '.ps1',
    '.bat',
    '.js',
    '.jse',
    '.vbs',
    '.vba',
    '.vbe',
    '.wsl',
    '.cpl',
]


options = {
    'debug': False,
    'verbose': False,
    'nocolor' : False,
    'log' : sys.stderr,
    'format' : 'text',
}

class Logger:
    colors_map = {
        'red':      31,
        'green':    32,
        'yellow':   33,
        'blue':     34,
        'magenta':  35,
        'cyan':     36,
        'white':    37,
        'grey':     38,
    }

    colors_dict = {
        'error': colors_map['red'],
        'trace': colors_map['magenta'],
        'info ': colors_map['green'],
        'debug': colors_map['grey'],
        'other': colors_map['grey'],
    }

    options = {}

    def __init__(self, opts = None):
        self.options.update(Logger.options)
        if opts != None and len(opts) > 0:
            self.options.update(opts)

    @staticmethod
    def with_color(c, s):
        return "\x1b[%dm%s\x1b[0m" % (c, s)

    def colored(self, txt, col):
        if self.options['nocolor']:
            return txt

        return Logger.with_color(Logger.colors_map[col], txt)

    # Invocation:
    #   def out(txt, mode='info ', fd=None, color=None, noprefix=False, newline=True):
    @staticmethod
    def out(txt, fd, mode='info ', **kwargs):
        if txt == None or fd == 'none':
            return
        elif fd == None:
            raise Exception('[ERROR] Logging descriptor has not been specified!')

        args = {
            'color': None,
            'noprefix': False,
            'newline': True,
            'nocolor' : False
        }
        args.update(kwargs)

        if type(txt) != str:
            txt = str(txt)

        txt = txt.replace('\t', ' ' * 4)

        if args['nocolor']:
            col = ''
        elif args['color']:
            col = args['color']
            if type(col) == str and col in Logger.colors_map.keys():
                col = Logger.colors_map[col]
        else:
            col = Logger.colors_dict.setdefault(mode, Logger.colors_map['grey'])

        prefix = ''
        if mode:
            mode = '[%s] ' % mode

        if not args['noprefix']:
            if args['nocolor']:
                prefix = mode.upper()
            else:
                prefix = Logger.with_color(Logger.colors_dict['other'], '%s'
                % (mode.upper()))

        nl = ''
        if 'newline' in args:
            if args['newline']:
                nl = '\n'

        if 'force_stdout' in args:
            fd = sys.stdout

        if type(fd) == str:
            with open(fd, 'a') as f:
                prefix2 = ''
                if mode:
                    prefix2 = '%s' % (mode.upper())
                f.write(prefix2 + txt + nl)
                f.flush()

        else:
            if args['nocolor']:
                fd.write(prefix + txt + nl)
            else:
                fd.write(prefix + Logger.with_color(col, txt) + nl)

    # Info shall be used as an ordinary logging facility, for every desired output.
    def info(self, txt, forced = False, **kwargs):
        kwargs['nocolor'] = self.options['nocolor']
        if forced or (self.options['verbose'] or \
            self.options['debug'] ) \
            or (type(self.options['log']) == str and self.options['log'] != 'none'):
            Logger.out(txt, self.options['log'], 'info', **kwargs)

    def text(self, txt, **kwargs):
        kwargs['noPrefix'] = True
        kwargs['nocolor'] = self.options['nocolor']
        Logger.out(txt, self.options['log'], '', **kwargs)

    def dbg(self, txt, **kwargs):
        if self.options['debug']:
            kwargs['nocolor'] = self.options['nocolor']
            Logger.out(txt, self.options['log'], 'debug', **kwargs)

    def err(self, txt, **kwargs):
        kwargs['nocolor'] = self.options['nocolor']
        Logger.out(txt, self.options['log'], 'error', **kwargs)

    def fatal(self, txt, **kwargs):
        kwargs['nocolor'] = self.options['nocolor']
        Logger.out(txt, self.options['log'], 'error', **kwargs)
        os._exit(1)

logger = Logger(options)

class PhishingMailParser:

    #
    # Based on:
    #    https://journeys.autopilotapp.com/blog/email-spam-trigger-words/
    #    https://www.activecampaign.com/blog/spam-words
    #    https://blog.hubspot.com/blog/tabid/6307/bid/30684/the-ultimate-list-of-email-spam-trigger-words.aspx
    #
    Suspicious_Words = {
        'Manipulative': (
            'creating unnecessary urgency or pressure',
            (
                "Act now", "Action", "Apply now", "Apply online", "Buy", "Buy direct", "Call", "Call now", "Click here",
                "Clearance", "Click here", "Do it today", "Don't delete", "Drastically reduced", "Exclusive deal", "Expire",
                "Get", "Get it now", "Get started now", "Important information regarding", "Instant", "Limited time",
                "New customers only", "Now only", "Offer expires", "Once in a lifetime", "Order now", "Please read",
                "Special promotion", "Take action", "This won't last", "Urgent", "While stocks last"
            )
        ),

        'Needy' : (
            'sounding desperate or exaggerated claims',
            (
                "All-new", "Bargain", "Best price", "Bonus", "Email marketing", "Free", "For instant access", "Free gift",
                "Free trial", "Have you been turned down?", "Great offer", "Join millions of Americans", "Incredible deal",
                "Prize", "Satisfaction guaranteed", "Will not believe your eyes"
            )
        ),

        'Sleazy' : (
            'being too pushy',
            (
                "As seen on", "Click here", "Click below", "Deal", "Direct email", "Direct marketing", "Do it today",
                "Order now", "Order today", "Unlimited", "What are you waiting for?", "Visit our website"
            )
        ),

        'Cheap' : (
            'no pre-qualifications, everybody wins',
            (
                "Acceptance", "Access", "Avoid bankruptcy", "Boss", "Cancel", "Card accepted", "Certified",
                "Cheap", "Compare", "Compare rates", "Congratulations", "Credit card offers", "Cures", "Dear ",
                "Dear friend", "Drastically reduced", "Easy terms", "Free grant money", "Free hosting", "Free info",
                "Free membership", "Friend", "Get out of debt", "Giving away", "Guarantee", "Guaranteed",
                "Have you been turned down?", "Hello", "Information you requested", "Join millions", "No age restrictions",
                "No catch", "No experience", "No obligation", "No purchase necessary", "No questions asked",
                "No strings attached", "Offer", "Opportunity", "Save big", "Winner", "Winning", "Won", "You are a winner!",
                "You've been selected!"
            )
        ),

        'Far-fetched' : (
            'statements that are too good to be true',
            (
                "Additional income", "All-natural", "Amazing", "Be your own boss", "Big bucks", "Billion",
                "Billion dollars", "Cash", "Cash bonus", "Consolidate debt and credit", "Consolidate your debt",
                "Double your income", "Earn", "Earn cash", "Earn extra cash", "Eliminate bad credit", "Eliminate debt",
                "Extra", "Fantastic deal", "Financial freedom", "Financially independent", "Free investment", "Free money",
                "Get paid", "Home", "Home-based", "Income", "Increase sales", "Increase traffic", "Lose", "Lose weight",
                "Money back", "No catch", "No fees", "No hidden costs", "No strings attached", "Potential earnings",
                "Pure profit", "Removes wrinkles", "Reverses aging", "Risk-free", "Serious cash", "Stop snoring",
                "Vacation", "Vacation offers", "Weekend getaway", "Weight loss", "While you sleep", "Work from home"
            )
        ),

        'Exaggeration' : (
            'exaggerated claims and promises',
            (
                "100% more", "100% free", "100% satisfied", "Additional income", "Be your own boss", "Best price",
                "Big bucks", "Billion", "Cash bonus", "Cents on the dollar", "Consolidate debt", "Double your cash",
                "Double your income", "Earn extra cash", "Earn money", "Eliminate bad credit", "Extra cash", "Extra income",
                "Expect to earn", "Fast cash", "Financial freedom", "Free access", "Free consultation", "Free gift",
                "Free hosting", "Free info", "Free investment", "Free membership", "Free money", "Free preview", "Free quote",
                "Free trial", "Full refund", "Get out of debt", "Get paid", "Giveaway", "Guaranteed", "Increase sales",
                "Increase traffic", "Incredible deal", "Lower rates", "Lowest price", "Make money", "Million dollars", "Miracle",
                "Money back", "Once in a lifetime", "One time", "Pennies a day", "Potential earnings", "Prize",
                "Promise", "Pure profit", "Risk-free", "Satisfaction guaranteed", "Save big money", "Save up to", "Special promotion",
            )
        ),

        'Urgency' : (
            'create unnecessary urgency and pressure',
            (
                "Act now", "Apply now", "Become a member", "Call now", "Click below", "Click here", "Get it now",
                "Do it today", "Don’t delete", "Exclusive deal", "Get started now", "Important information regarding",
                "Information you requested", "Instant", "Limited time", "New customers only", "Order now", "Please read",
                "See for yourself", "Sign up free", "Take action", "This won’t last", "Urgent", "What are you waiting for?",
                "While supplies last", "Will not believe your eyes", "Winner", "Winning", "You are a winner", "You have been selected",

            )
        ),

        'Spammy' : (
            'shady, spammy, or unethical behavior',
            (
                "Bulk email", "Buy direct", "Cancel at any time", "Check or money order", "Congratulations", "Confidentiality",
                "Cures", "Dear friend", "Direct email", "Direct marketing", "Hidden charges", "Human growth hormone", "Internet marketing",
                "Lose weight", "Mass email", "Meet singles", "Multi-level marketing", "No catch", "No cost", "No credit check",
                "No fees", "No gimmick", "No hidden costs", "No hidden fees", "No interest", "No investment", "No obligation",
                "No purchase necessary", "No questions asked", "No strings attached", "Not junk", "Notspam", "Obligation",
                "Passwords", "Requires initial investment", "Social security number", "This isn’t a scam", "This isn’t junk",
                "This isn’t spam", "Undisclosed", "Unsecured credit", "Unsecured debt", "Unsolicited", "Valium",
                "Viagra", "Vicodin", "We hate spam", "Weight loss", "Xanax",
            )
        ),

        'Jargon' : (
            'jargon or legalese',
            (
                "Accept credit cards", "All new", "As seen on", "Bargain", "Beneficiary", "Billing", "Bonus",
                "Cards accepted", "Cash", "Certified", "Cheap", "Claims", "Clearance", "Compare rates", "Credit card offers",
                "Deal", "Debt", "Discount", "Fantastic", "In accordance with laws", "Income", "Investment", "Join millions",
                "Lifetime", "Loans", "Luxury", "Marketing solution", "Message contains", "Mortgage rates", "Name brand",
                "Offer", "Online marketing", "Opt in", "Pre-approved", "Quote", "Rates", "Refinance", "Removal", "Reserves the right",
                "Score", "Search engine", "Sent in compliance", "Subject to", "Terms and conditions", "Trial", "Unlimited",
                "Warranty", "Web traffic", "Work from home",
            )
        ),

        'Shady' : (
            'ethically or legally questionable behavior',
            (
                "Addresses", "Beneficiary", "Billing", "Casino", "Celebrity", "Collect child support", "Copy DVDs",
                "Fast viagra delivery", "Hidden", "Human growth hormone", "In accordance with laws", "Investment",
                "Junk", "Legal", "Life insurance", "Loan", "Lottery", "Luxury car", "Medicine", "Meet singles", "Message contains",
                "Miracle", "Money", "Multi-level marketing", "Nigerian", "Offshore", "Online degree", "Online pharmacy", "Passwords",
                "Refinance", "Request", "Rolex", "Score", "Social security number", "Spam", "This isn't spam", "Undisclosed recipient",
                "University diplomas", "Unsecured credit", "Unsolicited", "US dollars", "Valium", "Viagra", "Vicodin",
                "Warranty", "Xanax"
            )
        ),

        "Commerce" : (
            "",
            (
                "As seen on", "Buy", "Buy direct", "Buying judgments", "Clearance", "Order", "Order status", "Orders shipped by shopper",
            )
        ),

        "Personal" : (
            "",
            (
                "Dig up dirt on friends", "Meet singles", "Score with babes", "XXX", "Near you",
            )
        ),

        "Employment" : (
            "",
            (
                "Additional income", "Be your own boss", "Compete for your business", "Double your", "Earn $", "Earn extra cash",
                "Earn per week", "Expect to earn", "Extra income", "Home based", "Home employment", "Homebased business", "Income from home",
                "Make $", "Make money", "Money making", "Online biz opportunity", "Online degree", "Opportunity",
                "Potential earnings", "University diplomas", "While you sleep", "Work at home", "Work from home",
            )
        ),

        "Financial - General" : (
            "",
            (
                "$$$", "Affordable", "Bargain", "Beneficiary", "Best price", "Big bucks", "Cash", "Cash bonus", "Cashcashcash",
                "Cents on the dollar", "Cheap", "Check", "Claims", "Collect", "Compare rates", "Cost", "Credit", "Credit bureaus",
                "Discount", "Earn", "Easy terms", "F r e e", "Fast cash", "For just $XXX", "Hidden assets", "hidden charges",
                "Income", "Incredible deal", "Insurance", "Investment", "Loans", "Lowest price", "Million dollars", "Money",
                "Money back", "Mortgage", "Mortgage rates", "No cost", "No fees", "One hundred percent free", "Only $", "Pennies a day",
                "Price", "Profits", "Pure profit", "Quote", "Refinance", "Save $", "Save big money", "Save up to", "Serious cash",
                "Subject to credit", "They keep your money — no refund!", "Unsecured credit", "Unsecured debt",
                "US dollars", "Why pay more?",
            )
        ),

        "Financial - Business" : (
            "",
            (
                "Accept credit cards", "Cards accepted", "Check or money order", "Credit card offers", "Explode your business",
                "Full refund", "Investment decision", "No credit check", "No hidden Costs", "No investment",
                "Requires initial investment", "Sent in compliance", "Stock alert", "Stock disclaimer statement", "Stock pick",
            )
        ),

        "Financial - Personal" : (
            "",
            (
                "Avoice bankruptcy", "Calling creditors", "Collect child support", "Consolidate debt and credit",
                "Consolidate your debt", "Eliminate bad credit", "Eliminate debt", "Financially independent",
                "Get out of debt", "Get paid", "Lower interest rate", "Lower monthly payment", "Lower your mortgage rate",
                "Lowest insurance rates", "Pre-approved", "Refinance home", "Social security number", "Your income",
            )
        ),

        "General" : (
            "",
            (
                "Acceptance", "Accordingly", "Avoid", "Chance", "Dormant", "Freedom", "Here", "Hidden", "Home", "Leave",
                "Lifetime", "Lose", "Maintained", "Medium", "Miracle", "Never", "Passwords", "Problem", "Remove", "Reverses",
                "Sample", "Satisfaction", "Solution", "Stop", "Success", "Teen", "Wife",
            )
        ),

        "Greetings" : (
            "",
            (
                "Dear ", "Friend", "Hello",
            )
        ),

        "Marketing" : (
            "",
            (
                "Ad", "Auto email removal", "Bulk email", "Click", "Click below", "Click here", "Click to remove", "Direct email",
                "Direct marketing", "Email harvest", "Email marketing", "Form", "Increase sales", "Increase traffic",
                "Increase your sales", "Internet market", "Internet marketing", "Marketing", "Marketing solutions", "Mass email",
                "Member", "Month trial offer", "More Internet Traffic", "Multi level marketing", "Notspam", "One time mailing",
                "Online marketing", "Open", "Opt in", "Performance", "Removal instructions", "Sale", "Sales",
                "Search engine listings", "Search engines", "Subscribe", "The following form", "This isn't junk", "This isn't spam",
                "Undisclosed recipient", "Unsubscribe", "Visit our website", "We hate spam", "Web traffic", "Will not believe your eyes",
            )
        ),

        "Medical" : (
            "",
            (
                "Cures baldness", "Diagnostic", "Fast Viagra delivery", "Human growth hormone", "Life insurance",
                "Lose weight", "Lose weight spam", "Medicine", "No medical exams", "Online pharmacy", "Removes wrinkles",
                "Reverses aging", "Stop snoring", "Valium", "Viagra", "Vicodin", "Weight loss", "Xanax",
            )
        ),

        "Numbers" : (
            "",
            (
                "#1", "100% free", "100% satisfied", "4U", "50% off", "Billion", "Billion dollars", "Join millions",
                "Join millions of Americans", "Million", "One hundred percent guaranteed", "Thousands",
            )
        ),

        "Offers" : (
            "",
            (
                "Being a member", "Billing address", "Call", "Cannot be combined with any other offer",
                "Confidentially on all orders", "Deal", "Financial freedom", "Gift certificate", "Giving away",
                "Guarantee", "Have you been turned down?", "If only it were that easy", "Important information regarding",
                "In accordance with laws", "Long distance phone offer", "Mail in order form", "Message contains",
                "Name brand", "Nigerian", "No age restrictions", "No catch", "No claim forms", "No disappointment",
                "No experience", "No gimmick", "No inventory", "No middleman", "No obligation", "No purchase necessary",
                "No questions asked", "No selling", "No strings attached", "No-obligation", "Not intended",
                "Obligation", "Off shore", "Offer", "Per day", "Per week", "Priority mail", "Prize", "Prizes",
                "Produced and sent out", "Reserves the right", "Shopping spree", "Stuff on sale", "Terms and conditions",
                "The best rates", "They’re just giving it away", "Trial", "Unlimited", "Unsolicited", "Vacation",
                "Vacation offers", "Warranty", "We honor all", "Weekend getaway", "What are you waiting for?", "Who really wins?",
                "Win", "Winner", "Winning", "Won", "You are a winner!", "You have been selected", "You’re a Winner!",
            )
        ),

        "Calls-to-Action" : (
            "",
            (
                "Cancel at any time", "Compare", "Copy accurately", "Get", "Give it away", "Print form signature",
                "Print out and fax", "See for yourself", "Sign up free today",
            )
        ),

        "Free" : (
            "",
            (
                "Free", "Free access", "Free cell phone", "Free consultation", "Free DVD", "Free gift", "Free grant money",
                "Free hosting", "Free installation", "Free Instant", "Free investment", "Free leads", "Free membership",
                "Free money", "Free offer", "Free preview", "Free priority mail", "Free quote", "Free sample",
                "Free trial", "Free website",
            )
        ),

        "Descriptions/Adjectives" : (
            "",
            (
                "All natural", "All new", "Amazing", "Certified", "Congratulations", "Drastically reduced", "Fantastic deal",
                "For free", "Guaranteed", "It’s effective", "Outstanding values", "Promise you", "Real thing",
                "Risk free", "Satisfaction guaranteed",
            )
        ),

        "Sense of Urgency" : (
            "",
            (
                "Access", "Act now!", "Apply now", "Apply online", "Call free", "Call now", "Can't live without", "Do it today",
                "Don't delete", "Don't hesitate", "For instant access", "For Only", "For you", "Get it now", "Get started now",
                "Great offer", "Info you requested", "Information you requested", "Instant", "Limited time", "New customers only",
                "Now", "Now only", "Offer expires", "Once in lifetime", "One time", "Only", "Order now", "Order today",
                "Please read", "Special promotion", "Supplies are limited", "Take action now", "Time limited", "Urgent",
                "While supplies last",
            )
        ),

        "Nouns" : (
            "",
            (
                "Addresses on CD", "Beverage", "Bonus", "Brand new pager", "Cable converter", "Casino", "Celebrity",
                "Copy DVDs", "Laser printer", "Legal", "Luxury car", "New domain extensions", "Phone", "Rolex", "Stainless steel"
            )
        )
    }

    def __init__(self, options):
        self.options = options
        self.results = {}

    def parse(self, html):
        self.html = html
        self.soup = BeautifulSoup(html, features="lxml")

        self.results['Embedded Images']                                         = self.testEmbeddedImages()
        self.results['Images without ALT']                                      = self.testImagesNoAlt()
        self.results['Masqueraded Links']                                       = self.testMaskedLinks()
        self.results['Use of underline tag <u>']                                = self.testUnderlineTag()
        self.results['HTML code in <a> link tags']                              = self.testLinksWithHtmlCode()
        self.results['<a href="..."> URL contained GET parameter']              = self.testLinksWithGETParams()
        self.results['<a href="..."> URL contained GET parameter with URL']     = self.testLinksWithGETParamsBeingURLs()
        self.results['<a href="..."> URL pointed to an executable file']        = self.testLinksWithDangerousExtensions()
        self.results['Mail message contained suspicious words']                 = self.testSuspiciousWords()

        return {k: v for k, v in self.results.items() if v}

    @staticmethod
    def context(tag):
        s = str(tag)

        if len(s) < 100:
            return s

        beg = s[:50]
        end = s[-50:]

        return f'{beg}...{end}'

    def testUnderlineTag(self):
        links = self.soup('u')

        if not links or len(links) == 0:
            return []

        desc = 'Underline tags are recognized by anti-spam filters and trigger additional rule (Office365: 67856001), but by their own shouldnt impact spam score.'
        result = f'- Found {len(links)} <u> tags. This is not by itself an indication of spam, but is known to trigger some rules (like Office365: 67856001)\n'

        context = ''
        for i in range(len(links)):
            context += str(links[i]) + '\n\n'
            if i > 5: break

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testSuspiciousWords(self):
        desc = '''

Input text message contained words considered as suspicious in context of E-Mails.
Therefore you will have better chances of delivering your phishing e-mail when you get rid of them.

'''
        context = ''
        result = ''

        text = self.html
        foundWords = set()
        totalChecked = 0
        totalFound = 0

        for title, words in PhishingMailParser.Suspicious_Words.items():
            found = set()

            for word in words[1]:
                if word.lower() in foundWords:
                    continue

                totalChecked += 1
                if re.search(r'\b' + re.escape(word) + r'\b', text, re.I):
                    found.add(word.lower())

                    foundWords.add(word.lower())
                    pos = text.find(word.lower())

                    if pos != -1:
                        line = ''
                        N = 50
                        if pos > N:
                            line = text[pos-N:pos]

                        line += text[pos:pos+N]
                        pos2 = line.find(word.lower())

                        line = line[:pos2] + logger.colored(line[pos2:pos2+len(word)], "red") + line[pos2+len(word):]
                        line = line.replace('\n', '')
                        line = re.sub(r' {2,}', '  ', line)

                        context += '\n' + line + '\n'

            if len(found) > 0:
                totalFound += len(found)
                result += f'- Found {logger.colored(len(found), "red")} {logger.colored(title, "yellow")} words {logger.colored(words[0], "cyan")}:\n'

                for w in found:
                    result += f'\t- {w}\n'

                result += '\n'

        if totalFound == 0:
            return {}

        result += f'- Found in total {logger.colored(totalFound, "red")} suspicious words (out of {totalChecked} total checked).\n'

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithHtmlCode(self):
        links = self.soup('a')

        desc = 'Links that contain HTML code within <a> ... </a> may increase Spam score heavily'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            text = str(link)
            pos = text.find('>')
            code = text[pos+1:]

            m = re.search(r'(.+)<\s*/\s*a\s*>', code, re.I)
            if m:
                code = m.group(1)

            suspicious = '<' in text and '>' in text

            if suspicious:
                num += 1

                if num < 5:
                    N = 70
                    tmp = text[:N]

                    if len(text) > N:
                        tmp += ' ... ' + text[-N:]

                    context += tmp + '\n'

                    code2 = PhishingMailParser.context(code)
                    context += f"\n\t- {logger.colored('Code inside of <a> tag:','red')}\n\t\t" + logger.colored(code2, 'yellow') + '\n'

        if num > 0:
            result += f'- Found {num} <a> tags that contained HTML code inside!\n'
            result +=  '\t  Links conveying HTML code within <a> ... </a> may greatly increase message Spam score!\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }


    def testLinksWithGETParams(self):
        links = self.soup('a')

        desc = 'Links with URLs containing GET parameters will be noticed by anti-spam filters resulting in another rule triggering on message (Office365: 21615005).'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            params = dict(parse.parse_qsl(parse.urlsplit(href).query))

            if len(params) > 0:
                num += 1

                if num < 5:
                    context += PhishingMailParser.context(link) + '\n'
                    hr = href[:90]
                    pos = hr.find('?')
                    hr = hr[:pos] + logger.colored(hr[pos:], 'yellow')

                    context += f'\thref = "{hr}"\n'
                    context += f'\ttext = "{text[:90]}"\n\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs containing GET params.\n'
            result +=  '\t  Links with URLs that contain GET params might trigger anti-spam rule (Office365: 21615005)\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithDangerousExtensions(self):
        links = self.soup('a')

        desc = 'Message contained <a> tags with href="..." links pointing to a file with dangerous extension (such as .exe)'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            parsed = parse.urlsplit(href)

            if '.' not in parsed.path:
                continue

            pos = parsed.path.rfind('.')
            if pos == -1:
                continue

            extension = parsed.path.lower()[pos:]

            if extension in executable_extensions:
                num += 1

                if num < 5:
                    context += PhishingMailParser.context(link) + '\n'
                    hr = href[:90]
                    pos1 = hr.lower().find(extension.lower())

                    hr = logger.colored(hr[:pos1], 'yellow') + logger.colored(hr[pos1:pos1+len(extension)], 'red') + logger.colored(hr[pos1+len(extension):], 'yellow')

                    context += f'\thref = "{hr}"\n'
                    context += f'\ttext = "{text[:90]}"\n\n'

                    context += f'\tExtension matched: {logger.colored(extension, "red")}\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs pointing to files with dangerous extensions (such as .exe).\n'
            result +=  '\t  Links with URLs that point to potentially executable files might trigger anti-spam rule (Office365: 460985005)\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testLinksWithGETParamsBeingURLs(self):
        links = self.soup('a')

        desc = 'Links with URLs that contain GET parameters pointing to another URL, will trigger two Office365 anti-spam rules (Office365: 45080400002).'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()
            params = dict(parse.parse_qsl(parse.urlsplit(href).query))

            url = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')

            if len(params) > 0:
                for k, v in params.items():
                    m = url.match(v)

                    if m:
                        urlmatched = m.group(1)
                        num += 1

                        if num < 5:
                            context += PhishingMailParser.context(link) + '\n'

                            hr = href[:90]
                            hr = logger.colored(hr, 'yellow')

                            context += f'\thref = "{hr}"\n'
                            context += f'\ttext = "{text[:90]}"\n\n'
                            context += f'\thref URL GET parameter contained another URL:\n\t\t' + logger.colored(v, "red") + '\n'

        if num > 0:
            result += f'- Found {num} <a> tags with href="..." URLs containing GET params containing another URL.\n'
            result +=  '\t  Links with URLs that contain GET params with another URL might trigger anti-spam rule (Office365: 45080400002)\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }


    def testMaskedLinks(self):
        links = self.soup('a')

        desc = 'Links that masquerade their href= attribute by displaying different link are considered harmful and will increase Spam score.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for link in links:
            try:
                href = link['href']
            except:
                continue

            text = link.getText()

            url = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')
            url2 = re.compile(r'((http|https)\:\/\/)[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')

            m1 = url.match(href)
            m2 = url2.search(text)

            if m1 and m2:
                num += 1

                if num < 5:
                    context += PhishingMailParser.context(link) + '\n'
                    context += f'\thref = "{logger.colored(href[:90],"green")}"\n'
                    context += f'\ttext = "{logger.colored(text[:90],"red")}"\n\n'

        if num > 0:
            result += f'- Found {num} <a> tags that masquerade their href="" links with text!\n'
            result +=  '\t  Links that try to hide underyling URL are harmful and will be considered as Spam!\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testImagesNoAlt(self):
        images = self.soup('img')

        desc = 'Images without ALT="value" attribute may increase Spam scorage.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for img in images:
            src = img['src']
            alt = ''

            try:
                alt = img['alt']
            except:
                pass

            if alt == '':
                num += 1

                if num < 5:
                    context += PhishingMailParser.context(img) + '\n\n'

        if num > 0:
            result += f'- Found {num} <img> tags without ALT="value" attribute.\n'
            result +=  '\t  Images without alternate text set in their attribute may increase Spam score\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }

    def testEmbeddedImages(self):
        images = self.soup('img')

        desc = 'Embedded images can increase Spam Confidence Level (SCL) in Office365 by 4 points. Embedded images are those with <img src="data:image/png;base64,<BLOB>"/> . They should be avoided.'
        context = ''
        result = ''
        num = 0
        embed = ''

        for img in images:
            src = img['src']
            alt = ''

            try:
                alt = img['alt']
            except:
                pass

            if src.lower().startswith('data:image/'):
                if len(embed) == 0:
                    embed = src[:30]

                num += 1

                if num < 5:
                    if len(alt) > 0:
                        context += f'- ALT="{alt}": ' + PhishingMailParser.context(img) + '\n'
                    else:
                        ctx = PhishingMailParser.context(img)
                        pos = ctx.find('data:')
                        pos2 = ctx.find('"', pos+1)

                        ctx = logger.colored(ctx[:pos], 'yellow') + logger.colored(ctx[pos:pos2], 'red') + logger.colored(ctx[pos2:], 'yellow')

                        context += ctx + '\n'

        if num > 0:
            result += f'- Found {num} <img> tags with embedded image ({embed}).\n'
            result +=  '\t  Embedded images increase Office365 SCL (Spam) level by 4 points!\n'

        if len(result) == 0:
            return []

        return {
            'description' : desc,
            'context' : context,
            'analysis' : result
        }


def printOutput(out):
    if options['format'] == 'text':
        width = 100
        num = 0

        for k, v in out.items():
            num += 1
            analysis = v['analysis'].strip()
            context = v['context'].strip()
            desc = '\n'.join(textwrap.wrap(
                v['description'],
                width = 80,
                initial_indent = '',
                subsequent_indent = '    '
            )).strip()

            analysis = analysis.replace('- ', '\t- ')

            print(f'''
------------------------------------------
({num}) Test: {logger.colored(k, "cyan")}

{logger.colored("DESCRIPTION", "blue")}:

    {desc}

{logger.colored("CONTEXT", "blue")}:

    {context}

{logger.colored("ANALYSIS", "blue")}:

    {analysis}
''')

    elif options['format'] == 'json':
        print(json.dumps(out))

def opts(argv):
    global options
    global headers

    o = argparse.ArgumentParser(
        usage = 'phishing-HTML-linter.py [options] <file.html>'
    )

    req = o.add_argument_group('Required arguments')
    req.add_argument('file', help = 'Input HTML file')

    args = o.parse_args()
    options.update(vars(args))
    return args

def main(argv):
    args = opts(argv)
    if not args:
        return False

    print('''
    :: Phishing HTML Linter
    Shows you bad smells in your HTML code that will get your mails busted!
    Mariusz Banach / mgeeky
''')

    html = ''
    with open(args.file, 'rb') as f:
        html = f.read()

    p = PhishingMailParser({})
    ret = p.parse(html.decode())

    if len(ret) > 0:
        printOutput(ret)

    else:
        print('\n[+] Congrats! Your message does not have any known bad smells that could trigger anti-spam rules.\n')


if __name__ == '__main__':
    main(sys.argv)