mgeeky-Penetration-Testing-.../web/reencode.py

#!/usr/bin/python

# 
# ReEncoder.py - script allowing for recursive encoding detection, decoding and then re-encoding. 
# To be used for instance in fuzzing purposes. Imagine you want to fuzz XML parameters within 
# **PaReq** packet of 3DSecure standard. This packet has been ZLIB compressed, then Base64 encoded, 
# then URLEncoded. In order to modify the inner XML you would need to peel off those encoding layers
# and then reapply them in reversed order. This script allows you to do that in an automated manner.
# 
# NOTICE:
#   If the input string's length is divisible by 4, Base64 will be able to decode it - thus, the script
#   would wrongly assume it has been encoded using Base64. The same goes for Hex decoding.
#   In order to tackle this issue, the script builds up a tree of possible encoding schemes and then evaluates
#   that tree by choosing the best fitting encodings path (the one scoring the most points, counted from the
#   resulting text's length, entropy and printability).
#
# Requires:
#   - jwt
#   - anytree
#
# Mariusz Banach, 2018
#

import os
import re
import sys
import jwt
import zlib
import math
import base64
import urllib
import string
import anytree
import binascii
from collections import Counter


# =============================================
# RE-ENCODER'S IMPLEMENTATION
#


class ReEncoder:
    """Recursively detect, peel off and re-apply stacked encodings.

    Typical use::

        coder = ReEncoder()
        plain = coder.decode(blob)       # auto-detects the encoding layers
        blob2 = coder.encode(modified)   # re-applies them in reverse order

    The detected (or explicitly supplied) layers are kept in
    ``self.encodings`` in decode order, i.e. outermost layer first.

    NOTE: the implementation targets Python 2 - it calls ``urllib.quote`` /
    ``urllib.unquote`` and treats ``str`` as a byte string.
    """

    # Switch this to show some verbose informations about decoding process.
    DEBUG = True

    PREFER_AUTO = 0     # Automatically determine final output format
    PREFER_TEXT = 1     # Prefer text/printable final output format
    PREFER_BINARY = 2   # Prefer binary final output format

    class Utils:
        """Small stateless helpers shared by the encoders."""

        @staticmethod
        def isBinaryData(data):
            """Return True when at least ~10% of `data` is non-printable.

            CR, LF and the 0x20-0x7f range count as printable. Empty input is
            never binary, and at least one non-printable character is always
            required - otherwise the integer threshold would round down to 0
            for short inputs and classify every short text as binary.
            """
            if not data:
                return False

            percOfBinaryToAssume = 0.10
            nonBinary = 0

            for d in data:
                # Iterating bytes yields ints on Python 3, 1-char strings otherwise.
                c = d if isinstance(d, int) else ord(d)
                if c in (10, 13) or 0x20 <= c <= 0x7f:
                    nonBinary += 1

            binary = len(data) - nonBinary
            threshold = max(1, int(percOfBinaryToAssume * len(data)))
            return binary >= threshold

    # ============================================================
    # ENCODERS SECTION
    #

    class Encoder:
        """Interface every encoding layer implements."""

        def name(self):
            """Return the unique name used as key in `encodersMap`."""
            raise NotImplementedError

        def check(self, data):
            """Return True when `data` looks like it was produced by this encoder."""
            raise NotImplementedError

        def encode(self, data):
            """Return `data` with this encoding layer applied."""
            raise NotImplementedError

        def decode(self, data):
            """Return `data` with this encoding layer removed."""
            raise NotImplementedError

    class NoneEncoder(Encoder):
        """Identity layer; matches any non-empty input."""

        def name(self):
            return 'None'

        def check(self, data):
            return bool(data)

        def encode(self, data):
            return data

        def decode(self, data):
            return data

    class URLEncoder(Encoder):
        """Percent-encoding via `urllib.quote` / `urllib.unquote`."""

        def name(self):
            return 'URLEncoder'

        def check(self, data):
            unquoted = urllib.unquote(data)

            # Clean round-trip that actually changed something.
            if urllib.quote(unquoted) == data and unquoted != data:
                return True

            # Otherwise any %XX escape anywhere in the input is enough.
            return bool(re.search(r'(?:%[0-9a-f]{2})+', data, re.I))

        def encode(self, data):
            return urllib.quote(data)

        def decode(self, data):
            return urllib.unquote(data)

    class HexEncoder(Encoder):
        """Hexadecimal (base16) layer."""

        def name(self):
            return 'HexEncoded'

        def check(self, data):
            # Whole pairs of hex digits only: odd-length input cannot be
            # unhexlified and would crash the detection loop in decode().
            return bool(re.match(r'^(?:[0-9a-f]{2})+\Z', data, re.I))

        def encode(self, data):
            return binascii.hexlify(data)

        def decode(self, data):
            # No strip() here: leading/trailing whitespace bytes are part of
            # the payload and must survive a decode/encode round-trip.
            return binascii.unhexlify(data)

    class Base64Encoder(Encoder):
        """Standard-alphabet Base64 layer."""

        def name(self):
            return 'Base64'

        def check(self, data):
            try:
                if base64.b64encode(base64.b64decode(data)) != data:
                    return False
            except Exception:
                # Any decoding failure simply means "not Base64".
                return False

            pattern = r'^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=|[A-Za-z0-9+\/]{4})$'
            return bool(re.match(pattern, data, re.I))

        def encode(self, data):
            return base64.b64encode(data)

        def decode(self, data):
            return base64.b64decode(data)

    class Base64URLSafeEncoder(Encoder):
        """URL-safe alphabet ('-' and '_') Base64 layer."""

        def name(self):
            return 'Base64URLSafe'

        def check(self, data):
            try:
                if base64.urlsafe_b64encode(base64.urlsafe_b64decode(data)) != data:
                    return False
            except Exception:
                # Any decoding failure simply means "not URL-safe Base64".
                return False

            pattern = r'^(?:[A-Za-z0-9\-_]{4})*(?:[A-Za-z0-9\-_]{2}==|[A-Za-z0-9\-_]{3}=|[A-Za-z0-9\-_]{4})$'
            return bool(re.match(pattern, data, re.I))

        def encode(self, data):
            return base64.urlsafe_b64encode(data)

        def decode(self, data):
            return base64.urlsafe_b64decode(data)

    class JWTEncoder(Encoder):
        """JSON Web Token layer (decoded without signature verification)."""

        # Key used to sign tokens produced by encode().
        secret = ''

        def name(self):
            return 'JWT'

        def check(self, data):
            try:
                jwt.decode(data, verify = False)
                return True
            except jwt.exceptions.DecodeError:
                return False

        def encode(self, data):
            # The class is nested, so a bare `JWTEncoder` name is not in scope
            # inside a method; reach the class attribute through the instance.
            return jwt.encode(data, self.secret)

        def decode(self, data):
            return jwt.decode(data, verify = False)

    class ZlibEncoder(Encoder):
        """zlib compression layer."""

        def name(self):
            return 'ZLIB'

        def check(self, data):
            if not ReEncoder.Utils.isBinaryData(data):
                return False

            try:
                # Accept only streams that re-compress to the very same bytes.
                return zlib.compress(zlib.decompress(data)) == data
            except Exception:
                # Any decompression failure simply means "not zlib".
                return False

        def encode(self, data):
            return zlib.compress(data)

        def decode(self, data):
            return zlib.decompress(data)


    # ============================================================
    # ENCODING DETECTION IMPLEMENTATION
    #

    MaxEncodingDepth = 20

    def __init__(self):
        """Register the built-in encoders in their detection order."""
        self.encodings = []
        self.encoders = (
            ReEncoder.URLEncoder(),
            ReEncoder.HexEncoder(),
            ReEncoder.Base64Encoder(),
            ReEncoder.Base64URLSafeEncoder(),
            ReEncoder.JWTEncoder(),
            ReEncoder.ZlibEncoder(),

            # None must always be the last detector
            ReEncoder.NoneEncoder(),
        )
        self.encodersMap = {}
        self.data = ''
        self.preferredOutputFormat = ReEncoder.PREFER_AUTO

        for encoder in self.encoders:
            self.encodersMap[encoder.name()] = encoder

    @staticmethod
    def log(text):
        """Print `text` when `ReEncoder.DEBUG` is enabled."""
        if ReEncoder.DEBUG:
            print(text)

    def verifyEncodings(self, encodings):
        """Validate a user-supplied list of encodings.

        Each entry must be the name of a registered encoder or an instance of
        `ReEncoder.Encoder`.

        Raises:
            Exception: on an unknown name or an entry of any other type.
        """
        for encoder in encodings:
            if isinstance(encoder, str):
                if encoder not in self.encodersMap:
                    raise Exception("Passed unknown encoder's name.")
            elif not isinstance(encoder, ReEncoder.Encoder):
                raise Exception("Passed encoder is of unknown type.")

    def _resolveEncoder(self, encoder):
        """Map a registered encoder name to its instance; pass instances through."""
        if isinstance(encoder, ReEncoder.Encoder):
            return encoder
        return self.encodersMap[encoder]

    def generateEncodingTree(self, data):
        """Yield `(encoderName, data, isBranch)` tuples while peeling layers off.

        For every detected layer the decoded data is yielded with
        `isBranch=False`. When the detection is ambiguous (Base64 / Hex), an
        extra `('None', undecodedData, True)` tuple is yielded first so the
        caller can keep the "it was not encoded at all" alternative.
        """
        step = 0
        maxSteps = len(self.encoders) * ReEncoder.MaxEncodingDepth

        peeledOff = 0
        currData = data

        while step < maxSteps:
            peeledBefore = peeledOff

            for encoder in self.encoders:
                step += 1
                encoderName = encoder.name()

                ReEncoder.log('[.] Trying: {} (peeled off: {}). Current form: "{}"'.format(encoderName, peeledOff, currData))

                if not encoder.check(currData):
                    continue

                # The identity encoder never peels anything off.
                if encoderName == 'None':
                    continue

                lowered = encoderName.lower()

                if lowered.startswith('base64') and (len(currData) % 4 == 0):
                    ReEncoder.log('[.] Unclear situation whether input ({}) is Base64 encoded. Branching.'.format(
                        currData
                    ))

                    yield ('None', currData, True)

                if lowered.startswith('hex') and (len(currData) % 2 == 0):
                    ReEncoder.log('[.] Unclear situation whether input ({}) is Hex encoded. Branching.'.format(
                        currData
                    ))

                    yield ('None', currData, True)

                ReEncoder.log('[+] Detected encoder: {}'.format(encoderName))

                currData = encoder.decode(currData)
                yield (encoderName, currData, False)

                peeledOff += 1

                # Restart detection from the first encoder on the new data.
                break

            # A full pass without peeling anything off means we are done.
            if peeledOff == peeledBefore:
                break

    def formEncodingCandidates(self, root):
        """Return `[name, decoded, 0.0]` entries for the deepest tree level."""
        levels = [list(children) for children in anytree.LevelOrderGroupIter(root)]

        candidates = []

        for node in levels[-1]:
            ReEncoder.log('[.] Candidate for best decode using {}: "{}"...'.format(
                node.name, node.decoded[:20]
            ))

            candidates.append([node.name, node.decoded, 0.0])

        return candidates

    @staticmethod
    def entropy(data, unit='natural'):
        """Return the entropy of the symbol distribution in `data`.

        `unit` selects the logarithm base: 'shannon' (2), 'natural' (e) or
        'hartley' (10). Inputs of length 0 or 1 yield 0.
        """
        base = {
            'shannon' : 2.,
            'natural' : math.exp(1),
            'hartley' : 10.
        }

        if len(data) <= 1:
            return 0

        total = len(data)
        ent = 0

        for count in Counter(data).values():
            p = float(count) / total
            if p > 0.:
                ent -= p * math.log(p, base[unit])

        return ent

    def evaluateEncodingTree(self, root):
        """Return the encodings path of the winning candidate.

        Both the printable-oriented and the binary-oriented scorings are
        computed; `self.preferredOutputFormat` forces one of them, otherwise
        the higher-scored one wins (printable on a tie).
        """
        (printableEncodings, printableCandidate) = self.evaluateEncodingTreePicker(root, False)
        (binaryEncodings, binaryCandidate) = self.evaluateEncodingTreePicker(root, True)

        if self.preferredOutputFormat == ReEncoder.PREFER_TEXT:
            ReEncoder.log('Returning text/printable output format as requested preferred one.')
            return printableEncodings
        elif self.preferredOutputFormat == ReEncoder.PREFER_BINARY:
            ReEncoder.log('Returning binary output format as requested preferred one.')
            return binaryEncodings
        else:
            ReEncoder.log('Trying to determine preferred output format...')

        ReEncoder.log('\n---------------------------------------')
        ReEncoder.log('[>] Winning printable encoding path scored: {} points.'.format(
            printableCandidate[2]
        ))
        ReEncoder.log('[>] Winning binary encoding path scored: {} points.'.format(
            binaryCandidate[2]
        ))

        if(printableCandidate[2] >= binaryCandidate[2]):
            ReEncoder.log('\n[+] Choosing all-time winner: PRINTABLE output format.')
            return printableEncodings

        ReEncoder.log('\n[+] Choosing all-time winner: BINARY output format.')
        ReEncoder.log('---------------------------------------\n')
        return binaryEncodings

    def evaluateEncodingTreePicker(self, root, preferBinary):
        """Score the candidates and return `(encodingNames, winningCandidate)`.

        The first candidate with the highest score wins. `encodingNames` are
        the node names on the path from the root to the winning node, with the
        'None' placeholders removed.
        """
        candidates = self._evaluateEncodingTreeWorker(root, preferBinary)

        # max() keeps the first of equally scored candidates.
        maxCandidate = max(range(len(candidates)), key = lambda i: candidates[i][2])

        winningCandidate = candidates[maxCandidate]
        winningPaths = anytree.search.findall_by_attr(
            root,
            name = 'decoded',
            value = winningCandidate[1]
        )

        ReEncoder.log('[?] Other equally good candidate paths:\n' + str(winningPaths))
        winningPath = winningPaths[0]

        preferred = 'binary' if preferBinary else 'printable'

        ReEncoder.log('[+] Winning decode path for {} output is:\n{}'.format(
            preferred,
            str(winningPath))
        )

        # Compare the node's *name*: a node object never equals the string 'None'.
        encodings = [x.name for x in winningPath.path if x.name != 'None']
        return (encodings, winningCandidate)

    def _evaluateEncodingTreeWorker(self, root, preferBinary = False):
        """Assign points to every candidate of the deepest tree level.

        Points are a weighted sum of the printable (or non-printable) character
        ratio, the entropy and the length of the decoded data. Returns the
        candidates list with the score stored at index 2.
        """
        weights = {
            'unreadableChars' : 0.0,
            'printableChars' : 9.6,
            'entropyScore' : 4.0,
            'length' : 1.0,
        }

        if preferBinary:
            weights['unreadableChars'] = 24.0
            weights['printableChars'] = 0.0
            weights['entropyScore'] = 2.666667

        candidates = self.formEncodingCandidates(root)

        for i, candidate in enumerate(candidates):
            name = candidate[0]
            decoded = candidate[1]
            points = float(candidate[2])
            entropy = ReEncoder.entropy(decoded)
            printables = sum(int(x in string.printable) for x in decoded)
            nonprintables = len(decoded) - printables

            ReEncoder.log('[=] Evaluating candidate: {} (entropy: {}, data: "{}")'.format(
                name, entropy, decoded
            ))

            # Empty data would divide by zero; both ratios are 0 in that case anyway.
            total = float(len(decoded)) or 1.0

            # Step 1: Adding points for printable percentage.
            printablePoints = float(weights['printableChars']) * (float(printables) / total)
            nonPrintablePoints = float(weights['unreadableChars']) * (float(nonprintables) / total)

            # Step 2: If encoder is Base64 and was previously None
            #    - then length and entropy of previous values should be of slighly lower weights
            followedByBase64 = name.lower() == 'none' \
                and len(candidates) > i + 1 \
                and candidates[i + 1][0].lower().startswith('base64')

            if followedByBase64:
                ReEncoder.log('\tAdding fine for being base64')
                entropyPoints = entropy * (weights['entropyScore'] * 0.666666)
                lengthPoints = float(len(decoded)) * (weights['length'] * 0.666666)
            else:
                entropyPoints = entropy * weights['entropyScore']
                lengthPoints = float(len(decoded)) * weights['length']

            # Step 3: Add points for the dominating character class and entropy.
            if printables > nonprintables:
                ReEncoder.log('More printable chars than binary ones.')
                ReEncoder.log('\tAdding {} points for printable entropy.'.format(entropyPoints))

                ReEncoder.log('\tAdding {} points for printable characters.'.format(printablePoints))
                points += printablePoints
            else:
                ReEncoder.log('More binary chars than printable ones.')
                ReEncoder.log('\tAdding {} points for binary entropy.'.format(entropyPoints))

                ReEncoder.log('\tAdding {} points for binary characters.'.format(nonPrintablePoints))
                points += nonPrintablePoints

            points += entropyPoints

            # Step 4: Add points for length
            ReEncoder.log('\tAdding {} points for length.'.format(lengthPoints))
            points += lengthPoints

            ReEncoder.log('\tScored in total: {} points.'.format(points))
            candidate[2] = points

        return candidates


    def getWinningDecodePath(self, root):
        """Return the winning encodings path without 'None' placeholders."""
        return [x for x in self.evaluateEncodingTree(root) if x != 'None']

    def process(self, data):
        """Build the encoding tree for `data` and store the winning path in `self.encodings`."""
        root = anytree.Node('None', decoded = data)
        prev = root

        for (name, curr, branch) in self.generateEncodingTree(data):
            ReEncoder.log('[*] Generator returned: ("{}", "{}", {})'.format(
                name, curr[:20], str(branch)
            ))

            currNode = anytree.Node(name, parent = prev, decoded = curr)

            # Branch nodes are dead-end alternatives; only real layers extend the chain.
            if not branch:
                prev = currNode

        for pre, fill, node in anytree.RenderTree(root):
            if node.name != 'None':
                ReEncoder.log("%s%s (%s)" % (pre, node.name, node.decoded[:20].decode('ascii', 'ignore')))

        self.encodings = self.getWinningDecodePath(root)
        ReEncoder.log('[+] Selected encodings: {}'.format(str(self.encodings)))

    def decode(self, data, preferredOutputFormat = PREFER_AUTO, encodings = None):
        """Decode `data` and return the innermost form.

        Args:
            data: encoded input.
            preferredOutputFormat: PREFER_AUTO, PREFER_TEXT or PREFER_BINARY.
            encodings: optional explicit layers in decode order (outermost
                first), given as encoder names or Encoder instances. When
                omitted or empty, the layers are auto-detected.

        Raises:
            Exception: on an unknown output format or an invalid `encodings` entry.
        """
        knownFormats = (ReEncoder.PREFER_AUTO, ReEncoder.PREFER_TEXT, ReEncoder.PREFER_BINARY)
        if preferredOutputFormat not in knownFormats:
            raise Exception('Unknown preferred output format specified in decode(): {}'.format(
                preferredOutputFormat
            ))

        self.preferredOutputFormat = preferredOutputFormat

        if not encodings:
            self.process(data)
        else:
            self.verifyEncodings(encodings)
            # Keep a private copy so later changes to the caller's list do not leak in.
            self.encodings = list(encodings)

        for encoder in self.encodings:
            data = self._resolveEncoder(encoder).decode(data)

        return data

    def encode(self, data, encodings = None):
        """Encode `data` and return the outermost form.

        Args:
            data: plain input.
            encodings: optional layers in the order they are to be applied
                (innermost first), given as encoder names or Encoder
                instances. When omitted or empty, the layers remembered from
                the last decode()/encode() call are re-applied in reverse.

        Raises:
            Exception: on an invalid `encodings` entry.
        """
        if encodings:
            self.verifyEncodings(encodings)
            # self.encodings is kept in decode order; the caller's list is left untouched.
            self.encodings = list(reversed(encodings))

        for encoder in reversed(self.encodings):
            data = self._resolveEncoder(encoder).encode(data)

        return data


def main(argv):
    """Demo: decode the input, wrap it in 'FOO ' / ' BAR' and re-encode it.

    argv: command-line vector. With exactly one argument, `argv[1]` is the
        text to process, or the path of a file whose content is processed.
        With any other argument count the usage is printed and the built-in
        sample is used instead.
    """
    # Sample 1: ZLIB -> Base64 -> URLEncode
    sample = 'eJzzSM3JyVcozy%2FKSVFIK8rPVQhKdc1Lzk9JLVIEAIr8Cck%3D'

    # Sample 2: URLEncode -> Base64 -> HexEncode
    #sample = '4a5451344a5459314a545a6a4a545a6a4a545a6d4a5449774a5463334a545a6d4a5463794a545a6a4a5459304a5449784a5449774a544e684a544a6b4a544935'

    if len(argv) != 2:
        print('Usage: reencode.py <text|file>')
        print('Using sample: "{}"'.format(sample))
        text = sample
    else:
        text = argv[1]

        if os.path.isfile(text):
            # The context manager closes the handle even if read() raises.
            with open(text, 'rb') as f:
                text = f.read()

    decoder = ReEncoder()
    decoded = decoder.decode(text)

    print('(1) DECODED TEXT: "{}"'.format(decoded))

    decoded = 'FOO ' + decoded + ' BAR'

    print('\n(2) TO BE ENCODED TEXT: "{}"'.format(decoded))

    # Re-apply the layers detected during decode() to the modified payload.
    decoded = decoder.encode(decoded)
    print('(3) ENCODED FORM: "{}"'.format(decoded))

# Run the demo only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)
First 2018-02-02 22:22:43 +01:00			`#!/usr/bin/python`

			`#`
			`# ReEncoder.py - script allowing for recursive encoding detection, decoding and then re-encoding.`
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`# To be used for instance in fuzzing purposes. Imagine you want to fuzz XML parameters within`
			`# PaReq packet of 3DSecure standard. This packet has been ZLIB compressed, then Base64 encoded,`
Forgot to switch from DEBUG=True 2018-04-16 17:41:59 +02:00			`# then URLEncoded. In order to modify the inner XML you would need to peel off that encoding layers`
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`# and then reaplly them in reversed order. This script allows you to do that in an automated manner`
First 2018-02-02 22:22:43 +01:00			`#`
			`# NOTICE:`
			`# If the input string's length is divisble by 4, Base64 will be able to decode it - thus, the script`
			`# would wrongly assume it has been encoded using Base64. The same goes for Hex decoding.`
			`# In order to tackle this issue, the script builds up a tree of possible encoding schemes and then evaluate`
			`# that tree by choosing the best fitting encodings path (with most points counted upon resulted text's length,`
			`# entropy and printable'ity).`
			`#`
			`# Requires:`
			`# - jwt`
			`# - anytree`
			`#`
updates 2021-10-24 23:11:42 +02:00			`# Mariusz Banach, 2018`
First 2018-02-02 22:22:43 +01:00			`#`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`import os`
First 2018-02-02 22:22:43 +01:00			`import re`
			`import sys`
			`import jwt`
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`import zlib`
First 2018-02-02 22:22:43 +01:00			`import math`
			`import base64`
			`import urllib`
			`import string`
			`import anytree`
			`import binascii`
			`from collections import Counter`


Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00

			`# =============================================`
			`# RE-ENCODER'S IMPLEMENTATION`
			`#`


First 2018-02-02 22:22:43 +01:00			`class ReEncoder:`

			`# Switch this to show some verbose informations about decoding process.`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`DEBUG = True`

			`PREFER_AUTO = 0 # Automatically determine final output format`
			`PREFER_TEXT = 1 # Prefer text/printable final output format`
			`PREFER_BINARY = 2 # Prefer binary final output format`
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00
			`class Utils:`
			`@staticmethod`
			`def isBinaryData(data):`
			`nonBinary = 0`
			`percOfBinaryToAssume = 0.10`

			`for d in data:`
			`c = ord(d)`
			`if c in (10, 13):`
			`nonBinary += 1`
			`elif c >= 0x20 and c <= 0x7f:`
			`nonBinary += 1`

			`binary = len(data) - nonBinary`
			`return binary >= int(percOfBinaryToAssume * len(data))`
First 2018-02-02 22:22:43 +01:00
			`# ============================================================`
			`# ENCODERS SECTION`
			`#`

			`class Encoder:`
			`def name(self):`
			`raise NotImplementedError`

			`def check(self, data):`
			`raise NotImplementedError`

			`def encode(self, data):`
			`raise NotImplementedError`

			`def decode(self, data):`
			`raise NotImplementedError`

			`class NoneEncoder(Encoder):`
			`def name(self):`
			`return 'None'`

			`def check(self, data):`
			`if not data:`
			`return False`
			`return True`

			`def encode(self, data):`
			`return data`

			`def decode(self, data):`
			`return data`

			`class URLEncoder(Encoder):`
			`def name(self):`
			`return 'URLEncoder'`

			`def check(self, data):`
			`if urllib.quote(urllib.unquote(data)) == data and (urllib.unquote(data) != data):`
			`return True`

Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`if re.search(r'(?:%[0-9a-f]{2})+', data, re.I):`
First 2018-02-02 22:22:43 +01:00			`return True`

			`return False`

			`def encode(self, data):`
			`return urllib.quote(data)`

			`def decode(self, data):`
			`return urllib.unquote(data)`

			`class HexEncoder(Encoder):`
			`def name(self):`
			`return 'HexEncoded'`

			`def check(self, data):`
			`m = re.match(r'^[0-9a-f]+$', data, re.I)`
			`if m:`
			`return True`
			`return False`

			`def encode(self, data):`
			`return binascii.hexlify(data).strip()`

			`def decode(self, data):`
			`return binascii.unhexlify(data).strip()`

			`class Base64Encoder(Encoder):`
			`def name(self):`
			`return 'Base64'`

			`def check(self, data):`
			`try:`
			`if base64.b64encode(base64.b64decode(data)) == data:`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`m = re.match('^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==\|[A-Za-z0-9+\/]{3}=\|[A-Za-z0-9+\/]{4})$', data, re.I)`
			`if m:`
			`return True`
			`return False`
First 2018-02-02 22:22:43 +01:00			`except:`
			`pass`
			`return False`

			`def encode(self, data):`
			`return base64.b64encode(data)`

			`def decode(self, data):`
			`return base64.b64decode(data)`

			`class Base64URLSafeEncoder(Encoder):`
			`def name(self):`
			`return 'Base64URLSafe'`

			`def check(self, data):`
			`try:`
			`if base64.urlsafe_b64encode(base64.urlsafe_b64decode(data)) == data:`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`m = re.match('^(?:[A-Za-z0-9\-_]{4})*(?:[A-Za-z0-9\-_]{2}==\|[A-Za-z0-9\-_]{3}=\|[A-Za-z0-9\-_]{4})$', data, re.I)`
			`if m:`
			`return True`
			`return False`
First 2018-02-02 22:22:43 +01:00			`except:`
			`pass`
			`return False`

			`def encode(self, data):`
			`return base64.urlsafe_b64encode(data)`

			`def decode(self, data):`
			`return base64.urlsafe_b64decode(data)`

			`class JWTEncoder(Encoder):`
			`secret = ''`

			`def name(self):`
			`return 'JWT'`

			`def check(self, data):`
			`try:`
			`jwt.decode(data, verify = False)`
			`return True`
			`except jwt.exceptions.DecodeError:`
			`return False`

			`def encode(self, data):`
			`return jwt.encode(data, JWTEncoder.secret)`

			`def decode(self, data):`
			`return jwt.decode(data, verify = False)`

Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`class ZlibEncoder(Encoder):`
			`def name(self):`
			`return 'ZLIB'`

			`def check(self, data):`
			`if not ReEncoder.Utils.isBinaryData(data):`
			`return False`

			`try:`
			`if zlib.compress(zlib.decompress(data)) == data:`
			`return True`
			`except:`
			`pass`
			`return False`

			`def encode(self, data):`
			`return zlib.compress(data)`

			`def decode(self, data):`
			`return zlib.decompress(data)`


First 2018-02-02 22:22:43 +01:00
			`# ============================================================`
			`# ENCODING DETECTION IMPLEMENTATION`
			`#`

			`MaxEncodingDepth = 20`

			`def __init__(self):`
			`self.encodings = []`
			`self.encoders = (`
			`ReEncoder.URLEncoder(),`
			`ReEncoder.HexEncoder(),`
			`ReEncoder.Base64Encoder(),`
			`ReEncoder.Base64URLSafeEncoder(),`
			`ReEncoder.JWTEncoder(),`
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`ReEncoder.ZlibEncoder(),`
First 2018-02-02 22:22:43 +01:00
			`# None must always be the last detector`
			`ReEncoder.NoneEncoder(),`
			`)`
			`self.encodersMap = {}`
			`self.data = ''`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`self.preferredOutputFormat = ReEncoder.PREFER_AUTO`
First 2018-02-02 22:22:43 +01:00
			`for encoder in self.encoders:`
			`self.encodersMap[encoder.name()] = encoder`

			`@staticmethod`
			`def log(text):`
			`if ReEncoder.DEBUG:`
			`print(text)`

			`def verifyEncodings(self, encodings):`
			`for encoder in encodings:`
			`if type(encoder) == str:`
			`if not encoder in self.encodersMap.keys():`
			`raise Exception("Passed unknown encoder's name.")`
			`elif not issubclass(ReEncoder.Encoder, encoder):`
			`raise Exception("Passed encoder is of unknown type.")`

			`def generateEncodingTree(self, data):`
			`step = 0`
			`maxSteps = len(self.encoders) * ReEncoder.MaxEncodingDepth`

			`peeledBefore = 0`
			`peeledOff = 0`
			`currData = data`

			`while step < maxSteps:`
			`peeledBefore = peeledOff`
			`for encoder in self.encoders:`
			`step += 1`

			`ReEncoder.log('[.] Trying: {} (peeled off: {}). Current form: "{}"'.format(encoder.name(), peeledOff, currData))`

			`if encoder.check(currData):`
			`if encoder.name() == 'None':`
			`continue`

			`if encoder.name().lower().startswith('base64') and (len(currData) % 4 == 0):`
			`ReEncoder.log('[.] Unclear situation whether input ({}) is Base64 encoded. Branching.'.format(`
			`currData`
			`))`

			`yield ('None', currData, True)`

			`if encoder.name().lower().startswith('hex') and (len(currData) % 2 == 0):`
			`ReEncoder.log('[.] Unclear situation whether input ({}) is Hex encoded. Branching.'.format(`
			`currData`
			`))`

			`yield ('None', currData, True)`

			`ReEncoder.log('[+] Detected encoder: {}'.format(encoder.name()))`

			`currData = encoder.decode(currData)`
			`yield (encoder.name(), currData, False)`

			`peeledOff += 1`

			`break`

			`if (peeledOff - peeledBefore) == 0:`
			`break`

			`def formEncodingCandidates(self, root):`
			`iters = [[node for node in children] for children in anytree.LevelOrderGroupIter(root)]`

			`candidates = []`

			`for node in iters[-1]:`
			`name = node.name`
			`decoded = node.decoded`

			`ReEncoder.log('[.] Candidate for best decode using {}: "{}"...'.format(`
			`name, decoded[:20]`
			`))`

			`candidates.append([name, decoded, 0.0])`

			`return candidates`

			`@staticmethod`
			`def entropy(data, unit='natural'):`
			`base = {`
			`'shannon' : 2.,`
			`'natural' : math.exp(1),`
			`'hartley' : 10.`
			`}`

			`if len(data) <= 1:`
			`return 0`

			`counts = Counter()`

			`for d in data:`
			`counts[d] += 1`

			`probs = [float(c) / len(data) for c in counts.values()]`
			`probs = [p for p in probs if p > 0.]`

			`ent = 0`

			`for p in probs:`
			`if p > 0.:`
			`ent -= p * math.log(p, base[unit])`

			`return ent`

			`def evaluateEncodingTree(self, root):`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`(printableEncodings, printableCandidate) = self.evaluateEncodingTreePicker(root, False)`
			`(binaryEncodings, binaryCandidate) = self.evaluateEncodingTreePicker(root, True)`

			`if self.preferredOutputFormat == ReEncoder.PREFER_TEXT:`
			`ReEncoder.log('Returning text/printable output format as requested preferred one.')`
			`return printableEncodings`
			`elif self.preferredOutputFormat == ReEncoder.PREFER_BINARY:`
			`ReEncoder.log('Returning binary output format as requested preferred one.')`
			`return binaryEncodings`
			`else:`
			`ReEncoder.log('Trying to determine preferred output format...')`

			`ReEncoder.log('\n---------------------------------------')`
			`ReEncoder.log('[>] Winning printable encoding path scored: {} points.'.format(`
			`printableCandidate[2]`
			`))`
			`ReEncoder.log('[>] Winning binary encoding path scored: {} points.'.format(`
			`binaryCandidate[2]`
			`))`

			`if(printableCandidate[2] >= binaryCandidate[2]):`
			`ReEncoder.log('\n[+] Choosing all-time winner: PRINTABLE output format.')`
			`return printableEncodings`

			`ReEncoder.log('\n[+] Choosing all-time winner: BINARY output format.')`
			`ReEncoder.log('---------------------------------------\n')`
			`return binaryEncodings`

			`def evaluateEncodingTreePicker(self, root, preferBinary):`
			`candidates = self._evaluateEncodingTreeWorker(root, preferBinary)`

			`maxCandidate = 0`

			`for i in range(len(candidates)):`
			`candidate = candidates[i]`

			`name = candidate[0]`
			`decoded = candidate[1]`
			`points = float(candidate[2])`

			`if points > candidates[maxCandidate][2]:`
			`maxCandidate = i`

			`winningCandidate = candidates[maxCandidate]`
			`winningPaths = anytree.search.findall_by_attr(`
			`root,`
			`name = 'decoded',`
			`value = winningCandidate[1]`
			`)`

			`ReEncoder.log('[?] Other equally good candidate paths:\n' + str(winningPaths))`
			`winningPath = winningPaths[0]`

			`preferred = 'printable'`
			`if preferBinary:`
			`preferred = 'binary'`

			`ReEncoder.log('[+] Winning decode path for {} output is:\n{}'.format(`
			`preferred,`
			`str(winningPath))`
			`)`

			`encodings = [x.name for x in winningPath.path if x != 'None']`
			`return (encodings, winningCandidate)`

			`def _evaluateEncodingTreeWorker(self, root, preferBinary = False):`
First 2018-02-02 22:22:43 +01:00			`weights = {`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`'unreadableChars' : 0.0,`
			`'printableChars' : 9.6,`
			`'entropyScore' : 4.0,`
			`'length' : 1.0,`
First 2018-02-02 22:22:43 +01:00			`}`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`if preferBinary:`
			`weights['unreadableChars'] = 24.0`
			`weights['printableChars'] = 0.0`
			`weights['entropyScore'] = 2.666667`

First 2018-02-02 22:22:43 +01:00			`candidates = self.formEncodingCandidates(root)`

			`for i in range(len(candidates)):`
			`candidate = candidates[i]`

			`name = candidate[0]`
			`decoded = candidate[1]`
			`points = float(candidate[2])`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`entropy = ReEncoder.entropy(decoded)`
			`printables = sum([int(x in string.printable) for x in decoded])`
			`nonprintables = len(decoded) - printables`
First 2018-02-02 22:22:43 +01:00
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`ReEncoder.log('[=] Evaluating candidate: {} (entropy: {}, data: "{}")'.format(`
			`name, entropy, decoded`
First 2018-02-02 22:22:43 +01:00			`))`

			`# Step 1: Adding points for printable percentage.`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`printablePoints = float(weights['printableChars']) * (float(printables) / float(len(decoded)))`
			`nonPrintablePoints = float(weights['unreadableChars']) * (float(nonprintables) / float(len(decoded)))`
First 2018-02-02 22:22:43 +01:00
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`# Step 2: If encoder is Base64 and was previously None`
First 2018-02-02 22:22:43 +01:00			`# - then length and entropy of previous values should be of slighly lower weights`
			`if name.lower() == 'none' \`
			`and len(candidates) > i+1 \`
			`and candidates[i+1][0].lower().startswith('base64'):`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`ReEncoder.log('\tAdding fine for being base64')`
			`entropyPoints = entropy * (weights['entropyScore'] * 0.666666)`
			`lengthPoints = float(len(decoded)) * (weights['length'] * 0.666666)`
First 2018-02-02 22:22:43 +01:00			`else:`
Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`entropyPoints = entropy * weights['entropyScore']`
First 2018-02-02 22:22:43 +01:00			`lengthPoints = float(len(decoded)) * weights['length']`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`if printables > nonprintables:`
			`ReEncoder.log('More printable chars than binary ones.')`
			`ReEncoder.log('\tAdding {} points for printable entropy.'.format(entropyPoints))`

			`ReEncoder.log('\tAdding {} points for printable characters.'.format(printablePoints))`
			`points += printablePoints`
			`else:`
			`ReEncoder.log('More binary chars than printable ones.')`
			`ReEncoder.log('\tAdding {} points for binary entropy.'.format(entropyPoints))`

			`ReEncoder.log('\tAdding {} points for binary characters.'.format(nonPrintablePoints))`
			`points += nonPrintablePoints`

First 2018-02-02 22:22:43 +01:00			`points += entropyPoints`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`# Step 4: Add points for length`
First 2018-02-02 22:22:43 +01:00			`ReEncoder.log('\tAdding {} points for length.'.format(lengthPoints))`
			`points += lengthPoints`

			`ReEncoder.log('\tScored in total: {} points.'.format(points))`
			`candidates[i][2] = points`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`return candidates`
First 2018-02-02 22:22:43 +01:00

Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`def getWinningDecodePath(self, root):`
			`return [x for x in self.evaluateEncodingTree(root) if x != 'None']`

First 2018-02-02 22:22:43 +01:00			`def process(self, data):`
			`root = anytree.Node('None', decoded = data)`
			`prev = root`

			`for (name, curr, branch) in self.generateEncodingTree(data):`
			`ReEncoder.log('[*] Generator returned: ("{}", "{}", {})'.format(`
			`name, curr[:20], str(branch)`
			`))`

			`currNode = anytree.Node(name, parent = prev, decoded = curr)`
			`if branch:`
			`pass`
			`else:`
			`prev = currNode`

			`for pre, fill, node in anytree.RenderTree(root):`
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`if node.name != 'None':`
			`ReEncoder.log("%s%s (%s)" % (pre, node.name, node.decoded[:20].decode('ascii', 'ignore')))`
First 2018-02-02 22:22:43 +01:00
Updated reencode.py to include ZLIB compression 2018-04-16 17:39:09 +02:00			`self.encodings = self.getWinningDecodePath(root)`
First 2018-02-02 22:22:43 +01:00			`ReEncoder.log('[+] Selected encodings: {}'.format(str(self.encodings)))`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00			`def decode(self, data, preferredOutputFormat = PREFER_AUTO, encodings = []):`
			`self.preferredOutputFormat = preferredOutputFormat`

			`if preferredOutputFormat != ReEncoder.PREFER_AUTO and \`
			`preferredOutputFormat != ReEncoder.PREFER_TEXT and \`
			`preferredOutputFormat != ReEncoder.PREFER_BINARY:`
			`raise Exception('Unknown preferred output format specified in decode(): {}'.format(`
			`preferredOutputFormat`
			`))`

First 2018-02-02 22:22:43 +01:00			`if not encodings:`
			`self.process(data)`
			`else:`
			`self.verifyEncodings(encodings)`
			`self.encodings = encodings`

			`for encoderName in self.encodings:`
			`d = self.encodersMap[encoderName].decode(data)`
			`data = d`

			`return data`

			`def encode(self, data, encodings = []):`
			`if encodings:`
			`encodings.reverse()`
			`self.verifyEncodings(encodings)`
			`self.encodings = encodings`

			`for encoderName in self.encodings[::-1]:`
			`e = self.encodersMap[encoderName].encode(data)`
			`data = e`

			`return data`

Improved auto-decoding algorithm to be sensible about final output format - whether it should be text or binary. 2018-10-23 19:49:22 +02:00
def main(argv):
    """Demonstrate a decode -> modify -> re-encode round trip.

    With exactly one command line argument, that argument is the text to
    decode, or the path of a file whose contents are decoded. Otherwise a
    usage hint is printed and a built-in sample is used.

    Args:
        argv: Command line arguments including the program name.
    """
    # Sample 1: ZLIB -> Base64 -> URLEncode
    sample = 'eJzzSM3JyVcozy%2FKSVFIK8rPVQhKdc1Lzk9JLVIEAIr8Cck%3D'

    # Sample 2: URLEncode -> Base64 -> HexEncode
    #sample = '4a5451344a5459314a545a6a4a545a6a4a545a6d4a5449774a5463334a545a6d4a5463794a545a6a4a5459304a5449784a5449774a544e684a544a6b4a544935'

    if len(argv) != 2:
        print('Usage: reencode.py <text|file>')
        print('Using sample: "{}"'.format(sample))
        text = sample
    else:
        text = argv[1]

    if os.path.isfile(text):
        # The context manager closes the file even when reading fails.
        with open(text, 'rb') as f:
            text = f.read()

    decoder = ReEncoder()
    decoded = decoder.decode(text)

    print('(1) DECODED TEXT: "{}"'.format(decoded))

    decoded = 'FOO ' + decoded + ' BAR'

    print('\n(2) TO BE ENCODED TEXT: "{}"'.format(decoded))

    # encode() without explicit encodings re-applies the layers remembered
    # by the decode() call above.
    decoded = decoder.encode(decoded)
    print('(3) ENCODED FORM: "{}"'.format(decoded))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)