"""
reScan - v0.05 2008-06-06 Philippe Lagadec

to find IP addresses and other interesting patterns in cleartext in a file.
This simple tool gives a lot of false positives: its only purpose is to save
time when looking for patterns, not to provide accurate detection. ;-)

For more info and updates: http://www.decalage.info/rescan

usage: reScan <file>
"""

# LICENSE: CeCILL v2 (GPL-compatible)
# see http://www.cecill.info/licences/Licence_CeCILL_V2-en.html

# CHANGELOG:
# - 2007-07-11 v0.01 PL: - 1st version
# - 2007-07-30 v0.02 PL: - added list of patterns
# - 2007-07-31 v0.03 PL: - added patterns
#                        - added hexadecimal dump
# - 2007-08-09 v0.04 PL: - improved some regexes, added Petite detection
# - 2008-06-06 v0.05 PL: - escape non-printable characters with '\xNN' when
#                          displaying matches
#                        - optional custom pattern list in reScan_custom.py
#                        - optional call to magic.py to guess filetype

# TODO:
# + improve patterns to avoid some false positives: maybe use pefile or magic.py ?
# + improve regex list with http://regexlib.com (add domain names, e-mail addresses, ...)
# - extract list of common strings found in EXE files
# - add headers from other filetypes (Office, ...)
# - add regex for e-mail addresses, URLs, ...
# - HTML report with color highlighting
# - GUI ?

import sys, re, os, os.path

# try to import magic.py - see http://www.jsnp.net/code/magic.py
# MAGIC tells the rest of the script whether magic.whatis() may be called.
try:
    import magic
    # A different module that happens to be named "magic" may be installed:
    # only enable the feature when the whatis() function used for the report
    # is really available, otherwise the script would crash later on.
    MAGIC = hasattr(magic, 'whatis')
except Exception:
    # magic.py is optional: any failure while importing it simply disables
    # filetype guessing. "except Exception" (rather than a bare "except")
    # lets KeyboardInterrupt and SystemExit through.
    MAGIC = False

# Read the whole file to scan into memory as a binary string ("data").
# Without a filename on the command line, only the usage text is displayed.
if len(sys.argv) < 2:
    sys.exit(__doc__)

try:
    f = open(sys.argv[1], 'rb')
    try:
        data = f.read()
    finally:
        # always release the file handle, even if read() fails
        f.close()
except (IOError, OSError):
    # Show the usage text as before, followed by the actual reason of the
    # failure (missing file, access denied, ...).
    # sys.exc_info() is used instead of "except ... as" so that this also
    # runs on old Python 2 versions.
    sys.exit("%s\nERROR: cannot read %r: %s"
             % (__doc__, sys.argv[1], sys.exc_info()[1]))

# list of regular expressions for patterns
# Each key is the label printed as the heading of a report section, each value
# is the regex source which is compiled and searched in the whole file data.
# Entries can be added or replaced by the optional reScan_custom.py module.
FIND_REGEX = {
    # NOTE: '(?i)' makes a regex case-insensitive
    # four groups of 1 to 3 digits separated by dots; the value of each group
    # is not range-checked:
    "IP addresses": r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
    # the two letters in either order, anywhere in the file:
    "EXE MZ headers": r"MZ|ZM",
    # only the two letters "PE", anywhere in the file:
    "EXE PE headers": r"PE",
    "EXE PE DOS message": r"(?i)This program cannot be run in DOS mode",
    #TODO: quicker way to have a list of extensions (bat, cmd, vbs, js, ...) ?
    # matches the extension alone (with its dot), not a complete file name;
    # .VBE and .JSE are also searched although the label does not list them:
    ".EXE/.COM/.VBS/.JS/.BAT/.CMD/.DLL filename": r"(?i)\.EXE|\.COM|\.VBS|\.JS|\.VBE|\.JSE|\.BAT|\.CMD|\.DLL",
    "EXE: UPX header": r"(?i)UPX",
    "EXE: .text/.data/.rdata section": r"(?i)\.text|\.data|\.rdata",
    "EXE: packed with Petite": r"(?i)\.petite",
    "EXE: interesting Win32 function names": r"(?i)WriteFile|IsDebuggerPresent|RegSetValue|CreateRemoteThread",
    "EXE: interesting WinSock function names": r"(?i)WS2_32\.dll|WSASocket|WSASend|WSARecv",
    "EXE: possibly compiled with Microsoft Visual C++": r"(?i)Microsoft Visual C\+\+",
    # backslashes are doubled because they must be escaped in a regex:
    "Interesting registry keys": r"(?i)CurrentVersion\\Run|UserInit",
    "Interesting file names": r"(?i)\\drivers\\etc\\hosts|cmd\.exe|\\Start Menu\\Programs\\Startup",
    # plain substrings, also matched inside longer words:
    "Interesting keywords": r"(?i)password|administrator|smtp|pop|http|ftp|ssh|icq|backdoor|vmware",
    # in the raw strings below, the \xNN escapes are interpreted by the re
    # module itself and match the byte with that hexadecimal value:
    "NOP instructions (possible shellcode)": r"\x90{4,}", # this regex matches 4 NOPs or more
    "Possible OLE2 header (D0CF)": r"\xD0\xCF\x11\xE0",
    "VBA macros": r"(?i)VBA",
    }

# try to import reScan_custom.py to add custom FIND_REGEX (optional):
# custom entries are added to the built-in patterns, and replace the built-in
# ones which have the same label.
try:
    import reScan_custom
    FIND_REGEX.update(reScan_custom.FIND_REGEX)
except ImportError:
    # no custom pattern file: this is the normal case, nothing to report
    pass
except Exception:
    # The custom file exists but is broken (syntax error, FIND_REGEX missing
    # or not a dictionary, ...): keep scanning with the patterns already
    # loaded, but tell the user instead of silently ignoring the file.
    # sys.exc_info() is used instead of "except ... as" so that this also
    # runs on old Python 2 versions.
    sys.stderr.write("WARNING: error while loading reScan_custom.py: %s\n"
                     % sys.exc_info()[1])

#------------------------------------------------------------------------------
# HEXDUMP from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/142812

# Translation table used for the text column of the dumps: a character is kept
# as-is when its repr() is exactly 3 characters long (quote, character, quote),
# which means that it is displayed without any escape sequence. Every other
# character is replaced by a dot.
FILTER = ''.join([chr(x) if len(repr(chr(x))) == 3 else '.' for x in range(256)])

def hexdump(src, length=8):
    """
    Returns a hexadecimal dump of a binary string, as a single string with
    one line per row: offset in hexadecimal, hexadecimal value of each byte,
    then the same bytes as text (see FILTER).
    src: binary string to dump; an empty value gives an empty dump.
    length: number of bytes per row, at least 1.
    Raises ValueError if length is lower than 1 (the loop could never
    terminate in that case).
    """
    if not src:
        return ''
    if length < 1:
        raise ValueError("length must be at least 1, got %r" % (length,))
    rows = []
    offset = 0
    size = len(src)
    # Walk through src with an offset instead of repeatedly cutting its head
    # off, and join the rows once at the end: both avoid copying the remaining
    # data again and again for each row.
    while offset < size:
        chunk = src[offset:offset+length]
        hexa = ' '.join(["%02X" % ord(x) for x in chunk])
        text = chunk.translate(FILTER)
        # the hex column is padded to length*3 characters so that the text
        # column stays aligned on the last, possibly shorter, row
        rows.append("%04X   %-*s   %s\n" % (offset, length*3, hexa, text))
        offset += length
    return ''.join(rows)

def hexdump2(src, length=8):
    """
    Returns a hexadecimal dump of a binary string.
    src: binary string to dump.
    length: number of bytes per row.
    Each row shows the offset of its 1st byte, the hexadecimal value of each
    byte, then the bytes as text once translated with FILTER.
    """
    def render_row(offset):
        # one row of the dump, for the bytes starting at the given offset
        chunk = src[offset:offset+length]
        hex_column = ' '.join(["%02X" % ord(char) for char in chunk])
        text_column = chunk.translate(FILTER)
        return "%04X   %-*s   %s\n" % (offset, length*3, hex_column, text_column)

    return ''.join([render_row(offset) for offset in xrange(0, len(src), length)])

# variant of the hexdump recipe with an offset column starting at any index:
def hexdump3(src, length=8, startindex=0):
    """
    Builds the hexadecimal dump of a binary string and returns it as text.
    src: binary string to dump.
    length: number of bytes per row.
    startindex: offset displayed for the 1st byte of src.
    Each row shows the offset of its 1st byte, the hexadecimal value of each
    byte, then the bytes as text once translated with FILTER.
    """
    def rows():
        # yields the rows of the dump one by one, in order
        for pos in xrange(0, len(src), length):
            row_bytes = src[pos:pos+length]
            hex_values = ["%02X" % ord(byte) for byte in row_bytes]
            text = row_bytes.translate(FILTER)
            yield "%04X   %-*s   %s\n" % (
                pos+startindex, length*3, ' '.join(hex_values), text)

    return ''.join(rows())

#------------------------------------------------------------------------------

# Main report: optional filetype guess, then for each pattern which matches at
# least once, a section with the position, the matched text and a hexadecimal
# dump of the surrounding data for every match.
# NOTE: print is always called with a single parenthesized value, which gives
# the same output with the Python 2 print statement.
if MAGIC:
    print("Filetype according to magic: %s\n" % magic.whatis(data))

for item in FIND_REGEX:
    matches = list(re.compile(FIND_REGEX[item]).finditer(data))
    if not matches:
        # no section at all for patterns which are not found
        continue
    print("-"*79)
    print("%s:" % item)
    for m in matches:
        # repr() escapes non-printable characters as '\xNN'
        print("at %08X: %s" % (m.start(), repr(m.group())))
        # 5 lines of hexadecimal dump around the pattern: 2 lines = 32 bytes
        # before and after, aligned on rows of 16 bytes.
        # "& ~0xF" rounds down to a multiple of 16 whatever the file size.
        start = max(m.start()-32, 0) & ~0xF
        # Round up to the end of the row first, THEN clamp to the file size:
        # rounding after clamping would cut off the last partial row of the
        # file, and the match itself when it is located there.
        end = min((m.end()+32+15) & ~0xF, len(data))
        print(hexdump3(data[start:end], length=16, startindex=start))
        print("")
##            if item == "EXE MZ headers" and MAGIC:
##                # Check if it's really a EXE header
##                print "Magic: %s\n" % magic.whatis(data[m.start():])