pastebin_crawler.py

#!/usr/bin/env python3
from math import ceil
from optparse import OptionParser
import os
import re
import time
import sys
import urllib
import urllib.request

from pyquery import PyQuery


def get_timestamp():
    return time.strftime('%Y/%m/%d %H:%M:%S')

def all_python_encodings():
     return ["ascii",
             "big5",
             "big5hkscs",
             "cp037",
             "cp424",
             "cp437",
             "cp500",
             "cp720",
             "cp737",
             "cp775",
             "cp850",
             "cp852",
             "cp855",
             "cp856",
             "cp857",
             "cp858",
             "cp860",
             "cp861",
             "cp862",
             "cp863",
             "cp864",
             "cp865",
             "cp866",
             "cp869",
             "cp874",
             "cp875",
             "cp932",
             "cp949",
             "cp950",
             "cp1006",
             "cp1026",
             "cp1140",
             "cp1250",
             "cp1251",
             "cp1252",
             "cp1253",
             "cp1254",
             "cp1255",
             "cp1256",
             "cp1257",
             "cp1258",
             "euc_jp",
             "euc_jis_2004",
             "euc_jisx0213",
             "euc_kr",
             "gb2312",
             "gbk",
             "gb18030",
             "hz",
             "iso2022_jp",
             "iso2022_jp_1",
             "iso2022_jp_2",
             "iso2022_jp_2004",
             "iso2022_jp_3",
             "iso2022_jp_ext",
             "iso2022_kr",
             "latin_1",
             "iso8859_2",
             "iso8859_3",
             "iso8859_4",
             "iso8859_5",
             "iso8859_6",
             "iso8859_7",
             "iso8859_8",
             "iso8859_9",
             "iso8859_10",
             "iso8859_13",
             "iso8859_14",
             "iso8859_15",
             "iso8859_16",
             "johab",
             "koi8_r",
             "koi8_u",
             "mac_cyrillic",
             "mac_greek",
             "mac_iceland",
             "mac_latin2",
             "mac_roman",
             "mac_turkish",
             "ptcp154",
             "shift_jis",
             "shift_jis_2004",
             "shift_jisx0213",
             "utf_32",
             "utf_32_be",
             "utf_32_le",
             "utf_16",
             "utf_16_be",
             "utf_16_le",
             "utf_7",
             "utf_8",
             "utf_8_sig"]


class Logger:

    shell_mod = {
        '':'',
       'PURPLE' : '\033[95m',
       'CYAN' : '\033[96m',
       'DARKCYAN' : '\033[36m',
       'BLUE' : '\033[94m',
       'GREEN' : '\033[92m',
       'YELLOW' : '\033[93m',
       'RED' : '\033[91m',
       'BOLD' : '\033[1m',
       'UNDERLINE' : '\033[4m',
       'RESET' : '\033[0m'
    }

    def log ( self, message, is_bold=False, color='', log_time=True):
        prefix = ''
        suffix = ''

        if log_time:
            prefix += '[{:s}] '.format(get_timestamp())

        if os.name == 'posix':
            if is_bold:
                prefix += self.shell_mod['BOLD']
            prefix += self.shell_mod[color.upper()]

            suffix = self.shell_mod['RESET']

        message = prefix + message + suffix
        print ( message )
        sys.stdout.flush()

    def error(self, err):
        self.log(err, True, 'RED')

    def fatal_error(self, err):
        self.error(err)
        exit()

class Crawler:

    PASTEBIN_URL = 'http://pastebin.com'
    PASTES_URL = PASTEBIN_URL + '/archive'
    REGEXES_FILE = 'regexes.txt'
    OK = 1
    ACCESS_DENIED = -1
    CONNECTION_FAIL = -2
    OTHER_ERROR = -3

    prev_checked_ids = []
    new_checked_ids = []

    def read_regexes(self):
        try:
            with open ( self.REGEXES_FILE, 'r') as f:
                try:
                    self.regexes = [ [ field.strip() for field in line.split(',')] for line in f.readlines() if line.strip() != '' and not line.startswith('#')]

                    # In case commas exist in the regexes...merge everything.
                    for i in range(len(self.regexes)):
                        self.regexes[i] = [','.join(self.regexes[i][:-2])] + self.regexes[i][-2:]
                except KeyboardInterrupt:
                    raise
                except:
                    Logger().fatal_error('Malformed regexes file. Format: regex_pattern,URL logging file, directory logging file.')
        except KeyboardInterrupt:
            raise
        except:
            Logger().fatal_error('{:s} not found or not acessible.'.format(self.REGEXES_FILE))


    def __init__(self):
        self.read_regexes()


    def get_pastes ( self ):
        Logger ().log ( 'Getting pastes', True )
        try:
            page = PyQuery ( url = self.PASTES_URL )
        except KeyboardInterrupt:
            raise
        except:
            return self.CONNECTION_FAIL,None


        """
        There are a set of encoding issues which, coupled with some bugs in etree (such as in the Raspbian packages) can
        trigger encoding exceptions here. As a workaround, we try every possible encoding first, and even if that fails,
        we resort to a very hacky workaround whereby we manually get the page and attempt to encode it as utf-8. It's
        ugly, but it works for now.
        """
        try:
            page_html = page.html ()
        except KeyboardInterrupt:
            raise
        except:
            worked = False
            for enc in all_python_encodings():
                try:
                    page_html = page.html(encoding=enc)
                    worked = True
                    break
                except KeyboardInterrupt:
                    raise
                except:
                    pass
            if not worked:
                # One last try...
                try:
                    f = urllib.request.urlopen(Crawler.PASTES_URL)
                    page_html = PyQuery(str(f.read()).encode('utf8')).html()
                    f.close()
                except KeyboardInterrupt:
                    raise
                except:
                    return self.OTHER_ERROR, None
        if re.match ( r'Pastebin\.com - Access Denied Warning', page_html, re.IGNORECASE ) or 'blocked your IP' in page_html:
            return self.ACCESS_DENIED,None
        else:
            return self.OK,page('.maintable img').next('a')

    def check_paste ( self, paste_id ):
        paste_url = self.PASTEBIN_URL + paste_id
        try:
            paste_txt = PyQuery ( url = paste_url )('#paste_code').text()

            for regex,file,directory in self.regexes:
                if re.match ( regex, paste_txt, re.IGNORECASE ):
                    Logger ().log ( 'Found a matching paste: ' + paste_url + ' (' + file + ')', True, 'CYAN' )
                    self.save_result ( paste_url,paste_id,file,directory )
                    return True
            Logger ().log ( 'Not matching paste: ' + paste_url )
        except KeyboardInterrupt:
            raise
        except:
            Logger ().log ( 'Error reading paste (probably a 404 or encoding issue).', True, 'YELLOW')
        return False

    def save_result ( self, paste_url, paste_id, file, directory ):
        timestamp = get_timestamp()
        with open ( file, 'a' ) as matching:
            matching.write ( timestamp + ' - ' + paste_url + '\n' )

        try:
            os.mkdir(directory)
        except KeyboardInterrupt:
            raise
        except:
            pass

        with open( directory + '/' + timestamp.replace('/','_').replace(':','_').replace(' ','__') + '_' + paste_id.replace('/','') + '.txt', mode='w' ) as paste:
            paste_txt = PyQuery(url=paste_url)('#paste_code').text()
            paste.write(paste_txt + '\n')


    def start ( self, refresh_time = 30, delay = 1, ban_wait = 5, flush_after_x_refreshes=100, connection_timeout=60 ):
        count = 0
        while True:
            status,pastes = self.get_pastes ()

            start_time = time.time()
            if status == self.OK:
                for paste in pastes:
                    paste_id = PyQuery ( paste ).attr('href')
                    self.new_checked_ids.append ( paste_id )
                    if paste_id not in self.prev_checked_ids:
                        self.check_paste ( paste_id )
                        time.sleep ( delay )
                    count += 1

                if count == flush_after_x_refreshes:
                    self.prev_checked_ids = self.new_checked_ids
                    count = 0
                else:
                    self.prev_checked_ids += self.new_checked_ids
                self.new_checked_ids = []

                elapsed_time = time.time() - start_time
                sleep_time = ceil(max(0,(refresh_time - elapsed_time)))
                if sleep_time > 0:
                    Logger().log('Waiting {:d} seconds to refresh...'.format(sleep_time), True)
                    time.sleep ( sleep_time )
            elif status == self.ACCESS_DENIED:
                Logger ().log ( 'Damn! It looks like you have been banned (probably temporarily)', True, 'YELLOW' )
                for n in range ( 0, ban_wait ):
                    Logger ().log ( 'Please wait ' + str ( ban_wait - n ) + ' minute' + ( 's' if ( ban_wait - n ) > 1 else '' ) )
                    time.sleep ( 60 )
            elif status == self.CONNECTION_FAIL:
                Logger().log ( 'Connection down. Waiting {:d} seconds and trying again'.format(connection_timeout), True, 'RED')
                time.sleep(connection_timeout)
            elif status == self.OTHER_ERROR:
                Logger().log('Unknown error. Maybe an encoding problem? Trying again.'.format(connection_timeout), True,'RED')
                time.sleep(1)

def parse_input():
    parser = OptionParser()
    parser.add_option('-r', '--refresh-time', help='Set the refresh time (default: 30)', dest='refresh_time', type='int', default=30)
    parser.add_option('-d', '--delay-time', help='Set the delay time (default: 1)', dest='delay', type='float', default=1)
    parser.add_option('-b', '--ban-wait-time', help='Set the ban wait time (default: 5)', dest='ban_wait', type='int', default=5)
    parser.add_option('-f', '--flush-after-x-refreshes', help='Set the number of refreshes after which memory is flushed (default: 100)', dest='flush_after_x_refreshes', type='int', default=100)
    parser.add_option('-c', '--connection-timeout', help='Set the connection timeout waiting time (default: 60)', dest='connection_timeout', type='float', default=60)
    (options, args) = parser.parse_args()
    return options.refresh_time, options.delay, options.ban_wait, options.flush_after_x_refreshes, options.connection_timeout


try:
    Crawler ().start (*parse_input())
except KeyboardInterrupt:
    Logger ().log ( 'Bye! Hope you found what you were looking for :)', True )