# identifyMovie.py

import init
import urllib.request as ur
import urllib.error as ue
import struct
import sys
import re

# Load configuration parameters and start the logger.
# NOTE(review): when conf is None the message below promises defaults, but
# nothing in this file supplies them, so the conf[...] lookups that follow
# would raise TypeError -- confirm whether init.configure() can really return
# None for conf and, if so, where the defaults are meant to come from.
conf,logger = init.configure()
if conf is None:
    logger.error("Could not open config file, reverting to defaults")


# URL template for the opensubtitles lookup; getIMDBid substitutes the
# BYTESIZE and HASH placeholders in it.
URL_OS = conf['opensubtitles_URL']
# Regular expression applied to the fetched page by getIMDBid; its first
# capture group is used as the IMDB id.
RE_IMDBID = conf['opensubtitles_IMDBid_regexp']


class hashException(Exception):
    """Raised when a URL cannot be hashed.

    The human-readable description is available both as ``.message`` and
    through ``str()``.
    """

    def __init__(self, message):
        # Forward to Exception so args/str() are populated even when the
        # message is passed by keyword.
        super().__init__(message)
        self.message = message

class HTTPException(Exception):
    """Raised when fetching a page or extracting data from it fails.

    The human-readable description is available both as ``.message`` and
    through ``str()``.
    """

    def __init__(self, message):
        # Forward to Exception so args/str() are populated even when the
        # message is passed by keyword.
        super().__init__(message)
        self.message = message

class IMDBException(Exception):
    """Raised when an IMDB id cannot be determined for a movie.

    The human-readable description is available both as ``.message`` and
    through ``str()``.
    """

    def __init__(self, message):
        # Forward to Exception so args/str() are populated even when the
        # message is passed by keyword.
        super().__init__(message)
        self.message = message

def getAndExtract(url, regex):
    """Fetch a page and extract a single string from it with a regexp.

    Currently used to get the IMDB id from opensubtitles (the original note
    also mentions movie posters from movieposterdb).

    Args:
        url: address of the page to download.
        regex: pattern searched in the page; it must define at least one
            capture group.

    Returns:
        The text captured by the first group of the first match.

    Raises:
        HTTPException: on every failure -- HTTP/URL errors, a pattern that
            does not match, or any other unexpected error. This function
            never returns None.
    """
    try:
        # The context manager closes the connection even if reading fails.
        with ur.urlopen(url) as response:
            raw = response.read()

        # NOTE(review): str() on bytes yields the "b'...'" repr (escaped
        # newlines and non-ASCII bytes) rather than decoded text. Kept as is
        # because the configured regexps may have been written against this
        # form -- confirm before switching to raw.decode().
        content = str(raw)
        m = re.search(regex, content)

        if m is None:
            # Raised inside the try on purpose: the catch-all below re-wraps
            # it as HTTPException, which is the only type getIMDBid handles.
            raise hashException('Could not match regexp in provided page')

        return m.group(1)

    except ue.HTTPError as e:
        raise HTTPException('HTTPError: %s' % e.code) from e

    except ue.URLError as e:
        raise HTTPException('URLError: %s' % e.reason) from e

    except Exception as e:
        raise HTTPException('getAndExtract failed with an unexpected error: %s' % e) from e


# Size in bytes of the leading and trailing chunks that feed the hash.
_HASH_CHUNK_SIZE = 65536
# One chunk viewed as 64-bit integers. The byte order is pinned to
# little-endian so the hash does not depend on the host CPU (the native 'q'
# format would give a different result on big-endian machines).
_HASH_CHUNK_STRUCT = struct.Struct('<%dq' % (_HASH_CHUNK_SIZE // 8))


def _sum_chunk(stream):
    """Read one hash chunk from *stream* and return the sum of its 64-bit words.

    Raises struct.error when the stream ends before a full chunk was read.
    """
    pieces = []
    remaining = _HASH_CHUNK_SIZE
    while remaining > 0:
        piece = stream.read(remaining)
        if not piece:
            break
        pieces.append(piece)
        remaining -= len(piece)
    return sum(_HASH_CHUNK_STRUCT.unpack(b''.join(pieces)))


def hashURL(url):
    """Compute the size-and-content hash of a remote file.

    The hash is the file size plus the 64-bit words of the first and the last
    64 KiB of the file, summed modulo 2**64. Only those two chunks are
    downloaded, using HTTP Range requests.

    Args:
        url: address of the file to hash.

    Returns:
        A ``(filesize, hash)`` tuple where ``filesize`` is the value of the
        Content-Length header as received and ``hash`` is a 16-digit
        lowercase hexadecimal string. ``(None, None)`` is returned (after
        logging an error) when the server sends no Content-Length header.

    Raises:
        hashException: on HTTP/URL errors, when the file is shorter than one
            chunk, when a chunk cannot be read completely, when the server
            ignores the Range request, or on any other unexpected error.
    """
    try:
        # This request is needed only for its headers; leaving the block
        # closes the connection instead of keeping the download open.
        with ur.urlopen(url) as probe:
            filesize = probe.info().get("Content-Length")
        if filesize is None:
            logger.error('No Content-Length header available')
            return None, None

        size = int(filesize)
        if size < _HASH_CHUNK_SIZE:
            # A shorter file would produce a negative Range offset below.
            raise hashException(
                'File is too small to hash: %s bytes, need at least %s'
                % (size, _HASH_CHUNK_SIZE))

        # First 64KB of the file.
        head_request = ur.Request(
            url, headers={'Range': 'bytes=%s-%s' % (0, _HASH_CHUNK_SIZE - 1)})
        with ur.urlopen(head_request) as head:
            total = size + _sum_chunk(head)

        # Last 64KB of the file.
        tail_request = ur.Request(
            url, headers={'Range': 'bytes=%s-' % (size - _HASH_CHUNK_SIZE)})
        with ur.urlopen(tail_request) as tail:
            # HTTP 200 means the server ignored Range and is sending the file
            # from byte 0, which is only the tail if the whole file is one
            # chunk. Non-HTTP responses carry no status and are not checked.
            if getattr(tail, 'status', None) == 200 and size != _HASH_CHUNK_SIZE:
                raise hashException(
                    'Server ignored the Range header, cannot read the end of the file')
            total += _sum_chunk(tail)

        # Masking once at the end equals masking after every addition.
        returnedhash = "%016x" % (total & 0xFFFFFFFFFFFFFFFF)
        return filesize, returnedhash

    except hashException:
        # Already carries a specific message; do not re-wrap it below.
        raise

    except ue.HTTPError as e:
        raise hashException('HTTPError: %s' % e.code) from e

    except ue.URLError as e:
        raise hashException('URLError: %s' % e.reason) from e

    except struct.error as e:
        raise hashException('StructError: %s' % e) from e

    except Exception as e:
        raise hashException('hashURL failed with an unexpected error: %s' % e) from e


def getIMDBid(bytesize,movhash):
    """Look up the IMDB id of a movie on opensubtitles.

    Args:
        bytesize: size of the movie file, as returned by hashURL.
        movhash: hash of the movie file, as returned by hashURL.

    Returns:
        The IMDB id extracted from the opensubtitles page.

    Raises:
        IMDBException: when size or hash is missing, when the page cannot be
            fetched or parsed, or when the extracted id is empty.
    """
    # hashURL returns (None, None) when the server sends no Content-Length;
    # report that as a lookup failure instead of failing with a TypeError
    # while building the URL.
    if bytesize is None or movhash is None:
        raise IMDBException("Cannot look up IMDB id without a file size and hash")

    # Fill the placeholders of the configured URL template.
    url = URL_OS.replace("BYTESIZE",str(bytesize)).replace("HASH",movhash)

    try:
        # get the IMDB ID from opensubtitles
        IMDBid = getAndExtract(url, RE_IMDBID)
    except HTTPException as e:
        raise IMDBException(e.message) from e

    if not IMDBid:
        raise IMDBException("IMDB is not defined")
    return IMDBid


if __name__ == '__main__':
    # Command-line entry point: hash the URL given as first argument (or a
    # sample file when none is given) and log the matching IMDB page.

    if len(sys.argv)<2:
        url = 'https://archive.org/download/TheInternetsOwnBoyTheStoryOfAaronSwartz/TheInternetsOwnBoy_TheStoryofAaronSwartz-HD.mp4'
    else:
        url = sys.argv[1]

    IMDBid = None

    try:
        logger.info("Hashing %s..." % url)
        s,h = hashURL(url)
        # hashURL returns (None, None) when the server sends no
        # Content-Length header; stop here rather than pass None on to
        # getIMDBid, which cannot build a lookup URL from it.
        if s is None or h is None:
            raise hashException("Could not determine file size and hash")
        logger.info('Size: %s, Hash: %s' % (str(s), str(h)))

        logger.info("Identifying movie... ")
        IMDBid = getIMDBid(s,h)
        logger.info('==> http://www.imdb.com/title/tt' + IMDBid)

    except hashException as e:
        logger.error("hashException: %s" % e.message)
    except IMDBException as e:
        logger.error("IMDBException: %s" % e.message)