-
Notifications
You must be signed in to change notification settings - Fork 0
/
identifyMovie.py
138 lines (104 loc) · 4.13 KB
/
identifyMovie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import init
import urllib.request as ur
import urllib.error as ue
import struct
import sys
import re
# load configuration params and start logger
conf, logger = init.configure()
if conf is None:
    # This module defines no fallback values, so carrying on could only fail
    # on the lookups below with an opaque "NoneType is not subscriptable"
    # TypeError. Stop here with an explicit error instead.
    logger.error("Could not open config file, and no defaults are available")
    raise RuntimeError(
        "Could not open config file: 'opensubtitles_URL' and "
        "'opensubtitles_IMDBid_regexp' are required"
    )
# URL template; getIMDBid substitutes the BYTESIZE and HASH placeholders
URL_OS = conf['opensubtitles_URL']
# regexp whose first group is returned as the IMDB id
RE_IMDBID = conf['opensubtitles_IMDBid_regexp']
class hashException(Exception):
    """Error raised when a remote file cannot be hashed.

    The human-readable description is stored in ``message``.
    """

    def __init__(self, message):
        # Initialise the base class explicitly so args / str() carry the
        # message by contract rather than by accident.
        super().__init__(message)
        self.message = message
class HTTPException(Exception):
    """Error raised when a page cannot be fetched or parsed.

    The human-readable description is stored in ``message``.
    """

    def __init__(self, message):
        # Initialise the base class explicitly so args / str() carry the
        # message by contract rather than by accident.
        super().__init__(message)
        self.message = message
class IMDBException(Exception):
    """Error raised when no IMDB id can be determined for a movie.

    The human-readable description is stored in ``message``.
    """

    def __init__(self, message):
        # Initialise the base class explicitly so args / str() carry the
        # message by contract rather than by accident.
        super().__init__(message)
        self.message = message
# trivial code to get a page and extract a single string with a regexp
# (currently used for IMDBid from opensubtitles and movie poster from movieposterdb)
def getAndExtract(url, regex):
    """Fetch ``url`` and return group 1 of the first ``regex`` match.

    Parameters:
        url: anything accepted by ``urllib.request.urlopen``.
        regex: pattern with at least one capturing group.

    Returns:
        The text captured by the first group of the match.

    Raises:
        HTTPException: for every failure. HTTP and URL errors are reported
            with their code / reason; anything else (including a page that
            does not match ``regex``) is reported as an unexpected error.
    """
    try:
        # the context manager closes the connection even if read() fails
        with ur.urlopen(url) as response:
            # NOTE(review): str() on bytes yields the repr ("b'...'"), so the
            # regexp runs against escaped text. Kept as is because the
            # configured regexp may have been written against that form.
            content = str(response.read())
        m = re.search(regex, content)
        if m is None:
            # deliberately inside the try: the generic handler below turns
            # this into the HTTPException that callers expect
            raise hashException('Could not match regexp in provided page')
        return m.group(1)
    except ue.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first
        raise HTTPException('HTTPError: %s' % e.code) from e
    except ue.URLError as e:
        raise HTTPException('URLError: %s' % e.reason) from e
    except Exception as e:
        raise HTTPException('getAndExtract failed with an unexpected error: %s' % e) from e
def _sumInt64Words(stream, nbytes):
    """Read ``nbytes`` from ``stream`` and return the sum of its 64-bit words.

    The data is interpreted as consecutive signed little-endian 64-bit
    integers. If the stream ends early the buffer is undersized and
    ``struct.unpack`` raises ``struct.error``, which the caller handles.
    """
    data = bytearray()
    while len(data) < nbytes:
        piece = stream.read(nbytes - len(data))
        if not piece:  # end of stream before the whole chunk arrived
            break
        data += piece
    wordfmt = '<%dq' % (nbytes // struct.calcsize('<q'))
    return sum(struct.unpack(wordfmt, data))


def hashURL(url):
    """Compute the size-plus-checksum movie hash of the file at ``url``.

    The hash is the file size plus the sum of the 64-bit words found in the
    first and in the last 64KB of the file, truncated to 64 bits and
    formatted as 16 lowercase hex digits. Only those two chunks are
    requested, using HTTP ``Range`` headers.

    Parameters:
        url: location of the file; the server must report Content-Length.

    Returns:
        ``(filesize, hash)`` where ``filesize`` is the Content-Length value
        exactly as found in the response headers (not converted to int), or
        ``(None, None)`` when no Content-Length header is available.

    Raises:
        hashException: on HTTP/URL errors, on truncated data, when the file
            is smaller than one 64KB chunk, or on any other failure.
    """
    chunksize = 65536  # size of the head and tail chunks, in bytes
    try:
        # get file size; only the headers are needed, so release the
        # connection as soon as they have been read
        with ur.urlopen(url) as response:
            filesize = response.info().get("Content-Length", 0)
        if filesize == 0:
            logger.error('No Content-Length header available')
            return None, None
        nbytes = int(filesize)
        if nbytes < chunksize:
            # a shorter file cannot supply a full chunk, and the tail
            # request below would carry a negative offset
            raise hashException('File too small to hash: %s bytes' % filesize)
        hash_ = nbytes

        # work on first 64KB of the file
        head = ur.Request(url, headers={'Range': 'bytes=%s-%s' % (0, chunksize - 1)})
        with ur.urlopen(head) as f:
            hash_ += _sumInt64Words(f, chunksize)

        # work on last 64KB of the file
        # NOTE(review): a server that ignores Range would answer with the
        # start of the file here; the response status is not checked.
        tail = ur.Request(url, headers={'Range': 'bytes=%s-' % (nbytes - chunksize)})
        with ur.urlopen(tail) as f:
            hash_ += _sumInt64Words(f, chunksize)

        # masking once at the end is equivalent to masking after each
        # addition, because the arithmetic is modulo 2**64 either way
        hash_ &= 0xFFFFFFFFFFFFFFFF  # to remain as 64bit number
        returnedhash = "%016x" % hash_
        return filesize, returnedhash
    except hashException:
        # already the right type; do not re-wrap as "unexpected"
        raise
    except ue.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first
        raise hashException('HTTPError: %s' % e.code) from e
    except ue.URLError as e:
        raise hashException('URLError: %s' % e.reason) from e
    except struct.error as e:
        raise hashException('StructError: %s' % e) from e
    except Exception as e:
        raise hashException('hashURL failed with an unexpected error: %s' % e) from e
def getIMDBid(bytesize, movhash):
    """Look up the IMDB id of a movie on opensubtitles.

    Parameters:
        bytesize: size of the movie file; converted with ``str()`` and
            substituted for ``BYTESIZE`` in the ``URL_OS`` template.
        movhash: movie hash string, substituted for ``HASH`` in ``URL_OS``.

    Returns:
        The text extracted from the result page by ``RE_IMDBID``.

    Raises:
        IMDBException: when size or hash is missing, when the page cannot be
            fetched or matched, or when the extracted id is empty.
    """
    # hashURL reports a missing Content-Length as (None, None); without this
    # guard str.replace() would fail with a TypeError on a None hash
    if bytesize is None or movhash is None:
        raise IMDBException("Movie size or hash is not defined")

    url = URL_OS.replace("BYTESIZE", str(bytesize)).replace("HASH", movhash)
    try:
        # get the IMDB ID from opensubtitles
        IMDBid = getAndExtract(url, RE_IMDBID)
    except HTTPException as e:
        # translate into this function's own error type, keeping the cause
        raise IMDBException(e.message) from e
    if not IMDBid:
        raise IMDBException("IMDB is not defined")
    return IMDBid
if __name__ == '__main__':
    # use the URL given on the command line, or a built-in default otherwise
    if len(sys.argv) < 2:
        url = 'https://archive.org/download/TheInternetsOwnBoyTheStoryOfAaronSwartz/TheInternetsOwnBoy_TheStoryofAaronSwartz-HD.mp4'
    else:
        url = sys.argv[1]

    IMDBid = None
    try:
        logger.info("Hashing %s..." % url)
        s, h = hashURL(url)
        logger.info('Size: %s, Hash: %s' % (str(s), str(h)))
        if h is None:
            # hashURL returns (None, None) when the size is unknown; there is
            # nothing to look up, and passing None on would raise a TypeError
            logger.error("Could not hash %s, skipping identification" % url)
        else:
            logger.info("Identifying movie... ")
            IMDBid = getIMDBid(s, h)
            logger.info('==> http://www.imdb.com/title/tt' + IMDBid)
    except hashException as e:
        logger.error("hashException: %s" % e.message)
    except IMDBException as e:
        logger.error("IMDBException: %s" % e.message)