forked from textgain/grasp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
grasp.py
executable file
·5775 lines (4913 loc) · 185 KB
/
grasp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# encoding: utf-8
##### GRASP.PY ####################################################################################
__version__ = '2.1'
__license__ = 'BSD'
__credits__ = ['Tom De Smedt', 'Guy De Pauw', 'Walter Daelemans']
__email__ = 'info@textgain.com'
__author__ = 'Textgain'
__copyright__ = 'Textgain'
###################################################################################################
# Copyright (c) 2016, Textgain BVBA
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
###################################################################################################
# Grasp.py is a collection of simple algorithms, functions and classes for data mining & analytics:
# WWW Web Mining search engines, servers, HTML DOM + CSS selectors, plaintext
# DB Databases comma-separated values, dates, SQL
# NLP Natural Language Processing tokenization, part-of-speech tagging, sentiment analysis
# ML Machine Learning clustering, classification, confusion matrix, n-grams
# NET Network Analysis shortest paths, centrality, components, communities
# ETC recipes for functions, strings, lists, dates, ...
# Grasp.py is based on the Pattern toolkit (https://github.com/clips/pattern), focusing on brevity.
# Most functions have around 10 lines of code, and most algorithms have around 25-50 lines of code.
# Most classes have about 50-75 lines of code.
###################################################################################################
import sys
import os
import io
import re
import inspect
import logging
import traceback
import threading
import multiprocessing
import multiprocessing.pool
import itertools
import collections
import unicodedata
import socket; socket.setdefaulttimeout(10)
import wsgiref
import wsgiref.simple_server
import urllib
import smtplib
import hashlib
import hmac
import base64
import binascii
import email
import xml.etree.ElementTree as ElementTree
import sqlite3 as sqlite
import csv as csvlib
import json
import zipfile
import tempfile
import mimetypes
import glob
import time
import datetime
import random
import math
import heapq
import bisect
# Python 2 & 3 compatibility layer.
# After this block, on both versions:
# - str is the byte string type and unicode is the text type,
# - the Python 3 module layout (collections.abc, urllib.request, urllib.parse, ...) can be used,
# - HTMLParser, unescape, BaseHTTPServer, SocketServer and cookielib are defined.
PY2 = sys.version.startswith('2')
PY3 = sys.version.startswith('3')

if PY3:
    import collections.abc
    import http.cookiejar as cookielib
    import http.server as BaseHTTPServer
    import socketserver as SocketServer
    import urllib.parse
    import urllib.request
    from html import unescape
    from html.parser import HTMLParser
    # Use the Python 2 names for byte strings and Unicode strings.
    str, unicode, basestring = bytes, str, str
else:
    import BaseHTTPServer
    import SocketServer
    import cookielib
    import urllib2
    import urlparse
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape
    # Graft the Python 3 module layout onto the Python 2 modules.
    collections.abc = collections
    urllib.error = urllib2
    urllib.request = urllib2
    urllib.parse = urlparse
    urllib.parse.urlencode = urllib.urlencode
    urllib.parse.unquote = urllib.unquote
    urllib.parse.quote = urllib.quote
# In Python 2, Class.__str__ returns a byte string.
# In Python 3, Class.__str__ returns a Unicode string.
# @printable
# class X(object):
# def __str__(self):
# return unicode(' ')
# works on both Python 2 & 3.
def printable(cls):
    """ The @printable class decorator makes a __str__() that returns Unicode work in Python 2:
        the original method becomes __unicode__(), and __str__() returns its UTF-8 encoding.
        In Python 3, the class is returned unchanged.
    """
    if not PY2:
        return cls
    if hasattr(cls, '__str__'):
        cls.__unicode__ = cls.__str__
        cls.__str__ = lambda self: self.__unicode__().encode('utf-8')
    return cls
# The type of a compiled regular expression, for use with isinstance():
REGEX = type(re.compile(''))
# isinstance(re.compile(''), REGEX)
###################################################################################################
#---- STATIC --------------------------------------------------------------------------------------
def static(**kwargs):
    """ The @static() decorator sets each given keyword argument
        as an attribute on the decorated function (a "static variable").
    """
    def decorator(f):
        for name in kwargs:
            setattr(f, name, kwargs[name])
        return f
    return decorator
# @static(i=0)
# def uid():
# uid.i += 1
# return uid.i
#---- PARALLEL ------------------------------------------------------------------------------------
# Parallel processing uses multiple CPU's to execute multiple processes simultaneously.
def parallel(f, values=(), *args, **kwargs):
    """ Returns an iterator of f(v, *args, **kwargs)
        for values=[v1, v2, ...], using available CPU's.
        The function and values are sent to worker processes, so they must be picklable.
    """
    pool = multiprocessing.Pool(processes=None)
    try:
        results = pool.imap(_worker, ((f, v, args, kwargs) for v in values))
    finally:
        # No more tasks will be submitted. Closing the pool lets the worker processes
        # exit once the queued tasks are done, instead of lingering until interpreter exit.
        pool.close()
    return results

def _worker(x):
    """ Unpacks a (f, v, args, kwargs) task and returns f(v, *args, **kwargs).
        Defined at module level so that it can be pickled.
    """
    f, v, args, kwargs = x
    return f(v, *args, **kwargs)
# for v in parallel(pow, (1, 2, 3), 2):
# print(v)
#---- ASYNC ---------------------------------------------------------------------------------------
# Asynchronous functions are executed in a separate thread and notify a callback function
# (instead of blocking the main thread).
def asynchronous(f, callback=lambda v, e: None, blocking=False):
    """ Returns a new function that runs f(*args, **kwargs) in a separate thread
        and calls callback(value, exception=None) when done.
        The new function returns the started threading.Thread.
        With blocking=False the thread is a daemon (it won't keep the program alive).
    """
    # The worker takes f and callback from the enclosing scope instead of as parameters,
    # so keyword arguments that happen to be named 'f' or 'callback' are passed on to f
    # (instead of raising a TypeError inside the thread).
    def worker(*args, **kwargs):
        try:
            v = f(*args, **kwargs)
        except Exception as e:
            callback(None, e)
        else:
            callback(v, None)
    def thread(*args, **kwargs):
        t = threading.Thread(target=worker, args=args, kwargs=kwargs)
        t.daemon = not blocking
        t.start()
        return t
    return thread
# def ping(v, e=None):
# if e:
# raise e
# print(v)
#
# pow = asynchronous(pow, ping)
# pow(2, 2)
# pow(2, 3) #.join(1)
#
# for _ in range(10):
# time.sleep(0.1)
# print('...')
# Atomic operations are thread-safe, e.g., dict.get() or list.append(),
# but not all operations are atomic, e.g., dict[k] += 1 needs a lock.
Lock = threading.RLock   # lock factory
lock = threading.RLock() # module-wide reentrant lock shared by all @atomic functions

def atomic(f):
    """ The @atomic decorator executes a function thread-safe,
        by holding the module-wide reentrant lock during the call.
    """
    def decorator(*args, **kwargs):
        lock.acquire()
        try:
            return f(*args, **kwargs)
        finally:
            lock.release()
    return decorator
# hits = collections.Counter()
#
# @atomic
# def hit(k):
# hits[k] += 1
MINUTE, HOUR, DAY = 60, 60*60, 60*60*24 # in seconds

def scheduled(interval=MINUTE, blocking=False):
    """ The @scheduled decorator executes a function periodically (async):
        a background thread sleeps for the given interval (in seconds) and then calls f(), forever.
        The thread starts when the decorator is applied; the function itself is returned unchanged.
        With blocking=False the thread is a daemon (it won't keep the program alive).
    """
    def decorator(f):
        def timer():
            while True:
                time.sleep(interval)
                f()
        ticker = threading.Thread(target=timer)
        ticker.daemon = not blocking
        ticker.start()
        return f
    return decorator
# @scheduled(1)
# @atomic
# def update():
# print('updating...')
def retry(exception, tries, f, *args, **kwargs):
    """ Returns the value of f(*args, **kwargs).
        Retries (up to the given number of times) if the given exception is raised,
        sleeping 1, 2, 4, ... seconds between attempts.
        Raises the last exception if all attempts fail.
        Any other type of exception is raised immediately.
    """
    error = None
    for i in range(max(tries, 0) + 1): # always at least one attempt
        try:
            return f(*args, **kwargs)
        except exception as e:
            # In Python 3, e is unbound when the except clause ends,
            # so keep a reference to raise it after the loop.
            error = e
            if i < tries:
                time.sleep(2 ** i) # exponential backoff (1, 2, 4, ...)
    raise error
# def search(q,n):
# print('searching %s' % q)
# raise ValueError
#
# retry(ValueError, 3, search, 'cats')
# Asynchronous + retry:
# f = asynchronous(lambda x: retry(Exception, 2, addx, x), callback)
###################################################################################################
#---- LAZY ----------------------------------------------------------------------------------------
# A lazy container takes lambda functions as values, which are evaluated when retrieved.
class LazyDict(collections.abc.MutableMapping):
    """ A dict that takes functions (e.g., lambda) as values.
        A value's function is called the first time the key is retrieved,
        and the key then maps to the result.
    """

    def __init__(self, *args, **kwargs):
        self._dict = dict(*args, **kwargs) # {key: function or evaluated value}
        self._done = set()                 # keys that have been evaluated

    def __setitem__(self, k, v):
        self._dict[k] = v
        self._done.discard(k) # a new function must be evaluated again

    def __getitem__(self, k):
        v = self._dict[k]
        if k not in self._done:
            self._dict[k] = v = v()
            self._done.add(k)
        return v

    def __delitem__(self, k):
        del self._dict[k]
        self._done.discard(k) # the key may never have been evaluated

    def __len__(self):
        return len(self._dict)

    def __iter__(self):
        return iter(self._dict)

    def __repr__(self):
        # Note: this evaluates all values.
        return repr(dict(self))
# models = LazyDict()
# models['en'] = lambda: Perceptron('large.json')
#---- PERSISTENT ----------------------------------------------------------------------------------
# A persistent container stores values in a file.
class PersistentDict(dict):
    """ A dict that is loaded from a JSON file (if the file exists),
        and written back to the file with PersistentDict.save().
    """

    def __init__(self, path, *args, **kwargs):
        if os.path.exists(path or ''):
            # Close the file as soon as it is parsed.
            with open(path, 'rb') as f:
                self.update(json.load(f))
        # Given items take precedence over stored items.
        self.update(*args)
        self.update(kwargs)
        self.path = path

    @atomic # (thread-safe)
    def save(self):
        # Closing the file ensures that all data is flushed to disk.
        with open(self.path, 'w') as f:
            json.dump(self, f) # JSON
# db = PersistentDict('db.json', {'k': 'v'})
# db.save()
###################################################################################################
#---- LOG -----------------------------------------------------------------------------------------
# Functions that access the internet must report the visited URL using the standard logging module.
# See also: https://docs.python.org/2/library/logging.html
SIGNED = '%(time)s %(file)s:%(line)s %(function)s: %(message)s\n' # 12:59:59 grasp.py:1000 <module>
log = logging.getLogger(__name__)
log.level = logging.DEBUG
if not log.handlers:
log.handlers.append(logging.NullHandler())
class Log(collections.deque, logging.Handler):
def __init__(self, n=100, file=None, format=SIGNED, date='%Y-%m-%d %H:%M:%S'):
""" A list of n latest log messages, optionally with a file-like back-end.
"""
collections.deque.__init__(self, maxlen=n)
logging.Handler.__init__(self)
log.handlers.append(self)
self.file = file
self.format = format
self.date = date
def emit(self, r):
r = { # log.info('test')
'time' : r.created + r.relativeCreated, # date().timestamp
'type' : r.levelname.lower(), # 'info'
'message' : r.getMessage(), # 'test'
'function' : r.funcName, # '<module>'
'module' : r.module, # 'grasp'
'path' : r.pathname, # 'grasp.py'
'file' : r.filename, # 'grasp.py'
'line' : r.lineno, # 1234
}
if self.file:
self.file.write(
self.format % dict(r,
time=datetime.datetime.fromtimestamp(r['time']).strftime(self.date)))
self.append(r)
self.update(r)
def update(self, event):
pass
def __del__(self):
try:
log.handlers.remove(self)
except:
pass
def debug(file=sys.stdout, format=SIGNED, date='%Y-%m-%d %H:%M:%S'):
    """ Writes log messages to the given file-like object (by default, the console).
        The Log handler is stored as debug.log; it keeps no messages in memory (n=0).
    """
    handler = Log(0, file, format, date)
    debug.log = handler
# debug()
# debug(open(cd('log.txt'), 'a'))
# request('https://textgain.com')
###################################################################################################
#---- UNICODE -------------------------------------------------------------------------------------
# The u() function returns a Unicode string (Python 2 & 3).
# The b() function returns a byte string, encoded as UTF-8.
# We use u() as early as possible on all input (e.g. HTML).
# We use b() on URLs.
def u(v, encoding='utf-8'):
    """ Returns the given value as a Unicode string.
        Byte strings are decoded with the given encoding,
        then windows-1252, then UTF-8 (ignoring invalid bytes).
        Other values (int, float, ...) are formatted with '%s'.
    """
    if isinstance(v, unicode):
        return v
    if isinstance(v, str): # byte string
        for e in ((encoding,), ('windows-1252',), ('utf-8', 'ignore')):
            try:
                return v.decode(*e)
            except Exception: # try the next encoding (don't swallow KeyboardInterrupt)
                pass
        return v
    return (u'%s' % v) # int, float
def b(v, encoding='utf-8'):
    """ Returns the given value as a byte string.
        Unicode strings are encoded with the given encoding,
        then windows-1252, then UTF-8 (ignoring invalid characters).
        Other values (int, float, ...) are formatted with '%s' and encoded.
    """
    if isinstance(v, str): # byte string
        return v
    if isinstance(v, unicode):
        for e in ((encoding,), ('windows-1252',), ('utf-8', 'ignore')):
            try:
                return v.encode(*e)
            except Exception: # try the next encoding (don't swallow KeyboardInterrupt)
                pass
        return v
    return (u'%s' % v).encode()
#---- ITERATION -----------------------------------------------------------------------------------
def first(n, a):
    """ Returns an iterator of the first n values (index 0 up to n).
    """
    return itertools.islice(a, n)
def sliced(a, *ijn):
    """ Returns an iterator of values from index i to j, by step n,
        with (i, j, n) given as for itertools.islice().
    """
    values = itertools.islice(a, *ijn)
    return values
def shuffled(a):
    """ Returns an iterator of the given values, in random order.
        The given list is not modified (a copy is shuffled).
    """
    values = list(a)
    random.shuffle(values)
    return iter(values)
def unique(a):
    """ Returns an iterator of the unique values in the list, in order of first occurrence.
        Values must be hashable.
    """
    seen = set()
    # set.add() returns None, so the second test is always True (and records the value).
    return (v for v in a if v not in seen and not seen.add(v))
def chunks(a, n=2):
    """ Returns an iterator of tuples of n consecutive values.
        The given sequence must support slicing; trailing values
        that do not fill a tuple are dropped.
    """
    strides = [a[i::n] for i in range(n)] # n=2: [a[0], a[2], ...], [a[1], a[3], ...]
    return iter(zip(*strides))
# for v in chunks([1, 2, 3, 4], n=2): # (1, 2), (3, 4)
# print(v)
def nwise(a, n=2):
    """ Returns an iterator of tuples of n consecutive values (rolling).
    """
    # n independent iterators over the values, where the i-th iterator skips i values.
    streams = itertools.tee(a, n)
    streams = [itertools.islice(s, i, None) for i, s in enumerate(streams)]
    return iter(zip(*streams))
# for v in nwise([1, 2, 3, 4], n=2): # (1, 2), (2, 3), (3, 4)
# print(v)
def flatten(a, type=list):
    """ Returns a flat list of the nested values (depth-first).
        Only values that are instances of the given type are unpacked.
    """
    flat = []
    for v in a:
        if not isinstance(v, type):
            flat.append(v)
        else:
            flat += flatten(v, type)
    return flat
# print(flatten([1, [2, [3, 4]]]))
def choices(a, weights=[], k=1):
    """ Returns a list of k random elements from the given list (with replacement),
        with optional (non-negative) probabilities.
    """
    if not weights:
        return [random.choice(a) for _ in range(k)]
    total = 0
    cumsum = [] # running total of the weights
    for w in weights:
        total += w
        cumsum.append(total)
    # A random point in 0-total falls in the interval of the element to pick.
    return [a[bisect.bisect(cumsum, total * random.random())] for _ in range(k)]
# print(choices(['a', 'b'], weights=[0.75, 0.25], k=10))
#---- FILE ----------------------------------------------------------------------------------------
# Temporary files are useful when a function takes a filename, but we have the file's data instead.
class tmp(object):

    def __init__(self, s, mode='wb'):
        """ Returns a named temporary file containing the given string.
            The file is written and closed immediately, so that it can be opened by name.
            It is deleted when leaving the with-block, or when the object is garbage collected.
        """
        if 'b' in mode and not isinstance(s, bytes):
            s = s.encode('utf-8') # binary mode needs bytes (e.g., text in Python 3)
        self._mode = mode
        self._f = tempfile.NamedTemporaryFile(mode, delete=False)
        self._f.write(s)
        self._f.close()

    @property
    def name(self):
        return self._f.name

    def read(self):
        # The original handle is closed, so reopen the file by name.
        with open(self.name, 'rb' if 'b' in self._mode else 'r') as f:
            return f.read()

    def write(self, *args):
        # The original handle is closed, so reopen the file by name (appending).
        with open(self.name, 'ab' if 'b' in self._mode else 'a') as f:
            return f.write(*args)

    def close(self):
        return self._f.close()

    def __enter__(self):
        return self._f # (closed) file object, for f.name

    def __exit__(self, *args):
        try:
            os.unlink(self._f.name)
        except OSError:
            pass # already deleted

    def __del__(self):
        try:
            os.unlink(self._f.name)
        except Exception:
            pass # already deleted, or interpreter shutting down
# data = '"username", "tweet", "likes"\n'
#
# with tmp(data) as f:
# for row in csv(f.name):
# print(row)
##### DB ##########################################################################################
#---- CSV -----------------------------------------------------------------------------------------
# A comma-separated values file (CSV) stores table data as plain text.
# Each line in the file is a row in a table.
# Each row consists of column fields, separated by a comma.
class table(list):

    def __getitem__(self, i):
        """ A 2D list with advanced slicing: table[row1:row2, col1:col2].
            table[i] is a row, table[i, j] is a cell (or a slice of the row),
            table[i1:i2, j] is a list with item j of each selected row.
        """
        if not isinstance(i, tuple):
            return list.__getitem__(self, i)
        i, j = i
        rows = list.__getitem__(self, i)
        if isinstance(i, slice):
            return [r[j] for r in rows]
        return rows[j] # single row

    @property
    def html(self):
        """ The table as a HTML <table> string (values are not escaped).
        """
        lines = []
        for r in self:
            cells = ['<td>%s</td>' % v for v in r]
            lines.extend(['<tr>'] + cells + ['</tr>'])
        return u'\n'.join(['<table>'] + lines + ['</table>'])
# t = table()
# t.append([1, 2, 3])
# t.append([4, 5, 6])
# t.append([7, 8, 9])
#
# print(t) # [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
# print(t[0]) # [1, 2, 3]
# print(t[0,0]) # 1
# print(t[:,0]) # [1, 4, 7]
# print(t[:2,:2]) # [[1, 2], [4, 5]]
csvlib.field_size_limit(1000000000)

class CSV(table):
    """ A table of rows loaded from (and saved to) a .csv file.
    """

    @classmethod
    def rows(cls, path, separator=',', encoding='utf-8'):
        """ Returns the given .csv file as an iterator of rows,
            where each row is a list of values.
            The file is closed when the iterator is exhausted (or closed).
        """
        with open(path, 'rb') as f:
            lines = (s.replace(b'\r\n', b'\n') for s in f)
            lines = (s.replace(b'\r'  , b'\n') for s in lines)
            lines = (s.replace(b'\0'  , b''  ) for s in lines) # null byte
            s = next(lines, b'')
            # Remove a leading byte order mark (as a prefix, not as a set of characters).
            for bom in (b'\xef\xbb\xbf', b'\xff\xfe'):
                if s.startswith(bom):
                    s = s[len(bom):]
            lines = itertools.chain((s,), lines)
            e = lambda s: u(s, encoding)
            if PY3:
                # The Python 3 csv module reads Unicode strings.
                for r in csvlib.reader(map(e, lines), delimiter=separator):
                    yield r
            else:
                # The Python 2 csv module reads byte strings.
                for r in csvlib.reader(lines, delimiter=separator):
                    yield map(e, r)

    def __init__(self, name='', separator=',', rows=()):
        """ Returns the given .csv file as a list of rows,
            where each row is a list of values.
            Given rows are appended after the rows in the file.
        """
        try:
            self.name      = name
            self.separator = separator
            self.extend(CSV.rows(name, separator))
        except IOError:
            pass # doesn't exist (yet)
        if rows:
            self.extend(rows)

    def save(self, name=''):
        """ Writes the rows to the given file (by default, the file it was loaded from),
            as UTF-8, with each value quoted.
        """
        a = []
        for r in self:
            r = ('"%s"' % u(s).replace('"', '""') for s in r)
            r = self.separator.join(r)
            a.append(r)
        with io.open(name or self.name, 'w', encoding='utf-8') as f:
            f.write(u'\n'.join(a))

    def clear(self):
        """ Removes all rows.
        """
        del self[:]

csv = CSV
# data = csv('test.csv')
# data.append([1, 'hello'])
# data.save()
#
# print(data[0,0]) # 1st cell
# print(data[:,0]) # 1st column
def col(i, a):
    """ Returns an iterator of the i-th value of each row in the given list of lists
        (i.e., the i-th column).
    """
    for row in a:
        yield row[i]
def cd(*args):
    """ Returns the directory of the script that calls cd() + given relative path.
        When called from the interactive prompt, the parent of the current working directory is used.
    """
    # Frame 0 is cd() itself, frame 1 is the caller; item 1 of a frame record is its filename.
    caller = inspect.getouterframes(inspect.currentframe())[1]
    script = caller[1]
    if script == '<stdin>':
        script = os.getcwd()
    folder = os.path.dirname(os.path.realpath(script))
    return os.path.join(folder, *args)

GRASP = cd() # directory of this file
# print(cd('kb', 'en-loc.csv'))
# for code, name, who, where, what, city, lang, flag in csv(cd(GRASP, 'kb', 'en-loc.csv')):
# print(name)
#---- SQL -----------------------------------------------------------------------------------------
# A database is a collection of tables, with rows and columns of structured data.
# Rows can be edited or selected with SQL statements (Structured Query Language).
# Rows can be indexed for faster retrieval or related to other tables.
# SQLite is a lightweight engine for a portable database stored as a single file.
# https://www.sqlite.org/datatype3.html
# Maps Python types to SQLite column types (type affinity).
# Types that are not listed (including None) map to 'text'.
# Note that str and unicode are the byte string and Unicode string types
# as defined at the top of the file (in Python 3, str is bytes and maps to 'blob').
affinity = collections.defaultdict(
lambda : 'text' , {
str : 'text' ,
unicode : 'text' ,
bytes : 'blob' ,
bool : 'integer' ,
int : 'integer' ,
float : 'real'
})
def schema(table, *fields, **type):
    """ Returns an SQL CREATE TABLE statement,
        with indices on '#'-prefixed fields.
        A field 'id' is automatically added.
        Field types can be given as keyword arguments (e.g., age=int), by default text.
    """
    columns = ['id integer primary key']
    indices = []
    for k in fields:
        name = re.sub(r'^\#', '', k)                  # '#name' => 'name'
        kind = affinity[type.get(name)]               #  str    => 'text'
        columns.append('`%s` %s' % (name, kind))
    for k in fields:
        if k.startswith('#'):
            indices.append(
                'create index if not exists `%s_%s` on `%s` (`%s`);' % ((table, k[1:]) * 2))
    s = 'create table if not exists `%s` (' % table + ', '.join(columns) + ');'
    return '\n'.join([s] + indices)
# print(schema('persons', '#name', 'age', age=int))
class DatabaseError(Exception):
    """ Raised when an SQL statement can't be executed.
    """
class Database(object):

    def __init__(self, name, schema=None, timeout=10, factory=sqlite.Row):
        """ SQLite database interface.
            The given schema is a string of ';'-separated SQL statements
            (e.g., create table), executed when the database is opened.
        """
        self.connection = sqlite.connect(name, timeout)
        self.connection.row_factory = factory
        if schema:
            for statement in schema.strip(';').split(';'):
                self(statement.strip() + ';')
            self.commit()

    def __call__(self, sql, values=(), commit=False):
        """ Executes the given SQL statement and returns the cursor.
            Raises DatabaseError if the statement fails.
        """
        try:
            cursor = self.connection.cursor()
            cursor = cursor.execute(sql, values)
            if commit:
                self.connection.commit()
        except Exception as e:
            raise DatabaseError('%s' % e)
        return cursor

    def execute(self, *args, **kwargs):
        return self(*args, **kwargs)

    def commit(self):
        return self.connection.commit()

    def rollback(self):
        return self.connection.rollback()

    def save(self):
        return self('vacuum') # reduce file size

    @property
    def id(self):
        # Row id of the last inserted row.
        return self('select last_insert_rowid()').fetchone()[0]

    def find(self, table, *fields, **filters):
        q = SQL_SELECT(table, *fields, **filters)
        return self(*q)

    def append(self, table, **fields):
        # The optional commit=False keyword postpones the commit.
        commit = fields.pop('commit', True)
        q = SQL_INSERT(table, **fields)
        return self(*q, commit=commit).lastrowid # id

    def update(self, table, id, **fields):
        commit = fields.pop('commit', True)
        q = SQL_UPDATE(table, id, **fields)
        return self(*q, commit=commit).rowcount # int

    def remove(self, table, id, **fields):
        commit = fields.pop('commit', True)
        q = SQL_DELETE(table, id)
        return self(*q, commit=commit).rowcount # int

    def __del__(self):
        # Best-effort commit + close when the object is garbage collected.
        try:
            self.connection.commit()
            self.connection.close()
            self.connection = None
        except:
            pass
# db = Database(cd('test.db'), schema('persons', '#name', 'age', age=int))
# db.append('persons', name='Tom', age=30)
# db.append('persons', name='Guy', age=30)
#
# for id, name, age in db.find('persons', age='>20'):
# print(name, age)
def concat(a, format='%s', separator=', '):
    """ Returns a string with each value formatted and joined by the separator.
    """
    # concat([1, 2, 3]) => '1, 2, 3'
    parts = [format % v for v in a]
    return separator.join(parts)
def op(v):
    """ Returns an (SQL operator with placeholders, parameters)-tuple for the given filter value:
        a number (=), a list or set (in), a tuple (between),
        or a string with an optional '<=', '>=', '<>', '!=', '<', '>' prefix or '*' wildcards (like).
    """
    # op([1, 2, 3]) => 'in (?, ?, ?)', (1, 2, 3)
    if isinstance(v, (int, float)):                 # 1
        return '= ?', (v,)
    if isinstance(v, (set, list)):                  # [1, 2, 3]
        return 'in (%s)' % concat('?' * len(v)), v
    if isinstance(v, tuple):                        # (1, 2)
        return 'between ? and ?', v[:2]
    head2, head1 = v[:2], v[:1]
    if head2 in ('<=', '>=', '<>', '!='):           # '<>1'
        return head2 + ' ?', (v[2:],)
    if head1 in ('<', '>'):                         # '<1'
        return head1 + ' ?', (v[1:],)
    if '*' in v:                                    # '*ly'
        return 'like ?', (v.replace('*', '%'),)
    return '= ?', (v,)
def SQL_SELECT(table, *fields, **where):
    """ Returns an SQL SELECT statement + parameters.
        Keyword arguments are field filters (see op()), combined with AND,
        except sort (field name, by default 'id') and slice ((offset, limit), by default all rows).
    """
    columns = concat(fields, '`%s`') or '*'
    s = ''.join((
        'select %s '     % columns,
        'from `%s` '     % table,
        'where %s ',
        'order by `%s` ' % where.pop('sort', 'id'),
        'limit %s, %s;'  % where.pop('slice', (0, -1)),
    ))
    ops, values = [], []
    for o, v in map(op, where.values()):            # ('like ?', ('Tom%',)), ('> ?', ('10',))
        ops.append(o)
        values.extend(v)
    # Without filters, the where-clause is 'where 1'.
    s = s % (concat(zip(where.keys(), ops), '`%s` %s', ' and') or 1)
    return s, tuple(values)
# print(SQL_SELECT('persons', '*', age='>10', sort='age', slice=(0, 10)))
def SQL_INSERT(table, **fields):
    """ Returns an SQL INSERT statement + parameters,
        with a placeholder for each given field=value.
    """
    names = concat(fields.keys(), '`%s`')
    marks = concat('?' * len(fields.values()))
    s = 'insert into `%s` (%s) values (%s);' % (table, names, marks)
    return s, tuple(fields.values())
# print(SQL_INSERT('persons', name='Tom', age=10))
def SQL_UPDATE(table, id, **fields):
    """ Returns an SQL UPDATE statement + parameters,
        setting each given field=value for the row with the given id.
    """
    assignments = concat(fields.keys(), '`%s`=?')
    s = 'update `%s` set %s where id=?;' % (table, assignments)
    return s, tuple(fields.values()) + (id,)
# print(SQL_UPDATE('persons', 1, name='Tom', age=20))
def SQL_DELETE(table, id):
    """ Returns an SQL DELETE statement + parameters,
        for the row with the given id.
    """
    values = (id,)
    return 'delete from `%s` where id=?;' % table, values
# print(SQL_DELETE('persons', 1))
#---- ENCRYPTION ----------------------------------------------------------------------------------
# The pw() function is secure enough for storing passwords; encrypt() and decrypt() are not secure.
alphanumeric = 'abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789'

def key(n=32, chars=alphanumeric):
    """ Returns a new key of length n,
        with characters picked at random from the given string.
    """
    # Keys are secrets, so use the operating system's random source (os.urandom)
    # instead of the default pseudo-random generator, which is predictable.
    r = random.SystemRandom()
    return ''.join(r.choice(chars) for _ in range(n))
def stretch(k, n):
    """ Returns the given key as a Unicode string of length n:
        a shorter key is extended with MD5 hex digests of its last 1024 bytes,
        a longer key is cut off.
    """
    while len(k) < n:
        tail = b(k)[-1024:]
        k += hashlib.md5(tail).hexdigest()
    return u(k[:n])
def encrypt(s, k=''):
    """ Returns the encrypted string (hexadecimal), using the given key.
        This is a simple cipher that is not secure.
    """
    s = bytearray(b(s))
    # The key is stretched to the number of bytes (not the number of characters),
    # so each byte has its own key byte. Stretching to the number of characters
    # made non-ASCII text (more bytes than characters) impossible to decrypt,
    # since decrypt() stretches the key without repeating it.
    k = stretch(k, len(s))
    k = bytearray(b(k))
    s = bytearray(((i + j) % 256) for i, j in zip(s, k)) # Vigenère cipher
    s = binascii.hexlify(s)
    return u(s)
def decrypt(s, k=''):
    """ Returns the decrypted string, for the given hexadecimal string and key.
    """
    key = bytearray(b(stretch(k, len(s))))
    data = bytearray(binascii.unhexlify(s))
    # Subtract each key byte from each data byte (modulo 256).
    data = bytearray(((i - j) % 256) for i, j in zip(data, itertools.cycle(key)))
    return u(bytes(data))
# print(decrypt(encrypt('hello world', '1234'), '1234'))
def pw(s, f='sha256', n=100000):
    """ Returns the encrypted string, using PBKDF2 with hash function f, n iterations
        and a new random salt, as 'pbkdf2:<function>:<iterations>:<salt>:<hash>'.
        Only the first 1024 bytes of the string are used.
    """
    salt = base64.b64encode(os.urandom(32))
    hash = hashlib.pbkdf2_hmac(f, b(s)[:1024], salt, n)
    hash = binascii.hexlify(hash)
    return 'pbkdf2:%s:%s:%s:%s' % (f, n, u(salt), u(hash))
def pw_ok(s1, s2):
    """ Returns True if pw(s1) == s2,
        where s2 is a 'pbkdf2:<function>:<iterations>:<salt>:<hash>' string from pw().
        Raises ValueError if s2 does not have five ':'-separated parts.
    """
    _, f, n, k, s = s2.split(':')
    s1 = hashlib.pbkdf2_hmac(f, b(s1)[:1024], b(k), int(n))
    s1 = binascii.hexlify(s1)
    # Constant-time comparison of the full hashes.
    # A character-by-character zip() stops at the shortest string,
    # so that an empty or truncated stored hash would match any password.
    return hmac.compare_digest(s1, b(s))
# print(pw_ok('1234', pw('1234')))
##### ML ##########################################################################################
#---- STATISTICS ----------------------------------------------------------------------------------
def avg(a):
    """ Returns the average (mean) of the given values, as a float (0.0 if there are no values).
    """
    values = list(a)
    return sum(values) / float(len(values) or 1)
def sd(a):
    """ Returns the (population) standard deviation of the given values:
        the square root of the mean squared difference to the average.
    """
    a = list(a)
    m = avg(a)
    variance = sum((v - m) ** 2 for v in a) / (len(a) or 1)
    return math.sqrt(variance)
def peaks(a, z=1):
    """ Returns a list of indices of values that are
        more than z standard deviations above the mean.
    """
    a = list(a)
    m = avg(a)
    # If all values are equal the standard deviation is 0;
    # divide by 1 instead (each value is then 0 deviations from the mean).
    s = sd(a) or 1
    return [i for i, v in enumerate(a) if (v - m) / s > z]
# print(peaks([0, 0, 0, 10, 100, 1, 0], z=1))
#---- VECTOR --------------------------------------------------------------------------------------
# A vector is a {feature: weight} dict, with n features, or n dimensions.
# Given two points {x: 1, y: 2} and {x: 3, y: 4} in 2D,
# their distance is: sqrt((3 - 1) ** 2 + (4 - 2) ** 2).
# Distance can be calculated for points in 3D or in nD.
# Another distance metric is the angle between vectors (cosine).
# Another distance metric is the difference between vectors.
# For vectorized text cos() works well but diff() is faster.
# Vector weights are assumed to be non-negative, especially
# when using cos(), diff(), knn(), tf(), tfidf() and freq().
def index(data=[]):
    """ Returns a dict of (id(vector), label)-items
        for the given list of (vector, label)-tuples.
        Vectors are identified by object identity (id), so they don't need to be hashable.
    """
    return dict((id(v), label) for v, label in data)
def distance(v1, v2):
    """ Returns the (Euclidean) distance of the given vectors:
        the square root of the sum of squared weight differences,
        where a missing feature has weight 0.
    """
    n = 0
    # NOTE(review): features() is defined elsewhere in the file;
    # presumably it returns the features that occur in the given vectors.
    for f in features((v1, v2)):
        n += (v1.get(f, 0) - v2.get(f, 0)) ** 2
    return n ** 0.5
def dot(v1, v2):
    """ Returns the dot product of the given vectors:
        the sum of the products of the weights of each feature
        (a feature missing from v1 has weight 0).
    """
    n = 0
    for f, w in v2.items():
        n += v1.get(f, 0) * w
    return n
def norm(v):
""" Returns the norm of the given vector.
"""