# mixins.py
import re
import sys,os
import string
import unicodedata
from functools import partial
from nltk.tokenize import ToktokTokenizer
from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import TabTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import gensim
import nltk
import razdel
import segtok.segmenter  # required by SentencizerMixin.sentencize_segtok
import polyglot.text     # required by SentencizerMixin.sentencize_polyglot
#from spacy.pipeline import SentenceSegmenter
from nlptk.patterns import contractions
from nlptk.patterns import patterns
from nlptk.morphology import morphology
from nlptk.postagging import taggers
from nlptk.spelling import spellers
class RepeatReplacer():
    '''Collapse a character repeated inside a word, e.g. "looooove" -> "love".'''
    def __init__(self, patterns=None, repl=None):
        self.regex = patterns or re.compile(r"(\w*)(\w)\2(\w*)")
        self.repl = repl or r"\1\2\3"
    def replace(self, word):
        loop_res = self.regex.sub(self.repl, word)
        if word == loop_res:
            return loop_res
        else:
            return self.replace(loop_res)
class RegexReplacer():
def __init__(self, patterns):
self.patterns = patterns
def replace(self,text):
for regex,rep in self.patterns:
text = regex.sub(rep[0],text)
return text
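# Usage sketch for the two replacers above. RepeatReplacer collapses repeated
# characters one pass at a time until the word stops changing; RegexReplacer applies
# the project's contraction patterns (the same call the __main__ block makes below).
# The exact expansion text depends on nlptk.patterns.contractions.CONTRACTIONS.
def _demo_replacers():
    repeat = RepeatReplacer()
    print(repeat.replace("looooove"))  # -> "love"
    expander = RegexReplacer(contractions.CONTRACTIONS)
    print(expander.replace("I'm not gonna play tennis"))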
class SpellerMixin():
@staticmethod
def speller_en(backend='CyHunspell',**kwargs):
        '''Return an English spell checker (nlptk Speller) using the given backend.'''
return spellers.Speller(backend=backend, lang='en', **kwargs)
@staticmethod
def speller_ru(backend='CyHunspell',**kwargs):
        '''Return a Russian spell checker (nlptk Speller) using the given backend.'''
return spellers.Speller(backend=backend, lang='ru', **kwargs)
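# Usage sketch for SpellerMixin. It only wires nlptk.spelling.spellers.Speller to a
# backend ('CyHunspell' by default, which must be installed) and a language; the
# Speller object's own methods are defined in nlptk.spelling and are not shown here.
def _demo_spellers():
    speller_en = SpellerMixin.speller_en()
    speller_ru = SpellerMixin.speller_ru()
    print(type(speller_en), type(speller_ru))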
class TaggerMixin():
    tagger4 = taggers.get_tagger('4-ngram_tagger')
    tagger3 = taggers.get_tagger('3-ngram_tagger')  # needed by tagger_3ngram below
@staticmethod
def tagger_4ngram(tokens, *args, lang="eng", **kwargs):
'''4-gram tagger'''
yield from TaggerMixin.tagger4(tokens)
@staticmethod
def tagger_3ngram(tokens, *args, lang="eng", **kwargs):
'''3-gram tagger'''
yield from TaggerMixin.tagger3(tokens)
@staticmethod
def tagger_nltk(tokens, *args, lang="eng", **kwargs):
'''Perceptron tagger'''
yield from nltk.pos_tag(tokens,lang=lang)
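# Usage sketch for TaggerMixin. All taggers are generators, so wrap them in list().
# tagger_nltk wraps nltk.pos_tag and needs the averaged perceptron tagger data
# (nltk.download('averaged_perceptron_tagger')); the n-gram taggers come from
# nlptk.postagging.taggers and are assumed to yield (token, tag) pairs as well.
def _demo_taggers():
    tokens = ['Good', 'muffins', 'cost', 'money']
    print(list(TaggerMixin.tagger_nltk(tokens)))
    print(list(TaggerMixin.tagger_4ngram(tokens)))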
class SentencizerMixin():
@staticmethod
def sentencize_nltk(text, *args, lang="english", **kwargs):
        '''Segment the text into sentences.'''
SENTENCE_TOKENIZER = nltk.data.load('tokenizers/punkt/%s.pickle' % lang)
#nltk.sent_tokenize(text, language=lang)
min_len = kwargs.get('min_len',2)
for sent in SENTENCE_TOKENIZER.tokenize(text):
sent = sent.strip(string.punctuation)
if len(sent) >= min_len:
yield sent
@staticmethod
def sentencize_nltk_ru(text, *args, lang="russian", **kwargs):
        '''Segment the text into sentences.'''
SENTENCE_TOKENIZER = nltk.data.load('tokenizers/punkt/%s.pickle' % lang)
#nltk.sent_tokenize(text, language=lang)
min_len = kwargs.get('min_len',2)
for sent in SENTENCE_TOKENIZER.tokenize(text):
sent = sent.strip(string.punctuation)
if len(sent) >= min_len:
yield sent
@staticmethod
def sentencize_razdel(text, *args, **kwargs):
        '''Segment the text into sentences.'''
SENTENCE_TOKENIZER = razdel.sentenize
min_len = kwargs.get('min_len',2)
for sent in SENTENCE_TOKENIZER(text):
            stripped = sent.text.strip(string.punctuation)
            if len(stripped) >= min_len:
                yield stripped
@staticmethod
def sentencize_polyglot(text, *args, **kwargs):
        '''Segment the text into sentences.'''
SENTENCE_TOKENIZER = polyglot.text.Text
min_len = kwargs.get('min_len',2)
        for sent in SENTENCE_TOKENIZER(text).sentences:
            sent = str(sent).strip(string.punctuation)
            if len(sent) >= min_len:
                yield sent
@staticmethod
def sentencize_segtok(text, *args, **kwargs):
        '''Segment the text into sentences.'''
SENTENCE_TOKENIZER = segtok.segmenter
min_len = kwargs.get('min_len',2)
for sent in SENTENCE_TOKENIZER.split_multi(text):
sent = sent.strip(string.punctuation)
if len(sent) >= min_len:
yield sent
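# Usage sketch for SentencizerMixin. Every sentencizer is a generator over sentence
# strings. The NLTK variants need the punkt models (nltk.download('punkt')); razdel
# works out of the box; the polyglot and segtok variants need those packages installed.
def _demo_sentencizers():
    text = "Good muffins cost $3.88 in New York. Please buy me two of them. Thanks."
    print(list(SentencizerMixin.sentencize_nltk(text)))
    print(list(SentencizerMixin.sentencize_razdel(text)))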
#https://webdevblog.ru/podhody-lemmatizacii-s-primerami-v-python/
class LemmatizerMixin():
@staticmethod
def lemmatize_en(text, *args, **kwargs):
        '''Return each word together with its part-of-speech tag;
        works through the pattern library (via gensim)'''
return gensim.utils.lemmatize(text,*args,**kwargs)
@staticmethod
def lemmatize_nltk(tokens, *args, lang='eng', **kwargs):
return morphology.NLTKLemmatizer(lang=lang,**kwargs).lemmatize(tokens, **kwargs)
@staticmethod
def lemmatize_pt(tokens, *args, lang='en', **kwargs):
return morphology.PatternLemmatizer(lang=lang,**kwargs).lemmatize(tokens, **kwargs)
@staticmethod
def lemmatize(tokens, lang, **kwargs):
return morphology.PymorphyLemmatizer(lang=lang).lemmatize(tokens, **kwargs)
@staticmethod
def lemmatize_ru(tokens, lang='ru',**kwargs):
return morphology.PymorphyLemmatizer(lang=lang).lemmatize(tokens, **kwargs)
@staticmethod
def lemmatize_uk(tokens, lang='uk',**kwargs):
return morphology.PymorphyLemmatizer(lang=lang).lemmatize(tokens, **kwargs)
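# Usage sketch for LemmatizerMixin. lemmatize_en takes raw text and relies on
# gensim.utils.lemmatize (available in gensim < 4.0 with the `pattern` package);
# the other helpers take a token list and delegate to nlptk.morphology, whose exact
# return shape is defined there, so this only shows how the calls are wired.
def _demo_lemmatizers():
    print(LemmatizerMixin.lemmatize_nltk(['striped', 'bats', 'were', 'hanging']))
    print(LemmatizerMixin.lemmatize_en("The striped bats were hanging on their feet"))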
class TokenizerMixin():
@staticmethod
def simple_tokenize(text, strip=None):
        '''Tokenize input text using gensim.utils.PAT_ALPHABETIC.
Using regexp (((?![\d])\w)+)
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> list(gensim.utils.simple_tokenize(s))
['Good', 'muffins', 'cost', 'in', 'New', 'York',
'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']
'''
for token in gensim.utils.simple_tokenize(text):
if token and not token.isspace():
yield token.strip(strip)
@staticmethod
def simple_tokenize2(text, strip=None):
'''
Using regexp '\b\w+?\b'
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> RE_WORD.findall(s)
['Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York',
'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']
'''
for match in patterns.RE_WORD.finditer(text):
token = match.group()
if token and not token.isspace():
yield token.strip(strip)
@staticmethod
def token_tokenize(text, strip=None):
for token in patterns.RE_TOKEN.split(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def toktok_tokenize(text, strip=None):
'''
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> ToktokTokenizer().tokenize(text)
['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']
s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
>>> ToktokTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
'It', "'", 's', 'inexpensive.', 'Free-for-all.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
>>>
'''
for token in ToktokTokenizer().tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def space_tokenizer(text, strip=None):
''' Only " " blank character
Same as s.split(" ")
>>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
>>> SpaceTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.',
"It's", 'inexpensive.', 'Free-for-all.',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>> s.split(' ')
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.',
"It's", 'inexpensive.', 'Free-for-all.',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>>'''
for token in SpaceTokenizer().tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def whitespace_tokenizer(text, strip=None):
''' space, tab, newline
Same as s.split()
>>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
        >>> WhitespaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
"It's", 'inexpensive.', 'Free-for-all.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>>
>>> s.split()
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
"It's", 'inexpensive.', 'Free-for-all.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>>
'''
for token in WhitespaceTokenizer().tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def tab_tokenizer(text,strip=None):
'''tab-based tokenization'''
for token in TabTokenizer().tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def wordpunct_tokenize(text, strip=None):
'''
Using the regexp \w+|[^\w\s]+
>>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
>>> WordPunctTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
'It', "'", 's', 'inexpensive', '.', 'Free', '-', 'for', '-', 'all', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>>
>>> nltk.tokenize.word_tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
'It', "'s", 'inexpensive', '.', 'Free-for-all', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
'''
for token in WordPunctTokenizer().tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def treebank_word_tokenize(text, strip=None):
'''
using NLTK’s recommended word tokenizer (currently an improved
TreebankWordTokenizer along with PunktSentenceTokenizer for the specified language)
>>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
>>> nltk.word_tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
'It', "'s", 'inexpensive', '.', 'Free-for-all', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
'''
for token in nltk.word_tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def regexp_tokenize(text,pattern=None,strip=None):
'''
>>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
>>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
>>> tokenizer.tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'It', "'s", 'inexpensive', '.', 'Free', '-for-all.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
'''
tokenizer = RegexpTokenizer(pattern or patterns.RE_WORD2)
for token in tokenizer.tokenize(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def punct_tokenize(text, strip=None):
'''
>>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
>>> RE_PUNCT.split(s)
['Good muffins cost ', '$', '3', '.', '88\nin New York', '.',
' It', "'", 's inexpensive', '.', ' Free', '-', 'for', '-', 'all', '.',
' Please buy me\ntwo of them', '.', '\n\nThanks', '.', '']
'''
for token in patterns.RE_PUNCT.split(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def nonalpha_tokenize(text, strip=None):
'''
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> RE_NONALPHA.split(s)
['Good', 'muffins', 'cost', '', '3', '88', 'in', 'New', 'York',
'', '', 'Please', 'buy', 'me', 'two', 'of', 'them', '', '', 'Thanks', '']
'''
for token in patterns.RE_NONALPHA.split(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def whitespace_tokenize2(text, strip=None):
'''>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> RE_WHITESPACE.split(s)
['Good', ' ', 'muffins', ' ', 'cost', ' ', '$3.88', '\n', 'in', ' ',
'New', ' ', 'York.', ' ', 'Please', ' ', 'buy', ' ', 'me', '\n',
'two', ' ', 'of', ' ', 'them.', '\n', 'Thanks.']'''
for token in patterns.RE_WHITESPACE.split(text):
if token not in patterns.PUNCTUATION and not token.isspace():
yield token.strip(strip)
@staticmethod
def tags_tokenize(text, strip=None):
for token in patterns.RE_TAGS.split(text):
if token and not token.isspace():
yield token.strip(strip)
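# Usage sketch for TokenizerMixin. Every tokenizer above is a generator that drops
# pure punctuation and whitespace tokens, so wrap it in list() to inspect the output.
# treebank_word_tokenize needs the NLTK punkt data; the sample mirrors the docstrings.
def _demo_tokenizers():
    s = ("Good muffins cost $3.88\nin New York. It's inexpensive. "
         "Free-for-all. Please buy me\ntwo of them.\n\nThanks.")
    print(list(TokenizerMixin.simple_tokenize(s)))         # alphabetic tokens only
    print(list(TokenizerMixin.wordpunct_tokenize(s)))      # splits on \w+|[^\w\s]+
    print(list(TokenizerMixin.treebank_word_tokenize(s)))  # nltk.word_tokenize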
class StripperMixin():
    '''
    RE_PUNCT    - regexp that matches punctuation.
    RE_TAGS     - regexp that matches tags.
    RE_NUMERIC  - regexp that matches numbers.
    RE_NONALPHA - regexp that matches non-alphabetic characters.
    RE_NONASCII - regexp that matches non-ASCII characters.
    RE_AL_NUM   - regexp that matches the position between a letter and a digit.
    RE_NUM_AL   - regexp that matches the position between a digit and a letter.
    RE_SPACES   - regexp that matches whitespace characters.
    '''
@staticmethod
def strip_accent(text):
'''Remove letter accents from the given string.'''
norm = unicodedata.normalize("NFD", text)
result = ''.join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
return unicodedata.normalize("NFC", result)
#return gensim.utils.deaccent(text)
@staticmethod
def strip_quotes(text):
'''Removes a variety of quotes from the text'''
return patterns.RE_QOUTES.sub('',text)
@staticmethod
def strip_hyphenation(text):
"""Removing hyphenation."""
text = patterns.RE_HYPHENATED_WORD.sub(r"\1\2",text)
return text
@staticmethod
def strip_punctuation(text,marker=' '):
        '''Replace punctuation characters in s with `marker` (a space by default) using RE_PUNCT2
>>> strip_punctuation(string.punctuation)
' '
>>>
'''
return patterns.RE_PUNCT2.sub(marker, text)
#return gensim.parsing.preprocessing.strip_punctuation(text)
@staticmethod
def strip_tags(text,marker=''):
'''Remove tags from s using RE_TAGS.
>>> strip_tags('<href="http://google.com">')
''
>>>
'''
return patterns.RE_TAGS.sub(marker, text)
#return gensim.parsing.preprocessing.strip_tags(text)
@staticmethod
def strip_urls(text, marker=''):
        '''Remove URLs using RE_URLS'''
return patterns.RE_URLS.sub(marker, text)
@staticmethod
def strip_multiple_whitespaces(text):
'''Remove repeating whitespace characters (spaces, tabs, line breaks)
from s and turns tabs & line breaks into spaces using RE_WHITESPACE'''
return patterns.RE_WHITESPACE.sub(" ", text)
#return gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
@staticmethod
def strip_digit(text,marker=''):
''' Remove digits (0..9 + some others) from s using RE_DIGIT'''
return patterns.RE_DIGIT.sub(marker, text)
#return gensim.parsing.preprocessing.strip_numeric(text)
@staticmethod
def strip_decimal(text,marker=''):
''' Remove decimal from s using RE_DECIMAL'''
return patterns.RE_DECIMAL.sub(marker, text)
#return gensim.parsing.preprocessing.strip_numeric(text)
@staticmethod
def strip_numeric(text,marker=''):
''' Remove numeric from s using RE_NUMERIC'''
return patterns.RE_NUMERIC.sub(marker, text)
#return gensim.parsing.preprocessing.strip_numeric(text)
@staticmethod
def strip_roman_numerals(text, marker=''):
        '''Remove Roman numerals from s using RE_ROMAN_NUMERALS'''
return patterns.RE_ROMAN_NUMERALS.sub(marker, text)
@staticmethod
def strip_nonletter_sequences(text, marker=' '):
''' Remove non-letter sequences'''
return patterns.RE_NONLETTER.sub(marker, text)
@staticmethod
def strip_contractions(text):
'''Replacing common contractions'''
return RegexReplacer(contractions.CONTRACTIONS).replace(text)
@staticmethod
def strip_possessive_endings(text, marker=''):
        '''Remove possessive endings ('s) using RE_POSSESSIVE_ENDINGS'''
return patterns.RE_POSSESSIVE_ENDINGS.sub(marker, text)
#--------------------------------------------------
@staticmethod
def strip_non_alphanum(text):
''' Remove non-alphabetic characters from s using RE_NONALPHA'''
return patterns.RE_NONALPHA.sub(" ", text)
#return gensim.parsing.preprocessing.strip_non_alphanum(text)
@staticmethod
def strip_nonasci(text):
''' Remove non-ASCII characters from s using RE_NONASCII'''
return patterns.RE_NONASCII.sub('',text)
@staticmethod
def strip_stopwords(text):
'''Remove STOPWORDS from s'''
return gensim.parsing.preprocessing.remove_stopwords(text)
@staticmethod
def strip_short(text,minsize=3):
'''Remove words with length lesser than minsize from s.'''
return " ".join(e for e in text.split() if len(e) >= minsize)
#return gensim.parsing.preprocessing.strip_short(text,minsize=minsize)
@staticmethod
def split_alphanum(text):
'''Add spaces between digits & letters in s using RE_AL_NUM'''
        s = patterns.RE_AL_NUM.sub(r"\1 \2", text)
        return patterns.RE_NUM_AL.sub(r"\1 \2", s)
#return gensim.parsing.preprocessing.split_alphanum(text)
@staticmethod
def strip_chars(text, chars=string.punctuation):
        '''Strip leading and trailing chars (punctuation by default) from the string.'''
return text.strip(chars)
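# Usage sketch for StripperMixin. The strip_* helpers are plain str -> str functions,
# so they compose into a cleaning pipeline; the exact output depends on the regexes
# defined in nlptk.patterns (RE_TAGS, RE_URLS, RE_PUNCT2, RE_NUMERIC, RE_WHITESPACE).
def _demo_strippers():
    raw = '<p>Visit http://example.com -- "quoted", 3.14 and 42 muffins!</p>'
    text = StripperMixin.strip_tags(raw)
    text = StripperMixin.strip_urls(text)
    text = StripperMixin.strip_punctuation(text)
    text = StripperMixin.strip_numeric(text)
    text = StripperMixin.strip_multiple_whitespaces(text)
    print(StripperMixin.strip_chars(text.strip()))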
class RemoverMixin():
#--------------------------------------------------
@staticmethod
def remove_short(tokens, minsize=3):
        '''Remove tokens shorter than minsize characters'''
result = []
if not minsize: return tokens
for token in tokens:
tok = token.lemma if hasattr(token,'lemma') else token
if len(tok) >= minsize:
result.append(token)
return result
@staticmethod
def remove_stopwords(tokens, stopwords=[]):
        '''Remove stopwords from the token list'''
result = []
if not stopwords: return tokens
for token in tokens:
tok = token.lemma if hasattr(token,'lemma') else token
if tok.lower() not in stopwords:
result.append(token)
return result
@staticmethod
def remove_ifnotin_lexicon(tokens, lexicons=[]):
'''Lexicons is a list of lists of words'''
result = []
if not lexicons: return tokens
for token in tokens:
tok = token.lemma if hasattr(token,'lemma') else token
            for lex in lexicons:
                if any(t in lex for t in [tok, tok.capitalize()]):
                    result.append(token)
                    break
return result
@staticmethod
def remove_if_proper_name(tokens, names=[]):
'''Names is a list of words'''
result = []
if not names: return tokens
for token in tokens:
tok = token.lemma.lower() if hasattr(token,'lemma') else token.lower()
if tok.capitalize() not in names:
result.append(token)
return result
@staticmethod
def remove_by_tagpos(tokens, allowed_tags=[],disallowed_tags=[]):
'''
allowed_tags is a list or set of allowed tags for parts of speech
disallowed_tags is a list or set of disallowed tags for parts of speech
'''
def allowed():
for token in tokens:
pos = token.pos if hasattr(token,'pos') else ''
if pos in allowed_tags:
result.append(token)
def disallowed():
for token in tokens:
pos = token.pos if hasattr(token,'pos') else ''
if pos not in disallowed_tags:
result.append(token)
result = []
if (allowed_tags and disallowed_tags):
allowed_tags = set(allowed_tags) - set(disallowed_tags)
allowed()
elif (allowed_tags and not disallowed_tags):
allowed()
elif not allowed_tags and disallowed_tags:
disallowed()
else:
result = tokens
return result
@staticmethod
def remove_punctuation(tokens,chars=string.punctuation):
'''Removes tokens that represent punctuation characters'''
result = []
for token in tokens:
tok = token.word if hasattr(token,'word') else token
if tok not in chars:
result.append(token)
return result
@staticmethod
def remove_case(tokens,*args):
'''Removes the case of words'''
result = []
for token in tokens:
            if hasattr(token, 'lemma'):
token.lemma = token.lemma.lower()
else:
token = token.lower()
result.append(token)
return result
@staticmethod
def remove_trailing_chars(tokens,chars=string.punctuation):
'''Removes the start and end characters from each token'''
result = []
for token in tokens:
if hasattr(token,'word'):
token.word = token.word.strip(chars)
else:
token = token.strip(chars)
result.append(token)
return result
#-----------------------------------------------
    # NOT USED, and therefore not adapted to work with the Token class
@staticmethod
def remove_quotes(tokens, *args):
'''Removes a variety of quotes from the token'''
result = [patterns.RE_QOUTES.sub('',token) for token in tokens]
return result
@staticmethod
def remove_nonasci(tokens, *args):
'''Removes tokens that contain non-ascii characters'''
def is_ascii(s):
return all(ord(c) < 128 for c in s)
return list(filter(lambda token: is_ascii(token),tokens))
@staticmethod
def remove_nonalphabetic(tokens,other=''):
'''Removes tokens that contain something other than Latin letters'''
letters = set(string.ascii_letters + other)
def ascii_letters(s):
nonlocal letters
return all(c in letters for c in s)
return filter(lambda token: ascii_letters(token),tokens)
@staticmethod
    def remove_empty(tokens, *args):
        '''Removes empty and whitespace-only tokens'''
        return filter(
            lambda token: token not in ('\r\n', '\r', '\n', '\t', '', ' '),
            tokens)
@staticmethod
def remove_numeric(tokens, *args):
'''Removes numeric'''
def is_numeric(token):
return token.isnumeric()
return filter(lambda token: not is_numeric(token),tokens)
@staticmethod
def remove_roman_numerals(tokens, *args):
'''Removes Roman numerals'''
def is_roman(token):
            return patterns.RE_ROMAN_NUMERALS.match(token)
return filter(lambda token: not is_roman(token),tokens)
@staticmethod
def remove_stopwords2(tokens, stopwords):
        '''Remove stopwords from the token list'''
return gensim.corpora.textcorpus.remove_stopwords(tokens, stopwords)
@staticmethod
def remove_short2(tokens, minsize=3):
'''Remove tokens shorter than `minsize` chars'''
return gensim.corpora.textcorpus.remove_short(tokens, minsize=minsize)
@staticmethod
def make_lower(tokens, *args):
return list(map(str.lower,tokens))
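# Usage sketch for RemoverMixin. The remove_* helpers accept plain strings or
# Token-like objects with .lemma/.word attributes; plain strings exercise only the
# string branch of each hasattr() check. Some helpers return lists, others lazy
# filter objects, so the final result is wrapped in list().
def _demo_removers():
    tokens = ['The', 'cats', ',', 'sat', 'on', '12', 'mats', '']
    tokens = RemoverMixin.remove_punctuation(tokens)         # drops ',' and ''
    tokens = RemoverMixin.remove_numeric(tokens)             # drops '12'
    tokens = RemoverMixin.remove_short(tokens, minsize=3)    # drops 'on'
    tokens = RemoverMixin.remove_stopwords(tokens, ['the'])  # drops 'The'
    print(list(tokens))                                      # ['cats', 'sat', 'mats']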
# NOT USED
class FilterMixin():
#----------------------------------------------------------
    # token post-processing filters
    # each filter must return True for a token that should be kept
    #----------------------------------------------------------
    # filter: non-empty lemma
@staticmethod
def in_nonempty_lemmas(token):
token = token.lemma if hasattr(token,'lemma') else token
return token not in ('',None)
    # filter: allowed POS tags
@staticmethod
def in_allowed_tags(token, allowed_tags=[]):
pos = token.pos if hasattr(token,'pos') else ''
if not allowed_tags:
result = True
else:
result = pos in allowed_tags
return result
    # filter: disallowed POS tags
@staticmethod
def in_nondisallowed_tags(token, disallowed_tags=[]):
pos = token.pos if hasattr(token,'pos') else ''
return pos not in disallowed_tags
@staticmethod
def isnot_proper_name(token, lexicon=[]):
token = token.lemma.lower() if hasattr(token,'lemma') else token.lower()
return token.capitalize() not in lexicon
@staticmethod
def isnot_proper_name2(token):
indexes = token.indexes
token_ = token.word if hasattr(token,'word') else token
result = not (token_[0].isupper() and 0 not in indexes)
#if not result:
# print(repr(token), result)
return result
#----------------------------------------------
    # filter: the lemma belongs to one of the given lexicons
@staticmethod
def in_lexicon1(token, lexs):
        return any(tok in lex
                   for tok in [token.lemma, token.lemma.capitalize()]
                   for lex in lexs
                   )
    # filter: the lemma belongs to one of the given lexicons
@staticmethod
def in_lexicon(token, lexicons):
result = False
token = token.lemma.lower() if hasattr(token,'lemma') else token.lower()
for lex in lexicons:
if any(tok in lex for tok in [token,token.capitalize()]):
result = True
break
return result
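# Usage sketch for FilterMixin. Each filter returns True for a token that should be
# kept, so they plug directly into filter() or a comprehension; plain strings exercise
# the non-Token branch of the hasattr() checks. `partial` is already imported above.
def _demo_filters():
    tokens = ['london', 'cat', '', 'dog']
    not_name = partial(FilterMixin.isnot_proper_name, lexicon=['London'])
    kept = [t for t in tokens
            if FilterMixin.in_nonempty_lemmas(t) and not_name(t)]
    print(kept)  # ['cat', 'dog']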
class PreprocessorMixin():
@staticmethod
def stem_text(text,**kwargs):
'''Transform s into lowercase and stem it'''
return gensim.parsing.preprocessing.stem_text(text)
@staticmethod
def simple_preprocess(text, **kwargs):
'''
Convert a document into a list of lowercase tokens,
ignoring tokens that are too short or too long.
deacc=False, min_len=2, max_len=15
Uses gensim.utils.tokenize => gensim.utils.simple_tokenize
'''
return gensim.utils.simple_preprocess(text, **kwargs)
@staticmethod
def preprocess_string(text,filters):
'''Apply list of chosen filters to `s`.
Default list of filters:
strip_tags(),
strip_punctuation(),
strip_multiple_whitespaces(),
strip_numeric(),
remove_stopwords(),
strip_short(),
stem_text().'''
return gensim.parsing.preprocessing.preprocess_string(text,filters=filters)
'''
gensim.corpora.textcorpus.TextCorpus
preprocess_text() uses:
lower_to_unicode() - lowercase and convert to unicode (assumes utf8 encoding)
deaccent()- deaccent (asciifolding)
strip_multiple_whitespaces() - collapse multiple whitespaces into a single one
simple_tokenize() - tokenize by splitting on whitespace
remove_short() - remove words less than 3 characters long
remove_stopwords()
'''
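# Usage sketch for PreprocessorMixin. simple_preprocess wraps
# gensim.utils.simple_preprocess; preprocess_string expects a list of callables,
# such as the gensim.parsing.preprocessing filters referenced in the comments above.
def _demo_preprocessors():
    s = "<b>Hello</b> World 42, hello worlds!"
    print(PreprocessorMixin.simple_preprocess(s, min_len=2, max_len=15))
    filters = [gensim.parsing.preprocessing.strip_tags,
               gensim.parsing.preprocessing.strip_punctuation,
               gensim.parsing.preprocessing.strip_numeric]
    print(PreprocessorMixin.preprocess_string(s, filters))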
if __name__ =="__main__":
text = """I am in your team, aren’t I?
I’m not gonna play tennis with you"""
replacer = RegexReplacer(contractions.CONTRACTIONS)
print(replacer.replace(text))