From 242c80ede3ba60152da7eee21703b48657cb260e Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Sat, 20 Oct 2018 18:52:10 +0530 Subject: [PATCH 01/11] fix phraser memory --- gensim/models/phrases.py | 4 ++-- gensim/test/test_data/phraser_model_3dot6 | Bin 0 -> 543 bytes gensim/test/test_phrases.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 gensim/test/test_data/phraser_model_3dot6 diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9d8a5f5da6..247f5532a9 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -805,7 +805,7 @@ def __init__(self, phrases_model): for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): if bigram in self.phrasegrams: logger.info('Phraser repeat %s', bigram) - self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score) + self.phrasegrams[bigram] = (None, score) count += 1 if not count % 50000: logger.info('Phraser added %i phrasegrams', count) @@ -848,7 +848,7 @@ def score_item(self, worda, wordb, components, scorer): """ try: - return self.phrasegrams[tuple(components)][1] + return self.phrasegrams[tuple(components)][-1] except KeyError: return -1 diff --git a/gensim/test/test_data/phraser_model_3dot6 b/gensim/test/test_data/phraser_model_3dot6 new file mode 100644 index 0000000000000000000000000000000000000000..ea6e2643744ebc22c39c2d744eac07203da18b2c GIT binary patch literal 543 zcmZ{g%We}f6o!*Dmu6ZZ6aoc$0b1G=XxhRHuw#)8nw2*rYvQ;wmd18|d!|w=RicZ| z!|)=!7RS?Uy1=qz>pTB{ZofMslg2AO)pj8ZKV9Z7_0mVbn%6~Wo!p^)i_U}%Tdh`^ zJLz*s zJiEsz#&fnoriWQGmy&1e7B41wpLL0sN=v-VsjYF0S1VX~8t8R^+cDnMcsuL-{IUI~ zIW^vqsu!$IN{^i&+l{feqEup;V?WUMF+SAz7|bL5-Jx6K;F0Vw$Uen5A{v%N3%1VV zIN(W)lNz5NmHtw(b&_#zD!sh+_?qB!MZ8dfOcR{3sL`m5lF5a~d4dZzZkUsza$a3Y R?$aw-*p$NG5`3@b(|^@5sb&BG literal 0 HcmV?d00001 diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index e83bf5b2b9..717a8dda25 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -646,6 +646,16 @@ def testEncoding(self): self.assertTrue(isinstance(transformed, six.text_type)) +class TestPhraserModelCompatibilty(unittest.TestCase): + + def testCompatibilty(self): + bigram_loaded = Phraser.load(datapath("phraser_model_3dot6")) + test_sentences = [u'trees', u'graph', u'minors'] + prev_ver = bigram_loaded[test_sentences] + expected_res = ['trees_graph', 'minors'] + self.assertEqual(prev_ver, expected_res) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From bba2e464509414808ee98bdbce4dee6a62490177 Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Fri, 26 Oct 2018 09:39:36 +0530 Subject: [PATCH 02/11] reduce phraser memory --- gensim/models/phrases.py | 7 +++++-- gensim/test/test_data/phraser_model_3dot6 | Bin 543 -> 528 bytes gensim/test/test_phrases.py | 1 - 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 247f5532a9..70b738e3ad 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -805,7 +805,7 @@ def __init__(self, phrases_model): for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): if bigram in self.phrasegrams: logger.info('Phraser repeat %s', bigram) - self.phrasegrams[bigram] = (None, score) + self.phrasegrams[bigram] = score count += 1 if not count % 50000: logger.info('Phraser added %i phrasegrams', count) @@ -848,7 +848,10 @@ def score_item(self, worda, wordb, components, scorer): """ try: - return self.phrasegrams[tuple(components)][-1] + if list(self.phrasegrams.values())[0].__class__ is tuple: + return self.phrasegrams[tuple(components)][-1] + else: + return self.phrasegrams[tuple(components)] except KeyError: return -1 diff --git a/gensim/test/test_data/phraser_model_3dot6 b/gensim/test/test_data/phraser_model_3dot6 index ea6e2643744ebc22c39c2d744eac07203da18b2c..ec0f634a2fc21ebc5553545e598581c5ed9f3d05 100644 GIT binary patch delta 200 zcmbQwGJ$1-HDkTIgTyK@$Y6_LWnf?^DN0Q(E|kvTXe*QnDwKtarWYj^WE9GQMCF4D z71|0F-5mr9kIMiNiV7u=3gw_e6`&{^(6rply!@i#LRFBkT2P^STcHNr!qNy11_l#` z`1rii+=9yDLe1DhEszjUq&PVfB%&Q#s1v~j7RgKp8kCw+T&Npcs5kj9qcppIY@tDE Gk{$ql+CHHG delta 215 zcmbQhGM{CGHDkRule>e&Dllj(l+Iv_U}a!nC@D%!EiRPF;Akt94Jwp_il!GO7GxC4 zgG3dA3KiQ5l|Y&W3XjVGQCp!hiZT_DGS#3$HJ~UP(8%1(y!@i#LUoX^Mo^(*s?a~Mipt|_@ywco)%Hl$u*g{>XNO5u|NJKBTP(OkTtR^!ZXj*DYaiKwMq2c6r OjMD5zv4zH^NqPVkIYR>g diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 717a8dda25..82543ea0a7 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -12,7 +12,6 @@ import unittest import six - import numpy as np from gensim.utils import to_unicode From 9f9b05f407f01843b0ff59ae6b6f6d55c647121c Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Thu, 22 Nov 2018 10:45:01 +0530 Subject: [PATCH 03/11] using isinstance --- gensim/models/phrases.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 70b738e3ad..9ba5415090 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -848,10 +848,11 @@ def score_item(self, worda, wordb, components, scorer): """ try: - if list(self.phrasegrams.values())[0].__class__ is tuple: - return self.phrasegrams[tuple(components)][-1] + score = self.phrasegrams[tuple(components)] + if isinstance(score, tuple): + return score[1] else: - return self.phrasegrams[tuple(components)] + return score except KeyError: return -1 From c391fe5a77b5321874841826a48bb46f6d54e22c Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Mon, 26 Nov 2018 10:55:39 +0530 Subject: [PATCH 04/11] update model when loaded --- gensim/models/phrases.py | 4 ++++ gensim/test/test_data/phraser_model_3dot6 | Bin 528 -> 558 bytes 2 files changed, 4 insertions(+) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9ba5415090..6cc035d248 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -208,6 +208,10 @@ def load(cls, *args, **kwargs): """ model = super(PhrasesTransformation, cls).load(*args, **kwargs) # update older models + # if value in phrasegrams dict is a tuple, load only the scores. + if len(model.__dict__['phrasegrams']): + if isinstance(list(model.__dict__['phrasegrams'].values())[0], tuple): + model.__dict__['phrasegrams'].update((k, v[1]) for k, v in model.__dict__['phrasegrams'].items()) # if no scoring parameter, use default scoring if not hasattr(model, 'scoring'): logger.info('older version of %s loaded without scoring function', cls.__name__) diff --git a/gensim/test/test_data/phraser_model_3dot6 b/gensim/test/test_data/phraser_model_3dot6 index ec0f634a2fc21ebc5553545e598581c5ed9f3d05..36c2fef36516eb6dc56f2210e9a64b6975468ab1 100644 GIT binary patch delta 40 ocmbQhvW`WlfhjpXHLo}`H$Ek`EHx*;KyM=RJWe=gS(7cWN^BDmrodr+; From d154e3a0926882c437073bf4c97072a7cdccbdc3 Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Mon, 26 Nov 2018 11:01:51 +0530 Subject: [PATCH 05/11] update model when loaded --- gensim/models/phrases.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 6cc035d248..075ed2aaf2 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -852,11 +852,7 @@ def score_item(self, worda, wordb, components, scorer): """ try: - score = self.phrasegrams[tuple(components)] - if isinstance(score, tuple): - return score[1] - else: - return score + return self.phrasegrams[tuple(components)] except KeyError: return -1 From 40b667274e2409a62e31513ac5ba4f38eb4fea3b Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Mon, 26 Nov 2018 13:22:53 +0530 Subject: [PATCH 06/11] update model when loaded --- gensim/models/phrases.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 075ed2aaf2..70c7a1b298 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -209,9 +209,11 @@ def load(cls, *args, **kwargs): model = super(PhrasesTransformation, cls).load(*args, **kwargs) # update older models # if value in phrasegrams dict is a tuple, load only the scores. - if len(model.__dict__['phrasegrams']): + try: if isinstance(list(model.__dict__['phrasegrams'].values())[0], tuple): model.__dict__['phrasegrams'].update((k, v[1]) for k, v in model.__dict__['phrasegrams'].items()) + except KeyError: + pass # if no scoring parameter, use default scoring if not hasattr(model, 'scoring'): logger.info('older version of %s loaded without scoring function', cls.__name__) From 40dcbdef893fc39305f54f197e2eed4b6232b36d Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Mon, 3 Dec 2018 10:03:19 +0530 Subject: [PATCH 07/11] updated changes --- gensim/models/phrases.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 70c7a1b298..05dce310a3 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -210,10 +210,12 @@ def load(cls, *args, **kwargs): # update older models # if value in phrasegrams dict is a tuple, load only the scores. try: - if isinstance(list(model.__dict__['phrasegrams'].values())[0], tuple): - model.__dict__['phrasegrams'].update((k, v[1]) for k, v in model.__dict__['phrasegrams'].items()) + for components, scores in model.__dict__['phrasegrams'].items(): + if isinstance(scores, tuple): + model.__dict__['phrasegrams'][components] = scores[1] except KeyError: pass + # if no scoring parameter, use default scoring if not hasattr(model, 'scoring'): logger.info('older version of %s loaded without scoring function', cls.__name__) From 21c391117ff2bcd180fec339d83b76f9b649799f Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Tue, 4 Dec 2018 10:06:08 +0530 Subject: [PATCH 08/11] updated changes --- gensim/models/phrases.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 05dce310a3..76583d80c0 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -209,12 +209,12 @@ def load(cls, *args, **kwargs): model = super(PhrasesTransformation, cls).load(*args, **kwargs) # update older models # if value in phrasegrams dict is a tuple, load only the scores. - try: - for components, scores in model.__dict__['phrasegrams'].items(): - if isinstance(scores, tuple): - model.__dict__['phrasegrams'][components] = scores[1] - except KeyError: - pass + if model.phrasegrams: + components = model.phrasegrams.keys() + for component in components: + score = model.phrasegrams[component] + if isinstance(score, tuple): + model.phrasegrams[component] = score[1] # if no scoring parameter, use default scoring if not hasattr(model, 'scoring'): From 80e922208df42313be517642e9cd2e9e37752694 Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Tue, 4 Dec 2018 15:18:35 +0530 Subject: [PATCH 09/11] update changes --- gensim/models/phrases.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 76583d80c0..40b564be10 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -210,11 +210,11 @@ def load(cls, *args, **kwargs): # update older models # if value in phrasegrams dict is a tuple, load only the scores. if model.phrasegrams: - components = model.phrasegrams.keys() - for component in components: + for component in model.phrasegrams.keys(): score = model.phrasegrams[component] if isinstance(score, tuple): - model.phrasegrams[component] = score[1] + frequency, score_val = score + model.phrasegrams[component] = score_val # if no scoring parameter, use default scoring if not hasattr(model, 'scoring'): From 021226a3d72147f5ee0d7299060ca17ae8a4682b Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 17:18:29 +0500 Subject: [PATCH 10/11] fix loading --- gensim/models/phrases.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index fc080993eb..55424904f1 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -209,12 +209,11 @@ def load(cls, *args, **kwargs): model = super(PhrasesTransformation, cls).load(*args, **kwargs) # update older models # if value in phrasegrams dict is a tuple, load only the scores. - if model.phrasegrams: - for component in model.phrasegrams.keys(): - score = model.phrasegrams[component] - if isinstance(score, tuple): - frequency, score_val = score - model.phrasegrams[component] = score_val + + for component, score in getattr(model, "phrasegrams", {}).items(): + if isinstance(score, tuple): + frequency, score_val = score + model.phrasegrams[component] = score_val # if no scoring parameter, use default scoring if not hasattr(model, 'scoring'): From 9de249546dc4654995c23cff57c988d13da56a79 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 17:19:40 +0500 Subject: [PATCH 11/11] make test better --- gensim/test/test_data/phraser-3.6.0.model | Bin 0 -> 614 bytes gensim/test/test_data/phraser_model_3dot6 | Bin 558 -> 0 bytes gensim/test/test_data/phrases-3.6.0.model | Bin 0 -> 1401 bytes gensim/test/test_phrases.py | 16 +++++++++++----- 4 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 gensim/test/test_data/phraser-3.6.0.model delete mode 100644 gensim/test/test_data/phraser_model_3dot6 create mode 100644 gensim/test/test_data/phrases-3.6.0.model diff --git a/gensim/test/test_data/phraser-3.6.0.model b/gensim/test/test_data/phraser-3.6.0.model new file mode 100644 index 0000000000000000000000000000000000000000..4416b13867e35891228371431ea762bf86784970 GIT binary patch literal 614 zcmZ{gOK;RL6oqG|?X=AlTA<|(rR6n+GEklcHr;hG8|0NYk(GMf#8Pa}v6GQnsS+&6 zk{`m~;*W40X43^$BuC#l_ug~;y(e>JgEk}MCaMghY2mY=Lh`M>eS*&34|Ffk8}ijo zr&ASPg~F8+ER4G?G1?00YFlA(PN6oXlB9`}AyGE|Bz= z8r&WCj*kYv4*$^C;2v35$VzWCbu<$9qw@j7LlPEAsI*boj_?t~PJ`W8zD)w129M|S zDbcjRlW2R&@Qi4DEP80?1D;1bWY}x)V!UvCwE4R|4PIXM?ML4$hS3~{+RxM(UPt_f z;h@3WOWEJmZFC`QZKh`d?^Ar3lOS~@A5(nd2_+Fi=afZF0uEDr=36cEO4dH;nGzwJ PsnTU6e@XGRJ{kQ5a>Kn( literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/phraser_model_3dot6 b/gensim/test/test_data/phraser_model_3dot6 deleted file mode 100644 index 36c2fef36516eb6dc56f2210e9a64b6975468ab1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 558 zcma))&2H2%6oqF>J8jbfZJ|)0{PlN+mH=f9!lVtbCAR4UR)Wx+fQ zFT!haosN*$VOg^M`P_4_uY9+pDzZTvS<($PZkpa7Q{zf%LOQK|5hx_z`qd{GOnzWE z!_ru+4+ev__7rN@lz8xU*b}3zRIalvmKTJ5w9zejtSG)JNri-L+$0!@A-hPdiV?3( z(Q5kzYjdn;NW?0mpq$sX!p8rV$>K`eqLDF+1_#|gz48Ue?9A|FT#v*u^Hv+e)0(T?&G2jiyTI>z9^qbw z7jwM4yx9BI4;HM%C{GmdDk}Cfyk1atpVoL2>DvtN=6HX3v4!9JJpWM~M8#o-Bc`!a z+-2tjjw3$Fa5~5MZ^BNju|i7Qndxo7B*%vZ2}(zDmg8I`y+&8;LuJ6n9I4ptnJ3lx Upl?WqB7TX2uRrDZ++C%A0NEn30ssI2 diff --git a/gensim/test/test_data/phrases-3.6.0.model b/gensim/test/test_data/phrases-3.6.0.model new file mode 100644 index 0000000000000000000000000000000000000000..65b831439f00d863af5a56fb503f3320eb087736 GIT binary patch literal 1401 zcmZ{k*?$yO5QS%wur-7PFbW8=$P!e-BBBBc3T_N*xwV1e+S7eA)7snV+dYsVf)Dt2 zTUFhe(FdPqe)XMGx9V2)eKivfZRX;1CCww7xRqj52F^O~LHmo&nZM9|fth80Tc^`; zVP3}B5VQYdV>*npATc7erH46x2iwv(Ghtq56}I%c8mUI5b)!6qu$3dbMH4(T<2(!o z#>E>J+irJO2Q&T_CK|=|FwB$0hE<$r&Wr3?P$yLshn0sNp)rFxPAWdecuAfOJ$SFN zldx+!ry(4D( z2yZA`V2j$#RIwzAHwka`X4t)*6=9whH7CG+K?ev2742m-v2kgw=xq@lA{_2@*)K)8 zx*Xeeyd&TU;iw)x$Dnqc^q7Q?6W-PE&WYYAJt6cY;gn{ZZ%xzFP77ZqoSB}s?kbz& zJpu0%KIqMH8Pe!xg~m0Cv!YlboKveb%5yAuA{OTbT_9Z4M#bEC%tY~_C@v8`QnNHu z#i_+*34cuZM2%AJro$-zRQMIbRpqnHZQ*cDqMs2y*Qiv&(Wd|Ff^HDL&~s$Dm%##)(g@Jnd&os$a4( zS)CT^4$t~nZzC>@C9=`SbFJQ(c*vJ(BZoiwc;PQevD{Bsmo6S#