From 5355c0651164b4096faa197f3c8a53193f81e882 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Fri, 23 Feb 2018 09:14:56 +0500 Subject: [PATCH] Refactor documentation for `gensim.similarities.docsim` and `MmCorpus-related`. (#1910) * Added example for text_corpus.py * Fix for example * Updated docstrings for docsim.py * Beta_docstrings for docsim.py * Gamma_docstrings for docsim.py * Massive package of different files. * fix build (PEP8, rst) * retranslate _mmreader.pyx with cython==0.27.3 * fix matutils * fix textcorpus * fix mmcorpus * fix mmreader[2] * fix docsim[1] * fix docsim[2] * fix docsim[3] * fix docsim[4] * fix docsim[5] --- docs/src/apiref.rst | 1 + docs/src/corpora/_mmreader.rst | 9 + gensim/corpora/_mmreader.c | 971 +++++++++++++++++---------------- gensim/corpora/_mmreader.pyx | 73 +-- gensim/corpora/mmcorpus.py | 81 ++- gensim/corpora/textcorpus.py | 21 +- gensim/matutils.py | 97 ++-- gensim/similarities/docsim.py | 767 +++++++++++++++++++------- 8 files changed, 1211 insertions(+), 809 deletions(-) create mode 100644 docs/src/corpora/_mmreader.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 7d51a8b906..6176c7c80d 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -20,6 +20,7 @@ Modules: corpora/lowcorpus corpora/malletcorpus corpora/mmcorpus + corpora/_mmreader corpora/sharded_corpus corpora/svmlightcorpus corpora/textcorpus diff --git a/docs/src/corpora/_mmreader.rst b/docs/src/corpora/_mmreader.rst new file mode 100644 index 0000000000..b2802453b2 --- /dev/null +++ b/docs/src/corpora/_mmreader.rst @@ -0,0 +1,9 @@ +:mod:`corpora._mmreader` -- Reader for corpus in the Matrix Market format. +========================================================================== + +.. automodule:: gensim.corpora._mmreader + :synopsis: Reader for corpus in the Matrix Market format. 
+ :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/corpora/_mmreader.c b/gensim/corpora/_mmreader.c index be59977333..345984e3ed 100644 --- a/gensim/corpora/_mmreader.c +++ b/gensim/corpora/_mmreader.c @@ -725,7 +725,7 @@ static const char *__pyx_filename; static const char *__pyx_f[] = { - "_mmreader.pyx", + "gensim/corpora/_mmreader.pyx", "stringsource", }; @@ -735,12 +735,12 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__; struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr; struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__; -/* "gensim/corpora/_mmreader.pyx":21 +/* "gensim/corpora/_mmreader.pyx":19 * * * cdef class MmReader(object): # <<<<<<<<<<<<<< - * """ - * matrix market file reader + * """Matrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. + * */ struct __pyx_obj_6gensim_7corpora_9_mmreader_MmReader { PyObject_HEAD @@ -752,12 +752,12 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader_MmReader { }; -/* "gensim/corpora/_mmreader.pyx":48 +/* "gensim/corpora/_mmreader.pyx":45 * cdef public int num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< * """ - * MmReader(input, transposed=True): + * */ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ { PyObject_HEAD @@ -765,7 +765,7 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ { }; -/* "gensim/corpora/_mmreader.pyx":83 +/* "gensim/corpora/_mmreader.pyx":75 * line = utils.to_unicode(line) * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) # <<<<<<<<<<<<<< @@ -782,12 +782,12 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr { }; -/* "gensim/corpora/_mmreader.pyx":117 +/* "gensim/corpora/_mmreader.pyx":107 * break * * def __iter__(self): # <<<<<<<<<<<<<< - * """ - * __iter__() 
+ * """Iterate through corpus. + * */ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ { PyObject_HEAD @@ -1356,7 +1356,7 @@ static const char __pyx_k_pyx_unpickle_MmReader[] = "__pyx_unpickle_MmReader"; static const char __pyx_k_gensim_corpora__mmreader[] = "gensim.corpora._mmreader"; static const char __pyx_k_matrixmarket_matrix_coordinate[] = "%%matrixmarket matrix coordinate real general"; static const char __pyx_k_MmCorpus_i_documents_i_features[] = "MmCorpus(%i documents, %i features, %i non-zero entries)"; -static const char __pyx_k_Reader_for_corpus_in_the_Matrix[] = "\nReader for corpus in the Matrix Market format.\n\n"; +static const char __pyx_k_Reader_for_corpus_in_the_Matrix[] = "Reader for corpus in the Matrix Market format."; static const char __pyx_k_File_s_not_in_Matrix_Market_form[] = "File %s not in Matrix Market format with coordinate real general; instead found: \n%s"; static const char __pyx_k_Incompatible_checksums_s_vs_0xea[] = "Incompatible checksums (%s vs 0xea5fe92 = (input, num_docs, num_nnz, num_terms, transposed))"; static const char __pyx_k_accepted_corpus_with_i_documents[] = "accepted corpus with %i documents, %i features, %i non-zero entries"; @@ -1468,17 +1468,17 @@ static PyObject *__pyx_tuple__6; static PyObject *__pyx_tuple__7; static PyObject *__pyx_codeobj__8; -/* "gensim/corpora/_mmreader.pyx":48 +/* "gensim/corpora/_mmreader.pyx":45 * cdef public int num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< * """ - * MmReader(input, transposed=True): + * */ /* Python wrapper */ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__[] = "\n MmReader(input, transposed=True):\n\n Create matrix reader\n\n Parameters\n ----------\n input : string or file-like\n string (file path) or a file-like object that 
supports\n `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are\n not closed automatically.\n\n transposed : bool\n if True, expects lines to represent doc_id, term_id, value\n else, expects term_id, doc_id, value\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__[] = "\n\n Parameters\n ----------\n input : {str, file-like object}\n Path to input file in MM format or a file-like object that supports `seek()`\n (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`).\n\n transposed : bool, optional\n if True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value.\n\n "; #if CYTHON_COMPILING_IN_CPYTHON struct wrapperbase __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__; #endif @@ -1516,7 +1516,7 @@ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__ } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__init__") < 0)) __PYX_ERR(0, 48, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__init__") < 0)) __PYX_ERR(0, 45, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -1532,7 +1532,7 @@ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__ } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__init__", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 48, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__init__", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 45, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -1546,7 +1546,7 @@ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__ } static PyObject 
*__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2generator1(__pyx_CoroutineObject *__pyx_generator, CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject *__pyx_sent_value); /* proto */ -/* "gensim/corpora/_mmreader.pyx":83 +/* "gensim/corpora/_mmreader.pyx":75 * line = utils.to_unicode(line) * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) # <<<<<<<<<<<<<< @@ -1563,7 +1563,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexp if (unlikely(!__pyx_cur_scope)) { __pyx_cur_scope = ((struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr *)Py_None); __Pyx_INCREF(Py_None); - __PYX_ERR(0, 83, __pyx_L1_error) + __PYX_ERR(0, 75, __pyx_L1_error) } else { __Pyx_GOTREF(__pyx_cur_scope); } @@ -1571,7 +1571,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexp __Pyx_INCREF(((PyObject *)__pyx_cur_scope->__pyx_outer_scope)); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_outer_scope); { - __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2generator1, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_init___locals_genexpr, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2generator1, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_init___locals_genexpr, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_DECREF(__pyx_cur_scope); __Pyx_RefNannyFinishContext(); return (PyObject *) gen; @@ -1606,9 +1606,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener return NULL; } __pyx_L3_first_run:; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 83, __pyx_L1_error) - if 
(unlikely(!__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line)) { __Pyx_RaiseClosureNameError("line"); __PYX_ERR(0, 83, __pyx_L1_error) } - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line, __pyx_n_s_split); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 83, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 75, __pyx_L1_error) + if (unlikely(!__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line)) { __Pyx_RaiseClosureNameError("line"); __PYX_ERR(0, 75, __pyx_L1_error) } + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line, __pyx_n_s_split); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { @@ -1621,10 +1621,10 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -1632,9 +1632,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_4 = 0; __pyx_t_5 = NULL; } else { - __pyx_t_4 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_4 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_5 = Py_TYPE(__pyx_t_2)->tp_iternext; if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_5 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 75, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { @@ -1642,17 +1642,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener if (likely(PyList_CheckExact(__pyx_t_2))) { if (__pyx_t_4 >= PyList_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 75, __pyx_L1_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_4 >= PyTuple_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 75, __pyx_L1_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } @@ -1662,7 +1662,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) 
PyErr_Clear(); - else __PYX_ERR(0, 83, __pyx_L1_error) + else __PYX_ERR(0, 75, __pyx_L1_error) } break; } @@ -1672,7 +1672,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_x, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyNumber_Int(__pyx_cur_scope->__pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyNumber_Int(__pyx_cur_scope->__pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -1692,7 +1692,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener __Pyx_XGOTREF(__pyx_t_2); __pyx_t_4 = __pyx_cur_scope->__pyx_t_1; __pyx_t_5 = __pyx_cur_scope->__pyx_t_2; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 83, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 75, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; CYTHON_MAYBE_UNUSED_VAR(__pyx_cur_scope); @@ -1714,12 +1714,12 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":48 +/* "gensim/corpora/_mmreader.pyx":45 * cdef public int num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< * """ - * MmReader(input, transposed=True): + * */ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_obj_6gensim_7corpora_9_mmreader_MmReader *__pyx_v_self, PyObject *__pyx_v_input, PyObject *__pyx_v_transposed) { @@ -1756,21 +1756,21 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ if (unlikely(!__pyx_cur_scope)) { __pyx_cur_scope = ((struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ *)Py_None); __Pyx_INCREF(Py_None); - __PYX_ERR(0, 48, __pyx_L1_error) + __PYX_ERR(0, 45, __pyx_L1_error) } else { __Pyx_GOTREF(__pyx_cur_scope); } - /* 
"gensim/corpora/_mmreader.pyx":66 + /* "gensim/corpora/_mmreader.pyx":58 * * """ * logger.info("initializing cython corpus reader from %s", input) # <<<<<<<<<<<<<< * self.input, self.transposed = input, transposed * with utils.file_or_filename(self.input) as lines: */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 58, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 58, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = NULL; @@ -1788,7 +1788,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_kp_s_initializing_cython_corpus_reade, __pyx_v_input}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -1796,13 +1796,13 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_kp_s_initializing_cython_corpus_reade, __pyx_v_input}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 
2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_5 = PyTuple_New(2+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_t_5 = PyTuple_New(2+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 58, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); if (__pyx_t_2) { __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_2); __pyx_t_2 = NULL; @@ -1813,14 +1813,14 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_INCREF(__pyx_v_input); __Pyx_GIVEREF(__pyx_v_input); PyTuple_SET_ITEM(__pyx_t_5, 1+__pyx_t_4, __pyx_v_input); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":67 + /* "gensim/corpora/_mmreader.pyx":59 * """ * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed # <<<<<<<<<<<<<< @@ -1829,7 +1829,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ __pyx_t_1 = __pyx_v_input; __Pyx_INCREF(__pyx_t_1); - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_v_transposed); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 67, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_v_transposed); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 59, __pyx_L1_error) __Pyx_GIVEREF(__pyx_t_1); __Pyx_GOTREF(__pyx_v_self->input); __Pyx_DECREF(__pyx_v_self->input); @@ -1837,7 +1837,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_1 = 0; 
__pyx_v_self->transposed = __pyx_t_6; - /* "gensim/corpora/_mmreader.pyx":68 + /* "gensim/corpora/_mmreader.pyx":60 * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< @@ -1845,9 +1845,9 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ * header = utils.to_unicode(next(lines)).strip() */ /*with:*/ { - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_file_or_filename); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_file_or_filename); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_t_3 = NULL; @@ -1861,13 +1861,13 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -1875,27 +1875,27 @@ static 
int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_v_self->input); __Pyx_GIVEREF(__pyx_v_self->input); PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_v_self->input); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } } __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __pyx_t_7 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_7); - __pyx_t_2 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 68, __pyx_L3_error) + __pyx_t_2 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 60, __pyx_L3_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { @@ -1908,10 
+1908,10 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (__pyx_t_3) { - __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 68, __pyx_L3_error) + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L3_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_5 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 68, __pyx_L3_error) + __pyx_t_5 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L3_error) } __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -1930,7 +1930,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_v_lines = __pyx_t_2; __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":69 + /* "gensim/corpora/_mmreader.pyx":61 * self.input, self.transposed = input, transposed * with utils.file_or_filename(self.input) as lines: * try: # <<<<<<<<<<<<<< @@ -1946,19 +1946,19 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XGOTREF(__pyx_t_13); /*try:*/ { - /* "gensim/corpora/_mmreader.pyx":70 + /* "gensim/corpora/_mmreader.pyx":62 * with utils.file_or_filename(self.input) as lines: * try: * header = utils.to_unicode(next(lines)).strip() # <<<<<<<<<<<<<< * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): * raise ValueError( */ - __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_5); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_3)) 
__PYX_ERR(0, 62, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __pyx_t_5 = __Pyx_PyIter_Next(__pyx_v_lines); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_5 = __Pyx_PyIter_Next(__pyx_v_lines); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_5); __pyx_t_14 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { @@ -1971,14 +1971,14 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (!__pyx_t_14) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_5}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; @@ -1987,26 +1987,26 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_5}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else #endif { - __pyx_t_15 = 
PyTuple_New(1+1); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_15 = PyTuple_New(1+1); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_15); __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_15, 0, __pyx_t_14); __pyx_t_14 = NULL; __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_15, 0+1, __pyx_t_5); __pyx_t_5 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_15, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_15, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_strip); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_strip); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = NULL; @@ -2020,24 +2020,24 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (__pyx_t_1) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L13_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { - __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 70, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L13_error) } __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_header = __pyx_t_2; __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":71 + /* "gensim/corpora/_mmreader.pyx":63 * try: * header = utils.to_unicode(next(lines)).strip() * if not 
header.lower().startswith('%%matrixmarket matrix coordinate real general'): # <<<<<<<<<<<<<< * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_header, __pyx_n_s_lower); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 71, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_header, __pyx_n_s_lower); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 63, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_1 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_3))) { @@ -2050,32 +2050,32 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (__pyx_t_1) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 71, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 63, __pyx_L13_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { - __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 71, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 63, __pyx_L13_error) } __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_startswith); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 71, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_startswith); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 63, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 71, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 63, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_2); if 
(unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 71, __pyx_L13_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 63, __pyx_L13_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_16 = ((!__pyx_t_6) != 0); if (__pyx_t_16) { - /* "gensim/corpora/_mmreader.pyx":74 + /* "gensim/corpora/_mmreader.pyx":66 * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % * (self.input, header) # <<<<<<<<<<<<<< * ) * except StopIteration: */ - __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 74, __pyx_L13_error) + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 66, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_INCREF(__pyx_v_self->input); __Pyx_GIVEREF(__pyx_v_self->input); @@ -2084,37 +2084,37 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_GIVEREF(__pyx_v_header); PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_header); - /* "gensim/corpora/_mmreader.pyx":73 + /* "gensim/corpora/_mmreader.pyx":65 * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % # <<<<<<<<<<<<<< * (self.input, header) * ) */ - __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_File_s_not_in_Matrix_Market_form, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 73, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_File_s_not_in_Matrix_Market_form, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 65, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":72 + /* "gensim/corpora/_mmreader.pyx":64 * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): * raise ValueError( # <<<<<<<<<<<<<< * "File %s not in Matrix Market format with coordinate real general; 
instead found: \n%s" % * (self.input, header) */ - __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 72, __pyx_L13_error) + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 72, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __PYX_ERR(0, 72, __pyx_L13_error) + __PYX_ERR(0, 64, __pyx_L13_error) - /* "gensim/corpora/_mmreader.pyx":71 + /* "gensim/corpora/_mmreader.pyx":63 * try: * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): # <<<<<<<<<<<<<< @@ -2123,7 +2123,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } - /* "gensim/corpora/_mmreader.pyx":69 + /* "gensim/corpora/_mmreader.pyx":61 * self.input, self.transposed = input, transposed * with utils.file_or_filename(self.input) as lines: * try: # <<<<<<<<<<<<<< @@ -2143,7 +2143,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":76 + /* "gensim/corpora/_mmreader.pyx":68 * (self.input, header) * ) * except StopIteration: # <<<<<<<<<<<<<< @@ -2158,7 +2158,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ goto __pyx_L15_except_error; __pyx_L15_except_error:; - /* "gensim/corpora/_mmreader.pyx":69 + /* "gensim/corpora/_mmreader.pyx":61 * self.input, self.transposed = input, transposed * with 
utils.file_or_filename(self.input) as lines: * try: # <<<<<<<<<<<<<< @@ -2178,7 +2178,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_L18_try_end:; } - /* "gensim/corpora/_mmreader.pyx":79 + /* "gensim/corpora/_mmreader.pyx":71 * pass * * self.num_docs = self.num_terms = self.num_nnz = 0 # <<<<<<<<<<<<<< @@ -2189,7 +2189,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_v_self->num_terms = 0; __pyx_v_self->num_nnz = 0; - /* "gensim/corpora/_mmreader.pyx":80 + /* "gensim/corpora/_mmreader.pyx":72 * * self.num_docs = self.num_terms = self.num_nnz = 0 * for lineno, line in enumerate(lines): # <<<<<<<<<<<<<< @@ -2202,26 +2202,26 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_2 = __pyx_v_lines; __Pyx_INCREF(__pyx_t_2); __pyx_t_17 = 0; __pyx_t_18 = NULL; } else { - __pyx_t_17 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_lines); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 80, __pyx_L7_error) + __pyx_t_17 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_lines); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 72, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_18 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 80, __pyx_L7_error) + __pyx_t_18 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 72, __pyx_L7_error) } for (;;) { if (likely(!__pyx_t_18)) { if (likely(PyList_CheckExact(__pyx_t_2))) { if (__pyx_t_17 >= PyList_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 80, __pyx_L7_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 72, __pyx_L7_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 80, 
__pyx_L7_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 72, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_17 >= PyTuple_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 80, __pyx_L7_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 72, __pyx_L7_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 80, __pyx_L7_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 72, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); #endif } @@ -2231,7 +2231,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 80, __pyx_L7_error) + else __PYX_ERR(0, 72, __pyx_L7_error) } break; } @@ -2243,22 +2243,22 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_1 = 0; __Pyx_INCREF(__pyx_t_3); __Pyx_XDECREF_SET(__pyx_v_lineno, __pyx_t_3); - __pyx_t_1 = __Pyx_PyInt_AddObjC(__pyx_t_3, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 80, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyInt_AddObjC(__pyx_t_3, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 72, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = __pyx_t_1; __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":81 + /* "gensim/corpora/_mmreader.pyx":73 * self.num_docs = self.num_terms = self.num_nnz = 0 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) # <<<<<<<<<<<<<< * if not 
line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) */ - __pyx_t_15 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 81, __pyx_L7_error) + __pyx_t_15 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_15); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_15, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 81, __pyx_L7_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_15, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; __pyx_t_15 = NULL; @@ -2272,13 +2272,13 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (!__pyx_t_15) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_15, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_XDECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -2286,19 +2286,19 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_15, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, 
__pyx_L7_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_XDECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_14 = PyTuple_New(1+1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 81, __pyx_L7_error) + __pyx_t_14 = PyTuple_New(1+1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_14); __Pyx_GIVEREF(__pyx_t_15); PyTuple_SET_ITEM(__pyx_t_14, 0, __pyx_t_15); __pyx_t_15 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_line); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_line); PyTuple_SET_ITEM(__pyx_t_14, 0+1, __pyx_cur_scope->__pyx_v_line); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_14, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_14, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; } @@ -2309,31 +2309,31 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":82 + /* "gensim/corpora/_mmreader.pyx":74 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) * if not line.startswith('%'): # <<<<<<<<<<<<<< * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 82, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 74, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 82, __pyx_L7_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__3, NULL); if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 74, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_16 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_16 < 0)) __PYX_ERR(0, 82, __pyx_L7_error) + __pyx_t_16 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_16 < 0)) __PYX_ERR(0, 74, __pyx_L7_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_6 = ((!__pyx_t_16) != 0); if (__pyx_t_6) { - /* "gensim/corpora/_mmreader.pyx":83 + /* "gensim/corpora/_mmreader.pyx":75 * line = utils.to_unicode(line) * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) # <<<<<<<<<<<<<< * if not self.transposed: * self.num_docs, self.num_terms = self.num_terms, self.num_docs */ - __pyx_t_5 = __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexpr(((PyObject*)__pyx_cur_scope)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_5 = __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexpr(((PyObject*)__pyx_cur_scope)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_5); if ((likely(PyTuple_CheckExact(__pyx_t_5))) || (PyList_CheckExact(__pyx_t_5))) { PyObject* sequence = __pyx_t_5; @@ -2345,7 +2345,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ if (unlikely(size != 3)) { if (size > 3) __Pyx_RaiseTooManyValuesError(3); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); - __PYX_ERR(0, 83, __pyx_L7_error) + __PYX_ERR(0, 75, __pyx_L7_error) } #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS if (likely(PyTuple_CheckExact(sequence))) { @@ -2361,17 +2361,17 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_INCREF(__pyx_t_14); __Pyx_INCREF(__pyx_t_15); #else - __pyx_t_1 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_1 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_1)) 
__PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_14 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_14 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_14); - __pyx_t_15 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_15 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_15); #endif __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else { Py_ssize_t index = -1; - __pyx_t_19 = PyObject_GetIter(__pyx_t_5); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_19 = PyObject_GetIter(__pyx_t_5); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_19); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_20 = Py_TYPE(__pyx_t_19)->tp_iternext; @@ -2381,7 +2381,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_GOTREF(__pyx_t_14); index = 2; __pyx_t_15 = __pyx_t_20(__pyx_t_19); if (unlikely(!__pyx_t_15)) goto __pyx_L23_unpacking_failed; __Pyx_GOTREF(__pyx_t_15); - if (__Pyx_IternextUnpackEndCheck(__pyx_t_20(__pyx_t_19), 3) < 0) __PYX_ERR(0, 83, __pyx_L7_error) + if (__Pyx_IternextUnpackEndCheck(__pyx_t_20(__pyx_t_19), 3) < 0) __PYX_ERR(0, 75, __pyx_L7_error) __pyx_t_20 = NULL; __Pyx_DECREF(__pyx_t_19); __pyx_t_19 = 0; goto __pyx_L24_unpacking_done; @@ -2389,20 +2389,20 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_DECREF(__pyx_t_19); __pyx_t_19 = 0; __pyx_t_20 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); - __PYX_ERR(0, 83, __pyx_L7_error) + __PYX_ERR(0, 75, __pyx_L7_error) __pyx_L24_unpacking_done:; } - __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); 
if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_21 = __Pyx_PyInt_As_int(__pyx_t_14); if (unlikely((__pyx_t_21 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_21 = __Pyx_PyInt_As_int(__pyx_t_14); if (unlikely((__pyx_t_21 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; - __pyx_t_22 = __Pyx_PyInt_As_int(__pyx_t_15); if (unlikely((__pyx_t_22 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 83, __pyx_L7_error) + __pyx_t_22 = __Pyx_PyInt_As_int(__pyx_t_15); if (unlikely((__pyx_t_22 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; __pyx_v_self->num_docs = __pyx_t_4; __pyx_v_self->num_terms = __pyx_t_21; __pyx_v_self->num_nnz = __pyx_t_22; - /* "gensim/corpora/_mmreader.pyx":84 + /* "gensim/corpora/_mmreader.pyx":76 * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: # <<<<<<<<<<<<<< @@ -2412,7 +2412,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_6 = ((!(__pyx_v_self->transposed != 0)) != 0); if (__pyx_t_6) { - /* "gensim/corpora/_mmreader.pyx":85 + /* "gensim/corpora/_mmreader.pyx":77 * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: * self.num_docs, self.num_terms = self.num_terms, self.num_docs # <<<<<<<<<<<<<< @@ -2424,7 +2424,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_v_self->num_docs = __pyx_t_22; __pyx_v_self->num_terms = __pyx_t_21; - /* "gensim/corpora/_mmreader.pyx":84 + /* "gensim/corpora/_mmreader.pyx":76 * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: # <<<<<<<<<<<<<< @@ -2433,7 +2433,7 @@ static int 
__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } - /* "gensim/corpora/_mmreader.pyx":86 + /* "gensim/corpora/_mmreader.pyx":78 * if not self.transposed: * self.num_docs, self.num_terms = self.num_terms, self.num_docs * break # <<<<<<<<<<<<<< @@ -2442,7 +2442,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ goto __pyx_L21_break; - /* "gensim/corpora/_mmreader.pyx":82 + /* "gensim/corpora/_mmreader.pyx":74 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) * if not line.startswith('%'): # <<<<<<<<<<<<<< @@ -2451,7 +2451,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } - /* "gensim/corpora/_mmreader.pyx":80 + /* "gensim/corpora/_mmreader.pyx":72 * * self.num_docs = self.num_terms = self.num_nnz = 0 * for lineno, line in enumerate(lines): # <<<<<<<<<<<<<< @@ -2463,7 +2463,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":68 + /* "gensim/corpora/_mmreader.pyx":60 * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< @@ -2485,20 +2485,20 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; /*except:*/ { __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); - if (__Pyx_GetException(&__pyx_t_3, &__pyx_t_2, &__pyx_t_5) < 0) __PYX_ERR(0, 68, __pyx_L9_except_error) + if (__Pyx_GetException(&__pyx_t_3, &__pyx_t_2, &__pyx_t_5) < 0) __PYX_ERR(0, 60, __pyx_L9_except_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_GOTREF(__pyx_t_2); __Pyx_GOTREF(__pyx_t_5); - __pyx_t_15 = PyTuple_Pack(3, __pyx_t_3, __pyx_t_2, __pyx_t_5); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 
68, __pyx_L9_except_error) + __pyx_t_15 = PyTuple_Pack(3, __pyx_t_3, __pyx_t_2, __pyx_t_5); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 60, __pyx_L9_except_error) __Pyx_GOTREF(__pyx_t_15); __pyx_t_13 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_t_15, NULL); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; - if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 68, __pyx_L9_except_error) + if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 60, __pyx_L9_except_error) __Pyx_GOTREF(__pyx_t_13); __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_13); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; - if (__pyx_t_6 < 0) __PYX_ERR(0, 68, __pyx_L9_except_error) + if (__pyx_t_6 < 0) __PYX_ERR(0, 60, __pyx_L9_except_error) __pyx_t_16 = ((!(__pyx_t_6 != 0)) != 0); if (__pyx_t_16) { __Pyx_GIVEREF(__pyx_t_3); @@ -2506,7 +2506,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XGIVEREF(__pyx_t_5); __Pyx_ErrRestoreWithState(__pyx_t_3, __pyx_t_2, __pyx_t_5); __pyx_t_3 = 0; __pyx_t_2 = 0; __pyx_t_5 = 0; - __PYX_ERR(0, 68, __pyx_L9_except_error) + __PYX_ERR(0, 60, __pyx_L9_except_error) } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -2532,7 +2532,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ if (__pyx_t_7) { __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_tuple__4, NULL); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; - if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 68, __pyx_L1_error) + if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; } @@ -2547,31 +2547,31 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_L29:; } - /* "gensim/corpora/_mmreader.pyx":88 + /* "gensim/corpora/_mmreader.pyx":80 * break * * logger.info( # <<<<<<<<<<<<<< * "accepted corpus with %i documents, %i features, %i non-zero entries", * self.num_docs, self.num_terms, self.num_nnz */ - __pyx_t_2 = 
__Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 88, __pyx_L1_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 80, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 88, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 80, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":90 + /* "gensim/corpora/_mmreader.pyx":82 * logger.info( * "accepted corpus with %i documents, %i features, %i non-zero entries", * self.num_docs, self.num_terms, self.num_nnz # <<<<<<<<<<<<<< * ) * */ - __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 90, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_15 = __Pyx_PyInt_From_int(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 90, __pyx_L1_error) + __pyx_t_15 = __Pyx_PyInt_From_int(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_15); - __pyx_t_14 = __Pyx_PyInt_From_int(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 90, __pyx_L1_error) + __pyx_t_14 = __Pyx_PyInt_From_int(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_14); __pyx_t_1 = NULL; __pyx_t_21 = 0; @@ -2588,7 +2588,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[5] = {__pyx_t_1, __pyx_kp_s_accepted_corpus_with_i_documents, __pyx_t_2, __pyx_t_15, __pyx_t_14}; - __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_21, 
4+__pyx_t_21); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 88, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_21, 4+__pyx_t_21); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 80, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -2599,7 +2599,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[5] = {__pyx_t_1, __pyx_kp_s_accepted_corpus_with_i_documents, __pyx_t_2, __pyx_t_15, __pyx_t_14}; - __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_21, 4+__pyx_t_21); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 88, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_21, 4+__pyx_t_21); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 80, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -2608,7 +2608,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } else #endif { - __pyx_t_19 = PyTuple_New(4+__pyx_t_21); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 88, __pyx_L1_error) + __pyx_t_19 = PyTuple_New(4+__pyx_t_21); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 80, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_19); if (__pyx_t_1) { __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_19, 0, __pyx_t_1); __pyx_t_1 = NULL; @@ -2625,19 +2625,19 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_2 = 0; __pyx_t_15 = 0; __pyx_t_14 = 0; - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_19, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 88, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_19, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 80, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_19); __pyx_t_19 = 0; } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; 
__Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - /* "gensim/corpora/_mmreader.pyx":48 + /* "gensim/corpora/_mmreader.pyx":45 * cdef public int num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< * """ - * MmReader(input, transposed=True): + * */ /* function exit code */ @@ -2662,16 +2662,20 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":93 +/* "gensim/corpora/_mmreader.pyx":85 * ) * * def __len__(self): # <<<<<<<<<<<<<< + * """Get size of corpus (number of documents).""" * return self.num_docs - * */ /* Python wrapper */ static Py_ssize_t __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_3__len__(PyObject *__pyx_v_self); /*proto*/ +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_2__len__[] = "Get size of corpus (number of documents)."; +#if CYTHON_COMPILING_IN_CPYTHON +struct wrapperbase __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__; +#endif static Py_ssize_t __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_3__len__(PyObject *__pyx_v_self) { Py_ssize_t __pyx_r; __Pyx_RefNannyDeclarations @@ -2688,9 +2692,9 @@ static Py_ssize_t __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_2__len__(struct __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__len__", 0); - /* "gensim/corpora/_mmreader.pyx":94 - * + /* "gensim/corpora/_mmreader.pyx":87 * def __len__(self): + * """Get size of corpus (number of documents).""" * return self.num_docs # <<<<<<<<<<<<<< * * def __str__(self): @@ -2698,12 +2702,12 @@ static Py_ssize_t __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_2__len__(struct __pyx_r = __pyx_v_self->num_docs; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":93 + /* "gensim/corpora/_mmreader.pyx":85 * ) * * def __len__(self): # <<<<<<<<<<<<<< + * """Get size of corpus (number of documents).""" * return self.num_docs - * */ /* function exit code */ @@ -2712,7 +2716,7 @@ static Py_ssize_t 
__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_2__len__(struct return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":96 +/* "gensim/corpora/_mmreader.pyx":89 * return self.num_docs * * def __str__(self): # <<<<<<<<<<<<<< @@ -2742,7 +2746,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct PyObject *__pyx_t_4 = NULL; __Pyx_RefNannySetupContext("__str__", 0); - /* "gensim/corpora/_mmreader.pyx":97 + /* "gensim/corpora/_mmreader.pyx":90 * * def __str__(self): * return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % # <<<<<<<<<<<<<< @@ -2751,20 +2755,20 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct */ __Pyx_XDECREF(__pyx_r); - /* "gensim/corpora/_mmreader.pyx":98 + /* "gensim/corpora/_mmreader.pyx":91 * def __str__(self): * return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % * (self.num_docs, self.num_terms, self.num_nnz)) # <<<<<<<<<<<<<< * * def skip_headers(self, input_file): */ - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 98, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 91, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 98, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 91, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 98, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 91, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 98, __pyx_L1_error) + __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 91, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); 
__Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); @@ -2776,21 +2780,21 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct __pyx_t_2 = 0; __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":97 + /* "gensim/corpora/_mmreader.pyx":90 * * def __str__(self): * return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % # <<<<<<<<<<<<<< * (self.num_docs, self.num_terms, self.num_nnz)) * */ - __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_MmCorpus_i_documents_i_features, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 97, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_MmCorpus_i_documents_i_features, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 90, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":96 + /* "gensim/corpora/_mmreader.pyx":89 * return self.num_docs * * def __str__(self): # <<<<<<<<<<<<<< @@ -2812,17 +2816,17 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":100 +/* "gensim/corpora/_mmreader.pyx":93 * (self.num_docs, self.num_terms, self.num_nnz)) * * def skip_headers(self, input_file): # <<<<<<<<<<<<<< - * """ - * skip_headers(self, input_file) + * """Skip file headers that appear before the first document. 
+ * */ /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_7skip_headers(PyObject *__pyx_v_self, PyObject *__pyx_v_input_file); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers[] = "\n skip_headers(self, input_file)\n\n Skip file headers that appear before the first document.\n\n Parameters\n ----------\n input_file : iterable\n consumes any lines from start of `input_file` that begin with a %\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers[] = "MmReader.skip_headers(self, input_file)\nSkip file headers that appear before the first document.\n\n Parameters\n ----------\n input_file : iterable of str\n Iterable taken from file in MM format.\n\n "; static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_7skip_headers(PyObject *__pyx_v_self, PyObject *__pyx_v_input_file) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations @@ -2846,7 +2850,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY int __pyx_t_6; __Pyx_RefNannySetupContext("skip_headers", 0); - /* "gensim/corpora/_mmreader.pyx":112 + /* "gensim/corpora/_mmreader.pyx":102 * * """ * for line in input_file: # <<<<<<<<<<<<<< @@ -2857,26 +2861,26 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY __pyx_t_1 = __pyx_v_input_file; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; __pyx_t_3 = NULL; } else { - __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_input_file); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_input_file); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 102, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_t_3 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 102, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_3)) { if 
(likely(PyList_CheckExact(__pyx_t_1))) { if (__pyx_t_2 >= PyList_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 102, __pyx_L1_error) #else - __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 102, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); #endif } else { if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 102, __pyx_L1_error) #else - __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 112, __pyx_L1_error) + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 102, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); #endif } @@ -2886,7 +2890,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 112, __pyx_L1_error) + else __PYX_ERR(0, 102, __pyx_L1_error) } break; } @@ -2895,23 +2899,23 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY __Pyx_XDECREF_SET(__pyx_v_line, __pyx_t_4); __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":113 + /* 
"gensim/corpora/_mmreader.pyx":103 * """ * for line in input_file: * if line.startswith(b'%'): # <<<<<<<<<<<<<< * continue * break */ - __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 113, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 103, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 113, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 103, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 113, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 103, __pyx_L1_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; if (__pyx_t_6) { - /* "gensim/corpora/_mmreader.pyx":114 + /* "gensim/corpora/_mmreader.pyx":104 * for line in input_file: * if line.startswith(b'%'): * continue # <<<<<<<<<<<<<< @@ -2920,7 +2924,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY */ goto __pyx_L3_continue; - /* "gensim/corpora/_mmreader.pyx":113 + /* "gensim/corpora/_mmreader.pyx":103 * """ * for line in input_file: * if line.startswith(b'%'): # <<<<<<<<<<<<<< @@ -2929,7 +2933,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY */ } - /* "gensim/corpora/_mmreader.pyx":115 + /* "gensim/corpora/_mmreader.pyx":105 * if line.startswith(b'%'): * continue * break # <<<<<<<<<<<<<< @@ -2938,7 +2942,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY */ goto __pyx_L4_break; - /* "gensim/corpora/_mmreader.pyx":112 + /* "gensim/corpora/_mmreader.pyx":102 * * """ * for line in input_file: # <<<<<<<<<<<<<< @@ 
-2950,12 +2954,12 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY __pyx_L4_break:; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":100 + /* "gensim/corpora/_mmreader.pyx":93 * (self.num_docs, self.num_terms, self.num_nnz)) * * def skip_headers(self, input_file): # <<<<<<<<<<<<<< - * """ - * skip_headers(self, input_file) + * """Skip file headers that appear before the first document. + * */ /* function exit code */ @@ -2975,17 +2979,17 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY } static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__pyx_CoroutineObject *__pyx_generator, CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject *__pyx_sent_value); /* proto */ -/* "gensim/corpora/_mmreader.pyx":117 +/* "gensim/corpora/_mmreader.pyx":107 * break * * def __iter__(self): # <<<<<<<<<<<<<< - * """ - * __iter__() + * """Iterate through corpus. + * */ /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_9__iter__(PyObject *__pyx_v_self); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__[] = "\n __iter__()\n\n Iterate through vectors from underlying matrix\n\n Yields\n ------\n int, list of (termid, val)\n document id and \"vector\" of terms for next document in matrix\n vector of terms is represented as a list of (termid, val) tuples\n\n Notes\n ------\n Note that the total number of vectors returned is always equal to the\n number of rows specified in the header; empty documents are inserted and\n yielded where appropriate, even if they are not explicitly stored in the\n Matrix Market file.\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__[] = "Iterate through corpus.\n\n Notes\n ------\n Note that the total number of vectors returned is always equal to the number of rows specified\n in the header, empty documents are inserted and yielded where appropriate, even 
if they are not explicitly\n stored in the Matrix Market file.\n\n Yields\n ------\n (int, list of (int, number))\n Document id and Document in BoW format\n\n "; #if CYTHON_COMPILING_IN_CPYTHON struct wrapperbase __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; #endif @@ -3009,7 +3013,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__iter__(struct if (unlikely(!__pyx_cur_scope)) { __pyx_cur_scope = ((struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ *)Py_None); __Pyx_INCREF(Py_None); - __PYX_ERR(0, 117, __pyx_L1_error) + __PYX_ERR(0, 107, __pyx_L1_error) } else { __Pyx_GOTREF(__pyx_cur_scope); } @@ -3017,7 +3021,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__iter__(struct __Pyx_INCREF((PyObject *)__pyx_cur_scope->__pyx_v_self); __Pyx_GIVEREF((PyObject *)__pyx_cur_scope->__pyx_v_self); { - __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator, (PyObject *) __pyx_cur_scope, __pyx_n_s_iter, __pyx_n_s_MmReader___iter, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 117, __pyx_L1_error) + __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator, (PyObject *) __pyx_cur_scope, __pyx_n_s_iter, __pyx_n_s_MmReader___iter, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 107, __pyx_L1_error) __Pyx_DECREF(__pyx_cur_scope); __Pyx_RefNannyFinishContext(); return (PyObject *) gen; @@ -3072,9 +3076,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py return NULL; } __pyx_L3_first_run:; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 117, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 107, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":138 + /* "gensim/corpora/_mmreader.pyx":123 * """ * cdef int docid, termid, previd * cdef double val = 0 # 
<<<<<<<<<<<<<< @@ -3083,7 +3087,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_val = 0.0; - /* "gensim/corpora/_mmreader.pyx":140 + /* "gensim/corpora/_mmreader.pyx":125 * cdef double val = 0 * * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< @@ -3091,9 +3095,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py * */ /*with:*/ { - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_file_or_filename); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_file_or_filename); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = NULL; @@ -3107,13 +3111,13 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (!__pyx_t_2) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_cur_scope->__pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_cur_scope->__pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_cur_scope->__pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; 
__Pyx_GOTREF(__pyx_t_1); } else @@ -3121,27 +3125,27 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_cur_scope->__pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); __pyx_t_2 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_self->input); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_self->input); PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_cur_scope->__pyx_v_self->input); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_5 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); - __pyx_t_4 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 140, __pyx_L4_error) + __pyx_t_4 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 125, 
__pyx_L4_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_2 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { @@ -3154,10 +3158,10 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (__pyx_t_2) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L4_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 125, __pyx_L4_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else { - __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L4_error) + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 125, __pyx_L4_error) } __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -3175,14 +3179,14 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_cur_scope->__pyx_v_lines = __pyx_t_4; __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":141 + /* "gensim/corpora/_mmreader.pyx":126 * * with utils.file_or_filename(self.input) as lines: * self.skip_headers(lines) # <<<<<<<<<<<<<< * * previd = -1 */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_cur_scope->__pyx_v_self), __pyx_n_s_skip_headers); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 141, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_cur_scope->__pyx_v_self), __pyx_n_s_skip_headers); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 126, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { @@ -3195,13 +3199,13 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (!__pyx_t_3) { - __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 141, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_cur_scope->__pyx_v_lines); 
if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_4); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_lines}; - __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 141, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_4); } else @@ -3209,19 +3213,19 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_lines}; - __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 141, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_4); } else #endif { - __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 141, __pyx_L8_error) + __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 126, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_lines); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_lines); PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_cur_scope->__pyx_v_lines); - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_2, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 141, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_2, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } @@ -3229,7 +3233,7 @@ static PyObject 
*__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":143 + /* "gensim/corpora/_mmreader.pyx":128 * self.skip_headers(lines) * * previd = -1 # <<<<<<<<<<<<<< @@ -3238,7 +3242,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_previd = -1; - /* "gensim/corpora/_mmreader.pyx":144 + /* "gensim/corpora/_mmreader.pyx":129 * * previd = -1 * for line in lines: # <<<<<<<<<<<<<< @@ -3249,26 +3253,26 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_4 = __pyx_cur_scope->__pyx_v_lines; __Pyx_INCREF(__pyx_t_4); __pyx_t_9 = 0; __pyx_t_10 = NULL; } else { - __pyx_t_9 = -1; __pyx_t_4 = PyObject_GetIter(__pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 144, __pyx_L8_error) + __pyx_t_9 = -1; __pyx_t_4 = PyObject_GetIter(__pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 129, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_10 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 144, __pyx_L8_error) + __pyx_t_10 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 129, __pyx_L8_error) } for (;;) { if (likely(!__pyx_t_10)) { if (likely(PyList_CheckExact(__pyx_t_4))) { if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 144, __pyx_L8_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 129, __pyx_L8_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 144, __pyx_L8_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 129, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 144, __pyx_L8_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 129, __pyx_L8_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 144, __pyx_L8_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 129, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); #endif } @@ -3278,7 +3282,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 144, __pyx_L8_error) + else __PYX_ERR(0, 129, __pyx_L8_error) } break; } @@ -3289,25 +3293,25 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":146 + /* "gensim/corpora/_mmreader.pyx":131 * for line in lines: * * if (sscanf(line, "%d %d %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< * raise ValueError("unable to parse line: {}".format(line)) * */ - __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_cur_scope->__pyx_v_line); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 146, __pyx_L8_error) + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_cur_scope->__pyx_v_line); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 131, __pyx_L8_error) __pyx_t_12 = ((sscanf(__pyx_t_11, ((char const *)"%d %d %lg"), (&__pyx_cur_scope->__pyx_v_docid), (&__pyx_cur_scope->__pyx_v_termid), 
(&__pyx_cur_scope->__pyx_v_val)) != 3) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":147 + /* "gensim/corpora/_mmreader.pyx":132 * * if (sscanf(line, "%d %d %lg", &docid, &termid, &val) != 3): * raise ValueError("unable to parse line: {}".format(line)) # <<<<<<<<<<<<<< * * if not self.transposed: */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { @@ -3320,13 +3324,13 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_2)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -3334,37 +3338,37 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) 
__PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_13 = PyTuple_New(1+1); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_13 = PyTuple_New(1+1); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_13); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_line); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_line); PyTuple_SET_ITEM(__pyx_t_13, 0+1, __pyx_cur_scope->__pyx_v_line); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_13, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_13, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __PYX_ERR(0, 147, __pyx_L8_error) + __PYX_ERR(0, 132, __pyx_L8_error) - /* "gensim/corpora/_mmreader.pyx":146 + /* "gensim/corpora/_mmreader.pyx":131 * for line in lines: * * if (sscanf(line, "%d %d %lg", &docid, &termid, 
&val) != 3): # <<<<<<<<<<<<<< @@ -3373,7 +3377,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":149 + /* "gensim/corpora/_mmreader.pyx":134 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -3383,7 +3387,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_12 = ((!(__pyx_cur_scope->__pyx_v_self->transposed != 0)) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":150 + /* "gensim/corpora/_mmreader.pyx":135 * * if not self.transposed: * termid, docid = docid, termid # <<<<<<<<<<<<<< @@ -3395,7 +3399,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_cur_scope->__pyx_v_termid = __pyx_t_14; __pyx_cur_scope->__pyx_v_docid = __pyx_t_15; - /* "gensim/corpora/_mmreader.pyx":149 + /* "gensim/corpora/_mmreader.pyx":134 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -3404,7 +3408,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":153 + /* "gensim/corpora/_mmreader.pyx":138 * * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 # <<<<<<<<<<<<<< @@ -3413,7 +3417,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_docid = (__pyx_cur_scope->__pyx_v_docid - 1); - /* "gensim/corpora/_mmreader.pyx":154 + /* "gensim/corpora/_mmreader.pyx":139 * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 * termid -= 1 # <<<<<<<<<<<<<< @@ -3422,7 +3426,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_termid = (__pyx_cur_scope->__pyx_v_termid - 1); - /* "gensim/corpora/_mmreader.pyx":156 + /* "gensim/corpora/_mmreader.pyx":141 * termid -= 1 * * assert 
previd <= docid, "matrix columns must come in ascending order" # <<<<<<<<<<<<<< @@ -3433,12 +3437,12 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_cur_scope->__pyx_v_previd <= __pyx_cur_scope->__pyx_v_docid) != 0))) { PyErr_SetObject(PyExc_AssertionError, __pyx_kp_s_matrix_columns_must_come_in_asce); - __PYX_ERR(0, 156, __pyx_L8_error) + __PYX_ERR(0, 141, __pyx_L8_error) } } #endif - /* "gensim/corpora/_mmreader.pyx":157 + /* "gensim/corpora/_mmreader.pyx":142 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -3448,7 +3452,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_12 = ((__pyx_cur_scope->__pyx_v_docid != __pyx_cur_scope->__pyx_v_previd) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":159 + /* "gensim/corpora/_mmreader.pyx":144 * if docid != previd: * # change of document: return the document read so far (its id is prevId) * if previd >= 0: # <<<<<<<<<<<<<< @@ -3458,17 +3462,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_12 = ((__pyx_cur_scope->__pyx_v_previd >= 0) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":160 + /* "gensim/corpora/_mmreader.pyx":145 * # change of document: return the document read so far (its id is prevId) * if previd >= 0: * yield previd, document # noqa:F821 # <<<<<<<<<<<<<< * * # return implicit (empty) documents between previous id and new id */ - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 160, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); - if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 160, __pyx_L8_error) } - 
__pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 160, __pyx_L8_error) + if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 145, __pyx_L8_error) } + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 145, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); @@ -3514,9 +3518,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XGOTREF(__pyx_t_8); __pyx_t_9 = __pyx_cur_scope->__pyx_t_5; __pyx_t_10 = __pyx_cur_scope->__pyx_t_6; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 160, __pyx_L8_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 145, __pyx_L8_error) - /* "gensim/corpora/_mmreader.pyx":159 + /* "gensim/corpora/_mmreader.pyx":144 * if docid != previd: * # change of document: return the document read so far (its id is prevId) * if previd >= 0: # <<<<<<<<<<<<<< @@ -3525,18 +3529,18 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":164 + /* "gensim/corpora/_mmreader.pyx":149 * # return implicit (empty) documents between previous id and new id * # too, to keep consistent document numbering and corpus length * for previd in xrange(previd + 1, docid): # <<<<<<<<<<<<<< * yield previd, [] * */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_13 = __Pyx_PyInt_From_long((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_13 = __Pyx_PyInt_From_long((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_13); - __pyx_t_3 = 
__Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_docid); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_docid); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_16 = NULL; __pyx_t_15 = 0; @@ -3553,7 +3557,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[3] = {__pyx_t_16, __pyx_t_13, __pyx_t_3}; - __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_16); __pyx_t_16 = 0; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; @@ -3563,7 +3567,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[3] = {__pyx_t_16, __pyx_t_13, __pyx_t_3}; - __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_16); __pyx_t_16 = 0; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; @@ -3571,7 +3575,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } else #endif { - __pyx_t_17 = PyTuple_New(2+__pyx_t_15); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_17 = PyTuple_New(2+__pyx_t_15); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_17); if (__pyx_t_16) { __Pyx_GIVEREF(__pyx_t_16); 
PyTuple_SET_ITEM(__pyx_t_17, 0, __pyx_t_16); __pyx_t_16 = NULL; @@ -3582,7 +3586,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyTuple_SET_ITEM(__pyx_t_17, 1+__pyx_t_15, __pyx_t_3); __pyx_t_13 = 0; __pyx_t_3 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_17, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_17, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_17); __pyx_t_17 = 0; } @@ -3591,9 +3595,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_1 = __pyx_t_2; __Pyx_INCREF(__pyx_t_1); __pyx_t_18 = 0; __pyx_t_19 = NULL; } else { - __pyx_t_18 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_18 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_19 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_19 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 149, __pyx_L8_error) } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; for (;;) { @@ -3601,17 +3605,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (likely(PyList_CheckExact(__pyx_t_1))) { if (__pyx_t_18 >= PyList_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_2 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_18); __Pyx_INCREF(__pyx_t_2); __pyx_t_18++; if (unlikely(0 < 0)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_2 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_18); __Pyx_INCREF(__pyx_t_2); __pyx_t_18++; if (unlikely(0 < 0)) __PYX_ERR(0, 149, __pyx_L8_error) #else - __pyx_t_2 = PySequence_ITEM(__pyx_t_1, __pyx_t_18); __pyx_t_18++; if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 164, __pyx_L8_error) + 
__pyx_t_2 = PySequence_ITEM(__pyx_t_1, __pyx_t_18); __pyx_t_18++; if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); #endif } else { if (__pyx_t_18 >= PyTuple_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_2 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_18); __Pyx_INCREF(__pyx_t_2); __pyx_t_18++; if (unlikely(0 < 0)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_2 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_18); __Pyx_INCREF(__pyx_t_2); __pyx_t_18++; if (unlikely(0 < 0)) __PYX_ERR(0, 149, __pyx_L8_error) #else - __pyx_t_2 = PySequence_ITEM(__pyx_t_1, __pyx_t_18); __pyx_t_18++; if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_2 = PySequence_ITEM(__pyx_t_1, __pyx_t_18); __pyx_t_18++; if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); #endif } @@ -3621,28 +3625,28 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 164, __pyx_L8_error) + else __PYX_ERR(0, 149, __pyx_L8_error) } break; } __Pyx_GOTREF(__pyx_t_2); } - __pyx_t_15 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_15 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 164, __pyx_L8_error) + __pyx_t_15 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_15 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 149, __pyx_L8_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_cur_scope->__pyx_v_previd = __pyx_t_15; - /* "gensim/corpora/_mmreader.pyx":165 + /* "gensim/corpora/_mmreader.pyx":150 * # too, to keep consistent document numbering and corpus length * for previd in xrange(previd + 1, docid): * yield previd, [] # <<<<<<<<<<<<<< * * # from now on start adding fields to a new document, with a new id */ - __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if 
(unlikely(!__pyx_t_2)) __PYX_ERR(0, 165, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 150, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_17 = PyList_New(0); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 165, __pyx_L8_error) + __pyx_t_17 = PyList_New(0); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 150, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_17); - __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 165, __pyx_L8_error) + __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 150, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); @@ -3697,9 +3701,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_10 = __pyx_cur_scope->__pyx_t_6; __pyx_t_18 = __pyx_cur_scope->__pyx_t_8; __pyx_t_19 = __pyx_cur_scope->__pyx_t_9; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 165, __pyx_L8_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 150, __pyx_L8_error) - /* "gensim/corpora/_mmreader.pyx":164 + /* "gensim/corpora/_mmreader.pyx":149 * # return implicit (empty) documents between previous id and new id * # too, to keep consistent document numbering and corpus length * for previd in xrange(previd + 1, docid): # <<<<<<<<<<<<<< @@ -3709,7 +3713,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":168 + /* "gensim/corpora/_mmreader.pyx":153 * * # from now on start adding fields to a new document, with a new id * previd = docid # <<<<<<<<<<<<<< @@ -3718,21 +3722,21 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_previd = __pyx_cur_scope->__pyx_v_docid; - /* "gensim/corpora/_mmreader.pyx":169 + /* "gensim/corpora/_mmreader.pyx":154 * # from now on start adding fields to a new document, with a new id * previd = 
docid * document = [] # <<<<<<<<<<<<<< * * document.append((termid, val,)) # add another field to the current document */ - __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 169, __pyx_L8_error) + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 154, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_XGOTREF(__pyx_cur_scope->__pyx_v_document); __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_document, ((PyObject*)__pyx_t_1)); __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":157 + /* "gensim/corpora/_mmreader.pyx":142 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -3741,19 +3745,19 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":171 + /* "gensim/corpora/_mmreader.pyx":156 * document = [] * * document.append((termid, val,)) # add another field to the current document # <<<<<<<<<<<<<< * * # handle the last document, as a special case */ - if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 171, __pyx_L8_error) } - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_termid); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 171, __pyx_L8_error) + if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 156, __pyx_L8_error) } + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_termid); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 156, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = PyFloat_FromDouble(__pyx_cur_scope->__pyx_v_val); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 171, __pyx_L8_error) + __pyx_t_3 = PyFloat_FromDouble(__pyx_cur_scope->__pyx_v_val); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 156, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_17 = PyTuple_New(2); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 171, __pyx_L8_error) + __pyx_t_17 = PyTuple_New(2); if 
(unlikely(!__pyx_t_17)) __PYX_ERR(0, 156, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_17); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_17, 0, __pyx_t_1); @@ -3761,10 +3765,10 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyTuple_SET_ITEM(__pyx_t_17, 1, __pyx_t_3); __pyx_t_1 = 0; __pyx_t_3 = 0; - __pyx_t_20 = __Pyx_PyList_Append(__pyx_cur_scope->__pyx_v_document, __pyx_t_17); if (unlikely(__pyx_t_20 == ((int)-1))) __PYX_ERR(0, 171, __pyx_L8_error) + __pyx_t_20 = __Pyx_PyList_Append(__pyx_cur_scope->__pyx_v_document, __pyx_t_17); if (unlikely(__pyx_t_20 == ((int)-1))) __PYX_ERR(0, 156, __pyx_L8_error) __Pyx_DECREF(__pyx_t_17); __pyx_t_17 = 0; - /* "gensim/corpora/_mmreader.pyx":144 + /* "gensim/corpora/_mmreader.pyx":129 * * previd = -1 * for line in lines: # <<<<<<<<<<<<<< @@ -3774,7 +3778,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":140 + /* "gensim/corpora/_mmreader.pyx":125 * cdef double val = 0 * * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< @@ -3796,20 +3800,20 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; /*except:*/ { __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.__iter__", __pyx_clineno, __pyx_lineno, __pyx_filename); - if (__Pyx_GetException(&__pyx_t_4, &__pyx_t_17, &__pyx_t_3) < 0) __PYX_ERR(0, 140, __pyx_L10_except_error) + if (__Pyx_GetException(&__pyx_t_4, &__pyx_t_17, &__pyx_t_3) < 0) __PYX_ERR(0, 125, __pyx_L10_except_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GOTREF(__pyx_t_17); __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = PyTuple_Pack(3, __pyx_t_4, __pyx_t_17, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L10_except_error) + __pyx_t_1 = PyTuple_Pack(3, __pyx_t_4, __pyx_t_17, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L10_except_error) __Pyx_GOTREF(__pyx_t_1); 
__pyx_t_21 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_1, NULL); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (unlikely(!__pyx_t_21)) __PYX_ERR(0, 140, __pyx_L10_except_error) + if (unlikely(!__pyx_t_21)) __PYX_ERR(0, 125, __pyx_L10_except_error) __Pyx_GOTREF(__pyx_t_21); __pyx_t_12 = __Pyx_PyObject_IsTrue(__pyx_t_21); __Pyx_DECREF(__pyx_t_21); __pyx_t_21 = 0; - if (__pyx_t_12 < 0) __PYX_ERR(0, 140, __pyx_L10_except_error) + if (__pyx_t_12 < 0) __PYX_ERR(0, 125, __pyx_L10_except_error) __pyx_t_22 = ((!(__pyx_t_12 != 0)) != 0); if (__pyx_t_22) { __Pyx_GIVEREF(__pyx_t_4); @@ -3817,7 +3821,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XGIVEREF(__pyx_t_3); __Pyx_ErrRestoreWithState(__pyx_t_4, __pyx_t_17, __pyx_t_3); __pyx_t_4 = 0; __pyx_t_17 = 0; __pyx_t_3 = 0; - __PYX_ERR(0, 140, __pyx_L10_except_error) + __PYX_ERR(0, 125, __pyx_L10_except_error) } __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_DECREF(__pyx_t_17); __pyx_t_17 = 0; @@ -3843,7 +3847,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (__pyx_t_5) { __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_tuple__6, NULL); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 140, __pyx_L1_error) + if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; } @@ -3858,7 +3862,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_L27:; } - /* "gensim/corpora/_mmreader.pyx":174 + /* "gensim/corpora/_mmreader.pyx":159 * * # handle the last document, as a special case * if previd >= 0: # <<<<<<<<<<<<<< @@ -3868,17 +3872,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_22 = ((__pyx_cur_scope->__pyx_v_previd >= 0) != 0); if (__pyx_t_22) { - /* "gensim/corpora/_mmreader.pyx":175 + /* "gensim/corpora/_mmreader.pyx":160 * # handle the 
last document, as a special case * if previd >= 0: * yield previd, document # <<<<<<<<<<<<<< * * # return empty documents between the last explicit document and the number */ - __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 175, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 160, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 175, __pyx_L1_error) } - __pyx_t_17 = PyTuple_New(2); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 175, __pyx_L1_error) + if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 160, __pyx_L1_error) } + __pyx_t_17 = PyTuple_New(2); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 160, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_17); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_17, 0, __pyx_t_3); @@ -3895,9 +3899,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_generator->resume_label = 3; return __pyx_r; __pyx_L29_resume_from_yield:; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 175, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 160, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":174 + /* "gensim/corpora/_mmreader.pyx":159 * * # handle the last document, as a special case * if previd >= 0: # <<<<<<<<<<<<<< @@ -3906,18 +3910,18 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":179 + /* "gensim/corpora/_mmreader.pyx":164 * # return empty documents between the last explicit document and the number * # of documents as specified in the header * for previd in xrange(previd + 1, self.num_docs): # <<<<<<<<<<<<<< * yield previd, [] * */ - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, 
__pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = __Pyx_PyInt_From_long((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyInt_From_long((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = NULL; __pyx_t_15 = 0; @@ -3934,7 +3938,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_t_4, __pyx_t_1}; - __pyx_t_17 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_17); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -3944,7 +3948,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_t_4, __pyx_t_1}; - __pyx_t_17 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_15, 2+__pyx_t_15); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); 
__pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_17); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -3952,7 +3956,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } else #endif { - __pyx_t_13 = PyTuple_New(2+__pyx_t_15); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_13 = PyTuple_New(2+__pyx_t_15); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); if (__pyx_t_2) { __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_t_2); __pyx_t_2 = NULL; @@ -3963,7 +3967,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyTuple_SET_ITEM(__pyx_t_13, 1+__pyx_t_15, __pyx_t_1); __pyx_t_4 = 0; __pyx_t_1 = 0; - __pyx_t_17 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_13, NULL); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_13, NULL); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_17); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } @@ -3972,9 +3976,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_3 = __pyx_t_17; __Pyx_INCREF(__pyx_t_3); __pyx_t_9 = 0; __pyx_t_10 = NULL; } else { - __pyx_t_9 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_17); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_9 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_17); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 164, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_17); __pyx_t_17 = 0; for (;;) { @@ -3982,17 +3986,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_3)) break; #if 
CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_17 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_17); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_17); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 164, __pyx_L1_error) #else - __pyx_t_17 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_17); #endif } else { if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_17 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_17); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_17); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 164, __pyx_L1_error) #else - __pyx_t_17 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_17 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 164, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_17); #endif } @@ -4002,28 +4006,28 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 179, __pyx_L1_error) + else __PYX_ERR(0, 164, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_17); } - __pyx_t_15 = __Pyx_PyInt_As_int(__pyx_t_17); if (unlikely((__pyx_t_15 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 179, __pyx_L1_error) + __pyx_t_15 = __Pyx_PyInt_As_int(__pyx_t_17); if (unlikely((__pyx_t_15 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 
164, __pyx_L1_error) __Pyx_DECREF(__pyx_t_17); __pyx_t_17 = 0; __pyx_cur_scope->__pyx_v_previd = __pyx_t_15; - /* "gensim/corpora/_mmreader.pyx":180 + /* "gensim/corpora/_mmreader.pyx":165 * # of documents as specified in the header * for previd in xrange(previd + 1, self.num_docs): * yield previd, [] # <<<<<<<<<<<<<< * * def docbyoffset(self, offset): */ - __pyx_t_17 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 180, __pyx_L1_error) + __pyx_t_17 = __Pyx_PyInt_From_int(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_17)) __PYX_ERR(0, 165, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_17); - __pyx_t_13 = PyList_New(0); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 180, __pyx_L1_error) + __pyx_t_13 = PyList_New(0); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 165, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); - __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 180, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 165, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_17); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_17); @@ -4049,9 +4053,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XGOTREF(__pyx_t_3); __pyx_t_9 = __pyx_cur_scope->__pyx_t_5; __pyx_t_10 = __pyx_cur_scope->__pyx_t_6; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 180, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 165, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":179 + /* "gensim/corpora/_mmreader.pyx":164 * # return empty documents between the last explicit document and the number * # of documents as specified in the header * for previd in xrange(previd + 1, self.num_docs): # <<<<<<<<<<<<<< @@ -4062,12 +4066,12 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; CYTHON_MAYBE_UNUSED_VAR(__pyx_cur_scope); - /* "gensim/corpora/_mmreader.pyx":117 + /* 
"gensim/corpora/_mmreader.pyx":107 * break * * def __iter__(self): # <<<<<<<<<<<<<< - * """ - * __iter__() + * """Iterate through corpus. + * */ /* function exit code */ @@ -4091,17 +4095,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":182 +/* "gensim/corpora/_mmreader.pyx":167 * yield previd, [] * * def docbyoffset(self, offset): # <<<<<<<<<<<<<< - * """ - * docbyoffset(offset) + * """Get document at file offset `offset` (in bytes). + * */ /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_12docbyoffset(PyObject *__pyx_v_self, PyObject *__pyx_v_offset); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset[] = "\n docbyoffset(offset)\n\n Return document at file offset `offset` (in bytes)\n\n Parameters\n ----------\n offset : int\n offset, in bytes, of desired document\n\n Returns\n ------\n list of (termid, val)\n \"vector\" of terms for document at offset\n vector of terms is represented as a list of (termid, val) tuples\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset[] = "MmReader.docbyoffset(self, offset)\nGet document at file offset `offset` (in bytes).\n\n Parameters\n ----------\n offset : int\n Offset, in bytes, of desired document.\n\n Returns\n ------\n list of (int, str)\n Document in BoW format.\n\n "; static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_12docbyoffset(PyObject *__pyx_v_self, PyObject *__pyx_v_offset) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations @@ -4139,20 +4143,20 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st int __pyx_t_13; __Pyx_RefNannySetupContext("docbyoffset", 0); - /* "gensim/corpora/_mmreader.pyx":205 + /* "gensim/corpora/_mmreader.pyx":186 * cdef double val * * if offset == -1: # <<<<<<<<<<<<<< * return [] * if isinstance(self.input, string_types): */ - __pyx_t_1 = 
__Pyx_PyInt_EqObjC(__pyx_v_offset, __pyx_int_neg_1, -1L, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 205, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_EqObjC(__pyx_v_offset, __pyx_int_neg_1, -1L, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 186, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 205, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 186, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; if (__pyx_t_2) { - /* "gensim/corpora/_mmreader.pyx":206 + /* "gensim/corpora/_mmreader.pyx":187 * * if offset == -1: * return [] # <<<<<<<<<<<<<< @@ -4160,13 +4164,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * fin, close_fin = utils.smart_open(self.input), True */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 206, __pyx_L1_error) + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":205 + /* "gensim/corpora/_mmreader.pyx":186 * cdef double val * * if offset == -1: # <<<<<<<<<<<<<< @@ -4175,7 +4179,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":207 + /* "gensim/corpora/_mmreader.pyx":188 * if offset == -1: * return [] * if isinstance(self.input, string_types): # <<<<<<<<<<<<<< @@ -4184,24 +4188,24 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_t_1 = __pyx_v_self->input; __Pyx_INCREF(__pyx_t_1); - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_string_types); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 207, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_string_types); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 188, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = 
PyObject_IsInstance(__pyx_t_1, __pyx_t_3); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(0, 207, __pyx_L1_error) + __pyx_t_2 = PyObject_IsInstance(__pyx_t_1, __pyx_t_3); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(0, 188, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_t_4 = (__pyx_t_2 != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":208 + /* "gensim/corpora/_mmreader.pyx":189 * return [] * if isinstance(self.input, string_types): * fin, close_fin = utils.smart_open(self.input), True # <<<<<<<<<<<<<< * else: * fin, close_fin = self.input, False */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_smart_open); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_smart_open); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = NULL; @@ -4215,13 +4219,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (!__pyx_t_1) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_1, __pyx_v_self->input}; - __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_3); } else @@ -4229,19 +4233,19 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_1, __pyx_v_self->input}; - __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_3); } else #endif { - __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_1); __pyx_t_1 = NULL; __Pyx_INCREF(__pyx_v_self->input); __Pyx_GIVEREF(__pyx_v_self->input); PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_v_self->input); - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 208, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; } @@ -4252,7 +4256,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_3 = 0; __pyx_v_close_fin = __pyx_t_4; - /* "gensim/corpora/_mmreader.pyx":207 + /* "gensim/corpora/_mmreader.pyx":188 * if offset == -1: * return [] * if isinstance(self.input, string_types): # <<<<<<<<<<<<<< @@ -4262,7 +4266,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st goto __pyx_L4; } - /* "gensim/corpora/_mmreader.pyx":210 + /* "gensim/corpora/_mmreader.pyx":191 * fin, close_fin = 
utils.smart_open(self.input), True * else: * fin, close_fin = self.input, False # <<<<<<<<<<<<<< @@ -4279,14 +4283,14 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } __pyx_L4:; - /* "gensim/corpora/_mmreader.pyx":212 + /* "gensim/corpora/_mmreader.pyx":193 * fin, close_fin = self.input, False * * fin.seek(offset) # works for gzip/bz2 input, too # <<<<<<<<<<<<<< * previd, document = -1, [] * for line in fin: */ - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_seek); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_seek); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 193, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __pyx_t_6 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_5))) { @@ -4299,13 +4303,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (!__pyx_t_6) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_offset); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_offset); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_offset}; - __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_3); } else @@ -4313,19 +4317,19 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_offset}; - __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_3); } else #endif { - __pyx_t_1 = PyTuple_New(1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 193, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_6); __pyx_t_6 = NULL; __Pyx_INCREF(__pyx_v_offset); __Pyx_GIVEREF(__pyx_v_offset); PyTuple_SET_ITEM(__pyx_t_1, 0+1, __pyx_v_offset); - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_1, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_1, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } @@ -4333,7 +4337,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":213 + /* "gensim/corpora/_mmreader.pyx":194 * * fin.seek(offset) # works for gzip/bz2 input, too * previd, document = -1, [] # <<<<<<<<<<<<<< @@ -4341,13 +4345,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * if (sscanf(line, "%d %d %lg", &docid, &termid, &val) != 3): */ __pyx_t_7 = -1; - __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 213, __pyx_L1_error) + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 194, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_v_previd = __pyx_t_7; __pyx_v_document = ((PyObject*)__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":214 + /* "gensim/corpora/_mmreader.pyx":195 * fin.seek(offset) # works for gzip/bz2 input, too * previd, document = -1, [] * for line 
in fin: # <<<<<<<<<<<<<< @@ -4358,26 +4362,26 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_3 = __pyx_v_fin; __Pyx_INCREF(__pyx_t_3); __pyx_t_8 = 0; __pyx_t_9 = NULL; } else { - __pyx_t_8 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_fin); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 214, __pyx_L1_error) + __pyx_t_8 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_fin); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_9 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 214, __pyx_L1_error) + __pyx_t_9 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 195, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_9)) { if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_8 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 214, __pyx_L1_error) + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) #else - __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 214, __pyx_L1_error) + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif } else { if (__pyx_t_8 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 214, __pyx_L1_error) + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) #else - __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) 
__PYX_ERR(0, 214, __pyx_L1_error) + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif } @@ -4387,7 +4391,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 214, __pyx_L1_error) + else __PYX_ERR(0, 195, __pyx_L1_error) } break; } @@ -4396,25 +4400,25 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __Pyx_XDECREF_SET(__pyx_v_line, __pyx_t_5); __pyx_t_5 = 0; - /* "gensim/corpora/_mmreader.pyx":215 + /* "gensim/corpora/_mmreader.pyx":196 * previd, document = -1, [] * for line in fin: * if (sscanf(line, "%d %d %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< * raise ValueError("unable to parse line: {}".format(line)) * */ - __pyx_t_10 = __Pyx_PyObject_AsString(__pyx_v_line); if (unlikely((!__pyx_t_10) && PyErr_Occurred())) __PYX_ERR(0, 215, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_AsString(__pyx_v_line); if (unlikely((!__pyx_t_10) && PyErr_Occurred())) __PYX_ERR(0, 196, __pyx_L1_error) __pyx_t_4 = ((sscanf(__pyx_t_10, ((char const *)"%d %d %lg"), (&__pyx_v_docid), (&__pyx_v_termid), (&__pyx_v_val)) != 3) != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":216 + /* "gensim/corpora/_mmreader.pyx":197 * for line in fin: * if (sscanf(line, "%d %d %lg", &docid, &termid, &val) != 3): * raise ValueError("unable to parse line: {}".format(line)) # <<<<<<<<<<<<<< * * if not self.transposed: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_6 
= NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { @@ -4427,13 +4431,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (!__pyx_t_6) { - __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_line); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_line); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_line}; - __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_5); } else @@ -4441,37 +4445,37 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_line}; - __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_5); } else #endif { - __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_6); __pyx_t_6 = NULL; __Pyx_INCREF(__pyx_v_line); __Pyx_GIVEREF(__pyx_v_line); PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_v_line); - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_11, NULL); if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_11, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; } } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_5); __pyx_t_5 = 0; - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 216, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_Raise(__pyx_t_5, 0, 0, 0); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __PYX_ERR(0, 216, __pyx_L1_error) + __PYX_ERR(0, 197, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":215 + /* "gensim/corpora/_mmreader.pyx":196 * previd, document = -1, [] * for line in fin: * if (sscanf(line, "%d %d %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< @@ -4480,7 +4484,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":218 + /* "gensim/corpora/_mmreader.pyx":199 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -4490,7 +4494,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = ((!(__pyx_v_self->transposed != 0)) != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":219 + /* "gensim/corpora/_mmreader.pyx":200 * * if not self.transposed: * termid, docid = docid, termid # <<<<<<<<<<<<<< @@ -4502,7 +4506,7 @@ static PyObject 
*__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_v_termid = __pyx_t_7; __pyx_v_docid = __pyx_t_12; - /* "gensim/corpora/_mmreader.pyx":218 + /* "gensim/corpora/_mmreader.pyx":199 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -4511,7 +4515,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":222 + /* "gensim/corpora/_mmreader.pyx":203 * * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 # <<<<<<<<<<<<<< @@ -4520,7 +4524,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_v_docid = (__pyx_v_docid - 1); - /* "gensim/corpora/_mmreader.pyx":223 + /* "gensim/corpora/_mmreader.pyx":204 * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 * termid -= 1 # <<<<<<<<<<<<<< @@ -4529,7 +4533,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_v_termid = (__pyx_v_termid - 1); - /* "gensim/corpora/_mmreader.pyx":225 + /* "gensim/corpora/_mmreader.pyx":206 * termid -= 1 * * assert previd <= docid, "matrix columns must come in ascending order" # <<<<<<<<<<<<<< @@ -4540,12 +4544,12 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_v_previd <= __pyx_v_docid) != 0))) { PyErr_SetObject(PyExc_AssertionError, __pyx_kp_s_matrix_columns_must_come_in_asce); - __PYX_ERR(0, 225, __pyx_L1_error) + __PYX_ERR(0, 206, __pyx_L1_error) } } #endif - /* "gensim/corpora/_mmreader.pyx":226 + /* "gensim/corpora/_mmreader.pyx":207 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -4555,7 +4559,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = ((__pyx_v_docid != __pyx_v_previd) != 0); if (__pyx_t_4) { - 
/* "gensim/corpora/_mmreader.pyx":227 + /* "gensim/corpora/_mmreader.pyx":208 * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: * if previd >= 0: # <<<<<<<<<<<<<< @@ -4565,7 +4569,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = ((__pyx_v_previd >= 0) != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":228 + /* "gensim/corpora/_mmreader.pyx":209 * if docid != previd: * if previd >= 0: * break # <<<<<<<<<<<<<< @@ -4574,7 +4578,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ goto __pyx_L6_break; - /* "gensim/corpora/_mmreader.pyx":227 + /* "gensim/corpora/_mmreader.pyx":208 * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: * if previd >= 0: # <<<<<<<<<<<<<< @@ -4583,7 +4587,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":229 + /* "gensim/corpora/_mmreader.pyx":210 * if previd >= 0: * break * previd = docid # <<<<<<<<<<<<<< @@ -4592,7 +4596,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_v_previd = __pyx_v_docid; - /* "gensim/corpora/_mmreader.pyx":226 + /* "gensim/corpora/_mmreader.pyx":207 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -4601,18 +4605,18 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":231 + /* "gensim/corpora/_mmreader.pyx":212 * previd = docid * * document.append((termid, val,)) # add another field to the current document # <<<<<<<<<<<<<< * * if close_fin: */ - __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_termid); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_termid); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 212, __pyx_L1_error) 
__Pyx_GOTREF(__pyx_t_5); - __pyx_t_1 = PyFloat_FromDouble(__pyx_v_val); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_1 = PyFloat_FromDouble(__pyx_v_val); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 212, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 212, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_5); @@ -4620,10 +4624,10 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_1); __pyx_t_5 = 0; __pyx_t_1 = 0; - __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_document, __pyx_t_11); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_document, __pyx_t_11); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 212, __pyx_L1_error) __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - /* "gensim/corpora/_mmreader.pyx":214 + /* "gensim/corpora/_mmreader.pyx":195 * fin.seek(offset) # works for gzip/bz2 input, too * previd, document = -1, [] * for line in fin: # <<<<<<<<<<<<<< @@ -4634,7 +4638,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_L6_break:; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":233 + /* "gensim/corpora/_mmreader.pyx":214 * document.append((termid, val,)) # add another field to the current document * * if close_fin: # <<<<<<<<<<<<<< @@ -4644,13 +4648,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = (__pyx_v_close_fin != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":234 + /* "gensim/corpora/_mmreader.pyx":215 * * if close_fin: * fin.close() # <<<<<<<<<<<<<< * return document */ - __pyx_t_11 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_close); if (unlikely(!__pyx_t_11)) 
__PYX_ERR(0, 234, __pyx_L1_error) + __pyx_t_11 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_close); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 215, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __pyx_t_1 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_11))) { @@ -4663,16 +4667,16 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (__pyx_t_1) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_11, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 234, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_11, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { - __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_11); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 234, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_11); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":233 + /* "gensim/corpora/_mmreader.pyx":214 * document.append((termid, val,)) # add another field to the current document * * if close_fin: # <<<<<<<<<<<<<< @@ -4681,7 +4685,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":235 + /* "gensim/corpora/_mmreader.pyx":216 * if close_fin: * fin.close() * return document # <<<<<<<<<<<<<< @@ -4691,12 +4695,12 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_r = __pyx_v_document; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":182 + /* "gensim/corpora/_mmreader.pyx":167 * yield previd, [] * * def docbyoffset(self, offset): # <<<<<<<<<<<<<< - * """ - * docbyoffset(offset) + * """Get document at file offset `offset` (in bytes). 
+ * */ /* function exit code */ @@ -4717,7 +4721,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":44 +/* "gensim/corpora/_mmreader.pyx":41 * * """ * cdef public input # <<<<<<<<<<<<<< @@ -4812,7 +4816,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_5input_4__del__(struct return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":45 +/* "gensim/corpora/_mmreader.pyx":42 * """ * cdef public input * cdef public bint transposed # <<<<<<<<<<<<<< @@ -4839,7 +4843,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_10transposed___g PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->transposed); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 45, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->transposed); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 42, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -4874,7 +4878,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_10transposed_2__set__( __Pyx_RefNannyDeclarations int __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 45, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 42, __pyx_L1_error) __pyx_v_self->transposed = __pyx_t_1; /* function exit code */ @@ -4888,7 +4892,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_10transposed_2__set__( return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":46 +/* "gensim/corpora/_mmreader.pyx":43 * cdef public input * cdef public bint transposed * cdef public int num_docs, num_terms, num_nnz # <<<<<<<<<<<<<< @@ -4915,7 +4919,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8num_docs___get_ 
PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -4950,7 +4954,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8num_docs_2__set__(str __Pyx_RefNannyDeclarations int __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L1_error) __pyx_v_self->num_docs = __pyx_t_1; /* function exit code */ @@ -4983,7 +4987,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_9num_terms___get PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -5018,7 +5022,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_9num_terms_2__set__(st __Pyx_RefNannyDeclarations int __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L1_error) __pyx_v_self->num_terms = __pyx_t_1; /* function exit code */ @@ -5051,7 +5055,7 @@ static PyObject 
*__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_7num_nnz___get__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -5086,7 +5090,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_7num_nnz_2__set__(stru __Pyx_RefNannyDeclarations int __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 46, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L1_error) __pyx_v_self->num_nnz = __pyx_t_1; /* function exit code */ @@ -5108,6 +5112,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_7num_nnz_2__set__(stru /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_14__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_13__reduce_cython__[] = "MmReader.__reduce_cython__(self)"; static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_14__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations @@ -5364,6 +5369,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_13__reduce_cytho /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_16__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/ +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_15__setstate_cython__[] = "MmReader.__setstate_cython__(self, __pyx_state)"; static 
PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_16__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations @@ -5419,7 +5425,8 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_15__setstate_cyt /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyMethodDef __pyx_mdef_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader = {"__pyx_unpickle_MmReader", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader, METH_VARARGS|METH_KEYWORDS, 0}; +static char __pyx_doc_6gensim_7corpora_9_mmreader___pyx_unpickle_MmReader[] = "__pyx_unpickle_MmReader(__pyx_type, long __pyx_checksum, __pyx_state)"; +static PyMethodDef __pyx_mdef_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader = {"__pyx_unpickle_MmReader", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader, METH_VARARGS|METH_KEYWORDS, __pyx_doc_6gensim_7corpora_9_mmreader___pyx_unpickle_MmReader}; static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v___pyx_type = 0; long __pyx_v___pyx_checksum; @@ -6042,17 +6049,17 @@ static int __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_nnz(PyObject static PyMethodDef __pyx_methods_6gensim_7corpora_9_mmreader_MmReader[] = { {"skip_headers", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_7skip_headers, METH_O, __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers}, {"docbyoffset", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_12docbyoffset, METH_O, __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset}, - {"__reduce_cython__", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_14__reduce_cython__, METH_NOARGS, 0}, - {"__setstate_cython__", 
(PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_16__setstate_cython__, METH_O, 0}, + {"__reduce_cython__", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_14__reduce_cython__, METH_NOARGS, __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_13__reduce_cython__}, + {"__setstate_cython__", (PyCFunction)__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_16__setstate_cython__, METH_O, __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_15__setstate_cython__}, {0, 0, 0, 0} }; static struct PyGetSetDef __pyx_getsets_6gensim_7corpora_9_mmreader_MmReader[] = { - {(char *)"input", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_input, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_input, (char *)0, 0}, - {(char *)"transposed", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_transposed, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_transposed, (char *)0, 0}, - {(char *)"num_docs", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_num_docs, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_docs, (char *)0, 0}, - {(char *)"num_terms", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_num_terms, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_terms, (char *)0, 0}, - {(char *)"num_nnz", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_num_nnz, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_nnz, (char *)0, 0}, + {(char *)"input", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_input, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_input, (char *)"input: object", 0}, + {(char *)"transposed", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_transposed, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_transposed, (char *)"transposed: 'bool'", 0}, + {(char *)"num_docs", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_num_docs, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_docs, (char *)"num_docs: 'int'", 0}, + {(char *)"num_terms", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_num_terms, 
__pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_terms, (char *)"num_terms: 'int'", 0}, + {(char *)"num_nnz", __pyx_getprop_6gensim_7corpora_9_mmreader_8MmReader_num_nnz, __pyx_setprop_6gensim_7corpora_9_mmreader_8MmReader_num_nnz, (char *)"num_nnz: 'int'", 0}, {0, 0, 0, 0, 0} }; @@ -6101,7 +6108,7 @@ static PyTypeObject __pyx_type_6gensim_7corpora_9_mmreader_MmReader = { 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/ - "\n matrix market file reader\n\n Wrap a term-document matrix on disk (in matrix-market format), and present it\n as an object which supports iteration over the rows (~documents).\n\n Attributes\n ----------\n num_docs : int\n number of documents in market matrix file\n num_terms : int\n number of terms\n num_nnz : int\n number of non-zero terms\n\n Notes\n ----------\n Note that the file is read into memory one document at a time, not the whole\n matrix at once (unlike scipy.io.mmread). This allows us to process corpora\n which are larger than the available RAM.\n\n ", /*tp_doc*/ + "MmReader(input, transposed=True)\nMatrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`.\n\n Wrap a term-document matrix on disk (in matrix-market format), and present it\n as an object which supports iteration over the rows (~documents).\n\n Attributes\n ----------\n num_docs : int\n Number of documents in market matrix file.\n num_terms : int\n Number of terms.\n num_nnz : int\n Number of non-zero terms.\n\n Notes\n ----------\n Note that the file is read into memory one document at a time, not the whole\n matrix at once (unlike scipy.io.mmread). 
This allows us to process corpora\n which are larger than the available RAM.\n\n ", /*tp_doc*/ __pyx_tp_traverse_6gensim_7corpora_9_mmreader_MmReader, /*tp_traverse*/ __pyx_tp_clear_6gensim_7corpora_9_mmreader_MmReader, /*tp_clear*/ 0, /*tp_richcompare*/ @@ -6580,9 +6587,9 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {0, 0, 0, 0, 0, 0, 0} }; static int __Pyx_InitCachedBuiltins(void) { - __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(0, 72, __pyx_L1_error) - __pyx_builtin_StopIteration = __Pyx_GetBuiltinName(__pyx_n_s_StopIteration); if (!__pyx_builtin_StopIteration) __PYX_ERR(0, 76, __pyx_L1_error) - __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 80, __pyx_L1_error) + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(0, 64, __pyx_L1_error) + __pyx_builtin_StopIteration = __Pyx_GetBuiltinName(__pyx_n_s_StopIteration); if (!__pyx_builtin_StopIteration) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 72, __pyx_L1_error) return 0; __pyx_L1_error:; return -1; @@ -6592,58 +6599,58 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); - /* "gensim/corpora/_mmreader.pyx":71 + /* "gensim/corpora/_mmreader.pyx":63 * try: * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): # <<<<<<<<<<<<<< * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % */ - __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_matrixmarket_matrix_coordinate); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 71, __pyx_L1_error) + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_matrixmarket_matrix_coordinate); if 
(unlikely(!__pyx_tuple_)) __PYX_ERR(0, 63, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); - /* "gensim/corpora/_mmreader.pyx":82 + /* "gensim/corpora/_mmreader.pyx":74 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) * if not line.startswith('%'): # <<<<<<<<<<<<<< * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: */ - __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s__2); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 82, __pyx_L1_error) + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s__2); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 74, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__3); __Pyx_GIVEREF(__pyx_tuple__3); - /* "gensim/corpora/_mmreader.pyx":68 + /* "gensim/corpora/_mmreader.pyx":60 * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< * try: * header = utils.to_unicode(next(lines)).strip() */ - __pyx_tuple__4 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_tuple__4 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - /* "gensim/corpora/_mmreader.pyx":113 + /* "gensim/corpora/_mmreader.pyx":103 * """ * for line in input_file: * if line.startswith(b'%'): # <<<<<<<<<<<<<< * continue * break */ - __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_b__2); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 113, __pyx_L1_error) + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_b__2); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 103, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__5); __Pyx_GIVEREF(__pyx_tuple__5); - /* "gensim/corpora/_mmreader.pyx":140 + /* "gensim/corpora/_mmreader.pyx":125 * cdef double val = 0 * * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< * 
self.skip_headers(lines) * */ - __pyx_tuple__6 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_tuple__6 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 125, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__6); __Pyx_GIVEREF(__pyx_tuple__6); @@ -6815,11 +6822,11 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) /*--- Variable export code ---*/ /*--- Function export code ---*/ /*--- Type init code ---*/ - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 21, __pyx_L1_error) + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 19, __pyx_L1_error) __pyx_type_6gensim_7corpora_9_mmreader_MmReader.tp_print = 0; #if CYTHON_COMPILING_IN_CPYTHON { - PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__init__"); if (unlikely(!wrapper)) __PYX_ERR(0, 21, __pyx_L1_error) + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__init__"); if (unlikely(!wrapper)) __PYX_ERR(0, 19, __pyx_L1_error) if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__ = *((PyWrapperDescrObject *)wrapper)->d_base; __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__; @@ -6829,7 +6836,17 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) #endif #if CYTHON_COMPILING_IN_CPYTHON { - PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__iter__"); if (unlikely(!wrapper)) __PYX_ERR(0, 21, __pyx_L1_error) + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__len__"); if (unlikely(!wrapper)) __PYX_ERR(0, 19, __pyx_L1_error) + if (Py_TYPE(wrapper) == 
&PyWrapperDescr_Type) { + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__ = *((PyWrapperDescrObject *)wrapper)->d_base; + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_2__len__; + ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__; + } + } + #endif + #if CYTHON_COMPILING_IN_CPYTHON + { + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__iter__"); if (unlikely(!wrapper)) __PYX_ERR(0, 19, __pyx_L1_error) if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__ = *((PyWrapperDescrObject *)wrapper)->d_base; __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; @@ -6837,16 +6854,16 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) } } #endif - if (PyObject_SetAttrString(__pyx_m, "MmReader", (PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 21, __pyx_L1_error) - if (__Pyx_setup_reduce((PyObject*)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 21, __pyx_L1_error) + if (PyObject_SetAttrString(__pyx_m, "MmReader", (PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 19, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 19, __pyx_L1_error) __pyx_ptype_6gensim_7corpora_9_mmreader_MmReader = &__pyx_type_6gensim_7corpora_9_mmreader_MmReader; - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__) < 0) __PYX_ERR(0, 48, __pyx_L1_error) + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__) < 0) __PYX_ERR(0, 45, __pyx_L1_error) __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__.tp_print = 0; 
__pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__; - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr) < 0) __PYX_ERR(0, 83, __pyx_L1_error) + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr) < 0) __PYX_ERR(0, 75, __pyx_L1_error) __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr.tp_print = 0; __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr; - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__) < 0) __PYX_ERR(0, 117, __pyx_L1_error) + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__) < 0) __PYX_ERR(0, 107, __pyx_L1_error) __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__.tp_print = 0; __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__; /*--- Type import code ---*/ @@ -6857,94 +6874,94 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) #endif - /* "gensim/corpora/_mmreader.pyx":8 + /* "gensim/corpora/_mmreader.pyx":6 * from __future__ import with_statement * * from gensim import utils # <<<<<<<<<<<<<< * * from six import string_types */ - __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_utils); __Pyx_GIVEREF(__pyx_n_s_utils); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_utils); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_gensim, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_t_2 = __Pyx_Import(__pyx_n_s_gensim, __pyx_t_1, -1); if 
(unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_utils, __pyx_t_1) < 0) __PYX_ERR(0, 8, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_utils, __pyx_t_1) < 0) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":10 + /* "gensim/corpora/_mmreader.pyx":8 * from gensim import utils * * from six import string_types # <<<<<<<<<<<<<< * from six.moves import xrange * import logging */ - __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) + __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_INCREF(__pyx_n_s_string_types); __Pyx_GIVEREF(__pyx_n_s_string_types); PyList_SET_ITEM(__pyx_t_2, 0, __pyx_n_s_string_types); - __pyx_t_1 = __Pyx_Import(__pyx_n_s_six, __pyx_t_2, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) + __pyx_t_1 = __Pyx_Import(__pyx_n_s_six, __pyx_t_2, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_string_types); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) + __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_string_types); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_string_types, __pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_string_types, __pyx_t_2) < 0) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); 
__pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":11 + /* "gensim/corpora/_mmreader.pyx":9 * * from six import string_types * from six.moves import xrange # <<<<<<<<<<<<<< * import logging * */ - __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_xrange); __Pyx_GIVEREF(__pyx_n_s_xrange); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_xrange); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_six_moves, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) + __pyx_t_2 = __Pyx_Import(__pyx_n_s_six_moves, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_xrange, __pyx_t_1) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_xrange, __pyx_t_1) < 0) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":12 + /* "gensim/corpora/_mmreader.pyx":10 * from six import string_types * from six.moves import xrange * import logging # <<<<<<<<<<<<<< * * cimport cython */ - __pyx_t_2 = __Pyx_Import(__pyx_n_s_logging, 0, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error) + __pyx_t_2 = __Pyx_Import(__pyx_n_s_logging, 0, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_logging, __pyx_t_2) < 0) __PYX_ERR(0, 12, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_logging, 
__pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":18 + /* "gensim/corpora/_mmreader.pyx":16 * * * logger = logging.getLogger(__name__) # <<<<<<<<<<<<<< * * */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_logging); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_logging); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_getLogger); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_getLogger); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_name); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_name); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_4 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { @@ -6957,14 +6974,14 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) } } if (!__pyx_t_4) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_2); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_1}; - __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_4); 
__pyx_t_4 = 0; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; @@ -6973,26 +6990,26 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_1}; - __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else #endif { - __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL; __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_1); __pyx_t_1 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (PyDict_SetItem(__pyx_d, __pyx_n_s_logger, __pyx_t_2) < 0) __PYX_ERR(0, 18, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_logger, __pyx_t_2) < 0) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* "(tree fragment)":1 @@ -7007,8 +7024,8 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) /* "gensim/corpora/_mmreader.pyx":1 * # Copyright (C) 2018 Radim Rehurek # <<<<<<<<<<<<<< - * """ - * Reader for corpus in the Matrix Market format. 
+ * # cython: embedsignature=True + * """Reader for corpus in the Matrix Market format.""" */ __pyx_t_2 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx index 725fa4040c..f1f48fab5e 100644 --- a/gensim/corpora/_mmreader.pyx +++ b/gensim/corpora/_mmreader.pyx @@ -1,8 +1,6 @@ # Copyright (C) 2018 Radim Rehurek -""" -Reader for corpus in the Matrix Market format. - -""" +# cython: embedsignature=True +"""Reader for corpus in the Matrix Market format.""" from __future__ import with_statement from gensim import utils @@ -19,8 +17,7 @@ logger = logging.getLogger(__name__) cdef class MmReader(object): - """ - matrix market file reader + """Matrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. Wrap a term-document matrix on disk (in matrix-market format), and present it as an object which supports iteration over the rows (~documents). @@ -28,11 +25,11 @@ cdef class MmReader(object): Attributes ---------- num_docs : int - number of documents in market matrix file + Number of documents in market matrix file. num_terms : int - number of terms + Number of terms. num_nnz : int - number of non-zero terms + Number of non-zero terms. Notes ---------- @@ -47,20 +44,15 @@ cdef class MmReader(object): def __init__(self, input, transposed=True): """ - MmReader(input, transposed=True): - - Create matrix reader Parameters ---------- - input : string or file-like - string (file path) or a file-like object that supports - `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are - not closed automatically. + input : {str, file-like object} + Path to input file in MM format or a file-like object that supports `seek()` + (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`). 
- transposed : bool - if True, expects lines to represent doc_id, term_id, value - else, expects term_id, doc_id, value + transposed : bool, optional + if True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value. """ logger.info("initializing cython corpus reader from %s", input) @@ -91,6 +83,7 @@ cdef class MmReader(object): ) def __len__(self): + """Get size of corpus (number of documents).""" return self.num_docs def __str__(self): @@ -98,15 +91,12 @@ cdef class MmReader(object): (self.num_docs, self.num_terms, self.num_nnz)) def skip_headers(self, input_file): - """ - skip_headers(self, input_file) - - Skip file headers that appear before the first document. + """Skip file headers that appear before the first document. Parameters ---------- - input_file : iterable - consumes any lines from start of `input_file` that begin with a % + input_file : iterable of str + Iterable taken from file in MM format. """ for line in input_file: @@ -115,23 +105,18 @@ cdef class MmReader(object): break def __iter__(self): - """ - __iter__() - - Iterate through vectors from underlying matrix + """Iterate through corpus. - Yields + Notes ------ - int, list of (termid, val) - document id and "vector" of terms for next document in matrix - vector of terms is represented as a list of (termid, val) tuples + Note that the total number of vectors returned is always equal to the number of rows specified + in the header, empty documents are inserted and yielded where appropriate, even if they are not explicitly + stored in the Matrix Market file. - Notes + Yields ------ - Note that the total number of vectors returned is always equal to the - number of rows specified in the header; empty documents are inserted and - yielded where appropriate, even if they are not explicitly stored in the - Matrix Market file. 
+ (int, list of (int, number)) + Document id and document in BoW format. """ cdef int docid, termid, previd @@ -180,21 +165,17 @@ cdef class MmReader(object): yield previd, [] def docbyoffset(self, offset): - """ - docbyoffset(offset) - - Return document at file offset `offset` (in bytes) + """Get document at file offset `offset` (in bytes). Parameters ---------- offset : int - offset, in bytes, of desired document + Offset, in bytes, of desired document. Returns ------ - list of (termid, val) - "vector" of terms for document at offset - vector of terms is represented as a list of (termid, val) tuples + list of (int, number) + Document in BoW format. """ # empty documents are not stored explicitly in MM format, so the index marks diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index eec8242017..d7770607e8 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -5,10 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in the Matrix Market format. -""" - +"""Corpus in the Matrix Market format.""" import logging @@ -20,8 +17,7 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): - """ - Corpus in matrix market format + """Corpus in matrix market format. Wrap a term-document matrix on disk (in matrix-market format), and present it as an object which supports iteration over the rows (~documents). @@ -29,30 +25,37 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): Attributes ---------- num_docs : int - number of documents in market matrix file + Number of documents in market matrix file. num_terms : int - number of terms + Number of terms. num_nnz : int - number of non-zero terms + Number of non-zero terms. Notes ---------- - Note that the file is read into memory one document at a time, not the whole - matrix at once (unlike scipy.io.mmread). This allows us to process corpora - which are larger than the available RAM.
+ Note that the file is read into memory one document at a time, not the whole matrix at once + (unlike :func:`~scipy.io.mmread`). This allows us to process corpora which are larger than the available RAM. + + Examples + -------- + >>> from gensim.corpora.mmcorpus import MmCorpus + >>> from gensim.test.utils import datapath + >>> import gensim.downloader as api + >>> + >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm')) + >>> for document in corpus: + ... pass """ def __init__(self, fname): """ - Read corpus in matrix market format Parameters ---------- - fname : string or file-like - string (file path) or a file-like object that supports - `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are - not closed automatically. + fname : {str, file-like object} + Path to file in MM format or a file-like object that supports `seek()` + (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`). """ @@ -61,34 +64,56 @@ def __init__(self, fname): matutils.MmReader.__init__(self, fname) def __iter__(self): - """ - Iterate through vectors from underlying matrix + """Iterate through the documents of the corpus. Yields ------ - list of (termid, val) - "vector" of terms for next document in matrix - vector of terms is represented as a list of (termid, val) tuples + list of (int, number) + Document in BoW format. Notes ------ - Note that the total number of vectors returned is always equal to the - number of rows specified in the header; empty documents are inserted and - yielded where appropriate, even if they are not explicitly stored in the + The total number of vectors returned is always equal to the number of rows specified in the header. + Empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the Matrix Market file.
""" - for doc_id, doc in super(MmCorpus, self).__iter__(): yield doc # get rid of doc id, return the sparse vector only @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): - """ - Save a corpus in the Matrix Market format to disk. + """Save a corpus in the Matrix Market format to disk. + Parameters + ---------- + fname : str + Path to file. + corpus : iterable of list of (int, number) + Corpus in Bow format. + id2word : dict of (int, str), optional + WordId -> Word. + progress_cnt : int, optional + Progress counter. + metadata : bool, optional + If true, writes out additional metadata. + + Notes + ----- This function is automatically called by `MmCorpus.serialize`; don't call it directly, call `serialize` instead. + + Example + ------- + >>> from gensim.corpora.mmcorpus import MmCorpus + >>> from gensim.test.utils import datapath + >>> import gensim.downloader as api + >>> + >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm')) + >>> + >>> MmCorpus.save_corpus("random", corpus) # Do not do it, use `serialize` instead. + [97, 121, 169, 201, 225, 249, 258, 276, 303] + """ logger.info("storing corpus in Matrix Market format to %s", fname) num_terms = len(id2word) if id2word is not None else None diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 41ad492570..cd3d0d26e4 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -216,13 +216,26 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter Examples -------- - >>> #TODO Example with inheritance >>> from gensim.corpora.textcorpus import TextCorpus >>> from gensim.test.utils import datapath + >>> from gensim import utils >>> - >>> corpus = TextCorpus(datapath('head500.noblanks.cor.bz2')) - >>> for bow in corpus: - ... pass + >>> + >>> class CorpusMiislita(TextCorpus): + ... stopwords = set('for a of the and to in on'.split()) + ... + ... def get_texts(self): + ... 
for doc in self.getstream(): + ... yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords] + ... + ... def __len__(self): + ... self.length = sum(1 for _ in self.get_texts()) + ... return self.length + >>> + >>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2')) + >>> len(corpus) + 250 + >>> document = next(iter(corpus.get_texts())) """ self.input = input diff --git a/gensim/matutils.py b/gensim/matutils.py index b33b32b272..e700add37d 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -1111,7 +1111,7 @@ def qr_destroy(la): class MmWriter(object): - """Store a corpus in Matrix Market format. + """Store a corpus in Matrix Market format, used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. Notes ----- @@ -1133,7 +1133,7 @@ def __init__(self, fname): Parameters ---------- fname : str - Path to output file + Path to output file. """ self.fname = fname @@ -1143,16 +1143,16 @@ def __init__(self, fname): self.headers_written = False def write_headers(self, num_docs, num_terms, num_nnz): - """Write headers to file + """Write headers to file. Parameters ---------- num_docs : int - Number of documents in corpus + Number of documents in corpus. num_terms : int - Number of term in corpus + Number of term in corpus. num_nnz : int - Number of non-zero elements in corpus + Number of non-zero elements in corpus. """ self.fout.write(MmWriter.HEADER_LINE) @@ -1176,11 +1176,11 @@ def fake_headers(self, num_docs, num_terms, num_nnz): Parameters ---------- num_docs : int - Number of documents in corpus + Number of documents in corpus. num_terms : int - Number of term in corpus + Number of term in corpus. num_nnz : int - Number of non-zero elements in corpus + Number of non-zero elements in corpus. """ stats = '%i %i %i' % (num_docs, num_terms, num_nnz) @@ -1196,8 +1196,8 @@ def write_vector(self, docno, vector): ---------- docno : int Number of document. - vector : list of (int, float) - Vector in BoW format. 
+ vector : list of (int, number) + Document in BoW format. Returns ------- @@ -1222,8 +1222,8 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, ---------- fname : str Filename of the resulting file. - corpus : iterable of iterable of (int, float) - Corpus in Bow format + corpus : iterable of list of (int, number) + Corpus in Bow format. progress_cnt : int, optional Print progress for every `progress_cnt` number of documents. index : bool, optional @@ -1236,7 +1236,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, Returns ------- offsets : {list of int, None} - List of offsets or nothing. + List of offsets (if index=True) or nothing. Notes ----- @@ -1301,18 +1301,17 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, return offsets def __del__(self): - """Automatic destructor which closes the underlying file. + """Close `self.fout` file, alias for :meth:`~gensim.matutils.MmWriter.close`. - Notes - ----- - There must be no circular references contained in the object for __del__ to work! + Warnings + -------- Closing the file explicitly via the close() method is preferred and safer. """ self.close() # does nothing if called twice (on an already closed file), so no worries def close(self): - """Close file.""" + """Close `self.fout` file.""" logger.debug("closing %s", self.fname) if hasattr(self, 'fout'): self.fout.close() @@ -1325,8 +1324,7 @@ def close(self): FAST_VERSION = -1 class MmReader(object): - """ - matrix market file reader + """Matrix market file reader, used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. Wrap a term-document matrix on disk (in matrix-market format), and present it as an object which supports iteration over the rows (~documents). @@ -1342,26 +1340,22 @@ class MmReader(object): Notes ---------- - Note that the file is read into memory one document at a time, not the whole - matrix at once (unlike scipy.io.mmread). 
This allows us to process corpora - which are larger than the available RAM. + Note that the file is read into memory one document at a time, not the whole matrix at once + (unlike :func:`~scipy.io.mmread`). This allows us to process corpora which are larger than the available RAM. """ def __init__(self, input, transposed=True): """ - Create matrix reader Parameters ---------- - input : string or file-like - string (file path) or a file-like object that supports - `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are - not closed automatically. + input : {str, file-like object} + Path to input file in MM format or a file-like object that supports `seek()` + (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`). - transposed : bool - if True, expects lines to represent doc_id, term_id, value - else, expects term_id, doc_id, value + transposed : bool, optional + If True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value. """ logger.info("initializing corpus reader from %s", input) @@ -1392,6 +1386,7 @@ def __init__(self, input, transposed=True): ) def __len__(self): + """Get size of corpus (number of documents).""" return self.num_docs def __str__(self): @@ -1399,13 +1394,12 @@ def __str__(self): (self.num_docs, self.num_terms, self.num_nnz)) def skip_headers(self, input_file): - """ - Skip file headers that appear before the first document. + """Skip file headers that appear before the first document. Parameters ---------- - input_file : iterable - consumes any lines from start of `input_file` that begin with a % + input_file : iterable of str + Iterable taken from file in MM format. """ for line in input_file: @@ -1414,21 +1408,18 @@ def skip_headers(self, input_file): break def __iter__(self): - """ - Iterate through vectors from underlying matrix + """Iterate through corpus.
- Yields + Notes ------ - int, list of (termid, val) - document id and "vector" of terms for next document in matrix - vector of terms is represented as a list of (termid, val) tuples + Note that the total number of vectors returned is always equal to the number of rows specified + in the header; empty documents are inserted and yielded where appropriate, even if they are not explicitly + stored in the Matrix Market file. - Notes + Yields ------ - Note that the total number of vectors returned is always equal to the - number of rows specified in the header; empty documents are inserted and - yielded where appropriate, even if they are not explicitly stored in the - Matrix Market file. + (int, list of (int, number)) + Document id and document in BoW format. """ with utils.file_or_filename(self.input) as lines: @@ -1468,21 +1459,19 @@ def __iter__(self): yield previd, [] def docbyoffset(self, offset): - """ - Return document at file offset `offset` (in bytes) + """Get document at file offset `offset` (in bytes). Parameters ---------- offset : int - offset, in bytes, of desired document + Offset, in bytes, of desired document. Returns ------ - list of (termid, val) - "vector" of terms for document at offset - vector of terms is represented as a list of (termid, val) tuples - """ + list of (int, number) + Document in BoW format. + """ # empty documents are not stored explicitly in MM format, so the index marks # them with a special offset, -1. if offset == -1: diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index b315304771..6c6f79a5a4 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -4,37 +4,42 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions and classes for computing similarities across -a collection of documents in the Vector Space Model.
- -The main class is `Similarity`, which builds an index for a given set of documents. -Once the index is built, you can perform efficient queries like "Tell me how similar -is this query document to each document in the index?". The result is a vector -of numbers as large as the size of the initial set of documents, that is, one float -for each index document. Alternatively, you can also request only the top-N most +"""Computing similarities across a collection of documents in the Vector Space Model. + +The main class is :class:`~gensim.similarities.docsim.Similarity`, which builds an index for a given set of documents. +Once the index is built, you can perform efficient queries like "Tell me how similar is this query document to each +document in the index?". The result is a vector of numbers as large as the size of the initial set of documents, +that is, one float for each index document. Alternatively, you can also request only the top-N most similar index documents to the query. -You can later add new documents to the index via `Similarity.add_documents()`. How It Works ------------ - -The `Similarity` class splits the index into several smaller sub-indexes ("shards"), -which are disk-based. If your entire index fits in memory (~hundreds of thousands -documents for 1GB of RAM), you can also use the `MatrixSimilarity` or `SparseMatrixSimilarity` -classes directly. These are more simple but do not scale as well (they keep the -entire index in RAM, no sharding). +The :class:`~gensim.similarities.docsim.Similarity` class splits the index into several smaller sub-indexes ("shards"), +which are disk-based. If your entire index fits in memory (~hundreds of thousands documents for 1GB of RAM), +you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity` +or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly. +These are simpler but do not scale as well (they keep the entire index in RAM, no sharding).
Once the index has been initialized, you can query for document similarity simply by: - ->>> index = Similarity('/tmp/tst', corpus, num_features=12) # build the index +>>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile +>>> +>>> index_tmpfile = get_tmpfile("index") +>>> query = [(1, 2), (6, 1), (7, 2)] +>>> +>>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index >>> similarities = index[query] # get similarities between the query and all index documents If you have more query documents, you can submit them all at once, in a batch: ->>> for similarities in index[batch_of_documents]: # the batch is simply an iterable of documents (=gensim corpus) ->>> ... +>>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile +>>> +>>> index_tmpfile = get_tmpfile("index") +>>> batch_of_documents = common_corpus[:] # only as example +>>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index +>>> +>>> for similarities in index[batch_of_documents]: # the batch is simply an iterable of documents, aka gensim corpus. +... pass The benefit of this batch (aka "chunked") querying is much better performance. To see the speed-up on your machine, run ``python -m gensim.test.simspeed`` @@ -44,12 +49,15 @@ to the index itself (i.e. queries=indexed documents themselves). This special syntax uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**: +>>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile +>>> +>>> index_tmpfile = get_tmpfile("index") +>>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index +>>> >>> for similarities in index: # yield similarities of the 1st indexed document, then 2nd... ->>> ... +... 
pass """ - - import logging import itertools import os @@ -74,16 +82,23 @@ class Shard(utils.SaveLoad): - """ - A proxy class that represents a single shard instance within a Similarity - index. + """A proxy that represents a single shard instance within :class:`~gensim.similarities.docsim.Similarity` index. - Basically just wraps (Sparse)MatrixSimilarity so that it mmaps from disk on - request (query). + Basically just wraps :class:`~gensim.similarities.docsim.MatrixSimilarity`, + :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`, etc, so that it mmaps from disk on request (query). """ def __init__(self, fname, index): + """ + Parameters + ---------- + fname : str + Path to the shard file. + index : :class:`~gensim.interfaces.SimilarityABC` + Index object. + + """ self.dirname, self.fname = os.path.split(fname) self.length = len(index) self.cls = index.__class__ @@ -92,12 +107,29 @@ def __init__(self, fname, index): self.index = self.get_index() def fullname(self): + """Get full path to shard file. + + Returns + ------- + str + Path to shard instance. + + """ return os.path.join(self.dirname, self.fname) def __len__(self): + """Get length.""" return self.length def __getstate__(self): + """Special handler for pickle. + + Returns + ------- + dict + Object that contains state of current instance without `index`. + + """ result = self.__dict__.copy() # (S)MS objects must be loaded via load() because of mmap (simple pickle.load won't do) if 'index' in result: @@ -108,21 +140,59 @@ def __str__(self): return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname()) def get_index(self): + """Load & get index. + + Returns + ------- + :class:`~gensim.interfaces.SimilarityABC` + Index instance.
+ + """ if not hasattr(self, 'index'): logger.debug("mmaping index from %s", self.fullname()) self.index = self.cls.load(self.fullname(), mmap='r') return self.index def get_document_id(self, pos): - """Return index vector at position `pos`. + """Get index vector at position `pos`. + + Parameters + ---------- + pos : int + Vector position. + Return + ------ + {:class:`scipy.sparse.csr_matrix`, :class:`numpy.ndarray`} + Index vector. Type depends on underlying index. + + Notes + ----- The vector is of the same type as the underlying index (ie., dense for - MatrixSimilarity and scipy.sparse for SparseMatrixSimilarity. + :class:`~gensim.similarities.docsim.MatrixSimilarity` + and scipy.sparse for :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + TODO: Can dense be scipy.sparse? + """ assert 0 <= pos < len(self), "requested position out of range" return self.get_index().index[pos] def __getitem__(self, query): + """Get similarities of document (or corpus) `query` to all documents in the corpus. + + Parameters + ---------- + query : {iterable of list of (int, number) , list of (int, number))} + Document or corpus. + + Returns + ------- + :class:`numpy.ndarray` + Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** + :class:`scipy.sparse.csr_matrix` + for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + + """ index = self.get_index() try: index.num_best = self.num_best @@ -133,6 +203,21 @@ def __getitem__(self, query): def query_shard(args): + """Helper for request query from shard, same as shard[query]. 
+ + Parameters + --------- + args : (list of (int, number), :class:`~gensim.interfaces.SimilarityABC`) + Query and Shard instances + + Returns + ------- + :class:`numpy.ndarray` + Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** + :class:`scipy.sparse.csr_matrix` + for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + + """ query, shard = args # simulate starmap (not part of multiprocessing in older Pythons) logger.debug("querying shard %s num_best=%s in process %s", shard, shard.num_best, os.getpid()) result = shard[query] @@ -141,51 +226,81 @@ def query_shard(args): class Similarity(interfaces.SimilarityABC): - """ - Compute cosine similarity of a dynamic query against a static corpus of documents - ("the index"). + """Compute cosine similarity of a dynamic query against a static corpus of documents ('the index'). - Scalability is achieved by sharding the index into smaller pieces, each of which - fits into core memory (see the `(Sparse)MatrixSimilarity` classes in this module). + Notes + ----- + Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory The shards themselves are simply stored as files to disk and mmap'ed back as needed. - """ - - def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'): - """ - Construct the index from `corpus`. The index can be later extended by calling - the `add_documents` method. **Note**: documents are split (internally, transparently) - into shards of `shardsize` documents each, converted to a matrix, for faster BLAS calls. - Each shard is stored to disk under `output_prefix.shard_number` (=you need write - access to that location). If you don't specify an output prefix, a random - filename in temp will be used. - - `shardsize` should be chosen so that a `shardsize x chunksize` matrix of floats - fits comfortably into main memory. 
- `num_features` is the number of features in the `corpus` (e.g. size of the - dictionary, or the number of latent topics for latent semantic models). - `norm` is the user-chosen normalization to use. Accepted values are: 'l1' and 'l2'. + Examples + -------- + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath, get_tmpfile + >>> from gensim.similarities import Similarity + >>> + >>> corpus = TextCorpus(datapath('testcorpus.mm')) + >>> index_temp = get_tmpfile("index") + >>> index = Similarity(index_temp, corpus, num_features=400) # create index + >>> + >>> query = next(iter(corpus)) + >>> result = index[query] # search similar to `query` in index + >>> + >>> for sims in index[corpus]: # if you have more query documents, you can submit them all at once, in a batch + ... pass + >>> + >>> # There is also a special syntax for when you need similarity of documents in the index + >>> # to the index itself (i.e. queries=indexed documents themselves). This special syntax + >>> # uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**: + >>> for similarities in index: # yield similarities of the 1st indexed document, then 2nd... + ... pass + + See Also + -------- + :class:`~gensim.similarities.docsim.MatrixSimilarity` + Index similarity (dense with cosine distance). + :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` + Index similarity (sparse with cosine distance). + :class:`~gensim.similarities.docsim.SoftCosineSimilarity` + Index similarity (with soft-cosine distance). + :class:`~gensim.similarities.docsim.WmdSimilarity` + Index similarity (with word-mover distance). - If `num_best` is left unspecified, similarity queries will return a full - vector with one float for every document in the index: - - >>> index = Similarity('/path/to/index', corpus, num_features=400) # if corpus has 7 documents... - >>> index[query] # ... 
then result will have 7 floats - [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1] - - If `num_best` is set, queries return only the `num_best` most similar documents, - always leaving out documents for which the similarity is 0. - If the input vector itself only has features with zero values (=the sparse - representation is empty), the returned list will always be empty. + """ - >>> index.num_best = 3 - >>> index[query] # return at most "num_best" of `(index_of_document, similarity)` tuples - [(4, 0.8), (2, 0.13), (3, 0.13)] + def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'): + """ - You can also override `num_best` dynamically, simply by setting e.g. - `self.num_best = 10` before doing a query. + Parameters + ---------- + output_prefix : str + Prefix for shard filename. If None - random filename in temp will be used. + corpus : iterable of list of (int, number) + Corpus in BoW format. + num_features : int + Size of the dictionary (number of features). + num_best : int, optional + If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. + Otherwise, return a full vector with one float for every document in the index. + chunksize : int, optional + Size of block. + shardsize : int, optional + Size of shards should be chosen so that a `shardsize x chunksize` matrix of floats fits comfortably + into memory. + norm : {'l1', 'l2'}, optional + Normalization to use. + + Notes + ------------ + Documents are split (internally, transparently) into shards of `shardsize` documents each, converted to matrix, + for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`. + If you don't specify an output prefix, a random filename in temp will be used. 
+ If your entire index fits in memory (~hundreds of thousands + documents for 1GB of RAM), you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity` + or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly. These are more simple + but do not scale as well (they keep the entire index in RAM, no sharding). """ if output_prefix is None: @@ -206,6 +321,7 @@ def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize self.add_documents(corpus) def __len__(self): + """Get length of index.""" return len(self.fresh_docs) + sum([len(shard) for shard in self.shards]) def __str__(self): @@ -214,11 +330,31 @@ def __str__(self): ) def add_documents(self, corpus): - """ - Extend the index with new documents. + """Extend the index with new documents. + + Parameters + ---------- + corpus : iterable of list of (int, number) + Corpus in BoW format. + + Notes + ----- + Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them + (or when a query is issued). + + Examples + -------- + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath, get_tmpfile + >>> from gensim.similarities import Similarity + >>> + >>> corpus = TextCorpus(datapath('testcorpus.mm')) + >>> index_temp = get_tmpfile("index") + >>> index = Similarity(index_temp, corpus, num_features=400) # create index + >>> + >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt')) + >>> index.add_documents(one_more_corpus) # add more documents in corpus - Internally, documents are buffered and then spilled to disk when there's - `self.shardsize` of them (or when a query is issued). 
""" min_ratio = 1.0 # 0.5 to only reopen shards that are <50% complete if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize: @@ -243,19 +379,34 @@ def add_documents(self, corpus): logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs)) def shardid2filename(self, shardid): + """Get shard file by `shardid`. + + Parameters + ---------- + shardid : int + Shard index. + + Return + ------ + str + Path to shard file. + + """ if self.output_prefix.endswith('.'): return "%s%s" % (self.output_prefix, shardid) else: return "%s.%s" % (self.output_prefix, shardid) def close_shard(self): - """ - Force the latest shard to close (be converted to a matrix and stored - to disk). Do nothing if no new documents added since last call. + """Force the latest shard to close (be converted to a matrix and stored to disk). + Do nothing if no new documents added since last call. - **NOTE**: the shard is closed even if it is not full yet (its size is smaller - than `self.shardsize`). If documents are added later via `add_documents()`, + Notes + ----- + The shard is closed even if it is not full yet (its size is smaller than `self.shardsize`). + If documents are added later via :meth:`~gensim.similarities.docsim.MatrixSimilarity.add_documents` this incomplete shard will be loaded again and completed. + """ if not self.fresh_docs: return @@ -276,6 +427,7 @@ def close_shard(self): self.fresh_docs, self.fresh_nnz = [], 0 def reopen_shard(self): + """Reopen incomplete shard.""" assert self.shards if self.fresh_docs: raise ValueError("cannot reopen a shard with fresh documents in index") @@ -289,12 +441,19 @@ def reopen_shard(self): logger.debug("reopen complete") def query_shards(self, query): - """ - Return the result of applying shard[query] for each shard in self.shards, - as a sequence. + """Applying shard[query] for each shard in `self.shards`, as a sequence. 
+ + Parameters + ---------- + query : {iterable of list of (int, number) , list of (int, number))} + Document in BoW format or corpus of documents. + + + Returns + ------- + (None, list of ...) + Result of search. - If PARALLEL_SHARDS is set, the shards are queried in parallel, using - the multiprocessing module. """ args = zip([query] * len(self.shards), self.shards) if PARALLEL_SHARDS and PARALLEL_SHARDS > 1: @@ -308,13 +467,37 @@ def query_shards(self, query): return pool, result def __getitem__(self, query): - """Get similarities of document `query` to all documents in the corpus. + """Get similarities of document (or corpus) `query` to all documents in the corpus. + + Parameters + ---------- + query : {iterable of list of (int, number) , list of (int, number))} + Corpus or document of corpus. + + Return + ------ + :class:`numpy.ndarray` + Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** + :class:`scipy.sparse.csr_matrix` + for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + + Notes + ----- + If `query` is a corpus (iterable of documents), return a matrix of similarities of + all query documents vs. all corpus document. This batch query is more efficient than computing the similarities + one document after another. - **or** + Examples + -------- + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath + >>> from gensim.similarities import Similarity + >>> import gensim.downloader as api + >>> + >>> corpus = TextCorpus(datapath('testcorpus.txt')) + >>> index = Similarity('temp', corpus, num_features=400) + >>> result = index[corpus] # similarities matrix - If `query` is a corpus (iterable of documents), return a matrix of similarities - of all query documents vs. all corpus document. This batch query is more - efficient than computing the similarities one document after another. 
""" self.close_shard() # no-op if no documents added to index since last query @@ -364,8 +547,30 @@ def convert(shard_no, doc): return result def vector_by_id(self, docpos): - """ - Return indexed vector corresponding to the document at position `docpos`. + """Get indexed vector corresponding to the document at position `docpos`. + + Parameters + ---------- + docpos : int + Document position + + Return + ------ + :class:`scipy.sparse.csr_matrix` + Indexed vector, internal type depends on underlying index. + + Examples + -------- + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath + >>> from gensim.similarities import Similarity + >>> import gensim.downloader as api + >>> + >>> # Create index: + >>> corpus = TextCorpus(datapath('testcorpus.txt')) + >>> index = Similarity('temp', corpus, num_features=400) + >>> vector = index.vector_by_id(1) + """ self.close_shard() # no-op if no documents added to index since last query pos = 0 @@ -379,9 +584,30 @@ def vector_by_id(self, docpos): return result def similarity_by_id(self, docpos): - """ - Return similarity of the given document only. `docpos` is the position - of the query document within index. + """Get similarity of the given document only by `docpos`. + + Parameters + ---------- + docpos : int + Document position in index + + Return + ------ + :class:`numpy.ndarray` + Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** + :class:`scipy.sparse.csr_matrix` + for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. 
+ + Examples + -------- + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath + >>> from gensim.similarities import Similarity + >>> + >>> corpus = TextCorpus(datapath('testcorpus.txt')) + >>> index = Similarity('temp', corpus, num_features=400) + >>> similarities = index.similarity_by_id(1) + """ query = self.vector_by_id(docpos) norm, self.norm = self.norm, False @@ -390,9 +616,16 @@ def similarity_by_id(self, docpos): return result def __iter__(self): - """ - For each index document, compute cosine similarity against all other - documents in the index and yield the result. + """For each document in the index, compute cosine similarity against all other documents in the index. + Uses :meth:`~gensim.similarities.docsim.Similarity.iter_chunks`. + + Yields + ------ + :class:`numpy.ndarray` + Similarities of document if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** + :class:`scipy.sparse.csr_matrix` + for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + """ # turn off query normalization (vectors in the index are already normalized, save some CPU) norm, self.norm = self.norm, False @@ -407,12 +640,25 @@ def __iter__(self): self.norm = norm # restore normalization def iter_chunks(self, chunksize=None): - """ - Iteratively yield the index as chunks of documents, each of size <= chunksize. + """Iteratively yield the index as chunks of documents, each of size <= chunksize. + + Parameters + ---------- + chunksize : int, optional + Size of chunk, if None - `self.chunksize` will be used. + + Notes + ----- + The chunk is returned in its raw form. + The size of the chunk may be smaller than requested; it is up to the caller to check the result for real length.
+ + Yields + ------ + :class:`numpy.ndarray` + Similarities of document if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** + :class:`scipy.sparse.csr_matrix` + for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. - The chunk is returned in its raw form (matrix or sparse matrix slice). - The size of the chunk may be smaller than requested; it is up to the caller - to check the result for real length, using `chunk.shape[0]`. """ self.close_shard() @@ -431,19 +677,41 @@ def iter_chunks(self, chunksize=None): yield chunk def check_moved(self): - """ - Update shard locations, in case the server directory has moved on filesystem. - """ + """Update shard locations (for case if the server directory has moved on filesystem).""" dirname = os.path.dirname(self.output_prefix) for shard in self.shards: shard.dirname = dirname def save(self, fname=None, *args, **kwargs): - """ - Save the object via pickling (also see load) under filename specified in - the constructor. + """Save the object via pickling (also see load) under filename specified in the constructor. + + Parameters + ---------- + fname : str, optional + Path for save index, if not provided - will be saved to `self.output_prefix`. + *args : object + Arguments, look at :meth:`gensim.interfaces.SimilarityABC.save`. + **kwargs : object + Keyword arguments, look at :meth:`gensim.interfaces.SimilarityABC.save`. - Calls `close_shard` internally to spill any unfinished shards to disk first. + Notes + ----- + Call :meth:`~gensim.similarities.Similarity.close_shard` internally to spill unfinished shards to disk first. 
+ + Examples + -------- + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath, get_tmpfile + >>> from gensim.similarities import Similarity + >>> + >>> temp_fname = get_tmpfile("index") + >>> output_fname = get_tmpfile("saved_index") + >>> + >>> corpus = TextCorpus(datapath('testcorpus.txt')) + >>> index = Similarity(temp_fname, corpus, num_features=400) + >>> + >>> index.save(output_fname) + >>> loaded_index = index.load(output_fname) """ self.close_shard() @@ -452,11 +720,7 @@ def save(self, fname=None, *args, **kwargs): super(Similarity, self).save(fname, *args, **kwargs) def destroy(self): - """ - Delete all files under self.output_prefix. Object is not usable after calling - this method anymore. Use with care! - - """ + """Delete all files under self.output_prefix, object is not usable after calling this method anymore.""" import glob for fname in glob.glob(self.output_prefix + '*'): logger.info("deleting %s", fname) @@ -464,25 +728,38 @@ def destroy(self): class MatrixSimilarity(interfaces.SimilarityABC): - """ - Compute similarity against a corpus of documents by storing the index matrix - in memory. The similarity measure used is cosine between two vectors. - - Use this if your input corpus contains dense vectors (such as documents in LSI - space) and fits into RAM. + """Compute cosine similarity against a corpus of documents by storing the index matrix in memory. - The matrix is internally stored as a *dense* numpy array. Unless the entire matrix - fits into main memory, use `Similarity` instead. + Unless the entire matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead. - See also `Similarity` and `SparseMatrixSimilarity` in this module. 
+ Examples + -------- + >>> from gensim.test.utils import common_corpus, common_dictionary + >>> from gensim.similarities import MatrixSimilarity + >>> + >>> query = [(1, 2), (5, 4)] + >>> index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary)) + >>> sims = index[query] """ - def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None): """ - `num_features` is the number of features in the corpus (will be determined - automatically by scanning the corpus if not specified). See `Similarity` - class for description of the other parameters. + + Parameters + ---------- + corpus : iterable of list of (int, number) + Corpus in BoW format. + num_best : int, optional + If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. + Otherwise, return a full vector with one float for every document in the index. + dtype : numpy.dtype, optional + Datatype of internal matrix. + num_features : int, optional + Size of the dictionary. + chunksize : int, optional + Size of chunk. + corpus_len : int, optional + Size of `corpus`, if not specified - will scan corpus to determine size. """ if num_features is None: @@ -527,15 +804,22 @@ def __len__(self): return self.index.shape[0] def get_similarities(self, query): - """ - Return similarity of sparse vector `query` to all documents in the corpus, - as a numpy array. + """Get similarity between `query` and current index instance. - If `query` is a collection of documents, return a 2D array of similarities - of each document in `query` to all documents in the corpus (=batch query, - faster than processing each document in turn). + Warnings + -------- + Do not use this function directly, use the :meth:`~gensim.similarities.docsim.MatrixSimilarity.__getitem__` + instead.
- **Do not use this function directly; use the self[query] syntax instead.** + Parameters + ---------- + query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix` + Document or collection of documents. + + Return + ------ + :class:`numpy.ndarray` + Similarity matrix. """ is_corpus, query = utils.is_corpus(query) @@ -564,8 +848,32 @@ def __str__(self): class SoftCosineSimilarity(interfaces.SimilarityABC): - """Document similarity (like MatrixSimilarity) that uses Soft Cosine Measure as a similarity measure.""" + """Compute soft cosine similarity against a corpus of documents by storing the index matrix in memory. + + Examples + -------- + >>> from gensim.test.utils import common_texts + >>> from gensim.corpora import Dictionary + >>> from gensim.models import Word2Vec + >>> from gensim.similarities import SoftCosineSimilarity + >>> + >>> model = Word2Vec(common_texts, size=20, min_count=1) # train word-vectors + >>> dictionary = Dictionary(common_texts) + >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts] + >>> + >>> similarity_matrix = model.wv.similarity_matrix(dictionary) # construct similarity matrix + >>> index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10) + >>> + >>> # Make a query. + >>> query = 'graph trees computer'.split() + >>> # calculate similarity between query and each doc from bow_corpus + >>> sims = index[dictionary.doc2bow(query)] + + Check out `Tutorial Notebook + `_ + for more examples. + """ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256): """ @@ -581,7 +889,6 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256): chunksize: int, optional Size of one corpus chunk. 
- See Also -------- :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix` @@ -589,31 +896,6 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256): :func:`gensim.matutils.softcossim` The Soft Cosine Measure. - Examples - -------- - >>> from gensim.corpora import Dictionary - >>> import gensim.downloader as api - >>> from gensim.models import Word2Vec - >>> from gensim.similarities import SoftCosineSimilarity - >>> from gensim.utils import simple_preprocess - >>> - >>> # Prepare the model - >>> corpus = api.load("text8") - >>> model = Word2Vec(corpus, workers=3, size=100) - >>> dictionary = Dictionary(corpus) - >>> bow_corpus = [dictionary.doc2bow(document) for document in corpus] - >>> similarity_matrix = model.wv.similarity_matrix(dictionary) - >>> index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10) - >>> - >>> # Make a query. - >>> query = 'Yummy! Great view of the Bellagio Fountain show.' - >>> # calculate similarity between query and each doc from bow_corpus - >>> sims = index[dictionary.doc2bow(simple_preprocess(query))] - - See `Tutorial Notebook - `_ - for more examples. - """ self.corpus = corpus self.similarity_matrix = similarity_matrix @@ -632,8 +914,22 @@ def __len__(self): return len(self.corpus) def get_similarities(self, query): - """ - **Do not use this function directly; use the self[query] syntax instead.** + """Get similarity between `query` and current index instance. + + Warnings + -------- + Do not use this function directly; use the self[query] syntax instead. + + Parameters + ---------- + query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix` + Document or collection of documents. + + Return + ------ + :class:`numpy.ndarray` + Similarity matrix. + """ if isinstance(query, numpy.ndarray): # Convert document indexes to actual documents. 
@@ -666,37 +962,53 @@ def __str__(self): class WmdSimilarity(interfaces.SimilarityABC): - """ - Document similarity (like MatrixSimilarity) that uses the negative of WMD - as a similarity measure. See gensim.models.word2vec.wmdistance for more - information. + """Compute negative WMD similarity against a corpus of documents by storing the index matrix in memory. + - When a `num_best` value is provided, only the most similar documents are - retrieved. + See :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` for more information. + Also, tutorial `notebook + `_ for more examples. When using this code, please consider citing the following papers: + `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching" + `_, `Ofir Pele and Michael Werman, "Fast and robust earth + mover's distances" `_, `"Matt Kusner et al. "From Word + Embeddings To Document Distances" `_. + + Example + ------- + >>> from gensim.test.utils import common_texts + >>> from gensim.corpora import Dictionary + >>> from gensim.models import Word2Vec + >>> from gensim.similarities import WmdSimilarity + >>> + >>> model = Word2Vec(common_texts, size=20, min_count=1) # train word-vectors + >>> dictionary = Dictionary(common_texts) + >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts] + >>> + >>> index = WmdSimilarity(bow_corpus, model) + >>> # Make query. + >>> query = 'trees' + >>> sims = index[query] - .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching". - .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances". - .. Matt Kusner et al. "From Word Embeddings To Document Distances". - - Example: - # See Tutorial Notebook for more examples - https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb - >>> # Given a document collection "corpus", train word2vec model. 
- >>> model = word2vec(corpus) - >>> instance = WmdSimilarity(corpus, model, num_best=10) - >>> # Make query. - >>> query = 'Very good, you should seat outdoor.' - >>> sims = instance[query] """ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=True, chunksize=256): """ - corpus: List of lists of strings, as in gensim.models.word2vec. - w2v_model: A trained word2vec model. - num_best: Number of results to retrieve. - normalize_w2v_and_replace: Whether or not to normalize the word2vec vectors to length 1. + + Parameters + ---------- + corpus: iterable of list of (int, float) + A list of documents in the BoW format. + w2v_model: :class:`~gensim.models.word2vec.Word2VecTrainables` + A trained word2vec model. + num_best: int, optional + Number of results to retrieve. + normalize_w2v_and_replace: bool, optional + Whether or not to normalize the word2vec vectors to length 1. + chunksize : int, optional + Size of chunk. + """ self.corpus = corpus self.w2v_model = w2v_model @@ -714,11 +1026,26 @@ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=T w2v_model.init_sims(replace=True) def __len__(self): + """Get size of corpus.""" return len(self.corpus) def get_similarities(self, query): - """ - **Do not use this function directly; use the self[query] syntax instead.** + """Get similarity between `query` and current index instance. + + Warnings + -------- + Do not use this function directly; use the self[query] syntax instead. + + Parameters + ---------- + query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix` + Document or collection of documents. + + Return + ------ + :class:`numpy.ndarray` + Similarity matrix. + """ if isinstance(query, numpy.ndarray): # Convert document indexes to actual documents. 
@@ -751,25 +1078,56 @@ def __str__(self): class SparseMatrixSimilarity(interfaces.SimilarityABC): - """ - Compute similarity against a corpus of documents by storing the sparse index - matrix in memory. The similarity measure used is cosine between two vectors. + """Compute cosine similarity against a corpus of documents by storing the index matrix in memory. - Use this if your input corpus contains sparse vectors (such as documents in - bag-of-words format) and fits into RAM. + Notes + ----- + Use this if your input corpus contains sparse vectors (such as documents in bag-of-words format) and fits into RAM. - The matrix is internally stored as a `scipy.sparse.csr` matrix. Unless the entire - matrix fits into main memory, use `Similarity` instead. + The matrix is internally stored as a :class:`scipy.sparse.csr_matrix` matrix. Unless the entire + matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead. Takes an optional `maintain_sparsity` argument, setting this to True causes `get_similarities` to return a sparse matrix instead of a dense representation if possible. - See also `Similarity` and `MatrixSimilarity` in this module. + See also + -------- + :class:`~gensim.similarities.docsim.Similarity` + Index similarity (wrapper for other inheritors of :class:`~gensim.interfaces.SimilarityABC`). + :class:`~gensim.similarities.docsim.MatrixSimilarity` + Index similarity (dense with cosine distance). + """ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): + """ + Parameters + ---------- + corpus: iterable of list of (int, float) + A list of documents in the BoW format. + num_features : int, optional + Size of the dictionary. + num_terms : int, optional + Number of terms, **must be specified**. + num_docs : int, optional + Number of documents in `corpus`. + num_nnz : int, optional + Number of non-zero terms. 
+ num_best : int, optional + If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. + Otherwise, return a full vector with one float for every document in the index. + chunksize : int, optional + Size of chunk. + dtype : numpy.dtype, optional + Data type of internal matrix. + maintain_sparsity : bool, optional + if True - will return sparse arr from + :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`. + + """ + self.num_best = num_best self.normalize = True self.chunksize = chunksize @@ -806,18 +1164,27 @@ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num logger.info("created %r", self.index) def __len__(self): + """Get size of index.""" return self.index.shape[0] def get_similarities(self, query): - """ - Return similarity of sparse vector `query` to all documents in the corpus, - as a numpy array. + """Get similarity between `query` and current index instance. - If `query` is a collection of documents, return a 2D array of similarities - of each document in `query` to all documents in the corpus (=batch query, - faster than processing each document in turn). + Warnings + -------- + Do not use this function directly; use the self[query] syntax instead. - **Do not use this function directly; use the self[query] syntax instead.** + Parameters + ---------- + query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix` + Document or collection of documents. + + Return + ------ + :class:`numpy.ndarray` + Similarity matrix (if maintain_sparsity=False) **OR** + :class:`scipy.sparse.csc` + otherwise """ is_corpus, query = utils.is_corpus(query)