Skip to content

Commit

Permalink
Refactor documentation for gensim.similarities.docsim and `MmCorpus…
Browse files Browse the repository at this point in the history
…-related`. (#1910)

* Added example for text_corpus.py

* Fix for example

* Updated docstrings for docsim.py

* Beta_docstrings for docsim.py

* Gamma_docstrings for docsim.py

* Massive package of different files.

* fix build (PEP8, rst)

* retranslate _mmreader.pyx with cython==0.27.3

* fix matutils

* fix textcorpus

* fix mmcorpus

* fix mmreader[2]

* fix docsim[1]

* fix docsim[2]

* fix docsim[3]

* fix docsim[4]

* fix docsim[5]
  • Loading branch information
CLearERR authored and menshikh-iv committed Feb 23, 2018
1 parent c3f08c1 commit 5355c06
Show file tree
Hide file tree
Showing 8 changed files with 1,211 additions and 809 deletions.
1 change: 1 addition & 0 deletions docs/src/apiref.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Modules:
corpora/lowcorpus
corpora/malletcorpus
corpora/mmcorpus
corpora/_mmreader
corpora/sharded_corpus
corpora/svmlightcorpus
corpora/textcorpus
Expand Down
9 changes: 9 additions & 0 deletions docs/src/corpora/_mmreader.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
:mod:`corpora._mmreader` -- Reader for corpus in the Matrix Market format.
==========================================================================

.. automodule:: gensim.corpora._mmreader
:synopsis: Reader for corpus in the Matrix Market format.
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
971 changes: 494 additions & 477 deletions gensim/corpora/_mmreader.c

Large diffs are not rendered by default.

73 changes: 27 additions & 46 deletions gensim/corpora/_mmreader.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Copyright (C) 2018 Radim Rehurek <radimrehurek@seznam.cz>
"""
Reader for corpus in the Matrix Market format.
"""
# cython: embedsignature=True
"""Reader for corpus in the Matrix Market format."""
from __future__ import with_statement

from gensim import utils
Expand All @@ -19,20 +17,19 @@ logger = logging.getLogger(__name__)


cdef class MmReader(object):
"""
matrix market file reader
"""Matrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`.
Wrap a term-document matrix on disk (in matrix-market format), and present it
as an object which supports iteration over the rows (~documents).
Attributes
----------
num_docs : int
number of documents in market matrix file
Number of documents in matrix market file.
num_terms : int
number of terms
Number of terms.
num_nnz : int
number of non-zero terms
Number of non-zero terms.
Notes
----------
Expand All @@ -47,20 +44,15 @@ cdef class MmReader(object):

def __init__(self, input, transposed=True):
"""
MmReader(input, transposed=True):
Create matrix reader
Parameters
----------
input : string or file-like
string (file path) or a file-like object that supports
`seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are
not closed automatically.
input : {str, file-like object}
Path to input file in MM format or a file-like object that supports `seek()`
(e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`).
transposed : bool
if True, expects lines to represent doc_id, term_id, value
else, expects term_id, doc_id, value
transposed : bool, optional
If True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value.
"""
logger.info("initializing cython corpus reader from %s", input)
Expand Down Expand Up @@ -91,22 +83,20 @@ cdef class MmReader(object):
)

def __len__(self):
"""Get size of corpus (number of documents)."""
return self.num_docs

def __str__(self):
return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
(self.num_docs, self.num_terms, self.num_nnz))

def skip_headers(self, input_file):
"""
skip_headers(self, input_file)
Skip file headers that appear before the first document.
"""Skip file headers that appear before the first document.
Parameters
----------
input_file : iterable
consumes any lines from start of `input_file` that begin with a %
input_file : iterable of str
Iterable taken from file in MM format.
"""
for line in input_file:
Expand All @@ -115,23 +105,18 @@ cdef class MmReader(object):
break

def __iter__(self):
"""
__iter__()
Iterate through vectors from underlying matrix
"""Iterate through corpus.
Yields
Notes
------
int, list of (termid, val)
document id and "vector" of terms for next document in matrix
vector of terms is represented as a list of (termid, val) tuples
Note that the total number of vectors returned is always equal to the number of rows specified
in the header, empty documents are inserted and yielded where appropriate, even if they are not explicitly
stored in the Matrix Market file.
Notes
Yields
------
Note that the total number of vectors returned is always equal to the
number of rows specified in the header; empty documents are inserted and
yielded where appropriate, even if they are not explicitly stored in the
Matrix Market file.
(int, list of (int, number))
Document id and document in BoW format.
"""
cdef int docid, termid, previd
Expand Down Expand Up @@ -180,21 +165,17 @@ cdef class MmReader(object):
yield previd, []

def docbyoffset(self, offset):
"""
docbyoffset(offset)
Return document at file offset `offset` (in bytes)
"""Get document at file offset `offset` (in bytes).
Parameters
----------
offset : int
offset, in bytes, of desired document
Offset, in bytes, of desired document.
Returns
-------
list of (termid, val)
"vector" of terms for document at offset
vector of terms is represented as a list of (termid, val) tuples
list of (int, number)
Document in BoW format.
"""
# empty documents are not stored explicitly in MM format, so the index marks
Expand Down
81 changes: 53 additions & 28 deletions gensim/corpora/mmcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Corpus in the Matrix Market format.
"""

"""Corpus in the Matrix Market format."""

import logging

Expand All @@ -20,39 +17,45 @@


class MmCorpus(matutils.MmReader, IndexedCorpus):
"""
Corpus in matrix market format
"""Corpus in matrix market format.
Wrap a term-document matrix on disk (in matrix-market format), and present it
as an object which supports iteration over the rows (~documents).
Attributes
----------
num_docs : int
number of documents in market matrix file
Number of documents in matrix market file.
num_terms : int
number of terms
Number of terms.
num_nnz : int
number of non-zero terms
Number of non-zero terms.
Notes
----------
Note that the file is read into memory one document at a time, not the whole
matrix at once (unlike scipy.io.mmread). This allows us to process corpora
which are larger than the available RAM.
Note that the file is read into memory one document at a time, not the whole matrix at once
(unlike :func:`~scipy.io.mmread`). This allows us to process corpora which are larger than the available RAM.
Example
--------
>>> from gensim.corpora.mmcorpus import MmCorpus
>>> from gensim.test.utils import datapath
>>> import gensim.downloader as api
>>>
>>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
>>> for document in corpus:
... pass
"""

def __init__(self, fname):
"""
Read corpus in matrix market format
Parameters
----------
fname : string or file-like
string (file path) or a file-like object that supports
`seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are
not closed automatically.
fname : {str, file-like object}
Path to file in MM format or a file-like object that supports `seek()`
(e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`).
"""

Expand All @@ -61,34 +64,56 @@ def __init__(self, fname):
matutils.MmReader.__init__(self, fname)

def __iter__(self):
"""
Iterate through vectors from underlying matrix
"""Iterate through all documents in the corpus.
Yields
------
list of (termid, val)
"vector" of terms for next document in matrix
vector of terms is represented as a list of (termid, val) tuples
list of (int, number)
Document in BoW format.
Notes
------
Note that the total number of vectors returned is always equal to the
number of rows specified in the header; empty documents are inserted and
yielded where appropriate, even if they are not explicitly stored in the
The total number of vectors returned is always equal to the number of rows specified in the header.
Empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the
Matrix Market file.
"""

for doc_id, doc in super(MmCorpus, self).__iter__():
yield doc # get rid of doc id, return the sparse vector only

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
"""
Save a corpus in the Matrix Market format to disk.
"""Save a corpus in the Matrix Market format to disk.
Parameters
----------
fname : str
Path to file.
corpus : iterable of list of (int, number)
Corpus in BoW format.
id2word : dict of (int, str), optional
WordId -> Word.
progress_cnt : int, optional
Progress counter.
metadata : bool, optional
If True, writes out additional metadata.
Notes
-----
This function is automatically called by `MmCorpus.serialize`; don't
call it directly, call `serialize` instead.
Example
-------
>>> from gensim.corpora.mmcorpus import MmCorpus
>>> from gensim.test.utils import datapath
>>> import gensim.downloader as api
>>>
>>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
>>>
>>> MmCorpus.save_corpus("random", corpus) # Do not call this directly, use `serialize` instead.
[97, 121, 169, 201, 225, 249, 258, 276, 303]
"""
logger.info("storing corpus in Matrix Market format to %s", fname)
num_terms = len(id2word) if id2word is not None else None
Expand Down
21 changes: 17 additions & 4 deletions gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,13 +216,26 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
Examples
--------
>>> #TODO Example with inheritance
>>> from gensim.corpora.textcorpus import TextCorpus
>>> from gensim.test.utils import datapath
>>> from gensim import utils
>>>
>>> corpus = TextCorpus(datapath('head500.noblanks.cor.bz2'))
>>> for bow in corpus:
... pass
>>>
>>> class CorpusMiislita(TextCorpus):
... stopwords = set('for a of the and to in on'.split())
...
... def get_texts(self):
... for doc in self.getstream():
... yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]
...
... def __len__(self):
... self.length = sum(1 for _ in self.get_texts())
... return self.length
>>>
>>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
>>> len(corpus)
250
>>> document = next(iter(corpus.get_texts()))
"""
self.input = input
Expand Down
Loading

0 comments on commit 5355c06

Please sign in to comment.