Source code for gensim.corpora.mmcorpus

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Corpus in the Matrix Market format.
"""


import logging

from gensim import interfaces, matutils
from gensim.corpora import IndexedCorpus


logger = logging.getLogger('gensim.corpora.mmcorpus')


class MmCorpus(matutils.MmReader, IndexedCorpus):
    """
    Corpus in the Matrix Market format.

    Combines `matutils.MmReader` and `IndexedCorpus`. Iterating over an
    instance yields the documents only, with the document ids produced by
    the parent iterator stripped off.
    """

    def __init__(self, fname):
        """
        Initialize both base classes with the same file name `fname`.
        """
        # avoid calling super(), too confusing
        # The two base initializers are invoked explicitly, in this order.
        IndexedCorpus.__init__(self, fname)
        matutils.MmReader.__init__(self, fname)

    def __iter__(self):
        """
        Interpret a matrix in Matrix Market format as a streamed gensim corpus
        (yielding one document at a time).
        """
        # The parent iterator produces (doc_id, doc) pairs.
        for doc_id, doc in super(MmCorpus, self).__iter__():
            yield doc  # get rid of doc id, return the sparse vector only

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
        """
        Save a corpus in the Matrix Market format to disk.

        This function is automatically called by `MmCorpus.serialize`; don't
        call it directly, call `serialize` instead.

        `id2word` is used only for its length, which is passed on as the
        number of terms; when it is None, `num_terms=None` is passed instead.
        `progress_cnt` and `metadata` are forwarded unchanged to
        `matutils.MmWriter.write_corpus`, which is always called with
        `index=True`.

        Returns whatever `matutils.MmWriter.write_corpus` returns.
        """
        # Pass `fname` as a logging argument so that formatting is deferred
        # until the record is actually emitted (and a tuple-valued `fname`
        # cannot break the `%` operator).
        logger.info("storing corpus in Matrix Market format to %s", fname)
        num_terms = len(id2word) if id2word is not None else None
        return matutils.MmWriter.write_corpus(
            fname,
            corpus,
            num_terms=num_terms,
            index=True,
            progress_cnt=progress_cnt,
            metadata=metadata,
        )
# endclass MmCorpus