Source code for gensim.corpora.malletcorpus

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Corpus in Mallet format of List-Of-Words.
"""

from __future__ import with_statement

import logging

from gensim import utils
from gensim.corpora import LowCorpus


logger = logging.getLogger('gensim.corpora.malletcorpus')


[docs]class MalletCorpus(LowCorpus):
    """
    Quoting http://mallet.cs.umass.edu/import.php:

        One file, one instance per line
        Assume the data is in the following format:

        [URL] [language] [text of the page...]

    Or, more generally,
        [document #1 id] [label] [text of the document...]
        [document #2 id] [label] [text of the document...]
        ...
        [document #N id] [label] [text of the document...]

    Note that language/label is *not* considered in Gensim.

    """
[docs]    def __init__(self, fname, id2word=None, metadata=False):
        self.metadata = metadata
        LowCorpus.__init__(self, fname, id2word)

    def _calculate_num_docs(self):
        with utils.smart_open(self.fname) as fin:
            result = sum([1 for x in fin])
        return result

    def __iter__(self):
        """
        Iterate over the corpus at the given filename.

        Yields a bag-of-words, a.k.a list of tuples of (word id, word count), based on the given id2word dictionary.
        """
        with utils.smart_open(self.fname) as f:
            for line in f:
                yield self.line2doc(line)

[docs]    def line2doc(self, line):
        l = [word for word in utils.to_unicode(line).strip().split(' ') if word]
        docid, doclang, words = l[0], l[1], l[2:]

        doc = super(MalletCorpus, self).line2doc(' '.join(words))

        if self.metadata:
            return doc, (docid, doclang)
        else:
            return doc

    @staticmethod
[docs]    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the Mallet format.

        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        This function is automatically called by `MalletCorpus.serialize`; don't
        call it directly, call `serialize` instead.

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s" % fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning("Mallet format can only save vectors with "
                            "integer elements; %i float entries were truncated to integer value" %
                            truncated)

        return offsets

[docs]    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass MalletCorpus