Source code for gensim.corpora.malletcorpus

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Corpus in Mallet format of List-Of-Words.
"""

from __future__ import with_statement

import logging

from gensim import utils
from gensim.corpora import LowCorpus


# Module-level logger, registered under this module's dotted path so that its
# messages can be filtered together with the rest of the 'gensim' hierarchy.
logger = logging.getLogger('gensim.corpora.malletcorpus')


class MalletCorpus(LowCorpus):
    """
    Corpus stored in the Mallet one-document-per-line text format.

    Quoting http://mallet.cs.umass.edu/import.php:

      One file, one instance per line
      Assume the data is in the following format:

      [URL] [language] [text of the page...]

    Or, more generally,

      [document #1 id] [label] [text of the document...]
      [document #2 id] [label] [text of the document...]
      ...
      [document #N id] [label] [text of the document...]

    Note that language/label is *not* considered in Gensim: it never
    influences the bag-of-words vector and is only handed back (together with
    the document id) when the corpus was created with `metadata=True`.

    """

    def __init__(self, fname, id2word=None, metadata=False):
        """
        Initialize the corpus from the Mallet-format file `fname`.

        `id2word` is passed through unchanged to `LowCorpus.__init__`.
        If `metadata` is true, `line2doc` (and therefore iteration and
        `docbyoffset`) returns `(doc, (docid, doclang))` tuples instead of
        plain documents.

        """
        # Assign the flag before delegating: `line2doc` reads it, and the
        # parent initializer presumably may already parse lines of the file
        # (NOTE(review): confirm against LowCorpus.__init__).
        self.metadata = metadata
        LowCorpus.__init__(self, fname, id2word)

    def _calculate_num_docs(self):
        """
        Return the number of lines in the corpus file.

        Every line counts as one document; nothing is parsed here.

        """
        with utils.smart_open(self.fname) as fin:
            # Generator expression: count lines without materializing a list
            # that is as long as the corpus.
            return sum(1 for _ in fin)

    def __iter__(self):
        """
        Iterate over the corpus at the given filename.

        Yields a bag-of-words, a.k.a list of tuples of (word id, word count),
        based on the given id2word dictionary.

        """
        with utils.smart_open(self.fname) as f:
            for line in f:
                yield self.line2doc(line)

    def line2doc(self, line):
        """
        Convert one line of a Mallet file into a document.

        The line is split on single spaces (empty tokens are dropped). The
        first token is the document id, the second the language/label, and
        all remaining tokens are the document text, which is converted by
        `LowCorpus.line2doc`.

        Returns the converted document, or the tuple
        `(document, (docid, doclang))` if `self.metadata` is true.

        Raises `IndexError` if the line holds fewer than two tokens.

        """
        tokens = [token for token in utils.to_unicode(line).strip().split(' ') if token]
        docid, doclang, words = tokens[0], tokens[1], tokens[2:]

        # The parent class expects a plain space-separated string of words.
        doc = super(MalletCorpus, self).line2doc(' '.join(words))

        if self.metadata:
            return doc, (docid, doclang)
        return doc

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the Mallet format.

        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the
        string '__unknown__'. If the language needs to be saved,
        post-processing will be required.

        If `metadata` is true, every item of `corpus` must be a pair
        `(doc, (doc_id, doc_lang))`; the id and language stored in the pair
        are written instead of the generated defaults.

        Returns the list of file positions (as reported by `fout.tell()`) at
        which the individual documents start.

        This function is automatically called by `MalletCorpus.serialize`;
        don't call it directly, call `serialize` instead.

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0  # number of non-integer weights cut down to an integer
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    # Metadata-carrying items override the enumerated id.
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    # The format has no weights: a word is repeated as many
                    # times as the integer part of its weight.
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))

                # Remember where this document starts, before writing it.
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning(
                "Mallet format can only save vectors with "
                "integer elements; %i float entries were truncated to integer value",
                truncated)

        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.

        The line starting at `offset` is read and converted with `line2doc`,
        so the result has the same shape as the items yielded by iteration.

        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())
# endclass MalletCorpus