Source code for gensim.corpora.bleicorpus

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Blei's LDA-C format.
"""

from __future__ import with_statement

from os import path
import logging

from gensim import interfaces, utils
from gensim.corpora import IndexedCorpus
from six.moves import xrange


logger = logging.getLogger('gensim.corpora.bleicorpus')


[docs]class BleiCorpus(IndexedCorpus): """ Corpus in Blei's LDA-C format. The corpus is represented as two files: one describing the documents, and another describing the mapping between words and their ids. Each document is one line:: N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN The vocabulary is a file with words, one word per line; word at line K has an implicit ``id=K``. """
[docs] def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. `fname_vocab` is the file with vocabulary; if not specified, it defaults to `fname.vocab`. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) if fname_vocab is None: fname_base, _ = path.splitext(fname) fname_dir = path.dirname(fname) for fname_vocab in [ utils.smart_extension(fname, '.vocab'), utils.smart_extension(fname, '/vocab.txt'), utils.smart_extension(fname_base, '.vocab'), utils.smart_extension(fname_dir, '/vocab.txt'), ]: if path.exists(fname_vocab): break else: raise IOError('BleiCorpus: could not find vocabulary file') self.fname = fname with utils.smart_open(fname_vocab) as fin: words = [utils.to_unicode(word).rstrip() for word in fin] self.id2word = dict(enumerate(words))
def __iter__(self): """ Iterate over the corpus, returning one sparse vector at a time. """ lineno = -1 with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): yield self.line2doc(line) self.length = lineno + 1
[docs] def line2doc(self, line): parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) doc = [part.rsplit(':', 1) for part in parts[1:]] doc = [(int(p1), float(p2)) for p1, p2 in doc] return doc
@staticmethod
[docs] def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. This function is automatically called by `BleiCorpus.serialize`; don't call it directly, call `serialize` instead. """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) num_terms = len(id2word) else: num_terms = 1 + max([-1] + id2word.keys()) logger.info("storing corpus in Blei's LDA-C format into %s" % fname) with utils.smart_open(fname, 'wb') as fout: offsets = [] for doc in corpus: doc = list(doc) offsets.append(fout.tell()) parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7] fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts)))) # write out vocabulary, in a format compatible with Blei's topics.py script fname_vocab = utils.smart_extension(fname, '.vocab') logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) with utils.smart_open(fname_vocab, 'wb') as fout: for featureid in xrange(num_terms): fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) return offsets
[docs] def docbyoffset(self, offset): """ Return the document stored at file position `offset`. """ with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())
# endclass BleiCorpus