# Source code for gensim.models.rpmodel

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


import logging
import itertools

import numpy
import scipy

from gensim import interfaces, matutils, utils


logger = logging.getLogger('gensim.models.rpmodel')


class RpModel(interfaces.TransformationABC):
    """
    Objects of this class allow building and maintaining a model for Random Projections
    (also known as Random Indexing). For theoretical background on RP, see:

      Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis."

    The main methods are:

    1. constructor, which creates the random projection matrix
    2. the [] method, which transforms a simple count representation into the
       RP space (a sparse vector with at most `num_topics` entries).

    >>> rp = RpModel(corpus)
    >>> print(rp[some_doc])
    >>> rp.save('/tmp/foo.rp_model')

    Model persistency is achieved via its load/save methods.
    """

    def __init__(self, corpus, id2word=None, num_topics=300):
        """
        `corpus` is used to determine the vocabulary size when `id2word` is not
        given. If `corpus` is None, the projection matrix is not created here;
        call `initialize()` later, before transforming any documents.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing. If not set, it will be determined from the corpus.

        `num_topics` is the dimensionality of the projected space, i.e. the number
        of rows of the random projection matrix.
        """
        self.id2word = id2word
        self.num_topics = num_topics
        if corpus is not None:
            self.initialize(corpus)

    def __str__(self):
        # `num_terms` only exists once `initialize()` has run; do not crash when
        # printing a model that was constructed with corpus=None.
        num_terms = getattr(self, 'num_terms', None)
        return "RpModel(num_terms=%s, num_topics=%s)" % (num_terms, self.num_topics)

    def initialize(self, corpus):
        """
        Initialize the random projection matrix.

        Sets `self.num_terms` (the vocabulary size) and `self.projection`, a
        FORTRAN-ordered float32 matrix of shape `(num_topics, num_terms)` whose
        entries are randomly +1 or -1.
        """
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            # Vocabulary size is the largest word id plus one. The [-1] sentinel makes
            # an empty mapping yield zero terms. `keys()` is wrapped in list() because
            # in Python 3 it returns a view, which cannot be concatenated to a list.
            self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

        shape = self.num_topics, self.num_terms
        logger.info("constructing %s random matrix", str(shape))
        # Now construct the projection matrix itself.
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape)  # convert from 0/1 to +1/-1
        # convert from integers to floats, for faster multiplications
        self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32)

    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.

        For a single bag-of-words document, the result is a list of
        `(topic_id, value)` 2-tuples; entries that are non-finite or
        (numerically) zero are left out. For a corpus, the result is whatever
        `self._apply()` returns for it.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # dense (d, 1) column vector, scaled by 1/sqrt(k)
        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = numpy.dot(self.projection, vec).ravel()  # (k, d) * (d, 1) = (k, 1), flattened to (k,)

        # Drop NaN/inf and near-zero entries in one vectorized pass. `isclose` uses
        # the same default tolerances as `allclose`, without a Python call per topic.
        keep = numpy.isfinite(topic_dist) & ~numpy.isclose(topic_dist, 0.0)
        return [(int(topicid), float(topic_dist[topicid])) for topicid in numpy.flatnonzero(keep)]

    def __setstate__(self, state):
        """
        This is a hack to work around a bug in numpy, where a FORTRAN-order array
        unpickled from disk segfaults on using it.
        """
        self.__dict__ = state
        # A model pickled before `initialize()` was called has no projection at all.
        projection = getattr(self, 'projection', None)
        if projection is not None:
            self.projection = projection.copy('F')  # simply making a fresh copy fixes the broken array
#endclass RpModel