Source code for gensim.models.logentropy_model

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import logging
import math
from gensim import interfaces, matutils, utils


logger = logging.getLogger('gensim.models.logentropy_model')


class LogEntropyModel(interfaces.TransformationABC):
    """
    Objects of this class realize the transformation of a word-document
    co-occurrence matrix (integers) into a locally/globally weighted matrix
    (positive floats).

    This is done by log entropy normalization, optionally normalizing the
    resulting documents to unit length. The following formulas explain how
    to compute the log entropy weight for term `i` in document `j`::

        local_weight_{i,j} = log(frequency_{i,j} + 1)

        P_{i,j} = frequency_{i,j} / sum_j frequency_{i,j}

                               sum_j P_{i,j} * log(P_{i,j})
        global_weight_i = 1 + -----------------------------
                               log(number_of_documents + 1)

        final_weight_{i,j} = local_weight_{i,j} * global_weight_i

    The main methods are:

    1. constructor, which calculates the global weighting for all terms in
       a corpus.
    2. the [] method, which transforms a simple count representation into
       the log entropy normalized space.

    >>> log_ent = LogEntropyModel(corpus)
    >>> print(log_ent[some_doc])
    >>> log_ent.save('/tmp/foo.log_ent_model')

    Model persistence is achieved via its load/save methods.
    """
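    # A small worked example of the formulas above (illustrative only; natural
    # logarithms, as used by `math.log` below): a term that occurs once in each
    # of two documents gets
    #
    #     P_{i,0} = P_{i,1} = 1/2
    #     global_weight_i    = 1 + (2 * 0.5 * log(0.5)) / log(2 + 1)  ~=  0.369
    #     local_weight_{i,j} = log(1 + 1)                             ~=  0.693
    #     final_weight_{i,j} ~= 0.693 * 0.369                         ~=  0.256
    #
    # before any optional unit-length normalization.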

    def __init__(self, corpus, id2word=None, normalize=True):
        """
        `normalize` dictates whether the resulting vectors will be set to
        unit length.
        """
        self.normalize = normalize
        self.n_docs = 0
        self.n_words = 0
        self.entr = {}
        if corpus is not None:
            self.initialize(corpus)

    def __str__(self):
        return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words)

    def initialize(self, corpus):
        """
        Initialize internal statistics based on a training corpus. Called
        automatically from the constructor.
        """
        logger.info("calculating counts")
        glob_freq = {}
        glob_num_words, doc_no = 0, -1
        for doc_no, bow in enumerate(corpus):
            if doc_no % 10000 == 0:
                logger.info("PROGRESS: processing document #%i" % doc_no)
            glob_num_words += len(bow)
            for term_id, term_count in bow:
                glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count

        # keep some stats about the training corpus
        self.n_docs = doc_no + 1
        self.n_words = glob_num_words

        # and finally compute the global weights
        logger.info("calculating global log entropy weights for %i "
                    "documents and %i features (%i matrix non-zeros)"
                    % (self.n_docs, len(glob_freq), self.n_words))
        logger.debug('iterating over corpus')
        for doc_no2, bow in enumerate(corpus):
            for key, freq in bow:
                p = (float(freq) / glob_freq[key]) * math.log(float(freq) / glob_freq[key])
                self.entr[key] = self.entr.get(key, 0.0) + p
        if doc_no2 != doc_no:
            raise ValueError("LogEntropyModel doesn't support generators as training data")

        logger.debug('iterating over keys')
        for key in self.entr:
            self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1)
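    # Note on the two passes above: `initialize` iterates over `corpus` twice,
    # once for the counts and once for the entropy sums, so the training corpus
    # must be re-iterable. A minimal sketch with a hypothetical toy
    # bag-of-words corpus:
    #
    #     corpus = [[(0, 1), (1, 2)], [(1, 1)]]   # list of documents: fine
    #     corpus = iter(corpus)                   # one-shot iterator: not supported,
    #                                             # hence the ValueError above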

    def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in bow if term_id in self.entr]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
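
# A minimal usage sketch (illustrative; the toy documents below are
# placeholders, not part of this module):
#
#     from gensim import corpora
#     from gensim.models.logentropy_model import LogEntropyModel
#
#     documents = [['human', 'interface', 'computer'],
#                  ['survey', 'user', 'computer', 'system'],
#                  ['eps', 'user', 'interface', 'system']]
#     dictionary = corpora.Dictionary(documents)
#     corpus = [dictionary.doc2bow(doc) for doc in documents]
#
#     log_ent = LogEntropyModel(corpus)        # compute global term weights
#     print(log_ent[corpus[0]])                # transform one bag-of-words doc
#     log_ent.save('/tmp/foo.log_ent_model')   # persist; LogEntropyModel.load() restores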