Source code for gensim.models.tfidfmodel

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


import logging
import math

from gensim import interfaces, matutils, utils
from six import iteritems


logger = logging.getLogger(__name__)


def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    """
    Compute default inverse-document-frequency for a term with document frequency `doc_freq`::

      idf = add + log(totaldocs / doc_freq)
    """
    return add + math.log(1.0 * totaldocs / docfreq, log_base)
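
# A quick sanity check of the default weighting (log base 2, add=0.0): a term
# present in 25 of 100 documents gets idf = log2(100 / 25) = 2.0:
#
#   >>> df2idf(docfreq=25, totaldocs=100)
#   2.0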


def precompute_idfs(wglobal, dfs, total_docs):
    """Precompute the inverse document frequency mapping for all terms."""
    # not strictly necessary: the idfs could be computed on the fly in
    # TfidfModel.__getitem__. precomputing them here just speeds things up a little.
    return dict((termid, wglobal(df, total_docs))
                for termid, df in iteritems(dfs))
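
# For illustration, with the default `wglobal=df2idf`: a toy mapping of
# term id -> document frequency over 100 documents yields
#
#   >>> precompute_idfs(df2idf, {0: 25, 1: 50}, 100)
#   {0: 2.0, 1: 1.0}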


class TfidfModel(interfaces.TransformationABC):
    """
    Objects of this class realize the transformation from a word-document
    co-occurrence matrix (integers) into a locally/globally weighted TF-IDF
    matrix (positive floats).

    The main methods are:

    1. constructor, which calculates inverse document counts for all terms
       in the training corpus.
    2. the [] method, which transforms a simple count representation into
       the TfIdf space.

    >>> tfidf = TfidfModel(corpus)
    >>> print(tfidf[some_doc])
    >>> tfidf.save('/tmp/foo.tfidf_model')

    Model persistency is achieved via its load/save methods.
    """
    def __init__(self, corpus=None, id2word=None, dictionary=None,
                 wlocal=utils.identity, wglobal=df2idf, normalize=True):
        """
        Compute tf-idf by multiplying a local component (term frequency) with a
        global component (inverse document frequency), and normalizing the
        resulting documents to unit length. Formula for unnormalized weight of
        term `i` in document `j` in a corpus of D documents::

          weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})

        or, more generally::

          weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D)

        so you can plug in your own custom `wlocal` and `wglobal` functions.

        Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...)
        and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the
        formula above.

        `normalize` dictates how the final transformed vectors will be normalized.
        `normalize=True` means set to unit length (default); `False` means don't
        normalize. You can also set `normalize` to your own function that accepts
        and returns a sparse vector.

        If `dictionary` is specified, it must be a `corpora.Dictionary` object
        and it will be used to directly construct the inverse document frequency
        mapping (then `corpus`, if specified, is ignored).
        """
        self.normalize = normalize
        self.id2word = id2word
        self.wlocal, self.wglobal = wlocal, wglobal
        self.num_docs, self.num_nnz, self.idfs = None, None, None
        if dictionary is not None:
            # user supplied a Dictionary object, which already contains all the
            # statistics we need to construct the IDF mapping. we can skip the
            # step that goes through the corpus (= an optimization).
            if corpus is not None:
                logger.warning("constructor received both corpus and explicit "
                               "inverse document frequencies; ignoring the corpus")
            self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
            self.dfs = dictionary.dfs.copy()
            self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        elif corpus is not None:
            self.initialize(corpus)
        else:
            # NOTE: everything is left uninitialized; presumably the model will
            # be initialized in some other way
            pass
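    # Example of the two initialization paths (illustrative only; `Dictionary`
    # and `doc2bow` come from gensim.corpora, not this module):
    #
    #   >>> from gensim.corpora import Dictionary
    #   >>> texts = [["human", "interface"], ["human", "computer"]]
    #   >>> dictionary = Dictionary(texts)
    #   >>> corpus = [dictionary.doc2bow(t) for t in texts]
    #   >>> model = TfidfModel(dictionary=dictionary)  # uses dictionary stats, skips corpus pass
    #   >>> model2 = TfidfModel(corpus)                # gathers document frequencies from the corpus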
    def __str__(self):
        return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)
    def initialize(self, corpus):
        """
        Compute inverse document weights, which will be used to modify term
        frequencies for documents.
        """
        logger.info("collecting document frequencies")
        dfs = {}
        numnnz, docno = 0, -1
        for docno, bow in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("PROGRESS: processing document #%i" % docno)
            numnnz += len(bow)
            for termid, _ in bow:
                dfs[termid] = dfs.get(termid, 0) + 1

        # keep some stats about the training corpus
        self.num_docs = docno + 1
        self.num_nnz = numnnz
        self.dfs = dfs

        # and finally compute the idf weights
        n_features = max(dfs) if dfs else 0
        logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" %
                    (self.num_docs, n_features, self.num_nnz))
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
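    # A toy run of the counting above (two bag-of-words documents in the assumed
    # [(termid, count), ...] format): term 0 appears in both documents, term 1
    # in one, so:
    #
    #   >>> m = TfidfModel([[(0, 1), (1, 2)], [(0, 3)]])
    #   >>> m.dfs
    #   {0: 2, 1: 1}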
    def __getitem__(self, bow, eps=1e-12):
        """
        Return tf-idf representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
                  for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.normalize is True:
            vector = matutils.unitvec(vector)
        elif self.normalize:
            vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
        return vector
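    # Continuing the toy model above: term 0 occurs in both of the 2 training
    # documents, so its idf is log2(2/2) = 0.0 and the `!= 0.0` filter drops it;
    # term 1 keeps weight 2 * log2(2/1) = 2.0, then unit-length normalization
    # scales it to 1.0:
    #
    #   >>> m[[(0, 1), (1, 2)]]
    #   [(1, 1.0)]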
#endclass TfidfModel