#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
import logging
from gensim import interfaces, matutils
logger = logging.getLogger(__name__)
[docs]class NormModel(interfaces.TransformationABC):
"""
Objects of this class realize the explicit normalization of
vectors. Supported norms are l1' and 'l2' with 'l2' being
default.
The main methods are:
1. Constructor which normalizes the terms in the given corpus document-wise.
2. The normalize() method which normalizes a simple count representation.
3. The [] transformation which internally calls the self.normalize() method.
>>> norm_l2 = NormModel(corpus)
>>> print(norm_l2[some_doc])
>>> norm_l2.save('/tmp/foo.tfidf_model')
Model persistency is achieved via its load/save methods
"""
[docs] def __init__(self, corpus=None, norm='l2'):
"""
Compute the 'l1' or 'l2' normalization by normalizing separately
for each doc in a corpus.
Formula for 'l1' norm for term 'i' in document 'j' in a corpus of 'D' documents is::
norml1_{i, j} = (i / sum(absolute(values in j)))
Formula for 'l2' norm for term 'i' in document 'j' in a corpus of 'D' documents is::
norml2_{i, j} = (i / sqrt(sum(square(values in j))))
"""
self.norm = norm
if corpus is not None:
self.calc_norm(corpus)
else:
pass
def __str__(self):
return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % (self.num_docs, self.num_nnz, self.norm)
[docs] def calc_norm(self, corpus):
"""
Calculates the norm by calling matutils.unitvec with the norm parameter.
"""
logger.info("Performing %s normalization..." % (self.norm))
norms = []
numnnz = 0
docno = 0
for bow in corpus:
docno += 1
numnnz += len(bow)
norms.append(matutils.unitvec(bow, self.norm))
self.num_docs = docno
self.num_nnz = numnnz
self.norms = norms
[docs] def normalize(self, bow):
vector = matutils.unitvec(bow, self.norm)
return vector
def __getitem__(self, bow):
return self.normalize(bow)
#endclass NormModel