#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Homer Strong, Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
This module implements the `"hashing trick" <http://en.wikipedia.org/wiki/Hashing-Trick>`_ --
a mapping between words and their integer ids using a fixed, static mapping. The
static mapping has a constant memory footprint, regardless of the number of word-types (features)
in your corpus, so it's suitable for processing extremely large corpora.
The ids are computed as `hash(word) % id_range`, where `hash` is a user-configurable
function (`zlib.adler32` by default). With HashDictionary, new words can be assigned ids
immediately, without an extra pass through the corpus to collect all the ids first.
As a consequence, HashDictionary also works with non-repeatable (once-only) streams of documents.
A disadvantage of HashDictionary is that, unlike plain :class:`Dictionary`, several words may
map to the same id (a hash collision), so the word<->id mapping is not a bijection.
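
A minimal usage sketch (toy documents; exact ids depend on the configured hash function,
and the import assumes the class is exposed as `gensim.corpora.HashDictionary`):

>>> from gensim.corpora import HashDictionary
>>> texts = [['human', 'interface', 'computer'], ['graph', 'trees', 'graph']]
>>> dct = HashDictionary(texts, id_range=32000)
>>> bow = dct.doc2bow(['graph', 'minors', 'graph'])  # list of (token_id, count) 2-tuples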
"""
from __future__ import with_statement
import logging
import itertools
import zlib
from gensim import utils
from six import iteritems, iterkeys
logger = logging.getLogger(__name__)


class HashDictionary(utils.SaveLoad, dict):
"""
HashDictionary encapsulates the mapping between normalized words and their
integer ids.
    Unlike `Dictionary`, a `HashDictionary` does not have to be built before use.
    Documents can be converted to bag-of-words immediately, with an uninitialized
    `HashDictionary`, without seeing the rest of the corpus first.
The main function is `doc2bow`, which converts a collection of words to its
bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
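
    For example, a sketch of on-the-fly conversion (no prior pass over any corpus):

    >>> dct = HashDictionary()                    # uninitialized, no documents seen yet
    >>> bow = dct.doc2bow(['human', 'computer'])  # usable immediately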
"""

    def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
"""
By default, keep track of debug statistics and mappings. If you find yourself
running out of memory (or are sure you don't need the debug info), set
`debug=False`.
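
        A memory-lean setup might look like this (parameter values are illustrative only):

        >>> dct = HashDictionary(id_range=2 ** 17, debug=False)  # no token<->id bookkeeping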
"""
self.myhash = myhash # hash fnc: string->integer
self.id_range = id_range # hash range: id = myhash(key) % id_range
self.debug = debug
        # the following (potentially massive!) dictionaries are only populated if `debug` is True
self.token2id = {}
self.id2token = {} # reverse mapping int->set(words)
self.dfs = {} # token_id -> how many documents this token_id appeared in
self.dfs_debug = {} # token_string->how many documents this word appeared in
self.num_docs = 0 # number of documents processed
self.num_pos = 0 # total number of corpus positions
self.num_nnz = 0 # total number of non-zeroes in the BOW matrix
self.allow_update = True
if documents is not None:
self.add_documents(documents)

    def __getitem__(self, tokenid):
"""
Return all words that have mapped to the given id so far, as a set.
Only works if `self.debug` was enabled.
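
        For example (with the default `debug=True` bookkeeping):

        >>> dct = HashDictionary([['human', 'interface']])
        >>> 'human' in dct[dct.restricted_hash('human')]
        True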
"""
return self.id2token.get(tokenid, set())

    def restricted_hash(self, token):
"""
Calculate id of the given token. Also keep track of what words were mapped
to what ids, for debugging reasons.
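
        For illustration (the exact id depends on the hash function and `id_range`):

        >>> dct = HashDictionary(id_range=1000)
        >>> 0 <= dct.restricted_hash('human') < 1000
        True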
"""
h = self.myhash(utils.to_utf8(token)) % self.id_range
if self.debug:
self.token2id[token] = h
self.id2token.setdefault(h, set()).add(token)
return h

    def __len__(self):
"""
Return the number of distinct ids = the entire dictionary size.
"""
return self.id_range

    def keys(self):
"""Return a list of all token ids."""
return range(len(self))

    def __str__(self):
return ("HashDictionary(%i id range)" % len(self))

    @staticmethod
    def from_documents(*args, **kwargs):
        """Convenience alias: `HashDictionary.from_documents(...)` is equivalent to `HashDictionary(...)`."""
        return HashDictionary(*args, **kwargs)

    def add_documents(self, documents):
"""
Build dictionary from a collection of documents. Each document is a list
of tokens = **tokenized and normalized** utf-8 encoded strings.
This is only a convenience wrapper for calling `doc2bow` on each document
with `allow_update=True`.
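
        A toy example (two made-up documents):

        >>> dct = HashDictionary()
        >>> dct.add_documents([['human', 'interface'], ['graph', 'trees']])
        >>> dct.num_docs
        2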
"""
for docno, document in enumerate(documents):
if docno % 10000 == 0:
logger.info("adding document #%i to %s" % (docno, self))
_ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info(
"built %s from %i documents (total %i corpus positions)",
self, self.num_docs, self.num_pos)

    def doc2bow(self, document, allow_update=False, return_missing=False):
"""
Convert `document` (a list of words) into the bag-of-words format = list
of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
**tokenized and normalized** utf-8 encoded string. No further preprocessing
is done on the words in `document`; apply tokenization, stemming etc. before
calling this method.
If `allow_update` or `self.allow_update` is set, then also update dictionary
in the process: update overall corpus statistics and document frequencies.
For each id appearing in this document, increase its document frequency
(`self.dfs`) by one.
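
        A small sketch (assuming the two distinct words below do not collide under the hash):

        >>> dct = HashDictionary(id_range=32000)
        >>> bow = dct.doc2bow(['graph', 'graph', 'trees'])
        >>> sorted(count for _, count in bow)
        [1, 2]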
"""
result = {}
        missing = {}  # always stays empty: any token can be hashed, so no word is ever "missing"
        document = sorted(document)  # sort so that identical tokens are adjacent for itertools.groupby below
for word_norm, group in itertools.groupby(document):
frequency = len(list(group)) # how many times does this word appear in the input document
tokenid = self.restricted_hash(word_norm)
result[tokenid] = result.get(tokenid, 0) + frequency
if self.debug:
# increment document count for each unique token that appeared in the document
self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1
if allow_update or self.allow_update:
self.num_docs += 1
self.num_pos += len(document)
self.num_nnz += len(result)
if self.debug:
# increment document count for each unique tokenid that appeared in the document
# done here, because several words may map to the same tokenid
for tokenid in iterkeys(result):
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
result = sorted(iteritems(result))
if return_missing:
return result, missing
else:
return result

    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""
Remove document frequency statistics for tokens that appear in
1. less than `no_below` documents (absolute number) or
2. more than `no_above` documents (fraction of total corpus size, *not*
absolute number).
3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
keep all if `None`).
**Note:** since HashDictionary's id range is fixed and doesn't depend on
the number of tokens seen, this doesn't really "remove" anything. It only
clears some supplementary statistics, for easier debugging and a smaller RAM
footprint.
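
        A toy example (threshold values are arbitrary):

        >>> dct = HashDictionary([['cat', 'dog'], ['cat', 'mouse'], ['cat']])
        >>> dct.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
        >>> 'cat' in dct.dfs_debug  # appears in >= 2 documents, so its statistics survive
        True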
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n])
self.dfs_debug = dict((word, freq)
for word, freq in iteritems(self.dfs_debug)
if word in ok)
self.token2id = dict((token, tokenid)
for token, tokenid in iteritems(self.token2id)
if token in self.dfs_debug)
self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug))
for tokenid, tokens in iteritems(self.id2token))
self.dfs = dict((tokenid, freq)
for tokenid, freq in iteritems(self.dfs)
if self.id2token.get(tokenid, set()))
        logger.info(
            "kept statistics for tokens that appeared in no fewer than %i and no more than %i (=%.1f%%) documents",
            no_below, no_above_abs, 100.0 * no_above)

    def save_as_text(self, fname):
"""
Save this HashDictionary to a text file, for easier debugging.
The format is:
`id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.
Note: use `save`/`load` to store in binary format instead (pickle).
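
        For example (the output path below is arbitrary):

        >>> dct = HashDictionary([['human', 'interface']])
        >>> dct.save_as_text('/tmp/hash_dict.txt')  # one line per id that has words mapped to it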
"""
logger.info("saving HashDictionary mapping to %s" % fname)
with utils.smart_open(fname, 'wb') as fout:
for tokenid in self.keys():
words = sorted(self[tokenid])
if words:
words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
words_df = '\t'.join(words_df)
fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df)))