#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Homer Strong, Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
This module implements the `"hashing trick" <http://en.wikipedia.org/wiki/Hashing-Trick>`_ --
a mapping between words and their integer ids using a fixed, static mapping. The
static mapping has a constant memory footprint, regardless of the number of word-types (features)
in your corpus, so it's suitable for processing extremely large corpora.
The ids are computed as `hash(word) % id_range`, where `hash` is a user-configurable
function (`zlib.adler32` by default). With HashDictionary, new words can be assigned ids
immediately, without an extra pass through the corpus to collect all the ids first.
As a consequence, HashDictionary also works with non-repeatable (once-only) streams of documents.
A disadvantage of HashDictionary is that, unlike plain :class:`Dictionary`, several words may
map to the same id (a hash collision), so the word<->id mapping is not a bijection.
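
A minimal usage sketch (toy documents; exact ids depend on the configured hash function,
and the import assumes the class is exposed as `gensim.corpora.HashDictionary`):

>>> from gensim.corpora import HashDictionary
>>> texts = [['human', 'interface', 'computer'], ['graph', 'trees', 'graph']]
>>> dct = HashDictionary(texts, id_range=32000)
>>> bow = dct.doc2bow(['graph', 'minors', 'graph'])  # list of (token_id, count) 2-tuples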
"""
from __future__ import with_statement
import logging
import itertools
import zlib
from gensim import utils
from six import iteritems, iterkeys
logger = logging.getLogger(__name__)


class HashDictionary(utils.SaveLoad, dict):
"""
HashDictionary encapsulates the mapping between normalized words and their
integer ids.
    Unlike `Dictionary`, a `HashDictionary` does not have to be built before use.
    Documents can be converted to bag-of-words immediately, with an uninitialized
    `HashDictionary`, without seeing the rest of the corpus first.
The main function is `doc2bow`, which converts a collection of words to its
bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
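
    For example, a sketch of on-the-fly conversion (no prior pass over any corpus):

    >>> dct = HashDictionary()                    # uninitialized, no documents seen yet
    >>> bow = dct.doc2bow(['human', 'computer'])  # usable immediately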
"""

    def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
"""
By default, keep track of debug statistics and mappings. If you find yourself
running out of memory (or are sure you don't need the debug info), set
`debug=False`.
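
        A memory-lean setup might look like this (parameter values are illustrative only):

        >>> dct = HashDictionary(id_range=2 ** 17, debug=False)  # no token<->id bookkeeping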
"""
self.myhash = myhash # hash fnc: string->integer
self.id_range = id_range # hash range: id = myhash(key) % id_range
self.debug = debug
        # the following (potentially massive!) dictionaries are only populated if `debug` is True
self.token2id = {}
self.id2token = {} # reverse mapping int->set(words)
self.dfs = {} # token_id -> how many documents this token_id appeared in
self.dfs_debug = {} # token_string->how many documents this word appeared in
self.num_docs = 0 # number of documents processed
self.num_pos = 0 # total number of corpus positions
self.num_nnz = 0 # total number of non-zeroes in the BOW matrix
self.allow_update = True
if documents is not None:
self.add_documents(documents)

    def __getitem__(self, tokenid):
"""
Return all words that have mapped to the given id so far, as a set.
Only works if `self.debug` was enabled.
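
        For example (with the default `debug=True` bookkeeping):

        >>> dct = HashDictionary([['human', 'interface']])
        >>> 'human' in dct[dct.restricted_hash('human')]
        True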
"""
return self.id2token.get(tokenid, set())

    def restricted_hash(self, token):
"""
Calculate id of the given token. Also keep track of what words were mapped
to what ids, for debugging reasons.
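
        For illustration (the exact id depends on the hash function and `id_range`):

        >>> dct = HashDictionary(id_range=1000)
        >>> 0 <= dct.restricted_hash('human') < 1000
        True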
"""
h = self.myhash(utils.to_utf8(token)) % self.id_range
if self.debug:
self.token2id[token] = h
self.id2token.setdefault(h, set()).add(token)
return h

    def __len__(self):
"""
Return the number of distinct ids = the entire dictionary size.
"""
return self.id_range

    def keys(self):
"""Return a list of all token ids."""
return range(len(self))

    def __str__(self):
return ("HashDictionary(%i id range)" % len(self))

    @staticmethod
    def from_documents(*args, **kwargs):
        """Convenience alias: `HashDictionary.from_documents(...)` is equivalent to `HashDictionary(...)`."""
        return HashDictionary(*args, **kwargs)

    def add_documents(self, documents):
"""
Build dictionary from a collection of documents. Each document is a list
of tokens = **tokenized and normalized** utf-8 encoded strings.
This is only a convenience wrapper for calling `doc2bow` on each document
with `allow_update=True`.
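
        A toy example (two made-up documents):

        >>> dct = HashDictionary()
        >>> dct.add_documents([['human', 'interface'], ['graph', 'trees']])
        >>> dct.num_docs
        2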
"""
for docno, document in enumerate(documents):
if docno % 10000 == 0:
logger.info("adding document #%i to %s" % (docno, self))
_ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info(
"built %s from %i documents (total %i corpus positions)",
self, self.num_docs, self.num_pos)

    def doc2bow(self, document, allow_update=False, return_missing=False):
"""
Convert `document` (a list of words) into the bag-of-words format = list
of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
**tokenized and normalized** utf-8 encoded string. No further preprocessing
is done on the words in `document`; apply tokenization, stemming etc. before
calling this method.
If `allow_update` or `self.allow_update` is set, then also update dictionary
in the process: update overall corpus statistics and document frequencies.
For each id appearing in this document, increase its document frequency
(`self.dfs`) by one.
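
        A small sketch (assuming the two distinct words below do not collide under the hash):

        >>> dct = HashDictionary(id_range=32000)
        >>> bow = dct.doc2bow(['graph', 'graph', 'trees'])
        >>> sorted(count for _, count in bow)
        [1, 2]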
"""
result = {}
        missing = {}  # always stays empty: any token can be hashed, so no word is ever "missing"
        document = sorted(document)  # sort so that identical tokens are adjacent for itertools.groupby below
for word_norm, group in itertools.groupby(document):
frequency = len(list(group)) # how many times does this word appear in the input document
tokenid = self.restricted_hash(word_norm)
result[tokenid] = result.get(tokenid, 0) + frequency
if self.debug:
# increment document count for each unique token that appeared in the document
self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1
if allow_update or self.allow_update:
self.num_docs += 1
self.num_pos += len(document)
self.num_nnz += len(result)
if self.debug:
# increment document count for each unique tokenid that appeared in the document
# done here, because several words may map to the same tokenid
for tokenid in iterkeys(result):
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
result = sorted(iteritems(result))
if return_missing:
return result, missing
else:
return result

    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""
Remove document frequency statistics for tokens that appear in
1. less than `no_below` documents (absolute number) or
2. more than `no_above` documents (fraction of total corpus size, *not*
absolute number).
3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
keep all if `None`).
**Note:** since HashDictionary's id range is fixed and doesn't depend on
the number of tokens seen, this doesn't really "remove" anything. It only
clears some supplementary statistics, for easier debugging and a smaller RAM
footprint.
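
        A toy example (threshold values are arbitrary):

        >>> dct = HashDictionary([['cat', 'dog'], ['cat', 'mouse'], ['cat']])
        >>> dct.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
        >>> 'cat' in dct.dfs_debug  # appears in >= 2 documents, so its statistics survive
        True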
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n])
self.dfs_debug = dict((word, freq)
for word, freq in iteritems(self.dfs_debug)
if word in ok)
self.token2id = dict((token, tokenid)
for token, tokenid in iteritems(self.token2id)
if token in self.dfs_debug)
self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug))
for tokenid, tokens in iteritems(self.id2token))
self.dfs = dict((tokenid, freq)
for tokenid, freq in iteritems(self.dfs)
if self.id2token.get(tokenid, set()))
        logger.info(
            "kept statistics for tokens that appeared in no fewer than %i and no more than %i (=%.1f%%) documents",
            no_below, no_above_abs, 100.0 * no_above)

    def save_as_text(self, fname):
"""
Save this HashDictionary to a text file, for easier debugging.
The format is:
`id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.
Note: use `save`/`load` to store in binary format instead (pickle).
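
        For example (the output path below is arbitrary):

        >>> dct = HashDictionary([['human', 'interface']])
        >>> dct.save_as_text('/tmp/hash_dict.txt')  # one line per id that has words mapped to it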
"""
logger.info("saving HashDictionary mapping to %s" % fname)
with utils.smart_open(fname, 'wb') as fout:
for tokenid in self.keys():
words = sorted(self[tokenid])
if words:
words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
words_df = '\t'.join(words_df)
fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df)))