
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
This module implements the concept of Dictionary -- a mapping between words and
their integer ids.

Dictionaries can be created from a corpus and can later be pruned according to
document frequency (removing (un)common words via the :func:`Dictionary.filter_extremes` method),
saved/loaded from disk (via the :func:`Dictionary.save` and :func:`Dictionary.load` methods), merged
with other dictionaries (:func:`Dictionary.merge_with`), etc.
"""

from __future__ import with_statement

try:
    from collections.abc import Mapping  # Python 3.3+
except ImportError:  # Python 2
    from collections import Mapping
from collections import defaultdict
import sys
import logging
import itertools

from gensim import utils

if sys.version_info[0] >= 3:
    unicode = str

from six import PY3, iteritems, iterkeys, itervalues, string_types
from six.moves import xrange
from six.moves import zip as izip


logger = logging.getLogger('gensim.corpora.dictionary')


class Dictionary(utils.SaveLoad, Mapping):
    """
    Dictionary encapsulates the mapping between normalized words and their integer ids.

    The main function is `doc2bow`, which converts a collection of words to its
    bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.

    """
    def __init__(self, documents=None, prune_at=2000000):
        """
        If `documents` are given, use them to initialize Dictionary (see
        `add_documents()`).

        """
        self.token2id = {}  # token -> tokenId
        self.id2token = {}  # reverse mapping for token2id; only formed on request, to save memory
        self.dfs = {}  # document frequencies: tokenId -> in how many documents this token appeared

        self.num_docs = 0  # number of documents processed
        self.num_pos = 0  # total number of corpus positions
        self.num_nnz = 0  # total number of non-zeroes in the BOW matrix

        if documents is not None:
            self.add_documents(documents, prune_at=prune_at)

    def __getitem__(self, tokenid):
        if len(self.id2token) != len(self.token2id):
            # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = dict((v, k) for k, v in iteritems(self.token2id))
        return self.id2token[tokenid]  # will throw for non-existent ids

    def __iter__(self):
        return iter(self.keys())

    if PY3:
        # restore Py2-style dict API
        iterkeys = __iter__

        def iteritems(self):
            return self.items()

        def itervalues(self):
            return self.values()

    def keys(self):
        """Return a list of all token ids."""
        return list(self.token2id.values())

    def __len__(self):
        """
        Return the number of token->id mappings in the dictionary.

        """
        return len(self.token2id)

    def __str__(self):
        some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
        return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')

    @staticmethod
    def from_documents(documents):
        return Dictionary(documents=documents)

    def add_documents(self, documents, prune_at=2000000):
        """
        Update dictionary from a collection of documents. Each document is a list
        of tokens = **tokenized and normalized** strings (either utf8 or unicode).

        This is a convenience wrapper for calling `doc2bow` on each document
        with `allow_update=True`, which also prunes infrequent words, keeping the
        total number of unique words <= `prune_at`. This is to save memory on very
        large inputs. To disable this pruning, set `prune_at=None`.

        >>> print(Dictionary(["máma mele maso".split(), "ema má máma".split()]))
        Dictionary(5 unique tokens)

        """
        for docno, document in enumerate(documents):
            # log progress & run a regular check for pruning, once every 10k docs
            if docno % 10000 == 0:
                if prune_at is not None and len(self) > prune_at:
                    self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                logger.info("adding document #%i to %s", docno, self)

            # update Dictionary with the document
            self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids

        logger.info(
            "built %s from %i documents (total %i corpus positions)",
            self, self.num_docs, self.num_pos)

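    # Usage sketch: build the dictionary incrementally from several batches of
    # tokenized documents, instead of passing everything to the constructor.
    #
    #   >>> dct = Dictionary()
    #   >>> dct.add_documents([["máma", "mele", "maso"]])
    #   >>> dct.add_documents([["ema", "má", "máma"]], prune_at=None)  # skip pruning for this batch
    #   >>> len(dct)
    #   5
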
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** string (either unicode or utf8-encoded). No
        further preprocessing is done on the words in `document`; apply
        tokenization, stemming etc. before calling this method.

        If `allow_update` is set, then also update dictionary in the process:
        create ids for new words. At the same time, update document frequencies --
        for each word appearing in this document, increase its document frequency
        (`self.dfs`) by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.

        """
        if isinstance(document, string_types):
            raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1

        token2id = self.token2id
        if allow_update or return_missing:
            missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id)
            if allow_update:
                for w in missing:
                    # new id = number of ids made so far;
                    # NOTE this assumes there are no gaps in the id sequence!
                    token2id[w] = len(token2id)

        result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id)

        if allow_update:
            self.num_docs += 1
            self.num_pos += sum(itervalues(counter))
            self.num_nnz += len(result)
            # increase document count for each unique token that appeared in the document
            dfs = self.dfs
            for tokenid in iterkeys(result):
                dfs[tokenid] = dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result

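    # Usage sketch (the exact ids assigned depend on insertion order):
    #
    #   >>> dct = Dictionary(["human computer interaction".split()])
    #   >>> bow, missing = dct.doc2bow("human and computer and computer".split(), return_missing=True)
    #   >>> missing    # -> {'and': 2}: 'and' occurs twice but has no id (allow_update=False)
    #   >>> bow        # -> [(id_of_computer, 2), (id_of_human, 1)], sorted by token id
    #   >>> dct.doc2bow("human and computer".split(), allow_update=True)  # now 'and' gets an id too
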
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
        """
        Filter out tokens that appear in

        1. less than `no_below` documents (absolute number) or
        2. more than `no_above` documents (fraction of total corpus size, *not*
           absolute number).
        3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
           keep all if `None`).

        After the pruning, shrink resulting gaps in word ids.

        **Note**: Due to the gap shrinking, the same word may have a different
        word id before and after the call to this function!

        """
        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold

        # determine which tokens to keep
        good_ids = (
            v for v in itervalues(self.token2id)
            if no_below <= self.dfs.get(v, 0) <= no_above_abs)
        good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
        if keep_n is not None:
            good_ids = good_ids[:keep_n]
        bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
        logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
        logger.info(
            "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
            len(good_ids), no_below, no_above_abs, 100.0 * no_above)

        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        self.filter_tokens(good_ids=good_ids)
        logger.info("resulting dictionary: %s", self)

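    # Usage sketch (illustrative thresholds; assumes a populated Dictionary `dct`
    # and the original token lists `texts`): prune rare and overly common tokens,
    # then re-vectorize, since surviving tokens get new ids.
    #
    #   >>> dct.filter_extremes(no_below=5, no_above=0.3, keep_n=50000)
    #   >>> corpus = [dct.doc2bow(text) for text in texts]
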
    def filter_n_most_frequent(self, remove_n):
        """
        Filter out the 'remove_n' most frequent tokens that appear in the documents.

        After the pruning, shrink resulting gaps in word ids.

        **Note**: Due to the gap shrinking, the same word may have a different
        word id before and after the call to this function!

        """
        # determine which tokens to keep
        most_frequent_ids = (v for v in itervalues(self.token2id))
        most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
        most_frequent_ids = most_frequent_ids[:remove_n]

        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids]
        logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

        self.filter_tokens(bad_ids=most_frequent_ids)
        logger.info("resulting dictionary: %s", self)

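    # Usage sketch (assumes a populated Dictionary `dct`): drop the two most
    # frequent tokens outright, e.g. as a crude stopword filter.
    #
    #   >>> dct.filter_n_most_frequent(2)
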
    def filter_tokens(self, bad_ids=None, good_ids=None):
        """
        Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep
        selected `good_ids` in the mapping and remove the rest.

        `bad_ids` and `good_ids` are collections of word ids to be removed.

        """
        if bad_ids is not None:
            bad_ids = set(bad_ids)
            self.token2id = dict((token, tokenid)
                                 for token, tokenid in iteritems(self.token2id)
                                 if tokenid not in bad_ids)
            self.dfs = dict((tokenid, freq)
                            for tokenid, freq in iteritems(self.dfs)
                            if tokenid not in bad_ids)
        if good_ids is not None:
            good_ids = set(good_ids)
            self.token2id = dict((token, tokenid)
                                 for token, tokenid in iteritems(self.token2id)
                                 if tokenid in good_ids)
            self.dfs = dict((tokenid, freq)
                            for tokenid, freq in iteritems(self.dfs)
                            if tokenid in good_ids)
        self.compactify()

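    # Usage sketch (assumes a populated Dictionary `dct`; `keep_words` is a
    # placeholder for any iterable of tokens you want to whitelist): remove
    # explicit stopwords by id, or keep only a whitelist. Ids are compacted
    # afterwards.
    #
    #   >>> stop_ids = [dct.token2id[w] for w in ('the', 'of') if w in dct.token2id]
    #   >>> dct.filter_tokens(bad_ids=stop_ids)
    #   >>> dct.filter_tokens(good_ids=[dct.token2id[w] for w in keep_words if w in dct.token2id])
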
    def compactify(self):
        """
        Assign new word ids to all words.

        This is done to make the ids more compact, e.g. after some tokens have
        been removed via :func:`filter_tokens` and there are gaps in the id series.
        Calling this method will remove the gaps.

        """
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
        self.id2token = {}
        self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))

    def save_as_text(self, fname, sort_by_word=True):
        """
        Save this Dictionary to a text file, in format:
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
        or by decreasing word frequency.

        Note: the text format should be used for corpus inspection. Use
        `save`/`load` to store in binary format (pickle) for improved performance.

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))

    def merge_with(self, other):
        """
        Merge another dictionary into this dictionary, mapping the same tokens to the
        same ids and new tokens to new ids. The purpose is to merge two corpora
        created using two different dictionaries, one from `self` and one from `other`.

        `other` can be any id=>word mapping (a dict, a Dictionary object, ...).

        Return a transformation object which, when accessed as `result[doc_from_other_corpus]`,
        will convert documents from a corpus built using the `other` dictionary
        into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`).

        Example:

        >>> dict1 = Dictionary(some_documents)
        >>> dict2 = Dictionary(other_documents)  # ids not compatible with dict1!
        >>> dict2_to_dict1 = dict1.merge_with(dict2)
        >>> # now we can merge corpora from the two incompatible dictionaries into one
        >>> merged_corpus = itertools.chain(some_corpus_from_dict1, dict2_to_dict1[some_corpus_from_dict2])

        """
        old2new = {}
        for other_id, other_token in iteritems(other):
            if other_token in self.token2id:
                new_id = self.token2id[other_token]
            else:
                new_id = len(self.token2id)
                self.token2id[other_token] = new_id
                self.dfs[new_id] = 0
            old2new[other_id] = new_id
            try:
                self.dfs[new_id] += other.dfs[other_id]
            except AttributeError:
                # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
                pass
        try:
            self.num_docs += other.num_docs
            self.num_nnz += other.num_nnz
            self.num_pos += other.num_pos
        except AttributeError:
            # `other` has no corpus statistics to merge
            pass

        import gensim.models
        return gensim.models.VocabTransform(old2new)

    @staticmethod
    def load_from_text(fname):
        """
        Load a previously stored Dictionary from a text file.
        Mirror function to `save_as_text`.

        """
        result = Dictionary()
        with utils.smart_open(fname) as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result

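    # Usage sketch (the path is a placeholder): round-trip through the plain-text
    # format, e.g. to inspect or hand-edit the mapping.
    #
    #   >>> dct.save_as_text('/tmp/example_dict.txt')    # id<TAB>token<TAB>docfreq lines
    #   >>> dct2 = Dictionary.load_from_text('/tmp/example_dict.txt')
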
    @staticmethod
    def from_corpus(corpus, id2word=None):
        """
        Create Dictionary from an existing corpus. This can be useful if you only
        have a term-document BOW matrix (represented by `corpus`), but not the
        original text corpus.

        This will scan the term-document count matrix for all word ids that
        appear in it, then construct and return Dictionary which maps each
        `word_id -> id2word[word_id]`.

        `id2word` is an optional dictionary that maps the `word_id` to a token. In
        case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)`
        will be used.

        """
        result = Dictionary()
        max_id = -1
        for docno, document in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("adding document #%i to %s", docno, result)
            result.num_docs += 1
            result.num_nnz += len(document)
            for wordid, word_freq in document:
                max_id = max(wordid, max_id)
                result.num_pos += word_freq
                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

        if id2word is None:
            # make sure length(result) == get_max_id(corpus) + 1
            result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1))
        else:
            # id=>word mapping given: simply copy it
            result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word))
        # make sure all token ids have a valid `dfs` entry
        for id in itervalues(result.token2id):
            result.dfs[id] = result.dfs.get(id, 0)

        logger.info(
            "built %s from %i documents (total %i corpus positions)",
            result, result.num_docs, result.num_pos)
        return result

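    # Usage sketch (toy corpus): rebuild a Dictionary when only the BOW vectors
    # survive. Without `id2word`, tokens are simply named after their ids.
    #
    #   >>> corpus = [[(0, 2), (1, 1)], [(1, 1), (2, 3)]]    # two documents, three word ids
    #   >>> dct = Dictionary.from_corpus(corpus)
    #   >>> len(dct)
    #   3
    #   >>> dct2 = Dictionary.from_corpus(corpus, id2word={0: 'human', 1: 'computer', 2: 'system'})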