#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automatically detect common phrases (multiword expressions) from a stream of sentences.

The phrases are collocations (frequently co-occurring tokens). See [1]_ for the
exact formula.
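
Concretely, a candidate bigram of words `a` and `b` is scored as
`(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b))`, where `N` is the total vocabulary
size (see `Phrases.__init__` below). With illustrative counts `cnt(new)=100`,
`cnt(york)=80`, `cnt(new_york)=40`, `min_count=5` and `N=10000`, the score is
`(40 - 5) * 10000 / (100 * 80) = 43.75`, which clears the default `threshold=10.0`,
so `new` and `york` would be joined.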

For example, if your input stream (=an iterable, with each value a list of token strings) looks like:

>>> print(list(sentence_stream))
[[u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
 [u'machine', u'learning', u'can', u'be', u'useful', u'sometimes'],
 ...,
]

you'd train the detector with:

>>> bigram = Phrases(sentence_stream)

and then transform any sentence (list of token strings) using the standard gensim syntax:

>>> sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
>>> print(bigram[sent])
[u'the', u'mayor', u'of', u'new_york', u'was', u'there']

(note `new_york` became a single token). As usual, you can also transform an entire
sentence stream using:

>>> print(list(bigram[any_sentence_stream]))
[[u'the', u'mayor', u'of', u'new_york', u'was', u'there'],
 [u'machine_learning', u'can', u'be', u'useful', u'sometimes'],
 ...,
]

You can also continue updating the collocation counts with new sentences, by:

>>> bigram.add_vocab(new_sentence_stream)

These **phrase streams are meant to be used during text preprocessing, before
converting the resulting tokens into vectors using `Dictionary`**. See the
:mod:`gensim.models.word2vec` module for an example application of using phrase detection.
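
For instance (a minimal sketch, assuming the `bigram` model and `sentence_stream`
from above), the phrase-joined tokens can be fed straight into a
`gensim.corpora.Dictionary`:

>>> from gensim.corpora import Dictionary
>>> dictionary = Dictionary(bigram[sentence_stream])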

The detection can also be **run repeatedly**, to get phrases longer than
two tokens (e.g. `new_york_times`):

>>> trigram = Phrases(bigram[sentence_stream])
>>> sent = [u'the', u'new', u'york', u'times', u'is', u'a', u'newspaper']
>>> print(trigram[bigram[sent]])
[u'the', u'new_york_times', u'is', u'a', u'newspaper']
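
As with bigrams, the stacked detectors can be applied to a whole stream at once
(a sketch, reusing `sentence_stream` from above):

>>> for sent in trigram[bigram[sentence_stream]]:
...     print(sent)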

.. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
       Distributed Representations of Words and Phrases and their Compositionality.
       In Proceedings of NIPS, 2013.

"""

import sys
import os
import logging
from collections import defaultdict

from six import iteritems, string_types

from gensim import utils, interfaces

logger = logging.getLogger(__name__)


class Phrases(interfaces.TransformationABC):
    """
    Detect phrases, based on collected collocation counts. Adjacent words that appear
    together more frequently than expected are joined together with the `_` character.

    It can be used to generate phrases on the fly, using the `phrases[sentence]`
    and `phrases[corpus]` syntax.

    """
    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_'):
        """
        Initialize the model from an iterable of `sentences`. Each sentence
        must be a list of words (unicode strings) that will be used for training.

        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        without storing everything in RAM. See :class:`BrownCorpus`,
        :class:`Text8Corpus` or :class:`LineSentence` in the
        :mod:`gensim.models.word2vec` module for such examples.

        `min_count`: ignore all words and bigrams with total collected count lower
        than this.

        `threshold` represents a threshold for forming the phrases (higher means
        fewer phrases). A phrase of words `a` and `b` is accepted if
        `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is
        the total vocabulary size.

        `max_vocab_size` is the maximum size of the vocabulary. Used to control
        pruning of less common words, to keep memory under control. The default of
        40M needs about 3.6GB of RAM; increase/decrease `max_vocab_size` depending
        on how much available memory you have.

        `delimiter` is the glue character used to join collocation tokens, and
        should be a byte string (e.g. b'_').

        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")
        if threshold <= 0:
            raise ValueError("threshold should be positive")

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter

        if sentences is not None:
            self.add_vocab(sentences)
    def __str__(self):
        """Get short string representation of this phrase detector."""
        return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
            self.__class__.__name__, len(self.vocab), self.min_count,
            self.threshold, self.max_vocab_size)

    @staticmethod
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_'):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))
        return min_reduce, vocab
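
    # Illustrative usage with a tiny, made-up corpus: `learn_vocab` keeps both the
    # unigrams and the delimiter-joined bigrams in one count dict, keyed by utf8 bytes:
    #
    #   >>> _, vocab = Phrases.learn_vocab([[u'new', u'york'], [u'new', u'york', u'times']], 1000)
    #   >>> vocab[b'new_york'], vocab[b'new'], vocab[b'york']
    #   (2, 2, 2)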
    def add_vocab(self, sentences):
        """
        Merge the collected counts `vocab` into this phrase detector.

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter)

        logger.info("merging %i counts into %s", len(vocab), self)
        self.min_reduce = max(self.min_reduce, min_reduce)
        for word, count in iteritems(vocab):
            self.vocab[word] += count
        if len(self.vocab) > self.max_vocab_size:
            utils.prune_vocab(self.vocab, self.min_reduce)
            self.min_reduce += 1

        logger.info("merged %s", self)
    def export_phrases(self, sentences):
        """
        Generate an iterator over all phrases detected in `sentences`, as
        `(phrase, score)` tuples.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for phrase, score in bigram.export_phrases(sentences):
          ...     print(u'{0}\t{1}'.format(phrase, score))

        You can then use the generated tab-separated output to tune `threshold`.

        """
        for sentence in sentences:
            s = [utils.any2utf8(w) for w in sentence]
            last_bigram = False
            vocab = self.vocab
            threshold = self.threshold
            delimiter = self.delimiter
            min_count = self.min_count
            for word_a, word_b in zip(s, s[1:]):
                if word_a in vocab and word_b in vocab:
                    bigram_word = delimiter.join((word_a, word_b))
                    if bigram_word in vocab and not last_bigram:
                        pa = float(vocab[word_a])
                        pb = float(vocab[word_b])
                        pab = float(vocab[bigram_word])
                        score = (pab - min_count) / pa / pb * len(vocab)
                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                        if score > threshold:
                            yield (b' '.join((word_a, word_b)), score)
                            last_bigram = True
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_').

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for sentence in bigram[sentences]:
          ...     print(u' '.join(sentence))

          he refuted nechaev other anarchists sometimes identified as pacifist
          anarchists advocated complete nonviolence leo_tolstoy

        """
        try:
            is_single = not sentence or isinstance(sentence[0], string_types)
        except:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter
        min_count = self.min_count
        for word_a, word_b in zip(s, s[1:]):
            if word_a in vocab and word_b in vocab:
                bigram_word = delimiter.join((word_a, word_b))
                if bigram_word in vocab and not last_bigram:
                    pa = float(vocab[word_a])
                    pb = float(vocab[word_b])
                    pab = float(vocab[bigram_word])
                    score = (pab - min_count) / pa / pb * len(vocab)
                    # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                    #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                    if score > threshold:
                        new_s.append(bigram_word)
                        last_bigram = True
                        continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
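
    # Note (illustrative): indexing with a single tokenized sentence returns a plain
    # list of tokens, while indexing with an iterable of sentences returns a lazily
    # evaluated stream (one transformed sentence per iteration), e.g.:
    #
    #   bigram[[u'machine', u'learning']]      # single sentence -> list of tokens
    #   bigram[[[u'machine', u'learning']]]    # corpus of one sentence -> stream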
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    infile = sys.argv[1]

    from gensim.models import Phrases  # for pickle
    from gensim.models.word2vec import Text8Corpus
    sentences = Text8Corpus(infile)

    # test_doc = LineSentence('test/test_data/testcorpus.txt')
    bigram = Phrases(sentences, min_count=5, threshold=100)

    for s in bigram[sentences]:
        print(utils.to_utf8(u' '.join(s)))
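
# Example command-line invocation (illustrative path; the script expects a text8-style
# corpus, read via Text8Corpus):
#
#   python -m gensim.models.phrases ~/data/text8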