Source code for gensim.models.coherencemodel

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Module for calculating topic coherence in Python. This is an implementation of
the four-stage topic coherence pipeline from the paper [1].
The four stage pipeline is basically:

Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation.

Implementing the pipeline this way allows the user to, in essence, "make" a
coherence measure of their choice by choosing a method for each stage of the pipeline.

[1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic
coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.
"""

import logging

from gensim import interfaces
from gensim.topic_coherence import (segmentation, probability_estimation,
                                    direct_confirmation_measure, indirect_confirmation_measure,
                                    aggregation)
from gensim.matutils import argsort
from gensim.utils import is_corpus, FakeDict
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet

import numpy as np

from collections import namedtuple

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Coherence measure names, grouped by the probability estimation they rely on:
# whole-document boolean counts versus boolean sliding-window counts.
boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']

# Record type with one slot per pipeline stage: segmentation, probability
# estimation, confirmation measure and aggregation.
make_pipeline = namedtuple('Coherence_Measure', ['seg', 'prob', 'conf', 'aggr'])

# Maps each supported coherence measure name to the four callables that make up
# its pipeline. 'c_uci' and 'c_npmi' share the same stages; they differ only in
# the ``normalize`` flag that ``CoherenceModel.get_coherence`` passes to ``conf``.
coherence_dict = {
    'u_mass': make_pipeline(
        seg=segmentation.s_one_pre,
        prob=probability_estimation.p_boolean_document,
        conf=direct_confirmation_measure.log_conditional_probability,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_v': make_pipeline(
        seg=segmentation.s_one_set,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=indirect_confirmation_measure.cosine_similarity,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_uci': make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_npmi': make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean,
    ),
}

# Default sliding-window size for each sliding-window based coherence measure;
# looked up by ``CoherenceModel.get_coherence``.
sliding_windows_dict = dict(
    c_v=110,
    c_uci=10,
    c_npmi=10,
)

class CoherenceModel(interfaces.TransformationABC):
    """
    Objects of this class allow for building and maintaining a model for topic
    coherence.

    The main methods are:

    1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
    2. the ``get_coherence()`` method, which returns the topic coherence.

    One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
    provided if the model does not contain a dictionary already.

    >>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass')  # tm is the trained topic model
    >>> cm.get_coherence()

    Another way of using this feature is through providing tokenized topics such as:

    >>> topics = [['human', 'computer', 'system', 'interface'],
    ...           ['graph', 'minors', 'trees', 'eps']]
    >>> # note that a dictionary has to be provided.
    >>> cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    >>> cm.get_coherence()

    Model persistency is achieved via its load/save methods.
    """

    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None,
                 coherence='c_v', topn=10):
        """
        Args:
        ----
        model : Pre-trained topic model. Should be provided if topics is not provided.
        topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. eg.
                 topics = [['human', 'machine', 'computer', 'interface'],
                           ['graph', 'trees', 'binary', 'widths']]
                 If both model and topics are given, the topics are taken from the model.
        texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
        corpus : Gensim document corpus.
        dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present,
                     this is not needed. If both are provided, dictionary will be used.
        window_size : Is the size of the window to be used for coherence measures using boolean sliding window
                      as their probability estimator. For 'u_mass' this doesn't matter.
                      If left 'None' the default window sizes are used which are:
                      'c_v' : 110
                      'c_uci' : 10
                      'c_npmi' : 10
        coherence : Coherence measure to be used. Supported values are:
                    'u_mass'
                    'c_v'
                    'c_uci' also popularly known as c_pmi
                    'c_npmi'
                    For 'u_mass' corpus should be provided. If texts is provided, it will be converted
                    to corpus using the dictionary.
                    For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
        topn : Integer corresponding to the number of top words to be extracted from each topic.

        Raises:
        ------
        ValueError : If neither model nor topics is given, if topics is given without a dictionary,
                     if neither texts nor corpus is given, if no usable dictionary is available,
                     if the inputs required by the chosen coherence measure are missing, or if the
                     coherence measure (or, via ``_get_topics``, the model type) is not supported.
        """
        if model is None and topics is None:
            raise ValueError("One of model or topics has to be provided.")
        elif topics is not None and dictionary is None:
            raise ValueError("dictionary has to be provided if topics are to be used.")
        if texts is None and corpus is None:
            raise ValueError("One of texts or corpus has to be provided.")

        # Check if associated dictionary is provided. An explicit dictionary wins over model.id2word.
        if dictionary is None:
            if isinstance(model.id2word, FakeDict):
                raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
                                 " should be set as the associated dictionary.")
            else:
                self.dictionary = model.id2word
        else:
            self.dictionary = dictionary

        # Check for correct inputs for u_mass coherence measure.
        if coherence in boolean_document_based:
            if is_corpus(corpus)[0]:
                self.corpus = corpus
            elif texts is not None:
                # No usable corpus: build a bag-of-words corpus from the tokenized texts.
                self.texts = texts
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
            else:
                raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)

        # Check for correct inputs for the sliding window based coherence measures.
        elif coherence in sliding_window_based:
            # May be None here; get_coherence() substitutes the per-measure default.
            self.window_size = window_size
            if texts is None:
                raise ValueError("'texts' should be provided for %s coherence." % coherence)
            else:
                self.texts = texts

        else:
            raise ValueError("%s coherence is not currently supported." % coherence)

        self.topn = topn
        self.model = model
        if model is not None:
            self.topics = self._get_topics()
        elif topics is not None:
            # Translate every token of every topic to its id in the dictionary.
            token2id = self.dictionary.token2id
            self.topics = [np.array([token2id[token] for token in topic]) for topic in topics]
        self.coherence = coherence

    def __str__(self):
        """Return the string form of the pipeline tuple registered for ``self.coherence``."""
        return str(coherence_dict[self.coherence])

    def _get_topics(self):
        """Internal helper function to return topics from a trained topic model.

        For each topic of ``self.model`` the per-word weight vector is passed to
        ``argsort(..., topn=self.topn, reverse=True)`` and the results are collected in a list.

        Raises ValueError if the model is not an LdaModel, LdaVowpalWabbit or LdaMallet instance.
        """
        # Each supported model type exposes its topic-word weights under a different attribute.
        if isinstance(self.model, LdaModel):
            topic_word_weights = self.model.state.get_lambda()
        elif isinstance(self.model, LdaVowpalWabbit):
            topic_word_weights = self.model._get_topics()
        elif isinstance(self.model, LdaMallet):
            topic_word_weights = self.model.word_topics
        else:
            raise ValueError("This topic model is not currently supported. Supported topic models are "
                             "LdaModel, LdaVowpalWabbit and LdaMallet.")
        return [argsort(topic, topn=self.topn, reverse=True) for topic in topic_word_weights]

    def get_coherence(self):
        """
        Return coherence value based on pipeline parameters.

        Runs the four stages registered in ``coherence_dict`` for ``self.coherence``:
        segmentation, probability estimation, confirmation measure and aggregation.
        For sliding window based measures, a ``window_size`` of None is replaced (and stored
        on the instance) by the default from ``sliding_windows_dict``.
        """
        measure = coherence_dict[self.coherence]
        segmented_topics = measure.seg(self.topics)

        if self.coherence in boolean_document_based:
            per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics)
            confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs)

        elif self.coherence in sliding_window_based:
            # Fall back to the default window only when the caller did not supply one.
            if self.window_size is None:
                self.window_size = sliding_windows_dict[self.coherence]
            per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics,
                                                           dictionary=self.dictionary, window_size=self.window_size)
            if self.coherence == 'c_v':
                confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1,
                                                  num_windows)
            else:
                # 'c_npmi' is the normalized variant; 'c_uci' is left unnormalized.
                normalize = self.coherence == 'c_npmi'
                confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows,
                                                  normalize=normalize)

        return measure.aggr(confirmed_measures)