#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Module for calculating topic coherence in python. This is the implementation of
the four stage topic coherence pipeline from the paper [1].
The four stage pipeline is basically:
Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation.
Implementation of this pipeline allows for the user to in essence "make" a
coherence measure of his/her choice by choosing a method in each of the pipelines.
[1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic
coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.
"""
import logging
from gensim import interfaces
from gensim.topic_coherence import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
from gensim.matutils import argsort
from gensim.utils import is_corpus, FakeDict
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
import numpy as np
from collections import namedtuple
logger = logging.getLogger(__name__)

# Coherence measures whose probability estimation is boolean-document based.
boolean_document_based = ['u_mass']
# Coherence measures whose probability estimation is boolean-sliding-window based.
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']

# A coherence pipeline is the quadruple from Roeder et al.:
# segmentation -> probability estimation -> confirmation measure -> aggregation.
make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

# Supported coherence measures, each wired to its four pipeline stages.
coherence_dict = {
    'u_mass': make_pipeline(
        seg=segmentation.s_one_pre,
        prob=probability_estimation.p_boolean_document,
        conf=direct_confirmation_measure.log_conditional_probability,
        aggr=aggregation.arithmetic_mean),
    'c_v': make_pipeline(
        seg=segmentation.s_one_set,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=indirect_confirmation_measure.cosine_similarity,
        aggr=aggregation.arithmetic_mean),
    'c_uci': make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean),
    'c_npmi': make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean),
}

# Default sliding-window sizes per measure, used when the caller passes window_size=None.
sliding_windows_dict = {
    'c_v': 110,
    'c_uci': 10,
    'c_npmi': 10,
}
class CoherenceModel(interfaces.TransformationABC):
    """
    Objects of this class allow for building and maintaining a model for topic
    coherence.

    The main methods are:

    1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
    2. the ``get_coherence()`` method, which returns the topic coherence.

    One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
    provided if the model does not contain a dictionary already.

    >>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass')  # tm is the trained topic model
    >>> cm.get_coherence()

    Another way of using this feature is through providing tokenized topics such as:

    >>> topics = [['human', 'computer', 'system', 'interface'],
                  ['graph', 'minors', 'trees', 'eps']]
    >>> cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')  # note that a dictionary has to be provided.
    >>> cm.get_coherence()

    Model persistency is achieved via its load/save methods.
    """
    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10):
        """
        Args:
        ----
        model : Pre-trained topic model. Should be provided if topics is not provided.
        topics : List of tokenized topics. If this is preferred over model, dictionary should be provided.
                 eg. topics = [['human', 'machine', 'computer', 'interface'],
                               ['graph', 'trees', 'binary', 'widths']]
        texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
        corpus : Gensim document corpus.
        dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed.
                     If both are provided, dictionary will be used.
        window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their
                      probability estimator. For 'u_mass' this doesn't matter.
                      If left 'None' the default window sizes are used which are:
                      'c_v' : 110
                      'c_uci' : 10
                      'c_npmi' : 10
        coherence : Coherence measure to be used. Supported values are:
                    'u_mass'
                    'c_v'
                    'c_uci' also popularly known as c_pmi
                    'c_npmi'
                    For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
                    For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
        topn : Integer corresponding to the number of top words to be extracted from each topic.

        Raises:
        ------
        ValueError : if neither model nor topics is given, if topics are given without a
                     dictionary, if neither texts nor corpus is given, or if the requested
                     coherence measure is unsupported / lacks its required input.
        """
        if model is None and topics is None:
            raise ValueError("One of model or topics has to be provided.")
        elif topics is not None and dictionary is None:
            raise ValueError("dictionary has to be provided if topics are to be used.")
        if texts is None and corpus is None:
            raise ValueError("One of texts or corpus has to be provided.")
        # Resolve the associated dictionary: an explicit argument wins; otherwise fall
        # back to the model's id2word (which must be a real dictionary, not a FakeDict).
        if dictionary is None:
            if isinstance(model.id2word, FakeDict):
                raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
                                 " should be set as the associated dictionary.")
            else:
                self.dictionary = model.id2word
        else:
            self.dictionary = dictionary
        # Check for correct inputs for u_mass coherence measure.
        if coherence in boolean_document_based:
            if is_corpus(corpus)[0]:
                self.corpus = corpus
            elif texts is not None:
                # No corpus given — build one from the tokenized texts.
                self.texts = texts
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
            else:
                raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)
        # Check for correct inputs for sliding-window based coherence measures.
        elif coherence in sliding_window_based:
            self.window_size = window_size
            if texts is None:
                raise ValueError("'texts' should be provided for %s coherence." % coherence)
            else:
                self.texts = texts
        else:
            raise ValueError("%s coherence is not currently supported." % coherence)
        self.topn = topn
        self.model = model
        if model is not None:
            self.topics = self._get_topics()
        elif topics is not None:
            # Convert token strings to dictionary ids, one id-array per topic.
            self.topics = [np.array([self.dictionary.token2id[token] for token in topic])
                           for topic in topics]
        self.coherence = coherence

    def __str__(self):
        return coherence_dict[self.coherence].__str__()

    def _get_topics(self):
        """Internal helper function to return topics from a trained topic model."""
        # Each supported model type exposes its topic-term weights differently;
        # once extracted, the top-n term ids are selected the same way for all.
        if isinstance(self.model, LdaModel):
            topic_vectors = self.model.state.get_lambda()
        elif isinstance(self.model, LdaVowpalWabbit):
            topic_vectors = self.model._get_topics()
        elif isinstance(self.model, LdaMallet):
            topic_vectors = self.model.word_topics
        else:
            raise ValueError("This topic model is not currently supported. Supported topic models are "
                             "LdaModel, LdaVowpalWabbit and LdaMallet.")
        return [argsort(topic, topn=self.topn, reverse=True) for topic in topic_vectors]

    def get_coherence(self):
        """
        Return coherence value based on pipeline parameters.
        """
        measure = coherence_dict[self.coherence]
        segmented_topics = measure.seg(self.topics)
        if self.coherence in boolean_document_based:
            per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics)
            confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs)
        elif self.coherence in sliding_window_based:
            # BUG FIX: the condition was inverted (`is not None`), which overwrote any
            # user-supplied window size with the default and left a None size untouched.
            # Only fall back to the measure's default when no size was provided.
            if self.window_size is None:
                self.window_size = sliding_windows_dict[self.coherence]
            per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics,
                                                           dictionary=self.dictionary, window_size=self.window_size)
            if self.coherence == 'c_v':
                confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
            else:
                # c_npmi normalizes the log-ratio measure; c_uci does not.
                normalize = self.coherence == 'c_npmi'
                confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize)
        return measure.aggr(confirmed_measures)