# Natural Language Toolkit: Text Segmentation Metrics
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# David Doukhan <david.doukhan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Text Segmentation Metrics
1. Windowdiff
Pevzner, L., and Hearst, M., A Critique and Improvement of
an Evaluation Metric for Text Segmentation,
Computational Linguistics 28, 19-36
2. Generalized Hamming Distance
Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375
Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009
3. Pk text segmentation metric
Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
"""
try:
import numpy as np
except ImportError:
pass
from nltk.compat import xrange
[docs]def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
"""
Compute the windowdiff score for a pair of segmentations. A
segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.
>>> s1 = "000100000010"
>>> s2 = "000010000100"
>>> s3 = "100000010000"
>>> '%.2f' % windowdiff(s1, s1, 3)
'0.00'
>>> '%.2f' % windowdiff(s1, s2, 3)
'0.30'
>>> '%.2f' % windowdiff(s2, s3, 3)
'0.80'
:param seg1: a segmentation
:type seg1: str or list
:param seg2: a segmentation
:type seg2: str or list
:param k: window width
:type k: int
:param boundary: boundary value
:type boundary: str or int or bool
:param weighted: use the weighted variant of windowdiff
:type weighted: boolean
:rtype: float
"""
if len(seg1) != len(seg2):
raise ValueError("Segmentations have unequal length")
if k > len(seg1):
raise ValueError("Window width k should be smaller or equal than segmentation lengths")
wd = 0
for i in range(len(seg1) - k + 1):
ndiff = abs(seg1[i:i+k].count(boundary) - seg2[i:i+k].count(boundary))
if weighted:
wd += ndiff
else:
wd += min(1, ndiff)
return wd / (len(seg1) - k + 1.)
# Generalized Hamming Distance
def _init_mat(nrows, ncols, ins_cost, del_cost):
mat = np.empty((nrows, ncols))
mat[0, :] = ins_cost * np.arange(ncols)
mat[:, 0] = del_cost * np.arange(nrows)
return mat
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
for i, rowi in enumerate(rowv):
for j, colj in enumerate(colv):
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
if rowi == colj:
# boundaries are at the same location, no transformation required
tcost = mat[i, j]
elif rowi > colj:
# boundary match through a deletion
tcost = del_cost + mat[i, j + 1]
else:
# boundary match through an insertion
tcost = ins_cost + mat[i + 1, j]
mat[i + 1, j + 1] = min(tcost, shift_cost)
[docs]def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
"""
Compute the Generalized Hamming Distance for a reference and a hypothetical
segmentation, corresponding to the cost related to the transformation
of the hypothetical segmentation into the reference segmentation
through boundary insertion, deletion and shift operations.
A segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.
Recommended parameter values are a shift_cost_coeff of 2.
Associated with a ins_cost, and del_cost equal to the mean segment
length in the reference segmentation.
>>> # Same examples as Kulyukin C++ implementation
>>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
0.5
>>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
2.0
>>> ghd('011', '110', 1.0, 1.0, 0.5)
1.0
>>> ghd('1', '0', 1.0, 1.0, 0.5)
1.0
>>> ghd('111', '000', 1.0, 1.0, 0.5)
3.0
>>> ghd('000', '111', 1.0, 2.0, 0.5)
6.0
:param ref: the reference segmentation
:type ref: str or list
:param hyp: the hypothetical segmentation
:type hyp: str or list
:param ins_cost: insertion cost
:type ins_cost: float
:param del_cost: deletion cost
:type del_cost: float
:param shift_cost_coeff: constant used to compute the cost of a shift.
shift cost = shift_cost_coeff * |i - j| where i and j are
the positions indicating the shift
:type shift_cost_coeff: float
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
"""
ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
nref_bound = len(ref_idx)
nhyp_bound = len(hyp_idx)
if nref_bound == 0 and nhyp_bound == 0:
return 0.0
elif nref_bound > 0 and nhyp_bound == 0:
return nref_bound * ins_cost
elif nref_bound == 0 and nhyp_bound > 0:
return nhyp_bound * del_cost
mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
_ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
return mat[-1, -1]
# Beeferman's Pk text segmentation evaluation metric
[docs]def pk(ref, hyp, k=None, boundary='1'):
"""
Compute the Pk metric for a pair of segmentations A segmentation
is any sequence over a vocabulary of two items (e.g. "0", "1"),
where the specified boundary value is used to mark the edge of a
segmentation.
>>> '%.2f' % pk('0100'*100, '1'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0100'*100, 2)
'0.00'
:param ref: the reference segmentation
:type ref: str or list
:param hyp: the segmentation to evaluate
:type hyp: str or list
:param k: window size, if None, set to half of the average reference segment length
:type boundary: str or int or bool
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
"""
if k is None:
k = int(round(len(ref) / (ref.count(boundary) * 2.)))
err = 0
for i in xrange(len(ref)-k +1):
r = ref[i:i+k].count(boundary) > 0
h = hyp[i:i+k].count(boundary) > 0
if r != h:
err += 1
return err / (len(ref)-k +1.)
# skip doctests if numpy is not installed
def setup_module(module):
from nose import SkipTest
try:
import numpy
except ImportError:
raise SkipTest("numpy is required for nltk.metrics.segmentation")