Source code for nltk.tag.brill

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see  LICENSE.TXT

from __future__ import print_function, division

from collections import defaultdict

from nltk.compat import Counter
from nltk.tag import TaggerI
from nltk.tbl import Feature, Template
from nltk import jsontags


######################################################################
# Brill Templates
######################################################################

@jsontags.register_tag
class Word(Feature):
    """
    Feature which examines the text (word) of nearby tokens.
    """

    json_tag = 'nltk.tag.brill.Word'

    @staticmethod
    def extract_property(tokens, index):
        """@return: The given token's text."""
        return tokens[index][0]


@jsontags.register_tag
class Pos(Feature):
    """
    Feature which examines the tags of nearby tokens.
    """

    json_tag = 'nltk.tag.brill.Pos'

    @staticmethod
    def extract_property(tokens, index):
        """@return: The given token's tag."""
        return tokens[index][1]


def nltkdemo18():
    """
    Return 18 templates, from the original nltk demo, in multi-feature syntax
    """
    return [
        Template(Pos([-1])),
        Template(Pos([1])),
        Template(Pos([-2])),
        Template(Pos([2])),
        Template(Pos([-2, -1])),
        Template(Pos([1, 2])),
        Template(Pos([-3, -2, -1])),
        Template(Pos([1, 2, 3])),
        Template(Pos([-1]), Pos([1])),
        Template(Word([-1])),
        Template(Word([1])),
        Template(Word([-2])),
        Template(Word([2])),
        Template(Word([-2, -1])),
        Template(Word([1, 2])),
        Template(Word([-3, -2, -1])),
        Template(Word([1, 2, 3])),
        Template(Word([-1]), Word([1])),
    ]


def nltkdemo18plus():
    """
    Return 18 templates, from the original nltk demo, and additionally a few
    multi-feature ones (the motivation is easy comparison with nltkdemo18)
    """
    return nltkdemo18() + [
        Template(Word([-1]), Pos([1])),
        Template(Pos([-1]), Word([1])),
        Template(Word([-1]), Word([0]), Pos([1])),
        Template(Pos([-1]), Word([0]), Word([1])),
        Template(Pos([-1]), Word([0]), Pos([1])),
    ]


def fntbl37():
    """
    Return 37 templates taken from the postagging task of the
    fntbl distribution http://www.cs.jhu.edu/~rflorian/fntbl/
    (37 is after excluding a handful which do not condition on Pos[0];
    fntbl can do that but the current nltk implementation cannot.)
    """
    return [
        Template(Word([0]), Word([1]), Word([2])),
        Template(Word([-1]), Word([0]), Word([1])),
        Template(Word([0]), Word([-1])),
        Template(Word([0]), Word([1])),
        Template(Word([0]), Word([2])),
        Template(Word([0]), Word([-2])),
        Template(Word([1, 2])),
        Template(Word([-2, -1])),
        Template(Word([1, 2, 3])),
        Template(Word([-3, -2, -1])),
        Template(Word([0]), Pos([2])),
        Template(Word([0]), Pos([-2])),
        Template(Word([0]), Pos([1])),
        Template(Word([0]), Pos([-1])),
        Template(Word([0])),
        Template(Word([-2])),
        Template(Word([2])),
        Template(Word([1])),
        Template(Word([-1])),
        Template(Pos([-1]), Pos([1])),
        Template(Pos([1]), Pos([2])),
        Template(Pos([-1]), Pos([-2])),
        Template(Pos([1])),
        Template(Pos([-1])),
        Template(Pos([-2])),
        Template(Pos([2])),
        Template(Pos([1, 2, 3])),
        Template(Pos([1, 2])),
        Template(Pos([-3, -2, -1])),
        Template(Pos([-2, -1])),
        Template(Pos([1]), Word([0]), Word([1])),
        Template(Pos([1]), Word([0]), Word([-1])),
        Template(Pos([-1]), Word([-1]), Word([0])),
        Template(Pos([-1]), Word([0]), Word([1])),
        Template(Pos([-2]), Pos([-1])),
        Template(Pos([1]), Pos([2])),
        Template(Pos([1]), Pos([2]), Word([1]))
    ]


def brill24():
    """
    Return 24 templates of the seminal TBL paper, Brill (1995)
    """
    return [
        Template(Pos([-1])),
        Template(Pos([1])),
        Template(Pos([-2])),
        Template(Pos([2])),
        Template(Pos([-2, -1])),
        Template(Pos([1, 2])),
        Template(Pos([-3, -2, -1])),
        Template(Pos([1, 2, 3])),
        Template(Pos([-1]), Pos([1])),
        Template(Pos([-2]), Pos([-1])),
        Template(Pos([1]), Pos([2])),
        Template(Word([-1])),
        Template(Word([1])),
        Template(Word([-2])),
        Template(Word([2])),
        Template(Word([-2, -1])),
        Template(Word([1, 2])),
        Template(Word([-1, 0])),
        Template(Word([0, 1])),
        Template(Word([0])),
        Template(Word([-1]), Pos([-1])),
        Template(Word([1]), Pos([1])),
        Template(Word([0]), Word([-1]), Pos([-1])),
        Template(Word([0]), Word([1]), Pos([1])),
    ]


def describe_template_sets():
    """
    Print the available template sets in this demo, with a short description"
    """
    import inspect
    import sys

    # a bit of magic to get all functions in this module
    templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
    for (name, obj) in templatesets:
        if name == "describe_template_sets":
            continue
        print(name, obj.__doc__, "\n")


######################################################################
# The Brill Tagger
######################################################################

@jsontags.register_tag
[docs]class BrillTagger(TaggerI): """ Brill's transformational rule-based tagger. Brill taggers use an initial tagger (such as ``tag.DefaultTagger``) to assign an initial tag sequence to a text; and then apply an ordered list of transformational rules to correct the tags of individual tokens. These transformation rules are specified by the ``TagRule`` interface. Brill taggers can be created directly, from an initial tagger and a list of transformational rules; but more often, Brill taggers are created by learning rules from a training corpus, using one of the TaggerTrainers available. """ json_tag = 'nltk.tag.BrillTagger'
[docs] def __init__(self, initial_tagger, rules, training_stats=None): """ :param initial_tagger: The initial tagger :type initial_tagger: TaggerI :param rules: An ordered list of transformation rules that should be used to correct the initial tagging. :type rules: list(TagRule) :param training_stats: A dictionary of statistics collected during training, for possible later use :type training_stats: dict """ self._initial_tagger = initial_tagger self._rules = tuple(rules) self._training_stats = training_stats
[docs] def encode_json_obj(self): return self._initial_tagger, self._rules, self._training_stats
@classmethod
[docs] def decode_json_obj(cls, obj): _initial_tagger, _rules, _training_stats = obj return cls(_initial_tagger, _rules, _training_stats)
[docs] def rules(self): """ Return the ordered list of transformation rules that this tagger has learnt :return: the ordered list of transformation rules that correct the initial tagging :rtype: list of Rules """ return self._rules
[docs] def train_stats(self, statistic=None): """ Return a named statistic collected during training, or a dictionary of all available statistics if no name given :param statistic: name of statistic :type statistic: str :return: some statistic collected during training of this tagger :rtype: any (but usually a number) """ if statistic is None: return self._training_stats else: return self._training_stats.get(statistic)
[docs] def tag(self, tokens): # Inherit documentation from TaggerI # Run the initial tagger. tagged_tokens = self._initial_tagger.tag(tokens) # Create a dictionary that maps each tag to a list of the # indices of tokens that have that tag. tag_to_positions = defaultdict(set) for i, (token, tag) in enumerate(tagged_tokens): tag_to_positions[tag].add(i) # Apply each rule, in order. Only try to apply rules at # positions that have the desired original tag. for rule in self._rules: # Find the positions where it might apply positions = tag_to_positions.get(rule.original_tag, []) # Apply the rule at those positions. changed = rule.apply(tagged_tokens, positions) # Update tag_to_positions with the positions of tags that # were modified. for i in changed: tag_to_positions[rule.original_tag].remove(i) tag_to_positions[rule.replacement_tag].add(i) return tagged_tokens
[docs] def print_template_statistics(self, test_stats=None, printunused=True): """ Print a list of all templates, ranked according to efficiency. If test_stats is available, the templates are ranked according to their relative contribution (summed for all rules created from a given template, weighted by score) to the performance on the test set. If no test_stats, then statistics collected during training are used instead. There is also an unweighted measure (just counting the rules). This is less informative, though, as many low-score rules will appear towards end of training. :param test_stats: dictionary of statistics collected during testing :type test_stats: dict of str -> any (but usually numbers) :param printunused: if True, print a list of all unused templates :type printunused: bool :return: None :rtype: None """ tids = [r.templateid for r in self._rules] train_stats = self.train_stats() trainscores = train_stats['rulescores'] assert len(trainscores) == len(tids), "corrupt statistics: " \ "{0} train scores for {1} rules".format(trainscores, tids) template_counts = Counter(tids) weighted_traincounts = Counter() for (tid, score) in zip(tids, trainscores): weighted_traincounts[tid] += score tottrainscores = sum(trainscores) # det_tplsort() is for deterministic sorting; # the otherwise convenient Counter.most_common() unfortunately # does not break ties deterministically # between python versions and will break cross-version tests def det_tplsort(tpl_value): return (tpl_value[1], repr(tpl_value[0])) def print_train_stats(): print("TEMPLATE STATISTICS (TRAIN) {0} templates, {1} rules)".format( len(template_counts), len(tids)) ) print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)) head = "#ID | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") train_tplscores = sorted(weighted_traincounts.items(), key=det_tplsort, reverse=True) for (tid, trainscore) in train_tplscores: s = "{0} | {1:5d} {2:5.3f} |{3:4d} {4:.3f} | {5}".format( tid, trainscore, trainscore/tottrainscores, template_counts[tid], template_counts[tid]/len(tids), Template.ALLTEMPLATES[int(tid)], ) print(s) def print_testtrain_stats(): testscores = test_stats['rulescores'] print("TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format( len(template_counts), len(tids)), ) print("TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats)) print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)) weighted_testcounts = Counter() for (tid, score) in zip(tids, testscores): weighted_testcounts[tid] += score tottestscores = sum(testscores) head = "#ID | Score (test) | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") test_tplscores = sorted(weighted_testcounts.items(), key=det_tplsort, reverse=True) for (tid, testscore) in test_tplscores: s = "{0:s} |{1:5d} {2:6.3f} | {3:4d} {4:.3f} |{5:4d} {6:.3f} | {7:s}".format( tid, testscore, testscore/tottestscores, weighted_traincounts[tid], weighted_traincounts[tid]/tottrainscores, template_counts[tid], template_counts[tid]/len(tids), Template.ALLTEMPLATES[int(tid)], ) print(s) def print_unused_templates(): usedtpls = set([int(tid) for tid in tids]) unused = [(tid, tpl) for (tid, tpl) in enumerate(Template.ALLTEMPLATES) if tid not in usedtpls] print("UNUSED TEMPLATES ({0})".format(len(unused))) for (tid, tpl) in unused: print("{0:03d} {1:s}".format(tid, tpl)) if test_stats is None: print_train_stats() else: print_testtrain_stats() print() if printunused: print_unused_templates() print()
[docs] def batch_tag_incremental(self, sequences, gold): """ Tags by applying each rule to the entire corpus (rather than all rules to a single sequence). The point is to collect statistics on the test set for individual rules. NOTE: This is inefficient (does not build any index, so will traverse the entire corpus N times for N rules) -- usually you would not care about statistics for individual rules and thus use batch_tag() instead :param sequences: lists of token sequences (sentences, in some applications) to be tagged :type sequences: list of list of strings :param gold: the gold standard :type gold: list of list of strings :returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule)) """ def counterrors(xs): return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair)) testing_stats = {} testing_stats['tokencount'] = sum(len(t) for t in sequences) testing_stats['sequencecount'] = len(sequences) tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences] testing_stats['initialerrors'] = counterrors(tagged_tokenses) testing_stats['initialacc'] = 1 - testing_stats['initialerrors']/testing_stats['tokencount'] # Apply each rule to the entire corpus, in order errors = [testing_stats['initialerrors']] for rule in self._rules: for tagged_tokens in tagged_tokenses: rule.apply(tagged_tokens) errors.append(counterrors(tagged_tokenses)) testing_stats['rulescores'] = [err0 - err1 for (err0, err1) in zip(errors, errors[1:])] testing_stats['finalerrors'] = errors[-1] testing_stats['finalacc'] = 1 - testing_stats['finalerrors']/testing_stats['tokencount'] return (tagged_tokenses, testing_stats)