Source code for nltk.parse.util

# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
# Copyright (C) 2001-2015 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


"""
Utility functions for parsers.
"""
from __future__ import print_function

from nltk.grammar import CFG, FeatureGrammar, PCFG
from nltk.data import load

from nltk.parse.chart import Chart, ChartParser
from nltk.parse.pchart import InsideChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser

[docs]def load_parser(grammar_url, trace=0,
                parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        and higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.
        If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart;
        should be ``Chart`` or a subclass.
        Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, "
                         "or a subclass thereof.")
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)

    else: # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)

def taggedsent_to_conll(sentence):
	"""
	A module to convert a single POS tagged sentence into CONLL format.
	
	>>> from nltk import word_tokenize, pos_tag
	>>> text = "This is a foobar sentence."
	>>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
	... 	print(line, end="")
        1	This	_	DT	DT	_	0	a	_	_
        2	is	_	VBZ	VBZ	_	0	a	_	_
        3	a	_	DT	DT	_	0	a	_	_
        4	foobar	_	JJ	JJ	_	0	a	_	_
        5	sentence	_	NN	NN	_	0	a	_	_
        6	.		_	.	.	_	0	a	_	_
	
	:param sentence: A single input sentence to parse
	:type sentence: list(tuple(str, str))
	:rtype: iter(str) 
	:return: a generator yielding a single sentence in CONLL format.
	"""
	for (i, (word, tag)) in enumerate(sentence, start=1):
		input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
		input_str = "\t".join(input_str) + "\n"
		yield input_str


def taggedsents_to_conll(sentences):
	"""
	A module to convert the a POS tagged document stream
	(i.e. list of list of tuples, a list of sentences) and yield lines 
	in CONLL format. This module yields one line per word and two newlines 
	for end of sentence. 

	>>> from nltk import word_tokenize, sent_tokenize, pos_tag
	>>> text = "This is a foobar sentence. Is that right?"
	>>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
	>>> for line in taggedsents_to_conll(sentences):
        ...     if line:
	...         print(line, end="")
        1	This	_	DT	DT	_	0	a	_	_
        2	is	_	VBZ	VBZ	_	0	a	_	_
        3	a	_	DT	DT	_	0	a	_	_
        4	foobar	_	JJ	JJ	_	0	a	_	_
        5	sentence	_	NN	NN	_	0	a	_	_
        6	.		_	.	.	_	0	a	_	_
        <BLANKLINE>
        <BLANKLINE>
        1	Is	_	VBZ	VBZ	_	0	a	_	_
        2	that	_	IN	IN	_	0	a	_	_
        3	right	_	NN	NN	_	0	a	_	_
        4	?	_	.	.	_	0	a	_	_
        <BLANKLINE>
        <BLANKLINE>

	:param sentences: Input sentences to parse
	:type sentence: list(list(tuple(str, str)))
	:rtype: iter(str) 
	:return: a generator yielding sentences in CONLL format.
	"""
	for sentence in sentences:
		for input_str in taggedsent_to_conll(sentence):
			yield input_str
		yield '\n\n'		

######################################################################
#{ Test Suites
######################################################################

[docs]class TestGrammar(object):
    """
    Unit tests for  CFG.
    """
[docs]    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar

        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject


[docs]    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:
         - grammatical (``accept``) and
         - ungrammatical (``reject``).
        If a sentence should parse accordng to the grammar, the value of
        ``trees`` will be a non-empty list. If a sentence should be rejected
        according to the grammar, then the value of ``trees`` will be None.
        """
        for test in self.suite:
            print(test['doc'] + ":", end=' ')
            for key in ['accept', 'reject']:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == 'accept':
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse'" % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse'" % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")

[docs]def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:
      - a bool, saying if the sentence is grammatical or not, or
      - an int, giving the number of parse trees is should have,
    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of tuple of sentences and expected results,
        where a sentence is a list of str,
        and a result is None, or bool, or int

    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    sentences = []
    for sentence in string.split('\n'):
        if sentence == '' or sentence[0] in comment_chars:
            continue
        split_info = sentence.split(':', 1)
        result = None
        if len(split_info) == 2:
            if split_info[0] in ['True','true','False','false']:
                result = split_info[0] in ['True','true']
                sentence = split_info[1]
            else:
                result = int(split_info[0])
                sentence = split_info[1]
        tokens = sentence.split()
        if tokens == []:
            continue
        sentences += [(tokens, result)]
    return sentences

# nose thinks it is a test
extract_test_sentences.__test__ = False