Source code for nltk.parse.dependencygraph

# Natural Language Toolkit: Dependency Grammars
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (modifications)
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#

"""
Tools for reading and writing dependency trees.
The input is assumed to be in Malt-TAB format
(http://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
"""
from __future__ import print_function, unicode_literals

from collections import defaultdict
from itertools import chain
from pprint import pformat
import subprocess
import warnings

from nltk.tree import Tree
from nltk.compat import python_2_unicode_compatible, string_types


#################################################################
# DependencyGraph Class
#################################################################


@python_2_unicode_compatible
[docs]class DependencyGraph(object): """ A container for the nodes and labelled edges of a dependency structure. """
[docs] def __init__(self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'): """Dependency graph. We place a dummy `TOP` node with the index 0, since the root node is often assigned 0 as its head. This also means that the indexing of the nodes corresponds directly to the Malt-TAB format, which starts at 1. If zero-based is True, then Malt-TAB-like input with node numbers starting at 0 and the root node assigned -1 (as produced by, e.g., zpar). :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. """ self.nodes = defaultdict(lambda: {'address': None, 'word': None, 'lemma': None, 'ctag': None, 'tag': None, 'feats': None, 'head': None, 'deps': defaultdict(list), 'rel': None, }) self.nodes[0].update( { 'ctag': 'TOP', 'tag': 'TOP', 'address': 0, } ) self.root = None if tree_str: self._parse( tree_str, cell_extractor=cell_extractor, zero_based=zero_based, cell_separator=cell_separator, top_relation_label=top_relation_label, )
[docs] def remove_by_address(self, address): """ Removes the node with the given address. References to this node in others will still exist. """ del self.nodes[address]
[docs] def redirect_arcs(self, originals, redirect): """ Redirects arcs to any of the nodes in the originals list to the redirect node address. """ for node in self.nodes.values(): new_deps = [] for dep in node['deps']: if dep in originals: new_deps.append(redirect) else: new_deps.append(dep) node['deps'] = new_deps
[docs] def add_arc(self, head_address, mod_address): """ Adds an arc from the node specified by head_address to the node specified by the mod address. """ relation = self.nodes[mod_address]['rel'] self.nodes[head_address]['deps'].setdefault(relation, []) self.nodes[head_address]['deps'][relation].append(mod_address)
#self.nodes[head_address]['deps'].append(mod_address)
[docs] def connect_graph(self): """ Fully connects all non-root nodes. All nodes are set to be dependents of the root node. """ for node1 in self.nodes.values(): for node2 in self.nodes.values(): if node1['address'] != node2['address'] and node2['rel'] != 'TOP': relation = node2['rel'] node1['deps'].setdefault(relation, []) node1['deps'][relation].append(node2['address'])
#node1['deps'].append(node2['address'])
[docs] def get_by_address(self, node_address): """Return the node with the given address.""" return self.nodes[node_address]
[docs] def contains_address(self, node_address): """ Returns true if the graph contains a node with the given node address, false otherwise. """ return node_address in self.nodes
[docs] def to_dot(self): """Return a dot representation suitable for using with Graphviz. >>> dg = DependencyGraph( ... 'John N 2\\n' ... 'loves V 0\\n' ... 'Mary N 2' ... ) >>> print(dg.to_dot()) digraph G{ edge [dir=forward] node [shape=plaintext] <BLANKLINE> 0 [label="0 (None)"] 0 -> 2 [label="ROOT"] 1 [label="1 (John)"] 2 [label="2 (loves)"] 2 -> 1 [label=""] 2 -> 3 [label=""] 3 [label="3 (Mary)"] } """ # Start the digraph specification s = 'digraph G{\n' s += 'edge [dir=forward]\n' s += 'node [shape=plaintext]\n' # Draw the remaining nodes for node in sorted(self.nodes.values(), key=lambda v: v['address']): s += '\n%s [label="%s (%s)"]' % (node['address'], node['address'], node['word']) for rel, deps in node['deps'].items(): for dep in deps: if rel is not None: s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel) else: s += '\n%s -> %s ' % (node['address'], dep) s += "\n}" return s
def _repr_svg_(self): """Show SVG representation of the transducer (IPython magic). >>> dg = DependencyGraph( ... 'John N 2\\n' ... 'loves V 0\\n' ... 'Mary N 2' ... ) >>> dg._repr_svg_().split('\\n')[0] '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' """ dot_string = self.to_dot() try: process = subprocess.Popen( ['dot', '-Tsvg'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, ) except OSError: raise Exception('Cannot find the dot binary from Graphviz package') out, err = process.communicate(dot_string) if err: raise Exception( 'Cannot create svg representation by running dot from string: {}' ''.format(dot_string)) return out def __str__(self): return pformat(self.nodes) def __repr__(self): return "<DependencyGraph with {0} nodes>".format(len(self.nodes)) @staticmethod
[docs] def load(filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'): """ :param filename: a name of a file in Malt-TAB format :param zero_based: nodes in the input file are numbered starting from 0 rather than 1 (as produced by, e.g., zpar) :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. :return: a list of DependencyGraphs """ with open(filename) as infile: return [ DependencyGraph( tree_str, zero_based=zero_based, cell_separator=cell_separator, top_relation_label=top_relation_label, ) for tree_str in infile.read().split('\n\n') ]
[docs] def left_children(self, node_index): """ Returns the number of left children under the node specified by the given address. """ children = chain.from_iterable(self.nodes[node_index]['deps'].values()) index = self.nodes[node_index]['address'] return sum(1 for c in children if c < index)
[docs] def right_children(self, node_index): """ Returns the number of right children under the node specified by the given address. """ children = chain.from_iterable(self.nodes[node_index]['deps'].values()) index = self.nodes[node_index]['address'] return sum(1 for c in children if c > index)
[docs] def add_node(self, node): if not self.contains_address(node['address']): self.nodes[node['address']].update(node)
def _parse(self, input_, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'): """Parse a sentence. :param extractor: a function that given a tuple of cells returns a 7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``. :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. """ def extract_3_cells(cells, index): word, tag, head = cells return index, word, word, tag, tag, '', head, '' def extract_4_cells(cells, index): word, tag, head, rel = cells return index, word, word, tag, tag, '', head, rel def extract_7_cells(cells, index): line_index, word, lemma, tag, _, head, rel = cells try: index = int(line_index) except ValueError: # index can't be parsed as an integer, use default pass return index, word, lemma, tag, tag, '', head, rel def extract_10_cells(cells, index): line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells try: index = int(line_index) except ValueError: # index can't be parsed as an integer, use default pass return index, word, lemma, ctag, tag, feats, head, rel extractors = { 3: extract_3_cells, 4: extract_4_cells, 7: extract_7_cells, 10: extract_10_cells, } if isinstance(input_, string_types): input_ = (line for line in input_.split('\n')) lines = (l.rstrip() for l in input_) lines = (l for l in lines if l) cell_number = None for index, line in enumerate(lines, start=1): cells = line.split(cell_separator) if cell_number is None: cell_number = len(cells) else: assert cell_number == len(cells) if cell_extractor is None: try: cell_extractor = extractors[cell_number] except KeyError: raise ValueError( 'Number of tab-delimited fields ({0}) not supported by ' 'CoNLL(10) or Malt-Tab(4) format'.format(cell_number) ) try: index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells, index) except (TypeError, ValueError): # cell_extractor doesn't take 2 arguments or doesn't return 8 # values; assume the cell_extractor is an older external # extractor and doesn't accept or return an index. word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) if head == '_': continue head = int(head) if zero_based: head += 1 self.nodes[index].update( { 'address': index, 'word': word, 'lemma': lemma, 'ctag': ctag, 'tag': tag, 'feats': feats, 'head': head, 'rel': rel, } ) # Make sure that the fake root node has labeled dependencies. if (cell_number == 3) and (head == 0): rel = top_relation_label self.nodes[head]['deps'][rel].append(index) if self.nodes[0]['deps'][top_relation_label]: root_address = self.nodes[0]['deps'][top_relation_label][0] self.root = self.nodes[root_address] self.top_relation_label = top_relation_label else: warnings.warn( "The graph doesn't contain a node " "that depends on the root element." ) def _word(self, node, filter=True): w = node['word'] if filter: if w != ',': return w return w def _tree(self, i): """ Turn dependency graphs into NLTK trees. :param int i: index of a node :return: either a word (if the indexed node is a leaf) or a ``Tree``. """ node = self.get_by_address(i) word = node['word'] deps = sorted(chain.from_iterable(node['deps'].values())) if deps: return Tree(word, [self._tree(dep) for dep in deps]) else: return word
[docs] def tree(self): """ Starting with the ``root`` node, build a dependency tree using the NLTK ``Tree`` constructor. Dependency labels are omitted. """ node = self.root word = node['word'] deps = sorted(chain.from_iterable(node['deps'].values())) return Tree(word, [self._tree(dep) for dep in deps])
[docs] def triples(self, node=None): """ Extract dependency triples of the form: ((head word, head tag), rel, (dep word, dep tag)) """ if not node: node = self.root head = (node['word'], node['ctag']) for i in sorted(chain.from_iterable(node['deps'].values())): dep = self.get_by_address(i) yield (head, dep['rel'], (dep['word'], dep['ctag'])) for triple in self.triples(node=dep): yield triple
def _hd(self, i): try: return self.nodes[i]['head'] except IndexError: return None def _rel(self, i): try: return self.nodes[i]['rel'] except IndexError: return None # what's the return type? Boolean or list?
[docs] def contains_cycle(self): """Check whether there are cycles. >>> dg = DependencyGraph(treebank_data) >>> dg.contains_cycle() False >>> cyclic_dg = DependencyGraph() >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0} >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1} >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2} >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3} >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4} >>> cyclic_dg.nodes = { ... 0: top, ... 1: child1, ... 2: child2, ... 3: child3, ... 4: child4, ... } >>> cyclic_dg.root = top >>> cyclic_dg.contains_cycle() [3, 1, 2, 4] """ distances = {} for node in self.nodes.values(): for dep in node['deps']: key = tuple([node['address'], dep]) distances[key] = 1 for _ in self.nodes: new_entries = {} for pair1 in distances: for pair2 in distances: if pair1[1] == pair2[0]: key = tuple([pair1[0], pair2[1]]) new_entries[key] = distances[pair1] + distances[pair2] for pair in new_entries: distances[pair] = new_entries[pair] if pair[0] == pair[1]: path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0]) return path return False # return []?
[docs] def get_cycle_path(self, curr_node, goal_node_index): for dep in curr_node['deps']: if dep == goal_node_index: return [curr_node['address']] for dep in curr_node['deps']: path = self.get_cycle_path(self.get_by_address(dep), goal_node_index) if len(path) > 0: path.insert(0, curr_node['address']) return path return []
[docs] def to_conll(self, style): """ The dependency graph in CoNLL format. :param style: the style to use for the format (3, 4, 10 columns) :type style: int :rtype: str """ if style == 3: template = '{word}\t{tag}\t{head}\n' elif style == 4: template = '{word}\t{tag}\t{head}\t{rel}\n' elif style == 10: template = '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n' else: raise ValueError( 'Number of tab-delimited fields ({0}) not supported by ' 'CoNLL(10) or Malt-Tab(4) format'.format(style) ) return ''.join(template.format(i=i, **node) for i, node in sorted(self.nodes.items()) if node['tag'] != 'TOP')
[docs] def nx_graph(self): """Convert the data in a ``nodelist`` into a networkx labeled directed graph.""" import networkx nx_nodelist = list(range(1, len(self.nodes))) nx_edgelist = [ (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n) ] self.nx_labels = {} for n in nx_nodelist: self.nx_labels[n] = self.nodes[n]['word'] g = networkx.MultiDiGraph() g.add_nodes_from(nx_nodelist) g.add_edges_from(nx_edgelist) return g
class DependencyGraphError(Exception): """Dependency graph exception.""" def demo(): malt_demo() conll_demo() conll_file_demo() cycle_finding_demo() def malt_demo(nx=False): """ A demonstration of the result of reading a dependency version of the first sentence of the Penn Treebank. """ dg = DependencyGraph("""Pierre NNP 2 NMOD Vinken NNP 8 SUB , , 2 P 61 CD 5 NMOD years NNS 6 AMOD old JJ 2 NMOD , , 2 P will MD 0 ROOT join VB 8 VC the DT 11 NMOD board NN 9 OBJ as IN 9 VMOD a DT 15 NMOD nonexecutive JJ 15 NMOD director NN 12 PMOD Nov. NNP 9 VMOD 29 CD 16 NMOD . . 9 VMOD """) tree = dg.tree() tree.pprint() if nx: # currently doesn't work import networkx from matplotlib import pylab g = dg.nx_graph() g.info() pos = networkx.spring_layout(g, dim=1) networkx.draw_networkx_nodes(g, pos, node_size=50) # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8) networkx.draw_networkx_labels(g, pos, dg.nx_labels) pylab.xticks([]) pylab.yticks([]) pylab.savefig('tree.png') pylab.show() def conll_demo(): """ A demonstration of how to read a string representation of a CoNLL format dependency tree. """ dg = DependencyGraph(conll_data1) tree = dg.tree() tree.pprint() print(dg) print(dg.to_conll(4)) def conll_file_demo(): print('Mass conll_read demo...') graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry] for graph in graphs: tree = graph.tree() print('\n') tree.pprint() def cycle_finding_demo(): dg = DependencyGraph(treebank_data) print(dg.contains_cycle()) cyclic_dg = DependencyGraph() cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}) cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}) cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}) cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}) cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}) print(cyclic_dg.contains_cycle()) treebank_data = """Pierre NNP 2 NMOD Vinken NNP 8 SUB , , 2 P 61 CD 5 NMOD years NNS 6 AMOD old JJ 2 NMOD , , 2 P will MD 0 ROOT join VB 8 VC the DT 11 NMOD board NN 9 OBJ as IN 9 VMOD a DT 15 NMOD nonexecutive JJ 15 NMOD director NN 12 PMOD Nov. NNP 9 VMOD 29 CD 16 NMOD . . 9 VMOD """ conll_data1 = """ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . Punc Punc punt 12 punct _ _ """ conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _ 2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _ 4 wild wild Adj Adj attr|stell|onverv 5 mod _ _ 5 zwaaien zwaai N N soort|mv|neut 2 vc _ _ 6 . . Punc Punc punt 5 punct _ _ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . Punc Punc punt 12 punct _ _ 1 Dat dat Pron Pron aanw|neut|attr 2 det _ _ 2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _ 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ 4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _ 5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _ 6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _ 7 . . Punc Punc punt 6 punct _ _ 1 Het het Pron Pron onbep|neut|zelfst 2 su _ _ 2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 bij bij Prep Prep voor 2 ld _ _ 4 de de Art Art bep|zijdofmv|neut 6 det _ _ 5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _ 6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _ 7 die die Pron Pron betr|neut|zelfst 6 mod _ _ 8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _ 9 ginds ginds Adv Adv gew|aanw 12 mod _ _ 10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _ 11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _ 12 gelaten laat V V trans|verldw|onverv 11 vc _ _ 13 . . Punc Punc punt 12 punct _ _ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _ 3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _ 4 naast naast Prep Prep voor 11 mod _ _ 5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _ 6 op op Prep Prep voor 11 ld _ _ 7 de de Art Art bep|zijdofmv|neut 8 det _ _ 8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _ 9 kunnen kan V V hulp|inf 2 vc _ _ 10 gaan ga V V hulp|inf 9 vc _ _ 11 liggen lig V V intrans|inf 10 vc _ _ 12 . . Punc Punc punt 11 punct _ _ 1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _ 2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _ 3 mams mams N N soort|ev|neut 4 det _ _ 4 rug rug N N soort|ev|neut 5 obj1 _ _ 5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _ 6 hebben heb V V hulp|inf 2 vc _ _ 7 en en Conj Conj neven 0 ROOT _ _ 8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _ 9 de de Art Art bep|zijdofmv|neut 10 det _ _ 10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _ 11 . . Punc Punc punt 10 punct _ _ 1 Of of Conj Conj onder|metfin 0 ROOT _ _ 2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _ 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ 4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _ 5 met met Prep Prep voor 10 mod _ _ 6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _ 7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _ 8 rond rond Adv Adv deelv 10 svp _ _ 9 kunnen kan V V hulp|inf 3 vc _ _ 10 slenteren slenter V V intrans|inf 9 vc _ _ 11 in in Prep Prep voor 10 mod _ _ 12 de de Art Art bep|zijdofmv|neut 13 det _ _ 13 buurt buurt N N soort|ev|neut 11 obj1 _ _ 14 van van Prep Prep voor 13 mod _ _ 15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _ 16 . . Punc Punc punt 15 punct _ _ """ if __name__ == '__main__': demo()