# Natural Language Toolkit: Tagset Mapping
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nathan Schneider <nathan@cmu.edu>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Interface for converting POS tags from various treebanks
to the universal tagset of Petrov, Das, & McDonald.
The tagset consists of the following 12 coarse tags:
VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation
@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/
"""
from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from os.path import join
from nltk.data import load
# Resource prefix joined with '<fileid>.map' when a map file is loaded.
_UNIVERSAL_DATA = "taggers/universal_tagset"

# The 12 coarse tags that make up the universal tagset.
_UNIVERSAL_TAGS = (
    'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP',
    'CONJ', 'DET', 'NUM', 'PRT', 'X', '.',
)

# Nested lookup table indexed as _MAPPINGS[source][target][tag].  Every level
# is created on demand, and a mapping applied to an unrecognized tag
# returns 'UNK'.
_MAPPINGS = defaultdict(
    lambda: defaultdict(
        lambda: defaultdict(lambda: 'UNK')
    )
)
def _load_universal_map(fileid):
    """
    Load the ``<fileid>.map`` resource and record its contents in
    ``_MAPPINGS[fileid]['universal']``.

    Every non-blank line of the resource is expected to consist of a
    fine-grained tag and a universal (coarse) tag separated by one tab.
    Tags that are later looked up but were not listed in the file map
    to ``'X'``.

    :param fileid: identifier of the tagset; used both as the basename of
        the map file (without the ``.map`` extension) and as the
        first-level key in ``_MAPPINGS``
    :raise AssertionError: if a coarse tag is not one of
        ``_UNIVERSAL_TAGS``, or if a fine-grained tag is listed twice
    """
    # Load first, so a failed load leaves _MAPPINGS untouched.
    contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    mapping = _MAPPINGS[fileid]['universal']
    mapping.default_factory = lambda: 'X'

    for line in contents.splitlines():
        line = line.strip()
        if not line:
            continue
        fine, coarse = line.split('\t')
        # Raised explicitly instead of via ``assert`` so the validation is
        # not stripped when Python runs with -O; the exception type and
        # messages are the same as before.
        if coarse not in _UNIVERSAL_TAGS:
            raise AssertionError('Unexpected coarse tag: {}'.format(coarse))
        # A membership test does not trigger the defaultdict factory, so it
        # only reports tags that were really stored already.
        if fine in mapping:
            raise AssertionError('Multiple entries for original tag: {}'.format(fine))
        mapping[fine] = coarse
def map_tag(source, target, source_tag):
    """
    Maps the tag from the source tagset to the target tagset.

    >>> map_tag('en-ptb', 'universal', 'VBZ')
    'VERB'
    >>> map_tag('en-ptb', 'universal', 'VBP')
    'VERB'
    >>> map_tag('en-ptb', 'universal', '``')
    '.'

    :param source: name of the tagset that ``source_tag`` belongs to; when
        ``target`` is ``'universal'`` the aliases ``'wsj'`` and ``'brown'``
        are rewritten to ``'en-ptb'`` and ``'en-brown'``
    :param target: name of the tagset to map into
    :param source_tag: the tag to convert
    :return: the entry for ``source_tag`` in the mapping returned by
        ``tagset_mapping(source, target)``
    """
    # we need a systematic approach to naming
    if target == 'universal':
        # Translate corpus-style names to the identifiers used for lookup.
        if source == 'wsj':
            source = 'en-ptb'
        elif source == 'brown':
            source = 'en-brown'

    return tagset_mapping(source, target)[source_tag]