# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
# Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
import os
import tempfile
from subprocess import PIPE
import warnings
from nltk.internals import find_file, find_jar, config_java, java, _java_options, find_jars_within_path
from nltk.tag.api import TaggerI
from nltk import compat
# Stanford NLP software page; passed as the ``url`` argument to find_jar()
# when the tagger jar is looked up in StanfordTagger.__init__.
_stanford_url = 'http://nlp.stanford.edu/software'
class StanfordTagger(TaggerI):
    """
    An interface to Stanford taggers. Subclasses must define:

    - ``_cmd`` property: A property that returns the command that will be
      executed.
    - ``_SEPARATOR``: Class constant that represents the character that
      is used to separate the tokens from their tags.
    - ``_JAR`` file: Class constant that represents the jar file name.
    """

    _SEPARATOR = ''
    _JAR = ''

    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
        """
        Locate the tagger jar and model and remember the run settings.

        :param model_filename: model file to load; resolved with
            ``find_file``, which is also given the ``STANFORD_MODELS``
            environment variable.
        :param path_to_jar: optional location of the tagger jar, handed to
            ``find_jar`` together with the class's ``_JAR`` name.
        :param encoding: encoding used to write the temporary input file,
            passed to the tagger as ``-encoding`` and used to decode its
            output.
        :param verbose: forwarded to ``find_jar`` and ``find_file``.
        :param java_options: options given to ``config_java`` for the
            duration of each tagging run.
        """
        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                          'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar,
            searchpath=(), url=_stanford_url,
            verbose=verbose)

        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS',), verbose=verbose)

        # Adding logging jar files to classpath: every jar found under the
        # directory of the main jar is put on the classpath.
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))

        self._encoding = encoding
        self.java_options = java_options

    @property
    def _cmd(self):
        """Return the java command line (a list of arguments); subclasses override."""
        raise NotImplementedError

    def tag(self, tokens):
        """
        Tag a single tokenized sentence.

        :param tokens: the tokens of one sentence.
        :return: the concatenation of the per-sentence results returned by
            ``tag_sents([tokens])``.
        """
        # This function should return list of tuple rather than list of list
        return sum(self.tag_sents([tokens]), [])

    def tag_sents(self, sentences):
        """
        Tag several tokenized sentences with one run of the Java tagger.

        The sentences are written to a temporary file (one sentence per
        line, tokens separated by single spaces), the command from ``_cmd``
        is executed on it, and the decoded output is handed to
        ``parse_output``.

        :param sentences: an iterable of token lists.
        :return: whatever ``parse_output`` returns for the tagger output.
        """
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)

        try:
            # Create a temporary input file. The path is stored on the
            # instance because the ``_cmd`` property reads it.
            input_fd, self._input_file_path = tempfile.mkstemp(text=True)
            try:
                # Write the actual sentences to the temporary input file.
                # The ``with`` block closes the handle even if encoding or
                # writing fails.
                with os.fdopen(input_fd, 'wb') as input_fh:
                    _input = '\n'.join(' '.join(x) for x in sentences)
                    if isinstance(_input, compat.text_type) and encoding:
                        _input = _input.encode(encoding)
                    input_fh.write(_input)

                cmd = list(self._cmd)
                cmd.extend(['-encoding', encoding])

                # Run the tagger and get the output
                stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                               stdout=PIPE, stderr=PIPE)
            finally:
                # Delete the temporary file, also when the run failed
                os.unlink(self._input_file_path)
        finally:
            # Return java configurations to their default values, also when
            # the run failed, so a failure does not leak this tagger's
            # options into later java() calls.
            config_java(options=default_options, verbose=False)

        stanpos_output = stanpos_output.decode(encoding)
        return self.parse_output(stanpos_output, sentences)

    def parse_output(self, text, sentences=None):
        """
        Convert raw tagger output into tagged sentences.

        Each line of ``text`` is one sentence; each whitespace-separated
        item is a token joined to its tag by ``_SEPARATOR``.

        :param text: decoded output of the tagger.
        :param sentences: unused here; accepted so that subclasses can use
            the original input.
        :return: a list with one list of ``(word, tag)`` tuples per line.
        """
        # Output the tagged sentences
        tagged_sentences = []
        for tagged_sentence in text.strip().split("\n"):
            sentence = []
            for tagged_word in tagged_sentence.strip().split():
                # Split on the LAST separator only: the tag is whatever
                # follows it, and the word keeps any separator characters
                # it contains itself (e.g. 'a_b_NN' -> ('a_b', 'NN')).
                # An item without a separator yields ('', item).
                word, _, tag = tagged_word.strip().rpartition(self._SEPARATOR)
                sentence.append((word, tag))
            tagged_sentences.append(sentence)
        return tagged_sentences
class StanfordPOSTagger(StanfordTagger):
    """
    A class for pos tagging with Stanford Tagger. The input is the paths to:

    - a model trained on training data
    - (optionally) the path to the stanford tagger jar file. If not specified here,
      then this jar file must be specified in the CLASSPATH environment variable.
    - (optionally) the encoding of the training data (default: UTF-8)

    Example:

        >>> from nltk.tag import StanfordPOSTagger
        >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
        >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
    """

    # Tokens and tags are joined as ``word_TAG`` in the tagger output.
    _SEPARATOR = '_'
    _JAR = 'stanford-postagger.jar'

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``StanfordTagger.__init__``."""
        super(StanfordPOSTagger, self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
        """Return the argument list that runs the MaxentTagger on the input file."""
        command = ['edu.stanford.nlp.tagger.maxent.MaxentTagger']
        command += ['-model', self._stanford_model]
        command += ['-textFile', self._input_file_path]
        # The input is already tokenized, one sentence per line.
        command += ['-tokenize', 'false']
        command += ['-outputFormatOptions', 'keepEmptySentences']
        return command
class StanfordNERTagger(StanfordTagger):
    """
    A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:

    - a model trained on training data
    - (optionally) the path to the stanford tagger jar file. If not specified here,
      then this jar file must be specified in the CLASSPATH environment variable.
    - (optionally) the encoding of the training data (default: UTF-8)

    Example:

        >>> from nltk.tag import StanfordNERTagger
        >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
        >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
         ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
         ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
    """

    # Tokens and tags are joined as ``word/TAG`` in the 'slashTags' output.
    _SEPARATOR = '/'
    _JAR = 'stanford-ner.jar'
    _FORMAT = 'slashTags'

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``StanfordTagger.__init__``."""
        super(StanfordNERTagger, self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
        """Return the argument list that runs the CRFClassifier on the input file."""
        # Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false for not using stanford Tokenizer
        return ['edu.stanford.nlp.ie.crf.CRFClassifier',
                '-loadClassifier', self._stanford_model, '-textFile',
                self._input_file_path, '-outputFormat', self._FORMAT,
                '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer',
                '-tokenizerOptions', '\"tokenizeNLs=false\"']

    def parse_output(self, text, sentences):
        """
        Convert raw 'slashTags' output into tagged sentences.

        The output is flattened into one list of ``(word, tag)`` tuples
        and then cut back into sentences using the token counts of the
        original input, so the line structure of ``text`` is not relied on.

        :param text: decoded output of the classifier.
        :param sentences: the token lists that were sent to the tagger;
            only their lengths are used.
        :return: a list with one list of ``(word, tag)`` tuples per input
            sentence.
        :raises NotImplementedError: if ``_FORMAT`` is not ``'slashTags'``.
        """
        if self._FORMAT != 'slashTags':
            raise NotImplementedError

        # Joint together to a big list. Splitting the whole text on
        # whitespace visits the same items as going line by line.
        tagged_tokens = []
        for tagged_word in text.split():
            # Split on the LAST separator only, so tokens that contain '/'
            # themselves (URLs, dates, fractions) keep their slashes.
            # An item without a separator yields ('', item).
            word, _, tag = tagged_word.rpartition(self._SEPARATOR)
            tagged_tokens.append((word, tag))

        # Separate it according to the input
        result = []
        start = 0
        for sent in sentences:
            end = start + len(sent)
            result.append(tagged_tokens[start:end])
            start = end
        return result