# -*- coding: utf-8 -*-
# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, which unfortunately I had no access to. The code is a
# Python version, with some minor modifications of mine, to the description
# presented at http://www.webcitation.org/5NnvdIzOb and to the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write me for any comments, including the
# development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.
# Este código é baseado no algoritmo apresentado no artigo "A Stemming
# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
# código é uma conversão para Python, com algumas pequenas modificações
# minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do
# código para linguagem C disponível em
# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
# do NLTK para o português para qualquer debate.
from __future__ import print_function, unicode_literals
from nltk.data import load
from nltk.stem.api import StemmerI
class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

        >>> from nltk.stem import RSLPStemmer
        >>> st = RSLPStemmer()
        >>> # opening lines of Erico Verissimo's "Música ao Longe"
        >>> text = '''
        ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
        ... devem copiar . Uma casinha de porta e janela , em cima duma
        ... coxilha .'''
        >>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE
        ...     print(st.stem(token))
        clariss risc com giz no quadro-negr a pais que os alun dev copi .
        uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        """
        Load the seven rule tables ("step0.pt" .. "step6.pt").

        The tables are stored in ``self._model`` in step order, so that
        ``self._model[i]`` holds the rules read from ``step<i>.pt``;
        ``stem()`` and ``apply_rule()`` refer to them by that index.
        """
        self._model = [self.read_rule("step%d.pt" % step) for step in range(7)]

    def read_rule(self, filename):
        """
        Read and parse one rule file from the ``stemmers/rslp`` data package.

        The file is loaded raw and decoded as UTF-8. Blank lines and lines
        starting with ``#`` are ignored. Every remaining line is split on
        tabs into four columns: a quoted suffix, an integer minimum stem
        size, a quoted replacement and a comma-separated list of quoted
        exception words.

        :param filename: name of the rule file, e.g. ``"step0.pt"``
        :return: a list of rules, each one being the list
            ``[suffix, minimum_stem_size, replacement, exceptions]``
        """
        raw = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")

        rules = []
        for line in raw.split("\n"):
            # skip blank lines and comments
            if line == "" or line.startswith("#"):
                continue

            # NOTE: a simple but ugly hack to make this parser happy with
            # double '\t's between columns
            tokens = line.replace("\t\t", "\t").split("\t")

            rules.append([
                tokens[0][1:-1],  # suffix to search for, quotes removed
                int(tokens[1]),  # minimum stem size to perform the replacement
                tokens[2][1:-1],  # replacement text, quotes removed
                # exceptions to this rule, quotes removed
                [token[1:-1] for token in tokens[3].split(",")],
            ])

        return rules

    def stem(self, word):
        """
        Return the stem of ``word``.

        The word is lowercased and then run through the rule tables in a
        fixed order: plural (only if it ends in 's'), feminine (only if it
        ends in 'a'), augmentative, adverb and noun reduction. Verb
        reduction is tried only if noun reduction changed nothing, and
        vowel removal only if verb reduction changed nothing either.

        :param word: the token to be stemmed; an empty string is returned
            unchanged
        :return: the stemmed, lowercased word
        """
        word = word.lower()

        # the word ends in 's'? apply rule for plural reduction
        # (endswith, unlike word[-1], is safe for an empty string)
        if word.endswith("s"):
            word = self.apply_rule(word, 0)

        # the word ends in 'a'? apply rule for feminine reduction
        if word.endswith("a"):
            word = self.apply_rule(word, 1)

        # augmentative reduction
        word = self.apply_rule(word, 3)

        # adverb reduction
        word = self.apply_rule(word, 2)

        # noun reduction
        prev_word = word
        word = self.apply_rule(word, 4)
        if word == prev_word:
            # verb reduction
            prev_word = word
            word = self.apply_rule(word, 5)
            if word == prev_word:
                # vowel removal
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        """
        Apply the first applicable rule of one rule table to ``word``.

        A rule is applicable when the word ends with the rule's suffix, the
        word is at least as long as the suffix plus the rule's minimum stem
        size, and the word is not listed among the rule's exceptions. The
        suffix is then replaced and no further rule of the table is tried.

        :param word: the word to be transformed
        :param rule_index: index into ``self._model`` of the table to use
        :return: the transformed word, or ``word`` itself if no rule applied
        """
        for suffix, min_stem_size, replacement, exceptions in self._model[rule_index]:
            suffix_length = len(suffix)

            # An empty suffix can never be stripped: word[:-0] would erase
            # the whole word instead of leaving it untouched.
            if not suffix_length:
                continue

            if (
                word.endswith(suffix)  # if suffix matches
                and len(word) >= suffix_length + min_stem_size  # minimum size
                and word not in exceptions  # if not an exception
            ):
                word = word[:-suffix_length] + replacement
                break

        return word