Source code for nltk.classify.textcat

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle, 
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the known languages and the
text to be identified, then compares the profiles using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""

# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals

from nltk.compat import PY3
from nltk.util import trigrams

if PY3:
    from sys import maxsize
else:
    from sys import maxint

# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
    import regex as re
except ImportError:
    re = None
######################################################################
##  Language identification using TextCat
######################################################################

class TextCat(object):
    """Language identifier based on the TextCat trigram algorithm.

    A text is reduced to a frequency profile of character trigrams,
    and that profile is compared against the trigram profile of every
    language loaded from the ``crubadan`` corpus using the
    "out-of-place" rank distance.  The language with the smallest
    total distance is the guess.
    """

    _corpus = None
    fingerprints = {}
    # Markers wrapped around every token so that word-initial and
    # word-final trigrams are distinguishable from word-internal ones.
    _START_CHAR = "<"
    _END_CHAR = ">"

    # Distances computed by the most recent guess_language() call.
    last_distances = {}

    def __init__(self):
        """Bind the crubadan corpus and pre-load every language profile.

        :raises EnvironmentError: if the third-party ``regex`` module
            could not be imported at module load time.
        """
        if not re:
            raise EnvironmentError("classify.textcat requires the regex module that "
                                   "supports unicode. Try '$ pip install regex' and "
                                   "see https://pypi.python.org/pypi/regex for "
                                   "further details.")

        from nltk.corpus import crubadan
        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Return ``text`` with all punctuation except apostrophes removed."""
        # Double negative: "not (non-punctuation or apostrophe)", i.e.
        # every Unicode punctuation character other than the apostrophe.
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create a FreqDist of the character trigrams within ``text``.

        Punctuation is stripped, the text is word-tokenized, and each
        token is padded with the start/end markers before its trigrams
        are counted.
        """
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for token in tokens:
            padded = self._START_CHAR + token + self._END_CHAR
            for tri in trigrams(padded):
                # Missing keys count as zero, so no membership test is needed.
                fingerprint[''.join(tri)] += 1

        return fingerprint

    @staticmethod
    def _frequency_ranks(freq_dist):
        """Map every key of ``freq_dist`` to its rank by descending count.

        Rank 0 is the most frequent key.  The sort is stable, so keys
        with equal counts keep the mapping's own iteration order.
        """
        ordered = sorted(freq_dist, key=freq_dist.get, reverse=True)
        return dict((ngram, rank) for rank, ngram in enumerate(ordered))

    @staticmethod
    def _missing_trigram_penalty():
        """Distance charged for a trigram the language profile lacks.

        Arbitrary, but should be larger than any possible trigram file
        length in terms of total lines.
        """
        return maxsize if PY3 else maxint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure for a single trigram.

        The measure is the absolute difference between the trigram's
        frequency rank in the language profile and its frequency rank
        in the text profile.

        :param lang: language code understood by the corpus reader.
        :param trigram: the trigram to score.
        :param text_profile: mapping of trigram to count for the text,
            as produced by :meth:`profile`.
        :return: the rank difference, or a very large penalty when the
            language profile does not contain the trigram.
        :raises ValueError: if the trigram is in the language profile
            but not in ``text_profile``.
        """
        # presumably lang_freq() returns a trigram -> count mapping;
        # verify against the crubadan corpus reader.
        lang_ranks = self._frequency_ranks(self._corpus.lang_freq(lang))
        if trigram not in lang_ranks:
            return self._missing_trigram_penalty()

        text_ranks = self._frequency_ranks(text_profile)
        if trigram not in text_ranks:
            raise ValueError("%r is not in the text profile" % (trigram,))

        return abs(lang_ranks[trigram] - text_ranks[trigram])

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between the text and
        all loaded languages.

        :return: dict mapping each language code to the summed
            per-trigram distance for ``text``.
        """
        text_ranks = self._frequency_ranks(self.profile(text))
        penalty = self._missing_trigram_penalty()

        distances = {}
        for lang in self._corpus._all_lang_freq.keys():
            # Build each rank table once per language rather than once
            # per trigram; the per-trigram arithmetic below is the same
            # as in calc_dist().
            lang_ranks = self._frequency_ranks(self._corpus.lang_freq(lang))

            lang_dist = 0
            for trigram, text_rank in text_ranks.items():
                lang_rank = lang_ranks.get(trigram)
                if lang_rank is None:
                    lang_dist += penalty
                else:
                    lang_dist += abs(lang_rank - text_rank)

            distances[lang] = lang_dist

        return distances

    def guess_language(self, text):
        """Find the language with the min distance to the text and
        return its ISO 639-3 code.

        The full distance table is kept in ``self.last_distances``.
        When ``text`` yields no trigrams every distance is 0, so the
        result is simply the first language in iteration order.

        :raises ValueError: if no language profiles are loaded.
        """
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
def demo():
    """Guess the language of several UDHR translations and print the results.

    For each listed UDHR file a sample is assembled from its sentences,
    the first 140 characters are printed, and the ISO 639-3 code
    guessed by :class:`TextCat` is printed next to a readable name.
    """
    from nltk.corpus import udhr

    langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
             'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
             'Serbian_Srpski-UTF8', 'Esperanto-UTF8']

    friendly = {'kmr': 'Northern Kurdish',
                'abk': 'Abkhazian',
                'pes': 'Iranian Persian',
                'hin': 'Hindi',
                'haw': 'Hawaiian',
                'rus': 'Russian',
                'vie': 'Vietnamese',
                'srp': 'Serbian',
                'epo': 'Esperanto'}

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        # NOTE(review): the final sentence is left out, as the original
        # loop bound did; confirm whether that was intentional.
        rows = len(raw_sentences) - 1

        # Generate a sample text of the language: every word is
        # preceded by a single space.
        words = []
        for i in range(rows):
            words.extend(raw_sentences[i])
        sample = ''.join(' ' + word for word in words)

        # Try to detect what it is
        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        # The classifier may return any loaded language code, not only
        # the ones named above, so fall back instead of raising KeyError.
        print('Language detection: %s (%s)' % (guess, friendly.get(guess, 'unknown')))
        print('#' * 140)


if __name__ == '__main__':
    demo()