From c6cb05536e180617cf146fed6399219d9205e1e4 Mon Sep 17 00:00:00 2001
From: Avital Pekker
Date: Mon, 26 Jan 2015 07:21:05 -0500
Subject: [PATCH] A language ID module using the TextCat algorithm, with
 language n-grams from the "An Crubadan" project.

---
 nltk_contrib/misc/textcat.py | 162 +++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 nltk_contrib/misc/textcat.py

diff --git a/nltk_contrib/misc/textcat.py b/nltk_contrib/misc/textcat.py
new file mode 100644
index 0000000..0a9ceac
--- /dev/null
+++ b/nltk_contrib/misc/textcat.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Language ID module using TextCat algorithm
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Avital Pekker
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for language identification using the TextCat algorithm,
+an implementation of the text categorization algorithm
+presented in Cavnar, W. B. and J. M. Trenkle,
+"N-Gram-Based Text Categorization".
+
+The algorithm takes advantage of Zipf's law: it uses
+n-gram frequencies to build rank-ordered profiles of the
+known languages and of the text to be identified, then
+compares the profiles using a distance measure.
+
+Language n-grams are provided by the "An Crubadan"
+project. A corpus reader was created separately to read
+those files.
+
+For details regarding the algorithm, see:
+http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
+
+For details about An Crubadan, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+# Ensure that literal strings in this module default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+
+import sys
+
+import nltk
+from nltk.corpus.reader.langid import CrubadanCorpusReader
+from nltk.tokenize import word_tokenize
+from nltk.probability import FreqDist
+
+# Note: this is NOT the "re" module you're likely used to. The regex
+# module is an alternative to the standard re module that supports
+# Unicode codepoint properties with the \p{} syntax.
+# You may have to "pip install regex".
+import regex as re
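+
+# The "out-of-place" measure in a nutshell: both a language profile and a
+# text profile are lists of trigrams ranked by frequency, and the distance
+# contributed by one trigram is how far apart its ranks are in the two
+# lists, with a large fixed penalty for trigrams missing from the language
+# profile. The helper below is a toy sketch of that idea over two
+# hypothetical rank lists; it is illustrative only and is not used by the
+# TextCat class.
+def _out_of_place_example():
+    lang_ranks = ['th', 'the', 'he', 'in', 'ing']  # hypothetical language profile
+    text_ranks = ['he', 'th', 'the', 'ing', 'nd']  # hypothetical text profile
+    dist = 0
+    for tri in text_ranks:
+        if tri in lang_ranks:
+            # Rank difference for trigrams present in both profiles.
+            dist += abs(lang_ranks.index(tri) - text_ranks.index(tri))
+        else:
+            # Penalty for a trigram the language profile has never seen.
+            dist += len(lang_ranks)
+    return dist  # the smaller the distance, the better the match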
+
+######################################################################
+## Language identification using TextCat
+######################################################################
+
+class TextCat(object):
+
+    _corpus = None
+    fingerprints = {}
+    _START_CHAR = "<"
+    _END_CHAR = ">"
+
+    def __init__(self):
+        self._corpus = CrubadanCorpusReader(nltk.data.find('corpora/langid'), r'.*\.txt')
+        self._corpus.load_all_ngrams()
+
+    def trigrams(self, text):
+        ''' Generate the 3-grams of the given text,
+            padded with start/end markers '''
+        padded_text = self._START_CHAR + text + self._END_CHAR
+        trigrams = []
+
+        for i in range(len(padded_text) - 2):
+            trigrams.append(padded_text[i:i + 3])
+
+        return trigrams
+
+    def _print_trigrams(self, trigrams):
+        for t in trigrams:
+            print(t)
+
+    def remove_punctuation(self, text):
+        ''' Get rid of punctuation except apostrophes '''
+        return re.sub(r"[^\P{P}\']+", "", text)
+
+    def profile(self, text):
+        ''' Create FreqDist of trigrams within text '''
+        clean_text = self.remove_punctuation(text)
+        tokens = word_tokenize(clean_text)
+
+        fingerprint = FreqDist()
+        for t in tokens:
+            for cur_trigram in self.trigrams(t):
+                if cur_trigram in fingerprint:
+                    fingerprint[cur_trigram] += 1
+                else:
+                    fingerprint[cur_trigram] = 1
+
+        return fingerprint
+
+    def calc_dist(self, lang, trigram, text_profile):
+        ''' Calculate the "out-of-place" measure between the
+            text and language profile for a single trigram '''
+        lang_fd = self._corpus.all_lang_freq[lang]
+
+        if trigram in lang_fd:
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)
+
+            dist = abs(idx_lang_profile - idx_text)
+        else:
+            # Arbitrary penalty, but it should be larger than
+            # any possible trigram file length
+            # in terms of total lines.
+            dist = sys.maxsize
+
+        return dist
+
+    def lang_dists(self, text):
+        ''' Calculate the "out-of-place" measure between
+            the text and all languages '''
+        distances = {}
+        profile = self.profile(text)
+
+        # For each language, sum the distance metric over
+        # every trigram in the input text to be identified.
+        for lang in self._corpus.all_lang_freq.keys():
+            lang_dist = 0
+            for trigram in profile:
+                lang_dist += self.calc_dist(lang, trigram, profile)
+
+            distances[lang] = lang_dist
+
+        return distances
+
+    def guess_language(self, text):
+        ''' Find the language with the minimum distance
+            to the text and return its human-friendly name '''
+        distances = self.lang_dists(text)
+        return self._corpus.iso_to_friendly(min(distances, key=distances.get))
+
+    def demo(self):
+        ''' Demo of language guessing using a bunch of UTF-8 encoded
+            text files with snippets of text copied from news websites
+            around the web in different languages '''
+        import io
+        from os import listdir
+        from os.path import isfile
+
+        # Current dir
+        path = '.'
+        lang_samples = []
+
+        for f in listdir(path):
+            if isfile(f) and re.match(r'sample_\w+\.txt', f):
+                lang_samples.append(f)
+
+        print(lang_samples)
+        for f in lang_samples:
+            with io.open(f, 'r', encoding='utf8') as cur_sample:
+                cur_data = cur_sample.read()
+            print('Language sample file: ' + f)
+            print('Contents snippet: ' + cur_data[0:140])
+            print('#################################################')
+            print('Language detection: ' + self.guess_language(cur_data))
+            print('#################################################')
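+
+# A minimal usage sketch, assuming the An Crubadan n-gram files are
+# installed where nltk.data.find('corpora/langid') can locate them.
+# The literal sentence below is an illustrative stand-in for real input;
+# any unicode string should work. To run the file-based demo instead,
+# place sample_*.txt files in the current directory and call tc.demo().
+if __name__ == '__main__':
+    tc = TextCat()
+    print(tc.guess_language('Ceci est un petit exemple de texte en langue naturelle.'))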