Feature/port to py3 #184

Open
wants to merge 33 commits into develop

Changes from 27 commits

33 commits
702dbf1
Uses pycld2 instead of the (outdate) chrom[...]tector
flavioamieiro Nov 23, 2016
cb3d1d2
Removes pyparsing from requirements
flavioamieiro Nov 23, 2016
915efa7
fix cld import
geron Nov 23, 2016
0589592
prevent mongo from connecting at import time
geron Nov 23, 2016
0b4ccf6
run 2to3
geron Nov 24, 2016
e41028e
Removes redundant try/except block in urlparse import
flavioamieiro Nov 24, 2016
ccfb5d9
Pins celery version
flavioamieiro Nov 24, 2016
01a5fa6
Removes unnecessary cast to list that 2to3 inserted
flavioamieiro Nov 24, 2016
b16be95
Fixes test that expected str but receives bytes
flavioamieiro Nov 24, 2016
21aa0a6
Adds test to make sure the 'process' method receives the expected data
flavioamieiro Nov 24, 2016
7d540d0
Fixes existing base task test
flavioamieiro Nov 24, 2016
aa4478a
Uses BytesIO instead of StringIO in wordcloud
flavioamieiro Nov 24, 2016
d311b74
Changes Wordcloud test not to touch the database
flavioamieiro Nov 24, 2016
65c07b1
Changes palavras_raw test to not touch the database
flavioamieiro Nov 24, 2016
9c8f952
Fix freqdist test and sorting
geron Nov 25, 2016
05594a1
fix spellchecker tests
geron Nov 25, 2016
7b31c98
spellchecker: warn if dictionary is missing
geron Nov 25, 2016
00cce60
fix test_unknown_mimetype_should_be_flagged test
geron Nov 25, 2016
afaaa0b
Update TestExtractorWorker.test_unknown_encoding_should_be_ignored
geron Nov 25, 2016
427da7d
fix TestExtractorWorker.test_unescape_html_entities
geron Nov 25, 2016
2c0f8e8
fix TestExtractorWorker.test_should_detect_encoding_and_return_a_unic…
geron Nov 25, 2016
6989936
fix TestExtractorWorker.test_should_guess_mimetype_for_file_without_e…
geron Nov 25, 2016
17e47cb
updated more extractor tests
geron Nov 26, 2016
4eb5f61
fix extractor.extract_pdf
geron Nov 26, 2016
24c266f
Rewrite extractor.trial_decode and write tests for it
geron Nov 27, 2016
c084132
extractor: convert text to string before calling parse_html
geron Nov 27, 2016
8e67779
extractor: fix language detection
geron Nov 27, 2016
11c203c
extractor: remove checks for text being a str, it will always be
geron Dec 2, 2016
c6b3296
extractor: remove up to 1k bytes that cld says are invalid
geron Dec 2, 2016
25a8e54
SpellingChecker: no need to check for KeyError from document keys
geron Dec 2, 2016
573a111
extractor: turn redundant tests into integration test
geron Dec 6, 2016
0265786
extractor tests: support newer version of pdfinfo
geron Dec 6, 2016
7b84def
change bigram worker to return metric names and respect bigram order
geron Jan 31, 2017
12 changes: 6 additions & 6 deletions doc/conf.py
@@ -46,8 +46,8 @@
master_doc = 'index'

# General information about the project.
project = u'PyPLN'
copyright = u'2011, Flávio Codeço Coelho'
project = 'PyPLN'
copyright = '2011, Flávio Codeço Coelho'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -187,8 +187,8 @@
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'PyPLN.tex', u'PyPLN Documentation',
u'Flávio Codeço Coelho', 'manual'),
('index', 'PyPLN.tex', 'PyPLN Documentation',
'Flávio Codeço Coelho', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
@@ -220,6 +220,6 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'pypln', u'PyPLN Documentation',
[u'Flávio Codeço Coelho'], 1)
('index', 'pypln', 'PyPLN Documentation',
['Flávio Codeço Coelho'], 1)
]
2 changes: 1 addition & 1 deletion pypln/backend/celery_app.py
@@ -19,7 +19,7 @@

from celery import Celery
from kombu import Exchange, Queue
import config
from . import config

app = Celery('pypln_workers', backend='mongodb',
broker='amqp://', include=['pypln.backend.workers'])
2 changes: 1 addition & 1 deletion pypln/backend/celery_task.py
@@ -31,7 +31,7 @@
from pypln.backend import config


mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS)
mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS, _connect=False)
database = mongo_client[config.MONGODB_DBNAME]
document_collection = database[config.MONGODB_COLLECTION]

12 changes: 4 additions & 8 deletions pypln/backend/config.py
@@ -1,16 +1,12 @@
import os
import urllib.parse

from decouple import config, Csv

try:
import urlparse
except ImportError:
import urllib.parse as urlparse

def parse_url(url):
urlparse.uses_netloc.append('mongodb')
urlparse.uses_netloc.append('celery')
url = urlparse.urlparse(url)
urllib.parse.uses_netloc.append('mongodb')
urllib.parse.uses_netloc.append('celery')
url = urllib.parse.urlparse(url)

path = url.path[1:]
path = path.split('?', 2)[0]
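Editor's note — a quick usage sketch (mine, not part of the diff) of what urllib.parse returns for a MongoDB-style URL, which is what parse_url goes on to pick apart:

    import urllib.parse

    urllib.parse.uses_netloc.append('mongodb')  # as parse_url does above
    url = urllib.parse.urlparse('mongodb://user:secret@localhost:27017/pypln')
    # url.scheme == 'mongodb', url.hostname == 'localhost', url.port == 27017
    # url.username == 'user', url.password == 'secret', url.path == '/pypln'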
24 changes: 12 additions & 12 deletions pypln/backend/workers/__init__.py
@@ -17,18 +17,18 @@
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

from extractor import Extractor
from tokenizer import Tokenizer
from freqdist import FreqDist
from pos import POS
from statistics import Statistics
from bigrams import Bigrams
from palavras_raw import PalavrasRaw
from lemmatizer_pt import Lemmatizer
from palavras_noun_phrase import NounPhrase
from palavras_semantic_tagger import SemanticTagger
from word_cloud import WordCloud
from elastic_indexer import ElasticIndexer
from .extractor import Extractor
from .tokenizer import Tokenizer
from .freqdist import FreqDist
from .pos import POS
from .statistics import Statistics
from .bigrams import Bigrams
from .palavras_raw import PalavrasRaw
from .lemmatizer_pt import Lemmatizer
from .palavras_noun_phrase import NounPhrase
from .palavras_semantic_tagger import SemanticTagger
from .word_cloud import WordCloud
from .elastic_indexer import ElasticIndexer


__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics',
2 changes: 1 addition & 1 deletion pypln/backend/workers/bigrams.py
@@ -45,4 +45,4 @@ def process(self, document):
for m in metrics:
for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)):
br[res[0]].append(res[1])
return {'metrics': metrics, 'bigram_rank': br.items()}
return {'metrics': metrics, 'bigram_rank': list(br.items())}
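Editor's note — presumably the explicit list() is needed because in Python 3 dict.items() returns a view object rather than a list, and a view can't go straight into the MongoDB result document; a small illustration (mine, not from the PR):

    from collections import defaultdict

    br = defaultdict(list)
    br[('white', 'house')].append(0.5)
    type(br.items())   # <class 'dict_items'> -- a lazy view, not a list
    list(br.items())   # [(('white', 'house'), [0.5])] -- a plain list, safe to return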
108 changes: 53 additions & 55 deletions pypln/backend/workers/extractor.py
@@ -20,14 +20,14 @@
import base64
import shlex

from HTMLParser import HTMLParser
from html.parser import HTMLParser
from tempfile import NamedTemporaryFile
from os import unlink
from subprocess import Popen, PIPE
from mimetypes import guess_type
from re import compile as regexp_compile, DOTALL, escape

import cld
import pycld2 as cld
import magic

from pypln.backend.celery_task import PyPLNTask
@@ -84,10 +84,10 @@ def parse_html(html, remove_tags=None, remove_inside=None,
[''] * (total_to_remove - 2)
content_between[index + 1] = '\n'
complete_tags.append('')
result = ''.join(sum(zip(content_between, complete_tags), tuple()))
result = ''.join(sum(list(zip(content_between, complete_tags)), tuple()))
return clean(result)

def get_pdf_metadata(data):
def get_pdf_metadata(data: str) -> dict:
lines = data.strip().splitlines()
metadata = {}
for line in lines:
@@ -98,7 +98,7 @@ def get_pdf_metadata(data):
metadata[key.strip()] = value.strip()
return metadata

def extract_pdf(data):
def extract_pdf(data: bytes) -> (str, dict):
temp = NamedTemporaryFile(delete=False)
filename = temp.name
temp.close()
@@ -112,14 +112,16 @@ def extract_pdf(data):
unlink(filename + '_ind.html')
unlink(filename + 's.html')
text = parse_html(html.replace('&#160;', ' '), True, ['script', 'style'])
pdfinfo = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
stderr=PIPE)
meta_out, meta_err = pdfinfo.communicate(input=data)

info_process = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
stderr=PIPE)
meta_out, meta_err = info_process.communicate(input=data)
try:
metadata = get_pdf_metadata(meta_out)
except:
metadata = get_pdf_metadata(meta_out.decode('utf-8'))
except Exception:
# TODO: what should I do here?
metadata = {}
#TODO: what should I do here?

if not (text and metadata):
return '', {}
elif not html_err:
@@ -128,41 +128,30 @@
return '', {}


def trial_decode(text):
def decode_text_bytes(text: bytes) -> str:
"""
Tries to detect text encoding using `magic`. If the detected encoding is
not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding
as utf-8 replacing invalid chars with `U+FFFD` (the replacement character).

This is far from an ideal solution, but the extractor and the rest of the
pipeline need an unicode object.
Tries to detect text encoding using file magic. If that fails or the
detected encoding is not supported, tries using utf-8. If that doesn't work
tries using iso8859-1.
"""
with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
content_encoding = m.id_buffer(text)

forced_decoding = False
try:
result = text.decode(content_encoding)
except LookupError:
# If the detected encoding is not supported, we try to decode it as
# utf-8.
with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
content_encoding = m.id_buffer(text)
except magic.MagicError:
pass # This can happen for instance if text is a single char
else:
try:
result = text.decode('utf-8')
except UnicodeDecodeError:
# Is there a better way of doing this than nesting try/except
# blocks? This smells really bad.
try:
result = text.decode('iso-8859-1')
except UnicodeDecodeError:
# If neither utf-8 nor iso-885901 work are capable of handling
# this text, we just decode it using utf-8 and replace invalid
# chars with U+FFFD.
# Two somewhat arbitrary decisions were made here: use utf-8
# and use 'replace' instead of 'ignore'.
result = text.decode('utf-8', 'replace')
forced_decoding = True

return result, forced_decoding
return text.decode(content_encoding)
except LookupError: # The detected encoding is not supported
pass

try:
result = text.decode('utf-8')
except UnicodeDecodeError:
# Decoding with iso8859-1 doesn't raise UnicodeDecodeError, so this is
# a last resort.
result = text.decode('iso8859-1')
return result

Member

From what I understand here, the behavior didn't change because we were never executing result = text.decode('utf-8', 'replace') (since decoding with iso8859-1 never raises a UnicodeDecodeError), right? If that's the case, perfect. If not, I think it would be a good idea to keep the forced decoding.

Do you know why decoding with iso8859-1 never raises this exception? (I'm not doubting it, just didn't understand the reason :) )

Author

Because iso8859-1 is a single-byte encoding, every byte from \x00 to \xff is a "valid" char in it. If, for instance, you erroneously decode chars that were encoded in utf8, you just get a sequence of strange chars (a.k.a. mojibake) for each multi-byte utf8 char. If you can provide a test case that proves otherwise, I'd be glad to change my mind though.

A sort of proof for what I said about every byte being a valid iso8859-1 char:

In [1]: import struct

In [2]: for i in range(256):
   ...:     byte = struct.pack('B', i)
   ...:     char = byte.decode('iso8859-1')
   ...:     print(i, repr(char), char)
   ...:
[output truncated: every value from 0 ('\x00') through 255 ('ÿ') decodes without raising an exception]

Member

@geron that makes perfect sense to me. As I said, I didn't doubt it, I just didn't understand it before. Now I am a little ashamed of not realizing that.
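
Editor's note — a minimal sketch of the mojibake behaviour described above (mine, not part of this PR): decoding UTF-8 bytes as iso8859-1 never raises, it just yields the wrong characters, which is why it works as a last resort in decode_text_bytes.

    >>> 'ação'.encode('utf-8')
    b'a\xc3\xa7\xc3\xa3o'
    >>> 'ação'.encode('utf-8').decode('iso8859-1')
    'aÃ§Ã£o'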


class Extractor(PyPLNTask):
@@ -173,11 +164,12 @@ def process(self, file_data):
contents = base64.b64decode(file_data['contents'])
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
file_mime_type = m.id_buffer(contents)

metadata = {}
if file_mime_type == 'text/plain':
text = contents
elif file_mime_type == 'text/html':
text = parse_html(contents, True, ['script', 'style'])
if file_mime_type in ('text/plain', 'text/html'):
text = decode_text_bytes(contents)
if file_mime_type == 'text/html':
text = parse_html(text, True, ['script', 'style'])
elif file_mime_type == 'application/pdf':
text, metadata = extract_pdf(contents)
else:
Expand All @@ -191,9 +183,7 @@ def process(self, file_data):
return {'mimetype': 'unknown', 'text': "",
'file_metadata': {}, 'language': ""}

text, forced_decoding = trial_decode(text)

if isinstance(text, unicode):
if isinstance(text, str):
# HTMLParser only handles unicode objects. We can't pass the text
# through it if we don't know the encoding, and it's possible we
# also shouldn't. There's no way of knowing if it's a badly encoded
@@ -203,10 +193,18 @@

text = clean(text)

if isinstance(text, unicode):
language = cld.detect(text.encode('utf-8'))[1]
if isinstance(text, str):
languages = cld.detect(text.encode('utf-8'))[2]
else:
language = cld.detect(text)[1]

return {'text': text, 'file_metadata': metadata, 'language': language,
'mimetype': file_mime_type, 'forced_decoding': forced_decoding}
languages = cld.detect(text)[2]

detected_language = None
if languages:
detected_language = languages[0][1]

# TODO: check for uses of forced_decoding and remove them
return {'text': text,
'file_metadata': metadata,
'language': detected_language,
'mimetype': file_mime_type,
'forced_decoding': None}
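Editor's note (based on pycld2's documented interface; not part of this diff): detect() returns a 3-tuple (is_reliable, bytes_found, details), where details holds (language_name, language_code, percent, score) entries padded with 'Unknown', so details[0][1] is the code of the most likely language. A rough sketch:

    import pycld2 as cld

    is_reliable, bytes_found, details = cld.detect('este é um texto em português'.encode('utf-8'))
    # details is something like (('PORTUGUESE', 'pt', 97, 1024.0), ('Unknown', 'un', 0, 0.0), ...)
    language = details[0][1] if details else None   # 'pt'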
4 changes: 2 additions & 2 deletions pypln/backend/workers/freqdist.py
@@ -27,7 +27,7 @@ def process(self, document):
tokens = [info.lower() for info in document_tokens]
frequency_distribution = {token: tokens.count(token) \
for token in set(tokens)}
fd = frequency_distribution.items()
fd.sort(lambda x, y: cmp(y[1], x[1]))
fd = list(frequency_distribution.items())
fd.sort(key=lambda x: (-x[1], x[0]))

return {'freqdist': fd}
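Editor's note — a small sketch (mine, not from the PR) of what the key-based sort does: it orders by descending count and, unlike the old cmp-based sort, also breaks ties alphabetically, so the output is deterministic.

    freqs = {'the': 3, 'sat': 1, 'cat': 1}
    fd = list(freqs.items())
    fd.sort(key=lambda x: (-x[1], x[0]))
    # [('the', 3), ('cat', 1), ('sat', 1)]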
2 changes: 1 addition & 1 deletion pypln/backend/workers/palavras_noun_phrase.py
@@ -40,7 +40,7 @@ def process(self, document):
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
palavras_output = document['palavras_raw']
if isinstance(palavras_output, unicode):
if isinstance(palavras_output, str):
# we *need* to send a 'str' to the process. Otherwise it's going to try to use ascii.
palavras_output = palavras_output.encode('utf-8')
stdout, stderr = process.communicate(palavras_output)
19 changes: 10 additions & 9 deletions pypln/backend/workers/palavras_raw.py
@@ -39,14 +39,15 @@ def process(self, document):

text = document['text']

# For some reason, in some pypln installations the document['text'] is
# not always unicode as it should be. This may be due to errors during
# the decoding process that we fixed earlier. That meant that, when we
# got a non-unicode string, python would try to decode it using the
# default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we
# know the text came from mongodb, we can just decode it using utf-8 to
# make sure we have a unicode object.
if not isinstance(text, unicode):
# This code is here because when using python2 for some
# reason, sometimes document['text'] was not a unicode object
# (as it should be, coming from pymongo). Since we're now
# using python3, we should really always get a str (unicode)
# object. But, since we do not know the real reason for the
# original error, we will keep this code here for now. As
# before, if we receive a bytes object, since it came from
# mongodb we can be sure it will be encoded in utf-8.
if isinstance(text, bytes):
text = text.decode('utf-8')

process = subprocess.Popen([BASE_PARSER, PARSER_MODE],
@@ -55,4 +56,4 @@
stderr=subprocess.PIPE)
stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING))

return {'palavras_raw': stdout, 'palavras_raw_ran': True}
return {'palavras_raw': stdout.decode('utf-8'), 'palavras_raw_ran': True}