TextFilter.py

#!/usr/bin/env python3
# Copyright (c) 2008-11 Qtrac Ltd. All rights reserved.
# This program or module is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. It is provided for educational
# purposes and is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

"""
>>> vowel_counter = CharCounter()
>>> vowel_counter("dog fish and cat fish", "aeiou")     # returns: 5
5
>>> vowel_counter("there's too many junk lawsuits suing too many doctors",
...               "aeiouAEIOU")
16
>>> text = "The Title\\n=========\\nThe text\\n"
>>> rle_encoder = RunLengthEncode()
>>> rle = rle_encoder(text)
>>> rle, len(rle), len(text)
(b'The Title\\n\\x00\\t=\\nThe text\\n', 23, 29)
>>> rle_decoder = RunLengthDecode()
>>> string = rle_decoder(rle)
>>> string, len(string), string == text
('The Title\\n=========\\nThe text\\n', 29, True)
>>> text = "=============++++++++++--------------"
>>> rle = rle_encoder(text)
>>> rle, len(rle), len(text)
(b'\\x00\\r=\\x00\\n+\\x00\\x0e-', 9, 37)
>>> rle_decoder(rle) == text
True
"""


import abc


class TextFilter(metaclass=abc.ABCMeta):

    @abc.abstractproperty
    def is_transformer(self):
        raise NotImplementedError()


    @abc.abstractmethod
    def __call__(self):
        raise NotImplementedError()


class CharCounter(TextFilter):

    @property
    def is_transformer(self):
        return False


    def __call__(self, text, chars):
        count = 0
        for c in text:
            if c in chars:
                count += 1
        return count
        

class RunLengthEncode(TextFilter):

    @property
    def is_transformer(self):
        return True


    def __call__(self, utf8_string):
        byte = None
        count = 0
        binary = bytearray()
        for b in utf8_string.encode("utf8"):
            if byte is None:
                if b == 0:
                    binary.extend((0, 1, 0))
                else:
                    byte = b
                    count = 1
            else:
                if byte == b:
                    count += 1
                    if count == 255:
                        binary.extend((0, count, b))
                        byte = None
                        count = 0
                else:
                    if count == 1:
                        binary.append(byte)
                    elif count == 2:
                        binary.extend((byte, byte))
                    elif count > 2:
                        binary.extend((0, count, byte))
                    if b == 0:
                        binary.extend((0, 1, 0))
                        byte = None
                        count = 0
                    else:
                        byte = b
                        count = 1
        if count == 1:
            binary.append(byte)
        elif count == 2:
            binary.extend((byte, byte))
        elif count > 2:
            binary.extend((0, count, byte))
        return bytes(binary)
        

class RunLengthDecode(TextFilter):

    @property
    def is_transformer(self):
        return True


    def __call__(self, rle_bytes):
        binary = bytearray()
        length = None
        for b in rle_bytes:
            if length == 0:
                length = b
            elif length is not None:
                binary.extend([b for x in range(length)])
                length = None
            elif b == 0:
                length = 0
            else:
                binary.append(b)
                length = None
        if length:
            binary.extend([b for x in range(length)])
        return binary.decode("utf8")
        

if __name__ == "__main__":
    text = "The Story\n=========\n\nOnce upon a time..."
    rle_encoder = RunLengthEncode()
    rle_text = rle_encoder(text)
    rle_decoder = RunLengthDecode()
    original_text = rle_decoder(rle_text)
    assert text == original_text

    import doctest
    doctest.testmod()