class FType_TEXT(FType_Base):
    """
    Plain text, detected by trying to decode the data with a few popular encodings.

    The encodings tried are the most popular ones according to wikipedia:
    https://en.wikipedia.org/wiki/Popularity_of_text_encodings#Popularity_internally_in_software

    Maybe should add 'windows-1251' (cyrillic) or 'big5' (Chinese) or other formats popular in Asia?

    Implementation is rather hacky, but we do not need a perfect solution here (which would be to use
    libmagic) and determining encoding is really not easy.
    """
    filetype = FTYPE.TEXT
    name = 'plain text'
    longname = 'unclassified plain text'
    extensions = ['txt',]
    content_types = ('text/plain',)   # behave like `file` on linux
    PUID = 'x-fmt/111'
    # byte order marks that settle the encoding without any guessing
    BOM_ENCODINGS = (
        (codecs.BOM_UTF8, 'utf8'),
        (codecs.BOM_UTF16_LE, 'utf-16le'),
        (codecs.BOM_UTF16_BE, 'utf-16be'),
    )
    # encodings we try to decode the bytes with; from limited to more general
    ENCODINGS = ('ascii', 'latin1', 'utf8', 'utf-16le', 'utf-16be')
    CHECK_SIZE = 4096    # do not try to decode megabytes of data, just check the beginning
    # share of the checked bytes that may show up as escape sequences before an encoding is rejected
    MAX_BAD_RATIO = 0.05

    @staticmethod
    def _count_escapes(text):
        """
        Count the hex and unicode escapes that `repr` needs to display `text`.

        :param text: decoded [unicode] string
        :return: number of `\\x` and `\\u` escapes in the representation of `text`
        """
        # repr shows a literal backslash as two backslashes. Drop those pairs first,
        # otherwise text like "C:\users" would be counted as a unicode escape.
        rep = repr(text).replace('\\\\', '')
        return rep.count(r'\x') + rep.count(r'\u')

    @classmethod
    def recognize(cls, ftg):
        """
        Try to determine whether this data makes sense as encoded text.

        If yes, set :py:data:`ftg.text_encoding`.

        :param ftg: file type guesser; its `data` attribute (bytes) is inspected
        :return: True if the data looks like text in one of the known encodings, False otherwise
        """
        data = ftg.data

        # first, try a few simple ones: a BOM names the encoding, so the data either
        # decodes with exactly that encoding or it is not text at all
        for bom, encoding in cls.BOM_ENCODINGS:
            if data.startswith(bom):
                try:
                    data.decode(encoding, 'strict')
                except UnicodeError:
                    return False
                ftg.text_encoding = encoding
                return True

        # no BOM? then try to decode the first part using various encodings
        # could also check if every 2nd byte is zero in 90% of time. If so, this is probably utf16
        chunk = data[:cls.CHECK_SIZE]
        truncated = len(data) > len(chunk)
        # The limit must relate to the bytes that are actually checked. Scaling it with the
        # size of the whole file would let every big binary pass as latin1.
        max_bad_chars = len(chunk) * cls.MAX_BAD_RATIO
        for encoding in cls.ENCODINGS:
            # An incremental decoder keeps a character that was split by the cut in its
            # buffer instead of raising, so a truncated chunk does not rule out the
            # correct multi-byte encoding.
            decoder = codecs.getincrementaldecoder(encoding)(errors='strict')
            try:
                decoded = decoder.decode(chunk, final=not truncated)
            except UnicodeError:
                continue
            # e.g. in latin1 everything "is valid" but looks horrible
            if cls._count_escapes(decoded) > max_bad_chars:
                continue
            ftg.text_encoding = encoding
            return True
        return False
def open_text(self, str_data):
    """
    Open a text file containing VBA or VBScript source code

    The text is taken over unchanged as the source code of all modules.

    :param str_data: file contents in a [unicode] string
    :return: nothing
    """
    message = 'Opening text file %s' % self.filename
    log.info(message)
    # Nothing is decoded or parsed here: the string is stored exactly as it was handed in
    self.contains_vba_macros = True
    self.vba_code_all_modules = str_data
    # record the kind of file last, once the source code is in place
    self.type = TYPE_TEXT
def test_encoding(self):
    """
    Check whether text file encoding is detected correctly.

    Every sample named `test-sample-<encoding>[-nobom|-withbom].txt` in sub-directory
    `basic` must be recognized as plain text in the encoding given by its name.
    """
    # Either path separator is accepted after the sub-directory, the dot before the
    # extension is matched literally, and the name has to end right after it
    pattern = re.compile(r'basic[/\\]test-sample-(ascii|latin1|utf[816_lbe]+)'
                         r'(?:-nobom|-withbom)?\.txt$')
    n_matches = 0
    for filename, file_contents in loop_over_files(subdir='basic'):
        match = pattern.match(filename)
        if not match:
            continue
        n_matches += 1
        # sample names spell the encoding with underscores, ftguess uses dashes;
        # strip both so that the two spellings can be compared
        expect_encoding = match.group(1).replace('_', '')
        guess = ftguess.ftype_guess(data=file_contents)
        self.assertEqual(guess.ftype, ftguess.FType_TEXT,
                         'Test sample {0} was not recognized as plain text'
                         .format(filename))
        self.assertEqual(guess.text_encoding.replace('-', ''), expect_encoding,
                         'Unexpected encoding for test sample {0}'
                         .format(filename))
    # a test that silently checks nothing is worthless, e.g. after renaming the samples
    self.assertGreater(n_matches, 0, 'Found no text encoding test samples')
- self.assertEqual(value_dict['vba'], 'Yes') + self.assertEqual(value_dict['vba'], 'Yes', + "Unexpected value '{0}' for test sample {1}'" + .format(value_dict['xlm'], filename)) + elif filename.startswith('basic/test-sample-'): # not clear what macro detection should do with text files + self.assertIn(value_dict['vba'], ('Yes', 'Error'), + "Unexpected value '{0}' for test sample {1}'" + .format(value_dict['vba'], filename)) else: - self.assertIn(value_dict['vba'], ('No', 'Error')) + self.assertIn(value_dict['vba'], ('No', 'Error'), + "Unexpected value '{0}' for test sample {1}'" + .format(value_dict['xlm'], filename)) def test_flash(self): """Test indicator for flash.""" diff --git a/tests/ooxml/test_basic.py b/tests/ooxml/test_basic.py index e21942b13..90d85d427 100644 --- a/tests/ooxml/test_basic.py +++ b/tests/ooxml/test_basic.py @@ -39,7 +39,7 @@ def test_rough_doctype(self): # files that are neither OLE nor xml: except_files = 'empty', 'text' - except_extns = 'rtf', 'csv', 'zip', 'slk' + except_extns = 'rtf', 'csv', 'zip', 'slk', 'txt' # analyse all files in data dir # TODO: use testdata_reader to extract real data from zip files diff --git a/tests/test-data/basic/test-sample-ascii.txt b/tests/test-data/basic/test-sample-ascii.txt new file mode 100644 index 000000000..700629332 --- /dev/null +++ b/tests/test-data/basic/test-sample-ascii.txt @@ -0,0 +1,2 @@ +Test sample file without special chars or emjois, +encoded using ascii diff --git a/tests/test-data/basic/test-sample-latin1.txt b/tests/test-data/basic/test-sample-latin1.txt new file mode 100644 index 000000000..ee5d8d38c --- /dev/null +++ b/tests/test-data/basic/test-sample-latin1.txt @@ -0,0 +1,2 @@ +Test sample file with special chars äöüß, +encoded using latin1 diff --git a/tests/test-data/basic/test-sample-utf8-nobom.txt b/tests/test-data/basic/test-sample-utf8-nobom.txt new file mode 100644 index 000000000..63fa88781 --- /dev/null +++ b/tests/test-data/basic/test-sample-utf8-nobom.txt @@ -0,0 +1,2 
@@ +Test sample file with special chars äöüß and emojis 😇🙊, +encoded using utf8 diff --git a/tests/test-data/basic/test-sample-utf8-withbom.txt b/tests/test-data/basic/test-sample-utf8-withbom.txt new file mode 100644 index 000000000..52872f5f2 --- /dev/null +++ b/tests/test-data/basic/test-sample-utf8-withbom.txt @@ -0,0 +1,2 @@ +Test sample file with special chars äöüß and emojis 😇🙊, +encoded using utf8 diff --git a/tests/test-data/basic/test-sample-utf_16_be-nobom.txt b/tests/test-data/basic/test-sample-utf_16_be-nobom.txt new file mode 100644 index 000000000..917d2587e Binary files /dev/null and b/tests/test-data/basic/test-sample-utf_16_be-nobom.txt differ diff --git a/tests/test-data/basic/test-sample-utf_16_be-withbom.txt b/tests/test-data/basic/test-sample-utf_16_be-withbom.txt new file mode 100644 index 000000000..9987b12f6 Binary files /dev/null and b/tests/test-data/basic/test-sample-utf_16_be-withbom.txt differ diff --git a/tests/test-data/basic/test-sample-utf_16_le-nobom.txt b/tests/test-data/basic/test-sample-utf_16_le-nobom.txt new file mode 100644 index 000000000..e6bde2d72 Binary files /dev/null and b/tests/test-data/basic/test-sample-utf_16_le-nobom.txt differ diff --git a/tests/test-data/basic/test-sample-utf_16_le-withbom.txt b/tests/test-data/basic/test-sample-utf_16_le-withbom.txt new file mode 100644 index 000000000..6796e734e Binary files /dev/null and b/tests/test-data/basic/test-sample-utf_16_le-withbom.txt differ