
Commit

Merge branch 'ci-test'
christian-intra2net committed Dec 22, 2023
2 parents c82bf50 + 29c9813 commit a96432f
Showing 13 changed files with 140 additions and 32 deletions.
77 changes: 76 additions & 1 deletion oletools/ftguess.py
@@ -62,6 +62,7 @@
import olefile
import logging
import optparse
import codecs

# import lxml or ElementTree for XML parsing:
try:
@@ -290,6 +291,78 @@ def recognize(cls, ftg):
class FType_Unknown(FType_Base):
pass

class FType_TEXT(FType_Base):
"""
Try a few popular encodings to detect whether this is just plain text.
We try the most popular encodings according to Wikipedia:
https://en.wikipedia.org/wiki/Popularity_of_text_encodings#Popularity_internally_in_software
Maybe we should add 'windows-1251' (Cyrillic), 'big5' (Chinese) or other encodings popular in Asia?
The implementation is rather hacky, but we do not need a perfect solution here (which would be to
use libmagic), and determining the encoding is really not easy.
"""
filetype = FTYPE.TEXT
name = 'plain text'
longname = 'unclassified plain text'
extensions = ['txt',]
content_types = ('text/plain',) # behave like `file` on linux
PUID = 'x-fmt/111'
# encodings we try to decode the bytes with; from limited to more general
ENCODINGS = ('ascii', 'latin1', 'utf8', 'utf-16le', 'utf-16be')
CHECK_SIZE = 4096 # do not try to decode megabytes of data, just check the beginning

@classmethod
def recognize(cls, ftg):
"""
Try to determine whether this data makes sense as encoded text.
If yes, set :py:data:`ftg.text_encoding`.
"""
# first, try a few simple ones:
if ftg.data.startswith(codecs.BOM_UTF8):
try:
_ = ftg.data.decode('utf8', errors='strict')
ftg.text_encoding = 'utf8'
return True
except UnicodeError:
return False
elif ftg.data.startswith(codecs.BOM_UTF16_LE):
try:
_ = ftg.data.decode('utf-16le', errors='strict')
ftg.text_encoding = 'utf-16le'
return True
except UnicodeError:
return False
elif ftg.data.startswith(codecs.BOM_UTF16_BE):
try:
_ = ftg.data.decode('utf-16be', errors='strict')
ftg.text_encoding = 'utf-16be'
return True
except UnicodeError:
return False

# no BOM? Then try to decode the first part using various encodings.
# Could also check whether every 2nd byte is zero about 90% of the time; if so, this is probably utf16.
for encoding in cls.ENCODINGS:
try:
data_size = len(ftg.data)
decoded = ftg.data[:cls.CHECK_SIZE].decode(encoding, errors='strict')
if data_size > cls.CHECK_SIZE:
rep = repr(decoded[:-10]) # remove the last characters, may be erroneous due to cutting
else:
rep = repr(decoded)
bad_chars = rep.count(r'\x') + rep.count(r'\u') # e.g. in latin1 everything "is valid" but looks horrible
if bad_chars > float(data_size) * 0.05:
continue
ftg.text_encoding = encoding
return True
except UnicodeError:
pass
return False


class FType_RTF(FType_Base):
container = CONTAINER.RTF
application = APP.MSWORD
@@ -802,6 +875,8 @@ def __init__(self, filepath=None, data=None):
# For XML:
self.root_xmltag = None
self.xmlroot = None
# For TEXT:
self.text_encoding = None

if filepath is None and data is None:
raise ValueError('FileTypeGuesser requires either a file path or file data, or both')
@@ -811,7 +886,7 @@ def __init__(self, filepath=None, data=None):
self.data_bytesio = io.BytesIO(self.data)

# Identify the main container type:
for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG):
for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG, FType_TEXT):
if ftype.recognize(self):
self.ftype = ftype
break
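For context, a minimal usage sketch of how the new detection surfaces through ftguess (not taken from the diff; the input string is made up). ftype_guess(), FType_TEXT and text_encoding are the pieces exercised by the tests further below; the UTF-16 input has no BOM, so it is caught by the fallback loop over ENCODINGS:

# minimal sketch, not part of this commit
from oletools import ftguess

data = u'MsgBox "hello"\r\n'.encode('utf-16le')    # plain text, UTF-16LE, no BOM
guess = ftguess.ftype_guess(data=data)
if guess.ftype == ftguess.FType_TEXT:
    # ascii/latin1/utf8 leave too many \x00 escapes in the repr, so utf-16le wins
    print('plain text, encoding: %s' % guess.text_encoding)    # expected: utf-16le
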
47 changes: 24 additions & 23 deletions oletools/olevba.py
@@ -2790,12 +2790,13 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D
raise FileOpenError(msg)
# Check if it is a SLK/SYLK file - https://en.wikipedia.org/wiki/SYmbolic_LinK_(SYLK)
# It must start with "ID" in uppercase; Excel allows no whitespace or newline before it:
if data.startswith(b'ID'):
self.open_slk(data)
# Check if this is a plain text VBA or VBScript file:
# To avoid scanning binary files, we simply check for some control chars:
if self.type is None and b'\x00' not in data:
self.open_text(data)
if self.type is None and self.ftg.ftype == ftguess.FType_TEXT:
data = bytes2str(data, self.ftg.text_encoding)
if data.startswith('ID'):
self.open_slk(data)
else:
# Check if this is a plain text VBA or VBScript file:
self.open_text(data)
if self.type is None:
# At this stage, could not match a known format:
msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
@@ -3101,51 +3102,51 @@ def open_ppt(self):
log.debug("File appears not to be a ppt file (%s)" % exc)


def open_slk(self, data):
def open_slk(self, str_data):
"""
Open a SLK file, which may contain XLM/Excel 4 macros
:param data: file contents in a bytes string
:param str_data: file contents in a [unicode] string
:return: nothing
"""
# TODO: Those results should be stored as XLM macros, not VBA
log.info('Opening SLK file %s' % self.filename)
xlm_macro_found = False
xlm_macros = []
xlm_macros.append('Formulas and XLM/Excel 4 macros extracted from SLK file:')
for line in data.splitlines(False):
if line.startswith(b'O'):
for line in str_data.splitlines(False):
if line.startswith('O'):
# Option: "O;E" indicates a macro sheet, must appear before NN and C rows
for s in line.split(b';'):
if s.startswith(b'E'):
for s in line.split(';'):
if s.startswith('E'):
xlm_macro_found = True
log.debug('SLK parser: found macro sheet')
elif line.startswith(b'NN') and xlm_macro_found:
elif line.startswith('NN') and xlm_macro_found:
# Name that can trigger a macro, for example "Auto_Open"
for s in line.split(b';'):
if s.startswith(b'N') and s.strip() != b'NN':
xlm_macros.append('Named cell: %s' % bytes2str(s[1:]))
elif line.startswith(b'C') and xlm_macro_found:
for s in line.split(';'):
if s.startswith('N') and s.strip() != 'NN':
xlm_macros.append('Named cell: %s' % s[1:])
elif line.startswith('C') and xlm_macro_found:
# Cell
for s in line.split(b';'):
if s.startswith(b'E'):
xlm_macros.append('Formula or Macro: %s' % bytes2str(s[1:]))
for s in line.split(';'):
if s.startswith('E'):
xlm_macros.append('Formula or Macro: %s' % s[1:])
if xlm_macro_found:
self.contains_xlm_macros = True
self.xlm_macros = xlm_macros
self.type = TYPE_SLK


def open_text(self, data):
def open_text(self, str_data):
"""
Open a text file containing VBA or VBScript source code
:param data: file contents in a string or bytes
:param str_data: file contents in a [unicode] string
:return: nothing
"""
log.info('Opening text file %s' % self.filename)
# directly store the source code:
# On Python 2, store it as a raw bytes string
# On Python 3, convert it to unicode assuming it was encoded with UTF-8
self.vba_code_all_modules = bytes2str(data)
self.vba_code_all_modules = str_data
self.contains_vba_macros = True
# set type only if parsing succeeds
self.type = TYPE_TEXT
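To sketch what the reworked branch in VBA_Parser enables (again not taken from the diff; 'sample.vbs' and the source snippet are made up): a UTF-16 encoded VBScript file, which the old b'\x00' check rejected, is now decoded with the encoding reported by ftguess before open_slk()/open_text() run:

# hedged sketch of the new code path; file name and content are illustrative
from oletools import olevba

vbs_source = u'Sub AutoOpen()\n    MsgBox "hi"\nEnd Sub\n'
data = vbs_source.encode('utf-16le')     # contains null bytes

parser = olevba.VBA_Parser('sample.vbs', data=data)
print(parser.type)                    # expected: olevba.TYPE_TEXT
print(parser.contains_vba_macros)     # expected: True
print(parser.vba_code_all_modules)    # source code, stored as a [unicode] string
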
22 changes: 18 additions & 4 deletions tests/ftguess/test_basic.py
@@ -2,6 +2,7 @@
import unittest
import os
from os.path import splitext, join
import re
from oletools import ftguess

# Directory with test data, independent of current working directory
@@ -13,7 +14,7 @@ class TestFTGuess(unittest.TestCase):
"""Test ftguess"""

def test_all(self):
"""Run all files in test-data and compare to known ouput"""
"""Run all files in test-data and compare to known output"""
# ftguess knows extension for each FType, create a reverse mapping
used_types = (
ftguess.FType_RTF, ftguess.FType_Generic_OLE,
@@ -30,7 +31,7 @@ def test_all(self):
ftguess.FType_Powerpoint2007_Slideshow,
ftguess.FType_Powerpoint2007_Macro,
ftguess.FType_Powerpoint2007_Slideshow_Macro,
ftguess.FType_XPS,
ftguess.FType_XPS, ftguess.FType_TEXT,
)
ftype_for_extension = dict()
for ftype in used_types:
@@ -44,10 +45,10 @@

# determine what we expect...
before_dot, extension = splitext(filename)
if extension == '.zip':
if extension == '.zip':  # zipped files are encrypted copies of other samples, so they do not alarm virus scanners
extension = splitext(before_dot)[1]
elif filename in (join('basic', 'empty'), join('basic', 'text')):
extension = '.csv' # have just like that
extension = '.txt' # behave as if this were simple plain text
elif not extension:
self.fail('Could not find extension for test sample {0}'
.format(filename))
@@ -104,6 +105,19 @@
self.assertEqual(guess.is_powerpoint(),
extension.startswith('p'))

def test_encoding(self):
"""Check whether text file encoding is detected correctly"""
n_matches = 0
for filename, file_contents in loop_over_files(subdir='basic'):
match = re.match(r'basic/test-sample-(ascii|latin1|utf[816_lbe]+)(?:-nobom|-withbom)?.txt', filename)
if not match:
continue
n_matches += 1
expect_encoding = match.groups()[0].replace('_', '')
guess = ftguess.ftype_guess(data=file_contents)
self.assertEqual(guess.ftype, ftguess.FType_TEXT)
self.assertEqual(guess.text_encoding.replace('-', ''), expect_encoding)
self.assertGreater(n_matches, 0)


# just in case somebody calls this file as a script
16 changes: 13 additions & 3 deletions tests/oleid/test_basic.py
@@ -120,13 +120,23 @@ def test_macros(self):

# xlm detection does not work in-memory (yet)
# --> xlm is "unknown" for excel files, except some encrypted files
self.assertIn(value_dict['xlm'], ('Unknown', 'No'))
self.assertIn(value_dict['xlm'], ('Unknown', 'No'),
"Unexpected value '{0}' for XLM-content in test sample {1}'"
.format(value_dict['xlm'], filename))

# "macro detection" in text files leads to interesting results:
if filename in find_vba: # no macros!
self.assertEqual(value_dict['vba'], 'Yes')
self.assertEqual(value_dict['vba'], 'Yes',
"Unexpected value '{0}' for test sample {1}'"
.format(value_dict['xlm'], filename))
elif filename.startswith('basic/test-sample-'): # not clear what macro detection should do with text files
self.assertIn(value_dict['vba'], ('Yes', 'Error'),
"Unexpected value '{0}' for test sample {1}'"
.format(value_dict['vba'], filename))
else:
self.assertIn(value_dict['vba'], ('No', 'Error'))
self.assertIn(value_dict['vba'], ('No', 'Error'),
"Unexpected value '{0}' for test sample {1}'"
.format(value_dict['xlm'], filename))

def test_flash(self):
"""Test indicator for flash."""
2 changes: 1 addition & 1 deletion tests/ooxml/test_basic.py
@@ -39,7 +39,7 @@ def test_rough_doctype(self):

# files that are neither OLE nor xml:
except_files = 'empty', 'text'
except_extns = 'rtf', 'csv', 'zip', 'slk'
except_extns = 'rtf', 'csv', 'zip', 'slk', 'txt'

# analyse all files in data dir
# TODO: use testdata_reader to extract real data from zip files
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-ascii.txt
@@ -0,0 +1,2 @@
Test sample file without special chars or emojis,
encoded using ascii
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-latin1.txt
@@ -0,0 +1,2 @@
Test sample file with special chars äöüß,
encoded using latin1
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-utf8-nobom.txt
@@ -0,0 +1,2 @@
Test sample file with special chars äöüß and emojis 😇🙊,
encoded using utf8
2 changes: 2 additions & 0 deletions tests/test-data/basic/test-sample-utf8-withbom.txt
@@ -0,0 +1,2 @@
Test sample file with special chars äöüß and emojis 😇🙊,
encoded using utf8
4 binary files not shown.
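The four binary files are presumably the UTF-16 samples matched by test_encoding above. A hypothetical generation sketch follows; the file names and text are assumptions modelled on the test regex and the UTF-8 samples, not taken from the diff:

# hypothetical helper, not part of this commit: regenerate UTF-16LE samples
import codecs

TEXT = u'Test sample file with special chars äöüß,\nencoded using utf16\n'

with open('test-sample-utf16_le-nobom.txt', 'wb') as handle:
    handle.write(TEXT.encode('utf-16le'))                        # no BOM
with open('test-sample-utf16_le-withbom.txt', 'wb') as handle:
    handle.write(codecs.BOM_UTF16_LE + TEXT.encode('utf-16le'))  # BOM first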
