From c737bb1f17bfe526df7e27ffc20aca977b24214a Mon Sep 17 00:00:00 2001 From: JP Bourget Date: Fri, 12 Jul 2019 21:07:06 -0400 Subject: [PATCH 1/8] Fix bad restructured text syntax --- README.rst | 61 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/README.rst b/README.rst index 1b432ddc..0e60b3a2 100644 --- a/README.rst +++ b/README.rst @@ -40,36 +40,37 @@ that Microsoft have opened up their documentation of the file format. Currently, the README is in the process of being redone. For now, please refer to the usage information provided from the program's help dialog: :: - usage: extract_msg [-h] [--use-content-id] [--dev] [--validate] [--json] - [--file-logging] [--verbose] [--log LOG] - [--config CONFIG_PATH] [--out OUT_PATH] [--use-filename] - msg [msg ...] - - extract_msg: Extracts emails and attachments saved in Microsoft Outlook's .msg - files. https://github.com/mattgwwalker/msg-extractor - - positional arguments: - msg An msg file to be parsed - - optional arguments: - -h, --help show this help message and exit - --use-content-id, --cid - Save attachments by their Content ID, if they have - one. Useful when working with the HTML body. - --dev Changes to use developer mode. Automatically enables - the --verbose flag. Takes precedence over the - --validate flag. - --validate Turns on file validation mode. Turns off regular file - output. - --json Changes to write output files as json. - --file-logging Enables file logging. Implies --verbose - --verbose Turns on console logging. - --log LOG Set the path to write the file log to. - --config CONFIG_PATH Set the path to load the logging config from. - --out OUT_PATH Set the folder to use for the program output. - (Default: Current directory) - --use-filename Sets whether the name of each output is based on the - msg filename. 
+ + usage: extract_msg [-h] [--use-content-id] [--dev] [--validate] [--json] + [--file-logging] [--verbose] [--log LOG] + [--config CONFIG_PATH] [--out OUT_PATH] [--use-filename] + msg [msg ...] + + extract_msg: Extracts emails and attachments saved in Microsoft Outlook's .msg + files. https://github.com/mattgwwalker/msg-extractor + + positional arguments: + msg An msg file to be parsed + + optional arguments: + -h, --help show this help message and exit + --use-content-id, --cid + Save attachments by their Content ID, if they have + one. Useful when working with the HTML body. + --dev Changes to use developer mode. Automatically enables + the --verbose flag. Takes precedence over the + --validate flag. + --validate Turns on file validation mode. Turns off regular file + output. + --json Changes to write output files as json. + --file-logging Enables file logging. Implies --verbose + --verbose Turns on console logging. + --log LOG Set the path to write the file log to. + --config CONFIG_PATH Set the path to load the logging config from. + --out OUT_PATH Set the folder to use for the program output. + (Default: Current directory) + --use-filename Sets whether the name of each output is based on the + msg filename. **To use this in your own script**, start by using: From 1ab5564a43fd646271d638b3b6867f3deddf3744 Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Wed, 28 Aug 2019 06:37:11 -0700 Subject: [PATCH 2/8] Fixed some issues in readme --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 0e60b3a2..0f17142e 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ Usage :: - python extract_msg example.msg + python -m extract_msg example.msg This will produce a new folder named according to the date, time and subject of the message (for example "2013-07-24_0915 Example"). The @@ -89,7 +89,7 @@ to the ExtractMsg.Message Method: :: - msg_raw = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1\x00 ... 
\x00x00x00' + msg_raw = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1\x00 ... \x00\x00\x00' msg = extract_msg.Message(msg_raw) If you want to override the default attachment class and use one of your @@ -180,8 +180,8 @@ Joel Kaufman - First implementations of the json and filename flags .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.23.2-blue.svg - :target: https://pypi.org/project/extract-msg/0.23.2/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.23.3-blue.svg + :target: https://pypi.org/project/extract-msg/0.23.3/ .. |PyPI1| image:: https://img.shields.io/badge/python-2.7+-brightgreen.svg :target: https://www.python.org/downloads/release/python-2715/ From 4dac12481906d249c07525d62531a4d704f47446 Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Wed, 28 Aug 2019 06:40:55 -0700 Subject: [PATCH 3/8] Update CHANGELOG.md --- CHANGELOG.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 439c338a..27091c84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,20 +1,26 @@ +**v0.23.3** +* Fixed issues in readme. +* [[Syncurity #50](https://github.com/Syncurity/msg-extractor/issues/50)] Updated `dev_classes.Message` to better match the current `Message` class. +* Fixed bad links in changelog. +* [[mattgwwalker #95](https://github.com/mattgwwalker/msg-extractor/issues/95)] Added fallback encoding as well as manual encoding change to `dev_classes.Message`. + **v0.23.1** * Fixed issue with embedded msg files caused by the changes in v0.23.0. **v0.23.0** -* [[mattgwwalker #75](https://github.com/Syncurity/msg-extractor/issues/75)] & [[Syncurity #39](https://github.com/Syncurity/msg-extractor/issues/39)] Completely rewrote the function `Message._getStringStream`. This was done for two reasons. The first was to make it actually work with msg files that have their strings encoded in a non-Unicode encoding. 
The second reason was to make it so that it better reflected msg specification which says that ALL strings in a file will be either Unicode or non-Unicode, but not both. Because of the second part, the `prefer` option has been removed. +* [[mattgwwalker #75](https://github.com/mattgwwalker/msg-extractor/issues/75)] & [[Syncurity #39](https://github.com/Syncurity/msg-extractor/issues/39)] Completely rewrote the function `Message._getStringStream`. This was done for two reasons. The first was to make it actually work with msg files that have their strings encoded in a non-Unicode encoding. The second reason was to make it so that it better reflected msg specification which says that ALL strings in a file will be either Unicode or non-Unicode, but not both. Because of the second part, the `prefer` option has been removed. * As part of fixing the two issues in the previous change, we have added two new properties: 1. a boolean `Message.areStringsUnicode` which tells if the strings are unicode encoded 2. A string `Message.stringEncoding` which tells what the encoding is. This is used by the `Message._getStringStream` to determine how to decode the data into a string. **v0.22.1** -* [[mattgwwalker #69](https://github.com/Syncurity/msg-extractor/issues/69)] Fixed date format not being up to standard. +* [[mattgwwalker #69](https://github.com/mattgwwalker/msg-extractor/issues/69)] Fixed date format not being up to standard. * Fixed a minor spelling error in the code. **v0.22.0** * [[Syncurity #30](https://github.com/Syncurity/msg-extractor/issues/30)] Added `--validate` option. * [[Syncurity #24](https://github.com/Syncurity/msg-extractor/issues/24)] Moved all dev code into its own scripts. Use `--dev` to use from the command line. -* [[mattgwwalker #67](https://github.com/Syncurity/msg-extractor/issues/67)] Added compatability module to enforce unicode os functions. 
+* [[mattgwwalker #67](https://github.com/mattgwwalker/msg-extractor/issues/67)] Added compatibility module to enforce unicode os functions. * Added new function to `Message` class: `Message.sExists`. This function checks if a string stream exists. It's input should be formatted identically to that of `Message._getStringSteam`. * Added new function to `Message` class: `Message.fix_path`. This function will add the proper prefix to the path (if the `prefix` parameter is true) and adjust the path to be a string rather than a list or tuple. * Added new function to `utils.py`: `get_full_class_name`. This function returns a string containing the module name and the class name of any instance of any class. It is returned in the format of `{module}.{class}`. From 8eb1dbcfde75fed660cc203ad72a0fb3e61e6543 Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Wed, 28 Aug 2019 06:41:48 -0700 Subject: [PATCH 4/8] Bump version --- extract_msg/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 4757f34b..76ec11a0 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -28,7 +28,7 @@ __author__ = 'Matthew Walker & The Elemental of Creation' __date__ = '2019-04-20' -__version__ = '0.23.2' +__version__ = '0.23.3' from extract_msg import constants from extract_msg.attachment import Attachment From 1e16cda5a2ce168b54e25f3d7a3d2f1be6583fc1 Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Wed, 28 Aug 2019 06:50:51 -0700 Subject: [PATCH 5/8] Update message.py --- extract_msg/dev_classes/message.py | 87 +++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index 3b201f94..6cd67b84 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -15,6 +15,7 @@ class Message(olefile.OleFileIO): """ Developer version of the `extract_msg.message.Message` class. 
+ Useful for malformed msg files. """ def __init__(self, path, prefix=''): @@ -99,6 +100,25 @@ def Exists(self, inp): else: inp = self.__prefix + inp return self.exists(inp) + + def sExists(self, inp): + """ + Checks if string stream :param inp: exists in the msg file. + """ + inp = self.fix_path(inp) + return self.exists(inp + '001F') or self.exists(inp + '001E') + + def fix_path(self, inp, prefix=True): + """ + Changes paths so that they have the proper + prefix (should :param prefix: be True) and + are strings rather than lists or tuples. + """ + if isinstance(inp, (list, tuple)): + inp = '/'.join(inp) + if prefix: + inp = self.__prefix + inp + return inp def _getStream(self, filename, prefix=True): if isinstance(filename, list): @@ -115,29 +135,15 @@ def _getStream(self, filename, prefix=True): def _getStringStream(self, filename, prefer='unicode', prefix=True): """ Gets a string representation of the requested filename. - Checks for both ASCII and Unicode representations and returns - a value if possible. If there are both ASCII and Unicode - versions, then :param prefer: specifies which will be - returned. + This should ALWAYS return a string (Unicode in python 2) """ - if isinstance(filename, list): - # Join with slashes to make it easier to append the type - filename = '/'.join(filename) - - asciiVersion = self._getStream(filename + '001E', prefix) - unicodeVersion = windowsUnicode(self._getStream(filename + '001F', prefix)) - logger.log(5, '_getStringStream called for {}. Ascii version found: {}. 
Unicode version found: {}.'.format( - filename, asciiVersion is not None, unicodeVersion is not None)) - if asciiVersion is None: - return unicodeVersion - elif unicodeVersion is None: - return asciiVersion + filename = self.fix_path(filename, prefix) + if self.areStringsUnicode: + return windowsUnicode(self._getStream(filename + '001F', prefix = False)) else: - if prefer == 'unicode': - return unicodeVersion - else: - return asciiVersion + tmp = self._getStream(filename + '001E', prefix = False) + return None if tmp is None else tmp.decode(self.stringEncoding) @property def path(self): @@ -176,6 +182,47 @@ def mainProperties(self): constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED) return self._prop + @property + def stringEncoding(self): + try: + return self.__stringEncoding + except AttributeError: + # We need to calculate the encoding + # Let's first check if the encoding will be unicode: + if self.areStringsUnicode: + self.__stringEncoding = "utf-16-le" + return self.__stringEncoding + else: + # Well, it's not unicode. Now we have to figure out what it IS. + if not self.mainProperties.has_key('3FFD0003'): + logger.error("String encoding is not unicode, but was also not specified. Malformed MSG file detected. Defaulting to utf-8") + self.__stringEncoding = 'utf-8' + return self.__stringEncoding + enc = self.mainProperties['3FFD0003'].value + # Now we just need to translate that value + # Now, this next line SHOULD work, but it is possible that it might not... + self.__stringEncoding = str(enc) + return self.__stringEncoding + + @stringEncoding.setter + def stringEncoding(self, enc): + self.__stringEncoding = enc + + @property + def areStringsUnicode(self): + """ + Returns a boolean telling if the strings are unicode encoded. 
+ """ + try: + return self.__bStringsUnicode + except AttributeError: + if self.mainProperties.has_key('340D0003'): + if (self.mainProperties['340D0003'].value & 0x40000) != 0: + self.__bStringsUnicode = True + return self.__bStringsUnicode + self.__bStringsUnicode = False + return self.__bStringsUnicode + @property def date(self): """ From 0f011523a41012dbafd50a27bac7c106f15ee8ac Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Tue, 29 Oct 2019 14:48:07 -0700 Subject: [PATCH 6/8] Update message.py --- extract_msg/dev_classes/message.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index 6cd67b84..264d52f9 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -121,10 +121,7 @@ def fix_path(self, inp, prefix=True): return inp def _getStream(self, filename, prefix=True): - if isinstance(filename, list): - filename = '/'.join(filename) - if prefix: - filename = self.__prefix + filename + filename = self.fix_path(filename, prefix) if self.exists(filename): stream = self.openstream(filename) return stream.read() From db51f6d4886ad81d31a32c537dc68904ac87d78f Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Tue, 29 Oct 2019 15:15:22 -0700 Subject: [PATCH 7/8] Finished updating fixed problems that were already fixed in the main `Message` class --- extract_msg/dev_classes/message.py | 58 ++++++++++++++++-------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index 264d52f9..0af2f1cc 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -18,7 +18,7 @@ class Message(olefile.OleFileIO): Useful for malformed msg files. """ - def __init__(self, path, prefix=''): + def __init__(self, path, prefix='', filename=None): """ :param path: path to the msg file in the system or is the raw msg file. 
:param prefix: used for extracting embedded msg files @@ -29,7 +29,8 @@ def __init__(self, path, prefix=''): self.__path = path olefile.OleFileIO.__init__(self, path) prefixl = [] - if prefix != '': + tmp_condition = prefix != '' + if tmp_condition: if not isinstance(prefix, stri): try: prefix = '/'.join(prefix) @@ -37,23 +38,29 @@ def __init__(self, path, prefix=''): raise TypeError('Invalid prefix type: ' + str(type(prefix)) + '\n(This was probably caused by you setting it manually).') prefix = prefix.replace('\\', '/') - g = prefix.split("/") + g = prefix.split('/') if g[-1] == '': g.pop() prefixl = g if prefix[-1] != '/': prefix += '/' - filename = self._getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix=False) self.__prefix = prefix self.__prefixList = prefixl - - logger.log(5, ':param path: has __len__ attribute?: {}'.format(has_len(path))) - if has_len(path): - if len(path) < 1536: - self.filename = path - logger.log(5, ':param path: length is {}; Using :param path: as file path'.format(len(path))) + + if tmp_condition: + filename = self._getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix=False) + if filename is not None: + self.filename = filename + else: + logger.log(5, ':param path: has __len__ attribute?: {}'.format(has_len(path))) + if has_len(path): + if len(path) < 1536: + self.filename = path + logger.log(5, ':param path: length is {}; Using :param path: as file path'.format(len(path))) + else: + logger.log(5, ':param path: length is {}; Using :param path: as raw msg stream'.format(len(path))) + self.filename = None else: - logger.log(5, ':param path: length is {}; Using :param path: as raw msg stream'.format(len(path))) self.filename = None self.mainProperties @@ -91,34 +98,31 @@ def listDir(self, streams=True, storages=False): out.append(x) return out - def Exists(self, inp): + def Exists(self, filename): """ - Checks if :param inp: exists in the msg file. + Checks if :param filename: exists in the msg file. 
""" - if isinstance(inp, list): - inp = self.__prefixList + inp - else: - inp = self.__prefix + inp - return self.exists(inp) + filename = self.fix_path(filename) + return self.exists(filename) - def sExists(self, inp): + def sExists(self, filename): """ - Checks if string stream :param inp: exists in the msg file. + Checks if string stream :param filename: exists in the msg file. """ - inp = self.fix_path(inp) - return self.exists(inp + '001F') or self.exists(inp + '001E') + filename = self.fix_path(filename) + return self.exists(filename + '001F') or self.exists(filename + '001E') - def fix_path(self, inp, prefix=True): + def fix_path(self, filename, prefix=True): """ Changes paths so that they have the proper prefix (should :param prefix: be True) and are strings rather than lists or tuples. """ - if isinstance(inp, (list, tuple)): - inp = '/'.join(inp) + if isinstance(filename, (list, tuple)): + filename = '/'.join(filename) if prefix: - inp = self.__prefix + inp - return inp + filename = self.__prefix + filename + return filename def _getStream(self, filename, prefix=True): filename = self.fix_path(filename, prefix) From b8cb08d9950354bfb7ccfa722ee74b49da54d115 Mon Sep 17 00:00:00 2001 From: Ken Peterson Date: Tue, 29 Oct 2019 15:41:13 -0700 Subject: [PATCH 8/8] Removed spaces on blank line --- extract_msg/dev_classes/message.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index 0af2f1cc..541f84c5 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -223,7 +223,7 @@ def areStringsUnicode(self): return self.__bStringsUnicode self.__bStringsUnicode = False return self.__bStringsUnicode - + @property def date(self): """