From 13605ebdd45a5c02d99f5bafc69f67000b6a6e66 Mon Sep 17 00:00:00 2001 From: aquasync Date: Tue, 15 May 2007 14:25:54 +0000 Subject: [PATCH] Updates for version 1.2.17 git-svn-id: https://ruby-msg.googlecode.com/svn/trunk@85 c30d66de-b626-0410-988f-81f6512a6d81 --- FIXES | 56 +++++++++++++++++++++++++++++++++++++++++++ Rakefile | 2 +- bin/msgtool | 36 +++++++++++++++++++++++++--- bin/oletool | 2 +- lib/mime.rb | 4 +++- lib/msg.rb | 55 ++++++++++++++++++++++++++++++++++-------- lib/msg/properties.rb | 41 +++++++++++++++++++++---------- 7 files changed, 168 insertions(+), 28 deletions(-) create mode 100644 FIXES diff --git a/FIXES b/FIXES new file mode 100644 index 0000000..7a094b4 --- /dev/null +++ b/FIXES @@ -0,0 +1,56 @@ +FIXES + +recent fixes based on importing results into evolution + +1. was running into some issue with base64 encoded message/rfc822 attachments displaying + as empty. encoding them as plain solved the issue (odd). + +2. problem with a large percentage of emails, not displaying as mime. turned out to be + all received from blackberry. further, turned out there were 2 content-type headers, + "Content-Type", which I add, and "Content-type". normally my override works, but I + need to handle it case insensitively it would appear. more tricky, what's the story + with these. fixing that will probably fix that whole class of issues there. + evolution was renaming my second content type as X-Invalid-Content-Type or something. + +3. another interesting one. had content-transfer-encoding set in the transport message + headers. it was set to base64. i didn't override that, so evolution "decoded" my + plaintext message into complete garbage. + fix - delete content-transfer-encoding. + +4. added content-location and content-id output in the mime handling of attachments + to get some inline html/image mails to work properly.
+ further, the containing mime content-type must be multipart/related, not multipart/mixed, + at least for evolution, in order for the images to appear inline. + could still improve in this area. if someone drags and drops in an image, it may + be inline in the rtf version, but exchange generates crappy html such that the image + doesn't display inline. maybe i should correct the html output in these cases as i'm + throwing away the rtf version. + +5. note you may need wingdings installed. i had a lot of L and J appear in messages from + outlook users. turns out it's smilies in wingdings. i think it's only if word is used + as email editor and has autotext messing things up. + +6. still unsure about how to do my "\r" handling. + +7. need to join addresses with , instead of ; i think. evolution only shows the + first one otherwise it appears, but all when they are , separated. + +8. need to solve ole storage issues with the very large file using extra bat + stuff. + +9. retest a bit on evolution and thunderbird, and release. tested on a corpus + of >1000 msg files, so should be starting to get pretty good quality. + +10. longer term, things fall into a few basic categories: + +- non mail conversions (look further into vcard, ical et al support for other + types of msg) +- further tests and robustness for what i handle now. ie, look into corner + cases covered so far, and work on the mime code. fix random charset encoding + issues, in the various weird mime ways, do header wrapping etc etc. + check fidelity of conversions, and capture some more properties as headers, + such as importance which i don't do yet. +- fix that named property bug. tidy up warnings, exceptions. +- extend conversion to make better html. + this is longer term. as i don't use the rtf, i need to make my html better. + emulating some rtf things. harder, not important atm.
diff --git a/Rakefile b/Rakefile index 4f63392..a182665 100644 --- a/Rakefile +++ b/Rakefile @@ -47,7 +47,7 @@ spec = Gem::Specification.new do |s| #s.rubyforge_project = %q{ruby-msg} s.executables = ['msgtool', 'oletool'] - s.files = Dir.glob('data/*.yaml') + ['Rakefile', 'README'] + s.files = Dir.glob('data/*.yaml') + ['Rakefile', 'README', 'FIXES'] s.files += Dir.glob("lib/**/*.rb") s.files += Dir.glob("test/test_*.rb") + Dir.glob("test/*.doc") s.files += Dir.glob("bin/*") diff --git a/bin/msgtool b/bin/msgtool index 189b5d8..a75fdf7 100755 --- a/bin/msgtool +++ b/bin/msgtool @@ -3,13 +3,23 @@ require 'optparse' require 'rubygems' require 'msg' +require 'time' + +def munge_headers mime, opts + opts[:header_defaults].each do |s| + key, val = s.match(/(.*?):\s+(.*)/)[1..-1] + mime.headers[key] = [val] if mime.headers[key].empty? + end +end def msgtool - opts = {:verbose => false, :action => :convert} + opts = {:verbose => false, :action => :convert, :header_defaults => []} op = OptionParser.new do |op| op.banner = "Usage: msgtool [options] [files]" op.separator '' op.on('-c', '--convert', 'Convert msg files (default)') { opts[:action] = :convert } + op.on('-m', '--convert-mbox', 'Convert msg files for mbox usage') { opts[:action] = :convert_mbox } + op.on('-d', '--header-default STR', 'Provide a default value for top level mail header') { |hd| opts[:header_defaults] << hd } op.separator '' op.on('-v', '--[no-]verbose', 'Run verbosely') { |v| opts[:verbose] = v } op.on_tail('-h', '--help', 'Show this message') { puts op; exit } @@ -22,10 +32,30 @@ def msgtool end # just shut up and convert a message to eml Msg::Log.level = Ole::Log.level = opts[:verbose] ? 
Logger::WARN : Logger::FATAL - if opts[:action] == :convert + case opts[:action] + when :convert + msgs.each do |filename| + msg = Msg.open filename + mime = msg.to_mime + munge_headers mime, opts + puts mime.to_s + end + when :convert_mbox msgs.each do |filename| msg = Msg.open filename - puts msg.to_mime.to_s + # could use something from the msg in our from line if we wanted + puts "From msgtool@ruby-msg #{Time.now.rfc2822}" + mime = msg.to_mime + munge_headers mime, opts + mime.to_s.each do |line| + # we do the append > style mbox quoting (mboxrd i think its called), as it + # is the only one that can be robustly un-quoted. evolution doesn't use this! + if line =~ /^>*From /o + print '>' + line + else + print line + end + end end end end diff --git a/bin/oletool b/bin/oletool index 9e2bbc2..b658ac7 100755 --- a/bin/oletool +++ b/bin/oletool @@ -27,7 +27,7 @@ def oletool when :tree Ole::Storage.open(file) { |ole| puts ole.root.to_tree } when :repack - Ole::Storage.open(file, &:repack) + Ole::Storage.open file, 'r+', &:repack end end end diff --git a/lib/mime.rb b/lib/mime.rb index c28cd3b..99b9fbc 100644 --- a/lib/mime.rb +++ b/lib/mime.rb @@ -33,7 +33,7 @@ class Mime # Create a Mime object using +str+ as an initial serialization, which must contain headers # and a body (even if empty). Needs work. - def initialize str + def initialize str, ignore_body=false headers, @body = $~[1..-1] if str[/(.*?\r?\n)(?:\r?\n(.*))?\Z/m] @headers = Hash.new { |hash, key| hash[key] = [] } @@ -48,6 +48,8 @@ def initialize str @content_type, attrs = Mime.split_header content_type end + return if ignore_body + if multipart? if body.empty? @preamble = '' diff --git a/lib/msg.rb b/lib/msg.rb index c6205da..ebca8ce 100755 --- a/lib/msg.rb +++ b/lib/msg.rb @@ -20,7 +20,7 @@ # class Msg - VERSION = '1.2.16' + VERSION = '1.2.17' # we look here for the yaml files in data/, and the exe files for support # decoding at the moment. SUPPORT_DIR = File.dirname(__FILE__) + '/..'
@@ -72,7 +72,9 @@ def initialize root # headers. we may get nothing. # and other times, when received from external, we get the full cigar, boundaries # etc and all. - @mime = Mime.new props.transport_message_headers.to_s + # sometimes its multipart, with no boundaries. that throws an error. so we'll be more + # forgiving here + @mime = Mime.new props.transport_message_headers.to_s, true populate_headers end @@ -169,11 +171,28 @@ def populate_headers headers['Date'] = [Time.iso8601(time.to_s).rfc2822] if time end - if !headers.has_key?('Message-ID') and props.internet_message_id - headers['Message-ID'] = [props.internet_message_id] - end - if !headers.has_key?('In-Reply-To') and props.in_reply_to_id - headers['In-Reply-To'] = [props.in_reply_to_id] + # some very simplistic mapping between internet message headers and the + # mapi properties + # any of these could be causing duplicates due to case issues. the hack in #to_mime + # just stops re-duplication at that point. need to move some smarts into the mime + # code to handle it. + mapi_header_map = [ + [:internet_message_id, 'Message-ID'], + [:in_reply_to_id, 'In-Reply-To'], + # don't set these values if they're equal to the defaults anyway + [:importance, 'Importance', proc { |val| val.to_s == '1' ? nil : val }], + [:priority, 'Priority', proc { |val| val.to_s == '1' ? nil : val }], + [:sensitivity, 'Sensitivity', proc { |val| val.to_s == '0' ? nil : val }], + # yeah? + [:conversation_topic, 'Thread-Topic'], + # not sure of the distinction here + # :originator_delivery_report_requested ?? + [:read_receipt_requested, 'Disposition-Notification-To', proc { |val| from }] + ] + mapi_header_map.each do |mapi, mime, *f| + next unless q = val = props.send(mapi) or headers.has_key?(mime) + next if f[0] and !(val = f[0].call(val)) + headers[mime] = [val.to_s] end end @@ -251,7 +270,15 @@ def to_mime unless attachments.empty? 
mime = Mime.new "Content-Type: multipart/mixed\r\n\r\n" mime.parts << body - attachments.each { |attach| mime.parts << attach.to_mime } + # i don't know any better way to do this. need multipart/related for inline images + # referenced by cid: urls to work, but don't want to use it otherwise... + related = false + attachments.each do |attach| + part = attach.to_mime + related = true if part.headers.has_key?('Content-ID') or part.headers.has_key?('Content-Location') + mime.parts << part + end + mime.headers['Content-Type'] = ['multipart/related'] if related end # at this point, mime is either @@ -269,7 +296,10 @@ def to_mime # now that we have a root, we can mix in all our headers headers.each do |key, vals| # don't overwrite the content-type, encoding style stuff - next unless mime.headers[key].empty? + next if mime.headers.has_key? key + # some new temporary hacks + next if key =~ /content-type/i and vals[0] =~ /base64/ + next if mime.headers.keys.map(&:downcase).include? key.downcase mime.headers[key] += vals end # just a stupid hack to make the content-type header last, when using OrderedHash @@ -389,13 +419,18 @@ def to_mime mime = Mime.new "Content-Type: #{mimetype}\r\n\r\n" mime.headers['Content-Disposition'] = [%{attachment; filename="#{filename}"}] mime.headers['Content-Transfer-Encoding'] = ['base64'] + mime.headers['Content-Location'] = [props.attach_content_location] if props.attach_content_location + mime.headers['Content-ID'] = [props.attach_content_id] if props.attach_content_id # data.to_s for now. data was nil for some reason. # perhaps it was a data object not correctly handled? # hmmm, have to use read here. that assumes that the data isa stream. # but if the attachment data is a string, then it won't work. possible? data_str = if @embedded_msg mime.headers['Content-Type'] = 'message/rfc822' + # lets try making it not base64 for now + mime.headers.delete 'Content-Transfer-Encoding' # not filename. rather name, or something else right? 
+ # maybe it should be inline?? i forget attach_method / access meaning mime.headers['Content-Disposition'] = [%{attachment; filename="#{@embedded_msg.subject}"}] @embedded_msg.to_mime.to_s elsif @embedded_ole @@ -409,7 +444,7 @@ def to_mime else data.read.to_s end - mime.body.replace Base64.encode64(data_str).gsub(/\n/, "\r\n") + mime.body.replace @embedded_msg ? data_str : Base64.encode64(data_str).gsub(/\n/, "\r\n") mime end diff --git a/lib/msg/properties.rb b/lib/msg/properties.rb index 1852bd6..059fc9f 100644 --- a/lib/msg/properties.rb +++ b/lib/msg/properties.rb @@ -48,24 +48,28 @@ class Msg # There also needs to be a way to look up properties more specifically: # # properties[0x0037] # => gets the subject - # properties[PS_MAPI, 0x0037] # => still gets the subject - # properties[PS_PUBLIC_STRINGS, 'Keywords'] # => gets the above categories + # properties[0x0037, PS_MAPI] # => still gets the subject + # properties['Keywords', PS_PUBLIC_STRINGS] # => gets outlook's categories array # - # The abbreviate versions work by "resolving" the symbols to full keys: + # The abbreviated versions work by "resolving" the symbols to full keys: # - # properties.resolve :keywords # => [PS_OUTLOOK, 'Keywords'] - # properties.resolve :subject # => [PS_MAPI, 0x0037] + # # the guid here is just PS_PUBLIC_STRINGS + # properties.resolve :keywords # => # + # # the result here is actually also a key + # k = properties.resolve :subject # => 0x0037 + # # it has a guid + # k.guid == Msg::Properties::PS_MAPI # => true # # = Parsing # # There are three objects that need to be parsed to load a +Msg+ property store: # - # 1. The +nameid+ directory (Properties.parse_nameid) + # 1. The +nameid+ directory (Properties.parse_nameid) # 2. The many +substg+ objects, whose names should match Properties::SUBSTG_RX # (Properties#parse_substg) # 3. 
The +properties+ file (Properties#parse_properties) # - # Understanding of the formats is by no means perfect + # Understanding of the formats is by no means perfect. # # = TODO # @@ -79,7 +83,7 @@ class Msg # current greedy-loading approach. still want strings to work nicely: # props.subject # but don't want to be loading up large binary blobs, typically attachments, eg - # props.attach_data. + # props.attach_data # probably the easiest solution is that the binary "encoding", be to return an io # object instead. and you must read it if you want it as a string # maybe i can avoid the greedy model anyway? rather than parsing the properties completely, @@ -98,7 +102,7 @@ class Properties 0x001f => proc { |obj| Ole::Types::FROM_UTF16.iconv obj.read }, # unicode # ascii # FIXME hack did a[0..-2] before, seems right sometimes, but for some others it chopped the text. chomp - 0x001e => proc { |obj| a = obj.read; a[-1] == 0 ? a[0...-2] : a }, + 0x001e => proc { |obj| obj.read.chomp 0.chr }, 0x0102 => proc { |obj| obj.open }, # binary? :default => proc { |obj| obj.open } } @@ -133,9 +137,13 @@ class Properties attr_reader :unused attr_reader :nameid + # +nameid+ is to provide a way to inherit from parent (needed for property sets for + # attachments and recipients, which inherit from the msg itself. what about nested + # msg??) 
def initialize @raw = {} @unused = [] + @nameid = nil # FIXME @body_rtf = @body_html = @body = false end @@ -144,7 +152,7 @@ def initialize # The parsing methods #++ - def self.load obj + def self.load obj, ignore=nil prop = Properties.new prop.load obj prop @@ -154,9 +162,16 @@ def self.load obj def load obj # we need to do the nameid first, as it provides the map for later user defined properties children = obj.children.dup - @nameid = if nameid_obj = children.find { |child| child.name == '__nameid_version1.0' } + if nameid_obj = children.find { |child| child.name == '__nameid_version1.0' } children.delete nameid_obj - Properties.parse_nameid nameid_obj + @nameid = Properties.parse_nameid nameid_obj + # hack to make it available to all msg files from the same ole storage object + class << obj.ole + attr_accessor :msg_nameid + end + obj.ole.msg_nameid = @nameid + elsif obj.ole + @nameid = obj.ole.msg_nameid rescue nil end # now parse the actual properties. i think dirs that match the substg should be decoded # as properties to. 0x000d is just another encoding, the dir encoding. it should match @@ -310,6 +325,8 @@ def add_property key, value, pos=nil elsif real_key = @nameid[key] key = real_key else + # i think i hit these when i have a named property, in the PS_MAPI + # guid Log.warn "property in named range not in nameid #{key.inspect}" key = Key.new key end