Convert RTF to HTML respecting original code page

aquasync · Nov 10, 2020 · c9046a2 · c9046a2
1 parent 34fb32e
commit c9046a2
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 8 deletions.
diff --git a/lib/mapi/mime.rb b/lib/mapi/mime.rb
@@ -69,6 +69,10 @@ def initialize str, ignore_body=false
   		end
   	end
 
+    def encode(x)
+      x.encoding == Encoding::UTF_8 ? x : x.encode('utf-8', 'cp1252')
+    end
+
   	# Reports whether this part is a multipart container: true only when a
   	# content type is set and some line of it starts with "multipart".
   	# Always returns a strict boolean.
   	def multipart?
   		return false unless @content_type
   		!!(@content_type =~ /^multipart/)
   	end
@@ -97,7 +101,7 @@ def to_s opts={}
   		opts = {:boundary_counter => 0}.merge opts
   		if multipart?
   			boundary = Mime.make_boundary opts[:boundary_counter] += 1, self
-  			@body = [preamble, parts.map { |part| "\r\n" + part.to_s(opts) + "\r\n" }, "--\r\n" + epilogue].
+  			@body = [encode(preamble), parts.map { |part| "\r\n" + part.to_s(opts) + "\r\n" }, "--\r\n" + encode(epilogue)].
   				flatten.join("\r\n--" + boundary)
   			content_type, attrs = Mime.split_header @headers['Content-Type'][0]
   			attrs['boundary'] = boundary
@@ -106,9 +110,9 @@ def to_s opts={}
 
   		str = ''
   		@headers.each do |key, vals|
-  			vals.each { |val| str << "#{key}: #{val}\r\n" }
+  			vals.each { |val| str << "#{encode(key)}: #{encode(val)}\r\n" }
   		end
-  		str << "\r\n" + @body
+  		str << "\r\n" + encode(@body)
   	end
 
   	def self.split_header header

diff --git a/lib/mapi/rtf.rb b/lib/mapi/rtf.rb
@@ -122,7 +122,7 @@ def self.rtf2text str, format=:text
 			end
 		end
 
-		RTF_PREBUF = 
+		RTF_PREBUF =
 			"{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}" \
 			"{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript " \
 			"\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier" \
@@ -175,7 +175,7 @@ def rtfdecompr data
 			else # unknown magic number
 				raise "Unknown compression type (magic number 0x%08x)" % magic
 			end
-			
+
 			# not sure if it's due to a bug in the above code. doesn't seem to be
 			# in my tests, but sometimes there's a trailing null. we chomp it here,
 			# which actually makes the resultant rtf smaller than its advertised
@@ -189,7 +189,7 @@ def rtfdecompr data
 		#
 		# Returns +nil+ if it doesn't look like html encapsulated in rtf.
 		#
-		# Some cases that the original didn't deal with have been patched up, eg from 
+		# Some cases that the original didn't deal with have been patched up, eg from
 		# this chunk, where there are tags outside of the htmlrtf ignore block.
 		#
 		# "{\\*\\htmltag116 <br />}\\htmlrtf \\line \\htmlrtf0 \\line {\\*\\htmltag84 <a href..."
@@ -229,8 +229,14 @@ def rtfdecompr data
 		def rtf2html rtf
 			scan = StringScanner.new rtf
 			# require \fromhtml. is this worth keeping? apparently you see \\fromtext if it
-			# was converted from plain text. 
+			# was converted from plain text.
 			return nil unless rtf["\\fromhtml"]
+      if scan.scan_until(/\\ansicpg/)
+        code_page = "cp" + scan.scan(/\d+/)
+        scan.pos = 0
+      else
+        code_page = 'ascii'
+      end
 			html = ''
 			ignore_tag = nil
 			# skip up to the first htmltag. return nil if we don't ever find one
@@ -270,7 +276,7 @@ def rtf2html rtf
 					p :wtf
 				end
 			end
-			html.strip.empty? ? nil : html
+      html.strip.empty? ? nil : html.encode('utf-8', code_page)
 		end
 
 		module_function :rtf2html, :rtfdecompr