From a9a46333ecf152c61c91f49f90c9aea78fdf056a Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Thu, 2 Aug 2012 17:51:01 +0400 Subject: [PATCH 1/3] Handle encoding header of Git commits when possible. --- lib/grit.rb | 1 + lib/grit/commit.rb | 8 ++++++++ lib/grit/encode.rb | 35 +++++++++++++++++++++++++++++++++ lib/grit/git-ruby/git_object.rb | 18 ++++++++++++----- 4 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 lib/grit/encode.rb diff --git a/lib/grit.rb b/lib/grit.rb index 578f1aba..8ca7dc84 100644 --- a/lib/grit.rb +++ b/lib/grit.rb @@ -28,6 +28,7 @@ require 'grit/ruby1.9' # internal requires +require 'grit/encode' require 'grit/lazy' require 'grit/errors' require 'grit/git-ruby' diff --git a/lib/grit/commit.rb b/lib/grit/commit.rb index 841dfe91..9a5dcf79 100644 --- a/lib/grit/commit.rb +++ b/lib/grit/commit.rb @@ -2,6 +2,7 @@ module Grit class Commit extend Lazy + extend Encode attr_reader :id attr_reader :repo @@ -25,12 +26,17 @@ class Commit def self.parse_batch(repo, sha, size, object) info, message = object.split("\n\n", 2) + encoding = nil + lines = info.split("\n") tree = lines.shift.split(' ', 2).last parents = [] parents << lines.shift[7..-1] while lines.first[0, 6] == 'parent' author, authored_date = Grit::Commit.actor(lines.shift) committer, committed_date = Grit::Commit.actor(lines.shift) + encoding = $1 if lines.detect { |line| line =~ /\Aencoding\s*(.+)\z/ } + + message = message_in_utf8(message, encoding) Grit::Commit.new( repo, sha, parents, tree, @@ -176,6 +182,8 @@ def self.list_from_string(repo, text) lines.shift while lines.first && lines.first.empty? + message_lines = message_lines.map { |line| message_in_utf8(line, encoding) } if encoding + commits << Commit.new(repo, id, parents, tree, author, authored_date, committer, committed_date, message_lines) end diff --git a/lib/grit/encode.rb b/lib/grit/encode.rb new file mode 100644 index 00000000..b28248a3 --- /dev/null +++ b/lib/grit/encode.rb @@ -0,0 +1,35 @@ +module Grit + module Encode + ENCODING_MAP = { + 'koi8r' => 'KOI8-R', + } + + def find_ruby_encoding(encoding) + return nil unless defined?(Encoding) + Encoding.find(ENCODING_MAP[encoding] || encoding) + rescue ArgumentError + warn "Cannot map git encoding to ruby one: #{encoding}" + nil + end + + def message_in_utf8(message, encoding) + message = message.dup + message.force_encoding('UTF-8') if message.respond_to?(:force_encoding) + if encoding && encoding !~ /\Autf\-8i\z/i && message && message.respond_to?(:encode!) + ruby_encoding = find_ruby_encoding(encoding) + if ruby_encoding + message.encode!('UTF-8', ruby_encoding, :invalid => :replace, :undef => :replace) + unless message.valid_encoding? + message.encode!('UTF-8', 'ISO-8859-1', :invalid => :replace, :undef => :replace) + end + else + unless message.valid_encoding? + message.encode!('UTF-8', 'ISO-8859-1', :invalid => :replace, :undef => :replace) + end + end + end + message + end + end +end + diff --git a/lib/grit/git-ruby/git_object.rb b/lib/grit/git-ruby/git_object.rb index 5ac2e4f7..19c144aa 100644 --- a/lib/grit/git-ruby/git_object.rb +++ b/lib/grit/git-ruby/git_object.rb @@ -1,4 +1,5 @@ -# +# --- encoding: utf-8 --- + # converted from the gitrb project # # authors: @@ -232,11 +233,13 @@ def actual_raw end class Commit < GitObject - attr_accessor :author, :committer, :tree, :parent, :message, :headers + extend Grit::Encode + + attr_accessor :author, :committer, :tree, :parent, :message, :decoded_message, :encoding, :headers def self.from_raw(rawobject, repository=nil) parent = [] - tree = author = committer = nil + tree = author = committer = encoding = nil headers, message = rawobject.content.split(/\n\n/, 2) all_headers = headers.split(/\n/).map { |header| header.split(/ /, 2) } @@ -250,6 +253,8 @@ def self.from_raw(rawobject, repository=nil) author = UserInfo.new(value) when "committer" committer = UserInfo.new(value) + when "encoding" + encoding = value else warn "unknown header '%s' in commit %s" % \ [key, rawobject.sha1.unpack("H*")[0]] @@ -258,10 +263,11 @@ def self.from_raw(rawobject, repository=nil) if not tree && author && committer raise RuntimeError, "incomplete raw commit object" end - new(tree, parent, author, committer, message, headers, repository) + decoded_message = message_in_utf8(message, encoding) + new(tree, parent, author, committer, message, headers, repository, decoded_message, encoding) end - def initialize(tree, parent, author, committer, message, headers, repository=nil) + def initialize(tree, parent, author, committer, message, headers, repository=nil, decoded_message=nil, encoding=nil) @tree = tree @author = author @parent = parent @@ -269,6 +275,8 @@ def initialize(tree, parent, author, committer, message, headers, repository=nil @message = message @headers = headers @repository = repository + @decoded_message = decoded_message || message + @encoding = encoding end def type From 31cfebc43ca84ad10c6cb68d7f9118cc5ffca93e Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Fri, 3 Aug 2012 11:34:05 +0400 Subject: [PATCH 2/3] decoded_message now is lazy RO accessor. --- lib/grit/git-ruby/git_object.rb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/grit/git-ruby/git_object.rb b/lib/grit/git-ruby/git_object.rb index 19c144aa..e1ae2f80 100644 --- a/lib/grit/git-ruby/git_object.rb +++ b/lib/grit/git-ruby/git_object.rb @@ -235,7 +235,7 @@ def actual_raw class Commit < GitObject extend Grit::Encode - attr_accessor :author, :committer, :tree, :parent, :message, :decoded_message, :encoding, :headers + attr_accessor :author, :committer, :tree, :parent, :message, :encoding, :headers def self.from_raw(rawobject, repository=nil) parent = [] @@ -263,11 +263,10 @@ def self.from_raw(rawobject, repository=nil) if not tree && author && committer raise RuntimeError, "incomplete raw commit object" end - decoded_message = message_in_utf8(message, encoding) - new(tree, parent, author, committer, message, headers, repository, decoded_message, encoding) + new(tree, parent, author, committer, message, headers, repository, encoding) end - def initialize(tree, parent, author, committer, message, headers, repository=nil, decoded_message=nil, encoding=nil) + def initialize(tree, parent, author, committer, message, headers, repository=nil, encoding=nil) @tree = tree @author = author @parent = parent @@ -275,10 +274,13 @@ def initialize(tree, parent, author, committer, message, headers, repository=nil @message = message @headers = headers @repository = repository - @decoded_message = decoded_message || message @encoding = encoding end + def decoded_message + message_in_utf8(message, encoding) + end + def type :commit end From d51cb775160c87b4ce646fdb675ffd44f4cfd6e4 Mon Sep 17 00:00:00 2001 From: Akzhan Abdulin Date: Fri, 3 Aug 2012 11:42:40 +0400 Subject: [PATCH 3/3] git version also looks like "git version 1.7.7.5 (Apple Git-26)\n" --- test/test_git.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_git.rb b/test/test_git.rb index d7e69ab8..8b3dc1f3 100644 --- a/test/test_git.rb +++ b/test/test_git.rb @@ -10,7 +10,7 @@ def teardown end def test_method_missing - assert_match(/^git version [\w\.]*$/, @git.version) + assert_match(/\Agit version [\w\.\(\)\s\-]*\z/, @git.version) end def test_logs_stderr