From bffa8a4f05c6fbc8e78febbbb947c4a4db091861 Mon Sep 17 00:00:00 2001
From: Sanjiv Jha <sanjiv@joshsoftware.com>
Date: Mon, 14 Jul 2014 12:17:05 +0530
Subject: [PATCH 1/2] added pdftotext options

---
 README                         |  8 +++++++-
 lib/docsplit/text_extractor.rb | 15 ++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/README b/README
index 34ce202..00b0154 100755
--- a/README
+++ b/README
@@ -14,9 +14,15 @@
   Installation:
   gem install docsplit
   
+  Additional options:
+    pdf_opts: a string of command-line options to pass through to the pdftotext binary.
+    For example:
+      To pass the -raw option to pdftotext:
+        Docsplit.extract_text(path, {:pdf_opts => '-raw'})
+
   For documentation, usage, and examples, see:
   http://documentcloud.github.com/docsplit/
   
   To suggest a feature or report a bug: 
   http://github.com/documentcloud/docsplit/issues/
-
+  
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0d55f32..89363b1 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -103,14 +103,22 @@ def run(command)
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      # Split caller-supplied pdftotext flags on whitespace and pass each through ESCAPE,
+      # like the paths, instead of interpolating them raw (values with spaces unsupported).
+      user_opts = @pdf_txt_opts.to_s.split.map { |opt| ESCAPE[opt] }
+      args = ['-enc UTF-8'] + user_opts + [ESCAPE[pdf], ESCAPE[text_path]]
+      run "pdftotext #{args.join(' ')} 2>&1"
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      # Split caller-supplied pdftotext flags on whitespace and pass each through ESCAPE,
+      # like the paths, instead of interpolating them raw (values with spaces unsupported).
+      user_opts = @pdf_txt_opts.to_s.split.map { |opt| ESCAPE[opt] }
+      args = ['-enc UTF-8', "-f #{page} -l #{page}"] + user_opts + [ESCAPE[pdf], ESCAPE[text_path]]
+      run "pdftotext #{args.join(' ')} 2>&1"
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
@@ -123,8 +131,9 @@ def extract_options(options)
       @forbid_ocr = options[:ocr] == false
       @clean_ocr  = !(options[:clean] == false)
       @language   = options[:language] || 'eng'
+      @pdf_txt_opts = options[:pdf_opts] || '' 
     end
 
   end
 
-end
\ No newline at end of file
+end

From ec3f0ed32a7bad0969444ed9a1129bb2886fb268 Mon Sep 17 00:00:00 2001
From: Sanjiv Jha <sanjiv@joshsoftware.com>
Date: Fri, 25 Jul 2014 19:44:48 +0530
Subject: [PATCH 2/2] added the test case with pdftotext options

---
 test/unit/test_extract_text.rb | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
index 381f85c..2f7411e 100755
--- a/test/unit/test_extract_text.rb
+++ b/test/unit/test_extract_text.rb
@@ -53,5 +53,9 @@ def test_name_escaping_while_extracting_text
     Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 2
   end
-
+  
+  def test_name_escaping_while_extracting_text_with_pdf_opts
+    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT, :pdf_opts => '-raw')
+    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length # same file count as the run without :pdf_opts
+  end
 end