diff --git a/README b/README index 34ce202..00b0154 100755 --- a/README +++ b/README @@ -14,9 +14,15 @@ Installation: gem install docsplit + Added the options: + pdf_opts: which can be used to pass pdftotext command-line options through the docsplit gem + For example: + Passing raw options to pdftotext, + Docsplit.extract_text(path, {:pdf_opts => '-raw'}) + For documentation, usage, and examples, see: http://documentcloud.github.com/docsplit/ To suggest a feature or report a bug: http://github.com/documentcloud/docsplit/issues/ - + diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..89363b1 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -103,14 +103,22 @@ def run(command) # Extract the full contents of a pdf as a single file, directly. def extract_full(pdf) text_path = File.join(@output, "#{@pdf_name}.txt") - run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + unless @pdf_txt_opts.empty? + run "pdftotext -enc UTF-8 #{@pdf_txt_opts} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + else + run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + end end # Extract the contents of a single page of text, directly, adding it to # the `@pages_to_ocr` list if the text length is inadequate. def extract_page(pdf, page) text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") - run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + unless @pdf_txt_opts.empty? 
+ run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{@pdf_txt_opts} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + else + run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + end unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE end @@ -123,8 +131,9 @@ def extract_options(options) @forbid_ocr = options[:ocr] == false @clean_ocr = !(options[:clean] == false) @language = options[:language] || 'eng' + @pdf_txt_opts = options[:pdf_opts] || '' end end -end \ No newline at end of file +end diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 381f85c..2f7411e 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -53,5 +53,9 @@ def test_name_escaping_while_extracting_text Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 2 end - + + def test_name_escaping_while_extracting_text_with_pdf_opts + Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', {:pages => 'all', :output => OUTPUT, :pdf_opts => '-raw'}) + assert Dir["#{OUTPUT}/*.txt"].length == 2 + end end