From bffa8a4f05c6fbc8e78febbbb947c4a4db091861 Mon Sep 17 00:00:00 2001 From: Sanjiv Jha Date: Mon, 14 Jul 2014 12:17:05 +0530 Subject: [PATCH 1/2] added pdftotext options --- README | 8 +++++++- lib/docsplit/text_extractor.rb | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/README b/README index 34ce202..00b0154 100755 --- a/README +++ b/README @@ -14,9 +14,15 @@ Installation: gem install docsplit + Added option: + pdf_opts: which can be used to pass pdftotext command-line options through the docsplit gem + For example: + Passing the -raw option to pdftotext: + Docsplit.extract_text(path, {:pdf_opts => '-raw'}) + For documentation, usage, and examples, see: http://documentcloud.github.com/docsplit/ To suggest a feature or report a bug: http://github.com/documentcloud/docsplit/issues/ - + diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..89363b1 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -103,14 +103,22 @@ def run(command) # Extract the full contents of a pdf as a single file, directly. def extract_full(pdf) text_path = File.join(@output, "#{@pdf_name}.txt") - run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + unless @pdf_txt_opts.empty? + run "pdftotext -enc UTF-8 #{@pdf_txt_opts} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + else + run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + end end # Extract the contents of a single page of text, directly, adding it to # the `@pages_to_ocr` list if the text length is inadequate. def extract_page(pdf, page) text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") - run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + unless @pdf_txt_opts.empty? 
+ run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{@pdf_txt_opts} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + else + run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + end unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE end @@ -123,8 +131,9 @@ def extract_options(options) @forbid_ocr = options[:ocr] == false @clean_ocr = !(options[:clean] == false) @language = options[:language] || 'eng' + @pdf_txt_opts = options[:pdf_opts] || '' end end -end \ No newline at end of file +end From ec3f0ed32a7bad0969444ed9a1129bb2886fb268 Mon Sep 17 00:00:00 2001 From: Sanjiv Jha Date: Fri, 25 Jul 2014 19:44:48 +0530 Subject: [PATCH 2/2] added the test case with pdftotext options --- test/unit/test_extract_text.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 381f85c..2f7411e 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -53,5 +53,9 @@ def test_name_escaping_while_extracting_text Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 2 end - + + def test_name_escaping_while_extracting_text_with_pdf_opts + Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', {:pages => 'all', :output => OUTPUT, :pdf_opts => '-raw'}) + assert Dir["#{OUTPUT}/*.txt"].length == 2 + end end