From e7a53d98cb5d2de57ca700de23a8cacd00e9dc84 Mon Sep 17 00:00:00 2001 From: "Timo B. Kranz" Date: Tue, 22 Sep 2015 11:18:22 +0200 Subject: [PATCH 1/7] Clean pdffonts output to avoid invalid UTF-8 characters --- lib/docsplit/text_extractor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 985abdd..2edba91 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -46,7 +46,7 @@ def extract(pdfs, opts) # Does a PDF have any text embedded? def contains_text?(pdf) fonts = `pdffonts #{ESCAPE[pdf]} 2>&1` - !fonts.match(NO_TEXT_DETECTED) + !fonts.scrub.match(NO_TEXT_DETECTED) end # Extract a page range worth of text from a PDF, directly. From bc41d6ed2af8390dec12a4432c60e6b7c541fc1c Mon Sep 17 00:00:00 2001 From: "Timo B. Kranz" Date: Wed, 6 Feb 2019 13:52:31 +0100 Subject: [PATCH 2/7] Add timeout to OCR --- lib/docsplit/text_extractor.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 2edba91..023dbf8 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -21,6 +21,8 @@ class TextExtractor MIN_TEXT_PER_PAGE = 100 # in bytes + TIMEOUT = '5m' + def initialize @pages_to_ocr = [] end @@ -66,7 +68,7 @@ def extract_from_ocr(pdf, pages) tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" + run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff @@ -74,7 +76,7 @@ def 
extract_from_ocr(pdf, pages) else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" + run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" clean_text(base_path + '.txt') if @clean_ocr From a6d5f4e7074ac60a0f418312f7b916d6dc3e922c Mon Sep 17 00:00:00 2001 From: "Timo B. Kranz" Date: Wed, 6 Feb 2019 14:24:00 +0100 Subject: [PATCH 3/7] Raise upon failed image conversion --- lib/docsplit/text_extractor.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 023dbf8..adef04d 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -69,6 +69,7 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" + raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff @@ -78,6 +79,7 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 + raise Docsplit::ExtractionFailed unless File.exists? 
escaped_tiff run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" clean_text(base_path + '.txt') if @clean_ocr end From 1c2ff70a6ff85df4d0e6422351bd40621d33e9b0 Mon Sep 17 00:00:00 2001 From: "Timo B. Kranz" Date: Wed, 3 Mar 2021 18:13:00 +0100 Subject: [PATCH 4/7] Update MAGICK_TMPDIR --- lib/docsplit/text_extractor.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index adef04d..042935e 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -68,7 +68,7 @@ def extract_from_ocr(pdf, pages) tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" + run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" clean_text(file + '.txt') if @clean_ocr @@ -77,7 +77,7 @@ def extract_from_ocr(pdf, pages) else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" + run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 raise Docsplit::ExtractionFailed unless File.exists? 
escaped_tiff run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" From fba35bc1b8d72a577d0f5fdaccf4bd616a211317 Mon Sep 17 00:00:00 2001 From: "Timo B. Kranz" Date: Wed, 3 Mar 2021 18:13:23 +0100 Subject: [PATCH 5/7] Update image_extractor.rb --- lib/docsplit/image_extractor.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..7916916 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -37,12 +37,12 @@ def convert(pdf, size, format, previous=nil) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) - result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp + result = `MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp + cmd = "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 end From e8e8941958c752bea61049fb980254758702feb5 Mon Sep 17 00:00:00 2001 From: "Timo B. 
Kranz" Date: Fri, 26 May 2023 14:03:20 +0200 Subject: [PATCH 6/7] Ruby3 compatibility --- lib/docsplit/image_extractor.rb | 4 ++-- lib/docsplit/page_extractor.rb | 4 ++-- lib/docsplit/pdf_extractor.rb | 8 ++++---- lib/docsplit/text_extractor.rb | 8 ++++---- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 7916916..8c1dc4d 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -33,7 +33,7 @@ def convert(pdf, size, format, previous=nil) directory = directory_for(size) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s escaped_pdf = ESCAPE[pdf] - FileUtils.mkdir_p(directory) unless File.exists?(directory) + FileUtils.mkdir_p(directory) unless File.exist?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) @@ -48,7 +48,7 @@ def convert(pdf, size, format, previous=nil) end end ensure - FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) + FileUtils.remove_entry_secure tempdir if File.exist?(tempdir) end diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb index 145c980..0216ec5 100644 --- a/lib/docsplit/page_extractor.rb +++ b/lib/docsplit/page_extractor.rb @@ -10,7 +10,7 @@ def extract(pdfs, opts) [pdfs].flatten.each do |pdf| pdf_name = File.basename(pdf, File.extname(pdf)) page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf" - FileUtils.mkdir_p @output unless File.exists?(@output) + FileUtils.mkdir_p @output unless File.exist?(@output) cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1" @@ -18,7 +18,7 @@ def extract(pdfs, opts) "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1" end result = `#{cmd}`.chomp - FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt') + 
FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt') raise ExtractionFailed, result if $? != 0 result end diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb index 21861e2..86ce98b 100644 --- a/lib/docsplit/pdf_extractor.rb +++ b/lib/docsplit/pdf_extractor.rb @@ -78,7 +78,7 @@ def office_executable # raise an error if that path isn't valid, otherwise, add # it to the front of our search paths. if ENV['OFFICE_PATH'] - raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH'] + raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH'] paths.unshift(ENV['OFFICE_PATH']) end @@ -95,11 +95,11 @@ def office_executable # Search for the first suitable office executable # and short circuit an executable is found. paths.each do |path| - if File.exists? path + if File.exist? path @@executable ||= path unless File.directory? path path_pieces.each do |pieces| check_path = File.join(path, pieces) - @@executable ||= check_path if File.exists? check_path + @@executable ||= check_path if File.exist? check_path end end break if @@executable @@ -116,7 +116,7 @@ def office_path # Convert documents to PDF. def extract(docs, opts) out = opts[:output] || '.' - FileUtils.mkdir_p out unless File.exists?(out) + FileUtils.mkdir_p out unless File.exist?(out) [docs].flatten.each do |doc| ext = File.extname(doc) basename = File.basename(doc, ext) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 042935e..d03e3e7 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -30,7 +30,7 @@ def initialize # Extract text from a list of PDFs. def extract(pdfs, opts) extract_options opts - FileUtils.mkdir_p @output unless File.exists?(@output) + FileUtils.mkdir_p @output unless File.exist?(@output) [pdfs].flatten.each do |pdf| @pdf_name = File.basename(pdf, File.extname(pdf)) pages = (@pages == 'all') ? 
1..Docsplit.extract_length(pdf) : @pages @@ -69,7 +69,7 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff + raise Docsplit::ExtractionFailed unless File.exist? escaped_tiff run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff @@ -79,12 +79,12 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff + raise Docsplit::ExtractionFailed unless File.exist? 
escaped_tiff run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" clean_text(base_path + '.txt') if @clean_ocr end ensure - FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) + FileUtils.remove_entry_secure tempdir if File.exist?(tempdir) end From 393629668a67735f955dd036db45bca71e62ea91 Mon Sep 17 00:00:00 2001 From: Martin Wilhelmi Date: Wed, 23 Apr 2025 11:35:05 +0200 Subject: [PATCH 7/7] Use qpdf instead of pdftk --- lib/docsplit.rb | 11 ++++++++++- lib/docsplit/page_extractor.rb | 4 +++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 1c49e91..eadc96b 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -16,7 +16,16 @@ module Docsplit GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"] - DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false} + DEPENDENCIES = { + :java => false, + :gm => false, + :qpdf => false, + :pdftotext => false, + :pdftk => false, + :pdftailor => false, + :tesseract => false, + :osd => false + } # Check for all dependencies, and note their absence. 
dirs = ENV['PATH'].split(File::PATH_SEPARATOR) diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb index 0216ec5..6d5d959 100644 --- a/lib/docsplit/page_extractor.rb +++ b/lib/docsplit/page_extractor.rb @@ -12,7 +12,9 @@ def extract(pdfs, opts) page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf" FileUtils.mkdir_p @output unless File.exist?(@output) - cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability + cmd = if DEPENDENCIES[:qpdf] # prefer qpdf, but keep pdftailor and pdftk for backwards compatibility + "qpdf --split-pages #{ESCAPE[pdf]} #{page_path} 2>&1" + elsif DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1" else "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"