From e7a53d98cb5d2de57ca700de23a8cacd00e9dc84 Mon Sep 17 00:00:00 2001
From: "Timo B. Kranz" <me@tbk.name>
Date: Tue, 22 Sep 2015 11:18:22 +0200
Subject: [PATCH 1/7] Clean pdffonts output to avoid invalid UTF-8 characters

---
 lib/docsplit/text_extractor.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 985abdd..2edba91 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -46,7 +46,7 @@ def extract(pdfs, opts)
     # Does a PDF have any text embedded?
     def contains_text?(pdf)
       fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
-      !fonts.match(NO_TEXT_DETECTED)
+      !fonts.scrub.match(NO_TEXT_DETECTED)
     end
 
     # Extract a page range worth of text from a PDF, directly.

From bc41d6ed2af8390dec12a4432c60e6b7c541fc1c Mon Sep 17 00:00:00 2001
From: "Timo B. Kranz" <me@tbk.name>
Date: Wed, 6 Feb 2019 13:52:31 +0100
Subject: [PATCH 2/7] Add timeout to OCR

---
 lib/docsplit/text_extractor.rb | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 2edba91..023dbf8 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -21,6 +21,8 @@ class TextExtractor
 
     MIN_TEXT_PER_PAGE = 100 # in bytes
 
+    TIMEOUT = '5m'
+
     def initialize
       @pages_to_ocr = []
     end
@@ -66,7 +68,7 @@ def extract_from_ocr(pdf, pages)
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
@@ -74,7 +76,7 @@ def extract_from_ocr(pdf, pages)
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr

From a6d5f4e7074ac60a0f418312f7b916d6dc3e922c Mon Sep 17 00:00:00 2001
From: "Timo B. Kranz" <me@tbk.name>
Date: Wed, 6 Feb 2019 14:24:00 +0100
Subject: [PATCH 3/7] Raise upon failed image conversion

---
 lib/docsplit/text_extractor.rb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 023dbf8..adef04d 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -69,6 +69,7 @@ def extract_from_ocr(pdf, pages)
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
@@ -78,6 +79,7 @@ def extract_from_ocr(pdf, pages)
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end

From 1c2ff70a6ff85df4d0e6422351bd40621d33e9b0 Mon Sep 17 00:00:00 2001
From: "Timo B. Kranz" <tbk@42ls.de>
Date: Wed, 3 Mar 2021 18:13:00 +0100
Subject: [PATCH 4/7] Update MAGICK_TMPDIR

---
 lib/docsplit/text_extractor.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index adef04d..042935e 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -68,7 +68,7 @@ def extract_from_ocr(pdf, pages)
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
           raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
@@ -77,7 +77,7 @@ def extract_from_ocr(pdf, pages)
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
         raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"

From fba35bc1b8d72a577d0f5fdaccf4bd616a211317 Mon Sep 17 00:00:00 2001
From: "Timo B. Kranz" <tbk@42ls.de>
Date: Wed, 3 Mar 2021 18:13:23 +0100
Subject: [PATCH 5/7] Update image_extractor.rb

---
 lib/docsplit/image_extractor.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..7916916 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -37,12 +37,12 @@ def convert(pdf, size, format, previous=nil)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        result = `MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          cmd = "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

From e8e8941958c752bea61049fb980254758702feb5 Mon Sep 17 00:00:00 2001
From: "Timo B. Kranz" <me@tbk.name>
Date: Fri, 26 May 2023 14:03:20 +0200
Subject: [PATCH 6/7] Ruby 3 compatibility

---
 lib/docsplit/image_extractor.rb | 4 ++--
 lib/docsplit/page_extractor.rb  | 4 ++--
 lib/docsplit/pdf_extractor.rb   | 8 ++++----
 lib/docsplit/text_extractor.rb  | 8 ++++----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 7916916..8c1dc4d 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -33,7 +33,7 @@ def convert(pdf, size, format, previous=nil)
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
@@ -48,7 +48,7 @@ def convert(pdf, size, format, previous=nil)
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 
diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
index 145c980..0216ec5 100644
--- a/lib/docsplit/page_extractor.rb
+++ b/lib/docsplit/page_extractor.rb
@@ -10,7 +10,7 @@ def extract(pdfs, opts)
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
+        FileUtils.mkdir_p @output unless File.exist?(@output)
         
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
@@ -18,7 +18,7 @@ def extract(pdfs, opts)
           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0
         result
       end
diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
index 21861e2..86ce98b 100644
--- a/lib/docsplit/pdf_extractor.rb
+++ b/lib/docsplit/pdf_extractor.rb
@@ -78,7 +78,7 @@ def office_executable
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
       
@@ -95,11 +95,11 @@ def office_executable
       # Search for the first suitable office executable
       # and short circuit an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
@@ -116,7 +116,7 @@ def office_path
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 042935e..d03e3e7 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -30,7 +30,7 @@ def initialize
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
@@ -69,7 +69,7 @@ def extract_from_ocr(pdf, pages)
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff
+          raise Docsplit::ExtractionFailed unless File.exist? tiff # check the real path; escaped_tiff is only valid inside shell commands
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
@@ -79,12 +79,12 @@ def extract_from_ocr(pdf, pages)
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        raise Docsplit::ExtractionFailed unless File.exists? escaped_tiff
+        raise Docsplit::ExtractionFailed unless File.exist? tiff # check the real path; escaped_tiff is only valid inside shell commands
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 

From 393629668a67735f955dd036db45bca71e62ea91 Mon Sep 17 00:00:00 2001
From: Martin Wilhelmi <martin@wilhelmi.software>
Date: Wed, 23 Apr 2025 11:35:05 +0200
Subject: [PATCH 7/7] Use qpdf instead of pdftk

---
 lib/docsplit.rb                | 11 ++++++++++-
 lib/docsplit/page_extractor.rb |  4 +++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 1c49e91..eadc96b 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -16,7 +16,16 @@ module Docsplit
   
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
+  DEPENDENCIES  = {
+    :java => false,
+    :gm => false,
+    :qpdf => false,
+    :pdftotext => false,
+    :pdftk => false,
+    :pdftailor => false,
+    :tesseract => false,
+    :osd => false
+  }
 
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
index 0216ec5..6d5d959 100644
--- a/lib/docsplit/page_extractor.rb
+++ b/lib/docsplit/page_extractor.rb
@@ -12,7 +12,9 @@ def extract(pdfs, opts)
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
         FileUtils.mkdir_p @output unless File.exist?(@output)
         
-        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
+        cmd = if DEPENDENCIES[:qpdf] # prefer qpdf; pdftailor and pdftk are kept as fallbacks for backwards compatibility
+          "qpdf --split-pages #{ESCAPE[pdf]} #{page_path} 2>&1" # NOTE(review): qpdf appears to zero-pad the %d page number, unlike pdftk burst - confirm output file names
+        elsif DEPENDENCIES[:pdftailor] # fall back to pdftailor when qpdf is not available; pdftk is the last resort
           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"