From 2835a5a3eff35a02d97cb625c6fd71a8bde2f535 Mon Sep 17 00:00:00 2001
From: mnyrop <marii@nyu.edu>
Date: Mon, 14 Oct 2024 13:38:56 -0400
Subject: [PATCH] use centralized rake tasks from og_tasks repo

---
 Gemfile             |   3 +-
 Gemfile.lock        |   9 +++-
 Rakefile            |  17 ++-----
 lib/tasks/pdfs.rake | 109 --------------------------------------------
 lib/tasks/s3.rake   |  62 -------------------------
 5 files changed, 12 insertions(+), 188 deletions(-)
 delete mode 100644 lib/tasks/pdfs.rake
 delete mode 100644 lib/tasks/s3.rake

diff --git a/Gemfile b/Gemfile
index 237b67c..8b15fdd 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,11 +1,10 @@
 source 'https://rubygems.org'
 
 gem 'aperitiiif', github: 'migrants-and-the-state/aperitiiif-cli', branch: 'main'
-# gem 'aperitiiif', path: 'aperitiiif-cli'
+gem 'og_tasks', github: 'migrants-and-the-state/og_tasks', branch: 'main'
 
 group :development do
   gem 'aws-sdk-s3', '~> 1'
   gem 'dotenv'
   gem 'pdf-reader'
-  gem 'rake'
 end
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
index 6433347..3c03aeb 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -16,6 +16,13 @@ GIT
       safe_yaml
       thor
 
+GIT
+  remote: https://github.com/migrants-and-the-state/og_tasks.git
+  revision: 6c404a92eb070ff5cae56be9cbac387e1de077fa
+  branch: main
+  specs:
+    og_tasks (0.1.0)
+
 GEM
   remote: https://rubygems.org/
   specs:
@@ -145,8 +152,8 @@ DEPENDENCIES
   aperitiiif!
   aws-sdk-s3 (~> 1)
   dotenv
+  og_tasks!
   pdf-reader
-  rake
 
 BUNDLED WITH
    2.5.16
diff --git a/Rakefile b/Rakefile
index d92a529..c9004b1 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,15 +1,4 @@
-require 'fileutils'
-require 'yaml'
-
-
-CONFIG          = YAML.load_file 'config.yml'
-RW_DIR          = File.dirname CONFIG['source_dir']
-PDF_DIR         = File.join RW_DIR, 'pdfs'
-JPG_DIR         = File.join RW_DIR, 'jpgs'
-ANUM_TXT_FILE   = File.join RW_DIR, 'anumbers.txt'
-AFILES_CSV_FILE = CONFIG.dig 'records', 'file'
-
-Dir.glob("lib/tasks/*.rake").each { |r| load r }
-
-
+require 'og_tasks'
 
+spec = Gem::Specification.find_by_name 'og_tasks'
+Dir.glob("#{spec.gem_dir}/lib/**/*.rake").each { |r| load r }
\ No newline at end of file
diff --git a/lib/tasks/pdfs.rake b/lib/tasks/pdfs.rake
deleted file mode 100644
index 9777bf8..0000000
--- a/lib/tasks/pdfs.rake
+++ /dev/null
@@ -1,109 +0,0 @@
-require 'csv'
-require 'pdf-reader'
-require 'vips'
-
-def records
-  @records ||= CSV.open(AFILES_CSV_FILE, headers: :first_row).map(&:to_h)
-end
-
-def records=(records)
-  @records = records
-end
-
-def records_hash 
-  @records_hash ||= pickle(records)
-end
-
-def records_hash=(records_hash)
-  @records_hash = records_hash
-end
-
-def pdf_paths
-  @pdfs ||= Dir.glob("#{PDF_DIR}/*.pdf")
-end
-
-def infer_anum(pdf_path)
-  base = File.basename(pdf_path, '.pdf')
-  anum = base.sub('_redacted', '').sub('_withdrawal', '')
-  anum
-end
-
-def pickle(array) 
-  array.map { |r| { r['id'].strip => r } }.inject(:merge)
-end
-
-def unpickle(hash)
-  hash.map { |_k, value| value }.sort_by! { |r| r['id']}
-end
-
-def write_to_csv(data, file)
-  CSV.open(file, "wb") do |csv|
-    csv << data.first.keys
-    data.each do |hash|
-      csv << hash.values
-    end
-  end
-end
-
-def deduce_page_count(pdf_path)
-  GC.start
-  PDF::Reader.new(pdf_path).page_count
-end
-
-
-namespace :pdfs do 
-  desc 'spit out txt list of anums inferred from pdfs'
-  task :anum_txt do
-    File.open(ANUM_TXT_FILE, "w") do |file| 
-      pdf_paths.map { |path| file.puts infer_anum(path) }
-    end
-    puts "Done ✓"
-  end
-
-  desc 'add page count to csv'
-  task :page_count_csv do 
-    pdf_paths.each_with_index do |path, i|
-      anum = infer_anum path
-
-      next puts "skipping #{anum}" unless records_hash.dig(anum, 'page_count').nil?
-      
-      page_count = deduce_page_count path
-      raise "no anum #{anum} found in hash!!!" unless records_hash.key? anum
-      puts "#{anum}: #{page_count} pages"
-
-      records_hash[anum]['page_count'] = page_count
-      write_to_csv(unpickle(records_hash), AFILES_CSV_FILE)
-    end
-  end
-  
-  desc 'split pdfs to jpgs'
-  task :split_jpgs do
-    FileUtils.mkdir_p JPG_DIR
-
-    pdf_paths.each_with_index do |path, i|
-      anum        = infer_anum path
-      page_count  = Integer(records_hash.dig(anum, 'page_count') || deduce_page_count(path))
-      dir         = File.join JPG_DIR, anum
-     
-      FileUtils.mkdir_p dir
-    
-      (0..page_count - 1).each do |index|
-        page_num    = index.to_s.rjust(4, "0")
-        page_id     = "#{anum}_#{page_num}"
-        target      = File.join dir, "#{page_num}.jpg"
-
-        next if File.file? target
-  
-        img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300
-        img = img.thumbnail_image(2500, height: 10000000) if (img.width > 2500)
-        img.jpegsave target
-        
-        print "writing #{anum} page #{index} / #{page_count}\r"
-        $stdout.flush
-      end
-      
-      puts "finished pdf #{i+1}/#{pdf_paths.length} — process is #{(i.to_f / pdf_paths.length.to_f * 100.0).round(1)}% complete    \n"
-    end
-    puts "Done ✓"
-  end
-end
\ No newline at end of file
diff --git a/lib/tasks/s3.rake b/lib/tasks/s3.rake
deleted file mode 100644
index a7f9230..0000000
--- a/lib/tasks/s3.rake
+++ /dev/null
@@ -1,62 +0,0 @@
-require 'aws-sdk-s3'
-require 'dotenv'
-
-TIF_DIR     = './build/image/'
-JSON_DIR    = './build/presentation/'
-
-Dotenv.load
-
-def credentials
-  @credentials ||= Aws::Credentials.new ENV['ACCESS_KEY_ID'], ENV['SECRET_ACCESS_KEY']
-end 
-
-def s3
-  @s3 ||= Aws::S3::Client.new(region: ENV['REGION'], credentials: credentials)
-end
-
-namespace :s3 do
-  namespace :push do
-    desc 'sync local tifs to s3'
-    task :tifs do
-      Dir.glob("#{TIF_DIR}/*.tif").each do |path|
-        key = File.basename path
-        s3.put_object({
-          bucket: ENV['IMAGE_BUCKET_NAME'],
-          key: key,
-          content_type: 'image/tiff',
-          content_disposition: 'inline',
-          acl: 'public-read',
-          body: File.read(path)
-        })
-        puts "uploaded #{key}"
-      end
-    end
-
-    desc 'sync local json to s3'
-    task :json do
-      Dir.glob("#{JSON_DIR}/**/*.json").each do |path|
-      key = path.sub JSON_DIR, ''
-      s3.put_object({
-        bucket: ENV['PRESENTATION_BUCKET_NAME'],
-        key: key,
-        content_type: 'application/json',
-        content_disposition: 'inline',
-        acl: 'public-read',
-        body: File.read(path)
-      })
-      puts "uploaded #{key}"
-    end
-    end
-  end
-  namespace :clobber do
-    desc 'clears out og tifs in s3 bucket'
-    task :tifs do
-      puts 'TO DO'
-    end
-
-    desc 'clears out og json in s3 bucket'
-    task :json do
-      puts 'TO DO'
-    end
-  end
-end
\ No newline at end of file