From 2835a5a3eff35a02d97cb625c6fd71a8bde2f535 Mon Sep 17 00:00:00 2001 From: mnyrop Date: Mon, 14 Oct 2024 13:38:56 -0400 Subject: [PATCH] use centralized tasks from repo --- Gemfile | 3 +- Gemfile.lock | 9 +++- Rakefile | 17 ++----- lib/tasks/pdfs.rake | 109 -------------------------------------------- lib/tasks/s3.rake | 62 ------------------------- 5 files changed, 12 insertions(+), 188 deletions(-) delete mode 100644 lib/tasks/pdfs.rake delete mode 100644 lib/tasks/s3.rake diff --git a/Gemfile b/Gemfile index 237b67c..8b15fdd 100644 --- a/Gemfile +++ b/Gemfile @@ -1,11 +1,10 @@ source 'https://rubygems.org' gem 'aperitiiif', github: 'migrants-and-the-state/aperitiiif-cli', branch: 'main' -# gem 'aperitiiif', path: 'aperitiiif-cli' +gem 'og_tasks', github: 'migrants-and-the-state/og_tasks', branch: 'main' group :development do gem 'aws-sdk-s3', '~> 1' gem 'dotenv' gem 'pdf-reader' - gem 'rake' end \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 6433347..3c03aeb 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -16,6 +16,13 @@ GIT safe_yaml thor +GIT + remote: https://github.com/migrants-and-the-state/og_tasks.git + revision: 6c404a92eb070ff5cae56be9cbac387e1de077fa + branch: main + specs: + og_tasks (0.1.0) + GEM remote: https://rubygems.org/ specs: @@ -145,8 +152,8 @@ DEPENDENCIES aperitiiif! aws-sdk-s3 (~> 1) dotenv + og_tasks! pdf-reader - rake BUNDLED WITH 2.5.16 diff --git a/Rakefile b/Rakefile index d92a529..c9004b1 100644 --- a/Rakefile +++ b/Rakefile @@ -1,15 +1,4 @@ -require 'fileutils' -require 'yaml' - - -CONFIG = YAML.load_file 'config.yml' -RW_DIR = File.dirname CONFIG['source_dir'] -PDF_DIR = File.join RW_DIR, 'pdfs' -JPG_DIR = File.join RW_DIR, 'jpgs' -ANUM_TXT_FILE = File.join RW_DIR, 'anumbers.txt' -AFILES_CSV_FILE = CONFIG.dig 'records', 'file' - -Dir.glob("lib/tasks/*.rake").each { |r| load r } - - +require 'og_tasks' +spec = Gem::Specification.find_by_name 'og_tasks' +Dir.glob("#{spec.gem_dir}/lib/**/*.rake").each { |r| load r } \ No newline at end of file diff --git a/lib/tasks/pdfs.rake b/lib/tasks/pdfs.rake deleted file mode 100644 index 9777bf8..0000000 --- a/lib/tasks/pdfs.rake +++ /dev/null @@ -1,109 +0,0 @@ -require 'csv' -require 'pdf-reader' -require 'vips' - -def records - @records ||= CSV.open(AFILES_CSV_FILE, headers: :first_row).map(&:to_h) -end - -def records=(records) - @records = records -end - -def records_hash - @records_hash ||= pickle(records) -end - -def records_hash=(records_hash) - @records_hash = records_hash -end - -def pdf_paths - @pdfs ||= Dir.glob("#{PDF_DIR}/*.pdf") -end - -def infer_anum(pdf_path) - base = File.basename(pdf_path, '.pdf') - anum = base.sub('_redacted', '').sub('_withdrawal', '') - anum -end - -def pickle(array) - array.map { |r| { r['id'].strip => r } }.inject(:merge) -end - -def unpickle(hash) - hash.map { |_k, value| value }.sort_by! { |r| r['id']} -end - -def write_to_csv(data, file) - CSV.open(file, "wb") do |csv| - csv << data.first.keys - data.each do |hash| - csv << hash.values - end - end -end - -def deduce_page_count(pdf_path) - GC.start - PDF::Reader.new(pdf_path).page_count -end - - -namespace :pdfs do - desc 'spit out txt list of anums inferred from pdfs' - task :anum_txt do - File.open(ANUM_TXT_FILE, "w") do |file| - pdf_paths.map { |path| file.puts infer_anum(path) } - end - puts "Done ✓" - end - - desc 'add page count to csv' - task :page_count_csv do - pdf_paths.each_with_index do |path, i| - anum = infer_anum path - - next puts "skipping #{anum}" unless records_hash.dig(anum, 'page_count').nil? - - page_count = deduce_page_count path - raise "no anum #{anum} found in hash!!!" unless records_hash.key? anum - puts "#{anum}: #{page_count} pages" - - records_hash[anum]['page_count'] = page_count - write_to_csv(unpickle(records_hash), AFILES_CSV_FILE) - end - end - - desc 'split pdfs to jpgs' - task :split_jpgs do - FileUtils.mkdir_p JPG_DIR - - pdf_paths.each_with_index do |path, i| - anum = infer_anum path - page_count = Integer(records_hash.dig(anum, 'page_count') || deduce_page_count(path)) - dir = File.join JPG_DIR, anum - - FileUtils.mkdir_p dir - - (0..page_count - 1).each do |index| - page_num = index.to_s.rjust(4, "0") - page_id = "#{anum}_#{page_num}" - target = File.join dir, "#{page_num}.jpg" - - next if File.file? target - - img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300 - img = img.thumbnail_image(2500, height: 10000000) if (img.width > 2500) - img.jpegsave target - - print "writing #{anum} page #{index} / #{page_count}\r" - $stdout.flush - end - - puts "finished pdf #{i+1}/#{pdf_paths.length} — process is #{(i.to_f / pdf_paths.length.to_f * 100.0).round(1)}% complete \n" - end - puts "Done ✓" - end -end \ No newline at end of file diff --git a/lib/tasks/s3.rake b/lib/tasks/s3.rake deleted file mode 100644 index a7f9230..0000000 --- a/lib/tasks/s3.rake +++ /dev/null @@ -1,62 +0,0 @@ -require 'aws-sdk-s3' -require 'dotenv' - -TIF_DIR = './build/image/' -JSON_DIR = './build/presentation/' - -Dotenv.load - -def credentials - @credentials ||= Aws::Credentials.new ENV['ACCESS_KEY_ID'], ENV['SECRET_ACCESS_KEY'] -end - -def s3 - @s3 ||= Aws::S3::Client.new(region: ENV['REGION'], credentials: credentials) -end - -namespace :s3 do - namespace :push do - desc 'sync local tifs to s3' - task :tifs do - Dir.glob("#{TIF_DIR}/*.tif").each do |path| - key = File.basename path - s3.put_object({ - bucket: ENV['IMAGE_BUCKET_NAME'], - key: key, - content_type: 'image/tiff', - content_disposition: 'inline', - acl: 'public-read', - body: File.read(path) - }) - puts "uploaded #{key}" - end - end - - desc 'sync local json to s3' - task :json do - Dir.glob("#{JSON_DIR}/**/*.json").each do |path| - key = path.sub JSON_DIR, '' - s3.put_object({ - bucket: ENV['PRESENTATION_BUCKET_NAME'], - key: key, - content_type: 'application/json', - content_disposition: 'inline', - acl: 'public-read', - body: File.read(path) - }) - puts "uploaded #{key}" - end - end - end - namespace :clobber do - desc 'clears out og tifs in s3 bucket' - task :tifs do - puts 'TO DO' - end - - desc 'clears out og json in s3 bucket' - task :json do - puts 'TO DO' - end - end -end \ No newline at end of file