diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..7cc1ceb
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,17 @@
+require 'fileutils'
+require 'yaml'
+
+
+CONFIG = YAML.load_file 'config.yml'
+RW_DIR = File.dirname CONFIG['source_dir']
+PDF_DIR = File.join RW_DIR, 'pdfs'
+JPG_DIR = File.join RW_DIR, 'jpgs'
+AP_SRC_DIR = './src'
+ANUM_TXT_FILE = File.join RW_DIR, 'anumbers.txt'
+AFILES_CSV_FILE = File.join AP_SRC_DIR, 'afiles.csv'
+PAGES_CSV_FILE = File.join AP_SRC_DIR, 'pages.csv'
+
+Dir.glob("lib/tasks/*.rake").each { |r| load r }
+
+
+
diff --git a/lib/split-pdfs-populate-csv.rb b/lib/split-pdfs-populate-csv.rb
deleted file mode 100644
index 35be7dc..0000000
--- a/lib/split-pdfs-populate-csv.rb
+++ /dev/null
@@ -1,58 +0,0 @@
-require 'fileutils'
-require 'pdf-reader'
-require 'vips'
-
-pdf_dir = '/Volumes/migrants_state/OG-2024-KC-NARA/pdfs'
-data_dir = '/Volumes/migrants_state/OG-2024-KC-NARA/jpgs'
-pdfs = Dir.glob("#{pdf_dir}/*.pdf")
-pdfs_count = pdfs.length
-
-afiles_csv = './src/afiles.csv'
-pages_csv = './src/pages.csv'
-
-
-# write start of csvs
-File.open(afiles_csv, 'w') do |file|
-  file.puts("id,label,og_pdf_id,page_count")
-end
-
-# write start of csvs
-File.open(pages_csv, 'w') do |file|
-  file.puts("doc_id,label,a_number,page_number,extracted_text")
-end
-
-FileUtils.mkdir_p data_dir
-
-# # process data
-pdfs.each_with_index do |path, i|
-  GC.start
-  reader = PDF::Reader.new(path)
-  page_count = reader.page_count
-  og_pdf_id = File.basename(path, '.pdf')
-  a_number = og_pdf_id.sub('_redacted', '').sub('_withdrawal', '')
-  target_dir = "#{data_dir}/#{a_number}"
-  afile_data = [a_number,a_number,og_pdf_id,page_count]
-
-  File.open(afiles_csv, 'a') { |file| file.puts afile_data.join(',') }
-  FileUtils.mkdir_p(target_dir)
-
-  (0..page_count - 1).each do |index|
-    page_number = index.to_s.rjust(4, "0")
-    doc_id = "#{a_number}_#{page_number}"
-    target = "#{target_dir}/#{page_number}.jpg"
-    extracted_text = reader.pages[index].text.to_s.gsub(/\R+/, "|").gsub('"', "'")
-    doc_data = [doc_id,doc_id,a_number,page_number,"\"#{extracted_text}\""]
-
-    File.open(pages_csv, "a") { |file| file.puts doc_data.join(',') }
-
-    # return if File.exist? target
-
-    img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300
-    img.jpegsave target
-
-    print "writing #{File.basename target} page #{index} / #{page_count}\r"
-    $stdout.flush
-  end
-
-  puts "finished pdf #{i+1}/#{pdfs_count} — process is #{(i.to_f / pdfs_count.to_f * 100.0).round(1)}% complete \n"
-end
diff --git a/lib/sync-aws-local-json.rb b/lib/sync-aws-local-json.rb
deleted file mode 100644
index bf0bb7e..0000000
--- a/lib/sync-aws-local-json.rb
+++ /dev/null
@@ -1,21 +0,0 @@
-require 'aws-sdk-s3'
-require 'dotenv'
-
-Dotenv.load
-
-credentials = Aws::Credentials.new ENV['ACCESS_KEY_ID'], ENV['SECRET_ACCESS_KEY']
-s3 = Aws::S3::Client.new(region: ENV['REGION'], credentials: credentials)
-json_docs = Dir.glob("./build/presentation/**/*.json")
-
-json_docs.each do |file|
-  key = file.sub './build/presentation/', ''
-  s3.put_object({
-    bucket: ENV['PRESENTATION_BUCKET_NAME'],
-    key: key,
-    content_type: 'application/json',
-    content_disposition: 'inline',
-    acl: 'public-read',
-    body: File.read(file)
-  })
-  puts "uploaded #{key}"
-end
diff --git a/lib/sync-aws-local-tifs.rb b/lib/sync-aws-local-tifs.rb
deleted file mode 100644
index 2a3d870..0000000
--- a/lib/sync-aws-local-tifs.rb
+++ /dev/null
@@ -1,21 +0,0 @@
-require 'aws-sdk-s3'
-require 'dotenv'
-
-Dotenv.load
-
-credentials = Aws::Credentials.new ENV['ACCESS_KEY_ID'], ENV['SECRET_ACCESS_KEY']
-s3 = Aws::S3::Client.new(region: ENV['REGION'], credentials: credentials)
-images = Dir.glob("./build/image/*.tif")
-
-images.each do |file|
-  key = File.basename(file)
-  s3.put_object({
-    bucket: ENV['IMAGE_BUCKET_NAME'],
-    key: key,
-    content_type: 'image/tiff',
-    content_disposition: 'inline',
-    acl: 'public-read',
-    body: File.read(file)
-  })
-  puts "uploaded #{key}"
-end
diff --git a/lib/tasks/pdfs.rake b/lib/tasks/pdfs.rake
new file mode 100644
index 0000000..443bf0f
--- /dev/null
+++ b/lib/tasks/pdfs.rake
@@ -0,0 +1,61 @@
+require 'pdf-reader'
+require 'vips'
+
+def pdf_paths
+  @pdfs ||= Dir.glob("#{PDF_DIR}/*.pdf")
+end
+
+def infer_anum(pdf_path)
+  base = File.basename(pdf_path, '.pdf')
+  anum = base.sub('_redacted', '').sub('_withdrawal', '')
+  anum
+end
+
+namespace :pdfs do
+  desc 'spit out txt list of anums inferred from pdfs'
+  task :anum_txt do
+    File.open(ANUM_TXT_FILE, "w") do |file|
+      pdf_paths.map { |path| file.puts infer_anum(path) }
+    end
+    puts "Done ✓"
+  end
+
+  desc 'split pdfs to jpgs, capture results in csvs'
+  task :jpg_csv do
+    File.open(AFILES_CSV_FILE, 'w') { |file| file.puts("id,label,og_pdf_id,page_count") }
+    File.open(PAGES_CSV_FILE, 'w') { |file| file.puts("id,label,a_number,page_number,extracted_text") }
+    FileUtils.mkdir_p JPG_DIR
+
+    pdf_paths.each_with_index do |path, i|
+      GC.start
+      reader = PDF::Reader.new path
+      page_count = reader.page_count
+      anum = infer_anum path
+      dir = File.join JPG_DIR, anum
+      pdf_data = [anum,anum,File.basename(path, '.pdf'),page_count]
+
+      File.open(AFILES_CSV_FILE, 'a') { |f| f.puts pdf_data.join(',') }
+      FileUtils.mkdir_p dir
+
+      (0..page_count - 1).each do |index|
+        page_num = index.to_s.rjust(4, "0")
+        page_id = "#{anum}_#{page_num}"
+        target = File.join dir, "#{page_num}.jpg"
+        text = reader.pages[index].text.to_s.gsub(/\R+/, "|").gsub('"', "'")
+        page_data = [page_id,page_id,anum,page_num,"\"#{text}\""]
+
+        File.open(PAGES_CSV_FILE, "a") { |f| f.puts page_data.join(',') }
+
+        img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300
+        img = img.thumbnail_image(2500, height: 10000000) if (img.width > 2500)
+        img.jpegsave target
+
+        print "writing #{anum} page #{index} / #{page_count}\r"
+        $stdout.flush
+      end
+
+      puts "finished pdf #{i+1}/#{pdf_paths.length} — process is #{(i.to_f / pdf_paths.length.to_f * 100.0).round(1)}% complete \n"
+    end
+    puts "Done ✓"
+  end
+end
\ No newline at end of file
diff --git a/lib/tasks/s3.rake b/lib/tasks/s3.rake
new file mode 100644
index 0000000..a7f9230
--- /dev/null
+++ b/lib/tasks/s3.rake
@@ -0,0 +1,62 @@
+require 'aws-sdk-s3'
+require 'dotenv'
+
+TIF_DIR = './build/image/'
+JSON_DIR = './build/presentation/'
+
+Dotenv.load
+
+def credentials
+  @credentials ||= Aws::Credentials.new ENV['ACCESS_KEY_ID'], ENV['SECRET_ACCESS_KEY']
+end
+
+def s3
+  @s3 ||= Aws::S3::Client.new(region: ENV['REGION'], credentials: credentials)
+end
+
+namespace :s3 do
+  namespace :push do
+    desc 'sync local tifs to s3'
+    task :tifs do
+      Dir.glob("#{TIF_DIR}/*.tif").each do |path|
+        key = File.basename path
+        s3.put_object({
+          bucket: ENV['IMAGE_BUCKET_NAME'],
+          key: key,
+          content_type: 'image/tiff',
+          content_disposition: 'inline',
+          acl: 'public-read',
+          body: File.read(path)
+        })
+        puts "uploaded #{key}"
+      end
+    end
+
+    desc 'sync local json to s3'
+    task :json do
+      Dir.glob("#{JSON_DIR}/**/*.json").each do |path|
+        key = path.sub JSON_DIR, ''
+        s3.put_object({
+          bucket: ENV['PRESENTATION_BUCKET_NAME'],
+          key: key,
+          content_type: 'application/json',
+          content_disposition: 'inline',
+          acl: 'public-read',
+          body: File.read(path)
+        })
+        puts "uploaded #{key}"
+      end
+    end
+  end
+  namespace :clobber do
+    desc 'clears out og tifs in s3 bucket'
+    task :tifs do
+      puts 'TO DO'
+    end
+
+    desc 'clears out og json in s3 bucket'
+    task :json do
+      puts 'TO DO'
+    end
+  end
+end
\ No newline at end of file
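Note on the two s3:clobber stubs above (both currently just puts 'TO DO'): the diff does not include an implementation, but a minimal sketch of one possible approach follows. It assumes the s3 client helper and ENV bucket names defined in lib/tasks/s3.rake; the clobber_bucket helper and its batching are assumptions, not anything this change ships.

# Hypothetical sketch, not part of this diff: drains a bucket in pages of
# up to 1000 keys (the delete_objects per-request limit), reusing the
# s3 helper defined in lib/tasks/s3.rake.
def clobber_bucket(bucket)
  loop do
    resp = s3.list_objects_v2(bucket: bucket, max_keys: 1000)
    break if resp.contents.empty?
    s3.delete_objects(
      bucket: bucket,
      delete: { objects: resp.contents.map { |obj| { key: obj.key } } }
    )
    puts "deleted #{resp.contents.length} objects from #{bucket}"
  end
end

namespace :s3 do
  namespace :clobber do
    desc 'clears out og tifs in s3 bucket'
    task :tifs do
      clobber_bucket ENV['IMAGE_BUCKET_NAME']
    end

    desc 'clears out og json in s3 bucket'
    task :json do
      clobber_bucket ENV['PRESENTATION_BUCKET_NAME']
    end
  end
end

With the new Rakefile loading lib/tasks/*.rake, the tasks are invoked the usual way, e.g. rake pdfs:anum_txt, rake pdfs:jpg_csv, rake s3:push:tifs, rake s3:push:json.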