refactor: Update TextExtractions and Split to use folder structure
motizuki committed Aug 27, 2024
1 parent 2c5025e commit 141838e
Showing 6 changed files with 75 additions and 75 deletions.
6 changes: 2 additions & 4 deletions app/sidekiq/file_extraction_worker.rb
@@ -71,10 +71,8 @@ def setup_tmp_directory
end

def move_extracted_documents_into_tmp_directory
-    Dir.children(@extraction_folder).each do |file|
-      next if file == 'tmp'
-
-      FileUtils.move("#{@extraction_folder}/#{file}", "#{@tmp_directory}/#{file}")
+    Dir.glob("#{@extraction_folder}/**/*.*").each do |file|
+      FileUtils.move(file, "#{@tmp_directory}/#{File.basename(file)}")
end
end

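For context on the worker change above: the old loop only looked at the top-level children of the extraction folder (skipping tmp), while the new call globs recursively, so documents sitting inside the numbered subfolders are found and flattened into the tmp directory. A minimal standalone sketch of that behaviour, using hypothetical paths that are not part of this commit:

require 'fileutils'

# Hypothetical layout, for illustration only:
#   extractions/123/1/page__-__000000001.json
#   extractions/123/2/page__-__000000101.json
extraction_folder = 'extractions/123'
tmp_directory     = "#{extraction_folder}/tmp"
FileUtils.mkdir_p(tmp_directory)

# "**/*.*" matches files with an extension at any depth, so documents inside the
# numbered folders are picked up; the bare folders themselves are not matched.
Dir.glob("#{extraction_folder}/**/*.*").each do |file|
  # File.basename drops the numbered-folder prefix, flattening everything into tmp/.
  FileUtils.move(file, "#{tmp_directory}/#{File.basename(file)}")
end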
3 changes: 2 additions & 1 deletion app/sidekiq/split_worker.rb
@@ -15,12 +15,13 @@ def process_extracted_documents
def create_document(records, saved_response)
page_str = format('%09d', @page)[-9..]
name_str = @extraction_definition.name.parameterize(separator: '_')
+    folder_number = (@page / Extraction::Documents::DOCUMENTS_PER_FOLDER.to_f).ceil

Extraction::Document.new(
url: saved_response['url'], method: saved_response['method'],
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: "<?xml version=\"1.0\"?><root><records>#{records.map(&:to_xml).join}</records></root>"
).save("#{@extraction_folder}/#{name_str}__-__#{page_str}.json")
).save("#{@extraction_folder}/#{folder_number}/#{name_str}__-__#{page_str}.json")
end
end
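The save path above now buckets documents into numbered subfolders derived from the page counter. A minimal sketch of that bucketing, assuming DOCUMENTS_PER_FOLDER is 100 (the constant is defined in Extraction::Documents and its actual value is not shown in this diff):

DOCUMENTS_PER_FOLDER = 100 # assumed value for illustration; defined in Extraction::Documents in the app

def folder_number(page, per_folder: DOCUMENTS_PER_FOLDER)
  # Float division plus ceil rounds up: pages 1..100 map to folder 1, 101..200 to
  # folder 2, and so on. Integer division would floor (101 / 100 == 1), hence .to_f.
  (page / per_folder.to_f).ceil
end

folder_number(1)   # => 1
folder_number(100) # => 1
folder_number(101) # => 2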
4 changes: 3 additions & 1 deletion app/sidekiq/text_extraction_worker.rb
@@ -14,12 +14,14 @@ def process_extracted_documents
end

def create_document(extracted_text, filename, process)
+    folder_number = (@page / Extraction::Documents::DOCUMENTS_PER_FOLDER.to_f).ceil
+
Extraction::Document.new(
url: saved_response['url'], method: saved_response['method'],
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: { text: extracted_text, process: }.to_json
).save("#{@extraction_folder}/#{filename}")
).save("#{@extraction_folder}/#{folder_number}/#{filename}")
end

private
49 changes: 25 additions & 24 deletions spec/sidekiq/split_worker_spec.rb
@@ -15,50 +15,51 @@
end

it 'splits a large file into chunks of 100' do
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

SplitWorker.new.perform(extraction_job.id)
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 2
end

it 'cleans up the tmp folder it creates' do
expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false

-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)

-      expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
+      expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
end

it 'names the new files following the appropriate naming convention' do
SplitWorker.new.perform(extraction_job.id)
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json"))
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json"))
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json"))
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json"))
+
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json"))
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json"))
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json"))
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json"))
end
end

context 'when the request is paginated' do
before do
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/split_example_01.json")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/split_example_02.json")
FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_01.json")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_02.json")
end

it 'splits both files into many small ones that are unique from each other' do
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 2

SplitWorker.new.perform(extraction_job.id)
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 4
end
end
@@ -71,7 +72,7 @@
it 'does not enqueue Transformation Workers' do
expect(TransformationWorker).not_to receive(:perform_async)

-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)
end
end

@@ -90,7 +91,7 @@
it 'enqueues Transformation Workers to process the split files' do
expect(TransformationWorker).to receive(:perform_async).exactly(2).times.and_call_original

-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)
end

it 'stops enqueuing TransformationWorkers if the pipeline has been cancelled' do
@@ -101,7 +102,7 @@
end

it 'updates the Harvest Report appropriately' do
-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)

harvest_report.reload

82 changes: 41 additions & 41 deletions spec/sidekiq/text_extraction_worker_spec.rb
@@ -6,115 +6,115 @@
let(:pipeline) { create(:pipeline, name: 'PDF Example') }
let(:extraction_definition) { create(:extraction_definition, pipeline:, format: 'JSON', extract_text_from_file: true) }
let(:extraction_job) { create(:extraction_job, extraction_definition:) }

describe "#perform" do
context 'when the PDF extraction is not part of a harvest' do

context 'when the PDF has a text layer' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
end

it 'converts a PDF into raw text' do
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

TextExtractionWorker.new.perform(extraction_job.id)

extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1
end

it 'signifies if content has been extracted from a PDF' do
TextExtractionWorker.new.perform(extraction_job.id)

extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          process = JSON.parse(document['body'])['process']
+          process = JSON.parse(document['body'])['process']

expect(process).to eq 'Extracted from application/pdf using Yomu'
end

it 'cleans up the tmp folder it creates' do
expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false

TextExtractionWorker.new.perform(extraction_job.id)
-          expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
+
+          expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
end

it 'names the new files following as it was originally' do
TextExtractionWorker.new.perform(extraction_job.id)
-          expect(File.exist?("#{extraction_job.extraction_folder}/example__1234__01.json")).to eq(true)
+
+          expect(File.exist?("#{extraction_job.extraction_folder}/1/example__1234__01.json")).to eq(true)
end

it 'does not enqueue Transformation Workers' do
expect(TransformationWorker).not_to receive(:perform_async)
-          TextExtractionWorker.new.perform(extraction_job.id)
+
+          TextExtractionWorker.new.perform(extraction_job.id)
end
end

context 'when the PDF does not have a text layer' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example_needing_ocr.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/example_needing_ocr.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

it 'OCRs the PDF to add a text layer' do
-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

TextExtractionWorker.new.perform(extraction_job.id)

-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          text = JSON.parse(document['body'])['text']
+          text = JSON.parse(document['body'])['text']

expect(text).to include("AUCKLAND OFFICE")
end

it 'signifies if content has been extracted using OCR' do
TextExtractionWorker.new.perform(extraction_job.id)

-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          process = JSON.parse(document['body'])['process']
+          process = JSON.parse(document['body'])['process']

expect(process).to eq 'Extracted from PDF using OCRmyPDF'
end
end

context 'when the PDF is invalid' do
before do
FileUtils.cp("#{Rails.root}/spec/support/invalid_pdf.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/invalid_pdf.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

it 'fails gracefully when dealing with an invalid PDF' do
-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

TextExtractionWorker.new.perform(extraction_job.id)

-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          text = JSON.parse(document['body'])['text']
+          text = JSON.parse(document['body'])['text']

expect(text).to include('OCR failed')
end
@@ -123,7 +123,7 @@

context 'when the PDF extraction is part of a harvest' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

let!(:harvest_report) { create(:harvest_report, pipeline_job:, harvest_job:) }
@@ -132,15 +132,15 @@
let(:harvest_definition) { create(:harvest_definition, pipeline:) }
let(:harvest_job) { create(:harvest_job, harvest_definition:, pipeline_job:) }
let(:extraction_job) { create(:extraction_job, extraction_definition:, harvest_job:) }

it 'enqueues Transformation Workers to process the text from the PDF' do
expect(TransformationWorker).to receive(:perform_async).exactly(1).times.and_call_original

-        TextExtractionWorker.new.perform(extraction_job.id)
+        TextExtractionWorker.new.perform(extraction_job.id)
end

it 'updates the Harvest Report appropriately' do
-        TextExtractionWorker.new.perform(extraction_job.id)
+        TextExtractionWorker.new.perform(extraction_job.id)

harvest_report.reload

@@ -152,7 +152,7 @@

context 'when the PDF extraction is part of an enrichment' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

let(:destination) { create(:destination) }
@@ -165,8 +165,8 @@
let(:extraction_job) { create(:extraction_job, extraction_definition:, harvest_job:) }

it 'enqueues Transformation Workers to process the text from the PDF with the API Record ID' do
-        expect(TransformationWorker).to receive(:perform_async).exactly(1).times.with(anything, anything, "1234").and_call_original
-        TextExtractionWorker.new.perform(extraction_job.id)
+        expect(TransformationWorker).to receive(:perform_async).exactly(1).times.with(anything, anything, "1234").and_call_original
+        TextExtractionWorker.new.perform(extraction_job.id)
end
end
end
6 changes: 2 additions & 4 deletions spec/support/stub_request.rb
@@ -14,10 +14,8 @@

RSpec.configure do |config|
config.before(:suite) do
-    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp/*")
-    FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp")
-    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/*")
-    FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")
+    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp")
+    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")
end
end

