Skip to content

Commit

Permalink
refactor: Update TextExtractions and Split to use folder structure
Browse files (browse the repository at this point in the history)
  • Loading branch information
motizuki committed Aug 27, 2024
1 parent 2c5025e commit cfd6c65
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 34 deletions.
6 changes: 2 additions & 4 deletions app/sidekiq/file_extraction_worker.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -71,10 +71,8 @@ def setup_tmp_directory
end

def move_extracted_documents_into_tmp_directory
Dir.children(@extraction_folder).each do |file|
next if file == 'tmp'

FileUtils.move("#{@extraction_folder}/#{file}", "#{@tmp_directory}/#{file}")
Dir.glob("#{@extraction_folder}/**/*.*").each do |file|
FileUtils.move(file, "#{@tmp_directory}/#{File.basename(file)}")
end
end

Expand Down
3 changes: 2 additions & 1 deletion app/sidekiq/split_worker.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,12 +15,13 @@ def process_extracted_documents
def create_document(records, saved_response)
page_str = format('%09d', @page)[-9..]
name_str = @extraction_definition.name.parameterize(separator: '_')
folder_number = (@page / Extraction::Documents::DOCUMENTS_PER_FOLDER.to_f).ceil

Extraction::Document.new(
url: saved_response['url'], method: saved_response['method'],
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: "<?xml version=\"1.0\"?><root><records>#{records.map(&:to_xml).join}</records></root>"
).save("#{@extraction_folder}/#{name_str}__-__#{page_str}.json")
).save("#{@extraction_folder}/#{folder_number}/#{name_str}__-__#{page_str}.json")
end
end
4 changes: 3 additions & 1 deletion app/sidekiq/text_extraction_worker.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,12 +14,14 @@ def process_extracted_documents
end

def create_document(extracted_text, filename, process)
folder_number = (@page / Documents::DOCUMENTS_PER_FOLDER.to_f).ceil

Extraction::Document.new(
url: saved_response['url'], method: saved_response['method'],
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: { text: extracted_text, process: }.to_json
).save("#{@extraction_folder}/#{filename}")
).save("#{@extraction_folder}/#{folder_number}/#{filename}")
end

private
Expand Down
49 changes: 25 additions & 24 deletions spec/sidekiq/split_worker_spec.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,50 +15,51 @@
end

it 'splits a large file into chunks of 100' do
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

SplitWorker.new.perform(extraction_job.id)
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 2
end

it 'cleans up the tmp folder it creates' do
expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false

SplitWorker.new.perform(extraction_job.id)
SplitWorker.new.perform(extraction_job.id)

expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
end

it 'names the new files following the appropriate naming convention' do
SplitWorker.new.perform(extraction_job.id)
expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json"))
expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json"))
expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json"))
expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json"))

expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json"))
expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json"))
expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json"))
expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json"))
end
end

context 'when the request is paginated' do
before do
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/split_example_01.json")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/split_example_02.json")
FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_01.json")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_02.json")
end

it 'splits both files into many small ones that are unique from each other' do
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 2

SplitWorker.new.perform(extraction_job.id)
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 4
end
end
Expand All @@ -71,7 +72,7 @@
it 'does not enqueue Transformation Workers' do
expect(TransformationWorker).not_to receive(:perform_async)

SplitWorker.new.perform(extraction_job.id)
SplitWorker.new.perform(extraction_job.id)
end
end

Expand All @@ -90,7 +91,7 @@
it 'enqueues Transformation Workers to process the split files' do
expect(TransformationWorker).to receive(:perform_async).exactly(2).times.and_call_original

SplitWorker.new.perform(extraction_job.id)
SplitWorker.new.perform(extraction_job.id)
end

it 'stops enqueuing TransformationWorkers if the pipeline has been cancelled' do
Expand All @@ -101,7 +102,7 @@
end

it 'updates the Harvest Report appropriately' do
SplitWorker.new.perform(extraction_job.id)
SplitWorker.new.perform(extraction_job.id)

harvest_report.reload

Expand Down
6 changes: 2 additions & 4 deletions spec/support/stub_request.rb
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,10 +14,8 @@

RSpec.configure do |config|
config.before(:suite) do
FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp/*")
FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp")
FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/*")
FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")
FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp")
FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")
end
end

Expand Down

0 comments on commit cfd6c65

Please sign in to comment.