refactor: Update TextExtractions and Split to use folder structure
motizuki committed Aug 27, 2024
1 parent 2c5025e commit 141838e
Showing 6 changed files with 75 additions and 75 deletions.
6 changes: 2 additions & 4 deletions app/sidekiq/file_extraction_worker.rb
@@ -71,10 +71,8 @@ def setup_tmp_directory
end

def move_extracted_documents_into_tmp_directory
-    Dir.children(@extraction_folder).each do |file|
-      next if file == 'tmp'
-
-      FileUtils.move("#{@extraction_folder}/#{file}", "#{@tmp_directory}/#{file}")
+    Dir.glob("#{@extraction_folder}/**/*.*").each do |file|
+      FileUtils.move(file, "#{@tmp_directory}/#{File.basename(file)}")
end
end

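For context on the worker change above: the old loop only looked at the top-level children of the extraction folder (skipping tmp), while the new call globs recursively, so documents sitting inside the numbered subfolders are found and flattened into the tmp directory. A minimal standalone sketch of that behaviour, using hypothetical paths that are not part of this commit:

require 'fileutils'

# Hypothetical layout, for illustration only:
#   extractions/123/1/page__-__000000001.json
#   extractions/123/2/page__-__000000101.json
extraction_folder = 'extractions/123'
tmp_directory     = "#{extraction_folder}/tmp"
FileUtils.mkdir_p(tmp_directory)

# "**/*.*" matches files with an extension at any depth, so documents inside the
# numbered folders are picked up; the bare folders themselves are not matched.
Dir.glob("#{extraction_folder}/**/*.*").each do |file|
  # File.basename drops the numbered-folder prefix, flattening everything into tmp/.
  FileUtils.move(file, "#{tmp_directory}/#{File.basename(file)}")
end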
3 changes: 2 additions & 1 deletion app/sidekiq/split_worker.rb
@@ -15,12 +15,13 @@ def process_extracted_documents
def create_document(records, saved_response)
page_str = format('%09d', @page)[-9..]
name_str = @extraction_definition.name.parameterize(separator: '_')
+    folder_number = (@page / Extraction::Documents::DOCUMENTS_PER_FOLDER.to_f).ceil

Extraction::Document.new(
url: saved_response['url'], method: saved_response['method'],
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: "<?xml version=\"1.0\"?><root><records>#{records.map(&:to_xml).join}</records></root>"
).save("#{@extraction_folder}/#{name_str}__-__#{page_str}.json")
).save("#{@extraction_folder}/#{folder_number}/#{name_str}__-__#{page_str}.json")
end
end
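The save path above now buckets documents into numbered subfolders derived from the page counter. A minimal sketch of that bucketing, assuming DOCUMENTS_PER_FOLDER is 100 (the constant is defined in Extraction::Documents and its actual value is not shown in this diff):

DOCUMENTS_PER_FOLDER = 100 # assumed value for illustration; defined in Extraction::Documents in the app

def folder_number(page, per_folder: DOCUMENTS_PER_FOLDER)
  # Float division plus ceil rounds up: pages 1..100 map to folder 1, 101..200 to
  # folder 2, and so on. Integer division would floor (101 / 100 == 1), hence .to_f.
  (page / per_folder.to_f).ceil
end

folder_number(1)   # => 1
folder_number(100) # => 1
folder_number(101) # => 2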
4 changes: 3 additions & 1 deletion app/sidekiq/text_extraction_worker.rb
@@ -14,12 +14,14 @@ def process_extracted_documents
end

def create_document(extracted_text, filename, process)
+    folder_number = (@page / Extraction::Documents::DOCUMENTS_PER_FOLDER.to_f).ceil
+
Extraction::Document.new(
url: saved_response['url'], method: saved_response['method'],
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: { text: extracted_text, process: }.to_json
).save("#{@extraction_folder}/#{filename}")
).save("#{@extraction_folder}/#{folder_number}/#{filename}")
end

private
49 changes: 25 additions & 24 deletions spec/sidekiq/split_worker_spec.rb
@@ -15,50 +15,51 @@
end

it 'splits a large file into chunks of 100' do
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

SplitWorker.new.perform(extraction_job.id)
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 2
end

it 'cleans up the tmp folder it creates' do
expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false

-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)

-      expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
+      expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
end

it 'names the new files following the appropriate naming convention' do
SplitWorker.new.perform(extraction_job.id)
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json"))
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json"))
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json"))
-      expect(File.exist?("#{extraction_job.extraction_folder}/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json"))
+
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json"))
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json"))
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json"))
+      expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json"))
end
end

context 'when the request is paginated' do
before do
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/split_example_01.json")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/split_example_02.json")
FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_01.json")
FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_02.json")
end

it 'splits both files into many small ones that are unique from each other' do
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 2

SplitWorker.new.perform(extraction_job.id)
-      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+
+      extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 4
end
end
@@ -71,7 +72,7 @@
it 'does not enqueue Transformation Workers' do
expect(TransformationWorker).not_to receive(:perform_async)

-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)
end
end

@@ -90,7 +91,7 @@
it 'enqueues Transformation Workers to process the split files' do
expect(TransformationWorker).to receive(:perform_async).exactly(2).times.and_call_original

-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)
end

it 'stops enqueuing TransformationWorkers if the pipeline has been cancelled' do
@@ -101,7 +102,7 @@
end

it 'updates the Harvest Report appropriately' do
-      SplitWorker.new.perform(extraction_job.id)
+      SplitWorker.new.perform(extraction_job.id)

harvest_report.reload

82 changes: 41 additions & 41 deletions spec/sidekiq/text_extraction_worker_spec.rb
@@ -6,115 +6,115 @@
let(:pipeline) { create(:pipeline, name: 'PDF Example') }
let(:extraction_definition) { create(:extraction_definition, pipeline:, format: 'JSON', extract_text_from_file: true) }
let(:extraction_job) { create(:extraction_job, extraction_definition:) }

describe "#perform" do
context 'when the PDF extraction is not part of a harvest' do

context 'when the PDF has a text layer' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
end

it 'converts a PDF into raw text' do
extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

TextExtractionWorker.new.perform(extraction_job.id)

extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1
end

it 'signifies if content has been extracted from a PDF' do
TextExtractionWorker.new.perform(extraction_job.id)

extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          process = JSON.parse(document['body'])['process']
+          process = JSON.parse(document['body'])['process']

expect(process).to eq 'Extracted from application/pdf using Yomu'
end

it 'cleans up the tmp folder it creates' do
expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false

TextExtractionWorker.new.perform(extraction_job.id)
-          expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
+
+          expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
end

it 'names the new files following as it was originally' do
TextExtractionWorker.new.perform(extraction_job.id)
-          expect(File.exist?("#{extraction_job.extraction_folder}/example__1234__01.json")).to eq(true)
+
+          expect(File.exist?("#{extraction_job.extraction_folder}/1/example__1234__01.json")).to eq(true)
end

it 'does not enqueue Transformation Workers' do
expect(TransformationWorker).not_to receive(:perform_async)
-          TextExtractionWorker.new.perform(extraction_job.id)
+
+          TextExtractionWorker.new.perform(extraction_job.id)
end
end

context 'when the PDF does not have a text layer' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example_needing_ocr.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/example_needing_ocr.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

it 'OCRs the PDF to add a text layer' do
-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

TextExtractionWorker.new.perform(extraction_job.id)

-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          text = JSON.parse(document['body'])['text']
+          text = JSON.parse(document['body'])['text']

expect(text).to include("AUCKLAND OFFICE")
end

it 'signifies if content has been extracted using OCR' do
TextExtractionWorker.new.perform(extraction_job.id)

-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          process = JSON.parse(document['body'])['process']
+          process = JSON.parse(document['body'])['process']

expect(process).to eq 'Extracted from PDF using OCRmyPDF'
end
end

context 'when the PDF is invalid' do
before do
FileUtils.cp("#{Rails.root}/spec/support/invalid_pdf.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/invalid_pdf.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

it 'fails gracefully when dealing with an invalid PDF' do
-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

TextExtractionWorker.new.perform(extraction_job.id)

-          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/*").select { |e| File.file? e }
+          extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

expect(extracted_files.count).to eq 1

document = JSON.parse(File.read(extracted_files.first))
-          text = JSON.parse(document['body'])['text']
+          text = JSON.parse(document['body'])['text']

expect(text).to include('OCR failed')
end
@@ -123,7 +123,7 @@

context 'when the PDF extraction is part of a harvest' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

let!(:harvest_report) { create(:harvest_report, pipeline_job:, harvest_job:) }
@@ -132,15 +132,15 @@
let(:harvest_definition) { create(:harvest_definition, pipeline:) }
let(:harvest_job) { create(:harvest_job, harvest_definition:, pipeline_job:) }
let(:extraction_job) { create(:extraction_job, extraction_definition:, harvest_job:) }

it 'enqueues Transformation Workers to process the text from the PDF' do
expect(TransformationWorker).to receive(:perform_async).exactly(1).times.and_call_original

-        TextExtractionWorker.new.perform(extraction_job.id)
+        TextExtractionWorker.new.perform(extraction_job.id)
end

it 'updates the Harvest Report appropriately' do
-        TextExtractionWorker.new.perform(extraction_job.id)
+        TextExtractionWorker.new.perform(extraction_job.id)

harvest_report.reload

@@ -152,7 +152,7 @@

context 'when the PDF extraction is part of an enrichment' do
before do
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/example__1234__01.json")
FileUtils.cp("#{Rails.root}/spec/support/example.pdf", "#{extraction_job.extraction_folder}/1/example__1234__01.json")
end

let(:destination) { create(:destination) }
@@ -165,8 +165,8 @@
let(:extraction_job) { create(:extraction_job, extraction_definition:, harvest_job:) }

it 'enqueues Transformation Workers to process the text from the PDF with the API Record ID' do
-        expect(TransformationWorker).to receive(:perform_async).exactly(1).times.with(anything, anything, "1234").and_call_original
-        TextExtractionWorker.new.perform(extraction_job.id)
+        expect(TransformationWorker).to receive(:perform_async).exactly(1).times.with(anything, anything, "1234").and_call_original
+        TextExtractionWorker.new.perform(extraction_job.id)
end
end
end
6 changes: 2 additions & 4 deletions spec/support/stub_request.rb
@@ -14,10 +14,8 @@

RSpec.configure do |config|
config.before(:suite) do
-    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp/*")
-    FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp")
-    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/*")
-    FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")
+    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/tmp")
+    FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")
end
end

