From 3233a5c143fd927bcd09365b2e4b0e77b13c531a Mon Sep 17 00:00:00 2001 From: Adam Jazairi Date: Thu, 3 Feb 2022 08:44:06 -0500 Subject: [PATCH] Create batch jobs for preservation Why these changes are being introduced: We need a way to initiate SIP creation, bagging and streaming to S3. Relevant ticket(s): https://mitlibraries.atlassian.net/browse/ETD-553 How this addresses that need: This creates two jobs: 1. PreservationSubmissionJob attempts to send a single thesis to preservation and modifies the preservation_status and preserved_at fields accordingly. 2. PreservationSubmissionPrepJob loops through an array of theses to preserve. Side effects of this change: * preservation_status is now an enum with three options: unpreserved (the default), preserved, and error. * Adds `preservation.rake` task to send a single thesis to preservation via Rails CLI. * The readme now indicates that the publishing workflow automatically kicks off the preservation workflow. This is not yet accurate, but will be once ETD-560 is implemented. 
--- README.md | 23 +++++++- app/jobs/preservation_submission_job.rb | 19 +++++++ app/jobs/preservation_submission_prep_job.rb | 11 ++++ app/models/submission_information_package.rb | 2 + lib/tasks/preservation.rake | 19 +++++++ .../preservation_submission_job_prep_test.rb | 28 ++++++++++ test/jobs/preservation_submission_job_test.rb | 53 +++++++++++++++++++ .../submission_information_package_test.rb | 4 +- ...mission_information_package_zipper_test.rb | 2 +- 9 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 app/jobs/preservation_submission_job.rb create mode 100644 app/jobs/preservation_submission_prep_job.rb create mode 100644 lib/tasks/preservation.rake create mode 100644 test/jobs/preservation_submission_job_prep_test.rb create mode 100644 test/jobs/preservation_submission_job_test.rb diff --git a/README.md b/README.md index 7ab37d18..bb301cb0 100644 --- a/README.md +++ b/README.md @@ -226,7 +226,7 @@ Example usage: heroku run rails dss:process_output_queue --app TARGET-HEROKU-APP ``` -## Publishing a single thesis +### Publishing a single thesis You can publish a single thesis that is already in `Publication review` status by passing the `thesis_id` to a rake task like: @@ -234,6 +234,27 @@ You can publish a single thesis that is already in `Publication review` status b heroku run rails dss:publish_thesis_by_id[THESIS_ID] --app TARGET-HEROKU-APP ``` +## Preservation workflow + +The publishing workflow will automatically trigger preservation for all of the published theses in the results queue. +At this point a submission information package is generated for each thesis, then a bag is constructed, zipped, and +streamed to an S3 bucket. (See the SubmissionInformationPackage and SubmissionInformationPackageZipper classes for more +details on this part of the process.) + +Once the bags are in the S3 bucket, they are replicated to the Digital Preservation S3 bucket, where they can be +ingested into Archivematica. 
+ +A thesis can be sent to preservation more than once. In order to track provenance across multiple preservation events, +we persist certain data about the SIP and audit the model using [paper_trail](https://github.com/paper-trail-gem/paper_trail). + +### Preserving a single thesis + +You can manually send a published thesis to preservation by passing the thesis ID to the following rake task: + +```shell +heroku run rails preservation:preserve_thesis_by_id[THESIS_ID] --app TARGET-HEROKU-APP +``` + ## Validation of thesis record Prior to theses being published to external systems (such as the repository, or diff --git a/app/jobs/preservation_submission_job.rb b/app/jobs/preservation_submission_job.rb new file mode 100644 index 00000000..e0f9613c --- /dev/null +++ b/app/jobs/preservation_submission_job.rb @@ -0,0 +1,19 @@ +class PreservationSubmissionJob < ActiveJob::Base + queue_as :default + + # Creates a SIP for the thesis, hands it to SubmissionInformationPackageZipper, and marks it 'preserved'. + # Any StandardError (already covers NoMethodError and AWS SDK errors) is logged; the SIP, if built, is marked 'error'. + def perform(thesis) + Rails.logger.info("Thesis #{thesis.id} is now being prepared for preservation") + sip = thesis.submission_information_packages.create + SubmissionInformationPackageZipper.new(sip) + sip.preservation_status = 'preserved' + sip.preserved_at = DateTime.now + sip.save + Rails.logger.info("Thesis #{thesis.id} has been sent to preservation") + rescue StandardError => e + Rails.logger.info("Thesis #{thesis.id} could not be preserved: #{e}") + sip&.preservation_status = 'error' + sip&.save + end +end diff --git a/app/jobs/preservation_submission_prep_job.rb b/app/jobs/preservation_submission_prep_job.rb new file mode 100644 index 00000000..da9a2da4 --- /dev/null +++ b/app/jobs/preservation_submission_prep_job.rb @@ -0,0 +1,11 @@ +class PreservationSubmissionPrepJob < ActiveJob::Base + queue_as :default + + def perform(theses) + Rails.logger.info("Preparing to send #{theses.count} theses to preservation") + + theses.each do |thesis| + PreservationSubmissionJob.perform_later(thesis) + end + end +end diff --git 
a/app/models/submission_information_package.rb b/app/models/submission_information_package.rb index 012d6ab1..8ddddc42 100644 --- a/app/models/submission_information_package.rb +++ b/app/models/submission_information_package.rb @@ -29,6 +29,8 @@ class SubmissionInformationPackage < ApplicationRecord before_create :set_metadata, :set_bag_declaration, :set_manifest, :set_bag_name + enum preservation_status: { unpreserved: 0, preserved: 1, error: 2 } + def data file_locations = {} thesis.files.map { |f| file_locations["data/#{f.filename}"] = f.blob } diff --git a/lib/tasks/preservation.rake b/lib/tasks/preservation.rake new file mode 100644 index 00000000..1edb8cf6 --- /dev/null +++ b/lib/tasks/preservation.rake @@ -0,0 +1,19 @@ +namespace :preservation do + desc 'Sends a single thesis to preservation' + task :preserve_thesis_by_id, [:thesis_id] => :environment do |_t, args| + if args.thesis_id + Rails.logger.info("Attempting to send #{args.thesis_id} to preservation...") + thesis = Thesis.find(args.thesis_id) + + # Only published theses may be sent to preservation. We already check for this in SubmissionInformationPackage + # validations, but double-checking here to save potential confusion. 
+ if thesis.publication_status == 'Published' + PreservationSubmissionJob.perform_now(thesis) + else + Rails.logger.info("Thesis status of #{thesis.publication_status} cannot be preserved.") + end + else + Rails.logger.info('No thesis ID provided.') + end + end +end diff --git a/test/jobs/preservation_submission_job_prep_test.rb b/test/jobs/preservation_submission_job_prep_test.rb new file mode 100644 index 00000000..3ed7e553 --- /dev/null +++ b/test/jobs/preservation_submission_job_prep_test.rb @@ -0,0 +1,28 @@ +require 'test_helper' + +class PreservationSubmissionPrepJobTest < ActiveJob::TestCase + + test 'queues 1 job for 1 thesis' do + batch = [theses(:one)] + + assert_enqueued_jobs 1 do + PreservationSubmissionPrepJob.perform_now(batch) + end + end + + test 'queues 2 jobs for 2 theses' do + batch = [theses(:one), theses(:two)] + + assert_enqueued_jobs 2 do + PreservationSubmissionPrepJob.perform_now(batch) + end + end + + test 'queues same number of theses it receives' do + batch = Thesis.in_review.to_a + + assert_enqueued_jobs batch.count do + PreservationSubmissionPrepJob.perform_now(batch) + end + end +end diff --git a/test/jobs/preservation_submission_job_test.rb b/test/jobs/preservation_submission_job_test.rb new file mode 100644 index 00000000..09f3d710 --- /dev/null +++ b/test/jobs/preservation_submission_job_test.rb @@ -0,0 +1,53 @@ +require 'test_helper' + +class PreservationSubmissionJobTest < ActiveJob::TestCase + + # because we need to actually use the file it's easier to attach it in the test rather + # than use our fixtures as the fixtures oddly don't account for the file actually being + # where ActiveStorage expects them to be. We also need this to be a record that looks like + # a published record so we'll use the published fixture, remove the fixtured files, and attach + # one again. 
+ def setup_thesis + thesis = theses(:published) + thesis.files = [] + thesis.save + file = Rails.root.join('test', 'fixtures', 'files', 'registrar_data_small_sample.csv') + thesis.files.attach(io: File.open(file), filename: 'registrar_data_small_sample.csv') + thesis + end + + test 'creates a SIP' do + thesis = setup_thesis + assert_equal 0, thesis.submission_information_packages.count + + PreservationSubmissionJob.perform_now(thesis) + assert_equal 1, thesis.submission_information_packages.count + end + + test 'updates preservation_status to "preserved" after successfully processing a thesis' do + thesis = setup_thesis + PreservationSubmissionJob.perform_now(thesis) + assert_equal 'preserved', thesis.submission_information_packages.last.preservation_status + end + + test 'updates preserved_at to the current time after successfully processing a thesis' do + time = DateTime.now.getutc + Timecop.freeze(time) do + thesis = setup_thesis + PreservationSubmissionJob.perform_now(thesis) + assert_equal time, thesis.submission_information_packages.last.preserved_at + end + end + + test 'rescues exceptions by updating preservation_status to "error"' do + thesis = theses(:one) + PreservationSubmissionJob.perform_now(thesis) + assert_equal 'error', thesis.submission_information_packages.last.preservation_status + end + + test 'does not update preserved_at if the job enters an error state' do + thesis = theses(:one) + PreservationSubmissionJob.perform_now(thesis) + assert_nil thesis.submission_information_packages.last.preserved_at + end +end diff --git a/test/models/submission_information_package_test.rb b/test/models/submission_information_package_test.rb index 18b2d84e..31c9c3ff 100644 --- a/test/models/submission_information_package_test.rb +++ b/test/models/submission_information_package_test.rb @@ -86,9 +86,9 @@ class SubmissionInformationPackageTest < ActiveSupport::TestCase assert_not_nil sip.metadata end - test 'preservation_status defaults to 0' do + test 
'preservation_status defaults to unpreserved' do sip = theses(:published).submission_information_packages.create - assert_equal 0, sip.preservation_status + assert_equal 'unpreserved', sip.preservation_status end test 'data generates file location hash' do diff --git a/test/models/submission_information_package_zipper_test.rb b/test/models/submission_information_package_zipper_test.rb index 87a1bcc5..3db8c28c 100644 --- a/test/models/submission_information_package_zipper_test.rb +++ b/test/models/submission_information_package_zipper_test.rb @@ -2,7 +2,7 @@ class SubmissionInformationPackageZipperTest < ActiveSupport::TestCase - # because we need to actually use the file it's easier to attache it in the test rather + # because we need to actually use the file it's easier to attach it in the test rather # than use our fixtures as the fixtures oddly don't account for the file actually being # where ActiveStorage expects them to be. We also need this to be a record that looks like # a published record so we'll use the published fixture, remove the fixtured files, and attach