From 4bd4a37d374c7e29106dfa9a97501fb3b8f1c860 Mon Sep 17 00:00:00 2001 From: Adam Jazairi Date: Mon, 13 Sep 2021 16:53:21 -0400 Subject: [PATCH] Construct DSS JSON Why these changes are being introduced: We want to be able to publish theses to DSpace, which requires us to convert relevant thesis metadata to DSS-compliant JSON. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/ETD-394 How this addresses that need: This creates the requisite submission metadata for DSS, based on the submission message spec and the data mappings spreadsheet. It does not save said metadata to AWS, nor does it write messages to the queue. Side effects of this change: There are fields that are required for publication that are not validated anywhere in the application. The current process is for Jess to confirm these as part of their workflow, but we should make their life easier by including an automated check as part of the publication status update workflow. A ticket for this work is open here: https://mitlibraries.atlassian.net/browse/ETD-431 Co-Authored-By: Jeremy Prevost --- app/models/dspace_metadata.rb | 73 ++++++++++ test/fixtures/users.yml | 4 + test/models/dspace_metadata_test.rb | 210 ++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+) create mode 100644 app/models/dspace_metadata.rb create mode 100644 test/models/dspace_metadata_test.rb diff --git a/app/models/dspace_metadata.rb b/app/models/dspace_metadata.rb new file mode 100644 index 00000000..ea3a6077 --- /dev/null +++ b/app/models/dspace_metadata.rb @@ -0,0 +1,73 @@ +class DspaceMetadata + def initialize(thesis) + @dc = {} + @dc['dc.publisher'] = 'Massachusetts Institute of Technology' + @dc['dc.type'] = 'Thesis' + title(thesis) + contributors(thesis.users, thesis.advisors) + departments(thesis.departments) + degrees(thesis.degrees) + copyright(thesis.copyright, thesis.license) + date_transferred(thesis.files) + end + + # Generates JSON metadata file required for submission to DSS. 
+ def serialize_dss_metadata + { 'metadata' => @dc.map { |k, v| { 'key' => k, 'value' => v } } }.to_json + end + + def title(thesis) + @dc['dc.title'] = thesis.title + @dc['dc.description.abstract'] = thesis.abstract if thesis.abstract + @dc['dc.date.issued'] = thesis.grad_date.strftime('%Y-%m') + end + + def contributors(thesis_users, thesis_advisors) + @dc['dc.contributor.author'] = thesis_users.map(&:preferred_name) + @dc['dc.identifier.orcid'] = parse_orcids(thesis_users) if parse_orcids(thesis_users) + @dc['dc.contributor.advisor'] = thesis_advisors.map(&:name) + end + + # We don't care about the order of the ORCIDs because DSpace can't assign them to a specific user. + def parse_orcids(thesis_users) + return unless thesis_users.any?(&:orcid) + + orcids = thesis_users.map(&:orcid).compact + return unless orcids.present? + + orcids + end + + def departments(thesis_depts) + @dc['dc.contributor.department'] = thesis_depts.map(&:name_dspace) + end + + def degrees(thesis_degrees) + @dc['dc.description.degree'] = thesis_degrees.map(&:abbreviation) + @dc['thesis.degree.name'] = thesis_degrees.map(&:name_dspace) + + # Degree types should not be repeated if they are the same type. + types = thesis_degrees.map { |degree| degree.degree_type.name }.uniq + @dc['mit.thesis.degree'] = types + end + + def copyright(thesis_copyright, thesis_license) + if thesis_copyright.holder != 'Author' # copyright holder is anyone but author + @dc['dc.rights'] = [thesis_copyright.statement_dspace, "U+00A9 #{thesis_copyright.holder}"] + @dc['dc.rights.uri'] = thesis_copyright.url if thesis_copyright.url + elsif thesis_license # author holds copyright and provides a license + @dc['dc.rights'] = [thesis_license.license_type, 'Copyright retained by author(s)'] + + # Theoretically both license and copyright URLs are required for publication, but there are no constraints on + # the models, and we want to future-proof this. 
+ @dc['dc.rights.uri'] = thesis_license.url if thesis_license.url + else # author holds copyright and no license provided + @dc['dc.rights'] = [thesis_copyright.statement_dspace, 'Copyright retained by author(s)'] + @dc['dc.rights.uri'] = thesis_copyright.url if thesis_copyright.url + end + end + + def date_transferred(files) + @dc['dc.date.submitted'] = files.select { |file| file.purpose == 'thesis_pdf' }.first.blob.created_at + end +end diff --git a/test/fixtures/users.yml b/test/fixtures/users.yml index ad23c618..f01d806f 100644 --- a/test/fixtures/users.yml +++ b/test/fixtures/users.yml @@ -31,6 +31,7 @@ yo: given_name: 'Yo' surname: 'Yobot' display_name: 'Yo Yobot' + orcid: '0001' admin: uid: 'admin_id' @@ -58,6 +59,7 @@ basic: given_name: 'Basic' surname: 'Robot' display_name: 'Basic Robot' + orcid: '0002' second: uid: 'second' @@ -66,6 +68,7 @@ second: given_name: 'Second' surname: 'Student' display_name: 'Second Student' + preferred_name: 'Student, Second' third: uid: 'third' @@ -74,6 +77,7 @@ third: given_name: 'Third' surname: 'Student' display_name: 'Third Student' + preferred_name: 'Student, Third' processor: uid: 'processor_id' diff --git a/test/models/dspace_metadata_test.rb b/test/models/dspace_metadata_test.rb new file mode 100644 index 00000000..15ddb960 --- /dev/null +++ b/test/models/dspace_metadata_test.rb @@ -0,0 +1,210 @@ +require 'test_helper' + +class DspaceMetadataTest < ActiveSupport::TestCase + # Adding some properties that are not included in our fixtures + def dss_friendly_thesis(thesis) + degree = thesis.degrees.first + degree.degree_type_id = degree_types(:bachelor).id + degree.save + file = Rails.root.join('test', 'fixtures', 'files', 'a_pdf.pdf') + thesis.files.attach(io: File.open(file), filename: 'a_pdf.pdf') + thesis.files.first.description = 'My thesis' + thesis.files.first.purpose = 'thesis_pdf' + thesis.save + end + + test 'parses thesis data as DSpace DC' do + t = theses(:one) + dss_friendly_thesis(t) + dc = 
DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal 'MyString', dc['dc.title'] + assert_equal '2017-09', dc['dc.date.issued'] + assert_equal 'MyText', dc['dc.description.abstract'] + + # No abstract (optional for undergraduate theses) + t.abstract = nil + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_nil dc['dc.description.abstract'] + end + + test 'parses author data as DSpace DC' do + # One author + t = Thesis.create(title: 'Who cares', graduation_year: '2021', graduation_month: 'February', + advisors: [advisors(:first)], users: [users(:second)], degrees: [degrees(:one)], + departments: [departments(:one)], copyright: copyrights(:mit)) + dss_friendly_thesis(t) + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Student, Second'], dc['dc.contributor.author'] + + # More than one author + t.users = [users(:second), users(:third)] + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Student, Second', 'Student, Third'], dc['dc.contributor.author'] + end + + test 'parses ORCIDs' do + # One author and one ORCID + t1 = theses(:one) + dss_friendly_thesis(t1) + dc = DspaceMetadata.new(t1).instance_variable_get(:@dc) + assert_equal ['0001'], dc['dc.identifier.orcid'] + + # Multiple authors and multiple ORCIDs + t2 = theses(:two) + dss_friendly_thesis(t2) + + # Since theses(:two) has no advisors, we need to attach one as DspaceMetadata expects this. This can be + # removed once we update the Thesis validations to reflect publication requirements. 
+ t2.advisors = [advisors(:first)] + t2.save + dc = DspaceMetadata.new(t2).instance_variable_get(:@dc) + assert_equal %w[0002 0001], dc['dc.identifier.orcid'] + + # Multiple authors and only one ORCID + t2.users.second.orcid = nil + t2.users.second.save + dc = DspaceMetadata.new(t2).instance_variable_get(:@dc) + assert_equal ['0002'], dc['dc.identifier.orcid'] + + # One author and no ORCID + t1.users.first.orcid = nil + t1.users.first.save + dc = DspaceMetadata.new(t1).instance_variable_get(:@dc) + assert_nil dc['dc.identifier.orcid'] + end + + test 'parses advisor data as DSpace DC' do + t = theses(:one) + dss_friendly_thesis(t) + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + + # One advisor + assert_equal ['Addy McAdvisor'], dc['dc.contributor.advisor'] + + # More than one advisor + t.advisors = [advisors(:first), advisors(:second)] + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Addy McAdvisor', 'Viola McAdvisor'], dc['dc.contributor.advisor'] + end + + test 'parses copyright data as DSpace DC' do + # Author holds copyright and license is present + t = theses(:downloaded) + t.copyright = copyrights(:author) + t.license = licenses(:ccby) + t.users = [users(:yo)] + t.advisors = [advisors(:first)] + t.save + dss_friendly_thesis(t) + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Attribution 4.0 International (CC BY 4.0)', 'Copyright retained by author(s)'], + dc['dc.rights'] + assert_equal 'https://creativecommons.org/licenses/by/4.0/', dc['dc.rights.uri'] + + # No URI + t.license = licenses(:nocc) + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_nil dc['dc.rights.uri'] + + # Author holds copyright and no license + t.license = nil + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['In Copyright', 'Copyright retained by author(s)'], dc['dc.rights'] + assert_equal 'https://rightsstatements.org/page/InC/1.0/', dc['dc.rights.uri'] + + # Any 
other copyright holder + t.copyright = copyrights(:mit) + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['In Copyright - Educational Use Permitted', 'U+00A9 MIT'], dc['dc.rights'] + assert_equal 'http://rightsstatements.org/page/InC-EDU/1.0/', dc['dc.rights.uri'] + end + + test 'parses department data as DSpace DC' do + # One department + t = theses(:one) + dss_friendly_thesis(t) + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Massachusetts Institute of Technology. Department of Aeronautics and Astronautics'], + dc['dc.contributor.department'] + + # Multiple departments + t.departments = [departments(:one), departments(:two)] + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Massachusetts Institute of Technology. Department of Aeronautics and Astronautics', + 'MIT Anthropology Program'], dc['dc.contributor.department'] + end + + test 'parses degree data as DSpace DC' do + # One degree + t = theses(:one) + dss_friendly_thesis(t) + d1 = degrees(:one) + d1.degree_type_id = degree_types(:bachelor).id + d1.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['MFA'], dc['dc.description.degree'] + assert_equal ['Master of Fine Arts'], dc['thesis.degree.name'] + assert_equal ['Bachelor'], dc['mit.thesis.degree'] + + # Multiple degrees + d2 = degrees(:two) + d2.degree_type_id = degree_types(:master).id + d2.save + t.degrees = [d1, d2] + t.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal %w[MFA JD], dc['dc.description.degree'] + assert_equal ['Master of Fine Arts', 'Master of Fine Arts'], dc['thesis.degree.name'] + assert_equal %w[Bachelor Master], dc['mit.thesis.degree'] + + # Does not repeat degree types + d2.degree_type_id = degree_types(:bachelor).id + d2.save + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal ['Bachelor'], dc['mit.thesis.degree'] + end + + test 'parses file data as DSpace DC' do + t = 
theses(:one) + dss_friendly_thesis(t) + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + blob = t.files.first.blob + assert_equal blob.created_at, dc['dc.date.submitted'] + end + + test 'compiles constituent DC metadata on instantiation' do + t = theses(:one) + dss_friendly_thesis(t) + dc = DspaceMetadata.new(t).instance_variable_get(:@dc) + assert_equal 'Massachusetts Institute of Technology', dc['dc.publisher'] + assert_equal 'Thesis', dc['dc.type'] + + # Checking for presence of keys instead of exact values, as we verify values in other tests. + assert_equal ['dc.publisher', 'dc.type', 'dc.title', 'dc.description.abstract', 'dc.date.issued', + 'dc.contributor.author', 'dc.identifier.orcid', 'dc.contributor.advisor', 'dc.contributor.department', + 'dc.description.degree', 'thesis.degree.name', 'mit.thesis.degree', 'dc.rights', 'dc.rights.uri', + 'dc.date.submitted'], dc.keys + end + + test 'metadata file is structured as expected for DSS' do + t = theses(:one) + dss_friendly_thesis(t) + serialized = DspaceMetadata.new(t).serialize_dss_metadata + + # Make sure the JSON is actually serialized + assert_equal String, serialized.class + + # Unserialize the JSON so we can check that it's well-formed + unserialized = JSON.parse(serialized) + assert_equal ['metadata'], unserialized.keys + assert_equal unserialized['metadata'].first, { 'key' => 'dc.publisher', + 'value' => 'Massachusetts Institute of Technology' } + end +end