Skip to content

Commit

Permalink
Construct DSS JSON
Browse files Browse the repository at this point in the history
Why these changes are being introduced:

We want to be able to publish theses to DSpace, which requires
us to convert relevant thesis metadata to DSS-compliant JSON.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/ETD-394

How this addresses that need:

This creates the requisite submission metadata for DSS, based on the
submission message spec and the data mappings spreadsheet. It does not
save said metadata to AWS, nor does it write messages to the queue.

Side effects of this change:

There are fields that are required for publication that are not
validated anywhere in the application. The current process is for
Jess to confirm these as part of their workflow, but we should make
their life easier by including an automated check as part of
the publication status update workflow. A ticket for this work is
open here: https://mitlibraries.atlassian.net/browse/ETD-431

Co-Authored-By: Jeremy Prevost <jprevost@mit.edu>
  • Loading branch information
jazairi and JPrevost committed Sep 17, 2021
1 parent 690cada commit 4bd4a37
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 0 deletions.
73 changes: 73 additions & 0 deletions app/models/dspace_metadata.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Builds the metadata for a thesis as a flat hash of field name => value (held in @dc), and can
# serialize that hash to the JSON document submitted to DSS.
#
# Every field is populated during construction. Optional values (abstract, ORCIDs, rights URI) are
# left out of the hash entirely, rather than stored as nil, when the thesis does not provide them.
#
# Nothing here validates that required data exists: construction raises NoMethodError if
# thesis.grad_date is nil or if no attached file has the purpose 'thesis_pdf'.
class DspaceMetadata
  # Collects all metadata for the given thesis. The order of the calls below determines the order of
  # the keys in @dc, and therefore the order of the entries in the serialized output.
  #
  # thesis must respond to: title, abstract, grad_date, users, advisors, departments, degrees,
  # copyright, license and files.
  def initialize(thesis)
    @dc = {}
    @dc['dc.publisher'] = 'Massachusetts Institute of Technology'
    @dc['dc.type'] = 'Thesis'
    title(thesis)
    contributors(thesis.users, thesis.advisors)
    departments(thesis.departments)
    degrees(thesis.degrees)
    copyright(thesis.copyright, thesis.license)
    date_transferred(thesis.files)
  end

  # Generates JSON metadata file required for submission to DSS.
  #
  # Returns a JSON string of the form { "metadata": [{ "key": ..., "value": ... }, ...] } with one
  # entry per field in @dc.
  def serialize_dss_metadata
    { 'metadata' => @dc.map { |k, v| { 'key' => k, 'value' => v } } }.to_json
  end

  # Records the title, the abstract (only when one is set) and the issue date, formatted as YYYY-MM
  # from thesis.grad_date.
  def title(thesis)
    @dc['dc.title'] = thesis.title
    @dc['dc.description.abstract'] = thesis.abstract if thesis.abstract
    @dc['dc.date.issued'] = thesis.grad_date.strftime('%Y-%m')
  end

  # Records author names, author ORCIDs (only when at least one author has one) and advisor names.
  def contributors(thesis_users, thesis_advisors)
    @dc['dc.contributor.author'] = thesis_users.map(&:preferred_name)

    # Compute the ORCID list once and reuse it, rather than parsing the users twice.
    orcids = parse_orcids(thesis_users)
    @dc['dc.identifier.orcid'] = orcids if orcids

    @dc['dc.contributor.advisor'] = thesis_advisors.map(&:name)
  end

  # We don't care about the order of the ORCIDs because DSpace can't assign them to a specific user.
  #
  # Returns the users' ORCIDs with nils removed, or nil when no user has an ORCID.
  def parse_orcids(thesis_users)
    orcids = thesis_users.map(&:orcid).compact
    orcids if orcids.any?
  end

  # Records the DSpace name of each department.
  def departments(thesis_depts)
    @dc['dc.contributor.department'] = thesis_depts.map(&:name_dspace)
  end

  # Records the abbreviation, DSpace name and degree type name of the thesis' degrees.
  def degrees(thesis_degrees)
    @dc['dc.description.degree'] = thesis_degrees.map(&:abbreviation)
    @dc['thesis.degree.name'] = thesis_degrees.map(&:name_dspace)

    # Degree types should not be repeated if they are the same type.
    types = thesis_degrees.map { |degree| degree.degree_type.name }.uniq
    @dc['mit.thesis.degree'] = types
  end

  # Records the rights statement pair and, when a URL is available, the rights URI. There are three
  # cases, depending on who holds the copyright and whether a license was provided.
  def copyright(thesis_copyright, thesis_license)
    if thesis_copyright.holder != 'Author' # copyright holder is anyone but author
      # NOTE(review): this emits the literal text "U+00A9", not the copyright sign itself; the tests
      # assert the literal text, so confirm this is what DSpace should receive.
      @dc['dc.rights'] = [thesis_copyright.statement_dspace, "U+00A9 #{thesis_copyright.holder}"]
      @dc['dc.rights.uri'] = thesis_copyright.url if thesis_copyright.url
    elsif thesis_license # author holds copyright and provides a license
      @dc['dc.rights'] = [thesis_license.license_type, 'Copyright retained by author(s)']

      # Theoretically both license and copyright URLs are required for publication, but there are no constraints on
      # the models, and we want to future-proof this.
      @dc['dc.rights.uri'] = thesis_license.url if thesis_license.url
    else # author holds copyright and no license provided
      @dc['dc.rights'] = [thesis_copyright.statement_dspace, 'Copyright retained by author(s)']
      @dc['dc.rights.uri'] = thesis_copyright.url if thesis_copyright.url
    end
  end

  # Records the submission date as the creation time of the blob behind the first file whose purpose
  # is 'thesis_pdf'. Raises NoMethodError when there is no such file.
  def date_transferred(files)
    thesis_pdf = files.find { |file| file.purpose == 'thesis_pdf' }
    @dc['dc.date.submitted'] = thesis_pdf.blob.created_at
  end
end
4 changes: 4 additions & 0 deletions test/fixtures/users.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ yo:
given_name: 'Yo'
surname: 'Yobot'
display_name: 'Yo Yobot'
orcid: '0001'

admin:
uid: 'admin_id'
Expand Down Expand Up @@ -58,6 +59,7 @@ basic:
given_name: 'Basic'
surname: 'Robot'
display_name: 'Basic Robot'
orcid: '0002'

second:
uid: 'second'
Expand All @@ -66,6 +68,7 @@ second:
given_name: 'Second'
surname: 'Student'
display_name: 'Second Student'
preferred_name: 'Student, Second'

third:
uid: 'third'
Expand All @@ -74,6 +77,7 @@ third:
given_name: 'Third'
surname: 'Student'
display_name: 'Third Student'
preferred_name: 'Student, Third'

processor:
uid: 'processor_id'
Expand Down
210 changes: 210 additions & 0 deletions test/models/dspace_metadata_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
require 'test_helper'

# Exercises DspaceMetadata against the thesis fixtures: each test builds the metadata for a thesis
# and inspects either the internal @dc hash or the serialized JSON.
class DspaceMetadataTest < ActiveSupport::TestCase
  # Adding some properties that are not included in our fixtures: a degree type on the first degree,
  # and an attached PDF marked as the thesis PDF.
  def dss_friendly_thesis(thesis)
    degree = thesis.degrees.first
    degree.degree_type_id = degree_types(:bachelor).id
    degree.save

    pdf_path = Rails.root.join('test', 'fixtures', 'files', 'a_pdf.pdf')

    # Block form so the file handle is closed when we are done. The save happens inside the block so
    # the IO is still open if the upload is deferred until the record is saved.
    File.open(pdf_path) do |pdf|
      thesis.files.attach(io: pdf, filename: 'a_pdf.pdf')
      thesis.files.first.description = 'My thesis'
      thesis.files.first.purpose = 'thesis_pdf'
      thesis.save
    end
  end

  # Builds fresh metadata for the thesis and returns its internal field hash.
  def dc_for(thesis)
    DspaceMetadata.new(thesis).instance_variable_get(:@dc)
  end

  test 'parses thesis data as DSpace DC' do
    t = theses(:one)
    dss_friendly_thesis(t)
    dc = dc_for(t)
    assert_equal 'MyString', dc['dc.title']
    assert_equal '2017-09', dc['dc.date.issued']
    assert_equal 'MyText', dc['dc.description.abstract']

    # No abstract (optional for undergraduate theses)
    t.abstract = nil
    t.save
    dc = dc_for(t)
    assert_nil dc['dc.description.abstract']
  end

  test 'parses author data as DSpace DC' do
    # One author
    t = Thesis.create(title: 'Who cares', graduation_year: '2021', graduation_month: 'February',
                      advisors: [advisors(:first)], users: [users(:second)], degrees: [degrees(:one)],
                      departments: [departments(:one)], copyright: copyrights(:mit))
    dss_friendly_thesis(t)
    dc = dc_for(t)
    assert_equal ['Student, Second'], dc['dc.contributor.author']

    # More than one author
    t.users = [users(:second), users(:third)]
    t.save
    dc = dc_for(t)
    assert_equal ['Student, Second', 'Student, Third'], dc['dc.contributor.author']
  end

  test 'parses ORCIDs' do
    # One author and one ORCID
    t1 = theses(:one)
    dss_friendly_thesis(t1)
    dc = dc_for(t1)
    assert_equal ['0001'], dc['dc.identifier.orcid']

    # Multiple authors and multiple ORCIDs
    t2 = theses(:two)
    dss_friendly_thesis(t2)

    # Since theses(:two) has no advisors, we need to attach one as DspaceMetadata expects this. This can be
    # removed once we update the Thesis validations to reflect publication requirements.
    t2.advisors = [advisors(:first)]
    t2.save
    dc = dc_for(t2)
    assert_equal %w[0002 0001], dc['dc.identifier.orcid']

    # Multiple authors and only one ORCID
    t2.users.second.orcid = nil
    t2.users.second.save
    dc = dc_for(t2)
    assert_equal ['0002'], dc['dc.identifier.orcid']

    # One author and no ORCID
    t1.users.first.orcid = nil
    t1.users.first.save
    dc = dc_for(t1)
    assert_nil dc['dc.identifier.orcid']
  end

  test 'parses advisor data as DSpace DC' do
    t = theses(:one)
    dss_friendly_thesis(t)
    dc = dc_for(t)

    # One advisor
    assert_equal ['Addy McAdvisor'], dc['dc.contributor.advisor']

    # More than one advisor
    t.advisors = [advisors(:first), advisors(:second)]
    t.save
    dc = dc_for(t)
    assert_equal ['Addy McAdvisor', 'Viola McAdvisor'], dc['dc.contributor.advisor']
  end

  test 'parses copyright data as DSpace DC' do
    # Author holds copyright and license is present
    t = theses(:downloaded)
    t.copyright = copyrights(:author)
    t.license = licenses(:ccby)
    t.users = [users(:yo)]
    t.advisors = [advisors(:first)]
    t.save
    dss_friendly_thesis(t)
    dc = dc_for(t)
    assert_equal ['Attribution 4.0 International (CC BY 4.0)', 'Copyright retained by author(s)'],
                 dc['dc.rights']
    assert_equal 'https://creativecommons.org/licenses/by/4.0/', dc['dc.rights.uri']

    # No URI
    t.license = licenses(:nocc)
    t.save
    dc = dc_for(t)
    assert_nil dc['dc.rights.uri']

    # Author holds copyright and no license
    t.license = nil
    t.save
    dc = dc_for(t)
    assert_equal ['In Copyright', 'Copyright retained by author(s)'], dc['dc.rights']
    assert_equal 'https://rightsstatements.org/page/InC/1.0/', dc['dc.rights.uri']

    # Any other copyright holder
    t.copyright = copyrights(:mit)
    t.save
    dc = dc_for(t)
    assert_equal ['In Copyright - Educational Use Permitted', 'U+00A9 MIT'], dc['dc.rights']
    assert_equal 'http://rightsstatements.org/page/InC-EDU/1.0/', dc['dc.rights.uri']
  end

  test 'parses department data as DSpace DC' do
    # One department
    t = theses(:one)
    dss_friendly_thesis(t)
    dc = dc_for(t)
    assert_equal ['Massachusetts Institute of Technology. Department of Aeronautics and Astronautics'],
                 dc['dc.contributor.department']

    # Multiple departments
    t.departments = [departments(:one), departments(:two)]
    t.save
    dc = dc_for(t)
    assert_equal ['Massachusetts Institute of Technology. Department of Aeronautics and Astronautics',
                  'MIT Anthropology Program'], dc['dc.contributor.department']
  end

  test 'parses degree data as DSpace DC' do
    # One degree
    t = theses(:one)
    dss_friendly_thesis(t)
    d1 = degrees(:one)
    d1.degree_type_id = degree_types(:bachelor).id
    d1.save
    dc = dc_for(t)
    assert_equal ['MFA'], dc['dc.description.degree']
    assert_equal ['Master of Fine Arts'], dc['thesis.degree.name']
    assert_equal ['Bachelor'], dc['mit.thesis.degree']

    # Multiple degrees
    d2 = degrees(:two)
    d2.degree_type_id = degree_types(:master).id
    d2.save
    t.degrees = [d1, d2]
    t.save
    dc = dc_for(t)
    assert_equal %w[MFA JD], dc['dc.description.degree']
    assert_equal ['Master of Fine Arts', 'Master of Fine Arts'], dc['thesis.degree.name']
    assert_equal %w[Bachelor Master], dc['mit.thesis.degree']

    # Does not repeat degree types
    d2.degree_type_id = degree_types(:bachelor).id
    d2.save
    dc = dc_for(t)
    assert_equal ['Bachelor'], dc['mit.thesis.degree']
  end

  test 'parses file data as DSpace DC' do
    t = theses(:one)
    dss_friendly_thesis(t)
    dc = dc_for(t)
    blob = t.files.first.blob
    assert_equal blob.created_at, dc['dc.date.submitted']
  end

  test 'compiles constituent DC metadata on instantiation' do
    t = theses(:one)
    dss_friendly_thesis(t)
    dc = dc_for(t)
    assert_equal 'Massachusetts Institute of Technology', dc['dc.publisher']
    assert_equal 'Thesis', dc['dc.type']

    # Checking for presence of keys instead of exact values, as we verify values in other tests.
    assert_equal ['dc.publisher', 'dc.type', 'dc.title', 'dc.description.abstract', 'dc.date.issued',
                  'dc.contributor.author', 'dc.identifier.orcid', 'dc.contributor.advisor', 'dc.contributor.department',
                  'dc.description.degree', 'thesis.degree.name', 'mit.thesis.degree', 'dc.rights', 'dc.rights.uri',
                  'dc.date.submitted'], dc.keys
  end

  test 'metadata file is structured as expected for DSS' do
    t = theses(:one)
    dss_friendly_thesis(t)
    serialized = DspaceMetadata.new(t).serialize_dss_metadata

    # Make sure the JSON is actually serialized
    assert_instance_of String, serialized

    # Unserialize the JSON so we can check that it's well-formed
    unserialized = JSON.parse(serialized)
    assert_equal ['metadata'], unserialized.keys

    # Expected value first, actual second, so a failure message reads the right way round.
    assert_equal({ 'key' => 'dc.publisher', 'value' => 'Massachusetts Institute of Technology' },
                 unserialized['metadata'].first)
  end
end

0 comments on commit 4bd4a37

Please sign in to comment.