This repository has been archived by the owner on Apr 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreport_missing_file_titles.py
executable file
·72 lines (60 loc) · 2.2 KB
/
report_missing_file_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python3
"""Find all the collections (resources) that are missing an EAD Filing Title.

This script reports, by Repository, any published Collection (Resource)
that is missing an EAD Filing Title.
"""
import sys, getopt, attr, structlog, yaml
import logging
from datetime import datetime
from os.path import exists, expanduser, dirname, realpath
import os
# The report is written to logs/missing_file_title.log next to this script.
logname = os.path.dirname(os.path.realpath(__file__)) + "/logs/missing_file_title.log"
# The file handler created by basicConfig cannot open its file when the
# logs/ directory is absent, so create the directory up front.
os.makedirs(os.path.dirname(logname), exist_ok=True)
logging.basicConfig(filename=logname,level=logging.INFO)
main_log = logging.getLogger(__name__)
main_log.setLevel(logging.INFO)
# Echo only ERROR-and-above records to the console; the INFO-level report
# lines reach the log file through the root logger configured above.
console = logging.StreamHandler()
console.setFormatter(logging.Formatter(' %(message)s'))
console.setLevel(logging.ERROR)
main_log.addHandler(console)
import requests
from boltons.dictutils import OMD
from asnake.aspace import ASpace
import pprint
# strftime pattern used for the start/finish timestamps written to the log.
DATEFORMAT = '%Y-%m-%d %H:%M:%S'
def check_resources(repo):
    """Log each published resource in *repo* that lacks an EAD filing title.

    Unpublished resources are ignored.  A published resource with no
    ``ead_id`` attribute is reported at ERROR level and skipped, although it
    still counts towards the published total.  After the scan a summary line
    (or a "none found" line) is logged.

    Args:
        repo: object exposing ``resources`` (iterable of resources) and
            ``name``.  Presumably an ASnake repository -- confirm with caller.

    Returns:
        None.  All output goes through ``main_log``.
    """
    published_total = 0
    missing_total = 0
    for resource in repo.resources:
        if not resource.publish:
            continue
        published_total += 1

        # The EAD ID identifies the collection in the report, so it is
        # checked before anything else.
        try:
            ead_id = resource.ead_id
        except AttributeError:
            main_log.error("Missing EAD ID: Resource {} {}".format(resource.uri, resource.title))
            continue

        # Treat an absent attribute, an explicit None and a whitespace-only
        # value alike: calling .strip() on None would otherwise raise and
        # abort the whole report.
        file_title = getattr(resource, "finding_aid_filing_title", None)
        if not (file_title or "").strip():
            main_log.info("EADID {}, {}".format(ead_id, resource.title))
            missing_total += 1

    if published_total > 0:
        main_log.info("Out of {} published collections, {} are missing filing titles ({}%)".format(published_total, missing_total, (missing_total/published_total * 100)))
    else:
        main_log.info("No published collections found in {}".format(repo.name))
def main():
    """Run the missing-filing-title report over every published repository.

    Logs a start timestamp, calls ``check_resources`` for each repository
    whose ``publish`` flag is set, then logs a completion timestamp.
    """
    main_log.info("Starting analysis {}".format(datetime.now().strftime(DATEFORMAT) ))
    # ASpace() is called without arguments; presumably the connection
    # settings come from ASnake's own configuration -- TODO confirm.
    aspace = ASpace()
    for repo in aspace.repositories:
        if not repo.publish:
            continue
        main_log.info("\n ****** Checking {} **********\n".format(repo.name))
        check_resources(repo)
    main_log.info("Completed! {}".format(datetime.now().strftime(DATEFORMAT) ))
# Run the report only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()