diff --git a/ia-legacy-importer/README b/ia-legacy-importer/README new file mode 100644 index 00000000..c6128a01 --- /dev/null +++ b/ia-legacy-importer/README @@ -0,0 +1,21 @@ +INSTALLATION + +you need python2.4, and, to import MARC catalogs, whatever +is required by catalog/marc/marc8_to_utf8.pl, including +this: + +http://search.cpan.org/dist/MARC-Charset/ + +you can test that the script is in basic working order by +running test-marc8_to_utf8.sh while in that directory. + +HOW TO IMPORT CATALOG DATA + +CATALOG=(this directory) +SOURCE_DATA=(path to data) +> cp $CATALOG/config-example.sh $MYDIR/config.sh +> # edit $MYDIR/config.sh +> $CATALOG/import.sh $MYDIR/config.sh "marc" "LC" "marc_records_scriblio_net/part01.dat" <$SOURCE_DATA + +see import.sh for details. + diff --git a/ia-legacy-importer/__init__.py b/ia-legacy-importer/__init__.py new file mode 100644 index 00000000..0216b77d --- /dev/null +++ b/ia-legacy-importer/__init__.py @@ -0,0 +1 @@ +"""catalog""" diff --git a/ia-legacy-importer/add_book/__init__.py b/ia-legacy-importer/add_book/__init__.py new file mode 100644 index 00000000..c9ec4e9c --- /dev/null +++ b/ia-legacy-importer/add_book/__init__.py @@ -0,0 +1,748 @@ +"""Module to load books into Open Library. + +This is used to load books from various MARC sources, including +Internet Archive. + +For loading a book, the available metadata is compiled as a dict, +called a record internally. Here is a sample record: + + { + "title": "The Adventures of Tom Sawyer", + "source_records": ["ia:TheAdventuresOfTomSawyer_201303"], + "authors": [{ + "name": "Mark Twain" + }] + } + +The title and source_records fields are mandatory. + +A record is loaded by calling the load function. + + record = {...} + response = load(record) + +""" +import json +import re +import six +from six.moves import urllib +import unicodedata +import web + +from collections import defaultdict +from copy import copy +from time import sleep + +from infogami import config + +from openlibrary import accounts +from openlibrary.catalog.merge.merge_marc import build_marc +from openlibrary.catalog.utils import mk_norm +from openlibrary.core import lending +from openlibrary.utils.isbn import normalize_isbn + +from openlibrary.catalog.add_book.load_book import build_query, east_in_by_statement, import_author, InvalidLanguage +from openlibrary.catalog.add_book.merge import try_merge + + +re_normalize = re.compile('[^[:alphanum:] ]', re.U) +re_lang = re.compile('^/languages/([a-z]{3})$') + + +type_map = { + 'description': 'text', + 'notes': 'text', + 'number_of_pages': 'int', +} + + +class CoverNotSaved(Exception): + def __init__(self, f): + self.f = f + def __str__(self): + return "coverstore responded with: '%s'" % self.f + + +class RequiredField(Exception): + def __init__(self, f): + self.f = f + def __str__(self): + return "missing required field: %s" % self.f + + +# don't use any of these as work titles +bad_titles = set(('Publications', 'Works. English', 'Missal', 'Works', 'Report', \ + 'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence', \ + 'Bill', 'Bills', 'Selections', 'Selected works', 'Selected works. 
English', \ + 'The Novels', 'Laws, etc')) + +subject_fields = ['subjects', 'subject_places', 'subject_times', 'subject_people' ] + + +def strip_accents(s): + """http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string + """ + if isinstance(s, str): + return s + assert isinstance(s, six.text_type) + return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')) + + +def normalize(s): # strip non-alphanums and truncate at 25 chars + norm = strip_accents(s).lower() + norm = norm.replace(' and ', ' ') + if norm.startswith('the '): + norm = norm[4:] + elif norm.startswith('a '): + norm = norm[2:] + # strip bracketed text + norm = re.sub(r' ?\(.*\)', '', norm) + return norm.replace(' ', '')[:25] + + +def is_redirect(thing): + """ + :param Thing thing: + :rtype: bool + """ + if not thing: + return False + return thing.type.key == '/type/redirect' + + +def get_title(e): + if not e.get('work_titles'): + return e['title'] + wt = e['work_titles'][0] + return e['title'] if wt in bad_titles else e['title'] + + +def find_matching_work(e): + """ + Looks for an existing Work representing the new import edition by + comparing normalized titles for every work by each author of the current edition. + Returns the first match found, or None. + + :param dict e: An OL edition suitable for saving, has a key, and has full Authors with keys + but has not yet been saved. + :rtype: None or str + :return: the matched work key "/works/OL..W" if found + """ + + norm_title = mk_norm(get_title(e)) + seen = set() + for a in e['authors']: + q = { + 'type': '/type/work', + 'authors': {'author': {'key': a['key']}} + } + work_keys = list(web.ctx.site.things(q)) + for wkey in work_keys: + w = web.ctx.site.get(wkey) + if wkey in seen: + continue + seen.add(wkey) + if not w.get('title'): + continue + if mk_norm(w['title']) == norm_title: + assert w.type.key == '/type/work' + return wkey + + +def build_author_reply(author_in, edits): + """ + Steps through an import record's authors, and creates new records if new, + adding them to 'edits' to be saved later. + + :param list author_in: List of import sourced author dicts [{"name:" "Some One"}, ...], possibly with dates + :param list edits: list of Things to be saved later. Is modfied by this method. 
+ :rtype: tuple + :return: (list, list) authors [{"key": "/author/OL..A"}, ...], author_reply the JSON status response to return for each author + """ + + authors = [] + author_reply = [] + for a in author_in: + new_author = 'key' not in a + if new_author: + a['key'] = web.ctx.site.new_key('/type/author') + edits.append(a) + authors.append({'key': a['key']}) + author_reply.append({ + 'key': a['key'], + 'name': a['name'], + 'status': ('created' if new_author else 'matched'), + }) + return (authors, author_reply) + + +def new_work(edition, rec, cover_id=None): + """ + :param dict edition: New OL Edition + :param dict rec: Edition import data + :param (int|None) cover_id: cover id + :rtype: dict + :return: a work to save + """ + w = { + 'type': {'key': '/type/work'}, + 'title': get_title(rec), + } + for s in subject_fields: + if s in rec: + w[s] = rec[s] + + if 'authors' in edition: + w['authors'] = [{'type':{'key': '/type/author_role'}, 'author': akey} for akey in edition['authors']] + + if 'description' in rec: + w['description'] = {'type': '/type/text', 'value': rec['description']} + + wkey = web.ctx.site.new_key('/type/work') + if edition.get('covers'): + w['covers'] = edition['covers'] + w['key'] = wkey + return w + + +def add_cover(cover_url, ekey, account=None): + """ + Adds a cover to coverstore and returns the cover id. + + :param str cover_url: URL of cover image + :param str ekey: Edition key /book/OL..M + :rtype: int or None + :return: Cover id, or None if upload did not succeed + """ + olid = ekey.split("/")[-1] + coverstore_url = config.get('coverstore_url').rstrip('/') + upload_url = coverstore_url + '/b/upload2' + if upload_url.startswith("//"): + upload_url = "{0}:{1}".format(web.ctx.get("protocol", "http"), upload_url) + user = account or accounts.get_current_user() + params = { + 'author': user.get('key') or user.get('_key'), + 'data': None, + 'source_url': cover_url, + 'olid': olid, + 'ip': web.ctx.ip, + } + reply = None + for attempt in range(10): + try: + res = urllib.request.urlopen(upload_url, urllib.parse.urlencode(params)) + except IOError: + sleep(2) + continue + body = res.read() + if res.getcode() == 500: + raise CoverNotSaved(body) + if body not in ['', 'None']: + reply = json.loads(body) + if res.getcode() == 200 and 'id' in reply: + break + sleep(2) + if not reply or reply.get('message') == 'Invalid URL': + return + cover_id = int(reply['id']) + return cover_id + +def get_ia_item(ocaid): + import internetarchive as ia + cfg = {'general': {'secure': False}} + item = ia.get_item(ocaid, config=cfg) + return item + +def modify_ia_item(item, data): + access_key = lending.config_ia_ol_metadata_write_s3 and lending.config_ia_ol_metadata_write_s3['s3_key'] + secret_key = lending.config_ia_ol_metadata_write_s3 and lending.config_ia_ol_metadata_write_s3['s3_secret'] + return item.modify_metadata(data, access_key=access_key, secret_key=secret_key) + +def create_ol_subjects_for_ocaid(ocaid, subjects): + item = get_ia_item(ocaid) + openlibrary_subjects = copy(item.metadata.get('openlibrary_subject')) or [] + + if not isinstance(openlibrary_subjects, list): + openlibrary_subjects = [openlibrary_subjects] + + for subject in subjects: + if subject not in openlibrary_subjects: + openlibrary_subjects.append(subject) + + r = modify_ia_item(item, {'openlibrary_subject': openlibrary_subjects}) + if r.status_code != 200: + return ('%s failed: %s' % (item.identifier, r.content)) + else: + return ("success for %s" % item.identifier) + +def 
update_ia_metadata_for_ol_edition(edition_id): + """ + Writes the Open Library Edition and Work id to a linked + archive.org item. + + :param str edition_id: of the form OL..M + :rtype: dict + :return: error report, or modified archive.org metadata on success + """ + + data = {'error': 'No qualifying edition'} + if edition_id: + ed = web.ctx.site.get('/books/%s' % edition_id) + if ed.ocaid: + work = ed.works[0] if ed.get('works') else None + if work and work.key: + item = get_ia_item(ed.ocaid) + work_id = work.key.split('/')[2] + r = modify_ia_item(item, { + 'openlibrary_work': work_id, + 'openlibrary_edition': edition_id + }) + if r.status_code != 200: + data = {'error': '%s failed: %s' % (item.identifier, r.content)} + else: + data = item.metadata + return data + + +def normalize_record_isbns(rec): + """ + Returns the Edition import record with all ISBN fields cleaned. + + :param dict rec: Edition import record + :rtype: dict + :return: A record with cleaned ISBNs in the various possible ISBN locations. + """ + for field in ('isbn_13', 'isbn_10', 'isbn'): + if rec.get(field): + rec[field] = [normalize_isbn(isbn) for isbn in rec.get(field) if normalize_isbn(isbn)] + return rec + + +def isbns_from_record(rec): + """ + Returns a list of all isbns from the various possible isbn fields. + + :param dict rec: Edition import record + :rtype: list + """ + isbns = rec.get('isbn', []) + rec.get('isbn_10', []) + rec.get('isbn_13', []) + return isbns + + +def build_pool(rec): + """ + Searches for existing edition matches on title and bibliographic keys. + + :param dict rec: Edition record + :rtype: dict + :return: {: [list of /books/OL..M keys that match rec on ]} + """ + pool = defaultdict(set) + match_fields = ('title', 'oclc_numbers', 'lccn', 'ocaid') + + # Find records with matching fields + for field in match_fields: + pool[field] = set(editions_matched(rec, field)) + + # update title pool with normalized title matches + pool['title'].update(set(editions_matched(rec, 'normalized_title_', normalize(rec['title'])))) + + # Find records with matching ISBNs + isbns = isbns_from_record(rec) + if isbns: + pool['isbn'] = set(editions_matched(rec, 'isbn_', isbns)) + + return dict((k, list(v)) for k, v in pool.iteritems() if v) + + +def early_exit(rec): + """ + Attempts to quickly find an existing item match using bibliographic keys. + + :param dict rec: Edition record + :rtype: str|bool + :return: First key matched of format "/books/OL..M" or False if no match found. + """ + + if 'openlibrary' in rec: + return '/books/' + rec['openlibrary'] + + ekeys = editions_matched(rec, 'ocaid') + if ekeys: + return ekeys[0] + + isbns = isbns_from_record(rec) + if isbns: + ekeys = editions_matched(rec, 'isbn_', isbns) + if ekeys: + return ekeys[0] + + # only searches for the first value from these lists + for f in 'source_records', 'oclc_numbers', 'lccn': + if rec.get(f): + ekeys = editions_matched(rec, f, rec[f][0]) + if ekeys: + return ekeys[0] + return False + + +def editions_matched(rec, key, value=None): + """ + Search OL for editions matching record's 'key' value. + + :param dict rec: Edition import record + :param str key: Key to search on, e.g. 
'isbn_' + :param list|str value: Value or Values to use, overriding record values + :rtpye: list + :return: List of edition keys ["/books/OL..M",] + """ + if value is None and key not in rec: + return [] + + if value is None: + value = rec[key] + q = { + 'type':'/type/edition', + key: value + } + ekeys = list(web.ctx.site.things(q)) + return ekeys + + +def find_exact_match(rec, edition_pool): + """ + Returns an edition key match for rec from edition_pool + Only returns a key if all values match? + + :param dict rec: Edition import record + :param dict edition_pool: + :rtype: str|bool + :return: edition key + """ + seen = set() + for field, editions in edition_pool.iteritems(): + for ekey in editions: + if ekey in seen: + continue + seen.add(ekey) + existing = web.ctx.site.get(ekey) + match = True + for k, v in rec.items(): + if k == 'source_records': + continue + existing_value = existing.get(k) + if not existing_value: + continue + if k == 'languages': + existing_value = [str(re_lang.match(l.key).group(1)) for l in existing_value] + if k == 'authors': + existing_value = [dict(a) for a in existing_value] + for a in existing_value: + del a['type'] + del a['key'] + for a in v: + if 'entity_type' in a: + del a['entity_type'] + if 'db_name' in a: + del a['db_name'] + + if existing_value != v: + match = False + break + if match: + return ekey + return False + + +def find_match(e1, edition_pool): + """ + Find the best match for e1 in edition_pool and return its key. + :param dict e1: the new edition we are trying to match, output of build_marc(import record) + :param list edition_pool: list of possible edition matches, output of build_pool(import record) + :rtype: str|None + :return: None or the edition key '/books/OL...M' of the best edition match for e1 in edition_pool + """ + seen = set() + for k, v in edition_pool.iteritems(): + for edition_key in v: + if edition_key in seen: + continue + thing = None + found = True + while not thing or is_redirect(thing): + seen.add(edition_key) + thing = web.ctx.site.get(edition_key) + if thing is None: + found = False + break + if is_redirect(thing): + edition_key = thing['location'] + # FIXME: this updates edition_key, but leaves thing as redirect, + # which will raise an exception in try_merge() + if not found: + continue + if try_merge(e1, edition_key, thing): + return edition_key + + +def add_db_name(rec): + """ + db_name = Author name followed by dates. + adds 'db_name' in place for each author. + """ + if 'authors' not in rec: + return + + for a in rec['authors']: + date = None + if 'date' in a: + assert 'birth_date' not in a and 'death_date' not in a + date = a['date'] + elif 'birth_date' in a or 'death_date' in a: + date = a.get('birth_date', '') + '-' + a.get('death_date', '') + a['db_name'] = ' '.join([a['name'], date]) if date else a['name'] + + +def load_data(rec, account=None): + """ + Adds a new Edition to Open Library. Checks for existing Works. + Creates a new Work, and Author, if required, + otherwise associates the new Edition with the existing Work. + + :param dict rec: Edition record to add (no further checks at this point) + :rtype: dict + :return: + { + "success": False, + "error": + } + OR + { + "success": True, + "work": {"key": , "status": "created" | "modified" | "matched"}, + "edition": {"key": , "status": "created"}, + "authors": [{"status": "matched", "name": "John Smith", "key": }, ...] 
+ } + """ + + cover_url = None + if 'cover' in rec: + cover_url = rec['cover'] + del rec['cover'] + try: + # get an OL style edition dict + edition = build_query(rec) + except InvalidLanguage as e: + return { + 'success': False, + 'error': str(e), + } + + ekey = web.ctx.site.new_key('/type/edition') + cover_id = None + if cover_url: + cover_id = add_cover(cover_url, ekey, account=account) + if cover_id: + edition['covers'] = [cover_id] + + edits = [] # Things (Edition, Work, Authors) to be saved + reply = {} + # TOFIX: edition.authors has already been processed by import_authors() in build_query(), following line is a NOP? + author_in = [import_author(a, eastern=east_in_by_statement(rec, a)) for a in edition.get('authors', [])] + # build_author_reply() adds authors to edits + (authors, author_reply) = build_author_reply(author_in, edits) + + if authors: + edition['authors'] = authors + reply['authors'] = author_reply + + wkey = None + work_state = 'created' + # Look for an existing work + if 'authors' in edition: + wkey = find_matching_work(edition) + if wkey: + w = web.ctx.site.get(wkey) + work_state = 'matched' + found_wkey_match = True + need_update = False + for k in subject_fields: + if k not in rec: + continue + for s in rec[k]: + if normalize(s) not in [normalize(existing) for existing in w.get(k, [])]: + w.setdefault(k, []).append(s) + need_update = True + if cover_id: + w.setdefault('covers', []).append(cover_id) + need_update = True + if need_update: + work_state = 'modified' + edits.append(w.dict()) + else: + # Create new work + w = new_work(edition, rec, cover_id) + wkey = w['key'] + edits.append(w) + + assert wkey + edition['works'] = [{'key': wkey}] + edition['key'] = ekey + edits.append(edition) + + web.ctx.site.save_many(edits, comment='import new book', action='add-book') + + # Writes back `openlibrary_edition` and `openlibrary_work` to + # archive.org item after successful import: + if 'ocaid' in rec: + update_ia_metadata_for_ol_edition(ekey.split('/')[-1]) + + reply['success'] = True + reply['edition'] = {'key': ekey, 'status': 'created'} + reply['work'] = {'key': wkey, 'status': work_state} + return reply + + +def load(rec, account=None): + """Given a record, tries to add/match that edition in the system. + + Record is a dictionary containing all the metadata of the edition. 
+ The following fields are mandatory: + + * title: str + * source_records: list + + :param dict rec: Edition record to add + :rtype: dict + :return: a dict to be converted into a JSON HTTP response, same as load_data() + """ + required_fields = ['title', 'source_records'] # ['authors', 'publishers', 'publish_date'] + for field in required_fields: + if not rec.get(field): + raise RequiredField(field) + if isinstance(rec['source_records'], six.string_types): + rec['source_records'] = [rec['source_records']] + + rec = normalize_record_isbns(rec) + + edition_pool = build_pool(rec) + if not edition_pool: + # No match candidates found, add edition + return load_data(rec, account=account) + + match = early_exit(rec) + if not match: + match = find_exact_match(rec, edition_pool) + + if not match: + rec['full_title'] = rec['title'] + if rec.get('subtitle'): + rec['full_title'] += ' ' + rec['subtitle'] + e1 = build_marc(rec) + add_db_name(e1) + match = find_match(e1, edition_pool) + + if not match: + # No match found, add edition + return load_data(rec, account=account) + + # We have an edition match at this point + need_work_save = need_edition_save = False + w = None + e = web.ctx.site.get(match) + # check for, and resolve, author redirects + for a in e.authors: + while is_redirect(a): + if a in e.authors: + e.authors.remove(a) + a = web.ctx.site.get(a.location) + if not is_redirect(a): + e.authors.append(a) + + if e.get('works'): + w = e.works[0].dict() + work_created = False + else: + # Found an edition without a work + work_created = need_work_save = need_edition_save = True + w = new_work(e.dict(), rec) + e.works = [{'key': w['key']}] + + # Add subjects to work, if not already present + if 'subjects' in rec: + work_subjects = list(w.get('subjects', [])) + for s in rec['subjects']: + if s not in work_subjects: + work_subjects.append(s) + need_work_save = True + if need_work_save and work_subjects: + w['subjects'] = work_subjects + + # Add cover to edition + if 'cover' in rec and not e.get_covers(): + cover_url = rec['cover'] + cover_id = add_cover(cover_url, e.key, account=account) + if cover_id: + e['covers'] = [cover_id] + need_edition_save = True + + # Add cover to work, if needed + if not w.get('covers') and e.get_covers(): + w['covers'] = [e['covers'][0]] + need_work_save = True + + # Add description to work, if needed + if not w.get('description') and e.get('description'): + w['description'] = e['description'] + need_work_save = True + + # Add authors to work, if needed + if not w.get('authors'): + authors = [import_author(a) for a in rec.get('authors', [])] + w['authors'] = [{'type':{'key': '/type/author_role'}, 'author': a.key} for a in authors if a.get('key')] + if w.get('authors'): + need_work_save = True + + # Add ocaid to edition (str), if needed + if 'ocaid' in rec and not e.ocaid: + e['ocaid'] = rec['ocaid'] + need_edition_save = True + + edition_fields = [ + 'local_id', 'ia_box_id', 'ia_loaded_id', 'source_records'] + # TODO: + # only consider `source_records` for newly created work + # or if field originally missing: + #if work_created and not e.get('source_records'): + # edition_fields.append('source_records') + for f in edition_fields: + if f not in rec: + continue + # ensure values is a list + values = rec[f] if isinstance(rec[f], list) else [rec[f]] + if f in e: + # get values from rec that are not currently on the edition + to_add = [v for v in values if v not in e[f]] + e[f] += to_add + else: + e[f] = to_add = values + if to_add: + need_edition_save = True + + edits = [] + 
reply = { + 'success': True, + 'edition': {'key': match, 'status': 'matched'}, + 'work': {'key': w['key'], 'status': 'matched'}, + } + if need_edition_save: + reply['edition']['status'] = 'modified' + edits.append(e.dict()) + if need_work_save: + reply['work']['status'] = 'created' if work_created else 'modified' + edits.append(w) + if edits: + web.ctx.site.save_many(edits, comment='import existing book', action='edit-book') + if 'ocaid' in rec: + update_ia_metadata_for_ol_edition(match.split('/')[-1]) + return reply diff --git a/ia-legacy-importer/add_book/load_book.py b/ia-legacy-importer/add_book/load_book.py new file mode 100644 index 00000000..a226bc32 --- /dev/null +++ b/ia-legacy-importer/add_book/load_book.py @@ -0,0 +1,224 @@ +import web +import re +from openlibrary.catalog.utils import flip_name, author_dates_match, key_int + + +def east_in_by_statement(rec, author): + """ + Returns False if there is no by_statement in rec. + Otherwise returns whether author name uses eastern name order. + TODO: elaborate on what this actually means, and how it is used. + + :param dict rec: import source edition record + :param dict author: import source author dict: {"name": "Some One"} + :rtype: bool + """ + + if 'by_statement' not in rec: + return False + if 'authors' not in rec: + return False + name = author['name'] + flipped = flip_name(name) + name = name.replace('.', '') + name = name.replace(', ', '') + if name == flipped.replace('.', ''): + # name was not flipped + return False + return rec['by_statement'].find(name) != -1 + + +def do_flip(author): + """ + Given an author import dict, flip its name in place + i.e. Smith, John => John Smith + + :param dict author: + :rtype: None + """ + if 'personal_name' not in author: + return + if author['personal_name'] != author['name']: + return + first_comma = author['name'].find(', ') + if first_comma == -1: + return + # e.g: Harper, John Murdoch, 1845- + if author['name'].find(',', first_comma + 1) != -1: + return + if author['name'].find('i.e.') != -1: + return + if author['name'].find('i. e.') != -1: + return + name = flip_name(author['name']) + author['name'] = name + author['personal_name'] = name + + +def pick_from_matches(author, match): + """ + Finds the best match for author from a list of OL authors records, match. + + :param dict author: Author import representation + :param list match: List of matching OL author records + :rtype: dict + :return: A single OL author record from match + """ + maybe = [] + if 'birth_date' in author and 'death_date' in author: + maybe = [m for m in match if 'birth_date' in m and 'death_date' in m] + elif 'date' in author: + maybe = [m for m in match if 'date' in m] + if not maybe: + maybe = match + if len(maybe) == 1: + return maybe[0] + return min(maybe, key=key_int) + + +def find_author(name): + """ + Searches OL for an author by name. 
+ + :param str name: Author's name + :rtype: list + :return: A list of OL author representations than match name + """ + def walk_redirects(obj, seen): + seen.add(obj['key']) + while obj['type']['key'] == '/type/redirect': + assert obj['location'] != obj['key'] + obj = web.ctx.site.get(obj['location']) + seen.add(obj['key']) + return obj + + q = {'type': '/type/author', 'name': name} # FIXME should have no limit + reply = list(web.ctx.site.things(q)) + authors = [web.ctx.site.get(k) for k in reply] + if any(a.type.key != '/type/author' for a in authors): + seen = set() + authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen] + return authors + + +def find_entity(author): + """ + Looks for an existing Author record in OL by name + and returns it if found. + + :param dict author: Author import dict {"name": "Some One"} + :rtype: dict|None + :return: Existing Author record, if one is found + """ + name = author['name'] + things = find_author(name) + et = author.get('entity_type') + if et and et != 'person': + if not things: + return None + db_entity = things[0] + assert db_entity['type']['key'] == '/type/author' + return db_entity + if ', ' in name: + things += find_author(flip_name(name)) + match = [] + seen = set() + for a in things: + key = a['key'] + if key in seen: + continue + seen.add(key) + orig_key = key + assert a.type.key == '/type/author' + if 'birth_date' in author and 'birth_date' not in a: + continue + if 'birth_date' not in author and 'birth_date' in a: + continue + if not author_dates_match(author, a): + continue + match.append(a) + if not match: + return None + if len(match) == 1: + return match[0] + return pick_from_matches(author, match) + + +def import_author(author, eastern=False): + """ + Converts an import style new-author dictionary into an + Open Library existing author, or new author candidate, representation. + Does NOT create new authors. + + :param dict author: Author import record {"name": "Some One"} + :param bool eastern: Eastern name order + :rtype: dict + :return: Open Library style Author representation, either exisiting with "key", + or new candidate without "key". + """ + existing = find_entity(author) + if existing: + assert existing.type.key == '/type/author' + for k in 'last_modified', 'id', 'revision', 'created': + if existing.k: + del existing.k + new = existing + if 'death_date' in author and 'death_date' not in existing: + new['death_date'] = author['death_date'] + return new + if not eastern: + do_flip(author) + a = {'type': {'key': '/type/author'}} + for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date': + if f in author: + a[f] = author[f] + return a + + +class InvalidLanguage(Exception): + def __init__(self, code): + self.code = code + def __str__(self): + return "invalid language code: '%s'" % self.code + + +type_map = { 'description': 'text', 'notes': 'text', 'number_of_pages': 'int' } + + +def build_query(rec): + """ + Takes an edition record dict, rec, and returns an Open Library edition + suitable for saving. 
+ + :param dict rec: Edition record to add to Open Library + :rtype: dict + :return: Open Library style edition representation + """ + book = { + 'type': { 'key': '/type/edition'}, + } + + for k, v in rec.iteritems(): + if k == 'authors': + if v and v[0]: + book['authors'] = [] + for author in v: + east = east_in_by_statement(rec, author) + book['authors'].append(import_author(author, eastern=east)) + continue + if k == 'languages': + langs = [] + for l in v: + if web.ctx.site.get('/languages/' + l) is None: + raise InvalidLanguage(l) + book['languages'] = [{'key': '/languages/' + l} for l in v] + continue + if k in type_map: + t = '/type/' + type_map[k] + if isinstance(v, list): + book[k] = [{'type': t, 'value': i} for i in v] + else: + book[k] = {'type': t, 'value': v} + else: + book[k] = v + return book diff --git a/ia-legacy-importer/add_book/merge.py b/ia-legacy-importer/add_book/merge.py new file mode 100644 index 00000000..15c8a60d --- /dev/null +++ b/ia-legacy-importer/add_book/merge.py @@ -0,0 +1,50 @@ +from openlibrary.catalog.merge.merge_marc import build_marc, attempt_merge +import web + +threshold = 875 + +def db_name(a): + date = None + if a.birth_date or a.death_date: + date = a.get('birth_date', '') + '-' + a.get('death_date', '') + elif a.date: + date = a.date + return ' '.join([a['name'], date]) if date else a['name'] + +# FIXME: badly named. edition_record_equal? (candidate_ed, existing_ed) +def try_merge(e1, edition_key, existing): + """ + Converts the existing edition into a comparable dict and performs a + thresholded comparison to decide whether they are the same. + Used by add_book.load() -> add_book.find_match() to check whether two + editions match. + + :param dict e1: Output of build_marc(import record candidate) + :param str edition_key: edition key of existing + :param Thing existing: Edition object to be tested against e1, the object of edition_key + :rtype: bool + :return: Whether e1 is sufficiently the same as the 'existing' edition + """ + + thing_type = existing.type.key + if thing_type == '/type/delete': + return False + # FIXME: will fail if existing is a redirect. 
+ assert thing_type == '/type/edition' + rec2 = {} + rec2['full_title'] = existing.title + if existing.subtitle: + rec2['full_title'] += ' ' + existing.subtitle + for f in 'isbn', 'isbn_10', 'isbn_13', 'lccn', 'publish_country', 'publishers', 'publish_date': + if existing.get(f): + rec2[f] = existing[f] + if existing.authors: + rec2['authors'] = [] + for a in existing.authors: + while a.type.key == '/type/redirect': + a = web.ctx.site.get(a.location) + if a.type.key == '/type/author': + assert a['name'] + rec2['authors'].append({'name': a['name'], 'db_name': db_name(a)}) + e2 = build_marc(rec2) + return attempt_merge(e1, e2, threshold) diff --git a/ia-legacy-importer/add_book/test_add_book.py b/ia-legacy-importer/add_book/test_add_book.py new file mode 100644 index 00000000..5dcf4206 --- /dev/null +++ b/ia-legacy-importer/add_book/test_add_book.py @@ -0,0 +1,840 @@ +from __future__ import print_function + +import os +import pytest + +from copy import deepcopy +from collections import defaultdict + +from infogami.infobase.core import Text + +from openlibrary.catalog import add_book +from openlibrary.catalog.add_book import add_db_name, build_pool, editions_matched, isbns_from_record, load, RequiredField +from openlibrary.catalog.add_book.load_book import build_query, InvalidLanguage +from openlibrary.catalog.add_book.merge import try_merge + +from openlibrary.catalog.merge.merge_marc import build_marc +from openlibrary.catalog.marc.parse import read_edition +from openlibrary.catalog.marc.marc_binary import MarcBinary, BadLength, BadMARC + + +from six.moves.urllib.request import urlopen + + +def open_test_data(filename): + """Returns a file handle to file with specified filename inside test_data directory. + """ + root = os.path.dirname(__file__) + fullpath = os.path.join(root, 'test_data', filename) + return open(fullpath) + +@pytest.fixture +def add_languages(mock_site): + languages = [ + ('eng', 'English'), + ('spa', 'Spanish'), + ('fre', 'French'), + ('yid', 'Yiddish'), + ] + for code, name in languages: + mock_site.save({ + 'key': '/languages/' + code, + 'name': name, + 'type': {'key': '/type/language'}, + }) + +@pytest.fixture +def ia_writeback(monkeypatch): + """Prevent ia writeback from making live requests. 
+ """ + monkeypatch.setattr(add_book, 'update_ia_metadata_for_ol_edition', lambda olid: {}) + +def test_build_query(add_languages): + rec = { + 'title': 'magic', + 'languages': ['eng', 'fre'], + 'authors': [{}], + 'description': 'test', + } + q = build_query(rec) + assert q['title'] == 'magic' + assert q['description'] == {'type': '/type/text', 'value': 'test'} + assert q['type'] == {'key': '/type/edition'} + assert q['languages'] == [{'key': '/languages/eng'}, {'key': '/languages/fre'}] + + pytest.raises(InvalidLanguage, build_query, {'languages': ['wtf']}) + +def test_isbns_from_record(): + rec = {'title': 'test', 'isbn_13': ['9780190906764'], 'isbn_10': ['0190906766']} + result = isbns_from_record(rec) + assert isinstance(result, list) + assert '9780190906764' in result + assert '0190906766' in result + assert len(result) == 2 + +def test_editions_matched_no_results(mock_site): + rec = {'title': 'test', 'isbn_13': ['9780190906764'], 'isbn_10': ['0190906766']} + isbns = isbns_from_record(rec) + result = editions_matched(rec, 'isbn_', isbns) + # returns no results because there are no existing editions + assert result == [] + +def test_editions_matched(mock_site, add_languages, ia_writeback): + rec = {'title': 'test', 'isbn_13': ['9780190906764'], 'isbn_10': ['0190906766'], 'source_records': ['test:001']} + load(rec) + isbns = isbns_from_record(rec) + + result_10 = editions_matched(rec, 'isbn_10', '0190906766') + assert result_10 == ['/books/OL1M'] + + result_13 = editions_matched(rec, 'isbn_13', '9780190906764') + assert result_13 == ['/books/OL1M'] + + # searching on key isbn_ will return a matching record on either isbn_10 or isbn_13 metadata fields + result = editions_matched(rec, 'isbn_', isbns) + assert result == ['/books/OL1M'] + +def test_load_without_required_field(): + rec = {'ocaid': 'test item'} + pytest.raises(RequiredField, load, {'ocaid': 'test_item'}) + +def test_load_test_item(mock_site, add_languages, ia_writeback): + rec = { + 'ocaid': 'test_item', + 'source_records': ['ia:test_item'], + 'title': 'Test item', + 'languages': ['eng'], + } + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + e = mock_site.get(reply['edition']['key']) + assert e.type.key == '/type/edition' + assert e.title == 'Test item' + assert e.ocaid == 'test_item' + assert e.source_records == ['ia:test_item'] + l = e.languages + assert len(l) == 1 and l[0].key == '/languages/eng' + + assert reply['work']['status'] == 'created' + w = mock_site.get(reply['work']['key']) + assert w.title == 'Test item' + assert w.type.key == '/type/work' + +def test_load_with_subjects(mock_site, ia_writeback): + rec = { + 'ocaid': 'test_item', + 'title': 'Test item', + 'subjects': ['Protected DAISY', 'In library'], + 'source_records': 'ia:test_item', + } + reply = load(rec) + assert reply['success'] is True + w = mock_site.get(reply['work']['key']) + assert w.title == 'Test item' + assert w.subjects == ['Protected DAISY', 'In library'] + +def test_load_with_new_author(mock_site, ia_writeback): + rec = { + 'ocaid': 'test_item', + 'title': 'Test item', + 'authors': [{'name': 'John Doe'}], + 'source_records': 'ia:test_item', + } + reply = load(rec) + assert reply['success'] is True + w = mock_site.get(reply['work']['key']) + assert reply['authors'][0]['status'] == 'created' + assert reply['authors'][0]['name'] == 'John Doe' + akey1 = reply['authors'][0]['key'] + assert akey1 == '/authors/OL1A' + a = mock_site.get(akey1) + assert w.authors + assert a.type.key == 
'/type/author' + + # Tests an existing author is modified if an Author match is found, and more data is provided + # This represents an edition of another work by the above author. + rec = { + 'ocaid': 'test_item1b', + 'title': 'Test item1b', + 'authors': [{'name': 'Doe, John', 'entity_type': 'person'}], + 'source_records': 'ia:test_item1b', + } + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + assert reply['work']['status'] == 'created' + akey2 = reply['authors'][0]['key'] + + # TODO: There is no code that modifies an author if more data is provided. + # previously the status implied the record was always 'modified', when a match was found. + #assert reply['authors'][0]['status'] == 'modified' + #a = mock_site.get(akey2) + #assert 'entity_type' in a + #assert a.entity_type == 'person' + + assert reply['authors'][0]['status'] == 'matched' + assert akey1 == akey2 == '/authors/OL1A' + + # Tests same title with different ocaid and author is not overwritten + rec = { + 'ocaid': 'test_item2', + 'title': 'Test item', + 'authors': [{'name': 'James Smith'}], + 'source_records': 'ia:test_item2', + } + reply = load(rec) + akey3 = reply['authors'][0]['key'] + assert akey3 == '/authors/OL2A' + assert reply['authors'][0]['status'] == 'created' + assert reply['work']['status'] == 'created' + assert reply['edition']['status'] == 'created' + w = mock_site.get(reply['work']['key']) + e = mock_site.get(reply['edition']['key']) + assert e.ocaid == 'test_item2' + assert len(w.authors) == 1 + assert len(e.authors) == 1 + +def test_load_with_redirected_author(mock_site, add_languages): + """Test importing existing editions without works + which have author redirects. A work should be created with + the final author. 
+ """ + redirect_author = { + 'type': {'key': '/type/redirect'}, + 'name': 'John Smith', + 'key': '/authors/OL55A', + 'location': '/authors/OL10A'} + final_author = { + 'type': {'key': '/type/author'}, + 'name': 'John Smith', + 'key': '/authors/OL10A'} + orphaned_edition = { + 'title': 'Test item HATS', + 'key': '/books/OL10M', + 'publishers': ['TestPub'], + 'publish_date': '1994', + 'authors': [{'key': '/authors/OL55A'}], + 'type': {'key': '/type/edition'}} + mock_site.save(orphaned_edition) + mock_site.save(redirect_author) + mock_site.save(final_author) + + rec = { + 'title': 'Test item HATS', + 'authors': [{'name': 'John Smith'}], + 'publishers': ['TestPub'], + 'publish_date': '1994', + 'source_records': 'ia:test_redir_author'} + reply = load(rec) + assert reply['edition']['status'] == 'modified' + assert reply['edition']['key'] == '/books/OL10M' + assert reply['work']['status'] == 'created' + e = mock_site.get(reply['edition']['key']) + assert e.authors[0].key == '/authors/OL10A' + w = mock_site.get(reply['work']['key']) + assert w.authors[0].author.key == '/authors/OL10A' + +def test_duplicate_ia_book(mock_site, add_languages, ia_writeback): + rec = { + 'ocaid': 'test_item', + 'source_records': ['ia:test_item'], + 'title': 'Test item', + 'languages': ['eng'], + } + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + e = mock_site.get(reply['edition']['key']) + assert e.type.key == '/type/edition' + assert e.source_records == ['ia:test_item'] + + rec = { + 'ocaid': 'test_item', + 'source_records': ['ia:test_item'], + # Titles MUST match to be considered the same + 'title': 'Test item', + 'languages': ['fre'], + } + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'matched' + +def test_from_marc_3(mock_site, add_languages): + ia = 'treatiseonhistor00dixo' + data = open_test_data(ia + '_meta.mrc').read() + assert len(data) == int(data[:5]) + rec = read_edition(MarcBinary(data)) + rec['source_records'] = ['ia:' + ia] + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + e = mock_site.get(reply['edition']['key']) + assert e.type.key == '/type/edition' + +def test_from_marc_2(mock_site, add_languages): + ia = 'roadstogreatness00gall' + data = open_test_data(ia + '_meta.mrc').read() + assert len(data) == int(data[:5]) + rec = read_edition(MarcBinary(data)) + rec['source_records'] = ['ia:' + ia] + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + e = mock_site.get(reply['edition']['key']) + assert e.type.key == '/type/edition' + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'matched' + +def test_from_marc(mock_site, add_languages): + ia = 'flatlandromanceo00abbouoft' + data = open_test_data(ia + '_meta.mrc').read() + assert len(data) == int(data[:5]) + rec = read_edition(MarcBinary(data)) + reply = load(rec) + assert reply['success'] is True + akey1 = reply['authors'][0]['key'] + a = mock_site.get(akey1) + assert a.type.key == '/type/author' + assert a.name == 'Edwin Abbott Abbott' + assert a.birth_date == '1838' + assert a.death_date == '1926' + +def test_author_from_700(mock_site, add_languages): + ia = 'sexuallytransmit00egen' + data = open_test_data(ia + '_meta.mrc').read() + rec = read_edition(MarcBinary(data)) + rec['source_records'] = ['ia:' + ia] + reply = load(rec) + assert reply['success'] is True + # author from 700 + akey = 
reply['authors'][0]['key'] + a = mock_site.get(akey) + assert a.type.key == '/type/author' + assert a.name == 'Laura K. Egendorf' + assert a.birth_date == '1973' + +def test_from_marc_fields(mock_site, add_languages): + ia = 'isbn_9781419594069' + data = open_test_data(ia + '_meta.mrc').read() + rec = read_edition(MarcBinary(data)) + rec['source_records'] = ['ia:' + ia] + reply = load(rec) + assert reply['success'] is True + # author from 100 + assert reply['authors'][0]['name'] == 'Adam Weiner' + + edition = mock_site.get(reply['edition']['key']) + # Publish place, publisher, & publish date - 260$a, $b, $c + assert edition['publishers'][0] == 'Kaplan Publishing' + assert edition['publish_date'] == '2007' + assert edition['publish_places'][0] == 'New York' + # Pagination 300 + assert edition['number_of_pages'] == 264 + assert edition['pagination'] == 'viii, 264 p.' + # 8 subjects, 650 + assert len(edition['subjects']) == 8 + assert edition['subjects'] == [u'Action and adventure films', + u'Miscellanea', + u'Physics', + u'Cinematography', + u'Special effects', + u'Physics in motion pictures', + u'Science fiction films', + u'Popular works'] + # Edition description from 520 + desc = 'Explains the basic laws of physics, covering such topics as mechanics, forces, and energy, while deconstructing famous scenes and stunts from motion pictures, including "Apollo 13" and "Titanic," to determine if they are possible.' + assert isinstance(edition['description'], Text) + assert edition['description'] == desc + # Work description from 520 + work = mock_site.get(reply['work']['key']) + assert isinstance(work['description'], Text) + assert work['description'] == desc + +def test_build_pool(mock_site): + assert build_pool({'title': 'test'}) == {} + etype = '/type/edition' + ekey = mock_site.new_key(etype) + e = { + 'title': 'test', + 'type': {'key': etype}, + 'lccn': ['123'], + 'oclc_numbers': ['456'], + 'ocaid': 'test00test', + 'key': ekey, + } + + mock_site.save(e) + pool = build_pool(e) + assert pool == { + 'lccn': ['/books/OL1M'], + 'oclc_numbers': ['/books/OL1M'], + 'title': ['/books/OL1M'], + 'ocaid': ['/books/OL1M'] + } + + pool = build_pool({'lccn': ['234'], 'oclc_numbers': ['456'], 'title': 'test', 'ocaid': 'test00test'}) + assert pool == { 'oclc_numbers': ['/books/OL1M'], 'title': ['/books/OL1M'], 'ocaid': ['/books/OL1M'] } + +def test_try_merge(mock_site): + rec = { + 'title': 'Test item', + 'lccn': ['123'], + 'authors': [{'name': 'Smith, John', 'birth_date': '1980'}], + 'source_records': ['ia:test_item'], + } + reply = load(rec) + ekey = reply['edition']['key'] + e = mock_site.get(ekey) + + rec['full_title'] = rec['title'] + e1 = build_marc(rec) + add_db_name(e1) + result = try_merge(e1, ekey, e) + assert result is True + +def test_load_multiple(mock_site): + rec = { + 'title': 'Test item', + 'lccn': ['123'], + 'source_records': ['ia:test_item'], + 'authors': [{'name': 'Smith, John', 'birth_date': '1980'}], + } + reply = load(rec) + assert reply['success'] is True + ekey1 = reply['edition']['key'] + + reply = load(rec) + assert reply['success'] is True + ekey2 = reply['edition']['key'] + assert ekey1 == ekey2 + + reply = load({'title': 'Test item', 'source_records': ['ia:test_item2'], 'lccn': ['456']}) + assert reply['success'] is True + ekey3 = reply['edition']['key'] + assert ekey3 != ekey1 + + reply = load(rec) + assert reply['success'] is True + ekey4 = reply['edition']['key'] + + assert ekey1 == ekey2 == ekey4 + +def test_add_db_name(): + authors = [ + {'name': 'Smith, John' }, + {'name': 
'Smith, John', 'date': '1950' }, + { 'name': 'Smith, John', + 'birth_date': '1895', + 'death_date': '1964' }, + ] + orig = deepcopy(authors) + add_db_name({'authors': authors}) + orig[0]['db_name'] = orig[0]['name'] + orig[1]['db_name'] = orig[1]['name'] + ' 1950' + orig[2]['db_name'] = orig[2]['name'] + ' 1895-1964' + assert authors == orig + + rec = {} + add_db_name(rec) + assert rec == {} + +def test_from_marc(mock_site, add_languages): + ia = 'coursepuremath00hardrich' + marc = MarcBinary(open_test_data(ia + '_meta.mrc').read()) + rec = read_edition(marc) + rec['source_records'] = ['ia:' + ia] + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'matched' + + ia = 'flatlandromanceo00abbouoft' + marc = MarcBinary(open_test_data(ia + '_meta.mrc').read()) + + rec = read_edition(marc) + rec['source_records'] = ['ia:' + ia] + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'matched' + +def test_real_example(mock_site, add_languages): + src = 'v38.i37.records.utf8--16478504-1254' + marc = MarcBinary(open_test_data(src).read()) + rec = read_edition(marc) + rec['source_records'] = ['marc:' + src] + reply = load(rec) + assert reply['success'] is True + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'matched' + + src = 'v39.i28.records.utf8--5362776-1764' + marc = MarcBinary(open_test_data(src).read()) + rec = read_edition(marc) + rec['source_records'] = ['marc:' + src] + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'modified' + +def test_missing_ocaid(mock_site, add_languages, ia_writeback): + ia = 'descendantsofhug00cham' + src = ia + '_meta.mrc' + marc = MarcBinary(open_test_data(src).read()) + rec = read_edition(marc) + rec['source_records'] = ['marc:testdata.mrc'] + reply = load(rec) + assert reply['success'] is True + rec['source_records'] = ['ia:' + ia] + rec['ocaid'] = ia + reply = load(rec) + assert reply['success'] is True + e = mock_site.get(reply['edition']['key']) + assert e.ocaid == ia + assert 'ia:' + ia in e.source_records + +def test_extra_author(mock_site, add_languages): + mock_site.save({ + "name": "Hubert Howe Bancroft", + "death_date": "1918.", + "alternate_names": ["HUBERT HOWE BANCROFT", "Hubert Howe Bandcroft"], + "key": "/authors/OL563100A", + "birth_date": "1832", + "personal_name": "Hubert Howe Bancroft", + "type": {"key": "/type/author"}, + }) + + mock_site.save({ + "title": "The works of Hubert Howe Bancroft", + "covers": [6060295, 5551343], + "first_sentence": {"type": "/type/text", "value": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."}, + "subject_places": ["Alaska", "America", "Arizona", "British Columbia", "California", "Canadian Northwest", "Central America", "Colorado", "Idaho", "Mexico", "Montana", "Nevada", "New Mexico", "Northwest Coast of North America", "Northwest boundary of the United States", "Oregon", "Pacific States", "Texas", "United States", "Utah", "Washington (State)", "West (U.S.)", "Wyoming"], + "excerpts": [{"excerpt": "When it first became known to Europe that a new continent had been discovered, the wise men, 
philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."}], + "first_publish_date": "1882", + "key": "/works/OL3421434W", + "authors": [{"type": {"key": "/type/author_role"}, "author": {"key": "/authors/OL563100A"}}], + "subject_times": ["1540-1810", "1810-1821", "1821-1861", "1821-1951", "1846-1850", "1850-1950", "1859-", "1859-1950", "1867-1910", "1867-1959", "1871-1903", "Civil War, 1861-1865", "Conquest, 1519-1540", "European intervention, 1861-1867", "Spanish colony, 1540-1810", "To 1519", "To 1821", "To 1846", "To 1859", "To 1867", "To 1871", "To 1889", "To 1912", "Wars of Independence, 1810-1821"], + "type": {"key": "/type/work"}, + "subjects": ["Antiquities", "Archaeology", "Autobiography", "Bibliography", "California Civil War, 1861-1865", "Comparative Literature", "Comparative civilization", "Courts", "Description and travel", "Discovery and exploration", "Early accounts to 1600", "English essays", "Ethnology", "Foreign relations", "Gold discoveries", "Historians", "History", "Indians", "Indians of Central America", "Indians of Mexico", "Indians of North America", "Languages", "Law", "Mayas", "Mexican War, 1846-1848", "Nahuas", "Nahuatl language", "Oregon question", "Political aspects of Law", "Politics and government", "Religion and mythology", "Religions", "Social life and customs", "Spanish", "Vigilance committees", "Writing", "Zamorano 80", "Accessible book", "Protected DAISY"] + }) + + ia = 'workshuberthowe00racegoog' + src = ia + '_meta.mrc' + marc = MarcBinary(open_test_data(src).read()) + rec = read_edition(marc) + rec['source_records'] = ['ia:' + ia] + + reply = load(rec) + assert reply['success'] is True + + w = mock_site.get(reply['work']['key']) + + reply = load(rec) + assert reply['success'] is True + w = mock_site.get(reply['work']['key']) + assert len(w['authors']) == 1 + +def test_missing_source_records(mock_site, add_languages): + mock_site.save({ + 'key': '/authors/OL592898A', + 'name': 'Michael Robert Marrus', + 'personal_name': 'Michael Robert Marrus', + 'type': { 'key': '/type/author' } + }) + + mock_site.save({ + 'authors': [{'author': '/authors/OL592898A', 'type': { 'key': '/type/author_role' }}], + 'key': '/works/OL16029710W', + 'subjects': ['Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946', 'Protected DAISY', 'Lending library'], + 'title': 'The Nuremberg war crimes trial, 1945-46', + 'type': { 'key': '/type/work' }, + }) + + mock_site.save({ + "number_of_pages": 276, + "subtitle": "a documentary history", + "series": ["The Bedford series in history and culture"], + "covers": [6649715, 3865334, 173632], + "lc_classifications": ["D804.G42 N87 1997"], + "ocaid": "nurembergwarcrim00marr", + "contributions": ["Marrus, Michael Robert."], + "uri_descriptions": ["Book review (H-Net)"], + "title": "The Nuremberg war crimes trial, 1945-46", + "languages": [{"key": "/languages/eng"}], + "subjects": ["Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946"], + "publish_country": "mau", "by_statement": "[compiled by] Michael R. Marrus.", + "type": {"key": "/type/edition"}, + "uris": ["http://www.h-net.org/review/hrev-a0a6c9-aa"], + "publishers": ["Bedford Books"], + "ia_box_id": ["IA127618"], + "key": "/books/OL1023483M", + "authors": [{"key": "/authors/OL592898A"}], + "publish_places": ["Boston"], + "pagination": "xi, 276 p. :", + "lccn": ["96086777"], + "notes": {"type": "/type/text", "value": "Includes bibliographical references (p. 
262-268) and index."}, + "identifiers": {"goodreads": ["326638"], "librarything": ["1114474"]}, + "url": ["http://www.h-net.org/review/hrev-a0a6c9-aa"], + "isbn_10": ["031216386X", "0312136919"], + "publish_date": "1997", + "works": [{"key": "/works/OL16029710W"}] + }) + + ia = 'nurembergwarcrim1997marr' + src = ia + '_meta.mrc' + marc = MarcBinary(open_test_data(src).read()) + rec = read_edition(marc) + rec['source_records'] = ['ia:' + ia] + + reply = load(rec) + assert reply['success'] is True + e = mock_site.get(reply['edition']['key']) + assert 'source_records' in e + +def test_no_extra_author(mock_site, add_languages): + author = { + "name": "Paul Michael Boothe", + "key": "/authors/OL1A", + "type": {"key": "/type/author"}, + } + mock_site.save(author) + + work = { + "title": "A Separate Pension Plan for Alberta", + "covers": [1644794], + "key": "/works/OL1W", + "authors": [{"type": "/type/author_role", "author": {"key": "/authors/OL1A"}}], + "type": {"key": "/type/work"}, + } + mock_site.save(work) + + edition = { + "number_of_pages": 90, + "subtitle": "Analysis and Discussion (Western Studies in Economic Policy, No. 5)", + "weight": "6.2 ounces", + "covers": [1644794], + "latest_revision": 6, + "title": "A Separate Pension Plan for Alberta", + "languages": [{"key": "/languages/eng"}], + "subjects": ["Economics", "Alberta", "Political Science / State & Local Government", "Government policy", "Old age pensions", "Pensions", "Social security"], + "type": {"key": "/type/edition"}, + "physical_dimensions": "9 x 6 x 0.2 inches", + "publishers": ["The University of Alberta Press"], + "physical_format": "Paperback", + "key": "/books/OL1M", + "authors": [{"key": "/authors/OL1A"}], + "identifiers": {"goodreads": ["4340973"], "librarything": ["5580522"]}, + "isbn_13": ["9780888643513"], + "isbn_10": ["0888643519"], + "publish_date": "May 1, 2000", + "works": [{"key": "/works/OL1W"}] + } + mock_site.save(edition) + + src = 'v39.i34.records.utf8--186503-1413' + marc = MarcBinary(open_test_data(src).read()) + rec = read_edition(marc) + rec['source_records'] = ['marc:' + src] + + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'modified' + assert reply['work']['status'] == 'modified' + assert 'authors' not in reply + + assert reply['edition']['key'] == edition['key'] + assert reply['work']['key'] == work['key'] + + e = mock_site.get(reply['edition']['key']) + w = mock_site.get(reply['work']['key']) + + assert 'source_records' in e + assert 'subjects' in w + assert len(e['authors']) == 1 + assert len(w['authors']) == 1 + +def test_don_quixote(mock_site): + """ + All of these items are by 'Miguel de Cervantes Saavedra', + only one Author should be created. Some items have bad + MARC length, others are missing binary MARC altogether + and raise BadMARC exceptions. 
+ """ + pytest.skip("This test make live requests to archive.org") + + dq = [u'lifeexploitsofin01cerv', u'cu31924096224518', + u'elingeniosedcrit04cerv', u'ingeniousgentlem01cervuoft', + u'historyofingenio01cerv', u'lifeexploitsofin02cerviala', + u'elingeniosohidal03cervuoft', u'nybc209000', u'elingeniosohidal11cerv', + u'elingeniosohidal01cervuoft', u'elingeniosoh01cerv', + u'donquixotedelama00cerviala', u'1896elingeniosohid02cerv', + u'ingeniousgentlem04cervuoft', u'cu31924027656978', u'histoiredeladmir01cerv', + u'donquijotedelama04cerv', u'cu31924027657075', u'donquixotedelama03cervuoft', + u'aventurasdedonqu00cerv', u'p1elingeniosohid03cerv', + u'geshikhefundonik01cervuoft', u'historyofvalorou02cerviala', + u'ingeniousgentlem01cerv', u'donquixotedelama01cervuoft', + u'ingeniousgentlem0195cerv', u'firstpartofdelig00cervuoft', + u'p4elingeniosohid02cerv', u'donquijote00cervuoft', u'cu31924008863924', + u'c2elingeniosohid02cerv', u'historyofvalorou03cerviala', + u'historyofingenio01cerviala', u'historyadventure00cerv', + u'elingeniosohidal00cerv', u'lifeexploitsofin01cervuoft', + u'p2elingeniosohid05cerv', u'nybc203136', u'elingeniosohidal00cervuoft', + u'donquixotedelama02cervuoft', u'lingnieuxcheva00cerv', + u'ingeniousgentlem03cerv', u'vidayhechosdeli00siscgoog', + u'lifeandexploits01jarvgoog', u'elingeniosohida00puiggoog', + u'elingeniosohida00navagoog', u'donquichottedel02florgoog', + u'historydonquixo00cogoog', u'vidayhechosdeli01siscgoog', + u'elingeniosohida28saavgoog', u'historyvalorous00brangoog', + u'elingeniosohida01goog', u'historyandadven00unkngoog', + u'historyvalorous01goog', u'ingeniousgentle11saavgoog', + u'elingeniosohida10saavgoog', u'adventuresdonqu00jarvgoog', + u'historydonquixo04saavgoog', u'lingnieuxcheval00rouxgoog', + u'elingeniosohida19saavgoog', u'historyingeniou00lalagoog', + u'elingeniosohida00ormsgoog', u'historyandadven01smolgoog', + u'elingeniosohida27saavgoog', u'elingeniosohida21saavgoog', + u'historyingeniou00mottgoog', u'historyingeniou03unkngoog', + u'lifeandexploits00jarvgoog', u'ingeniousgentle00conggoog', + u'elingeniosohida00quixgoog', u'elingeniosohida01saavgoog', + u'donquixotedelam02saavgoog', u'adventuresdonqu00gilbgoog', + u'historyingeniou02saavgoog', u'donquixotedelam03saavgoog', + u'elingeniosohida00ochogoog', u'historyingeniou08mottgoog', + u'lifeandexploits01saavgoog', u'firstpartdeligh00shelgoog', + u'elingeniosohida00castgoog', u'elingeniosohida01castgoog', + u'adventofdonquixo00cerv', u'portablecervante00cerv', + u'firstpartofdelig14cerv', u'donquixotemanofl00cerv', + u'firstpartofdelig00cerv'] + + bad_length = [] + bad_marc = [] + + add_languages(mock_site) + edition_status_counts = defaultdict(int) + work_status_counts = defaultdict(int) + author_status_counts = defaultdict(int) + + for ocaid in dq: + marc_url = 'https://archive.org/download/%s/%s_meta.mrc' % (ocaid, ocaid) + data = urlopen(marc_url).read() + try: + marc = MarcBinary(data) + except BadLength: + bad_length.append(ocaid) + continue + except BadMARC: + bad_marc.append(ocaid) + continue + + rec = read_edition(marc) + rec['source_records'] = ['ia:' + ocaid] + reply = load(rec) + + q = { + 'type': '/type/work', + 'authors.author': '/authors/OL1A', + } + work_keys = list(mock_site.things(q)) + author_keys = list(mock_site.things({'type': '/type/author'})) + print("\nReply for %s: %s" % (ocaid, reply)) + print("Work keys: %s" % work_keys) + assert author_keys == ['/authors/OL1A'] + assert reply['success'] is True + + # Increment status counters + 
edition_status_counts[reply['edition']['status']] += 1 + work_status_counts[reply['work']['status']] += 1 + if (reply['work']['status'] != 'matched') and (reply['edition']['status'] != 'modified'): + # No author key in response if work is 'matched' + # No author key in response if edition is 'modified' + author_status_counts[reply['authors'][0]['status']] += 1 + + print("BAD MARC LENGTH items: %s" % bad_length) + print("BAD MARC items: %s" % bad_marc) + print("Edition status counts: %s" % edition_status_counts) + print("Work status counts: %s" % work_status_counts) + print("Author status counts: %s" % author_status_counts) + + +def test_same_twice(mock_site, add_languages): + rec = { + 'source_records': ['ia:test_item'], + "publishers": ["Ten Speed Press"], "pagination": "20 p.", "description": "A macabre mash-up of the children's classic Pat the Bunny and the present-day zombie phenomenon, with the tactile features of the original book revoltingly re-imagined for an adult audience.", "title": "Pat The Zombie", "isbn_13": ["9781607740360"], "languages": ["eng"], "isbn_10": ["1607740362"], "authors": [{"entity_type": "person", "name": "Aaron Ximm", "personal_name": "Aaron Ximm"}], "contributions": ["Kaveh Soofi (Illustrator)"]} + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + assert reply['work']['status'] == 'created' + + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'matched' + assert reply['work']['status'] == 'matched' + + +def test_existing_work(mock_site, add_languages): + author = { + 'type': {'key': '/type/author'}, + 'name': 'John Smith', + 'key': '/authors/OL20A'} + existing_work = { + 'authors': [{'author': '/authors/OL20A', 'type': {'key': '/type/author_role'}}], + 'key': '/works/OL16W', + 'title': 'Finding existing works', + 'type': {'key': '/type/work'}, + } + mock_site.save(author) + mock_site.save(existing_work) + rec = { + 'source_records': 'non-marc:test', + 'title': 'Finding Existing Works', + 'authors': [{'name': 'John Smith'}], + 'publishers': ['Black Spot'], + 'publish_date': 'Jan 09, 2011', + 'isbn_10': ['1250144051'], + } + + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + assert reply['work']['status'] == 'matched' + assert reply['work']['key'] == '/works/OL16W' + assert reply['authors'][0]['status'] == 'matched' + e = mock_site.get(reply['edition']['key']) + assert e.works[0]['key'] == '/works/OL16W' + + +def test_existing_work_with_subtitle(mock_site, add_languages): + author = { + 'type': {'key': '/type/author'}, + 'name': 'John Smith', + 'key': '/authors/OL20A'} + existing_work = { + 'authors': [{'author': '/authors/OL20A', 'type': {'key': '/type/author_role'}}], + 'key': '/works/OL16W', + 'title': 'Finding existing works', + 'type': {'key': '/type/work'}, + } + mock_site.save(author) + mock_site.save(existing_work) + rec = { + 'source_records': 'non-marc:test', + 'title': 'Finding Existing Works', + 'subtitle': 'the ongoing saga!', + 'authors': [{'name': 'John Smith'}], + 'publishers': ['Black Spot'], + 'publish_date': 'Jan 09, 2011', + 'isbn_10': ['1250144051'], + } + + reply = load(rec) + assert reply['success'] is True + assert reply['edition']['status'] == 'created' + assert reply['work']['status'] == 'matched' + assert reply['work']['key'] == '/works/OL16W' + assert reply['authors'][0]['status'] == 'matched' + e = mock_site.get(reply['edition']['key']) + assert e.works[0]['key'] == '/works/OL16W' diff 
--git a/ia-legacy-importer/add_book/test_merge.py b/ia-legacy-importer/add_book/test_merge.py new file mode 100644 index 00000000..781031a4 --- /dev/null +++ b/ia-legacy-importer/add_book/test_merge.py @@ -0,0 +1,47 @@ +import pytest +import web + +from openlibrary.catalog.add_book.merge import try_merge +from openlibrary.core.models import Edition +from openlibrary.mocks.mock_infobase import MockSite + +@pytest.mark.skip("This should be tested, but tidy up deprecated methods first.") +def test_try_merge(): + web.ctx.site = MockSite() + bpl = {'authors': [{'birth_date': u'1897', + 'db_name': u'Green, Constance McLaughlin 1897-', + 'entity_type': 'person', + 'name': u'Green, Constance McLaughlin', + 'personal_name': u'Green, Constance McLaughlin'}], + 'full_title': u'Eli Whitney and the birth of American technology', + 'isbn': [u'188674632X'], + 'normalized_title': u'eli whitney and the birth of american technology', + 'number_of_pages': 215, + 'publish_date': '1956', + 'publishers': [u'HarperCollins', u'[distributed by Talman Pub.]'], + 'short_title': u'eli whitney and the birth', + 'source_record_loc': 'bpl101.mrc:0:1226', + 'titles': [u'Eli Whitney and the birth of American technology', + u'eli whitney and the birth of american technology']} + # This existing needs to be an Edition Thing object. + existing = {'authors': [{'birth_date': u'1897', + 'db_name': u'Green, Constance McLaughlin 1897-', + 'entity_type': 'person', + 'name': u'Green, Constance McLaughlin', + 'personal_name': u'Green, Constance McLaughlin'}], + 'full_title': u'Eli Whitney and the birth of American technology.', + 'isbn': [], + 'normalized_title': u'eli whitney and the birth of american technology', + 'number_of_pages': 215, + 'publish_date': '1956', + 'publishers': ['Little, Brown'], + 'short_title': u'eli whitney and the birth', + 'source_record_loc': 'marc_records_scriblio_net/part04.dat:119539872:591', + 'title': 'Eli Whitney and the birth of American technology.', + 'type': {'key': '/type/edition'}, + 'key': '/books/OL1M'} + + web.ctx.site.save_many([existing]) + ed = web.ctx.site.get('/books/OL1M') + assert try_merge(bpl, '/books/OL1M', ed) is True + diff --git a/ia-legacy-importer/amazon/__init__.py b/ia-legacy-importer/amazon/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ia-legacy-importer/amazon/add_covers.py b/ia-legacy-importer/amazon/add_covers.py new file mode 100644 index 00000000..545b3098 --- /dev/null +++ b/ia-legacy-importer/amazon/add_covers.py @@ -0,0 +1,37 @@ +from __future__ import print_function +import simplejson + +from six.moves.urllib.request import urlopen + + +base = 'http://ia331526.us.archive.org:7001/openlibrary.org/log/' + +out = open('edition_and_isbn', 'w') +offset = '2009-06-01:0' +while not offset.startswith('2010-03-17:'): + url = base + offset + ret = simplejson.load(urlopen(url)) + offset, data = ret['offset'], ret['data'] + print(offset, len(data)) + for i in data: + action = i.pop('action') + key = i['data'].pop('key', None) + if action == 'new_account': + continue + author = i['data'].get('author', None) if 'data' in i else None + if author != '/user/ImportBot': + continue + assert action in ('save_many', 'save') + if action == 'save' and key.startswith('/b/'): + e = i['data']['query'] + if e: + isbn = e.get('isbn_10', None) + if isbn: + print((key, isbn), file=out) + elif action == 'save_many': + for e in i['data']['query']: + if e['type'] == '/type/edition' and e['key'].startswith('/b/'): + isbn = e.get('isbn_10', None) + if isbn: + 
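+ # bulk saves: write the key and isbn_10 of every edition touched by ImportBot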
print((e['key'], isbn), file=out) +out.close() diff --git a/ia-legacy-importer/amazon/amazon_to_arc.py b/ia-legacy-importer/amazon/amazon_to_arc.py new file mode 100644 index 00000000..ff88980a --- /dev/null +++ b/ia-legacy-importer/amazon/amazon_to_arc.py @@ -0,0 +1,43 @@ +from __future__ import print_function +import socket + +#url = "http://www.amazon.com/dp/1847195881" +#asin = "1847195881" + +def get(sock, host, url): + send = 'GET %s HTTP/1.1\r\nHost: %s\r\nAccept-Encoding: identity\r\n\r\n' % (url, host) + sock.sendall(send) + + fp = sock.makefile('rb', 0) + + line = fp.readline() + print('status:', repr(line)) + + state = 'header' + for line in fp: + if line == '\r\n': + break + print('header', repr(line)) + + while True: + chunk_size = int(fp.readline(),16) + print(chunk_size) + if chunk_size == 0: + break + print(len(fp.read(chunk_size))) + print(repr(fp.read(2))) + line = fp.readline() + print(line) + fp.close() + +host = 'openlibrary.org' +host = 'www.amazon.com' +sock = socket.create_connection((host, 80)) + +url = 'http://openlibrary.org/type/work' +url = "http://www.amazon.com/dp/1847195881" +get(sock, host, url) + +url = 'http://openlibrary.org/type/edition' +url = "http://www.amazon.com/dp/0393062287" +get(sock, host, url) diff --git a/ia-legacy-importer/amazon/arc_index.py b/ia-legacy-importer/amazon/arc_index.py new file mode 100644 index 00000000..898ed803 --- /dev/null +++ b/ia-legacy-importer/amazon/arc_index.py @@ -0,0 +1,23 @@ +from __future__ import print_function +import os + +arc_dir = '/2/edward/amazon/arc' + +def read_arc(filename): + f = open(arc_dir + '/' + filename) + idx = open(arc_dir + '/' + filename + '.idx', 'w') + while True: + pos = f.tell() + line = f.readline() + if line == '': + break + print(pos, file=idx) + size = int(line[:-1].split(' ')[4]) + f.read(size) + line = f.readline() + f.close() + idx.close() + +for filename in (i for i in os.listdir(arc_dir) if i.endswith('.arc')): + print(filename) + read_arc(filename) diff --git a/ia-legacy-importer/amazon/arc_view.py b/ia-legacy-importer/amazon/arc_view.py new file mode 100644 index 00000000..b277ba46 --- /dev/null +++ b/ia-legacy-importer/amazon/arc_view.py @@ -0,0 +1,65 @@ +import web +import os +from six import StringIO + +arc_dir = '/2/edward/amazon/arc' +urls = ( + '/', 'index', + '/(\d+\.arc)', 'arc_view', + '/(\d+\.arc)/(\d+)', 'page_view', +) +app = web.application(urls, globals(), autoreload=True) + +class arc_view: + def GET(self, filename): + ret = '' + ret += 'back to index
<br>' + ret += '<h2>%s</h2>' % filename + idx = open(arc_dir + '/' + filename + '.idx') + arc = open(arc_dir + '/' + filename) + for pos in idx: + arc.seek(int(pos)) + line = arc.readline()[:-1].split(' ') + ret += '<a href="/%s/%d">from ARC</a> OR <a href="%s">original</a> %s<br>
' % (filename, int(pos), line[0], line[0]) + idx.close() + + ret += '' + return ret + +class page_view: + def GET(self, filename, offset): + arc = open(arc_dir + '/' + filename) + arc.seek(int(offset)) + size = int(arc.readline().split(' ')[4]) + f = StringIO(arc.read(size)) + f.readline() + ret = '' + while True: + line=f.readline() + if line == '\r\n': + break + while True: + line = f.readline() + chunk_size = int(line, 16) + if chunk_size == 0: + break + buf = f.read(chunk_size) + ret += buf + f.readline() + return ret + +class index: + def GET(self): + ret = '
    ' + for filename in os.listdir(arc_dir): + if not filename.endswith('.arc'): + continue + f = open(arc_dir + '/' + filename) + line = f.readline() + f.close() + ret += '
  • %s - %s' % (filename, filename, line) + ret += '' + return ret + +if __name__ == "__main__": + app.run() diff --git a/ia-legacy-importer/amazon/crawl.py b/ia-legacy-importer/amazon/crawl.py new file mode 100644 index 00000000..a9027f1b --- /dev/null +++ b/ia-legacy-importer/amazon/crawl.py @@ -0,0 +1,291 @@ +from __future__ import print_function +from lxml.html import parse, tostring, fromstring +import re +import sys +import os +import socket +from time import sleep +from os.path import exists +from datetime import date, timedelta, datetime +import codecs + +from six.moves.urllib.parse import unquote +from six.moves.urllib.request import urlopen + + +# scrap Amazon for book and author data + +re_expect_end = re.compile('\n\n[ \n]*$') + +# publisher = Big Idea Books & Just Me Music +re_personalized = re.compile('Personalized for (.*) \((Boy|Girl)\)', re.I) + +def percent(a, b): + return float(a * 100.0) / b + +class PersonalizedBooks(Exception): + pass + +page_size = 12 +max_pages = 100 +max_results = page_size * max_pages + +# http://www.amazon.com/s/qid=1265761735/ref=sr_nr_n_0/177-5112913-4864616?ie=UTF8&rs=1000&bbn=1000&rnid=1000&rh=i%3Astripbooks%2Cp_n%5Ffeature%5Fbrowse-bin%3A618083011%2Cp%5Fn%5Fdate%3A20090101%2Cn%3A%211000%2Cn%3A1 +re_product_title = re.compile('/dp/([^/]*)') +re_result_count = re.compile('Showing (?:[\d,]+ - [\d,]+ of )?([\d,]+) Result') +#re_rh_n = re.compile('rh=n%3A(\d+)%2C') +re_rh_n = re.compile('%2Cn%3A(\d+)') +re_facet_count = re.compile(u'^\xa0\(([\d,]+)\)$') +u'\xa0(8)' + +base_url = "http://www.amazon.com/s?ie=UTF8&rh=" +rh = 'i:stripbooks,p_n_feature_browse-bin:618083011,p_n_date:' + +out_dir = '/0/amazon' +arc_dir = '/0/amazon/arc' + +# 4 = Children's Books, 28 = Teens +re_child_book_param = re.compile(',n:(4|28)(?:&page=\d+)?$') + +def now(): + return datetime.utcnow().replace(microsecond=0) + +max_size = 1024 * 1024 * 1024 * 10 # 10 GB +ip = '207.241.229.141' +content_type_hdr = 'Content-Type: ' +re_charset_header = re.compile('; charset=(.+)\r\n') +version_block = '1 0 Open Library\nURL IP-address Archive-date Content-type Archive-length\n' + +class Scraper: + def __init__(self, recording=True): + self.host = 'www.amazon.com' + self.sock = socket.create_connection((self.host, 80)) + self.recording = recording + self.cur_arc = None + + def add_to_arc(self, url, start, content_type, reply): + d = start.strftime('%Y%m%d%H%M%S') + if self.cur_arc is None or os.stat(arc_dir + self.cur_arc).st_size > max_size: + self.cur_arc = now().strftime('%Y%m%d%H%M%S') + '.arc' + assert not exists(arc_dir + self.cur_arc) + out = open(arc_dir + self.cur_arc, 'w') + out.write(' '.join(['filespec://' + self.cur_arc, ip, d, 'text/plain', str(len(version_block))]) + '\n') + out.write(version_block) + else: + out = open(arc_dir + self.cur_arc, 'a') + out.write('\n' + ' '.join([url, ip, d, content_type, str(len(reply))]) + '\n') + out.write(reply) + out.close() + + def get(self, url): + start = now() + send = 'GET %s HTTP/1.1\r\nHost: %s\r\nUser-Agent: Mozilla/5.0\r\nAccept-Encoding: identity\r\n\r\n' % (url, self.host) + self.sock.sendall(send) + + fp = self.sock.makefile('rb', 0) + recv_buf = '' + + line = fp.readline() + if not line.startswith('HTTP/1.1 200'): + print('status:', repr(line)) + recv_buf += line + + body = '' + content_type = None + charset = None + for line in fp: # read headers + recv_buf += line + if line.lower().startswith('transfer-encoding'): + assert line == 'Transfer-Encoding: chunked\r\n' + if line == '\r\n': + break + if 
line.lower().startswith('content-type'): + assert line.startswith(content_type_hdr) + assert line[-2:] == '\r\n' + content_type = line[len(content_type_hdr):line.find(';') if ';' in line else -2] + if 'charset' in line.lower(): + m = re_charset_header.search(line) + charset = m.group(1) + + while True: + line = fp.readline() + recv_buf += line + chunk_size = int(line, 16) + if chunk_size == 0: + break + chunk = fp.read(chunk_size) + recv_buf += chunk + body += chunk + assert chunk_size == len(chunk) + recv_buf += fp.read(2) + line = fp.readline() + recv_buf += line + fp.close() + if self.recording: + self.add_to_arc(url, start, content_type, recv_buf) + return body.decode(charset) if charset else body + +scraper = Scraper(recording=True) + +def get_url(params): + url = base_url + params + page = scraper.get(url) + return fromstring(page) + +def get_total(root): + if root.find(".//h1[@id='noResultsTitle']") is not None: + return 0 + result_count = root.find(".//td[@class='resultCount']").text + m = re_result_count.match(result_count) + return int(m.group(1).replace(',', '')) + +def read_books(params, root): + # sometimes there is no link, bug at Amazaon + # either skip it, or reload the page + for i in range(5): + book_links = [e.find('.//a[@href]') for e in root.find_class('dataColumn')] + if all(a is not None for a in book_links): + break + sleep(2) + print('retry:', params) + root = get_url(params) + if re_child_book_param.search(params) and all(re_personalized.search(span.text) for span in root.find_class('srTitle')): + raise PersonalizedBooks + return [re_product_title.search(a.attrib['href']).group(1) for a in book_links if a is not None and a.text] + +def get_cats(root): + cats = [] + for div in root.find_class('narrowItemHeading'): + if div.text != 'Department': + continue + container = div.getparent() + assert container.tag == 'td' and container.attrib['class'] == 'refinementContainer' + break + + table = container.find('table') + for e in table.iterfind(".//div[@class='refinement']"): + a = e[0] + assert a.tag == 'a' + span1 = a[0] + assert span1.tag == 'span' and span1.attrib['class'] == 'refinementLink' + span2 = a[1] + assert span2.tag == 'span' and span2.attrib['class'] == 'narrowValue' + href = a.attrib['href'] + m1 = re_rh_n.search(href) + if not m1: + print('no match:') + print(repr(href)) + m2 = re_facet_count.search(span2.text) + cats.append((int(m1.group(1)), span1.text, int(m2.group(1).replace(',','')))) + + return cats + + for e in container.find('table').find_class('refinementLink'): + a = e.getparent() + assert a.tag == 'a' + cat = { 'url': a.attrib['href'], 'title': e.text } + href = a.attrib['href'] + m = re_rh_n.search(href) + cats.append((int(m.group(1)), e.text)) + +def read_page(params): + # read search results page + root = get_url(params) + total = get_total(root) + if total == 0: + print('no results found') + return total, set(), [] + grand_total = total + pages = (total / page_size) + 1 + print('total:', total, 'pages:', pages) + + cats = get_cats(root) + print('cats 1') + for a, b, c in cats: + print("%8d %-30s %8d" % (a, b, c)) + #return grand_total, [], cats + + books = set() + + books.update(read_books(params, root)) + for page in range(2, min((pages, 100))+1): + params_with_page = params + "&page=%d" % page + books.update(read_books(params_with_page, get_url(params_with_page))) + print(page, len(books)) + + print(len(books)) + + cats = get_cats(root) + print('cats 2') + for a, b, c in cats: + print("%8d %30s %8d" % (a, b, c)) + print('cat total:', 
sum(i[2] for i in cats)) + if total > max_results: + for n, title, count in cats: + print(repr(n, title, count)) + params_with_cat = params + ",n:" + str(n) + root = get_url(params_with_cat) + cat_total = get_total(root) + pages = (cat_total / page_size) + 1 + print('cat_total:', total, 'pages:', total / page_size) + if cat_total > max_results: + print('cat_total (%d) > max results (%d)' % (total, max_results)) + # assert cat_total <= max_results + try: + books.update(read_books(params_with_cat, root)) + except PersonalizedBooks: + print('WARNING: Personalized Books') + continue + for page in range(2, min((pages, 100)) + 1): + params_with_page = params_with_cat + "&page=%d" % page + try: + books.update(read_books(params_with_page, get_url(params_with_page))) + except PersonalizedBooks: + print('WARNING: Personalized Books') + break + print(repr(n, title, page, cat_total / page_size, len(books), "%.1f%%" % percent(len(books), grand_total))) + + return total, books, cats + +def write_books(books): + i = 0 + error_count = 0 + + for asin in books: + i+= 1 + for attempt in range(5): + try: + #page = urlopen('http://amazon.com/dp/' + asin).read() + page = scraper.get('http://www.amazon.com/dp/' + asin) + if re_expect_end.search(page): + break + print('bad page ending') + print(repr(page[-60:])) + error_count += 1 + if error_count == 50: + print('too many bad endings') + print('http://amazon.com/dp/' + asin) + sys.exit(0) + except: + pass + print('retry') + sleep(5) + +if __name__ == '__main__': + + one_day = timedelta(days=1) + cur = date(2009, 1, 1) # start date + cur = date(2009, 11, 11) # start date + #cur = date(2009, 12, 25) + while True: + print(cur) + total, books, cats = read_page(rh + cur.strftime("%Y%m%d")) + open(out_dir + '/total.' + str(cur), 'w').write(str(total) + "\n") + + out = open(out_dir + "/cats." 
+ str(cur), 'w') + for i in cats: + print(i, file=out) + out.close() + print(len(books)) + write_books(books) + cur += one_day diff --git a/ia-legacy-importer/amazon/crawl_top_books.py b/ia-legacy-importer/amazon/crawl_top_books.py new file mode 100644 index 00000000..fe5b1528 --- /dev/null +++ b/ia-legacy-importer/amazon/crawl_top_books.py @@ -0,0 +1,22 @@ +from __future__ import print_function +from openlibrary.catalog.amazon.crawl import read_page, write_books, get_url, get_cats + +def get_serp(): + params = 'i:stripbooks,n:!1000,p_n_feature_browse-bin:618083011' + + #crawled = set(i[:-1] for i in open('/2/edward/amazon/crawled')) + + total, books, cats = read_page(params) + print('total:', total, 'number of books:', len(books), 'number of cats:', len(cats)) + +#get_serp() + +params = 'i:stripbooks,n:9988' +root = get_url(params) +cats = get_cats(root) + +for a, b, c in cats: + print("%8d %-30s %8d" % (a, b, c)) + +#books = [i[:-1] for i in open('/2/edward/amazon/best_sellers2')] +#write_books(books) diff --git a/ia-legacy-importer/amazon/extract_amazon_fields.py b/ia-legacy-importer/amazon/extract_amazon_fields.py new file mode 100644 index 00000000..0cdee052 --- /dev/null +++ b/ia-legacy-importer/amazon/extract_amazon_fields.py @@ -0,0 +1,34 @@ +# find fields in amazon data that don't appear in MARC data, extract and store in shelve + +import shelve + +seg_file = '/home/edward/ol/amazon/seg/22' + +match = set(eval(line)[0] \ + for line \ + in open('/home/edward/ol/merge/amazon_marc/amazon_lc_map')) + +# fields that MARC is missing: +# binding +# subject +# category +# series +# series_num +# edition +# dimensions +# first_sentence +# sip [] +# cap [] +# shipping_weight + +fields = [ 'binding', 'subject', 'category', 'series', 'series_num', 'edition',\ + 'dimensions', 'first_sentence', 'sip', 'cap', 'shipping_weight' ] + +d = shelve.open('amazon_fields.shelve', protocol=-1, writeback=True) + +for line in open(seg_file): + isbn, item = eval(line) + if isbn not in match: + continue + d[isbn] = dict([(f, item[f]) for f in fields if f in item]) +d.close diff --git a/ia-legacy-importer/amazon/get_other_editions.py b/ia-legacy-importer/amazon/get_other_editions.py new file mode 100644 index 00000000..bc7e1161 --- /dev/null +++ b/ia-legacy-importer/amazon/get_other_editions.py @@ -0,0 +1,38 @@ +from __future__ import print_function +from catalog.read_rc import read_rc +import web +import sys +import os.path +from time import time + +from six.moves import urllib + + +rc = read_rc() +web.config.db_parameters = dict(dbn='postgres', db=rc['db'], user=rc['user'], pw=rc['pw'], host=rc['host']) +web.config.db_printing = False +web.load() +dir = sys.argv[1] + +chunk = 10 +t0 = time() +isbn_iter = web.query('select value from edition_str where key_id=30') +for i, row in enumerate(isbn_iter): + isbn = row.value + dest = dir + '/' + isbn + if os.path.exists(dest): + continue + if len(isbn) != 10: + continue + url = 'http://www.amazon.com/dp/other-editions/' + isbn + try: + page = urllib.request.urlopen(url).read() + except urllib.error.HTTPError as error: + if error.code != 404: + raise + page = '' + open(dest, 'w').write(page) + if i % chunk == 0: + t1 = time() - t0 + rec_per_sec = float(i) / float(t1) + print("%s %s %.2f rec/sec" % (url, isbn, rec_per_sec)) diff --git a/ia-legacy-importer/amazon/import.py b/ia-legacy-importer/amazon/import.py new file mode 100644 index 00000000..a8586f0c --- /dev/null +++ b/ia-legacy-importer/amazon/import.py @@ -0,0 +1,207 @@ +from __future__ import print_function 
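+# Match scraped Amazon editions against existing Open Library records, following redirects and merging where possible.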
+import sys +import re +import os +from parse import read_edition +from lxml.html import fromstring +import catalog.importer.pool as pool +from catalog.importer.db_read import get_mc, withKey +import catalog.merge.amazon as amazon_merge +from catalog.get_ia import get_from_local, get_ia +from catalog.merge.merge_marc import build_marc +import catalog.marc.fast_parse as fast_parse + +import six +from six.moves import urllib + + +re_amazon = re.compile('^([A-Z0-9]{10}),(\d+):(.*)$', re.S) + +re_normalize = re.compile('[^\w ]') +re_whitespace = re.compile('\s+') +re_title_parens = re.compile('^(.+) \([^)]+?\)$') + +re_meta_marc = re.compile('([^/]+)_(meta|marc)\.(mrc|xml)') +# marc:marc_ithaca_college/ic_marc.mrc:224977427:1064 + +threshold = 875 + +def normalize_str(s): + s = re_normalize.sub('', s.strip()) + s = re_whitespace.sub(' ', s) + return str(s.lower()) + +# isbn, short title +def build_index_fields(asin, edition): + title = edition['title'] + if 'subtitle' in edition: + title += ' ' + edition['subtitle'] + + def norm(s): + return normalize_str(s)[:25].rstrip() + + titles = set([norm(title)]) + m = re_title_parens.match(title) + if m: + titles.add(norm(m.group(1))) + + isbn = set([asin]) + for field in 'asin', 'isbn_10', 'isbn_13': + if field in edition: + isbn.add(edition[field].replace('-', '')) + return {'title': list(titles), 'isbn': list(isbn)} + +def read_amazon_file(f): + while True: + buf = f.read(1024) + if not buf: + break + m = re_amazon.match(buf) + (asin, page_len, page) = m.groups() + page += f.read(int(page_len) - len(page)) + try: + edition = read_edition(fromstring(page)) + except: + print('bad record:', asin) + raise + if not edition: + continue + yield asin, edition + +def follow_redirects(key): + keys = [] + thing = None + while not thing or thing['type']['key'] == '/type/redirect': + keys.append(key) + thing = withKey(key) + assert thing + if thing['type']['key'] == '/type/redirect': + print('following redirect %s => %s' % (key, thing['location'])) + key = thing['location'] + return (keys, thing) + +def ia_match(a, ia): + try: + loc, rec = get_ia(ia) + except urllib.error.HTTPError: + return False + if rec is None or 'full_title' not in rec: + return False + try: + e1 = build_marc(rec) + except TypeError: + print(rec) + raise + return amazon_merge.attempt_merge(a, e1, threshold, debug=False) + +def marc_match(a, loc): + assert loc + rec = fast_parse.read_edition(get_from_local(loc)) + e1 = build_marc(rec) + #print 'amazon:', a + return amazon_merge.attempt_merge(a, e1, threshold, debug=False) + +def source_records_match(a, thing): + marc = 'marc:' + amazon = 'amazon:' + ia = 'ia:' + match = False + for src in thing['source_records']: + if not src.startswith('marc:marc_ithaca_college/ic'): + m = re_meta_marc.search(src) + if m: + src = 'ia:' + m.group(1) + if src.startswith(marc): + if marc_match(a, src[len(marc):]): + match = True + break + elif src.startswith(ia): + if src == 'ia:ic': + print(thing['source_records']) + if ia_match(a, src[len(ia):]): + match = True + break + else: + assert src.startswith(amazon) + continue + return match + + +def try_merge(edition, ekey, thing): + thing_type = thing['type']['key'] + if 'isbn_10' not in edition: + print(edition) + asin = edition.get('isbn_10', None) or edition['asin'] + if 'authors' in edition: + authors = [i['name'] for i in edition['authors']] + else: + authors = [] + a = amazon_merge.build_amazon(edition, authors) + assert isinstance(asin, six.string_types) + assert thing_type == '/type/edition' + #print 
edition['asin'], ekey + if 'source_records' in thing: + if 'amazon:' + asin in thing['source_records']: + return True + return source_records_match(a, thing) + + #print 'no source records' + mc = get_mc(ekey) + #print 'mc:', mc + if mc == 'amazon:' + asin: + return True + if not mc: + return False + data = get_from_local(mc) + e1 = build_marc(fast_parse.read_edition(data)) + return amazon_merge.attempt_merge(a, e1, threshold, debug=False) + +def import_file(filename): + for asin, edition in read_amazon_file(open(filename)): + index_fields = build_index_fields(asin, edition) + found = pool.build(index_fields) + if 'title' not in found: + print(found) + print(asin) + print(edition) + print(index_fields) + print() + + if not found['title'] and not found['isbn']: + #print 'no pool load book:', asin + # TODO load book + continue + #print asin, found + #print(repr(edition['title'], edition.get('subtitle', None), edition.get('flags', None), edition.get('binding', None))) + if 'sims' in edition: + del edition['sims'] + #print edition + #print + + seen = set() + for k, v in found.iteritems(): + for ekey in v: + if ekey in seen: + continue + keys, thing = follow_redirects(ekey) + seen.update(keys) + assert thing + try: + m = try_merge(edition, ekey, thing) + except: + print(asin) + print(edition) + print(ekey) + print(found) + raise + +# import_file(sys.argv[1]) + +d = sys.argv[1] +for f in os.listdir(d): + if not f.startswith('amazon.'): + continue + print(f) + if '2009-02' in f: + continue + import_file(d + "/" + f) diff --git a/ia-legacy-importer/amazon/list_done.py b/ia-legacy-importer/amazon/list_done.py new file mode 100644 index 00000000..ef1541bc --- /dev/null +++ b/ia-legacy-importer/amazon/list_done.py @@ -0,0 +1,74 @@ +from __future__ import print_function +from lxml.html import fromstring, tostring +from openlibrary.catalog.utils.arc import read_arc, read_body +import re +import os +import sys + +arc_dir = '/2/edward/amazon/arc' +total = 0 +srtitle = 0 +producttitle = 0 + +re_book_url = re.compile('^http://www.amazon.com/[^/]+/dp/([0-9A-Z]{10})/') +re_result_count = re.compile('^Showing ([,0-9]+) - ([,0-9]+) of ([,0-9]+) Results$') + +bad_serp = 0 + +out = open('/2/edward/amazon/crawled2', 'w') + +for filename in (i for i in os.listdir(arc_dir) if i.endswith('.arc')): + print(filename, total, srtitle, producttitle) + for url, wire in read_arc(arc_dir +'/' + filename): + if url.startswith('file'): + continue + if not url.startswith('http://www.amazon.com/s?'): + continue + body = read_body(wire) + doc = fromstring(body) + found = [] + try: + doc.get_element_by_id('noResultsTitle') +# print 'no results:', url + continue + except KeyError: + pass + rc = doc.find_class('resultCount') + if rc: + m = re_result_count.match(rc[0].text) + if m: + (a, b, c) = map(lambda i: int(i.replace(',','')), m.groups()) + if a == c + 1 and b == c: +# print 'result count:', rc[0].text +# print 'empty page' + continue + for e in doc.find_class('fastTrackList'): + if e.text == 'This item is currently not available.': + print(e.text) + + for pt in doc.find_class('productTitle'): + assert pt.tag == 'div' + assert pt[0].tag == 'a' + href = pt[0].attrib['href'] + m = re_book_url.match(href) + found.append(m.group(1)) + total += 1 + producttitle += 1 + + for e in doc.find_class('srTitle'): + td = e.getparent().getparent() + assert td.tag == 'td' + assert td[0].tag == 'a' + href = td[0].attrib['href'] + m = re_book_url.match(href) + found.append(m.group(1)) + total += 1 + srtitle += 1 + + if len(found) == 0: + 
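+ # nothing recognisable on this results page; keep a copy so the parser can be checked by hand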
print(url) + bad_serp += 1 + open('bad_serp%d.html' % bad_serp, 'w').write(body) + for asin in found: + print(asin, file=out) +out.close() diff --git a/ia-legacy-importer/amazon/load_merge.py b/ia-legacy-importer/amazon/load_merge.py new file mode 100644 index 00000000..82b20382 --- /dev/null +++ b/ia-legacy-importer/amazon/load_merge.py @@ -0,0 +1,135 @@ +from __future__ import print_function +from time import time +from catalog.marc.MARC21 import MARC21Record +from catalog.marc.parse import pick_first_date + +from six.moves import urllib + + +entity_fields = ('name', 'birth_date', 'death_date', 'date') + +def find_entity(site, entity): + entity = dict((k, entity[k]) for k in entity_fields if k in entity) + print(entity) + things = site.things(entity) + if not things: + print("person not found") + return + + print("found", len(things), "match") + for key in things: + db_entity = site.withKey(key, lazy=False)._get_data() + for field in entity_fields: + if field in entity: + assert field in db_entity + else: + assert field not in db_entity + +def get_from_archive(locator): + (file, offset, length) = locator.split (":") + offset = int (offset) + length = int (length) + + r0, r1 = offset, offset+length-1 + url = 'http://www.archive.org/download/%s'% file + + assert 0 < length < 100000 + + ureq = urllib.request.Request(url, None, {'Range':'bytes=%d-%d'% (r0, r1)},) + result = urllib.request.urlopen(ureq).read(100000) + rec = MARC21Record(result) + return rec + +def contrib(r): + contribs = [] + for f in r.get_fields('700'): + print(f.subfield_sequence) + contrib = {} + if 'a' not in f.contents and 'c' not in f.contents: + continue # should at least be a name or title + name = " ".join([j.strip(' /,;:') for i, j in f.subfield_sequence if i in 'abc']) + if 'd' in f.contents: + contrib = pick_first_date(f.contents['d']) + contrib['db_name'] = ' '.join([name] + f.contents['d']) + else: + contrib['db_name'] = name + contrib['name'] = name + contrib['entity_type'] = 'person' + subfields = [ + ('a', 'personal_name'), + ('b', 'numeration'), + ('c', 'title') + ] + for subfield, field_name in subfields: + if subfield in f.contents: + contrib[field_name] = ' '.join([x.strip(' /,;:') for x in f.contents[subfield]]) + if 'q' in f.contents: + contrib['fuller_name'] = ' '.join(f.contents['q']) + contribs.append(contrib) + + for f in r.get_fields('710'): + print(f.subfield_sequence) + contrib = { + 'entity_type': 'org', + 'name': " ".join([j.strip(' /,;:') for i, j in f.subfield_sequence if i in 'ab']) + } + contrib['db_name'] = contrib['name'] + contribs.append(contrib) + + for f in r.get_fields('711'): + print(f.subfield_sequence) + contrib = { + 'entity_type': 'event', + 'name': " ".join([j.strip(' /,;:') for i, j in f.subfield_sequence if i in 'acdn']) + } + contrib['db_name'] = contrib['name'] + contribs.append(contrib) + return contribs + +def load(site, filename): + for line in open(filename): + isbn, lc_src, amazon = eval(line) + versions = site.versions({'machine_comment': lc_src}) + assert len(versions) == 1 + thing = site.withID(versions[0]['thing_id']) + + if 'authors' not in amazon: + continue + author_count = 0 + for name, role in amazon['authors']: + if role != 'Author': + continue + author_count+=1 + if author_count > 1: + break + if author_count < 2: + continue + + print(lc_src) + print('amazon:', amazon['authors']) + + + try: + print('LC authors:', [x.name for x in thing.authors]) + except AttributeError: + print('no authors in LC') + lc_contrib = [] + try: + lc_contrib = thing.contributions 
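+ # when LC lists contributions, fetch the MARC record and try to match each 700/710/711 contributor to an existing entity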
+ print('LC contributions:', lc_contrib) + except AttributeError: + print('no contributions in LC') + if lc_contrib: + r = get_from_archive(lc_src) + contrib_detail = contrib(r) + assert len(lc_contrib) == len(contrib_detail) + for c, detail in zip(lc_contrib, contrib_detail): + print(c, end=' ') + find_entity(site, detail) + print() + continue + # for x in web.query("select thing_id from version where machine_comment=" + web.sqlquote(lc)): + # t = site.withID(x.thing_id) + # print t.title + + diff --git a/ia-legacy-importer/amazon/other_editions.py b/ia-legacy-importer/amazon/other_editions.py new file mode 100644 index 00000000..dc367517 --- /dev/null +++ b/ia-legacy-importer/amazon/other_editions.py @@ -0,0 +1,66 @@ +import re +import os.path +from bs4 import BeautifulSoup + +from six.moves import urllib + + +# http://amazon.com/other-editions/dp/0312153325 has: +# http://www.amazon.com/gp/product/0312247869 +re_link = re.compile('^http://www\.amazon\.com/(?:(.*)/dp|gp/product)/(\d{9}[\dX]|B[A-Z0-9]+)$') + +desc_skip = set(['(Bargain Price)', '(Kindle Book)']) + +def read_bucket_table(f): + html = '' + bucket = False + table = False + for line in f: + if line[:-1] == '
    ': + bucket = True + continue + if bucket and line[:-1] == ' ': + table = True + if table: + html += line + if line[:-1] == '
    ': + break + return html + +def parse_html(html): + soup = BeautifulSoup(html, "lxml") + for tr in soup('tr')[2:]: + td = tr('td') + assert len(td) == 3 + td0 = td[0] + assert td0['class'] == 'small' + assert len(td0) == 3 + (nl, link, desc) = td0 + assert nl == '\n' + href = link['href'] + if href.startswith("http://www.amazon.com:80/gp/redirect.html"): + # audio book, skip for now + continue + m = re_link.match(link['href']) + yield str(m.group(2)), desc.strip() + +def get_from_amazon(isbn): + url = 'http://www.amazon.com/dp/other-editions/' + isbn + try: + return urllib.request.urlopen(url).read() + except urllib.error.HTTPError as error: + if error.code != 404: + raise + return '' + +def find_others(isbn, dir): + filename = dir + "/" + isbn + if len(isbn) != 10: + return [] + if not os.path.exists(filename): + open(filename, 'w').write(get_from_amazon(isbn)) + html = read_bucket_table(open(dir + "/" + isbn)) + if not html: + return [] + l = [i for i in parse_html(html) if not i[0].startswith('B') and i[1] not in desc_skip] + return l diff --git a/ia-legacy-importer/amazon/parse.py b/ia-legacy-importer/amazon/parse.py new file mode 100644 index 00000000..f4d940b3 --- /dev/null +++ b/ia-legacy-importer/amazon/parse.py @@ -0,0 +1,626 @@ +from __future__ import print_function +from lxml.html import parse, tostring +import re +import os +import sys +import web +from warnings import warn +from math import floor +from pprint import pprint +import htmlentitydefs + +import six + + +class BrokenTitle(Exception): + pass + +class IncompletePage(Exception): + pass + +class MissingAuthor(Exception): + pass + +role_re = re.compile("^ \(([^)]+)\)") + +#: sample: ' [Paperback, Large Print]' + +re_title = re.compile(""" + (?:\ \[([A-Za-z, ]+)\])? # flags + (?:\(\ ([^()]+|[^()]*\(.*\)[^()]*)\))? + """, re.MULTILINE | re.X) + +re_split_title = re.compile(r'''^ + (.+?(?:\ \(.+\))?) + (?::\ (\ *[^:]+))?$ +''', re.X) + +re_missing_author = re.compile('\n\n(~ )?\(([A-Za-z, ]+)\), ') + +re_list_price = re.compile('^\$([\d,]+)\.(\d\d)$') +re_amazon_price = re.compile('^\$([\d,]+)\.(\d\d)$') +# '$0.04\n \n ' +re_you_save = re.compile('^\$([\d,]+)\.(\d\d)\s*\((\d+)%\)\s*$') + +re_pages = re.compile('^\s*(\d+)(?:\.0)? 
pages\s*$') +re_sales_rank = re.compile('^ #([0-9,]+) in Books') +re_html_in_title = re.compile('', re.I) + +def unescape(text): + def fixup(m): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return six.unichr(int(text[3:-1], 16)) + else: + return six.unichr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = six.unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + return text # leave as is + return re.sub("&#?\w+;", fixup, text) + +def to_dict(k, v): + return {k: v} if v else None + +def read_authors(by_span): + authors = [] + if re_missing_author.match(by_span.text): + raise MissingAuthor + try: + assert by_span.text in ('\n\n', '\n\n~ ') + except: + print(repr(by_span.text)) + raise + expect_end = False + for e in by_span: + if expect_end: + assert e.tag in ('br', 'span') + break + assert e.tag == 'a' + if e.tail.endswith('\n\n'): + expect_end = True + else: + assert e.tail.endswith(', ') + m = role_re.match(e.tail) + if m: + authors.append({ 'name': e.text, 'role': m.group(1), 'href': e.attrib['href'] }) + else: + authors.append({ 'name': e.text, 'href': e.attrib['href'] }) + return authors + +def get_title_and_authors(doc, title_from_html): + try: + prodImage = doc.get_element_by_id('prodImage') + except KeyError: + raise IncompletePage + full_title = unescape(prodImage.attrib['alt']) # double quoted + full_title = re_html_in_title.sub('', full_title).replace(''', "'") + + m = re_split_title.match(full_title) + (title, subtitle) = m.groups() + # maybe need to descape title + title_id = doc.get_element_by_id('btAsinTitle') + assert title_id.tag == 'span' + assert title_id.getparent().tag == 'h1' + assert title_id.getparent().attrib['class'] == 'parseasinTitle' + buying_div = title_id.getparent().getparent() + assert buying_div.tag == 'div' + assert buying_div.attrib['class'] == 'buying' + by_span = buying_div[1] + assert by_span.tag == 'span' + + book = { + 'full_title': full_title, + 'title': title, + 'has_cover_img': "no-image-avail" not in prodImage.attrib['src'] + } + + authors = [] + if len(by_span) and by_span[0].tag == 'a': + #print len(by_span), [e.tag for e in by_span] + book['authors'] = read_authors(by_span) + title_text = title_id.text_content() + if not title_text.startswith(full_title): + print(('alt:', repr(prodImage.attrib['alt']))) + print(('title mistmach:', repr(full_title), '!=', repr(title_text))) + title_text = title_from_html.decode('latin-1') + print(('title_text:', repr(title_text))) + print(('full_title:', repr(full_title))) + if not title_text.startswith(full_title): + print(('alt:', repr(prodImage.attrib['alt']))) + print(('title mistmach:', repr(full_title), '!=', repr(title_text))) + raise BrokenTitle + if full_title != title_text: + btAsinTitle = title_text[len(full_title):] + m = re_title.match(btAsinTitle) + if not m: + print(('title:', repr(btAsinTitle))) + (flag, binding) = m.groups() + if binding is not None: + book['binding'] = binding + if flag: + book['flag'] = flag + if subtitle: + book['subtitle'] = subtitle + + return book + +def dollars_and_cents(dollars, cents): + # input: dollars and cents as strings + # output: value in cents as an int + return int(dollars.replace(',', '')) * 100 + int(cents) + +def read_price_block(doc): + price_block = doc.get_element_by_id('priceBlock', None) + book = {} + if price_block is None: + return + assert price_block.tag == 'div' and price_block.attrib['class'] == 'buying' + table = price_block[0] + 
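+ # each row of the price table is a (heading, value) pair, e.g. 'List Price:' and '$12.99'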
assert table.tag == 'table' and table.attrib['class'] == 'product' + for tr in table: + assert tr.tag == 'tr' and len(tr) == 2 + assert all(td.tag == 'td' for td in tr) + heading = tr[0].text + value = tr[1].text_content() + + if heading == 'List Price:': + m = re_list_price.match(value) + list_price = dollars_and_cents(m.group(1), m.group(2)) + book["list_price"] = list_price + elif heading == "Price:": + b = tr[1][0] + assert b.tag == 'b' and b.attrib['class'] == 'priceLarge' + m = re_amazon_price.match(b.text) + amazon_price = dollars_and_cents(m.group(1), m.group(2)) + book["amazon_price"] = amazon_price + elif heading == 'You Save:': + continue # don't need to check + # fails for 057124954X: '$0.04\n \n ' + m = re_you_save.match(value) + you_save = dollars_and_cents(m.group(1), m.group(2)) + assert list_price - amazon_price == you_save + assert floor(float(you_save * 100) / list_price + 0.5) == int(m.group(3)) + elif heading == 'Value Priced at:': + continue # skip + m = re_amazon_price.match(value) + book["value_priced_at"] = dollars_and_cents(m.group(1), m.group(2)) + elif heading == 'Import List Price:': + pass + + return book + +def find_avail_span(doc): + for div in doc.find_class('buying'): + if div.tag != 'div' or not len(div): + continue + if div[0].tag == 'span': + span = div[0] + elif div[0].tag == 'br' and div[1].tag == 'b' and div[2].tag == 'span': + span = div[2] + else: + continue + if span.attrib['class'].startswith('avail'): + return span + +def read_avail(doc): + traffic_signals = set(['Red', 'Orange', 'Green']) + span = find_avail_span(doc) + color = span.attrib['class'][5:] + assert color in traffic_signals + gift_wrap = span.getnext().getnext().tail + book = { + 'avail_color': color, + 'amazon_availability': span.text, + 'gift_wrap': bool(gift_wrap) and 'Gift-wrap available' in gift_wrap + } + return book + +def read_other_editions(doc): + oe = doc.get_element_by_id('oeTable', None) + if oe is None: + return + assert oe.tag == 'table' and oe.attrib['class'] == 'otherEditions' + assert len(oe) == 2 and len(oe[0]) == 2 and len(oe[1]) == 2 + assert oe[0][0][0].tag == 'a' + oe = oe[0][0][1] + assert oe.tag == 'table' + other_editions = [] + for tr in oe[1:]: + assert tr.tag == 'tr' + if 'bgcolor' in tr.attrib: + assert tr.attrib['bgcolor'] == '#ffffff' + else: + assert tr[0].attrib['id'] == 'oeShowMore' + break + assert tr[0].attrib['class'] == 'tiny' + a = tr[0][0] + assert a.tag == 'a' + row = [a.attrib['href'][-10:], a.text, a.tail.strip()] + other_editions.append(row) + return {'other_editions': other_editions } + +def read_sims(doc): + sims = doc.find_class('sims-faceouts') + if len(sims) == 0: + return + assert len(sims) == 1 + sims = sims[0] + assert sims.tag == 'table' + found = [] + if sims[0].tag == 'tbody': + tr = sims[0][0] + else: + assert sims[0].tag == 'tr' + tr = sims[0] + for td in tr: + assert td.tag == 'td' + a = td[1][0] + assert a.tag == 'a' + found.append({'asin': a.attrib['href'][-10:], 'title': a.text}) + return to_dict('sims', found) + +def find_product_details_ul(doc): + a = doc.get_element_by_id('productDetails', None) + if a is None: + return + try: + assert a.tag == 'a' and a.attrib['name'] == 'productDetails' + except: + print(tostring(a)) + raise + hr = a.getnext() + assert hr.tag == 'hr' and hr.attrib['class'] == 'bucketDivider' + table = hr.getnext() + td = table[0][0] + assert td.tag == 'td' and td.attrib['class'] == 'bucket' + h2 = td[0] + assert h2.tag == 'h2' and h2.text == 'Product Details' + div = td[1] + assert div.tag == 'div' 
and div.attrib['class'] == 'content' + ul = div[0] + if div[0].tag == 'table': + ul = div[1] + assert ul.tag == 'ul' + assert ul[-1].tag == 'div' and ul[-2].tag == 'p' + return ul + +def read_li(li): + assert li.tag == 'li' + b = li[0] + assert b.tag == 'b' + return b + +re_series = re.compile('^
  • (?:This is item (\d+) in|This item is part of) 7: + print(len(content)) + for num, i in enumerate(content): + print(num, i.tag, i.attrib) + a = content[8] + assert a.tag == 'a' + b = content[9] + assert a.attrib['name'] == 'cited' + found['cited'] = b.text + for k, v in found.items(): + m = re_cite[k].match(v) + found[k] = int(m.group(1)) + return found + +def find_inside_this_book(doc): + for b in doc.find_class('h1'): + if b.text == 'Inside This Book': + assert b.tag == 'b' + return b.getparent() + return None + +def read_first_sentence(inside): + if len(inside) == 4: + assert inside[2].tag == 'span' + assert inside[2].attrib['class'] == 'tiny' + assert inside[2][0].tail.strip() == 'Browse and search another edition of this book.' + div = inside[3] + else: + assert len(inside) == 3 + div = inside[2] + assert div.tag == 'div' and div.attrib['class'] == 'content' + if div[0].tag in ('a', 'b'): + assert div[0].text != 'First Sentence:' + return + assert div[0].tag == 'strong' + assert div[0].text == 'First Sentence:' + assert div[1].tag == 'br' + return div[1].tail.strip(u"\n \xa0") + +def find_bucket(doc, text): + for div in doc.find_class('bucket'): + h2 = div[0] + if h2.tag == 'h2' and h2.text == text: + return div + return None + +# New & Used Textbooks + +def read_subject(doc): + div = find_bucket(doc, 'Look for Similar Items by Subject') + if div is None: + return + assert div.tag == 'div' + form = div[1][0] + assert form.tag == 'form' + input = form[0] + assert input.tag == 'input' and input.attrib['type'] == 'hidden' \ + and input.attrib['name'] == 'index' \ + and input.attrib['value'] == 'books' + found = [] + for input in form[3:-4:3]: + a = input.getnext() + assert a.tag == 'a' + found_text = a.text if len(a) == 0 else a[0].text + assert found_text is not None + found.append(found_text) + return to_dict('subjects', found) + +def read_category(doc): + div = find_bucket(doc, 'Look for Similar Items by Category') + if div is None: + return + assert div.tag == 'div' + ul = div[1][0] + assert ul.tag == 'ul' + found = [] + for li in ul: + assert all(a.tail == ' > ' for a in li[:-1]) + cat = [a.text for a in li] + if cat[-1] == 'All Titles': + cat.pop() + found.append(tuple(cat)) +# if 'Series' in cat: +# edition["series2"] = cat + # maybe strip 'Books' from start of category + found = [i[1:] if i[0] == 'Books' else i for i in found] + return to_dict('category', found) + +def read_tags(doc): + table = doc.find_class('tag-cols') + if len(table) == 0: + return + assert len(table) == 1 + table = table[0] + assert len(table) == 1 + tr = table[0] + +def read_edition(doc, title_from_html=None): + edition = {} + book = get_title_and_authors(doc, title_from_html) + edition.update(book) + + ret = read_price_block(doc) + if ret: + edition.update(ret) + inside = find_inside_this_book(doc) + if inside is not None: + sentence = read_first_sentence(inside) + if sentence: + edition['first_sentence'] = sentence + func = [ + #read_citing, + read_plog, + read_series, + #read_avail, + read_product_details, + read_other_editions, + #read_sims, # not needed now + read_subject, + read_category, + ] + for f in func: + ret = f(doc) + if ret: + edition.update(ret) + parse_publisher(edition) + if 'isbn_10' not in edition and 'asin' not in edition: + return None + return edition + +# ['subtitle', 'binding', 'shipping_weight', 'category', 'first_sentence', 'title', 'full_title', 'authors', 'dimensions', 'publisher', 'language', 'number_of_pages', 'isbn_13', 'isbn_10', 'publish_date'] +def 
edition_to_ol(edition): + ol = {} + fields = ['title', 'subtitle', 'publish_date', 'number_of_pages', 'first_sentence'] + for f in fields: + if f in edition: + ol[f] = edition[f] + if 'isbn_10' in edition: + ol['isbn_10'] = [edition['isbn_10']] + if 'isbn_13' in edition: + ol['isbn_13'] = [edition['isbn_13'].replace('-','')] + if 'category' in edition: + ol['subjects'] = edition['category'] + if 'binding' in edition: + ol['physical_format'] = edition['binding'] + if 'dimensions' in edition: + ol['physical_dimensions'] = edition['dimensions'] + if 'shipping_weight' in edition: + ol['weight'] = edition['shipping_weight'] + if 'authors' in edition: + ol['authors'] = [a for a in edition['authors'] if a['name'] != 'n/a'] + if 'publisher' in edition: + ol['publishers'] = [edition['publisher']] + else: + print('publisher missing') + + for k, v in ol.iteritems(): + if isinstance(v, six.string_types) and v[-1] == '(': + pprint(edition) + print(('ends with "(":', repr(k, v))) + sys.exit(0) + + return ol + +if __name__ == '__main__': + #for dir in ('/2008/sample/', 'pages/'): + page_dir = sys.argv[1] + for filename in os.listdir(page_dir): + #if '1435438671' not in filename: + # continue + if filename.endswith('.swp'): + continue + edition = {} + doc = parse(page_dir + '/' + filename).getroot() + assert doc is not None + edition = read_edition(doc) + ol = edition_to_ol(edition) + pprint (ol) diff --git a/ia-legacy-importer/amazon/read_serp.py b/ia-legacy-importer/amazon/read_serp.py new file mode 100644 index 00000000..a5b6ec38 --- /dev/null +++ b/ia-legacy-importer/amazon/read_serp.py @@ -0,0 +1,79 @@ +from __future__ import print_function +from lxml.html import fromstring +from openlibrary.catalog.utils.arc import read_arc, read_body +import os +import re + +arc_dir = '/2/edward/amazon/arc' + +re_book_url = re.compile('^http://www.amazon.com/[^/]+/dp/([0-9A-Z]{10})/') +re_result_count = re.compile('^Showing ([,0-9]+) - ([,0-9]+) of ([,0-9]+) Results$') +re_title = re.compile('Amazon.com: (.*?)(:?, Page \d+)?') +crawled = set(i[:-1] for i in open('/2/edward/amazon/crawled')) + +# /2/edward/amazon/arc/20100311*.arc + +def find_pt(doc): + found = [] + for pt in doc.find_class('productTitle'): + assert pt.tag == 'div' + assert pt[0].tag == 'a' + href = pt[0].attrib['href'] + m = re_book_url.match(href) + print(m.group(1)) + found.append(m.group(1)) + return found + +def find_srtitle(doc): + found = [] + for e in doc.find_class('srTitle'): + td = e.getparent().getparent() + assert td.tag == 'td' + assert td[0].tag == 'a' + href = td[0].attrib['href'] + m = re_book_url.match(href) + found.append(m.group(1)) + return found + +found_books = set() + +prev = '' +for filename in (i for i in os.listdir(arc_dir) if i.endswith('.arc')): + if not filename.startswith('20100412'): + continue + for url, wire in read_arc(arc_dir +'/' + filename): + #print filename, url + if url.startswith('file'): + continue + if not url.startswith('http://www.amazon.com/s?'): + continue + body = read_body(wire) + m = re_title.search(body) + if m.group(1) != prev: + print(m.group(1)) + prev = m.group(1) + continue + doc = fromstring(body) + try: + doc.get_element_by_id('noResultsTitle') + continue + except KeyError: + pass + rc = doc.find_class('resultCount') + if rc: + m = re_result_count.match(rc[0].text) + if m: + (a, b, c) = map(lambda i: int(i.replace(',','')), m.groups()) + if a == c + 1 and b == c: + continue + for e in doc.find_class('fastTrackList'): + if e.text == 'This item is currently not available.': + print(e.text) + + 
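+ # only srTitle-style results are expected on these pages; any productTitle entries would indicate an unhandled layout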
assert len(find_pt(doc)) == 0 + serp_found = find_srtitle(doc) + for asin in serp_found: + if asin in crawled: + continue + found_books.update(serp_found) + print(len(serp_found), len(found_books), filename, url) diff --git a/ia-legacy-importer/amazon/upload.py b/ia-legacy-importer/amazon/upload.py new file mode 100644 index 00000000..a15dc4b9 --- /dev/null +++ b/ia-legacy-importer/amazon/upload.py @@ -0,0 +1,89 @@ +from __future__ import print_function +from catalog.read_rc import read_rc +import httplib +import web +import time +import sys +from datetime import date, timedelta + +rc = read_rc() +accesskey = rc['s3_accesskey'] +secret = rc['s3_secret'] + +db = web.database(dbn='mysql', host=rc['ia_db_host'], user=rc['ia_db_user'], \ + passwd=rc['ia_db_pass'], db='archive') +db.printing = False + +crawl_dir = '/1/edward/amazon/crawl' +collection = 'ol_data' +mediatype = 'data' + +con = httplib.HTTPConnection('s3.us.archive.org') +con.connect() + +def wait_for_upload(ia): + while True: + rows = list(db.select('catalog', where='identifier = $ia', vars={'ia': ia})) + if len(rows) == 0: + return + print("\r", len(rows), 'tasks still running', end=' ') + time.sleep(5) + print('\ndone') + +no_bucket_error = 'NoSuchBucket' +internal_error = 'InternalError' + +def put_file(con, ia, filename, headers): + print('uploading %s' % filename) + headers['authorization'] = "LOW " + accesskey + ':' + secret + url = 'http://s3.us.archive.org/' + ia + '/' + filename + print(url) + data = open(crawl_dir + '/' + filename).read() + for attempt in range(5): + con.request('PUT', url, data, headers) + res = con.getresponse() + body = res.read() + if '' not in body: + return + print('error') + print(body) + if no_bucket_error not in body and internal_error not in body: + sys.exit(0) + print('retry') + time.sleep(5) + print('too many failed attempts') + +def create_item(con, ia, cur_date): + headers = { + 'x-amz-auto-make-bucket': 1, + 'x-archive-meta01-collection': collection, + 'x-archive-meta-mediatype': mediatype, + 'x-archive-meta-language': 'eng', + 'x-archive-meta-title': 'Amazon crawl ' + cur_date, + 'x-archive-meta-description': 'Crawl of Amazon. Books published on ' + cur_date + '.', + 'x-archive-meta-year': cur_date[:4], + 'x-archive-meta-date': cur_date.replace('-', ''), + } + + filename = 'index.' + cur_date + put_file(con, ia, filename, headers) + +def upload_index(con, cur_date): + ia = 'amazon_crawl.' + cur_date + + create_item(con, ia, cur_date) + wait_for_upload(ia) + time.sleep(5) + + put_file(con, ia, 'amazon.' + cur_date, {}) + put_file(con, ia, 'cats.' + cur_date, {}) + put_file(con, ia, 'list.' 
+ cur_date, {}) + +one_day = timedelta(days=1) +cur = date(2009, 4, 26) # start from +while True: + print(cur) + upload_index(con, str(cur)) + cur -= one_day + +con.close() diff --git a/ia-legacy-importer/amazon/upload_arc.py b/ia-legacy-importer/amazon/upload_arc.py new file mode 100644 index 00000000..45ae5663 --- /dev/null +++ b/ia-legacy-importer/amazon/upload_arc.py @@ -0,0 +1,105 @@ +from __future__ import print_function +from openlibrary.catalog.read_rc import read_rc +import httplib +import web +import time +import sys +import os + +rc = read_rc() +accesskey = rc['s3_accesskey'] +secret = rc['s3_secret'] +#arc_dir = '/2/edward/amazon/arc' +arc_dir = '/0/amazon' + +no_bucket_error = 'NoSuchBucket' +internal_error = 'InternalError' + +done = [ + '20100210013733.arc', + '20100210015013.arc', + '20100210020316.arc', + '20100210021445.arc', + '20100210022726.arc', + '20100210024019.arc', + '20100210025249.arc', + '20100210030609.arc', + '20100210031752.arc', + '20100210033024.arc', + '20100210034255.arc', + '20100210035501.arc', + '20100210040904.arc', + '20100210042130.arc', + '20100210043351.arc', + '20100210044553.arc', + '20100210051017.arc', + '20100210052258.arc', + '20100210053601.arc', + '20100210194700.arc', + '20100210201110.arc', + '20100212000643.arc', + '20100212001705.arc', + '20100212002656.arc', + '20100212004512.arc', + '20100212010934.arc', + '20100212013415.arc', + '20100212015925.arc', + '20100212022248.arc', + '20100212024600.arc', + '20100212030916.arc', + '20100212033221.arc', + '20100212035616.arc', + '20100212042043.arc', + '20100212044622.arc', + '20100212051112.arc', + '20100212053604.arc', + '20100212060140.arc', + '20100212062647.arc', + '20100212065128.arc', + '20100212165731.arc', + '20100212184748.arc', + '20100212184807.arc', + '20100212184822.arc', + '20100212190147.arc', + '20100212192404.arc', + '20100212194513.arc', + '20100212200700.arc', + '20100212202810.arc', + '20100212204852.arc', + '20100212210951.arc', + '20100212213032.arc', + '20100212215107.arc' +] + +def put_file(con, ia, filename, headers): + print('uploading %s' % filename) + headers['authorization'] = "LOW " + accesskey + ':' + secret + url = 'http://s3.us.archive.org/' + ia + '/' + filename + print(url) + data = open(arc_dir + '/' + filename).read() + for attempt in range(5): + con.request('PUT', url, data, headers) + res = con.getresponse() + body = res.read() + if '' not in body: + return + print('error') + print(body) + if no_bucket_error not in body and internal_error not in body: + sys.exit(0) + print('retry') + time.sleep(5) + print('too many failed attempts') + +ia = 'amazon_book_crawl' +for filename in os.listdir(arc_dir): + if filename in done: + continue + if not filename.endswith('.arc'): + continue + print(filename) + con = httplib.HTTPConnection('s3.us.archive.org') + con.connect() + put_file(con, ia, filename, {}) + con.close() + diff --git a/ia-legacy-importer/author/__init__.py b/ia-legacy-importer/author/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ia-legacy-importer/author/east b/ia-legacy-importer/author/east new file mode 100644 index 00000000..7f2d0084 --- /dev/null +++ b/ia-legacy-importer/author/east @@ -0,0 +1,2156 @@ +Ai, Wei +An, Bin +An, Na +An, Ni +An, Zhimin +Bai, Juyi +Bai, Lin +Bai, Nanfeng +Bai, Tao +Bai, Tian +Bai, Yongquan +Bai, Ziran +Ban, Chao +Bao-Liang, Lu +Bao, Lin +Bao, Zheng +Ben, Lee +BIAO, XIANG +Bi, Chao +Bing, Wang +Bockja, Kim +Boqiao, Tang +Bo, Yang +Bo, Yibo +Bruno, Lee +Bu, Di +Cai, Cheng +Cai, Chusheng +Cai, 
Guo-Qiang +Cai, Jingfeng +Cai, Longyun +Cai, Qian +Cai, Ruixian +Cai, Wenji +Cai, Xiansheng +Cai, Yong +Cai, Yuanpei +Cai, Yun +Cang, Xin +Cao, Pei +Cao, Pi +Cao, Van Vien +Cao, Yang +Cao, Ying +Cao, Yuzhang +Cao, Zhi +Cao, Zuorui +Chan, Chee Onn +Chan, Chee-yan +Chan, Chen Hei +Chang, Chao +Chang, Chen +Chang, Ch?eng-mei +Chang, Ch'eng-mei +Chang, Chung-yuan +Chang, Fa-shun +Chang, Hao +Chang, Hsin-chang +Chang, Hsin-hai +Chang, Hui-chien +Chang, Hung Ta +Chang, Hwan Kim +Chang, Liu +Chang, Ping-lin +Chang, Shu +Changsu, Kim +Chang, Ti-sheng +Chan, Heng Chee +Chan, Heng Leong +Chan, Kai Lok +Chan, Kok Sing +Chan, Tak Cheung +Chan, Wai Kwan +Chaochen, Zhou +Chao, Hung-pen +Chao, Hung-pe?n +Chao, Jiping +Chao, Lin +Chao, Shu-li +Chao, T'ing-chi +Chao, Tzee Cheng +Chao, Yi +Chao, Yu +Chawnshang, Chang +Cheah, Hock Beng +Cheah, Jin Seng +Cheah, Yin Mee +Chen, Ai Ju +Chen, Baochen +Chen, Bingfu +Chen, Boda +Chen, Changming +Chen, Cheng +Chen, Chong Swee +Chen, Chuan Chong +Chen, Ci Liang +Chen, Dao +Chen, Deren +Chen, Dong +Chen, Duxiu +Chen, Erjin +Chen, Fu +Cheng, Chien +Cheng, Ching-wen +Cheng, Dan-an +Chen, Ge +Chen, Geng +Chen, Gengtao +Cheng, Fang +Cheng, Fangwu +Cheng, Gong +Cheng, Guan +Cheng, He +Cheng, Hsieh +Cheng, Hui +Cheng, Kam Fong +Cheng, Ku +Chengliang, Zhu +Cheng, Lu +Cheng, Manchao +Cheng, Meizhen +Cheng, Ming Yu +Chen, Gongbo +Cheng, Peng +Cheng, Qing +Cheng, Qinghua +Cheng, Ren +Cheng, Shifa +Chen, Gu +Chen, Guansheng +Chen, Guo +Cheng, Weidong +Cheng, Yanqiu +Cheng, Zhang +Cheng, Zhenqiu +Chen, Haiyan +Chen, He +Chen, Hongmou +Chen, Hsi-Ju +Chen, Huiguan +Chen, Jai-Sheng +Chen, Ji +Chen, Jia'er +Chen, Jieru +Chen, Jihai +Chen, Jinding +Chen, Jingpan +Chen, Jirui +Chen, Kaige +Chen, Lian +Chen, Liang Yu +Chen, Liang-Yu +Chen, Lifang +Chen, Lu +Chen, May Yee +Chen, Meng +Chen, Meng-chia +Chen, Mingyuan +Chen, Pixian +Chen, Qingchao +Chen, Ruoxi +Chen, Tong +Chen, Tze-tuan +Chen, Voon Fee +Chen, Weiye +Chen, Xi +Chen, Xihe +Chen, Xuezhao +Chen, Yanqing +Chen, Yifei +Chen, Yinke +Chen, Yongfu +Chen, Yongguo +Chen, Yonglin +Chen, Yun +Chen, Yun-Chung +Chen, Yunlin +Chen, Zelin +Chen, Zhaoxia +Chen, Zhi +Chen, Zhi'an +Chen, Zhiyuan +Chen, Zhong +Chen, Zhu +Chen, Zhucai +Chen, Zongji +Chen, Zude +Chia, Chen +Chiang, Chin +Chiang, Ching-kuo +Chiang, Huan-ching +Chiang, Kai-shek +Chiang, Kai-Shek +Chiang, K'ang-hu +Chiang, Kang-hu +Chiang, Kuei +Chiang, Lan-hung Nora +Chiang, Su-hui +Ch'iao, Chien +Chiao, Chien +Chieh, Liu +Chih-Tung, Chang +Chi, Li +Chin, Kin Wah +Chin, Kok Fay +Chin, Peng +Choan-Seng, Song +Cho, Hee +Cho, Hyun-Chul +Choi, Jungwoon +Choi, Yong-Ho +Choi, Young +Chong, Song-won +Chong, Yap Seng +Chong, Yi +Chou, Bih-Er +Chou, Ching-wen +Chou, En-lai +Chou, Fu +Chou, Ping +Chou, Shun-hsin +Chou, Wen-Chung +Chou, Zhang +Cho, Van Tran +Cho, Wha Soon +Cho, Young-rae +Chuang, Chuang Tsai +Chuang, Hua +Chuang, Ying-chang +Chu, Anping +Chuan, Yu +Chu, Ching-wu +Chu, Ching-Wu +Chu, Dagao +Chu, Djang +Chu, Li +Chu, Minyi +Chun, Doo Hwan +Chun, Soonok +Chun, Soon-ok +Chun, Tao +Chu, Va?n Ta??n +Chu, Yo-han +Chu, Youyi +Chu, Youyi +Cui, Shuyi +Cui, Shuzhi +Cui, Yingjie +Cui, Yongqiang +Cui, Yu +Dai, Guan +Dai, Jinhua +Dai, Qing +Dai, Sijie +Dai, Wangshu +Dai, Xi +Dai, Xianglong +Dai, Zhong +Daxing, Han +Dazhong, Xu +De-cheng, Luo +Deng, Jun +Deng, Xiaoping +Deng, Yingchao +Ding, Chen +Ding, Qiulin +Ding, Song +Ding, Wei +Dingyi, Lu +Ding, Zheng +Ding, Zhongli +Dong, Biwu +Dongfang, Wang +Dong, Guangchang +Dong, Han +Dong, He +Dong, Jie +Dong, Jinxia +Do, Ngoc Diep +Dong, 
Shizhong +Dong, Yu +Dong, Zhiming +Dong, Zhongshu +Donovan, Lee +Du, Fu +Du, Guangting +Du, Jian'guo +Du, Juan +Du, Liang +Du, Ling +Du, Liping +Duong, Le Quy +Duong, Tan Nhut +Duong, Thanh Bi?nh +Du, Qinggang +Du, Xia +Du, Xiangwan +Du, Zheng +Ersheng, Gao +Eu-Yang, Kwang +Fang, Dan +Fang, Di +Fang, Ding +Fang, Guan +Fang, Huang +Fang, Lizhi +Fang, Ning +Fang, Qian +Fang, Xiang +Fang, Zhaoben +Fang, Zhi-yang +Fang, Zhong +Fan, Hai-fu +Fan, He +Fan, Hong +Fan, Kang +Fan, Keh-Li +Fan, Ren +Fan, Shen +Fan, Shou-shan +Fan, Wang +Fei, Dawei +Fei, Xiaotong +Fei, Xin +Feng, Boyi +Feng, Congying +Feng, Fang +Feng, Guomei +Feng, Jiannan +Feng, Jicai +Feng, Lei +Feng, Menglong +Feng, Shi +Feng, Shu +Feng, Wu +Feng, Xu +Feng, Youlan +Feng, Yuxiang +Feng, Zhaoshu +Feng, Zhiqiang +Fu, Baoshi +Fu, Biao +Fu, Chongyue +Fu, Chunjiang +Fu, Lin +Fung, Chi Ming +Fu, Ping +Fu, Tianchou +Fu, Xi +Fu, Yi +Fu, Yiyuan +Fu, Zheng +Fu, Zhongwen +Fu, Zizhi +Gan, Fu +Gang, Fan +Gan, Gu +Gang, Yi +Gan, Tang +Gao, Chien +Gao, Chongshou +Gao, Ertai +Gao, gang +Gao, Gang +Gao, Hong +Gao, Jun +Gao, Lin +Gao, Min +Gao, Qiang +Gao, Wei +Gao, Xingjian +Gao, Xueyu +Gao, Yaojie +Gao, Zhan +Gao, Zhenzhong +Ge, Chuan'gui +Ge, Hong +Geng, Biao +Geng, Han +Geng, Junying +Geng, Lin +Geng, Yu +Ge, Wen +Ge, Xiao-jia +Ge, Ying +Ge, You +Gita, May +Goh, Chen Chuan +Goh, Chok Tong +Goh, Keng Swee +Goh, Kim Leng +Goh, Pei Ki +Goh, Poh Seng +Goh, Sing Yau +Gong, Li +Gongsun, Long +Guang, Lu +Guan, Hanqing +Guan, Liang +Guan, Tong +Guan, Yu +Gu, Cheng +Gu, Gongxu +Gu, Hongzhong +Gu, Hui +Gui, Lin +Gui, Shixun +Gu, Jiegang +Gu, Jun +Gu, Li +Guo, Benyu +Guo, Boling +Guo, Daiheng +Guo, Degang +Guo, Huadong +Guo, Juan +Guo, Kaizhou +Guo, Moruo +Guo, Shi +Guo, Wenbin +Guo, Yingqiu +Guo, Yue +Guo, Yuji +Gu, Shulin +Gu, Wei +Gu, Xingyuan +Gu, Xiong +Gu, Yan +Gu, Yuan +Gu, Zhenqing +Gu, Zhiwei +Han, Changfu +Han, Chong +Han, Dongfang +Han, Fei +Han, Fook Kwang +Han, Fu-ru +Han, Lianfen +Han, Mac T?u +Han, Meilin +Han, Mui Ling +Han, Shaogong +Han, Suyin +Han, Tang +Han, Wu +Han, Xiang +Han, Xin +Han, Yazhou +Han, Yi +Han, Zhong +Hao, Jie +He, Chang +He, Changling +He, Da +He, Fei +He-guang, Wu +He, Hanqiu +He, Jie +He, Jifeng +He, Jiuying +He, Li +Heng, Chau +Heng, Chye Kiang +Heng, Sure +He, Ping +Heping, Yu +He, Qinglian +He, Shi +He, Tian +He, Ying +He, Yingqin +He, Yuan-Jin +He, Zuoxiu +Hoang, Chu Duy +Hoang, H?ai Thu?y +Hoang, Ngoc Lung +Ho, Chi Wing +Ho, Chung +Ho, Hsiang-ning +Ho, Mian Lian +Hong, Cao +Hong, Chang +Hong, Chengchou +Hong, Hao +Hong, Qiu +Hong, Ren +Hong, Rengan +Hong, Seung-pyo +Hong, Su +Hong, Xuntao +Hong, Yin +Hong, Ying +Hong, Yingming +Hong, Yu +Hong, Yun-suk +Hong, Zhang +Ho, Peng Kee +Ho, Rih Hwa +Hou, Jianping +Hou, Jinglun +Hou, Wai-lu +Hou, Xianguang +Ho, Wing Meng +Ho, Yi +Ho, Yuk Ming +Hua, Gang +Hua, Guofeng +Hua, Junwu +Hua, Kang +Hua, Lu +Hua, Luogeng +Huang, Bing-shan +Huang, Bingsheng +Huang, Bingyin +Huang, Chieh +Huang, Da +Huang, Ertian +Huang, Fan +Huang, Fu +Huang, Geng +Huang, Gongwang +Huang, Jianping +Huang, Kun +Huang, Po +Huang, Qi +Huang, Qian +Huang, Qun +Huang, Songjie +Huang, Tsung-hsi +Huang, Weiwen +Huang, Xiaokai +Huang, Xiaoming +HUANG, XIAOMING +Huang, Xing +Huang, Yan +Huang, Yao +Huang, Yaozeng +Huang, Yong Ping +Huang, Zhenhua +Huang, Zhihong +Huang, Zunxian +Huan, Li +Hua, Wang +Hua, Wu Yin +Hua, Yun +Hu, Baotong +Hu, Chen +Hu, Ch'iao-mu +Hu, Fang +Hu, Hanmin +Hu, Hesheng +Hu, Hsiu-ying +Hui, Wang +Hui, Yuan +Hui, Zhou +Hu, Jianxiong +Hu, Jinchu +Hu, Jun +Hung, Kwok-yuen +Hung, 
Leung-kim +Hung, Ying-ming +Hu, Ning +Huo, Wang +Hu, Qing +Hu, shi +Hu, Shi +Hu, Shiguang +Hu, Shih Chang +Hu, Shih-Chang +Hu, Tian +Hu, Xiabo +Hu, Yaobang +Hu, Yongkai +Hu, Zhihui +Hu, Zhong-xiong +Hwang, Sun-Ae +Hwang, Yin +I, Cheng +Jaihiun, Kim +Jiang, Bo +Jiang, Guoliang +Jiang, Hong +Jiang, Huan +Jiang, Jiehong +Jiang, Jin +Jiang, Leiwen +Jiang, Pan +Jiang, Ping +Jiang, Qian +Jiang, Qing +Jiang, tianji +Jiang, Wen +Jiang, Yiming +Jiang, Ying +Jiang, Yu +Jiang, Yuan +Jiang, Zemin +Jiang, Zhongyi +Jian, Hu +Jian, Xianai +Jian, Xian'ai +Jian, Zhao +Jiao, Bo +Jiao, Guorui +Jiao, Shunfa +Ji, Dachun +Jie, Chen +Jie, Liang +Jie, Ouyang +Jie, Yuan +Ji, Junxiang +Jikun, Liu +Ji, Lanwei +Ji, Lin +Jin, Bohong +Jin, Cui +Jin, Di +JIN, DI +Jin, Dongyan +Jing, Chi +Jing, Heng +Jing, Ke +Jing, Qi +Jing, Qing +Jingqing, Yang +Jing, Su +Jin, Hui De +Jin, Jiang +Jin, Jie +Jin, Jing +Jin, Li +Jin, Lin +Jin, Nailu +Jin, Wei +Jin, Xuqi +Jin, Yong +Jin, Yun +Ji, Qiang +Ji, Si +Ji, Yuan +Ji-zhou, Yang +Jongsung, Kim +Ju, Zi +Kai, Chen +Kai, Zheng +Kang, Bao +Kang, Feng +Kang, Han +Kang, Mi-sun +Kang, Nae-hui +Kang, Sheng +Kang, Youwei +Katayama, Sen +Kawada, Jun +Ke, Chi +Ke, Fu +Ke, Huang +Ke, Li +Ke, Qin +Ke, Yan +Ke, Yun Lu +Khoo, Boo Teik +Khoo, Hong Woo +Khoo, Hoon Eng +Khoo, Joo Ee +Khoo, Kheng-Hor +Khoo, Seow Hwa +Khoo, Swee Chiow +Kiang, Kang-hu +Kim, Chie-woon +Kim, Chi-ha +Kim, Choong Han +Kim, Chull Baum +Kim, Do-Kyun +Kim, Dong-sung +Kim, Dongwook +Kim, Hak-Joon +Kim, Heechul +Kim, Ho +Kim, Hyo-jin +Kim, Hyung-A +Kim, In +Kim, Jang-Soo +Kim, Jong-Il +KIM, JONG-IL +Kim, Jong Kil +Kim, Kang-wo?n +Kim, Kang Won +Kim, Kap-su +Kim, Kihwan +Kim, Ki-Young +Kim, Kwang Soo +Kim, Kwang-sun +Kim, Kyu-sik +Kim, Pyung Soo +Kim, Ronyoung +Kim, San +Kim, Sang-jin +Kim, Sung-il +Kim, Sung-Soo +Kim, Sung-woo +Kim, Su-yong +Kim, Tae-gyun +Kim, Tae Hee +Kim, Tae-Young +Kim, Unsoo +Kim, Yong-dae +Kim, Yong-il +Kim, Yong-Nam +Kim, Yo?n-kyo?ng +Kim, Young-Sub +Kim, Young-Suk +Kin, Long +Kuan, Hsin-chi +Kuan, Liang +Ku, Fu-sheng +Ku, Hung-ming +Kung, Chen +Kuo, Chu-kun +Kuo, Lien Ying +Kuo, Ping-chia +Kuo, Sung-t'ao +Ku, Sang +Lai, Kwok Kin +Lai, Nam Chen +Lai, Po Kan +Laney, Lee +Lang, Li +Lan, Luh Luh +Lan, Peijin +Lan, Yu +Le, Duc Thuong Thuy +Lee, Chang-Ho +Lee, Chew Kang +Lee, Chi Ho +Lee, Chin-Chiu +Lee, Chin Koon +Lee, Chung Hing +Lee, Ding Fai +Lee, Dongju +Lee, Dong Wook +Lee, Gek Ling +Lee, Hoi-Chang +Lee, Ho Yin +Lee, Hsien Loong +Lee, Kin Kiong +Lee, Kuan Yew +Lee, Kwan +Lee-Lueng, Fu +Lee, May +Lee, Na Hyeon +Lee, Sang-Bok +Lee, Sang-Hun +Lee, Sun-ai +Lee, Sun-young +Lee, Teng-hui +Lee, Tsao Yuan +Lee, Tzu Pheng +Lee, Wai Heng +Lee, Wai-man +Lee, Weng Choy +Lee, Ying-arng +Lee, Ying-Yuan +Lee, Young-Jin +Lei, Chen +Lei, Congyun +Lei, Guang +Lei, Li +Lei, Qin +Le, Manh Hung +Le, Thac Can +Le, Thi Van Hue +Liang, Chen +Liang, Ji +Liang, Liangxing +Liang, Peilong +Liang, Qi +Liang, qichao +Liang, Qichao +Liang, Sicheng +Liang, Xiao +Liang, Xin +Liang, Yan +Liang, Yu +Liang, Yuan +Li, Ao +Liao, Chen +Liao, Zhongkai +Li, Bing +Li, Bing-Ren +Li, Chai +Li, Chengli +Li, Chengsen +Li, Chien-nung +Li, Ching +Li, Chuan-Kuei +Li, Chun +Li, Chunfeng +Li, Chung +Li, Chunxia +Li, Cunxin +Li, Dai +Li, Dajue +Li, Daosheng +Li, Dazhao +Li, Dejin +Li, Deming +Li, Deng +Li, Desheng +Li, Deyin +Lien, Chan +Lienfung, Li +Li, Fanggui +Li, Gonglin +Li, Guang +Li, Guo +Li, Haopei +Li, Ho +Li, Hongzhang +Li, Hongzhi +Lihua, Yang +Li, Jiajun +Li, Jiangshu +Li, Jiaqi +Li, Jinshan +Li, Kaining +Li, Kun +Li, Lan +Li, Lanqing +Li, 
Lian Ong +Li, Lisan +Li, Liu +Li, Lu +Li, Mao +Lim, Bee-Lum +Lim, Boon Keng +Lim, Chu Sing +Li, Meng +Lim, Guan Hua +Li, Min +Li, Mingzhu +Lim, Jae-Won +Lim, Li Ching +Lim, Li Lin +Lim, Poh Eng +Lim, Yew Hock +Li, Na +Lin, Biao +Lin, Cheng +Lin, Dan +Lin, Fanghua +Lin, Fengmian +Ling, Chung +Ling, Kong +Ling, Li +Ling, Liu +Lin, Gu +Ling, Yeou-ruenn +Ling, Yuan +Lin, Handa +Lin, Huang +Li, Nina +Li, Ning +Lin, Jensen +Lin, Jianhua +Lin, Lap-Chew +Lin, Li +Lin, Lu? +Lin, Piao +Lin, Qian +Linshan, Hua +Lin, Shaopei +Lin, Tsung-yi +Lin, Tsung-Yi +Lin, Xie +Lin, Xu +Lin, Ying +Lin, Yue +Lin, Yutang +Lin, Zexu +Lin, Zhan +Lin, Zhengyan +Lin, Zhong +Lin, Zuo +Li, Peiwen +Li, Peizhu +Li, Peng +Li, Qian +Li, Qiangsheng +Li, Qingzhao +Li, Qunying +Li, Ren +Li, Ruihuan +Li, Ruinian +Li, Shangyin +Li, Shang-yin +Li, Shantong +Li, Shiji +Li, Shizhen +Li, Shuang +Li, Shutian +Li, Si +Li, Siguang +Li, Songfu +Li, Su +Li, Tai +Li, Tan +Li, Tana +Li, Tang +Li, Tche-houa +Li, Tie +Liu, Bao +Liu, Binyan +Liu, Cengdian +Liu, Chen +Liu, Chengcai +Liu, Chen Hui +Liu, Chih +Liu, Chin +Liu, Ching +Liu, Chuang +Liu, Dong +Liu, Dongping +Liu, Fang +Liu, Fei +Liu, Fuhua +Liu, Gongwang +Liu, Guoliang +Liu, Haisu +Liu, Han Wen +Liu, Heung Shing +Liu, Huan +Liu, Huaqing +Liu, Hua yang +Liu, Hua-yang +Liu, Huihao +Liu, Huixia +Liu, I-ming +Liu, Jian +Liu, Jianjun +Liu, Jing-tong +Liu, Ji-ping +Liu, Ju +Liu, Jung-en +Liu, Kai +Liu, Kunyi +Liu, Li +LIU, LI +Liu, Liping +Liu, Qiming +Liu, Shaoqi +Liu, Shehui +Liu, Shicong +Liu, Shuzhen +Liu, Taigong +Liu, Tao +Liu, Ts'un-yan +Liu, Weixin +Liu, Wenhui +Liu, Wenmin +Liu, Wenzhe +Liu, Xia +Liu, Xiaobo +Liu, Xiaoqing +Liu, Xing +Liu, Xingzhen +Liu, Xinwu +Liu, Xujie +Liu, Yandong +Liu, Ye +Liu, Yongqing +Liu, Yuanman +Liu, Yun +Liu, Yunfeng +Liu, Zaihua +Liu, Zaixing +Liu, Zhenkai +Liu, Zhijun +Liu, Zhiwei +Liu, Zhonglu +Li, Weining +Li, Wenliang +Li, Wenyan +Li, Xi +Li, Xiangdong +Li, Xiaofeng +Li, Xiaoxiang +Li, Xieu-Lin +Li, Xinyuan +Li, Xuemei +Li, Xuewu +Li, Ye +Li, Yi +Li, Yihua +Li, Yinhe +Li, Yong +Li, Yuanhong +Li, Yuchun +Li, Yu-ming +Li, Yunfei +Li, Zhaoxiang +Li, Zhaoxing +Li, Zheng +Li, Zhenji +Li, Zhenjie +Li, Zhensheng +Li, Zhiwu +Li, Zhuo +Li, Zigan +Li, Zishun +Li, Zongren +Li, Zongwei +Li, Zunian +Lo, Chiung-yu +Lo, Hsiang-lin +Lo, Kuang-pin +Lo, Mei Hing +Long, Xu +Lu, Chuanrong +Lu, Daren +Lu, Feng +Lufeng, Tang +Lu, Gusun +Lu, Jiaquan +Lu, Jiuyuan +Lu, Le +Lu, Li +Lu, Meng +Luo, Jialun +Luo, Ruiqing +Luo, Ti-lun +Luo, Wei +Luo, Weihong +Luo, Yang +Luo, Yin +Luo, Zewen +Luo, Zhaohong +Lu, Ruilan +Lu, Shengli +Lu, Sheng-yen +Lu, Shi +Lu, Shoukang +Lu, Shun +Lu, Ting +Lu, Xinchang +Lu, Xinhua +Lu, Xinsen +Lu, Xixing +Lu, Yubin +Lu, Zhan +Ma, Baolin +Ma, Chih +Ma, Cindy W +Ma, Huan +Ma, Ji +Ma, Kai +Ma, ke +Malborg, Kim +Ma, Lunzy +Ma, Mingjia +Ma, Mingtong +Ma, Ngok +Ma, Ning +Mao, Dun +Mao, Peiqi +Mao, Yan +Mao, Zedong +Ma, Shijun +Ma, Shuli +Ma, Wen +Ma, Xia +Ma, Xiaodong +Ma, Yinchu +Ma, Ying +Ma, Ying-jeou +Ma, Zhiyuan +Ma, Zhonglin +Ma, Zongjin +Mei, Ding +Mei, Hua +Mei, Lin +Mei, Ying +Mei, Zhang +Mei, Zu-yan +Meng, Hao-jan +Meng, Haoran +Meng, Hua +Meng, Jian +Menglin, Zhao +Meng, Shen +Meng, Xi +Meng, Xiankun +Meng, Xianshi +Meng, Yue +Miao, Ying +Ming, Fan +Ming, Hai +Ming, Ho-Yu +Ming-le, Yao +Ming, Lu +Min, Jiayin +Min, Xu +Min, You +Mo, Bangxian +Mo, Yamin +Mo, Yan +Mo, Yang +Mu, Soeng +Mu, Soeng Sunim +Mu, Xia +Mu, Zi +Nanquan, Lu +Ng, Aik Kwang +Ng, Bee Chin +Ng, Beng Yeong +Ng, Chee Yuen +Ng, Chin-Keong +Ng, Hock Guan +Ng, Ho-yee +Ng, Kung-fu 
+Ng, Pak Tee +Ng, Seik Weng +Ng, Suat Tong +Nguyen, Dinh Huu +Nguyen, Duc Hiep +Nguyen, Duy Thai Son +Nguyen, Khanh +Nguyen, Thanh Hai +Nguyen, Thi Dieu +Nguyen, Thi Minh Ha +Nguyen, Thi Thanh Binh +Nguyen, Van Canh +Nguyen, Van Chuyen +Nguyen, Van Dao +Nguyen, Van Ly +Nguyen, Van Nghi +Nguyen, Van Thoai +Nguyen, Viet Dung +Nguyen, Viet Thanh +Nguyen, Xuan Oanh +Nguyen, Xuan Thu +Ng, Wun Jern +Ning, Qiang +Ouyang, Yi +Ouyang, Yu +Pan, Chengbiao +Pan, Chengdong +Pan, Guxi +Pan, Jiezi +Pan, Ling +Pan, Song +Pan, Tianshou +Pan, Wuhua +Pan, Xiafeng +Pan, Yue +Pan, Yunhe +Pei-ji, Chen +Pei-kang, Chang +Peilin, Sun +Pei, Songzhi +Pei, Yan +Peng, Dehuai +Peng, Feng +Peng, Liang +Peng, Ming-min +Peng, Peiyun +Peng, Shengchao +Peng, Shi +Peng, Xianchu +Peng, Zhen +Phan, Chu Trinh +Phan, Nhie?n Ha?o +Phan, Thie??n Cha?u +Pi, Lei +Pi, Li +Ping, Fan +Ping, Xin +Ping, Zhou +Po, Lee +Po, Sung-nien +Pu, Wei +Qiang, Dong +Qiang, Han +Qiang, Zhang +Qian, Li +Qian, Lin +Qian, Ma +Qian, Mei +Qian, Qichen +Qian, Shi +Qian, Wu +Qian, Xuantong +Qian, Yongfu +Qian, Zheng +Qian, Zhengying +Qian, Zhongshu +Qiao, Hong +Qiao, Jinlin +Qiao, Yi +Qi, Baishi +Qi, Dong +Qi, Huang +Qi, Jiguang +Qi, Min +Qin, Danhua +Qin, Xiao-meng +Qin, Yu +Qin, Zhong +Qi, Pan +Qi, Peng +Qiqian, Li +Qiu, Jin +Qiu, Jun +Qiu, Yu +Qi, Xin +Qi, Yan +Qiying, Hu +Qu, Bo +Ren, De-lin +Ren, Guang +Ren, Jianxin +Ren, Jie +Ren, Jishun +Ren, Lu +Ren, Wu +Ren, Xingsheng +Ren, Xiong +Ren, Yi +Ren, Yongchang +Ren, Zhong +Ruan, Huaduan +Ruan, Ji +Rui, Guozhang +Rui, Lin +Rui, Mu +Rui, Naiwei +Rui, Zhang +Sha, Jicai +Sha, Kokken +Sha, Lin +Shang, Xianmin +Shang, Yuan Ren +Shao-kung, Lin +Shao, Wei Liu +Shao, Xunzheng +Shao, Ya +Sha, Yexin +Shen, Baozhen +Sheng, Huanye +Sheng, Lijun +Sheng, Peilin +Sheng, Shicai +Sheng, Zhi-yong +Shen, Huang +Shen, Jiaben +Shen, Jianmin +Shen, Jie +SHEN, JIE +Shen, Junru +Shen, Kai +Shen, Kangshen +Shen, Kuo +Shen, Li +Shen, Weirong +Shen, Xianjie +Shen, Zhang +Shen, Zhao-wen +Shen, Zhou +Shen, Ziyin +Shi, Chen +Shi, Hui +Shi, Jun +Shi, Kefa +Shi, Lang +Shi, Min +Shimin, Geng +Shi, Qinan +Shi, Song +Shi, Tao +Shi, Wei +Shi, Xiaojing +Shi, Xiaoqing +Shixuan, Xu +Shi, Yan +Shi, Young +Shi, Yu +Shi, Zuhui +ShouQing, Wang +Shui, Wei +Shui, Yi +Shu, Li +Shu, Lin +Si, Chi Ko +Si, Ma +Sima, Qian +SiMa, Qian +Si, Ren +Situ, Tan +Si, Yuan +Song, Ci +Song, Dong-Wook +Song, Geng +Song, Jiaoren +Song, Jinshou +Song, Nong +Song, Ping +Song, Qi +Song, Shouxiang +Song, Wan +Song, Wu +Song, Xing-Chang +Song, Yankun +Song, Yu +Song, Zheyuan +So-Young, Lee +Su, Dajun +Su, Fu +Su, Hao +Su, Hua +Su, Huana +Su, Ling +Sun, Chao +Sun, Chengnan +Sun, Fuchu +Sun, Guangyuan +Sun, Haichen +Sun, Han +Sun, Jian +Sun, Jianhong +Sun, Kaitai +Sun, Liang +Sun, Ou +Sun, Shifang +Sun, Shuyun +Sun, Wu +Sun, Xiang +Sun, Xiaochun +Sun, Xingyuan +Sun, Xu +Sun, Xue-quan +Sun, Yanqing +Sun, Yat-sen +Sun, Yingjie +Sun, Youjun +Sun, Yue +Sun, Zhen +Sun, Zhiwei +Sun, Zuxun +Su, Rong +Su, Shi +Su, Wenming +Su, Xiaojun +Su, Yu +Su, Zongwei +Taeho, Kim +Tai, Chen +Tai, Ming Cheung +Takashi, Inoguchi +Takashi, Negishi +Tan, Ai Mei +Tan, Beng Luan +Tan, Boon Tee +Tan, Cheng Han +Tan, Cheng Lock +Tan, Chin Kwang +Tan, Chong Kee +Tan, Dun +Tang, Jinfa +Tang, Jing +Tang, Junyi +Tang, Li +Tang, Ting-Ao +Tang, Wei +Tang, Xiaofang +Tang, Xiren +Tang, Yin +Tang, Yungmei +Tang, Zhaoliang +Tang, Zhou +Tan, Hong +Tan, Hong Yew +Tan, Huaixiang +Tan, Jiazhen +Tan, Juay Miang +Tan, Khee Giap +Tan, Kok-Keong +Tan, Koonlin +Tan, Lee Meng +Tan, Lin-tung +Tan, Loke Khoon +Tan, Mew Hong 
+Tan, Ong Bee +Tan, Phay Ping +Tan, Siew Ee +Tan, Siew Sin +Tan, Sitong +Tan, Sok Khim +Tan, Soo Jiuan +Tan, Su-Lyn +Tan, Tai Wei +Tan, Tee Jim +Tan, Teik-Kheong +Tan, Thiam Soon +Tan, Tin Wee +Tan, Yang Meng +Tan, Yankai +Tan, Yew Hock +Tan, Yew Soon +Tan, Yi +Tao, Cheng +T'ao, Ch'ien +T'ao, Chi'en +Tao, Hua +Tao, Jian +Tao, Jin +Tao, Xingzhi +Tao, Zhang +Tao, Zhenghua +Tay, Boon Nga +Teng, Chao-chao +Teng, Chia-yee +Thai, Ngoc Diep +Thai, Quang Trung +Tian, Jia +Tie, Ning +Tieya, Wang +Tie, Yuan +Ting, Joo Fai +Ting, Li +Ting, Su-Yin +Ting, Wai +Ting, Wang +Ting, Wei +Tong, Dizhou +Tong, Jinnan +Tong, Li +Tong, Shijun +Tong, Zhongtao +Tong, Zhongyi +Torrey, Kim +Tsai, Chin +Ts'ai, Yuan +Tung, shu +T'ung, Shu +Vu, Thien Binh +Vu, Trong Phung +Wan, Changsen +Wang, Anshi +Wang, Anyi +Wang, Bi +Wang, Biao +Wang, Bing +Wang, Can +Wang, Chen-ho +Wang, Chi-ssu +Wang, Chong +Wang, Dao +Wang, Dong +Wang, Dulu +Wang, Du Lu +Wang, Dun +Wang, Duo +Wang, En'guang +Wang, Fangyu +Wang, Fanxi +Wang, Fengzhu +Wang, Fuchun +Wang, Fuzhi +Wang, Gu +Wang, Guangmei +Wang, Guangya +Wang, Guangyi +Wang, Hongtu +Wang, Hongwen +Wang, Hsing-pei +Wang, Huan +Wang, Huanan +Wang, Huijiong +Wang, Hui-ling +Wang, Huiming +Wang, Jiancheng +Wang, Jianmin +Wang, Jianzheng +Wang, Jingwei +Wang, Jinshan +Wang, Jiye +Wang, Kemin +Wang, Kui +Wang, Kun +Wang, Kung-hsing +Wang, Kuo-wei +Wang, Liang +Wang, Liangbi +Wang, Liming +Wang, Liqun +Wang, Lixian +Wang, Lixiong +Wang, Lu +Wang, Luxiang +Wang, Maorong +Wang, Min +Wang, Ming +Wang, Mingjie +Wang, Pi +Wang, Pin +Wang, Pingyang +Wang, Pinxian +Wang, Qiliang +Wang, Qinglin +Wang, Qingyun +Wang, Rongda +Wang, Rui +Wang, Sheng +Wang, Sheng-Wei +Wang, Shijun +Wang, Shixun +Wang, Shiyi +Wang, Shizhen +Wang, Shouren +Wang, Shuo +Wang, Song +Wang, Sung +Wang, Tieya +Wang, Tongsan +Wang, Tuoming +Wang, Wenhua +Wang, Wenjiong +Wang, Xi +Wang, Xianzhi +Wang, Xiao +Wang, Xiaobo +Wang, Xiaoning +Wang, Xiaotian +Wang, Xiaoyan +Wang, Xiaoyun +Wang, Xing Chu +Wang, Xiu +Wang, Xizhi +Wang, Xu +Wang, Xuanjie +Wang, Xuecheng +Wang, Xuewen +Wang, Yanrong +Wang, Yao-t'ing +Wang, Yi'e +Wang, Yinzhi +Wang, Yue +Wang, Yuehan +Wang, Zheng +Wang, Zhengshu +Wang, Zhengyi +Wang, Zhiping +Wang, Zhiwen +Wang, Zhizhi +Wang, Zhongchun +Wang, Zhonggao +Wang, Zhongyi +Wang, Zili +Wan, Kwai Pik +Wan, Li +WAN-SOON, KIM +Wee, Kim Wee +Wei, Dong +Wei, Fajie +Wei, Hu +Wei, Ji +Wei, Jingsheng +Wei, Li Chen +Wei, Liming +Wei, Luo +Weiming, Shen +Wei, Ping +Weiping, Wang +Wei, Song +Wei, Su +Wei, Sui +Wei, Tian +Wei, Wen +Wei, Wenbo +Wei, Wenhua +Wei, Xu +Wei, Yan +Wei, Yang +Wei, Yuanping +Wei, Yue +Weiyu, Jiang +Wei, Zhang +Wei, Zhao +Wei, Zhen +Wen, Chi +Wen, Ding +Wen, Hao +Wen, Hong +Wen, Hua +Wen, Jingen +Wen, Jinhai +Wenkuan, Ma +Wen, Qing +Wenqing, Wang +Wen, Tianxiang +Wen, Yiduo +Wen, Yinghong +Wen, You +Wen, Yu +Wen, Zhengming +Wen, Zhenheng +Wen, Zhong +Whei-Jen, Chen +Wong, Chian Voen +Wong, Choon Ching +Wong, Heung Wah +Wong, Jim +Wong, Kang-Ying +Wong, Kar-wai +Wong, Kin-yuen +Wong, Kwei Cheong +Wong, Kwok-Chu +Wong, Phui Nam +Wong, Shiu Hon +Wong, Sook Ching +Wong, Wah Sang +Wong, Yew Kwan +Wu, Chen +Wu, Chengkang +Wu, Chen-Xu +Wu, Chong-shi +Wu, Chou +Wu, Chuanjun +Wu, Daisheng +Wu, Enyu +Wu, Guang +Wu, Guanghua +Wu, Guanzhong +Wu, Guo +Wu, Hao +Wu, Jialiang +Wu, Jie +Wu, Jiemin +Wu, Jin +Wu, Jingchao +Wu, Jinglian +Wu, Jingzi +Wu, Juntao +Wu, Kepi +Wu, Kong +Wu, Kwang +Wu, Mingyu +Wu, Ningkun +Wu, Piao +Wu, Qun'gan +Wu, Shunjun +Wu, Tao +Wu, Tingfang +Wu, Tsong-shien +Wu, Tung +Wu, Wenjin +Wu, 
Wenjun +Wu, Xiao An +Wu, Xiaochun +Wu, Xing +Wu, Xiufen +Wu, Xun +Wu, Yi +Wu, Yiming +Wu, Ying-hua +Wu, Yuanfang +Wu, Yuzhang +Wu, Zhang +Wu, Zhengyi +Wu, Zhou +Xia, Guang +Xia, Guoping +Xia, Hong +Xia, Jian-Bai +Xia, Nai +Xiang, Huaicheng +Xiang, Mu +Xiang, Ying +Xiang, Zhang +XIAOBO, LU +Xiao, Cheng +Xiao, Fan +Xiao, Fei +Xiao, Han +Xiao, Hong +Xiao, Jimei +Xiao, Ke +Xiaoli, Sun +Xiao, Ma +Xiaoming, Zhang +Xiao, Qian +Xiao, Qin +XIAO, QIN +Xiao, Shiling +Xiao, Yan +Xiao, Yang +Xiao, Yanling +Xiao, Zhang +Xia, Yang +Xia, Yi +Xie, Fei +Xie, Jialin +Xie, Jin +Xie, Jinyuan +Xie, Juezai +Xie, Jun +Xie, Kai +Xie, Qu-bing +Xie, Xiande +Xie, Xide +Xie, Yu-Zhang +Xie, Zheng +Xie, Zhufan +Xi, Gao +Ximen, Jiye +Xin, Gao +Xing, Huo-yu +Xing, Lin +Xing, Lu +Xing, Lujian +Xing, Shu +Xin, Hua +XINJIANG, RONG +Xin, Lu +Xin, Mu +Xin, Tian +Xin, Wen +Xin, Ying +Xin, Zheng +Xiong, Fan +Xiong, Qinglai +Xiu, Yu +Xi, Zhang +Xuan, Wu +Xu, Beihong +Xu, Bing +Xu, Dan +XU, DAN +Xu, Dixin +Xue, Hanqin +Xue, Jianxin +Xue, Yu +Xu, Gan +Xu, Guangqi +Xu, Guohua +Xu, Hong-yan +Xu, Hong Yan +Xu, Huping +Xu, Jian +Xu, Jianchu +Xu, Jiawei +Xu, Jiazhong +Xu, Jincheng +Xu, Liangying +Xu, Meng +Xu, Mengzhong +Xu, Ming +Xun, Lin +Xun, Zhao +Xu, Qian +Xu, Senlin +xu, Shen +Xu, Shichang +Xu, Weiguo +Xu, Wen +Xu, Xi +Xu, Xian +Xu, Xianquan +Xu, Xiaoge +Xu, Xiaojie +Xu, Xing +Xu, Xuchang +Xu, Yihou +Xu, Yuanzhong +Xu, Yuhuan +Xu, Yunlong +Xu, Zhaoran +Xu, Zhu +Yan, Chongnian +Yan, Fu +Yang, Bo +Yang, Chao +Yang, Chen +Yang, Chengwu +Yang, Dadi +Yang, Dao +Yang, Fang +Yang, Fu +Yang, Fudong +Yang, Hanxi +Yang, Hongyuan +Yang, Hsiao +Yang, Huan +Yang, Hui +Yang, Huizhong +Yang, Jiasan +Yang, Jing +Yang, Jinghui +Yang, Jingyu +Yang, Jizhou +Yang, Kaizhong +Yang, Ke +Yang, Lan +Yang, Liping +Yang, Liwei +Yang, Shangkun +Yang, Wan-li +Yang, Weitao +Yang, Wenyi +Yang, Wenzhen +Yang, Xian +Yang, Xiao +Yang, Xiaojun +Yang, Xiaoqing +Yang, Xiguang +Yang, Xinrong +Yang, Yin +Yang, Ying +Yang, Yi Xian +Yang, Yi-yen +Yang, Yongjian +Yang, Yu +Yang, Yu? 
+Yang, Zenghong +Yang, Zhensheng +Yang, Zhi-jun +Yang, Zhu +Yang, Zi +Yan, Han +Yan, Hong +Yan, Jian +Yan, Jiaqi +Yan, Kejia +Yan, Menghui +Yan, Pei-Ming +Yan, Ruizhen +Yan, Shang +Yan, Sun +Yan, Wang +Yan, Wei +Yan, Xin +Yan, Xingjian +Yan, Xuetong +Yan, Yixun +Yan, Yu +Yan, Zhen'guo +Yao, Chang +Yao, Dianfang +Yao, Fu +Yao, Huang +Yao, Li +Yao, Ming +Yao, Qian +Yao, Suihan +Yao, Wang +Yao, Wenyuan +Yao, Zhang +Yau, Ching +Ye, Bai +Ye, Duzheng +Ye, Jianying +Ye, Jin +Ye, Lin-Sheng +Ye, Minghan +Yen, Chia-kan +Yen, Hsi-shan +Yen, Ping-Chiu +Yen, Wei +Yen, Wenchun +Ye, Ping Kuei +Ye, Sen +Ye, Su +Ye, Xiao +Ye, Zhongxing +YI, DING +Yi, Hong +Yi, Huang +Yi, Jiang +Yi, Jin +Yi, Lu +Yin, Binyong +Yin, Chamroeun +Ying, Guo +Ying, Jianzhe +Ying, Lei +Ying, Lu +Ying, Lungan +Ying, Yu +Yin, Hongfu +Yin, Huihe +Yin, Jian +Yin, Jianxing +Yin, Shun +Yi, Peng +Yi, Sang +Yi, Su-gwang +Yi, Ying +Yi, Zeng +Yi, Zhongtian +Yong, Hoi-Sen +Yong, Kwet Yew +Yong, Pung How +Yong, Wang +Yong, Yan +Yoon, Choong-Nam +Yoon, Hyung Kim +Yoon, Myung-sook +You, Jia +You, Mo +Younglae, Kim +Young-moo, Kim +You, Xu +You, Yu +Yuan, Chen +Yuan, Daoxian +Yuan, Hong +Yuan, Kang +Yuan, Longping +Yuan, Muzhi +Yuan, Shao Wen +Yuan, Shen +Yuan, Shibing +Yuan, Shikai +Yuan, Xue +Yuan, Yu +Yuan, Yunsheng +Yu, Bin +Yu, Chang-Chin +Yu, Changlong +Yu, Dan +Yu, Dejun +Yu, Dong +Yue, Chongxi +Yue, Fengxia +Yue, Ma +Yu, Fei +Yu, Guang +Yu, Hao +Yu, He +Yu, Hsiu-ching +Yu, Hua +Yu, Hui-chan +Yu, Huihua +Yu, Jing-Yuan +Yu, Jung-yul +Yu, Li Ming +Yu, Lin +Yu, Maohong +Yunbo, Liu +Yung Teng, Chia-yee +Yun, Kim +Yun, Ma +Yun, Mi Antorini +Yun, Qing +Yun, Shouping +Yun, Wu +Yu, Pingbo +Yu, Qin +Yu, Sianglin +Yu, Sung +Yu, Tianwei +Yu, Wei +Yu, Xiang +Yu, Xiaohui +Yu, Xiaoyang +Yu, Xie +Yu, Xihan +Yu, Xuanji +Yu, Yaosheng +Yuyi, Wang +Yu, Young-nan +Yu, Youren +Yu, Yuntian +Yu, Zhang +Yu, Zhao +Yu, Zhu +Yu, Zhuoyun +Yu, Zicheng +Zaifu, Liu +Zeng, Cheng +Zeng, Fanren +Zeng, Guofan +Zeng, Minzu +Zeng, Weiqi +Zeng, Xianyi +Zeng, Zhen +Zhai, Zhenhua +Zhang, Bin +Zhang, Chu +Zhang, Chun +Zhang, Dachun +Zhang, Feng +Zhang, guohua +Zhang, Guohua +Zhang, Hao +Zhang, Hua +Zhang, Huan +Zhang, Junxiang +Zhang, Min +Zhang, Nan +Zhang, Ning +Zhang, Qian +Zhang, Rui +Zhang, Weihong +Zhang, Xiaoping +Zhang, Xin +Zhang, Yanqing +Zhang, Yuan +Zhang, Yue +Zhang, Zhong +Zhao, Cangbi +Zhao, Dan +Zhao, Han +Zhao, Hong +Zhao, Jun +Zhao, Lan +Zhao, Liang +Zhao, Lihong +Zhao, Man +Zhao, Mengfu +Zhao, Muying +Zhao, Qi +Zhao, Qizheng +Zhao, Rui +Zhao, Shuhan +Zhao, Wei +Zhao, Xiaolei +Zhao, Xin +Zhao, Yan +Zhao, Yong +Zhao, Yuqi +Zhao, Zhentao +Zhao, Ziyang +Zheng, Chengsi +Zheng, Dai +Zheng, Guangmei +Zheng, Guili +Zheng, Guo +Zheng, Guogu +Zheng, Guoxiong +Zheng, Hong +Zheng, Jie +Zheng, Jun +Zheng, Junli +Zheng, Lizhong +Zheng, Long +Zheng, Lu +Zheng, Mianping +Zheng, Ping +Zheng, Qing +Zheng, Quan +Zheng, Rusi +Zheng, Shiling +Zheng, Tan +Zheng, Wu +Zheng, Xiaoxu +Zhengxin, Chen +Zheng, Xun +Zheng, Yangwen +Zheng, Yide +Zheng, Yong-Nian +Zheng, Yulian +Zheng, Zhao +Zheng, Zhenman +Zheng, Zhensun +Zheng, Zhi +Zhen, Rong +Zhen, Yuan +Zhi, Chen +Zhi, Feng +Zhigang, Guo +Zhiming, Yuan +Zhi, Yang +Zhi, Yu +Zhong, Bai-song +Zhong, Gongfu +Zhong, Hong +Zhong, Jin +Zhong, Junhua +Zhongmin, Han +Zhong, Shizhen +Zhong, Xiangchong +Zhong, Xiu +Zhongyi, Yuan +Zhong, Zheng +Zhou, Baozhong +Zhou, Can +Zhou, Chuncai +Zhou, Daguan +Zhou, Di +Zhou, Dingzhi +Zhou, Enlai +Zhou, Huang +Zhou, Ji +Zhou, Jianchao +Zhou, Jianmin +Zhou, Jianren +Zhou, Kaiya +Zhou, Keqin +Zhou, Lei +Zhou, 
Lingzhong +Zhou, Lixing +Zhou, Mi +Zhou, Nanzhao +Zhou, Shidi +Zhou, Siyong +Zhou, Xiaowen +Zhou, Xuan +Zhou, Xun +Zhou, Yan +Zhou, Yiming +Zhou, Yu +Zhou, Yun +Zhou, Zhigang +Zhou, Zhiyi +Zhuang, Fenggan +Zhuang, Li +Zhuang, Youjuan +Zhu, Baoxun +Zhu, Bingyao +Zhu, Chen +Zhu, De +Zhuge, Liang +Zhu, Guanya +Zhu, Guobin +Zhu, Hongda +Zhu, Huayou +Zhu, Jiajin +Zhu, Jianhua +Zhu, Jie +Zhu, Junyi +Zhu, Liang +Zhu, Meilin +Zhu, Miaolong +Zhuo, Huang +Zhuo, Jing-Schmidt +Zhu, Qi +Zhu, Qianzhi +Zhu, Qiuxia +Zhu, Rizhang +Zhu, Rongji +Zhu, Shanan +Zhu, shenghao +Zhu, Shenghao +Zhu, Shijie +Zhu, Tingcheng +Zhu, Xian +Zhu, Xiao Di +Zhu, Xiaodong +Zhu, Xuan +Zhu, Yao +Zhu, Zhengming +Zou, Rong +Zuo, Boyang +Zuo, Qiuming +Zuo, Zongtang diff --git a/ia-legacy-importer/author/list_titles.py b/ia-legacy-importer/author/list_titles.py new file mode 100644 index 00000000..bc36cfd2 --- /dev/null +++ b/ia-legacy-importer/author/list_titles.py @@ -0,0 +1,24 @@ +from __future__ import print_function +titles = {} +with_title = {} + +for line in open("/1/pharos/edward/titles"): + try: + loc, fields = eval(line) + except SyntaxError: + break + except ValueError: + continue + t = [b for a, b in fields if a == 'c'] + if len(t) != 1: + continue + fields = tuple((a, b.strip('.') if a=='d' else b) for a, b in fields) + title = t[0].strip(' ,.').lower() + titles[title] = titles.get(title, 0) + 1 + with_title.setdefault(title, {}) + with_title[title][fields] = with_title[title].get(fields, 0) + 1 + +for k, v in sorted(((a, b) for a, b in titles.items() if b > 10), reverse=True, key=lambda x: x[1]): + print((repr(k), v)) + for a, b in sorted(((a, b) for a, b in with_title[k].items() if b > 5), reverse=True, key=lambda x: x[1])[0:30]: + print((' ', a, b)) \ No newline at end of file diff --git a/ia-legacy-importer/author/marc.py b/ia-legacy-importer/author/marc.py new file mode 100644 index 00000000..4b0a1542 --- /dev/null +++ b/ia-legacy-importer/author/marc.py @@ -0,0 +1,47 @@ +from __future__ import print_function +from catalog.infostore import get_site +from catalog.marc.db.web_marc_db import search_query +from catalog.get_ia import get_data +from catalog.marc.fast_parse import get_all_subfields, get_tag_lines, get_first_tag, get_subfields +import sys +site = get_site() + +name = sys.argv[1] # example: 'Leonardo da Vinci' +author_keys = site.things({'type': '/type/author', 'name': name}) +print(len(author_keys), 'authors found') + +edition_keys = set() +for ak in author_keys: + edition_keys.update(site.things({'type': '/type/edition', 'authors': ak})) +print(len(edition_keys), 'editions found') + +locs = set() +for ek in edition_keys: + e = site.withKey(ek) + for i in e.isbn_10 if e.isbn_10 else []: + locs.update(search_query('isbn', i)) + for i in e.lccn if e.lccn else []: + locs.update(search_query('lccn', i)) + for i in e.oclc_numbers if e.oclc_numbers else []: + locs.update(search_query('oclc', i)) +print(len(locs), 'MARC records found') + +def ldv(line): + for s in ('1452', '1519', 'eonard', 'inci'): + if line.find(s) != -1: + return True + return False + +for loc in locs: +# print loc + data = get_data(loc) + if not data: + print("couldn't get") + continue + line = get_first_tag(data, set(['100', '110', '111'])) + if line and ldv(line): + print(list(get_all_subfields(line))) + + line = get_first_tag(data, set(['700', '710', '711'])) + if line and ldv(line): + print(list(get_all_subfields(line))) diff --git a/ia-legacy-importer/author/merge.py b/ia-legacy-importer/author/merge.py new file mode 100755 index 
00000000..c2ad310f --- /dev/null +++ b/ia-legacy-importer/author/merge.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +from openlibrary.catalog.importer.db_read import withKey, get_things, get_mc +from openlibrary.catalog.read_rc import read_rc +from openlibrary.catalog.utils import key_int, match_with_bad_chars, pick_best_author, remove_trailing_number_dot +from unicodedata import normalize +import web +import re +import sys +import codecs + +import six +from six.moves import urllib + +sys.path.append('/home/edward/src/olapi') +from olapi import OpenLibrary, unmarshal, Reference +from openlibrary.catalog.utils.edit import fix_edition +from openlibrary.catalog.utils.query import query_iter + +def urlread(url): + return urllib.request.urlopen(url).read() + +def norm(s): + return normalize('NFC', s) + +def copy_fields(from_author, to_author, name): + new_fields = { 'name': name, 'personal_name': name } + for k, v in from_author.iteritems(): + if k in ('name', 'personal_name', 'key', 'last_modified', 'type', 'id', 'revision'): + continue + if k in to_author: + assert v == to_author[k] + else: + new_fields[k] = v + return new_fields + +def test_copy_fields(): + f = {'name': 'Sheila K. McCullagh', 'personal_name': 'Sheila K. McCullagh', 'last_modified': {'type': '/type/datetime', 'value': '2008-08-30 20:40:41.784992'}, 'key': '/a/OL4340365A', 'birth_date': '1920', 'type': {'key': '/type/author'}, 'id': 18087251, 'revision': 1} + t = {'name': 'Sheila K. McCullagh', 'last_modified': {'type': '/type/datetime', 'value': '2008-04-29 13:35:46.87638'}, 'key': '/a/OL2622088A', 'type': {'key': '/type/author'}, 'id': 9890186, 'revision': 1} + + assert copy_fields(f, t, 'Sheila K. McCullagh') == {'birth_date': '1920', 'name': 'Sheila K. McCullagh', 'personal_name': 'Sheila K. 
McCullagh'} + + +def update_author(key, new): + q = { 'key': key, } + for k, v in new.iteritems(): + q[k] = { 'connect': 'update', 'value': v } + print(ol.write(q, comment='merge author')) + +def update_edition(ol, e, old, new, debug=False): + key = e['key'] + if debug: + print('key:', key) + print('old:', old) + print('new:', new) + fix_edition(key, e, ol) + authors = [] + if debug: + print('current authors:', e['authors']) + for cur in e['authors']: + cur = cur['key'] + if debug: + print(old, cur in old) + a = new if cur in old else cur + if debug: + print(cur, '->', a) + if a not in authors: + authors.append(a) + if debug: + print('authors:', authors) + e['authors'] = [{'key': a} for a in authors] + + try: + ret = ol.save(key, e, 'merge authors') + except: + if debug: + print(e) + raise + if debug: + print(ret) + + update = [] + for wkey in e.get('works', []): + need_update = False + print('work:', wkey) + w = ol.get(wkey) + for a in w['authors']: + if a['author'] in old: + a['author'] = Reference(new) + need_update = True + if need_update: + update.append(w) + + if update: + ret = ol.save_many(update, 'merge authors') + +def switch_author(ol, old, new, other, debug=False): + q = { 'authors': old, 'type': '/type/edition', } + for e in query_iter(q): + if debug: + print('switch author:', e['key']) + print(e) + e = ol.get(e['key']) + update_edition(ol, e, other, new, debug) + +def make_redirect(ol, old, new): + r = {'type': {'key': '/type/redirect'}, 'location': new} + ol.save(old, r, 'merge authors, replace with redirect') + +re_number_dot = re.compile('\d{2,}[- ]*(\.+)$') + +def do_normalize(author_key, best_key, authors): + #print "do_normalize(%s, %s, %s)" % (author_key, best_key, authors) + need_update = False + a = ol.get(author_key) + if author_key == best_key: + for k, v in a.items(): + if 'date' in k: + m = re_number_dot.search(v) + if m: + need_update = True + v = v[:-len(m.group(1))] + if not isinstance(v, six.text_type): + continue + norm_v = norm(v) + if v == norm_v: + continue + a[k] = norm_v + need_update = True + else: + best = ol.get(best_key) + author_keys = set(k for k in a.keys() + best.keys() if k not in ('key', 'last_modified', 'type', 'id', 'revision')) + for k in author_keys: + if k not in best: + v = a[k] + if not isinstance(v, six.text_type): + continue + norm_v = norm(v) + if v == norm_v: + continue + a[k] = norm_v + need_update = True + continue + v = best[k] + if 'date' in k: + v = remove_trailing_number_dot(v) + if isinstance(v, six.text_type): + v = norm(v) + if k not in a or v != a[k]: + a[k] = v + need_update = True + if not need_update: + return + #print 'save(%s, %s)' % (author_key, repr(a)) + ol.save(author_key, a, 'merge authors') + +def has_image(key): + url = 'https://covers.openlibrary.org/a/query?olid=' + key[3:] + ret = urlread(url).strip() + return ret != '[]' + +def merge_authors(ol, keys, debug=False): +# print 'merge author %s:"%s" and %s:"%s"' % (author['key'], author['name'], merge_with['key'], merge_with['name']) +# print 'becomes: "%s"' % repr(new_name) + authors = [a for a in (withKey(k) for k in keys) if a['type']['key'] != '/type/redirect'] + not_redirect = set(a['key'] for a in authors) + if debug: + for a in authors: + print(a) + + assert all(a['type']['key'] == '/type/author' for a in authors) + name1 = authors[0]['name'] + for a in authors: + print(repr(a['key'], a['name'])) + assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:]) + + best_key = pick_best_author(authors)['key'] + + imgs = [a['key'] for a in authors 
if a['key'] != '/a/OL2688880A' and has_image(a['key'])] + if len(imgs) == 1: + new_key = imgs[0] + else: + new_key = "/a/OL%dA" % min(key_int(a) for a in authors) + # Molière and O. J. O. Ferreira + if len(imgs) != 0: + print('imgs:', imgs) + return # skip + if not (imgs == [u'/a/OL21848A', u'/a/OL4280680A'] \ + or imgs == [u'/a/OL325189A', u'/a/OL266422A'] \ + or imgs == [u'/a/OL5160945A', u'/a/OL5776228A']): + print(imgs) + assert len(imgs) == 0 + + print(new_key) + print(best_key) + + do_normalize(new_key, best_key, authors) + old_keys = set(k for k in keys if k != new_key) + print('old keys:', old_keys) + + for old in old_keys: + # /b/OL21291659M + switch_author(ol, old, new_key, old_keys, debug=True) + if old in not_redirect: + make_redirect(ol, old, new_key) + q = { 'authors': old, 'type': '/type/edition', } + if list(get_things(q)) != []: + switch_author(ol, old, new_key, old_keys, debug=True) + #l = list(query_iter(q)) + #print old, l + #assert l == [] + +if __name__ == '__main__': + sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + + rc = read_rc() + ol = OpenLibrary("http://openlibrary.org") + ol.login('EdwardBot', rc['EdwardBot']) + assert len(sys.argv) > 2 + merge_authors(ol, sys.argv[1:]) diff --git a/ia-legacy-importer/author/new.py b/ia-legacy-importer/author/new.py new file mode 100644 index 00000000..71a2eff9 --- /dev/null +++ b/ia-legacy-importer/author/new.py @@ -0,0 +1,21 @@ +from __future__ import print_function +from catalog.olwrite import Infogami +from catalog.read_rc import read_rc +import sys + +rc = read_rc() +infogami = Infogami(rc['infogami']) +infogami.login('EdwardBot', rc['EdwardBot']) + +name = sys.argv[1] + +q = { + 'create': 'unless_exists', + 'name': name, + 'personal_name': name, + 'entity_type': 'person', + 'key': infogami.new_key('/type/author'), + 'type': '/type/author', +} + +print(infogami.write(q, comment='create author')) diff --git a/ia-legacy-importer/author/noble.py b/ia-legacy-importer/author/noble.py new file mode 100644 index 00000000..5b1ac9e9 --- /dev/null +++ b/ia-legacy-importer/author/noble.py @@ -0,0 +1,71 @@ +# coding=utf-8 +from __future__ import print_function +from catalog.get_ia import read_marc_file +from catalog.read_rc import read_rc +from time import time +from catalog.marc.fast_parse import index_fields, get_tag_lines, get_first_tag, get_all_subfields +import web +import os +import os.path +import re +import sys + +titles = [ "Accolade", "Adi", "Aetheling", "Aga Khan", "Ajaw", "Ali'i", + "Allamah", "Altgrave", "Ammaveedu", "Anji", "Ryūkyū", "Archtreasurer", + "Aryamehr", "Atabeg", "Ban", "Baron", "Batonishvili", "Begum", "Bey", + "Boier", "Boyar", "Bulou", "Burgmann", "Buring Khan", "Caliph", + "Castellan", "Chakravatin", "Comte", "Conde", "Count", + "Count palatine", "Countess", "Crown prince", "Daula", + "Despot", "Doge", "Dowager", "Duchess of Rothesay", "Duke", "Earl", + "Edler", "Elector", "Elteber", "Emir", "Emperor", "Emperor-elect", + "Erbherr", "Feudal baron", "Fils de France", "Fraujaz", "Fürst", + "Grand duke", "Grand prince", "Grand Župan", "Grandee", "Haty-a", + "Hersir", "Hidalgo", "Highness", "Hold", "Hteik Tin", "Ichirgu-boil", + "Infante", "Jang", "Jarl", "Jonkheer", "Junker", "Kavkhan", "Khagan", + "Khagan Bek", "Khan", "Khanum", "Khatun", "Knight", "Knyaz", + "Kodaw-gyi", "Kralj", "Lady", "Lamido", "Landgrave", "Lendmann", + "Lord", "Madame Royale", "Magnate", "Maha Uparaja", + "Maha Uparaja Anaudrapa Ainshe Min", "Maharaja", "Maharajadhiraja", + "Maharana", "Maharao", "Maharaol", "Malik", "Margrave", 
"Marquess", + "Marquis de Bauffremont", "Marquise", "Mepe-Mepeta", "Mesne lord", + "Mian", "Min Ye", "Min-nyi Min-tha", "Mir", "Mirza", "Monsieur", "Mormaer", "Morza", "Mwami", "Naib", "Nawab", "Nayak", "Negus", "Nobile", "Obalumo", "Orangun", "Aftab", "Ottoman", "Padishah", "Paigah", "Hyderabad", "Paladin", "Palaiyakkarar", "Palatine", "Panapillai Amma", "Paramount Ruler", "Pasha", "Patricianship", "Pharaoh", "Piast dynasty", "Prescriptive barony", "Prince", "Prince du Sang", "Prince-Bishop", "Princely Highness", "Princeps", "Princess", "Principalía", "Privy chamber", "Rai", "Raja", "Rajah Muda of Sarawak", "Rajus", "Rana", "Rao Raja", "Ratu", "Ridder", "Ro", "Roko", "Sado Min", "Sahib", "Samanta", "Sawai Maharaja", "Shah", "Shahzada", "Shamkhal", "Shanyu", "Shwe Kodaw-gyi", "Shwe Kodaw-gyi Awratha", "Shwe Kodaw-gyi Rajaputra", "Sidi", "Sir", "Sultan", "Sunan", "Susuhunan", "Szlachta", "Tenant-in-chief", "Thakur", "Thampi", "Tsar", "Tsarevitch", "Tu'i", "Ueekata", "Uparaja", "Uparat", "Viceroy", "Victory", "Vidame", "Viscount", "Vizier", "Wazirzada", "Yang di-Pertuan Besar", "Zamindar", "Zeman", "Župa"] + +rc = read_rc() +web.config.db_parameters = dict(dbn='postgres', db='ol_merge', user=rc['user'], pw=rc['pw'], host=rc['host']) +web.config.db_printing = False +web.load() + +def sources(): + return ((i.id, i.archive_id, i.name) for i in web.select('marc_source')) + +def process_record(pos, loc, data): + for tag in '100', '700': + line = get_first_tag(data, set([tag])) + if line: + fields = list(get_all_subfields(line)) + if any(k == 'c' for k, v in fields): + print((loc, fields)) + +def files(ia): + endings = ['.mrc', '.marc', '.out', '.dat', '.records.utf8'] + def good(filename): + return any(filename.endswith(e) for e in endings) + + dir = rc['marc_path'] + ia + dir_len = len(dir) + 1 + files = [] + for dirpath, dirnames, filenames in os.walk(dir): + files.extend(dirpath + "/" + f for f in sorted(filenames)) + return [(i[dir_len:], os.path.getsize(i)) for i in files if good(i)] + +rec_no = 0 + +for source_id, ia, name in sources(): + for part, size in files(ia): + full_part = ia + "/" + part + filename = rc['marc_path'] + full_part + assert os.path.exists(filename) + f = open(filename) + for pos, loc, data in read_marc_file(full_part, f): + rec_no +=1 + process_record(pos, loc, data) diff --git a/ia-legacy-importer/author/rename.py b/ia-legacy-importer/author/rename.py new file mode 100755 index 00000000..24240575 --- /dev/null +++ b/ia-legacy-importer/author/rename.py @@ -0,0 +1,217 @@ +#!/usr/bin/python + +from __future__ import print_function +import web +import re +import sys +import codecs + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +web.load() + +from infogami.infobase.infobase import Infobase +import infogami.infobase.writequery as writequery +site = Infobase().get_site('openlibrary.org') + +re_marc_name = re.compile('^(.*), (.*)$') +re_end_dot = re.compile('[^ ][^ ]\.$', re.UNICODE) +re_odd_dot = re.compile('[^ ][^ ]\. 
', re.UNICODE) +re_initial_then_dot = re.compile(r'\b[A-Z]\.') + +def find_by_statements(author_key): + q = { + 'authors': author_key, + 'type': '/type/edition', + } + by = [] + for key in site.things(q): + try: + by.append(site.withKey(key).by_statement.value) + except AttributeError: + pass + return by + +def east_in_by_statement(name, flipped, by_statements): + assert name.find(', ') != -1 + name = name.replace('.', '') + name = name.replace(', ', ' ') + if name == flipped.replace('.', ''): + return False + for by in by_statements: + if by.find(name) != -1: + return True + return False + +def get_type_id(type): + w = "key='" + type + "' and site_id=1" + return web.select('thing', what='id', where=w)[0].id + +author_type_id = get_type_id('/type/author') + +def get_thing(id): + sql = "select key, value from datum where thing_id=%d and end_revision=2147483647 and key != 'type'" % id + iter = web.query(sql) + thing = {} + for row in iter: + thing[row.key] = row.value + return thing + +def get_author_by_name(name): + sql = "select id from thing, datum where thing.type=$type and thing.id=thing_id and datum.key='name' and datum.value=$name and datum.datatype=2 and datum.end_revision=2147483647" + iter = web.query(sql, vars={'name': name, 'type': author_type_id}) + return [row.id for row in iter] + +def flip_name(name): + # strip end dots like this: "Smith, John." but not like this: "Smith, J." + m = re_end_dot.search(name) + if m: + name = name[:-1] + + m = re_marc_name.match(name) + return m.group(2) + ' ' + m.group(1) + +def pick_name(a, b, flipped): + if re_initial_then_dot.search(a): + return flipped + else: + return b + +east_list = [line[:-1].lower() for line in open("east")] +east = frozenset(east_list + [flip_name(i) for i in east_list]) + +def author_dates_match(a, b): + for k in ['birth_date', 'death_date', 'date']: + if k in a and k in b and a[k] != b[k]: + return False + return True + +def get_other_authors(name): + other = get_author_by_name(name) + if name.find('.') != -1: + name = name.replace('.', '') + other.extend(get_author_by_name(name)) + return other + +def key_int(rec): + return int(web.numify(rec['key'])) + +def switch_author(old, new): + q = { 'authors': old['key'], 'type': '/type/edition', } + for key in site.things(q): + edition = site.withKey(key) + authors = [] + for author in edition.authors: + if author.key == old['key']: + author_key = new['key'] + else: + author_key = author.key + authors.append({ 'key': author_key }) + + q = { + 'key': key, + 'authors': { 'connect': 'update_list', 'value': authors } + } + site.write(q, comment='fix author name') + +def make_redirect(old, new): + q = { + 'key': old['key'], + 'location': {'connect': 'update', 'value': new['key'] }, + 'type': {'connect': 'update', 'value': '/type/redirect' }, + } + for k in old.iterkeys(): + if k != 'key': + q[str(k)] = { 'connect': 'update', 'value': None } + print(site.write(q, comment='replace with redirect')) + +def copy_fields(from_author, to_author, name): + new_fields = { 'name': name, 'personal_name': name } + for k, v in from_author.iteritems(): + if k in ('name', 'key'): + continue + if k in author: + assert v == to_author[k] + else: + new_fields[k] = v + return new_fields + +def update_author(key, new): + q = { 'key': key, } + for k, v in new.iteritems(): + q[k] = { 'connect': 'update', 'value': v } + print(site.write(q, comment='fix author name')) + +def merge_authors(author, merge_with, name): + print('merge author %s:"%s" and %s:"%s"' % (author['key'], author['name'], 
merge_with['key'], merge_with['name'])) + new_name = pick_name(author['name'], merge_with['name'], name) + print('becomes: "%s"' % new_name) + if key_int(author) < key_int(merge_with): + new_key = author['key'] + print("copy fields from merge_with to", new_key) + new = copy_fields(merge_with, author, new_name) + update_author(new_key, new) + switch_author(merge_with, author) +# print "delete merge_with" + make_redirect(merge_with, author) + else: + new_key = merge_with['key'] + print("copy fields from author to", new_key) + new = copy_fields(merge_with, author, new_name) + update_author(new_key, new) + switch_author(author, merge_with) +# print "delete author" + make_redirect(author, merge_with) + print() + +print('running query') +# limit for test runs +for thing_row in web.select('thing', what='id, key', where='type='+repr(author_type_id), limit=10000): + id = thing_row.id + author = get_thing(id) + + if 'personal_name' not in author \ + or author['personal_name'] != author['name']: + continue + if author['name'].find(', ') == -1: + continue + if author['name'].lower().replace('.', '') in east: + continue + + key = author['key'] + name = flip_name(author['name']) + other = get_other_authors(name) + if len(other) == 0 and not re_odd_dot.search(author['name']): + by_statements = find_by_statements(author['key']) + print(author['name'], "by:", ', '.join('"%s"' % i for i in by_statements)) + if east_in_by_statement(author['name'], name, by_statements): + print("east in by statement") + continue + print("rename %s to %s" % (repr(author['name']), repr(name))) + q = { + 'key': key, + 'name': { 'connect': 'update', 'value': name}, + 'personal_name': { 'connect': 'update', 'value': name}, + } + print(repr(q)) + continue + + if len(other) != 1: +# print "other length:", other + continue + # don't merge authors when more than one like "Smith, John" + if len(get_author_by_name(author['name'])) > 1: +# print "found more authors with same name" + continue + + merge_with = get_thing(other[0]) + if not author_dates_match(author, merge_with): + print("date mismatch") + continue + by_statements = find_by_statements(author['key']) + print(author['name'], "by:", ', '.join('"%s"' % i for i in by_statements)) + if east_in_by_statement(author['name'], name, by_statements): + print("east in by statement") + print() + continue + merge_authors(author, merge_with, name) diff --git a/ia-legacy-importer/author/utils.py b/ia-legacy-importer/author/utils.py new file mode 100644 index 00000000..b8534738 --- /dev/null +++ b/ia-legacy-importer/author/utils.py @@ -0,0 +1,51 @@ +import re + +re_marc_name = re.compile('^(.*), (.*)$') +re_initial_then_dot = re.compile(r'\b[A-Z]\.') + +def flip_name(name): + m = re_marc_name.match(name) + return m.group(2) + ' ' + m.group(1) + +def pick_name(a, b): + if re_initial_then_dot.search(a): + return flip_name(a) + else: + return b + +def east_in_by_statement(name, by_statements): + assert name.find(', ') != -1 + name = name.replace('.', '') + flipped = flip_name(name) + name = name.replace(', ', ' ') + if name == flipped: + return False + for by in by_statements: + if by.find(name) != -1: + return True + return False + +def test_merge(): + data = [ + (u'Hood, Christopher', u'Christopher Hood', u'Christopher Hood'), + (u'Pawsey, Margaret M.', u'Margaret M Pawsey', u'Margaret M. Pawsey'), + (u'Elchardus, M.', u'M Elchardus', u'M. 
Elchardus'), + (u'Hayes, Mike.', u'Mike Hayes', u'Mike Hayes'), + (u'Krause, Rainer.', u'Rainer Krause', u'Rainer Krause'), + (u'Hoffmann, Manfred.', u'Manfred Hoffmann', u'Manfred Hoffmann'), + (u'Masson, Veneta.', u'Veneta Masson', u'Veneta Masson'), + (u'Baker, Ernest.', u'Ernest Baker', u'Ernest Baker'), + (u'Hooper, James.', u'James Hooper', u'James Hooper'), + (u'Bront\xeb, Charlotte', u'Charlotte Bront\xeb', u'Charlotte Bront\xeb'), + (u'Nichols, Francis Henry', u'Francis Henry Nichols', u'Francis Henry Nichols'), + (u'Becker, Bernd', u'Bernd Becker', u'Bernd Becker'), + (u'Sadleir, Richard.', u'Richard Sadleir', u'Richard Sadleir'), + ] + for a, b, want in data: + assert pick_name(a, b) == want + + assert east_in_by_statement("Wang, Qi", ["Wang Qi."]) + assert not east_in_by_statement("Walker, Charles L.",\ + ["edited by A. Karl Larson and Katharine Miles Larson."]) + assert not east_in_by_statement("Luoma, Gary A.", ["Gary A. Luoma"]) + assert not east_in_by_statement("Tan, Tan", ["Tan Tan zhu.", "Tan Tan zhu.", "Tan Tan ; [cha tu Li Ruguang ; ze ren bian ji Wang Zhengxiang]."]) diff --git a/ia-legacy-importer/author/web_merge.py b/ia-legacy-importer/author/web_merge.py new file mode 100644 index 00000000..7cb203bf --- /dev/null +++ b/ia-legacy-importer/author/web_merge.py @@ -0,0 +1,48 @@ +from __future__ import print_function +import web +from catalog.db_read import withKey +from pprint import pformat + +urls = ( + '/', 'index' +) + +base = 'http://openlibrary.org' + +class index: + def GET(self): + web.header('Content-Type','text/html; charset=utf-8', unique=True) + input = web.input() + print("\nAuthor merge") + print("

    Author merge

    ") + print('
    ') + print('') + print('') + author = {} + for field in ('a', 'b'): + print('') + print('') + print('') + if 'a' in author and 'b' in author: + a = author['a'] + b = author['b'] + keys = [withKey(prop['key'])['name'] for prop in withKey('/type/author')['properties']] + for k in keys: + if k in a or k in b: + print('' % \ + (k, a.get(k, ''), b.get(k, ''))) + print('
    Authors') + if field in input: + key = input[field] + if key.startswith(base): + key = key[len(base):] + author[field] = withKey(key) + print('' % (field, key)) + else: + print('' % field) + print('
    %s%s%s
    ') + print("") + +web.webapi.internalerror = web.debugerror + +if __name__ == "__main__": web.run(urls, globals(), web.reloader) diff --git a/ia-legacy-importer/author/web_merge2.py b/ia-legacy-importer/author/web_merge2.py new file mode 100644 index 00000000..482dca4a --- /dev/null +++ b/ia-legacy-importer/author/web_merge2.py @@ -0,0 +1,100 @@ +import web +import re +import simplejson as json +from pprint import pformat + +from catalog.utils.query import query_iter + +from six.moves.urllib.request import urlopen + + +urls = ( + '/', 'index' +) +app = web.application(urls, globals()) + +re_year = re.compile('^(\d+)[,.*]+$') + +def result_table(data, birth, death, order): + html = ' %d results' % len(data) + l = [] + def clean(i, default, field): + if field not in i: + return default + if i[field] is None: + return '' + m = re_year.match(i[field]) + return m.group(1) if m else i[field] + + data = [ + { + 'key': i['key'], + 'name': i['name'], + 'birth': clean(i, birth, 'birth_date'), + 'death': clean(i, death, 'death_date'), + } for i in data] + + base_url = web.htmlquote("?birth=%s&death=%s&order=" % (web.urlquote(birth), web.urlquote(death))) + html += '' + html += '
    Name' + if birth: + html += 'birth' + else: + html += 'birth' + if death: + html += 'death' + else: + html += 'death' + html += '' + if order: + data = sorted(data, key=lambda i:i[order]) + for i in data: + html += '%s%s%s' % (i['key'], web.htmlquote(i['name']), i['birth'], i['death']) + return '' + html + '
    ' + +def get_all(url): + all = [] + offset = 0 + limit = 500 + while True: + ret = json.load(urlopen(url + "&limit=%d&offset=%d" % (limit, offset))) + if not ret: + return all + all += ret + if len(all) >= 1000: + return all + offset += limit + +class index: + def GET(self): + input = web.input() + birth = input.get('birth', '').strip() + death = input.get('death', '').strip() + order = input.get('order', '').strip() + if order not in ('', 'name', 'birth', 'death'): + order = '' + html = ''' + + + +Merge author + + + +''' + html += '\n' + html += 'Birth: \n' % web.htmlquote(birth) + html += 'Death: \n' % web.htmlquote(death) + html += '\n' + + if birth or death: + url = 'http://openlibrary.org/query.json?type=/type/author&birth_date=%s&death_date=%s&name=' % (web.urlquote(birth), web.urlquote(death)) + data = get_all(url) + html += result_table(data, birth, death, order) + return html + '\n' + +if __name__ == "__main__": + app.run() diff --git a/ia-legacy-importer/crawl/catalogue.nla.gov.au/crawl.py b/ia-legacy-importer/crawl/catalogue.nla.gov.au/crawl.py new file mode 100644 index 00000000..ee187057 --- /dev/null +++ b/ia-legacy-importer/crawl/catalogue.nla.gov.au/crawl.py @@ -0,0 +1,107 @@ +from __future__ import print_function +import re +from os.path import exists + +from six.moves.urllib.request import urlopen + + +# crawl catalogue.nla.gov.au + +re_th = re.compile('^(\d{3})$', re.I) +re_td = re.compile('^(.*)$') +re_span = re.compile('\|(.|&(?:gt|lt|amp);)(.*?)') + +trans = dict(lt='<', gt='>', amp='&') + +def read_row(tag, row): + assert len(row) == 3 + if tag[0:2] == '00': + assert all(i == '' for i in row[0:1]) + return (tag, row[2]) + else: + end = 0 + subfields = [] + while end != len(row[2]): + m = re_span.match(row[2], end) + end = m.end() + (k, v) = m.groups() + if len(k) != 1: + k = trans[k[1:-1]] + subfields.append((k, v)) + assert all(len(i) == 1 for i in row[0:1]) + return (tag, row[0], row[1], subfields) + +def extract_marc(f): + expect = 'table' + col = 0 + row = [] + lines = [] + for line in f: # state machine + if expect == 'table': + if '' in line: + expect = 'tr' + continue + if expect == 'tr': + if line.startswith('
    '): + break + assert line.startswith('') + expect = 'th' + continue + if expect == 'th': + m = re_th.match(line) + assert m + tag = m.group(1) + expect = 'td' + continue + if expect == 'td': + if line.startswith(''): + lines.append(read_row(tag, row)) + tag = None + row = [] + expect = 'tr' + continue + if line == '\n': + expect = 'span' + continue + m = re_td.match(line) + row.append(m.group(1)) + continue + if expect == 'span': + row.append(line[:-1]) + expect = '/td' + continue + if expect == '/td': + assert line == '\n' + expect = 'td' + continue + return lines + +i = 1 +while True: + i+=1 + filename = 'marc/%d' % i + if exists(filename): + continue + print(i, end=' ') + url = 'http://catalogue.nla.gov.au/Record/%d/Details' % i + web_input = None + for attempt in range(5): + try: + web_input = urlopen(url) + break + except: + pass + if not web_input: + break + + out = open('marc/%d' % i, 'w') + try: + marc = extract_marc(web_input) + except: + print(url) + raise + print(len(marc)) + for line in marc: + print(line, file=out) + out.close() + #sleep(0.5) diff --git a/ia-legacy-importer/dup/authors.py b/ia-legacy-importer/dup/authors.py new file mode 100644 index 00000000..657d5680 --- /dev/null +++ b/ia-legacy-importer/dup/authors.py @@ -0,0 +1,121 @@ +from __future__ import print_function +from catalog.infostore import get_site +from catalog.read_rc import read_rc +import web +import sys +import codecs +import os.path +import re +from catalog.olwrite import Infogami +site = get_site() + +import psycopg2 +rc = read_rc() +infogami = Infogami(rc['infogami']) +infogami.login('EdwardBot', rc['EdwardBot']) + +re_marc_name = re.compile('^(.*), (.*)$') +re_end_dot = re.compile('[^ ][^ ]\.$', re.UNICODE) + +out = open('author_replace3', 'w') + +# find books with matching ISBN and fix them to use better author record + +def flip_name(name): + # strip end dots like this: "Smith, John." but not like this: "Smith, J." 
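+    # e.g. "Smith, John." becomes "John Smith", while "Smith, J." keeps its dot and becomes "J. Smith"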
+ m = re_end_dot.search(name) + if m: + name = name[:-1] + + m = re_marc_name.match(name) + return m.group(2) + ' ' + m.group(1) + +conn = psycopg2.connect("dbname='%s' user='%s' host='%s' password='%s'" \ + % ('ol_merge', rc['user'], rc['host'], rc['pw'])); +cur = conn.cursor() + +author_fields = ('key', 'name', 'title', 'birth_date', 'death_date', 'personal_name') + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +for line in open('dups'): + isbn, num = eval(line) + if isbn < '0273314165': + continue + cur.execute('select key from isbn where value=%(v)s', {'v':isbn}) + found = [] + names = {} + for i in cur.fetchall(): + key = i[0] + e = site.withKey(key) + author_list = e.authors or [] + authors = [dict(((k, v) for k, v in a._get_data().items() if k in author_fields)) for a in author_list if a] + for a in authors: + if 'name' not in a: + continue + name = a['name'] + if name.find(', ') != -1: + name = flip_name(name) + a2 = a.copy() + a2['edition'] = key + names.setdefault(name, []).append(a2) + found.append((key, authors)) + if len([1 for k, a in found if a]) < 2: + continue + if not any(any('birth_date' in j or 'death_date' in j for j in i[1]) for i in found): + continue + names = dict((k, v) for k, v in names.iteritems() if len(set(i['key'] for i in v)) > 1) + if not names: + continue + author_replace = {} + for name, authors in names.items(): + seen = set() +# print 'birth:', [a['birth_date'].strip('.') for a in authors if 'birth_date' in a] +# print 'death:', [a['death_date'].strip('.') for a in authors if 'death_date' in a] + with_dates = None + no_dates = [] + for a in authors: + if a['key'] in seen: + continue + seen.add(a['key']) + if 'birth_date' in a or 'death_date' in a: + if with_dates: + with_dates = None + break + with_dates = a['key'] + continue + no_dates.append(a['key']) + if with_dates and no_dates: + for i in no_dates: + assert i not in author_replace + author_replace[i] = with_dates + if not author_replace: + continue + print(isbn, author_replace) + for key, authors in found: + replace = [a['key'] for a in authors if a['key'] in author_replace] + if len(replace) == 0: + continue +# print len(replace), key, [a['key'] for a in authors] + new_authors = [] + this = {} + for a in authors: + akey = a['key'] + if akey in author_replace: + this[akey] = author_replace[akey] + akey = author_replace[akey] + if akey not in new_authors: + new_authors.append(akey) + q = { + 'key': key, + 'authors': { 'connect': 'update_list', 'value': new_authors } + } + print((key, this), file=out) +# for k in author_replace.keys(): +# print k, len(site.things({'type': '/type/edition', 'authors': k})) + +# for name, v in names.items(): +# print name +# for edition, author in v: +# print author, site.things({'type': '/type/edition', 'authors': author}) +# print +out.close() diff --git a/ia-legacy-importer/dup/find.py b/ia-legacy-importer/dup/find.py new file mode 100644 index 00000000..8b227e86 --- /dev/null +++ b/ia-legacy-importer/dup/find.py @@ -0,0 +1,76 @@ +from __future__ import print_function +import web +import sys +import codecs +import os.path +from catalog.read_rc import read_rc +import psycopg2 +from catalog.infostore import get_site +from catalog.merge.merge_marc import attempt_merge, build_marc +import catalog.marc.fast_parse as fast_parse + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +# need to use multiple databases +# use psycopg2 to until open library is upgraded to web 3.0 + +rc = read_rc() +threshold = 875 + +conn = psycopg2.connect("dbname='%s' user='%s' 
host='%s' password='%s'" \ + % ('ol_merge', rc['user'], rc['host'], rc['pw'])); +cur1 = conn.cursor() +cur2 = conn.cursor() + +site = get_site() + +marc_path = '/2/pharos/marc/' + +def get_marc(loc): + try: + filename, p, l = loc.split(':') + except ValueError: + return None + if not os.path.exists(marc_path + filename): + return None + f = open(marc_path + filename) + f.seek(int(p)) + buf = f.read(int(l)) + f.close() + rec = fast_parse.read_edition(buf) + if rec: + return build_marc(rec) + +for line in open('dups'): + v, num = eval(line) + cur2.execute('select key from isbn where value=%(v)s', {'v':v}) + editions = [] + for i in cur2.fetchall(): + key = i[0] + t = site.withKey(key) + mc = site.versions({'key': key})[0].machine_comment + editions.append({'key': key, 'title': t.title, 'loc': mc}) + if len(editions) != 2: + continue + if any(not i['loc'] or i['loc'].startswith('amazon:') for i in editions): + continue + e1 = get_marc(editions[0]['loc']) + if not e1: + continue + e2 = get_marc(editions[1]['loc']) + if not e2: + continue + +# print v, [i['title'] for i in editions] +# print e1 +# print e2 + match = attempt_merge(e1, e2, threshold, debug=False) + if match: + print(tuple([v] + [i['key'] for i in editions])) + +sys.exit(0) +cur1.execute('select value, count(*) as num from isbn group by value having count(*) > 1') +for i in cur1.fetchall(): + print(i) + cur2.execute('select key from isbn where value=%(v)s', {'v':i[0]}) + print(cur2.fetchall()) diff --git a/ia-legacy-importer/edition_merge/find_dups.py b/ia-legacy-importer/edition_merge/find_dups.py new file mode 100644 index 00000000..e221f2f8 --- /dev/null +++ b/ia-legacy-importer/edition_merge/find_dups.py @@ -0,0 +1,40 @@ +#!/usr/bin/python + +from __future__ import print_function +from openlibrary.api import OpenLibrary +from subprocess import Popen, PIPE +import MySQLdb + +ia_db_host = 'dbmeta.us.archive.org' +ia_db_user = 'archive' +ia_db_pass = Popen(["/opt/.petabox/dbserver"], stdout=PIPE).communicate()[0] + +ol = OpenLibrary('http://openlibrary.org/') + +local_db = MySQLdb.connect(db='merge_editions') +local_cur = local_db.cursor() + +archive_db = MySQLdb.connect(host=ia_db_host, user=ia_db_user, \ + passwd=ia_db_pass, db='archive') +archive_cur = archive_db.cursor() + +fields = ['identifier', 'updated', 'collection'] +sql_fields = ', '.join(fields) + +archive_cur.execute("select " + sql_fields + \ + " from metadata" + \ + " where scanner is not null and mediatype='texts'" + \ + " and (not curatestate='dark' or curatestate is null)" + \ + " and collection is not null and boxid is not null and identifier not like 'zdanh_test%' and scandate is not null " + \ + " order by updated") + +for num, (ia, updated, collection) in enumerate(archive_cur.fetchall()): + if 'lending' not in collection and 'inlibrary' not in collection: + continue + q = {'type': '/type/edition', 'ocaid': ia} + editions = set(str(i) for i in ol.query(q)) + q = {'type': '/type/edition', 'source_records': 'ia:' + ia} + editions.update(str(i) for i in ol.query(q)) + if len(editions) > 1: + print((ia, list(editions))) + local_cur.execute('replace into merge (ia, editions) values (%s, %s)', [ia, ' '.join(editions)]) diff --git a/ia-legacy-importer/edition_merge/find_easy.py b/ia-legacy-importer/edition_merge/find_easy.py new file mode 100644 index 00000000..5cbe5989 --- /dev/null +++ b/ia-legacy-importer/edition_merge/find_easy.py @@ -0,0 +1,191 @@ +from __future__ import print_function +import MySQLdb +import datetime +import re +import sys 
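+# make the openlibrary package importable from a local source checkout
+# (the path below is machine-specific). This script recomputes
+# merge.unmerge_count, the number of fields that cannot be combined
+# automatically for each IA item, so that run_merge.py can later pick up
+# the rows whose count reaches zero without manual intervention.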
+sys.path.append('/1/src/openlibrary') +from openlibrary.api import OpenLibrary, Reference +from collections import defaultdict + +re_edition_key = re.compile('^/books/OL(\d+)M$') +re_nonword = re.compile(r'\W', re.U) +re_edition = re.compile(' ed edition$') + +ol = OpenLibrary('http://openlibrary.org/') + +conn = MySQLdb.connect(db='merge_editions') +cur = conn.cursor() + +skip = 'guineapigscomple00elwa' +skip = None +total = 5601 +cur.execute("select ia, editions, done, unmerge_count from merge where unmerge_count != 0") # and ia='hantayo00hillrich'") +unmerge_field_counts = defaultdict(int) +num = 0 +for ia, ekeys, done, unmerge_count in cur.fetchall(): +# if unmerge_count == 0: +# continue + num += 1 + if num % 100 == 0: + print('%d/%d %.2f%%' % (num, total, ((float(num) * 100) / total)), ia) + if skip: + if skip == ia: + skip = None + continue + ekeys = ['/books/OL%dM' % x for x in sorted(int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))] + min_ekey = ekeys[0] + + if len(ekeys) > 3: + print(ia, ekeys) + editions = [ol.get(ekey) for ekey in ekeys] + all_keys = set() + for e in editions: + for k in 'classifications', 'identifiers', 'table_of_contents': + if k in e and not e[k]: + del e[k] + for e in editions: + all_keys.update(e.keys()) + for k in 'latest_revision', 'revision', 'created', 'last_modified', 'key', 'type', 'genres': + if k in all_keys: + all_keys.remove(k) + + for k in all_keys.copy(): + if k.startswith('subject'): + all_keys.remove(k) + + for e in editions: # resolve redirects + if 'authors' not in e: + continue + new_authors = [] + for akey in e['authors']: + a = ol.get(akey) + if a['type'] == Reference('/type/redirect'): + akey = Reference(a['location']) + else: + assert a['type'] == Reference('/type/author') + new_authors.append(akey) + e['authors'] = new_authors + + merged = {} + k = 'publish_date' + publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4) + + k = 'pagination' + all_pagination = set(e[k].strip(':.') for e in editions if e.get(k)) + + one_item_lists = {} + for k in 'lc_classifications', 'publishers', 'contributions', 'series', 'authors': + one_item_lists[k] = set(e[k][0].strip('.') for e in editions if e.get(k) and len(set(e[k])) == 1) + + for k in 'source_records', 'ia_box_id': + merged[k] = [] + for e in editions: + for sr in e.get(k, []): + if sr not in merged[k]: + merged[k].append(sr) + + for k in ['other_titles', 'isbn_10', 'series', 'oclc_numbers', 'publishers']: + if k not in all_keys: + continue + merged[k] = [] + for e in editions: + for sr in e.get(k, []): + if sr not in merged[k]: + merged[k].append(sr) + + k = 'ocaid' + for e in editions: + if e.get(k) and 'ia:' + e[k] not in merged['source_records']: + merged['source_records'].append(e[k]) + + k = 'identifiers' + if k in all_keys: + merged[k] = {} + for e in editions: + if k not in e: + continue + for a, b in e[k].items(): + for c in b: + if c in merged[k].setdefault(a, []): + continue + merged[k][a].append(c) + + any_publish_country = False + k = 'publish_country' + if k in all_keys: + for e in editions: + if e.get(k) and not e[k].strip().startswith('xx'): + any_publish_country = True + + skip_fields = set(['source_records', 'ia_box_id', 'identifiers', 'ocaid', 'other_titles', 'series', 'isbn_10']) + for k in all_keys: + if k in skip_fields: + continue + + uniq = defaultdict(list) + for num, e in enumerate(editions): + if e.get(k): + if k == 'publish_date' and len(e[k]) == 4 and e[k].isdigit and any(e[k] in pd for pd in publish_dates): + continue + if 
k == 'pagination' and any(len(i) > len(e[k].strip('.:')) and e[k].strip('.:') in i for i in all_pagination): + continue + if k in one_item_lists and len(set(e.get(k, []))) == 1 and any(len(i) > len(e[k][0].strip('.')) and e[k][0].strip('.') in i for i in one_item_lists[k]): + continue + if k == 'publish_country' and any_publish_country and e.get(k, '').strip().startswith('xx'): + continue + if k == 'edition_name' and e[k].endswith(' ed edition'): + e[k] = e[k][:-len(' edition')] + uniq[re_nonword.sub('', repr(e[k]).lower())].append(num) + + if len(uniq) == 1: + merged[k] = uniq.keys()[0] + merged[k] = editions[uniq.values()[0][0]][k] + continue + + if k == 'covers': + assert all(isinstance(e[k], list) for e in editions if k in e) + covers = set() + for e in editions: + if k in e: + covers.update(c for c in e[k] if c != -1) + merged['covers'] = sorted(covers) + continue + + if k == 'notes': + merged['notes'] = '' + for e in editions: + if e.get('notes'): + merged['notes'] += e['notes'] + '\n' + continue + + if k == 'ocaid': + for e in editions: + if e.get('ocaid'): + if e['ocaid'].endswith('goog'): + print(e['key'], e['ocaid'], ia) + merged['ocaid'] = e['ocaid'] + break + assert merged['ocaid'] + continue + + if k =='authors': + min_author = set(min((e.get('authors', []) for e in editions), key=len)) + if all(min_author <= set(e.get('authors', [])) for e in editions): + merged[k] = max((e.get('authors', []) for e in editions), key=len) + continue + merged[k] = None + unmerged = len([1 for v in merged.values() if v is None]) + if unmerged == 1: + assert len([k for k, v in merged.items() if v is None]) == 1 + for k, v in merged.items(): + if v is None: + if k == 'series': + print(ia, [e[k] for e in editions if e.get(k)]) + unmerge_field_counts[k] += 1 + #print 'unmerged count', unmerged, ia, ekeys + cur.execute('update merge set unmerge_count=%s where ia=%s', [unmerged, ia]) + +print(dict(unmerge_field_counts)) +print(unmerge_field_counts.items()) +for k,v in sorted(unmerge_field_counts.items(), key=lambda i:i[1]): + print('%30s: %d' % (k, v)) diff --git a/ia-legacy-importer/edition_merge/merge.py b/ia-legacy-importer/edition_merge/merge.py new file mode 100644 index 00000000..688b1130 --- /dev/null +++ b/ia-legacy-importer/edition_merge/merge.py @@ -0,0 +1,247 @@ +#!/usr/bin/python + +import MySQLdb +import datetime +import re +import sys +sys.path.append('/1/src/openlibrary') +from openlibrary.api import OpenLibrary, Reference +from flask import Flask, render_template, request, flash, redirect, url_for, g +from collections import defaultdict +app = Flask(__name__) + +re_edition_key = re.compile('^/books/OL(\d+)M$') + +ol = OpenLibrary('http://openlibrary.org/') +ol.login('EdwardBot', 'As1Wae9b') + +@app.before_request +def before_request(): + g.db = MySQLdb.connect(db='merge_editions') + +@app.after_request +def after_request(r): + g.db.close() + return r + +re_nonword = re.compile(r'\W', re.U) + +rows = 200 + +app.secret_key = 'rt9%s#)5kid$!u*5_@*$f2f_%jq++nl3@d%=7f%v4&78^m4p7c' + +@app.route("/") +def index(): + page = int(request.args.get('page', 1)) + cur = g.db.cursor() + cur.execute('select count(*) from merge where done is null') + total = cur.fetchone()[0] + cur.execute('select count(*) from merge where done is null and unmerge_count = 0') + easy = cur.fetchone()[0] + cur.execute('select ia, editions, unmerge_count from merge where done is null limit %s offset %s', [rows, (page-1) * rows]) + reply = cur.fetchall() + return render_template('index.html', merge_list=reply, total=total, 
rows=rows, page=page, easy=easy ) + +def run_merge(ia): + cur = g.db.cursor() + cur.execute('select editions from merge where ia=%s', ia) + [ekeys] = cur.fetchone() + ekeys = ['/books/OL%dM' % x for x in sorted(int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))] + min_ekey = ekeys[0] + + editions = [ol.get(ekey) for ekey in ekeys] + editions_by_key = dict((e['key'][7:], e) for e in editions) + merged = build_merged(editions) + + missing = [] + for k, v in merged.items(): + if v is not None: + continue + use_ekey = request.form.get(k) + if use_ekey is None: + missing.append(k) + continue + merged[k] = editions_by_key[use_ekey][k] + if missing: + flash('please select: ' + ', '.join(missing)) + return redirect(url_for('merge', ia=ia)) + + master = ol.get(min_ekey) + for k, v in merged.items(): + master[k] = v + + updates = [] + updates.append(master) + for ekey in ekeys: + if ekey == min_ekey: + continue + ol_redirect = { + 'type': Reference('/type/redirect'), + 'location': min_ekey, + 'key': ekey, + } + updates.append(ol_redirect) + #print len(updates), min_ekey + try: + ol.save_many(updates, 'merge lending editions') + except: + #for i in updates: + # print i + raise + cur.execute('update merge set done=now() where ia=%s', [ia]) + + flash(ia + ' merged') + return redirect(url_for('index')) + +def build_merged(editions): + all_keys = set() + + for e in editions: + for k in 'classifications', 'identifiers': + if k in e and not e[k]: + del e[k] + + for e in editions: + all_keys.update(e.keys()) + + for k in 'latest_revision', 'revision', 'created', 'last_modified', 'key', 'type', 'genres': + if k in all_keys: + all_keys.remove(k) + + for k in all_keys.copy(): + if k.startswith('subject'): + all_keys.remove(k) + + merged = {} + k = 'publish_date' + publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4) + + k = 'pagination' + all_pagination = set(e[k] for e in editions if e.get(k)) + + one_item_lists = {} + for k in 'lc_classifications', 'publishers', 'contributions', 'series': + one_item_lists[k] = set(e[k][0].strip('.') for e in editions if e.get(k) and len(set(e[k])) == 1) + + for k in 'source_records', 'ia_box_id': + merged[k] = [] + for e in editions: + for sr in e.get(k, []): + if sr not in merged[k]: + merged[k].append(sr) + + for k in ['other_titles', 'isbn_10', 'series']: + if k not in all_keys: + continue + merged[k] = [] + for e in editions: + for sr in e.get(k, []): + if sr not in merged[k]: + merged[k].append(sr) + + + k = 'ocaid' + for e in editions: + if e.get(k) and 'ia:' + e[k] not in merged['source_records']: + merged['source_records'].append(e[k]) + + k = 'identifiers' + if k in all_keys: + merged[k] = {} + for e in editions: + if k not in e: + continue + for a, b in e[k].items(): + for c in b: + if c in merged[k].setdefault(a, []): + continue + merged[k][a].append(c) + + any_publish_country = False + k = 'publish_country' + if k in all_keys: + for e in editions: + if e.get(k) and not e[k].strip().startswith('xx'): + any_publish_country = True + + for k in all_keys: + if k in ('source_records', 'ia_box_id', 'identifiers'): + continue + + uniq = defaultdict(list) + for num, e in enumerate(editions): + if e.get(k): + if k == 'publish_date' and len(e[k]) == 4 and e[k].isdigit and any(e[k] in pd for pd in publish_dates): + continue + if k == 'pagination' and any(len(i) > len(e[k]) and e[k] in i for i in all_pagination): + continue + if k in one_item_lists and len(set(e.get(k, []))) == 1 and any(len(i) > len(e[k][0].strip('.')) and 
e[k][0].strip('.') in i for i in one_item_lists[k]): + continue + if k == 'publish_country' and any_publish_country and e.get(k, '').strip().startswith('xx'): + continue + if k == 'edition_name' and e[k].endswith(' ed edition'): + e[k] = e[k][:-len(' edition')] + uniq[re_nonword.sub('', repr(e[k]).lower())].append(num) + + if len(uniq) == 1: + #merged[k] = uniq.keys()[0] + merged[k] = editions[uniq.values()[0][0]][k] + continue + + if k == 'covers': + assert all(isinstance(e[k], list) for e in editions if k in e) + covers = set() + for e in editions: + if k in e: + covers.update(c for c in e[k] if c != -1) + merged['covers'] = sorted(covers) + continue + + if k == 'notes': + merged['notes'] = '' + for e in editions: + if e.get('notes'): + merged['notes'] += e['notes'] + '\n' + continue + + if k == 'ocaid': + for e in editions: + if e.get('ocaid'): + #assert not e['ocaid'].endswith('goog') + merged['ocaid'] = e['ocaid'] + break + assert merged['ocaid'] + continue + merged[k] = None + + return merged + +@app.route("/merge/", methods=['GET', 'POST']) +def merge(ia): + if request.method == 'POST': + return run_merge(ia) + + cur = g.db.cursor() + cur.execute('select ia, editions, done from merge where ia = %s', [ia]) + ia, ekeys, done = cur.fetchone() + ekeys = ['/books/OL%dM' % x for x in sorted(int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))] + min_ekey = ekeys[0] + + editions = [ol.get(ekey) for ekey in ekeys] + + merged = build_merged(editions) + all_keys = merged.keys() + + works = [] + + return render_template('merge.html', + ia=ia, + editions=editions, + keys=sorted(all_keys), + merged = merged, + ekeys=ekeys, + works=works, + master=min_ekey) + +if __name__ == "__main__": + app.run(host='0.0.0.0', debug=True) diff --git a/ia-legacy-importer/edition_merge/merge_works.py b/ia-legacy-importer/edition_merge/merge_works.py new file mode 100644 index 00000000..4b591e71 --- /dev/null +++ b/ia-legacy-importer/edition_merge/merge_works.py @@ -0,0 +1,149 @@ +from __future__ import print_function +import MySQLdb +import datetime +import re +import sys +from openlibrary.catalog.utils import cmp +sys.path.append('/1/src/openlibrary') +from openlibrary.api import OpenLibrary, Reference + +import six + + +conn = MySQLdb.connect(db='merge_editions') +cur = conn.cursor() + +re_edition_key = re.compile('^/books/OL(\d+)M$') +re_work_key = re.compile('^/works/OL(\d+)W$') +ol = OpenLibrary('http://openlibrary.org/') +ol.login('EdwardBot', 'As1Wae9b') + +re_iso_date = re.compile('^(\d{4})-\d\d-\d\d$') +re_end_year = re.compile('(\d{4})$') + +def get_publish_year(d): + if not d: + return + m = re_iso_date.match(d) + if m: + return int(m.group(1)) + m = re_end_year.match(d) + if m: + return int(m.group(1)) + +{'lc_classifications': ['PZ7.H558 Ru'], 'dewey_number': ['[E]']} +def merge_works(works): + master = works.pop(0) + master_first_publish_year = get_publish_year(master.get('first_publish_date')) + subtitles = sorted((w['subtitle'] for w in works if w.get('subtitle')), key=lambda s: len(s)) + if subtitles and len(subtitles[-1]) > len(master.get('subtitle', '')): + master['subtitle'] = subtitles[-1] + updates = [] + for w in works: + wkey = w.pop('key') + q = {'type': '/type/edition', 'works': wkey} + for ekey in ol.query(q): + e = ol.get(ekey) + assert len(e['works']) == 1 and e['works'][0] == wkey + e['works'] = [Reference(master['key'])] + updates.append(e) + assert w['type'] != Reference('/type/redirect') + updates.append({ + 'key': wkey, + 'type': Reference('/type/redirect'), + 
'location': master['key'], + }) + for f in 'covers', 'subjects', 'subject_places', 'subject_people', 'subject_times', 'lc_classifications', 'dewey_number': + if not w.get(f): + continue + assert not isinstance(w[f], six.string_types) + for i in w[f]: + if i not in master.setdefault(f, []): + master[f].append(i) + + if w.get('first_sentence') and not master.get('first_sentence'): + master['first_sentence'] = w['first_sentence'] + if w.get('first_publish_date'): + if not master.get('first_publish_date'): + master['first_publish_date'] = w['first_publish_date'] + else: + publish_year = get_publish_year(w['first_publish_date']) + if publish_year < master_first_publish_year: + master['first_publish_date'] = w['first_publish_date'] + master_first_publish_year = publish_year + + for excerpt in w.get('exceprts', []): + master.setdefault('exceprts', []).append(excerpt) + + for f in 'title', 'subtitle', 'created', 'last_modified', 'latest_revision', 'revision', 'number_of_editions', 'type', 'first_sentence', 'authors', 'first_publish_date', 'excerpts', 'covers', 'subjects', 'subject_places', 'subject_people', 'subject_times', 'lc_classifications', 'dewey_number': + try: + del w[f] + except KeyError: + pass + + print(w) + assert not w + updates.append(master) + print(len(updates), [(doc['key'], doc['type']) for doc in updates]) + # update master + # update editions to point at master + # replace works with redirects + print(ol.save_many(updates, 'merge works')) + +skip = 'seventeenagainst00voig' +skip = 'inlineskatingbas00sava' +skip = 'elephantatwaldor00mira' +skip = 'sybasesqlserverp00paul' +skip = 'karmadunl00dunl' +skip = 'norbychronicles00asim' +skip = 'elizabethbarrett00fors' +skip = None +updates = [] +cur.execute('select ia, editions, done, unmerge_count from merge') +for ia, ekeys, done, unmerge_count in cur.fetchall(): + if skip: + if ia == skip: + skip = None + else: + continue + ekeys = ['/books/OL%dM' % x for x in sorted(int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))] + editions = [ol.get(ekey) for ekey in ekeys] + + if any('authors' not in e or 'works' not in e for e in editions): + continue + author0 = editions[0]['authors'][0] + work0 = editions[0]['works'][0] + try: + if not all(author0 == e['authors'][0] for e in editions[1:]): + continue + except: + print('editions:', [e['key'] for e in editions]) + raise + if all(work0 == e['works'][0] for e in editions[1:]): + continue + wkeys = [] + for e in editions: + for wkey in e['works']: + if wkey not in wkeys: + wkeys.append(wkey) + + works = [] + for wkey in wkeys: + w = ol.get(wkey) + q = {'type': '/type/edition', 'works': wkey, 'limit': 1000} + w['number_of_editions'] = len(ol.query(q)) + works.append(w) + title0 = works[0]['title'].lower() + if not all(w['title'].lower() == title0 for w in works[1:]): + continue + print(ia, ekeys) + print(' works:', wkeys) + def work_key_int(wkey): + return int(re_work_key.match(wkey).group(1)) + works = sorted(works, cmp=lambda a,b:-cmp(a['number_of_editions'],b['number_of_editions']) or cmp(work_key_int(a['key']), work_key_int(b['key']))) + print(' titles:', [(w['title'], w['number_of_editions']) for w in works]) + print(author0) + #print [w['authors'][0]['author'] for w in works] + assert all(author0 == w['authors'][0]['author'] for w in works) + merge_works(works) + print() diff --git a/ia-legacy-importer/edition_merge/run_merge.py b/ia-legacy-importer/edition_merge/run_merge.py new file mode 100644 index 00000000..5110ec45 --- /dev/null +++ 
b/ia-legacy-importer/edition_merge/run_merge.py @@ -0,0 +1,185 @@ +from __future__ import print_function +import MySQLdb +import datetime +import re +import sys +from openlibrary.api import OpenLibrary, Reference +from collections import defaultdict + +import six + + +re_edition_key = re.compile('^/books/OL(\d+)M$') +re_nonword = re.compile(r'\W', re.U) + +conn = MySQLdb.connect(db='merge_editions') +cur = conn.cursor() +cur2 = conn.cursor() + +ol = OpenLibrary('http://openlibrary.org/') +ol.login('EdwardBot', 'As1Wae9b') + +cur.execute('select ia, editions, done from merge where done is null and unmerge_count=0') +for ia, ekeys, done in cur.fetchall(): + updates = [] + ekeys = ['/books/OL%dM' % x for x in sorted(int(re_edition_key.match(ekey).group(1)) for ekey in ekeys.split(' '))] + print((ia, ekeys)) + min_ekey = ekeys[0] + editions = [ol.get(ekey) for ekey in ekeys] + master = editions[0] + + for e in editions: + for k in 'classifications', 'identifiers', 'table_of_contents': + if k in e and not e[k]: + del e[k] + + all_keys = set() + for e in editions: + all_keys.update(k for k, v in e.items() if v) + for k in 'latest_revision', 'revision', 'created', 'last_modified', 'key', 'type', 'genres': + if k in all_keys: + all_keys.remove(k) + + for k in all_keys.copy(): + if k.startswith('subject'): + all_keys.remove(k) + + for e in editions: # resolve redirects + if 'authors' not in e: + continue + new_authors = [] + for akey in e['authors']: + a = ol.get(akey) + if a['type'] == Reference('/type/redirect'): + akey = Reference(a['location']) + else: + assert a['type'] == Reference('/type/author') + new_authors.append(akey) + e['authors'] = new_authors + + k = 'publish_date' + publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4) + + k = 'pagination' + all_pagination = set(e[k].strip(':.') for e in editions if e.get(k)) + + one_item_lists = {} + for k in 'lc_classifications', 'publishers', 'contributions', 'series': + one_item_lists[k] = set(e[k][0].strip('.') for e in editions if e.get(k) and len(set(e[k])) == 1) + + + master.setdefault('source_records', []) + for k in 'source_records', 'ia_box_id', 'other_titles','isbn_10','series': + for e in editions[1:]: + if not e.get(k): + continue + for i in e[k]: + if i not in master.setdefault(k, []): + master[k].append(i) + + k = 'ocaid' + for e in editions[1:]: + if e.get(k) and 'ia:' + e[k] not in master['source_records']: + master['source_records'].append(e[k]) + + k = 'identifiers' + if any(k in e for e in editions): + master.setdefault(k, {}) + for e in editions[1:]: + if k not in e: + continue + for a, b in e[k].items(): + for c in b: + if c in master[k].setdefault(a, []): + continue + master[k][a].append(c) + + any_publish_country = False + k = 'publish_country' + if k in all_keys: + for e in editions: + if e.get(k) and not e[k].strip().startswith('xx'): + any_publish_country = True + + no_merge = False + skip_fields = set(['source_records', 'ia_box_id', 'identifiers', 'ocaid', 'other_titles', 'series', 'isbn_10']) + for k in all_keys: + if k in skip_fields: + continue + + uniq = defaultdict(list) + for num, e in enumerate(editions): + if e.get(k): + if k == 'publish_date' and len(e[k]) == 4 and e[k].isdigit and any(e[k] in pd for pd in publish_dates): + continue + if k == 'pagination' and any(len(i) > len(e[k].strip('.:')) and e[k].strip('.:') in i for i in all_pagination): + continue + if k in one_item_lists and len(set(e.get(k, []))) == 1 and any(len(i) > len(e[k][0].strip('.')) and e[k][0].strip('.') in i for i in 
one_item_lists[k]): + continue + if k == 'publish_country' and any_publish_country and e.get(k, '').strip().startswith('xx'): + continue + if k == 'edition_name' and e[k].endswith(' ed edition'): + e[k] = e[k][:-len(' edition')] + uniq[re_nonword.sub('', repr(e[k]).lower())].append(num) + + if len(uniq) == 0: + continue + if len(uniq) == 1: + master[k] = editions[uniq.values()[0][0]][k] + continue + + if k == 'covers': + assert all(isinstance(e[k], list) for e in editions if k in e) + covers = set() + for e in editions: + if k in e: + covers.update(c for c in e[k] if c != -1) + master['covers'] = sorted(covers) + continue + + if k == 'notes': + master['notes'] = '' + for e in editions: + if e.get('notes'): + master['notes'] += e['notes'] + '\n' + continue + + if k == 'ocaid': + for e in editions: + if e.get('ocaid'): + if e['ocaid'].endswith('goog'): + print(e['key'], e['ocaid'], ia) + master['ocaid'] = e['ocaid'] + break + assert master['ocaid'] + continue + + if k == 'authors': + min_author = set(min((e.get('authors', []) for e in editions), key=len)) + if all(min_author <= set(e.get('authors', [])) for e in editions): + master[k] = max((e.get('authors', []) for e in editions), key=len) + continue + + print('unmerged field:', k) + print([e.get(k) for e in editions]) + no_merge = True + if no_merge: + continue + if 'location' in master and isinstance(master['location'], six.string_types) and master['location'].startswith('/books/'): + del master['location'] + updates.append(master) + for e in editions[1:]: + redirect = { + 'type': Reference('/type/redirect'), + 'location': min_ekey, + 'key': e['key'], + } + updates.append(redirect) + print(len(updates), min_ekey) + try: + print(ol.save_many(updates, 'merge lending editions')) + except: + for i in updates: + print(i) + raise + cur2.execute('update merge set done=now() where ia=%s', [ia]) diff --git a/ia-legacy-importer/edition_merge/schema.sql b/ia-legacy-importer/edition_merge/schema.sql new file mode 100644 index 00000000..080effaf --- /dev/null +++ b/ia-legacy-importer/edition_merge/schema.sql @@ -0,0 +1,5 @@ +create table merge ( + ia varchar(255) not null primary key, + editions varchar(255) not null, + done datetime +); diff --git a/ia-legacy-importer/edition_merge/templates/index.html b/ia-legacy-importer/edition_merge/templates/index.html new file mode 100644 index 00000000..65f06247 --- /dev/null +++ b/ia-legacy-importer/edition_merge/templates/index.html @@ -0,0 +1,61 @@ + + +Merge duplicate editions + + + +
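+<!-- index page for the Flask tool in merge.py: lists pending rows from the
+     local "merge" table; rows with unmerge_count == 0 get a direct "merge"
+     link, the others only display their count of unmerged fields -->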

    Merge duplicate editions

    + +{% with messages = get_flashed_messages() %} + {% if messages %} +
      + {% for message in messages %} +
{{ message }}
+ {% endfor %} +
    + {% endif %} +{% endwith %} + +{% macro pager() %} +Pages: +{% for i in range((total / rows)+1) %} +{% if i + 1 == page %} +{{ page }} +{% else %} +
    {{ i + 1}} +{% endif %} +{% endfor %} +

    +{% endmacro %} + +{{ pager() }} + +{{ total }} items need to be merged ({{ easy }} easy)

    + + +{% for ia, editions, unmerge_count in merge_list %} + + + + +{% if unmerge_count == 0 %} + + +{% endfor %} +
    {{ ia }} +{% for ekey in editions.split(' ') %} +{{ekey}} +{% endfor %} +merge +{% else %} + +{% endif %} +{{ unmerge_count }} +
    + +{{ pager() }} + + + diff --git a/ia-legacy-importer/edition_merge/templates/merge.html b/ia-legacy-importer/edition_merge/templates/merge.html new file mode 100644 index 00000000..497418d6 --- /dev/null +++ b/ia-legacy-importer/edition_merge/templates/merge.html @@ -0,0 +1,88 @@ + + +Merge {{ia}} + + + + +
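+<!-- per-item page rendered by merge.py: one column per edition plus the
+     automatically merged value for each field; any field the code could not
+     resolve has to be picked in the form, after which run_merge() saves the
+     master record and turns the other editions into redirects to it -->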

    Merge {{ia}}

    + +back to index

    + +{% with messages = get_flashed_messages() %} + {% if messages %} +

      + {% for message in messages %} +
{{ message }}
+ {% endfor %} +
    + {% endif %} +{% endwith %} + + + +Merging editions: {{ ekeys }} into {{ master }}

    + +

    + + +{% for ekey in ekeys %} + +{% endfor %} + + +{% for k in keys %} + + +{% for e in editions %} +{% set ekey = e['key'][7:] %} + +{% endfor %} +{% if merged.get(k) %} + +{% else %} + +{% endif %} + +{% endfor %} + + + + +
    {{ ekey }}merged
    {{ k }} +{% if k == 'notes' and e.get(k) %} +{% for line in e[k].splitlines() %} +{{ line }}
    +{% endfor %} +{% elif k == 'authors' and e.get(k) %} +{% for akey in e[k] %} + {{akey}} +{% endfor %} + +{% elif not merged.get(k) and e.get(k) %} + {{ e.get(k) }} +{% else %} +{{ e.get(k) }} +{% endif %} +
    +{% if k == 'notes' %} +{% for line in merged[k].splitlines() %} +{{ line }}
    +{% endfor %} +{% else %} +{{ merged[k] }} +{% endif %} +
    + + +
    + + + + + diff --git a/ia-legacy-importer/edition_merge/templates/web_merge.html b/ia-legacy-importer/edition_merge/templates/web_merge.html new file mode 100644 index 00000000..97d78109 --- /dev/null +++ b/ia-legacy-importer/edition_merge/templates/web_merge.html @@ -0,0 +1,58 @@ + + +Merge {{ia}} + + + +
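+<!-- apparently an alternative version of the comparison page: the same
+     field-by-field table of edition values next to the merged value,
+     followed by a pretty-printed dump of the related works -->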

    Merge {{ia}}

    + +Merging editions: {{ ekeys }} into {{ master }}

    + +merge listsskip

    + + + +{% for ekey in ekeys %} + + +{% endfor %} + + +{% for k in keys %} + + +{% for e in editions %} + +{% endfor %} +{% if merged.get(k) %} + +{% else %} + +{% endif %} + +{% endfor %} +
    {{ ekey }}mergemerged
    {{ k }} +{% if k == 'notes' and e.get(k) %} +{% for line in e[k].splitlines() %} +{{ line }}
    +{% endfor %} +{% else %} +{{ e.get(k) }} +{% endif %} +
    +{% if k == 'notes' %} +{% for line in merged[k].splitlines() %} +{{ line }}
    +{% endfor %} +{% else %} +{{ merged[k] }} +{% endif %} +
    + +

    Works

    +
    {{ works | pprint }}
    + + + diff --git a/ia-legacy-importer/get_ia.py b/ia-legacy-importer/get_ia.py new file mode 100644 index 00000000..4be5926e --- /dev/null +++ b/ia-legacy-importer/get_ia.py @@ -0,0 +1,258 @@ +from __future__ import print_function + +import os.path +import socket +import traceback +import xml.parsers.expat + +from infogami import config +from lxml import etree +from six.moves import urllib +from time import sleep + +from openlibrary.catalog.marc.marc_binary import MarcBinary +from openlibrary.catalog.marc.marc_xml import MarcXml +from openlibrary.catalog.marc.parse import read_edition +from openlibrary.catalog.marc.fast_parse import read_file as fast_read_file # Deprecated import +from openlibrary.core import ia + + +IA_BASE_URL = config.get('ia_base_url') +IA_DOWNLOAD_URL = '%s/download/' % IA_BASE_URL +MAX_MARC_LENGTH = 100000 + +class NoMARCXML(IOError): + # DEPRECATED, rely on MarcXml to raise exceptions + pass + +def urlopen_keep_trying(url): + for i in range(3): + try: + f = urllib.request.urlopen(url) + return f + except urllib.error.HTTPError as error: + if error.code in (403, 404, 416): + raise + except urllib.error.URLError: + pass + sleep(2) + +def bad_ia_xml(identifier): + # DEPRECATED + if identifier == 'revistadoinstit01paulgoog': + return False + # need to handle 404s: + # http://www.archive.org/details/index1858mary + loc = "{0}/{0}_marc.xml".format(identifier) + return ' + + + + Notification or update type code + + + + + Early notification + Use for a complete record issued earlier than approximately six months before publication. + + + + + Advance notification (confirmed) + Use for a complete record issued to confirm advance information approximately six months before publication; or for a complete record issued after that date and before information has been confirmed from the book-in-hand. + + + + + Notification confirmed from book-in-hand + Use for a complete record issued to confirm advance information using the book-in-hand at or just before actual publication date; or for a complete record issued at any later date. + + + + + Update (partial) + Intended to be used for an update to a part of the record which is sent without re-issuing the complete record. In practise, however, ONIX updating is invariably by complete record replacement using code 03, and code 04 is not used. + + + + + Delete + Use when sending an instruction to delete a record which was previously issued. Note that a delete instruction should NOT be used when a product is cancelled, put out of print, or otherwise withdrawn from sale: this should be handled as a change of availability status, leaving the receiver to decide whether to retain or delete the record. A delete instruction is only used when there is a particular reason to withdraw a record completely, eg because it was issued in error. 
+ + + + + Notice of sale + Notice of sale of a product, from one publisher to another: sent by the publisher disposing of the product + + + + + Notice of acquisition + Notice of acquisition of a product, by one publisher from another: sent by the acquiring publisher + + + + + + + Reason for deletion code + + + + + + Record source type code + + + + + Unspecified + + + + + + Publisher + + + + + + Publisher's distributor + Use to designate a distributor providing warehousing and fulfilment for a publisher or for a publisher’s sales agent, as distinct from a wholesaler + + + + + Wholesaler + + + + + + Bibliographic agency + + + + + + Library bookseller + + + + + + Publisher’s sales agent + Use for a publisher’s sales agent responsible for marketing the publisher’s products within a territory, as opposed to a publisher’s distributor who fulfils orders but does not market + + + + + + + Product identifier type code + + + + + Proprietary + For example, a publisher’s or wholesaler’s product number. + + + + + ISBN-10 + International Standard Book Number, pre-2007, unhyphenated (10 characters) + + + + + EAN.UCC-13 + EAN-UCC article number (13 digits) + + + + + UPC + UPC product number (12 digits) + + + + + ISMN + International Standard Music Number (10 digits) + + + + + DOI + Digital Object Identifier (variable length and character set) + + + + + LCCN + Library of Congress Control Number (12 characters, alphanumeric) + + + + + GTIN-14 + EAN-UCC Global Trade Item Number (14 digits) + + + + + ISBN-13 + International Standard Book Number, from 2007, unhyphenated (13 digits) + + + + + + + Barcode indicator + + + + + Not barcoded + + + + + + Barcoded, scheme unspecified + + + + + + EAN13 + Position unspecified + + + + + EAN13+5 + Position unspecified + + + + + UPC12 + Type and position unspecified. DEPRECATED: if possible, use more specific values below. + + + + + UPC12+5 + Type and position unspecified. DEPRECATED: if possible, use more specific values below. 
+ + + + + UPC12 (item-specific) + AKA item/price: position unspecified + + + + + UPC12+5 (item-specific) + AKA item/price: position unspecified + + + + + UPC12 (price-point) + AKA price/item: position unspecified + + + + + UPC12+5 (price-point) + AKA price/item: position unspecified + + + + + EAN13 on cover 4 + ‘Cover 4’ is defined as the back cover of a book + + + + + EAN13+5 on cover 4 + ‘Cover 4’ is defined as the back cover of a book + + + + + UPC12 (item-specific) on cover 4 + AKA item/price; 'cover 4' is defined as the back cover of a book + + + + + UPC12+5 (item-specific) on cover 4 + AKA item/price; 'cover 4' is defined as the back cover of a book + + + + + UPC12 (price-point) on cover 4 + AKA price/item; 'cover 4' is defined as the back cover of a book + + + + + UPC12+5 (price-point) on cover 4 + AKA price/item; 'cover 4' is defined as the back cover of a book + + + + + EAN13 on cover 3 + ‘Cover 3’ is defined as the inside back cover of a book + + + + + EAN13+5 on cover 3 + ‘Cover 3’ is defined as the inside back cover of a book + + + + + UPC12 (item-specific) on cover 3 + AKA item/price; 'cover 3' is defined as the inside back cover of a book + + + + + UPC12+5 (item-specific) on cover 3 + AKA item/price; 'cover 3' is defined as the inside back cover of a book + + + + + UPC12 (price-point) on cover 3 + AKA price/item; 'cover 3' is defined as the inside back cover of a book + + + + + UPC12+5 (price-point) on cover 3 + AKA price/item; 'cover 3' is defined as the inside back cover of a book + + + + + EAN13 on cover 2 + ‘Cover 2’ is defined as the inside front cover of a book + + + + + EAN13+5 on cover 2 + ‘Cover 2’ is defined as the inside front cover of a book + + + + + UPC12 (item-specific) on cover 2 + AKA item/price; 'cover 2' is defined as the inside front cover of a book + + + + + UPC12+5 (item-specific) on cover 2 + AKA item/price; 'cover 2' is defined as the inside front cover of a book + + + + + UPC12 (price-point) on cover 2 + AKA price/item; 'cover 2' is defined as the inside front cover of a book + + + + + UPC12+5 (price-point) on cover 2 + AKA price/item; 'cover 2' is defined as the inside front cover of a book + + + + + EAN13 on box + To be used only on boxed products + + + + + EAN13+5 on box + To be used only on boxed products + + + + + UPC12 (item-specific) on box + AKA item/price; to be used only on boxed products + + + + + UPC12+5 (item-specific) on box + AKA item/price; to be used only on boxed products + + + + + UPC12 (price-point) on box + AKA price/item; to be used only on boxed products + + + + + UPC12+5 (price-point) on box + AKA price/item; to be used only on boxed products + + + + + EAN13 on tag + To be used only on products fitted with hanging tags + + + + + EAN13+5 on tag + To be used only on products fitted with hanging tags + + + + + UPC12 (item-specific) on tag + AKA item/price; to be used only on products fitted with hanging tags + + + + + UPC12+5 (item-specific) on tag + AKA item/price; to be used only on products fitted with hanging tags + + + + + UPC12 (price-point) on tag + AKA price/item; to be used only on products fitted with hanging tags + + + + + UPC12+5 (price-point) on tag + AKA price/item; to be used only on products fitted with hanging tags + + + + + EAN13 on bottom + Not be used on books unless they are contained within outer packaging + + + + + EAN13+5 on bottom + Not be used on books unless they are contained within outer packaging + + + + + UPC12 (item-specific) on bottom + AKA item/price; not be used on books unless they are contained 
within outer packaging + + + + + UPC12+5 (item-specific) on bottom + AKA item/price; not be used on books unless they are contained within outer packaging + + + + + UPC12 (price-point) on bottom + AKA price/item; not be used on books unless they are contained within outer packaging + + + + + UPC12+5 (price-point) on bottom + AKA price/item; not be used on books unless they are contained within outer packaging + + + + + EAN13 on back + Not be used on books unless they are contained within outer packaging + + + + + EAN13+5 on back + Not be used on books unless they are contained within outer packaging + + + + + UPC12 (item-specific) on back + AKA item/price; not be used on books unless they are contained within outer packaging + + + + + UPC12+5 (item-specific) on back + AKA item/price; not be used on books unless they are contained within outer packaging + + + + + UPC12 (price-point) on back + AKA price/item; not be used on books unless they are contained within outer packaging + + + + + UPC12+5 (price-point) on back + AKA price/item; not be used on books unless they are contained within outer packaging + + + + + EAN13 on outer sleeve/back + To be used only on products packaged in outer sleeves + + + + + EAN13+5 on outer sleeve/back + To be used only on products packaged in outer sleeves + + + + + UPC12 (item-specific) on outer sleeve/back + AKA item/price; to be used only on products packaged in outer sleeves + + + + + UPC12+5 (item-specific) on outer sleeve/back + AKA item/price; to be used only on products packaged in outer sleeves + + + + + UPC12 (price-point) on outer sleeve/back + AKA price/item; to be used only on products packaged in outer sleeves + + + + + UPC12+5 (price-point) on outer sleeve/back + AKA price/item; to be used only on products packaged in outer sleeves + + + + + + + Product form code + + + + + Undefined + + + + + + Audio + Audio recording - detail unspecified + + + + + Audio cassette + Audio cassette (analogue) + + + + + CD-Audio + Audio compact disk, in CD-Audio or SACD format + + + + + DAT + Digital audio tape cassette + + + + + Audio disk + Audio disk (excluding CD) + + + + + Audio tape + Audio tape (reel tape) + + + + + MiniDisc + Sony MiniDisc format + + + + + CD-Extra + Audio compact disk with part CD-ROM content + + + + + DVD Audio + + + + + + Downloadable audio file + Audio recording downloadable online + + + + + Other audio format + Other audio format not specified by AB to AJ + + + + + Book + Book - detail unspecified + + + + + Hardback + Hardback or cased book + + + + + Paperback + Paperback or softback book + + + + + Loose-leaf + Loose-leaf book + + + + + Spiral bound + Spiral, comb or coil bound book + + + + + Pamphlet + Pamphlet or brochure, stapled; German 'geheftet' + + + + + Leather / fine binding + + + + + + Board book + Child’s book with all pages printed on board + + + + + Rag book + Child’s book with all pages printed on textile + + + + + Bath book + Child’s book printed on waterproof material + + + + + Novelty book + Use for books whose novelty is expressed in the format itself, not for books in a conventional format which happen to have novelty content + + + + + Slide bound + Slide bound book + + + + + Big book + Extra-large format for teaching etc; this format and terminology may be specifically UK; required as a top-level differentiator + + + + + Part-work (fascículo) + A part-work issued with its own ISBN and intended to be collected and bound into a complete book + + + + + Leporello (folded) + A concertina-folded book, usually a picture 
book + + + + + Other book format + Other book format or binding not specified by BB to BO + + + + + Sheet map + Sheet map - detail unspecified + + + + + Sheet map, folded + + + + + + Sheet map, flat + + + + + + Sheet map, rolled + See Code List 80 for 'rolled in tube' + + + + + Globe + Globe or planisphere + + + + + Other cartographic + Other cartographic format not specified by CB to CE + + + + + Digital + Digital or multimedia (detail unspecified) + + + + + CD-ROM + + + + + + CD-I + CD interactive + + + + + DVD + Digital Versatile Disk: DEPRECATED - use VI for DVD video, AI for DVD audio, DI for DVD-ROM + + + + + Game cartridge + + + + + + Diskette + AKA 'floppy disk' + + + + + Electronic book text + Electronic book text in proprietary or open standard format + + + + + Online resource + An electronic database or other resource or service accessible through online networks + + + + + DVD-ROM + + + + + + Secure Digital (SD) Memory Card + + + + + + Compact Flash Memory Card + + + + + + Memory Stick Memory Card + + + + + + USB Flash Drive + + + + + + Other digital + Other digital or multimedia not specified by DB to DM + + + + + Film or transparency + Film or transparency – detail unspecified + + + + + Film + Continuous film or filmstrip: DEPRECATED - use FE or FF + + + + + Slides + Photographic transparencies mounted for projection + + + + + OHP transparencies + Transparencies for overhead projector + + + + + Filmstrip + + + + + + Film + Continuous movie film as opposed to filmstrip + + + + + Other film or transparency format + Other film or transparency format not specified by FB to FF + + + + + Microform + Microform – detail unspecified + + + + + Microfiche + + + + + + Microfilm + Roll microfilm + + + + + Other microform + Other microform not specified by MB or MC + + + + + Miscellaneous print + Miscellaneous printed material – detail unspecified + + + + + Address book + + + + + + Calendar + + + + + + Cards + Cards, flash cards (eg for teaching reading) + + + + + Copymasters + Copymasters, photocopiable sheets + + + + + Diary + + + + + + Frieze + + + + + + Kit + + + + + + Sheet music + + + + + + Postcard book or pack + + + + + + Poster + Poster for retail sale – see also XF + + + + + Record book + Record book (eg 'birthday book', 'baby book') + + + + + Wallet or folder + Wallet or folder (containing loose sheets etc): it is preferable to code the contents and treat 'wallet' as packaging (List 80), but if this is not possible the product as a whole may be coded as a 'wallet' + + + + + Pictures or photographs + + + + + + Wallchart + + + + + + Stickers + + + + + + Plate (lámina) + A book-sized (as opposed to poster-sized) sheet, usually in colour or high quality print + + + + + Other printed item + Other printed item not specified by PB to PQ + + + + + Video + Video – detail unspecified + + + + + Video, VHS, PAL + DEPRECATED - use new VJ + + + + + Video, VHS, NTSC + DEPRECATED - use new VJ + + + + + Video, Betamax, PAL + DEPRECATED - use new VK + + + + + Video, Betamax, NTSC + DEPRECATED - use new VK + + + + + Videodisk + eg Laserdisk + + + + + Video, VHS, SECAM + DEPRECATED - use new VJ + + + + + Video, Betamax, SECAM + DEPRECATED - use new VK + + + + + DVD video + DVD video: specify TV standard in List 78 + + + + + VHS video + VHS videotape: specify TV standard in List 78 + + + + + Betamax video + Betamax videotape: specify TV standard in List 78 + + + + + VCD + VideoCD + + + + + SVCD + Super VideoCD + + + + + Other video format + Other video format not specified by VB to VK + + + + + Mixed 
media product + A product consisting of two or more items in different media, eg book and CD-ROM, book and toy etc + + + + + Quantity pack + A product consisting of (a) a quantity of a single item, or (b) quantities of two or more separate items, packaged together for retail sale, eg a quantity pack of classroom texts, not to be confused with packs intended for trade distribution only – see XC, XE, XL + + + + + Trade-only material + Trade-only material (unspecified) + + + + + Dumpbin – empty + + + + + + Dumpbin – filled + Dumpbin with contents + + + + + Counterpack – empty + + + + + + Counterpack – filled + Counterpack with contents + + + + + Poster, promotional + Promotional poster for display, not for sale – see also PK + + + + + Shelf strip + + + + + + Window piece + Promotional piece for shop window display + + + + + Streamer + + + + + + Spinner + + + + + + Large book display + Large scale facsimile of book for promotional display + + + + + Shrink-wrapped pack + A quantity pack with its own product code, for trade supply only: the retail items it contains are intended for sale individually – see also WX + + + + + Other point of sale + Other point of sale material not specified by XB to XL + + + + + General merchandise + General merchandise – unspecified + + + + + Doll + + + + + + Soft toy + Soft or plush toy + + + + + Toy + + + + + + Game + Board game, or other game (except computer game: see DE) + + + + + T-shirt + + + + + + Other merchandize + Other merchandize not specified by ZB to ZF + + + + + + + Book form detail + + + + + A-format paperback + DEPRECATED + + + + + B-format paperback + ‘B’ format paperback: UK 198 x 129 mm - DEPRECATED + + + + + C-format paperback + ‘C’ format paperback: UK 216 x 135 mm - DEPRECATED + + + + + Paper over boards + DEPRECATED + + + + + Cloth + DEPRECATED + + + + + With dust jacket + DEPRECATED + + + + + Reinforced binding + DEPRECATED + + + + + + + Product classification type code + + + + + WCO Harmonized System + World Customs Organization Harmonized Commodity Coding & Description System + + + + + UNSPSC + UN Standard Product & Service Classification + + + + + HMC&E + UK Customs & Excise classifications, based on the Harmonized System + + + + + Warenverzeichnis für die Außenhandelsstatistik + German export trade classification, based on the Harmonised System + + + + + TARIC + EU TARIC codes, an extended version of the Harmonized System + + + + + + + Epublication type code + + + + + Epublication “content package” + An epublication viewed as a unique package of content which may be converted into any of a number of different types for delivery to the consumer. This code is used when an ONIX <Product> record describes the content package and lists within the record the different forms in which it is available. + + + + + HTML + An epublication delivered in a basic, unprotected, HTML format. Do NOT use for HTML-based formats which include DRM protection. + + + + + PDF + An epublication delivered in a basic, unprotected, PDF format. Do NOT use for PDF-based formats which include DRM protection. + + + + + PDF-Merchant + An epublication delivered in PDF format, capable of being read in the standard Acrobat Reader, and protected by PDF-Merchant DRM features. (This format is no longer supported for new applications.) 
+ + + + + Adobe Ebook Reader + An epublication delivered in an enhanced PDF format, using Adobe’s proprietary EBX DRM, capable of being read in the Adobe Ebook Reader software, on any platform which can support this software, which was formerly known as Glassbook. + + + + + Microsoft Reader Level 1/Level 3 + An epublication delivered in an unencrypted Microsoft .LIT format, capable of being read in the Microsoft Reader software at any level, on any platform which can support this software. (Level 3 differs from Level 1 only in that it embeds the name of the original purchaser.) + + + + + Microsoft Reader Level 5 + An epublication delivered in the Microsoft .LIT format, with full encryption, capable of being read in the Microsoft Reader software at Level 5, on any platform which can support this software. + + + + + NetLibrary + An epublication delivered in a proprietary HTML- or OEBF-based format, capable of being read only through subscription to the NetLibrary service. + + + + + MetaText + An epublication delivered in a proprietary format through a web browser, capable of being read only through subscription to the MetaText service (the educational division of NetLibrary) + + + + + MightyWords + An epublication delivered in a proprietary PDF-based format, capable of being read only through subscription to the MightyWords service. + + + + + Palm Reader + An epublication delivered in a proprietary HTML-based format, capable of being read in reading software which may be used on handheld devices using the Palm OS or Pocket PC/Windows CE operating systems. + + + + + Softbook + An epublication delivered in a proprietary format capable of being read in reading software which is specific to the Softbook hardware platform. Also capable of being read on the Softbook’s successor, the Gemstar REB 1200. + + + + + RocketBook + An epublication delivered in a proprietary .RB format, capable of being read in reading software which is specific to the RocketBook hardware platform. Also capable of being read on the RocketBook’s successor, the Gemstar REB 1100. + + + + + Gemstar REB 1100 + An epublication delivered in a proprietary .RB format, capable of being read in reading software which is specific to the Gemstar REB 1100 hardware platform. Also capable of being read on the RocketBook with some loss of functionality. + + + + + Gemstar REB 1200 + An epublication delivered in a proprietary format, capable of being read in reading software which is specific to the Gemstar REB 1200 hardware platform. Also capable of being read on the Softbook with some loss of functionality. + + + + + Franklin eBookman + An epublication delivered in Franklin’s proprietary HTML-based format, capable of being read in reading software which is specific to the Franklin eBookman platform. + + + + + Books24x7 + An epublication delivered in a proprietary XML-based format and available for online access only through subscription to the Books24x7 service. + + + + + DigitalOwl + An epublication available through DigitalOwl proprietary packaging, distribution and DRM software, delivered in a variety of formats across a range of platforms. + + + + + Handheldmed + An epublication delivered in a proprietary HTML-based format, capable of being read in Handheldmed reader software on Palm OS, Windows, and EPOC/Psion handheld devices, available only through the Handheldmed service. + + + + + WizeUp + An epublication delivered in a proprietary ???-based format and available for download only through the WizeUp service. 
+ + + + + TK3 + An epublication delivered in the proprietary TK3 format, capable of being read only in the TK3 reader software supplied by Night Kitchen Inc, on any platform which can support this software. + + + + + Litraweb + An epublication delivered in an encrypted .RTF format, capable of being read only in the Litraweb Visor software, and available only from Litraweb.com. + + + + + MobiPocket + An epublication delivered in a proprietary format, capable of being read in the MobiPocket software on PalmOS, WindowsCE /Pocket PC, Franklin eBookman, and EPOC32 handheld devices, available only through the MobiPocket service. + + + + + Open Ebook + An epublication delivered in the standard distribution format specified in the Open Ebook Publication Structure (OEBPS) format and capable of being read in any OEBPS-compliant reading system. + + + + + Town Compass DataViewer + An epublication delivered in a proprietary format, capable of being read in Town Compass DataViewer reader software on a Palm OS handheld device. + + + + + TXT + An epublication delivered in an openly available .TXT format, with ASCII or UTF-8 encoding, as used for example in Project Gutenberg + + + + + ExeBook + An epublication delivered as a self-executing file including its own reader software, and created with proprietary ExeBook Self-Publisher software + + + + + Sony BBeB + An epublication delivered in a proprietary format, capable of being read on a Sony Reader handheld device + + + + + + + Epublication format code + + + + + HTML + + + + + + PDF + + + + + + Microsoft Reader + ‘.LIT’ file format used by Microsoft Reader software + + + + + RocketBook + + + + + + Rich text format (RTF) + + + + + + Open Ebook Publication Structure (OEBPS) format standard + + + + + + XML + + + + + + SGML + + + + + + EXE + ‘.EXE’ file format used when an epublication is delivered as a self-executing package of software and content. + + + + + ASCII + ‘.TXT’ file format + + + + + MobiPocket format + Proprietary file format used for the MobiPocket reader software + + + + + + + Trade category code + + + + + UK open market edition + An edition from a UK publisher sold only in territories where exclusive rights are not held. Rights details should be carried in PR.21 as usual. + + + + + Airport edition + In UK, an edition intended primarily for airside sales in UK airports, though it may be available for sale in other territories where exclusive rights are not held. Details should be carried in PR.21 as usual. + + + + + Sonderausgabe + In Germany, a special printing sold at a lower price than the regular hardback + + + + + Pocket paperback + In countries where recognised as a distinct trade category, eg France 'livre de poche', Germany 'Taschenbuch', Italy 'tascabile', Spain 'libro de bolsillo + + + + + International edition (US) + Edition produced solely for sale in designated export markets + + + + + Library audio edition + Audio product sold in special durable packaging and with a replacement guarantee for the contained cassettes or CDs for a specified shelf-life + + + + + US open market edition + An edition from a US publisher sold only in territories where exclusive rights are not held. Rights details should be carried in PR.21 as usual. 
+ + + + + Livre scolaire, déclaré par l'éditeur + In France, a category of book that has a particular legal status, claimed by the publisher + + + + + Livre scolaire (non spécifié) + In France, a category of book that has a particular legal status, designated independently of the publisher + + + + + + + Series identifier type code + + + + + Proprietary + For example, publisher’s own series ID + + + + + ISSN + + + + + + German National Bibliography series ID + Maintained by the Deutsche Bibliothek + + + + + German Books in Print series ID + Maintained by VLB + + + + + Electre series ID + Maintained by Electre Information, France + + + + + DOI + + + + + + + + Text case flag + + + + + Undefined + Default + + + + + Sentence case + Initial capitals on first word and subsequently on proper names only, eg The conquest of Mexico + + + + + Title case + Initial capitals on first word and on all significant words thereafter, eg The Conquest of Mexico + + + + + All capitals + For example, THE CONQUEST OF MEXICO + + + + + + + Title type code + + + + + Undefined + + + + + + Distinctive title(book); cover title (serial) + The full text of the distinctive title of the item, without abbreviation or abridgement. For books, where the title alone is not distinctive, elements may be taken from a set or series title and part number etc to create a distinctive title. Where the item is an omnibus edition containing two or more works by the same author, and there is no separate combined title, a distinctive title may be constructed by concatenating the individual titles, with suitable punctuation, as in Pride and prejudice / Sense and sensibility / Northanger Abbey. + + + + + ISSN key title of serial + Serials only + + + + + Title in original language + Where the subject of the ONIX record is a translated item + + + + + Title acronym + For serials: JACM = Journal of the Association for Computing Machinery + + + + + Abbreviated title + An abbreviated form of Title Type 01 + + + + + Title in other language + A translation of Title Type 01 into another language + + + + + Thematic title of journal issue + Serials only: when a journal issue is explicitly devoted to a specified topic + + + + + Former title + Books or serials: when an item was previously published under another title + + + + + Distributor's title + For books: the title carried in a book distributor's title file: frequently incomplete, and may include elements not properly part of the title + + + + + + + Work identifier type code + + + + + Proprietary + + + + + + ISBN + ISBN of manifestation of work, when this is the only identifier available + + + + + DOI + + + + + + ISTC + + + + + + + + Contributor role code + + + + + By (author) + Author of a textual work + + + + + With + With or as told to: 'ghost' author of a literary work + + + + + Screenplay by + Writer of screenplay or script (film or video) + + + + + Libretto by + Writer of libretto (opera): see also A31 + + + + + Lyrics by + Author of lyrics (song): see also A31 + + + + + By (composer) + Composer of music + + + + + By (artist) + Visual artist when named as the primary creator of, eg, a book of reproductions of artworks + + + + + By (photographer) + Photographer when named as the primary creator of, eg, a book of photographs) + + + + + Created by + + + + + + From an idea by + + + + + + Designed by + + + + + + Illustrated by + Artist when named as the creator of artwork which illustrates a text, or of the artwork of a graphic novel or comic book + + + + + Photographs by + Photographer when named 
as the creator of photographs which illustrate a text + + + + + Text by + Author of text which accompanies art reproductions or photographs, or which is part of a graphic novel or comic book + + + + + Preface by + Author of preface + + + + + Prologue by + Author of prologue + + + + + Summary by + Author of summary + + + + + Supplement by + Author of supplement + + + + + Afterword by + Author of afterword + + + + + Notes by + Author of notes or annotations: see also A29 + + + + + Commentaries by + Author of commentaries on the main text + + + + + Epilogue by + Author of epilogue + + + + + Foreword by + Author of foreword + + + + + Introduction by + Author of introduction: see also A29 + + + + + Footnotes by + Author/compiler of footnotes + + + + + Memoir by + Author of memoir accompanying main text + + + + + Experiments by + Person who carried out experiments reported in the text + + + + + Introduction and notes by + Author of introduction and notes: see also A20 and A24 + + + + + Software written by + Writer of computer programs ancillary to the text + + + + + Book and lyrics by + Author of the textual content of a musical drama: see also A04 and A05 + + + + + Contributions by + Author of additional contributions to the text + + + + + Appendix by + Author of appendix + + + + + Index by + Compiler of index + + + + + Drawings by + + + + + + Cover design or artwork by + Use also for the cover artist of a graphic novel or comic book if named separately + + + + + Preliminary work by + Responsible for preliminary work on which the work is based + + + + + Original author + Author of the first edition (usually of a standard work) who is not an author of the current edition + + + + + Maps by + Maps drawn or otherwise contributed by + + + + + Inked or colored by + When separate persons are named as having respectively drawn and colored artwork, eg for a graphic novel or comic book, use A12 for 'drawn by' and A40 for 'colored by' + + + + + Other primary creator + Other type of primary creator not specified above + + + + + Edited by + + + + + + Revised by + + + + + + Retold by + + + + + + Abridged by + + + + + + Adapted by + + + + + + Translated by + + + + + + As told by + + + + + + Translated with commentary by + This code applies where a translator has provided a commentary on issues relating to the translation. If the translator has also provided a commentary on the work itself, the name should be entered twice using codes B06 and A21. 
+ + + + + Series edited by + Name of a series editor when the product belongs to a series + + + + + Edited and translated by + + + + + + Editor-in-chief + + + + + + Guest editor + + + + + + Volume editor + + + + + + Editorial board member + + + + + + Editorial coordination by + + + + + + Managing editor + + + + + + Founded by + Usually the founder editor of a serial publication: Begruendet von + + + + + Prepared for publication by + + + + + + Associate editor + + + + + + Consultant editor + Use also for 'advisory editor' + + + + + General editor + + + + + + Dramatized by + + + + + + General rapporteur + In Europe, an expert editor who takes responsibility for the legal content of a collaborative law volume + + + + + Literary editor + An editor who is responsible for establishing the text used in an edition of a literary work, where this is recognised as a distinctive role (in Spain, 'editor literario') + + + + + Other adaptation by + Other type of adaptation or editing not specified above + + + + + Compiled by + + + + + + Selected by + + + + + + Other compilation by + Other type of compilation not specified above + + + + + Producer + + + + + + Director + + + + + + Conductor + Conductor of a musical performance + + + + + Other direction by + Other type of direction not specified above + + + + + Actor + + + + + + Dancer + + + + + + Narrator + + + + + + Commentator + + + + + + Vocal soloist + Singer etc + + + + + Instrumental soloist + + + + + + Read by + Reader of recorded text, as in an audiobook + + + + + Performed by (orchestra, band, ensemble) + Name of a musical group in a performing role + + + + + Performed by + Other type of performer not specified above: use for a recorded performance which does not fit a category above, eg a performance by a stand-up comedian. + + + + + Filmed/photographed by + + + + + + Other recording by + Other type of recording not specified above + + + + + Assisted by + May be associated with any contributor role, and placement should therefore be controlled by contributor sequence numbering + + + + + Other + Other creative responsibility not falling within A to F above + + + + + + + Person name type + + + + + Unspecified + + + + + + Pseudonym + + + + + + Authority-controlled name + + + + + + + + Unnamed person(s) + + + + + Unknown + + + + + + Anonymous + + + + + + et al + And others: additional contributors not listed + + + + + Various authors + When the product is a pack of books by different authors + + + + + + + Conference role + + + + + + Edition type code + + + + + Abridged + Content has been shortened: use for abridged, shortened, concise, condensed. + + + + + Adapted + Content has been adapted to serve a different purpose or audience, or from one medium to another: use for dramatization, novelization etc. Use <EditionStatement> to describe the exact nature of the adaptation. + + + + + Alternate + Do not use. This code is now deprecated, but is retained in the list for reasons of upwards compatibility. + + + + + Annotated + Content is augmented by the addition of notes + + + + + Bilingual edition + Both languages should be specified in the 'Language' group. Use MLL for an edition in more than two languages. + + + + + Braille + Braille edition + + + + + Critical + Content includes critical commentary on the text + + + + + Coursepack + Content was compiled for a specified educational course. + + + + + Enlarged + Content has been enlarged or expanded from that of a previous edition. 
+   Expurgated – 'Offensive' content has been removed
+   Facsimile – Exact reproduction of the content and format of a previous edition
+   Illustrated – Content includes extensive illustrations which are not part of other editions
+   Large type / large print – Large print edition, print sizes 14 to 19 pt - see also ULP
+   Microprint – A printed edition in a type size too small to be read without a magnifying glass
+   Media tie-in – An edition published to coincide with the release of a film, TV program, or electronic game based on the same work. Use <EditionStatement> to describe the exact nature of the tie-in.
+   Multilingual edition – All languages should be specified in the 'Language' group. Use BLL for a bilingual edition.
+   New edition – Where no other information is given, or no other coded type is applicable
+   Revised – Content has been revised from that of a previous edition
+   School edition – An edition intended specifically for use in schools
+   Special edition – Use for anniversary, collectors', de luxe, gift, limited, numbered, autographed edition. Use <EditionStatement> to describe the exact nature of the special edition.
+   Student edition – Where a text is available in both student and teacher's editions
+   Teacher's edition – Where a text is available in both student and teacher's editions; use also for instructor's or leader's editions
+   Unabridged – Where a title has also been published in an abridged edition; also for audiobooks, regardless of whether an abridged audio version also exists
+   Ultra large print – For print sizes 20pt and above, and with typefaces designed for the visually impaired - see also LTE
+   Unexpurgated – Content previously considered 'offensive' has been restored
+   Variorum – Content includes notes by various commentators, and/or includes and compares several variant texts of the same work
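
The edition-type headings above carry no machine-readable structure of their own, so an importer has to decide which of its own edition fields each heading maps onto. A minimal sketch, assuming hypothetical Open Library-style field names (edition_name, physical_format) that are not part of this code list:

    # Illustrative mapping from a few edition-type headings in the list
    # above to fields a book importer might set; headings not listed are
    # simply ignored. Field names here are assumptions, not ONIX terms.
    EDITION_TYPE_FIELDS = {
        'Revised': {'edition_name': 'Revised edition'},
        'Unabridged': {'edition_name': 'Unabridged edition'},
        'Facsimile': {'edition_name': 'Facsimile edition'},
        'Large type / large print': {'physical_format': 'Large print'},
    }

    def apply_edition_type(record, heading):
        """Merge any fields implied by an ONIX edition-type heading into a record dict."""
        record.update(EDITION_TYPE_FIELDS.get(heading, {}))
        return record
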
+ + + + + + + Language role code + + + + + Language of text + + + + + + Original language of a translated text + Where the text in the original language is NOT part of the current product + + + + + Language of abstracts + Where different from language of text: used mainly for serials + + + + + Rights language + Language to which specified rights apply + + + + + Rights-excluded language + Language to which specified rights do not apply + + + + + Original language in a multilingual edition + Where the text in the original language is part of a bilingual or multilingual edition + + + + + Translated language in a multilingual edition + Where the text in a translated language is part of a bilingual or multilingual edition + + + + + + + Extent type code + + + + + Number of words + Number of words of natural language text + + + + + Duration: use for running time + + + + + + Filesize + + + + + + + + Extent unit code + + + + + Words + Words of natural language text + + + + + Hours (integer and decimals) + + + + + + Minutes (integer and decimals) + + + + + + Seconds (integer only) + + + + + + Hours HHH + + + + + + Hours and minutes HHHMM + + + + + + Hours minutes seconds HHHMMSS + + + + + + Kbytes + + + + + + Mbytes + + + + + + + + Illustration and other content type code + + + + + Illustrations, black & white + + + + + + Illustrations, color + + + + + + Halftones, black & white + Including black & white photographs + + + + + Halftones, color + Including color photographs + + + + + Line drawings, black & white + + + + + + Line drawings, color + + + + + + Tables, black & white + + + + + + Tables, color + + + + + + Illustrations, unspecified + + + + + + Halftones, unspecified + Including photographs + + + + + Tables, unspecified + + + + + + Line drawings, unspecified + + + + + + Halftones, duotone + + + + + + Maps + + + + + + Frontispiece + + + + + + Diagrams + + + + + + Figures + + + + + + Charts + + + + + + Recorded music items + Recorded music extracts or examples, or complete recorded work(s), accompanying textual or other content + + + + + Printed music items + Printed music extracts or examples, or complete music score(s), accompanying textual or other content + + + + + Graphs + To be used in the mathematical sense of a diagram that represents numerical values plotted against an origin and axes, cf codes 16 and 18 + + + + + Plates, unspecified + ‘Plates’ means illustrations that are on separate pages bound into the body of a book + + + + + Plates, black & white + ‘Plates’ means illustrations that are on separate pages bound into the body of a book + + + + + Plates, color + ‘Plates’ means illustrations that are on separate pages bound into the body of a book + + + + + Index + + + + + + Bibliography + + + + + + + + Main subject scheme identifier code + + + + + UDC + Universal Decimal Classification. Code + + + + + BISAC category code + For information on BISAC subject categories, see http://www.bisg.org. Code + + + + + BIC subject category + For all BIC subject codes and qualifiers, see http://www.bic.org.uk/subcats.html. Code + + + + + Tabla de materias ISBN + Latin America. Code + + + + + Warengruppen-Systematik des deutschen Buchhandels + Code + + + + + Thèmes Electre + Subject classification used by Electre (France). Code + + + + + CLIL (France) + Code + + + + + DNB-Sachgruppen + Deutsche Bibliothek subject groups. Code + + + + + NUR + Nederlandstalige Uniforme Rubrieksindeling (Dutch book trade classification). 
Code + + + + + ECPA Christian Book Category + ECPA Christian Product Category Book Codes, consisting of up to three 3-letter blocks, for Super Category, Primary Category and Sub-Category. See http://www.ecpa.org/ECPA/cbacategories.xls. Code + + + + + Korean Decimal Classification (KDC) + A modified Dewey Decimal Classification used in the Republic of Korea. Code + + + + + DDC Deutsch + Code + + + + + Nippon Decimal Classification + Japanese subject classification scheme. Code + + + + + ANELE Materias + Spain: subject coding scheme of the Asociación Nacional de Editores de Libros y Material de Enseñanza. Code + + + + + + + Subject scheme identifier code + + + + + Dewey + Code + + + + + Abridged Dewey + Code + + + + + LC class number + Code + + + + + LC subject heading + Text + + + + + UDC + Universal Decimal Classification. Code + + + + + BISAC category code + For information on BISAC subject categories, see http://www.bisg.org. Code + + + + + BISAC region code + A geographical qualifier used with a BISAC subject category. Code + + + + + BIC subject category + For all BIC subject codes and qualifiers, see http://www.bic.org.uk/subcats.html. Code + + + + + BIC geographical qualifier + Code + + + + + BIC language qualifier (language as subject) + Code + + + + + BIC time period qualifier + Code + + + + + BIC educational purpose qualifier + Code + + + + + BIC reading level & special interest qualifier + Code + + + + + LC fiction genre heading + Text + + + + + Keywords + Text + + + + + BIC children’s book marketing category + See http://www.bic.org.uk/cbmc.html. Code + + + + + BISAC book merchandising code + For information on BISAC subject categories, see http://www.bisg.org. Code + + + + + Publisher’s own category code + Code + + + + + Proprietary subject scheme + Code + + + + + Tabla de materias ISBN + Latin America. Code + + + + + Warengruppen-Systematik des deutschen Buchhandels + Code + + + + + Schlagwort-Normdatei der Deutschen Bibliothek + Text + + + + + Thèmes Electre + Subject classification used by Electre (France). Code + + + + + CLIL + France. Code + + + + + DNB-Sachgruppen + Deutsche Bibliothek subject groups. Code + + + + + NUGI + Nederlandse Uniforme Genre-Indeling (former Dutch book trade classification). Code + + + + + NUR + Nederlandstalige Uniforme Rubrieksindeling (Dutch book trade classification, from 2002). Code + + + + + ECPA Christian Book Category + ECPA Christian Product Category Book Codes, consisting of up to three x 3-letter blocks, for Super Category, Primary Category and Sub-Category. See http://www.ecpa.org/ECPA/cbacategories.xls. Code + + + + + SISO + Schema Indeling Systematische Catalogus Openbare Bibliotheken (Dutch library classification). Code + + + + + Korean Decimal Classification (KDC) + A modified Dewey Decimal Classification used in the Republic of Korea. Code + + + + + DDC Deutsch + Code + + + + + Bokgrupper + Norwegian book trade product categories (4701) + + + + + Varegrupper + Norwegian bookselling subject categories (4702) + + + + + Læreplaner + Norwegian school curriculum version (4703) + + + + + Nippon Decimal Classification + Japanese subject classification scheme. Code + + + + + BSQ + BookSelling Qualifier: Russian book trade classification. Code + + + + + ANELE Materias + Spain: subject coding scheme of the Asociación Nacional de Editores de Libros y Material de Enseñanza. 
Code + + + + + Skolefag + Norwegian primary and secondary school subject categories (4705) + + + + + Videregående + Norwegian list of categories used in higher secondary education and vocational training (4706) + + + + + Undervisningsmateriell + Norwegian list of categories for books and other material used in education (4707) + + + + + Norsk DDK + Norwegian version of Dewey Decimal Classification + + + + + + + Audience code + + + + + General/trade + For a non-specialist adult audience + + + + + Children/juvenile + For a juvenile audience, not specifically for any educational purpose + + + + + Young adult + For a teenage audience, not specifically for any educational purpose + + + + + Primary & secondary/elementary & high school + Kindergarten, pre-school, primary/elementary or secondary/high school education + + + + + College/higher education + For universities and colleges of further and higher education + + + + + Professional and scholarly + For an expert adult audience, including academic research + + + + + ELT/ESL + Intended for use in teaching English as a second language + + + + + Adult education + For centres providing academic, vocational or recreational courses for adults + + + + + + + Audience code type + + + + + ONIX audience codes + Using List 28 + + + + + Proprietary + + + + + + MPAA rating + Motion Picture Association of America rating applied to movies + + + + + BBFC rating + British Board of Film Classification rating applied to movies + + + + + FSK rating + German FSK (Freiwillige Selbstkontrolle der Filmwirtschaft) rating applied to movies + + + + + BTLF audience code + French Canadian audience code list, used by BTLF for Memento + + + + + Electre audience code + Audience code used by Electre (France) + + + + + ANELE Tipo + Spain: educational audience and material type code of the Asociación Nacional de Editores de Libros y Material de Enseñanza + + + + + + + Audience range qualifier + + + + + US school grade range + Values for <AudienceRangeValue> are specified in List 77 + + + + + UK school grade + Values are defined by BIC for England & Wales, Scotland and N Ireland + + + + + Interest age, months + For use up to 30 months only: values in <AudienceRangeValue> must be integers + + + + + Interest age, years + Values in <AudienceRangeValue> must be integers + + + + + Reading age, years + Values in <AudienceRangeValue> must be integers + + + + + Spanish school grade + Spain: combined grade and region code, maintained by the Ministerio de Educación + + + + + Skoletrinn + Norwegian educational grades (4704) + + + + + + + Audience range precision + + + + + Exact + + + + + + From + + + + + + To + + + + + + + + Complexity scheme identifier + + + + + Lexile code + + + + + + Lexile number + + + + + + + + Other text type code + + + + + Main description + + + + + + Short description/annotation + Limited to a maximum of 350 characters + + + + + Long description + + + + + + Table of contents + Used for a table of contents sent as a single text field, which may or may not carry structure expressed through HTML etc. Alternatively, a fully structured table of contents may be sent by using the <ContentItem> composite. 
+ + + + + Review quote, restricted length + A review quote that is restricted to a maximum length agreed between the sender and receiver of an ONIX file + + + + + Quote from review of previous edition + A review quote taken from a review of a previous edition of the work + + + + + Review text + Full text of a review of the product + + + + + Review quote + A quote from a review of the product + + + + + Promotional “headline” + A promotional phrase which is intended to headline a description of the product + + + + + Previous review quote + A quote from a review of a previous work by the same author(s) or in the same series + + + + + Author comments + May be part of Reading Group Guide material + + + + + Description for reader + + + + + + Biographical note + A note referring to all contributors to a product – NOT linked to a single contributor + + + + + Description for Reading Group Guide + + + + + + Discussion question for Reading Group Guide + Each instance must carry a single question + + + + + Competing titles + Free text listing of other titles with which the product is in competition: although this text might not appear in “public” ONIX records, it could be required where ONIX Is used as a communication format within a group of publishing and distribution companies + + + + + Flap copy + + + + + + Back cover copy + + + + + + Feature + Text describing a feature of a product to which the publisher wishes to draw attention for promotional purposes. Each separate feature should be described by a separate repeat, so that formatting can be applied at the discretion of the receiver of the ONIX record. + + + + + New feature + As code 19, but used for a feature which is new in a new edition of the product. + + + + + Excerpt from book + + + + + + First chapter + + + + + + Description for sales people + + + + + + Description for press or other media + + + + + + Description for subsidiary rights department + + + + + + Description for teachers/educators + + + + + + Unpublished endorsement + A quote usually provided by a celebrity to promote a new book, not from a review + + + + + Description for bookstore + + + + + + Description for library + + + + + + Introduction or preface + + + + + + Full text + + + + + + + + Text format code + + + + + ASCII text + DEPRECATED: use code 06 or 07 as appropriate + + + + + SGML + + + + + + HTML + Other than XHTML + + + + + XML + Other than XHTML + + + + + PDF + DEPRECATED: was formerly assigned both to PDF and to XHTML + + + + + XHTML + + + + + + Default text format + Default: text in the encoding declared at the head of the message or in the XML default (UTF-8 or UTF-16) if there is no explicit declaration + + + + + Basic ASCII text + Plain text containing no tags of any kind, except for the tags &amp; and &lt; that XML insists must be used to represent ampersand and less-than characters in text; and with the character set limited to the ASCII range, i.e. 
valid UTF-8 characters whose character number lies between 32 (space) and 126 (tilde) + + + + + PDF + Replaces 04 for the <TextFormat> element, but cannot of course be used as a textformat attribute + + + + + + + Text link type code + + + + + URL + + + + + + DOI + + + + + + PURL + + + + + + URN + + + + + + FTP address + + + + + + filename + + + + + + + + Front cover image file format code + + + + + GIF + + + + + + JPEG + + + + + + TIF + + + + + + + + Front cover image file link type code + + + + + URL + + + + + + DOI + + + + + + PURL + + + + + + URN + + + + + + FTP address + + + + + + filename + + + + + + + + Image/audio/video file type code + + + + + Whole product + Link to a location where the whole product may be found – used for epublications + + + + + Software demo + + + + + + Front cover image + Quality unspecified: if sending both a standard quality and a high quality image, use 04 for standard quality and 06 for high quality + + + + + Front cover high quality image + + + + + + Front cover thumbnail + + + + + + Contributor image + + + + + + Series image + + + + + + Series logo + + + + + + Product logo + Use only for a logo which is specific to an individual product + + + + + Publisher logo + + + + + + Imprint logo + + + + + + Inside page image + + + + + + Video segment + + + + + + Audio segment + + + + + + + + Image/audio/video file format code + + + + + GIF + + + + + + JPEG + + + + + + PDF + + + + + + TIF + + + + + + RealAudio 28.8 + + + + + + MP3 + + + + + + MPEG-4 + MPEG-4 video file + + + + + + + Image/audio/video file link type + + + + + URL + + + + + + DOI + + + + + + PURL + + + + + + URN + + + + + + FTP address + + + + + + filename + + + + + + + + Prize or award achievement code + + + + + Winner + + + + + + Runner-up + Named as being in second place + + + + + Commended + + + + + + Short-listed + Nominated by the judging process to be one of the final 'short-list' from which the winner is selected + + + + + Long-listed + Nominated by the judging process to be one of the preliminary 'long-list' from which first a short-list and then the winner is selected + + + + + Joint winner + Or co-winner + + + + + + + Text item type code + + + + + Textual work + A complete work which is published as a content item in a product which carries two or more such works, eg when two or three novels are published in a single omnibus volume + + + + + Front matter + Text components such as Preface, Introduction etc which appear as preliminaries to the main body of text content in a product + + + + + Body matter + Text components such as Part, Chapter, Section etc which appear as part of the main body of text content in a product + + + + + Back matter + Text components such as Index which appear after the main body of text in a product + + + + + Serial item, miscellaneous or unspecified + For journals + + + + + Research article + For journals + + + + + Review article + For journals + + + + + Letter + For journals + + + + + Short communication + For journals + + + + + Erratum + For journals + + + + + Abstract + For journals + + + + + Book review (or review of other publication) + For journals + + + + + Editorial + For journals + + + + + Product review + For journals + + + + + Index + + + + + + Obituary + For journals + + + + + + + Text item identifier type code + + + + + Proprietary + For example, a publisher’s own identifier + + + + + DOI + + + + + + PII + + + + + + SICI + For serial items only + + + + + + + Name code type + + + + + Proprietary + + + + + + Deutsche Bibliothek publisher identifier + + 
+ + + + Börsenverein Verkehrsnummer + + + + + + German ISBN Agency publisher identifier + + + + + + EAN-UCC GLN + Global location number (formerly EAN location number) + + + + + SAN + Book trade Standard Address Number - US, UK etc + + + + + Centraal Boekhuis Relatie ID + Trading party identifier used in the Netherlands + + + + + + + Publishing role code + + + + + Publisher + + + + + + Co-publisher + + + + + + Sponsor + + + + + + Publisher of original-language version + Of a translated work + + + + + Host/distributor of electronic content + + + + + + Published for/on behalf of + + + + + + Published in association with + Use also for “Published in cooperation with” + + + + + Published on behalf of + DEPRECATED: use code 06 + + + + + New or acquiring publisher + When ownership of a product or title is transferred from one publisher to another + + + + + + + Sales rights type code + + + + + For sale with exclusive rights in the specified country/ies + + + + + + For sale with non-exclusive rights in the specified country/ies + + + + + + Not for sale in the specified country/ies + + + + + + + + Rights region + + + + + World + + + + + + World except territories specified elsewhere in rights statements + + + + + + UK airports + + + + + + UK 'open market' + Use when an open market edition is published under its own ISBN + + + + + + + Measure type code + + + + + Height + For a book, the spine height when standing on a shelf + + + + + Width + For a book, the horizontal dimension of the cover when standing upright + + + + + Thickness + For a book, the thickness of the spine + + + + + Page trim height + Not recommended for general use + + + + + Page trim width + Not recommended for general use + + + + + Unit weight + + + + + + Diameter + Of a globe, for example + + + + + + + Rights territory code + + + + + Australian Capital Territory + + + + + + New South Wales + + + + + + Northern Territory + + + + + + Queensland + + + + + + South Australia + + + + + + Tasmania + + + + + + Victoria + + + + + + Western Australia + + + + + + Alberta + + + + + + British Columbia + + + + + + Manitoba + + + + + + New Brunswick + + + + + + Newfoundland and Labrador + + + + + + Nova Scotia + + + + + + Northwest Territories + + + + + + Nunavut + + + + + + Ontario + + + + + + Prince Edward Island + + + + + + Quebec + + + + + + Saskatchewan + + + + + + Yukon Territory + + + + + + Canary Islands + + + + + + UK airside + Airside outlets at UK international airports only + + + + + UK airports + All UK airports, including both airside and other outlets + + + + + Channel Islands + + + + + + England + + + + + + England, Wales, Scotland + UK excluding Northern Ireland + + + + + Isle of Man + + + + + + Northern Ireland + + + + + + Scotland + + + + + + Wales + + + + + + Rest of world + World except as otherwise specified + + + + + World + + + + + + + + Measure unit code + + + + + Centimeters + + + + + + Grams + + + + + + Inches (US) + + + + + + Pounds (US) + + + + + + Millimeters + + + + + + Ounces (US) + + + + + + + + Relation code + + + + + Includes + X includes Y (where the product described in the ONIX record is X and the related product is Y) + + + + + Is part of + X is part of Y – use for 'also available as part of' + + + + + Replaces + X replaces Y + + + + + Replaced by + X is replaced by Y  + + + + + Alternative format + X is available in an alternative format as Y – indicates an alternative format of the same content which is or may be available. 
+   Has ancillary product – X has an ancillary or supplementary product Y
+   Is ancillary to – X is ancillary or supplementary to Y
+   Is remaindered as – X is remaindered as Y, when a remainder merchant assigns its own identifier to the product
+   Is remainder of – X was originally sold as Y, indicating the publisher's original identifier for a title which is offered as a remainder under a different identifier
+   Is other-language version of – X is an other-language version of Y
+   Publisher's suggested alternative – X has a publisher's suggested alternative Y, which does not, however, carry the same content (cf 05 and 06)
+   Epublication based on (print product) – X is an epublication based on printed product Y
+   Epublication is distributed as – X is an epublication 'rendered' as Y: use when the ONIX record describes a package of electronic content which is available in multiple 'renderings'
+   Epublication is a rendering of – X is a 'rendering' of an epublication Y: use when the ONIX record describes a specific rendering of an epublication content package, to identify the package
+   POD replacement for – X is a POD replacement for Y; Y is an out-of-print product replaced by a print-on-demand version under a new ISBN
+   Replaced by POD – X is replaced by POD Y; Y is a print-on-demand replacement, under a new ISBN, for an out-of-print product X
+
+ Supply-to region code
+   UK 'open market' – When the same ISBN is used for open market and UK editions
+
+ Returns conditions code type
+   French book trade returns conditions code – Maintained by CLIL (Commission Interprofessionnel du Livre)
+   BISAC Returnable Indicator code – Maintained by BISAC: see List 66
+   UK book trade returns conditions code – NOT CURRENTLY USED - BIC has decided that it will not maintain a code list for this purpose, since returns conditions are usually at least partly based on the trading relationship
+
+ Availability status code
+   Cancelled – Publication abandoned after having been announced
+   Available direct from publisher only – Apply direct to publisher, item not available to trade
+   Availability uncertain – Check with customer service
+   No longer stocked by us – Wholesaler or vendor only
+   Available – In-print and in stock
+   Manufactured on demand – May be accompanied by an estimated average time to supply
+   Not yet published – MUST be accompanied by an expected availability date
+   Newly catalogued, not yet in stock – Wholesaler or vendor only: MUST be accompanied by expected availability date
+   Other format available – This format is out of print, but another format is available: should be accompanied by an identifier for the alternative product
+   Out of stock indefinitely – No current plan to reprint
+   Out of print – Discontinued, deleted from catalogue
+   Replaced by new edition – This edition is out of print, but a new edition has been or will soon be published: should be accompanied by an identifier for the new edition
+   Publication postponed indefinitely – Publication has been announced, and subsequently postponed with no new date
+   Refer to another supplier – Supply of this item has been transferred to another publisher or distributor: should be accompanied by an identifier for the new supplier
+   Remaindered
+   Reprinting – MUST be accompanied by an expected availability date
+   Reprinting, undated – Use instead of RP as a last resort, only if it is really impossible to give an expected availability date
+   Special order – This item is not stocked but has to be specially ordered from a supplier (eg import item not stocked locally): may be accompanied by an estimated average time to supply
+   Temporarily out of stock because publisher cannot supply – Wholesaler or vendor only
+   Temporarily unavailable – MUST be accompanied by an expected availability date
+   Unavailable, awaiting reissue – The item is out of stock but will be reissued under the same ISBN: MUST be accompanied by an expected availability date and by the reissue date in the <Reissue> composite. See notes on the <Reissue> composite for details on treatment of availability status during reissue.
+   Will be remaindered as of (date) – MUST be accompanied by the remainder date
+   Withdrawn from sale – Typically, withdrawn indefinitely for legal reasons
+
+ Date format
+   YYYYMMDD – Year month day (default)
+   YYYYMM – Year and month
+   YYYYWW – Year and week number
+   YYYYQ – Year and quarter (Q = 1, 2, 3, 4)
+   YYYYS – Year and season (S = 1, 2, 3, 4, with 1 = "Spring")
+   YYYY – Year
+   YYYYMMDDYYYYMMDD – Spread of exact dates
+   YYYYMMYYYYMM – Spread of months
+   YYYYWWYYYYWW – Spread of week numbers
+   YYYYQYYYYQ – Spread of quarters
+   YYYYSYYYYS – Spread of seasons
+   YYYYYYYY – Spread of years
+   Text string – For complex, approximate or uncertain dates
+
+ Audience restriction flag
+   Restrictions apply, see note
+   Indiziert – Indexed for the German market (in Deutschland indiziert)
+
+ Unpriced item type code
+   Free of charge
+   Price to be announced
+   Not sold separately
+   Contact publisher or supplier – May be used for books that do not carry a recommended retail price, when an ONIX file is "broadcast" rather than sent one-to-one to a single trading partner
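
The Date format list above gives the patterns used to interpret ONIX date fields. A minimal, illustrative sketch of reducing a single-value date to a year or ISO-style string (spreads and free-text dates are passed through unchanged; the function name is an assumption, not part of ONIX):

    def normalize_onix_date(value, pattern):
        # Single-value patterns from the list above; anything else
        # (spreads, 'Text string' dates) is returned untouched.
        if pattern == 'YYYYMMDD' and len(value) == 8:
            return '%s-%s-%s' % (value[0:4], value[4:6], value[6:8])
        if pattern == 'YYYYMM' and len(value) == 6:
            return '%s-%s' % (value[0:4], value[4:6])
        if pattern in ('YYYYWW', 'YYYYQ', 'YYYYS', 'YYYY'):
            return value[0:4]   # keep just the year
        return value

    normalize_onix_date('20031122', 'YYYYMMDD')   # -> '2003-11-22'
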
+ Price type code
+   RRP excluding any sales tax or value-added tax
+   RRP including sales or value-added tax if applicable
+   Fixed retail price excluding tax – In countries where retail price maintenance applies by law to certain products: not used in USA
+   Fixed retail price including tax – In countries where retail price maintenance applies by law to certain products: not used in USA
+   Supplier's unit cost price excluding any sales tax or value-added tax: goods for retail sale – In North America, this may be referred to as 'net price'
+   Supplier's unit cost price excluding any sales tax or value-added tax: rental goods – Used for video and DVD
+   Special sale RRP excluding any sales tax or value-added tax
+   Special sale RRP including sales or value-added tax if applicable
+   Special sale fixed retail price excluding tax – In countries where retail price maintenance applies by law to certain products: not used in USA
+   Special sale fixed retail price including tax – In countries where retail price maintenance applies by law to certain products: not used in USA
+   Supplier's unit cost price for special sale excluding any sales tax or value-added tax
+   Pre-publication RRP excluding any sales tax or value-added tax
+   Pre-publication RRP including sales or value-added tax if applicable
+   Pre-publication fixed retail price excluding tax – In countries where retail price maintenance applies by law to certain products: not used in USA
+   Pre-publication fixed retail price including tax – In countries where retail price maintenance applies by law to certain products: not used in USA
+   Supplier's pre-publication unit cost price excluding tax
+   Freight-pass-through RRP excluding tax – In the US, books are sometimes supplied on 'freight-pass-through' terms, where a price that is different from the RRP is used as the basis for calculating the supplier's charge to a reseller. To make it clear when such terms are being invoked, code 31 is used instead of code 01 to indicate the RRP. Code 32 is used for the 'billing price'.
+   Freight-pass-through billing price excluding tax – When freight-pass-through terms apply, the price on which the supplier's charge to a reseller is calculated, ie the price to which trade discount terms are applied. See also code 31.
+
+ Price type qualifier
+   Member/subscriber price – Price applies to a designated group membership
+   Export price – Price applies to sales outside the territory in which the supplier is located
+   Reduced price applicable when the item is purchased as part of a set – Use in cases where there is no combined set price, but a lower price is offered for each part if the whole set is purchased
+   Voucher price – In the Netherlands (or any other market where similar arrangements exist): a reduced fixed price available for a limited time on presentation of a voucher published in a specified medium, eg a newspaper. Should be accompanied by <PriceTypeCode> 13 and additional detail in <PriceTypeDescription>, and by validity dates in <PriceEffectiveFrom> and <PriceEffectiveUntil>.
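
The Voucher price note above encodes a concrete constraint: a price qualified as a voucher price should carry <PriceTypeCode> 13, a description, and effective-from/until dates. A minimal sketch of that check, assuming a flat dict whose keys merely mirror the ONIX element names (the record shape itself is an assumption):

    def check_voucher_price(price):
        # 'price' is a dict for a price already qualified as a voucher price.
        problems = []
        if price.get('PriceTypeCode') != '13':
            problems.append('voucher prices should carry PriceTypeCode 13')
        for key in ('PriceTypeDescription', 'PriceEffectiveFrom', 'PriceEffectiveUntil'):
            if not price.get(key):
                problems.append('missing %s' % key)
        return problems

    check_voucher_price({'PriceTypeCode': '13', 'PriceTypeDescription': 'newspaper voucher offer'})
    # -> ['missing PriceEffectiveFrom', 'missing PriceEffectiveUntil']
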
+ + + + + + + Unit of pricing code + + + + + Per copy of whole product + Default + + + + + Per page for printed loose-leaf content only + + + + + + + + Price status code + + + + + Unspecified + Default + + + + + Provisional + + + + + + Firm + + + + + + + + Tax rate, coded + + + + + Higher rate + Specifies that tax is applied at a higher rate than standard + + + + + Lower rate + Specifies that tax is applied at a lower rate than standard + + + + + Standard rate + + + + + + Zero-rated + + + + + + + + Intermediary supplier availability + + + + + + Publishing status + + + + + Unspecified + Status is not specified (as distinct from unknown): the default if the <PublishingStatus> element is not sent. Also to be used in applications where the element is considered mandatory, but the sender of the ONIX message chooses not to pass on status information. + + + + + Cancelled + The product was announced, and subsequently abandoned; the <PublicationDate> element must not be sent. + + + + + Forthcoming + Not yet published, must be accompanied by expected date in <PublicationDate>. + + + + + Postponed indefinitely + The product was announced, and subsequently postponed with no expected publication date; the<Publication Date> element must not be sent. + + + + + Active + The product was published, and is still active in the sense that the publisher will accept orders for it, though it may or may not be immediately available, for which see <SupplyDetail>. + + + + + No longer our product + Ownership of the product has been transferred to another publisher (with details of acquiring publisher if possible in PR.19). + + + + + Out of stock indefinitely + The product was active, but is now inactive in the sense that (a) the publisher will not accept orders for it, though stock may still be available elsewhere in the supply chain, and (b) there are no current plans to bring it back into stock. Code 06 does not specifically imply that returns are or are not still accepted. + + + + + Out of print + The product was active, but is now permanently inactive in the sense that (a) the publisher will not accept orders for it, though stock may still be available elsewhere in the supply chain, and (b) the product will not be made available again under the same ISBN. Code 07 normally implies that the publisher will not accept returns beyond a specified date. + + + + + Inactive + The product was active, but is now permanently or indefinitely inactive in the sense that the publisher will not accept orders for it, though stock may still be available elsewhere in the supply chain. Code 08 covers both of codes 06 and 07, and may be used where the distinction between those values is either unnecessary or meaningless. + + + + + Unknown + The sender of the ONIX record does not know the current publishing status. + + + + + Remaindered + The product is no longer available from the current publisher, under the current ISBN, at the current price. It may be available to be traded through another channel. A Publishing Status code 10 'Remaindered' usually but not always means that the publisher has decided to sell off excess inventory of the book. Copies of books that are remaindered are often made available in the supply chain at a reduced price. However, such remainders are often sold under a product identifier that differs from the ISBN on the full-priced copy of the book. 
A Publishing Status code 10 'Remaindered' on a given product record may or may not be followed by a Publishing Status code 06 'Out of Stock Indefinitely' or 07 'Out of Print': the practise varies from one publisher to another. Some publishers may revert to a Publishing Status code 04 “Active” if a desired inventory level on the product in question has subsequently been reached. No change in rights should ever be inferred from this (or any other) Publishing Status code value. + + + + + + + Product availability + + + + + Cancelled + Cancelled: product was announced, and subsequently abandoned + + + + + Not yet available + Not yet available (requires <ExpectedShipDate>, except in exceptional circumstances where no date is known) + + + + + Awaiting stock + Not yet available, but will be a stock item when available (requires <ExpectedShipDate>, except in exceptional circumstances where no date is known). Used particularly for imports which have been published in the country of origin but have not yet arrived in the importing country. + + + + + Not yet available, will be POD + Not yet available, to be published as print-on-demand only. May apply either to a POD successor to an existing conventional edition, when the successor will be published under a different ISBN (normally because different trade terms apply); or to a title that is being published as a POD original. + + + + + Available + Available from us (form of availability unspecified) + + + + + In stock + Available from us as a stock item + + + + + To order + Available from us as a non-stock item, by special order + + + + + Manufactured on demand + Available from us by manufacture on demand + + + + + Temporarily unavailable + Temporarily unavailable: temporarily unavailable from us (reason unspecified) (requires <ExpectedShipDate>, except in exceptional circumstances where no date is known) + + + + + Out of stock + Stock item, temporarily out of stock (requires <ExpectedShipDate>, except in exceptional circumstances where no date is known) + + + + + Reprinting + Temporarily unavailable, reprinting (requires <ExpectedShipDate>, except in exceptional circumstances where no date is known) + + + + + Awaiting reissue + Temporarily unavailable, awaiting reissue (requires the <Reissue> composite, and <ExpectedShipDate>, except in exceptional circumstances where no date is known) + + + + + Not available + Not available from us (reason unspecified; if the reason is rights-related, it should be specified in PR.21) + + + + + Replaced by new product + This product is unavailable, but a successor product or edition is or will be available from us (identify successor in <RelatedProduct>) + + + + + Other format available + This product is unavailable, but the same content is or will be available from us in an alternative format (identify other format product in <RelatedProduct>) + + + + + No longer supplied by us + Identify new supplier in <NewSupplier> if possible + + + + + Apply direct + Not available to trade, apply direct to publisher + + + + + Not sold separately + Must be bought as part of a set (identify set in <RelatedProduct>) + + + + + Withdrawn from sale + May be for legal reasons or to avoid giving offence + + + + + Remaindered + Remaindered + + + + + Out of print, replaced by POD + Out of print, but a print-on-demand edition is or will be available under a different ISBN. Use only when the POD successor has a different ISBN, normally because different trade terms apply. 
+ + + + + Uncertain + Apply to customer service + + + + + + + BISAC returnable indicator + + + + + Yes, returnable, full copies only + + + + + + No, not returnable + + + + + + Conditional + Contact publisher for requirements and/or authorization + + + + + Yes, returnable, stripped cover + + + + + + + + Market date role + + + + + Local publication date + The date on which the product is expected to be first published in this market, or – after publication – the date on which it was published in this market. There may or may not be a strict embargo on retail sales before the expected date; if there is, it should be specified separately as an embargo date. + + + + + Consumer on sale date / embargo date + If there is an embargo on retail sales in this market before a certain date, the date from which the embargo is lifted and retail sales are permitted + + + + + + + Market publishing status + + + + + Unspecified + Status is not specified (as distinct from unknown): the default if the <MarketPublishingStatus> element is not sent. + + + + + Cancelled + The product was announced for publication in this market, and subsequently abandoned. + + + + + Forthcoming + Not yet published in this market, should be accompanied by expected local publication date.. + + + + + Postponed indefinitely + The product was announced for publication in this market, and subsequently postponed with no expected local publication date. + + + + + Active + The product was published in this market, and is still active in the sense that the publisher will accept orders for it, though it may or may not be immediately available, for which see <SupplyDetail>. + + + + + No longer our product + Responsibility for the product in this market has been transferred elsewhere. + + + + + Out of stock indefinitely + The product was active, but is now inactive in the sense that (a) no further stock is expected to be made available in this market, though stock may still be available elsewhere in the supply chain, and (b) there are no current plans to bring it back into stock. + + + + + Out of print + The product was active, but is now permanently inactive in the sense that (a) no further stock is expected to be made available in this market, though stock may still be available elsewhere in the supply chain, and (b) the product will not be made available again under the same ISBN. + + + + + Inactive + The product was active, but is now permanently or indefinitely inactive in the sense that no further stock is expected to be made available in this market, though stock may still be available elsewhere in the supply chain. Code 08 covers both of codes 06 and 07, and may be used where the distinction between those values is either unnecessary or meaningless. + + + + + Unknown + The sender of the ONIX record does not know the current publishing status in this market. + + + + + Remaindered + The product is no longer available in this market from the local publisher, under the current ISBN, at the current price. It may be available to be traded through another channel, usually at a reduced price. 
+ + + + + Withdrawn from sale + Withdrawn from sale in this market, typically for legal reasons + + + + + Not available in this market + Either no rights are held for the product in this market, or for other reasons the publisher has decided not to make it available in this market + + + + + Active, but not sold separately + The product is published in this market and active but, as a publishing decision, it is not sold separately – only in an assembly or as part of a package + + + + + Active, with market restrictions + The product is published in this market and active, but is not available to all customer types, typically because the market is split between exclusive sales agents for different market segments. Should be accompanied by a free-text statement in <MarketRestrictionDetail> describing the nature of the restriction. + + + + + + + Agent role + + + + + Exclusive sales agent + Publisher's exclusive sales agent in a specified territory + + + + + Non-exclusive sales agent + Publisher's non-exclusive sales agent in a specified territory + + + + + Local publisher + Publisher for a specified territory + + + + + + + Stock quantity code type + + + + + Proprietary + + + + + + APA stock quantity code + Code scheme defined by the Australian Publishers Association + + + + + + + Sales restriction type code + + + + + Unspecified - see text + Restriction must be described in <SalesRestrictionDetail> + + + + + Retailer exclusive / own brand + For sale only through designated retailer. Retailer must be named in <SalesOutletName>. Use only when it is not possible to assign the more explicit code 04 or 05. + + + + + Office supplies edition + For editions sold only though office supplies wholesalers. Retailer(s) and/or distributor(s) may be named in <SalesOutletName> + + + + + Internal publisher use only: do not list + For an ISBN that is assigned for a publisher's internal purposes + + + + + Retailer exclusive + For sale only through designated retailer, though not under retailer's own brand/imprint. Retailer must be named in <SalesOutletName>. + + + + + Retailer own brand + For sale only through designated retailer under retailer's own brand/imprint. Retailer must be named in <SalesOutletName>. + + + + + Library edition + For sale to libraries only; not for sale through retail trade + + + + + + + Thesis type code + + + + + Habilitationsschrift + Professorial dissertation (thesis for postdoctoral lecturing qualification) + + + + + Dissertationsschrift + Doctoral thesis + + + + + Staatsexamensarbeit + State examination thesis + + + + + Magisterarbeit + Masters degree thesis + + + + + Diplomarbeit + Diploma thesis + + + + + + + Website role + + + + + Unspecified, see website description + + + + + + Publisher’s corporate website + See also codes 17 and 18 + + + + + Publisher’s website for a specified work + A publisher’s informative and/or promotional webpage relating to a specified work (book, journal, online resource or other publication type) + + + + + Online hosting service home page + A webpage giving access to an online content hosting service as a whole + + + + + Journal home page + A webpage giving general information about a journal title, in print or electronic format or both. + + + + + Online journal 'available contents' page + A webpage giving direct access to the content that is available online for a specified journal title version. 
+ + + + + Contributor’s own website + A webpage maintained by an author or other contributor about her/his publications and personal background + + + + + Publisher’s website relating to specified contributor + A publisher’s webpage devoted to a specific author or other contributor + + + + + Other publisher’s website relating to specified contributor + A webpage devoted to a specific author or other contributor, and maintained by a publisher other than the publisher of the item described in the ONIX record + + + + + Third-party website relating to specified contributor + A webpage devoted to a specific author or other contributor, and maintained by a third party (eg a fan site) + + + + + Contributor’s own website for specified work + A webpage maintained by an author or other contributor and specific to an individual work + + + + + Other publisher’s website relating to specified work + A webpage devoted to an individual work, and maintained by a publisher other than the publisher of the item described in the ONIX record + + + + + Third-party website relating to specified work + A webpage devoted to an individual work, and maintained by a third party (eg a fan site) + + + + + Contributor’s own website for group or series of works + A webpage maintained by an author or other contributor and specific to a group or series of works + + + + + Publisher’s website relating to group or series of works + A publisher’s webpage devoted to a group or series of works + + + + + Other publisher’s website relating to group or series of works + A webpage devoted to a group or series of works, and maintained by a publisher other than the publisher of the item described in the ONIX record + + + + + Third-party website relating to group or series of works (eg a fan site) + A webpage devoted to a group or series of works, and maintained by a third party (eg a fan site) + + + + + Publisher’s B2B website + Use instead of code 01 to specify a publisher’s website for trade users + + + + + Publisher’s B2C website + Use instead of code 01 to specify a publisher’s website for consumers + + + + + + + Language code - ISO 639-2/B + + + + + Afar + + + + + + Abkhaz + + + + + + Achinese + + + + + + Acoli + + + + + + Adangme + + + + + + Adygei + + + + + + Afroasiatic (Other) + + + + + + Afrihili (Artificial language) + + + + + + Afrikaans + + + + + + Ainu + + + + + + Akan + + + + + + Akkadian + + + + + + Albanian + + + + + + Aleut + + + + + + Algonquian (Other) + + + + + + Southern Altai + + + + + + Amharic + + + + + + English, Old (ca. 
450-1100) + + + + + + Apache languages + + + + + + Arabic + + + + + + Aramaic + + + + + + Aragonese Spanish + + + + + + Armenian + + + + + + Mapuche + + + + + + Arapaho + + + + + + Artificial (Other) + + + + + + Arawak + + + + + + Assamese + + + + + + Bable + + + + + + Athapascan (Other) + + + + + + Australian languages + + + + + + Avaric + + + + + + Avestan + + + + + + Awadhi + + + + + + Aymara + + + + + + Azerbaijani + + + + + + Banda + + + + + + Bamileke languages + + + + + + Bashkir + + + + + + Baluchi + + + + + + Bambara + + + + + + Balinese + + + + + + Basque + + + + + + Basa + + + + + + Baltic (Other) + + + + + + Beja + + + + + + Belarusian + + + + + + Bemba + + + + + + Bengali + + + + + + Berber (Other) + + + + + + Bhojpuri + + + + + + Bihari + + + + + + Bikol + + + + + + Bini + + + + + + Bislama + + + + + + Siksika + + + + + + Bantu (Other) + + + + + + Bosnian + + + + + + Braj + + + + + + Breton + + + + + + Batak + + + + + + Buriat + + + + + + Bugis + + + + + + Bulgarian + + + + + + Burmese + + + + + + Blin; Bilin + + + + + + Caddo + + + + + + Central American Indian (Other) + + + + + + Carib + + + + + + Catalan + + + + + + Caucasian (Other) + + + + + + Cebuano + + + + + + Celtic (Other) + + + + + + Chamorro + + + + + + Chibcha + + + + + + Chechen + + + + + + Chagatai + + + + + + Chinese + + + + + + Truk + + + + + + Mari + + + + + + Chinook jargon + + + + + + Choctaw + + + + + + Chipewyan + + + + + + Cherokee + + + + + + Church Slavic + + + + + + Chuvash + + + + + + Cheyenne + + + + + + Chamic languages + + + + + + Coptic + + + + + + Cornish + + + + + + Corsican + + + + + + Creoles and Pidgins, English-based (Other) + + + + + + Creoles and Pidgins, French-based (Other) + + + + + + Creoles and Pidgins, Portuguese-based (Other) + + + + + + Cree + + + + + + Crimean Turkish; Crimean Tatar + + + + + + Creoles and Pidgins (Other) + + + + + + Kashubian + + + + + + Cushitic (Other) + + + + + + Czech + + + + + + Dakota + + + + + + Danish + + + + + + Dargwa + + + + + + Dayak + + + + + + Delaware + + + + + + Slave + + + + + + Dogrib + + + + + + Dinka + + + + + + Divehi + + + + + + Dogri + + + + + + Dravidian (Other) + + + + + + Lower Sorbian + + + + + + Duala + + + + + + Dutch, Middle (ca. 1050-1350) + + + + + + Dutch + + + + + + Dyula + + + + + + Dzongkha + + + + + + Efik + + + + + + Egyptian + + + + + + Ekajuk + + + + + + Elamite + + + + + + English + + + + + + English, Middle (1100-1500) + + + + + + Esperanto + + + + + + Estonian + + + + + + Ewe + + + + + + Ewondo + + + + + + Fang + + + + + + Faroese + + + + + + Fanti + + + + + + Fijian + + + + + + Filipino; Pilipino + + + + + + Finnish + + + + + + Finno-Ugrian (Other) + + + + + + Fon + + + + + + French + + + + + + French, Middle (ca. 1400-1600) + + + + + + French, Old (ca. 842-1400) + + + + + + Frisian + + + + + + Fula + + + + + + Friulian + + + + + + + + + + + + Gayo + + + + + + Gbaya + + + + + + Germanic (Other) + + + + + + Georgian + + + + + + German + + + + + + Ethiopic + + + + + + Gilbertese + + + + + + Scottish Gaelic + + + + + + Irish + + + + + + Galician + + + + + + Manx + + + + + + German, Middle High (ca. 1050-1500) + + + + + + German, Old High (ca. 
750-1050) + + + + + + Gondi + + + + + + Gorontalo + + + + + + Gothic + + + + + + Grebo + + + + + + Greek, Ancient (to 1453) + + + + + + Greek, Modern (1453-) + + + + + + Guarani + + + + + + Gujarati + + + + + + Gwich'in + + + + + + Haida + + + + + + Haitian French Creole + + + + + + Hausa + + + + + + Hawaiian + + + + + + Hebrew + + + + + + Herero + + + + + + Hiligaynon + + + + + + Himachali + + + + + + Hindi + + + + + + Hittite + + + + + + Hmong + + + + + + Hiri Motu + + + + + + Upper Sorbian + + + + + + Hungarian + + + + + + Hupa + + + + + + Iban + + + + + + Igbo + + + + + + Icelandic + + + + + + Ido + + + + + + Sichuan Yi + + + + + + Ijo + + + + + + Inuktitut + + + + + + Interlingue + + + + + + Iloko + + + + + + Interlingua (International Auxiliary Language Association) + + + + + + Indic (Other) + + + + + + Indonesian + + + + + + Indo-European (Other) + + + + + + Ingush + + + + + + Inupiaq + + + + + + Iranian (Other) + + + + + + Iroquoian (Other) + + + + + + Italian + + + + + + Javanese + + + + + + Lojban + + + + + + Japanese + + + + + + Judeo-Persian + + + + + + Judeo-Arabic + + + + + + Kara-Kalpak + + + + + + Kabyle + + + + + + Kachin + + + + + + Kalâtdlisut + + + + + + Kamba + + + + + + Kannada + + + + + + Karen + + + + + + Kashmiri + + + + + + Kanuri + + + + + + Kawi + + + + + + Kazakh + + + + + + Kabardian + + + + + + Khasi + + + + + + Khoisan (Other) + + + + + + Khmer + + + + + + Khotanese + + + + + + Kikuyu + + + + + + Kinyarwanda + + + + + + Kyrgyz + + + + + + Kimbundu + + + + + + Konkani + + + + + + Komi + + + + + + Kongo + + + + + + Korean + + + + + + Kusaie + + + + + + Kpelle + + + + + + Karachay-Balkar + + + + + + Kru + + + + + + Kurukh + + + + + + Kuanyama + + + + + + Kumyk + + + + + + Kurdish + + + + + + Kutenai + + + + + + Ladino + + + + + + Lahnda + + + + + + Lamba + + + + + + Lao + + + + + + Latin + + + + + + Latvian + + + + + + Lezgian + + + + + + Limburgish + + + + + + Lingala + + + + + + Lithuanian + + + + + + Mongo-Nkundu + + + + + + Lozi + + + + + + Letzeburgesch + + + + + + Luba-Lulua + + + + + + Luba-Katanga + + + + + + Ganda + + + + + + Luiseño + + + + + + Lunda + + + + + + Luo (Kenya and Tanzania) + + + + + + Lushai + + + + + + Macedonian + + + + + + Madurese + + + + + + Magahi + + + + + + Marshall + + + + + + Maithili + + + + + + Makasar + + + + + + Malayalam + + + + + + Mandingo + + + + + + Maori + + + + + + Austronesian (Other) + + + + + + Marathi + + + + + + Masai + + + + + + Malay + + + + + + Moksha + + + + + + Mandar + + + + + + Mende + + + + + + Irish, Middle (ca. 
1100-1550) + + + + + + Micmac + + + + + + Minangkabau + + + + + + Miscellaneous languages + + + + + + Mon-Khmer (Other) + + + + + + Malagasy + + + + + + Maltese + + + + + + Manchu + + + + + + Manipuri + + + + + + Manobo languages + + + + + + Mohawk + + + + + + Moldavian + + + + + + Mongolian + + + + + + Mooré + + + + + + Multiple languages + + + + + + Munda (Other) + + + + + + Creek + + + + + + Mirandese + + + + + + Marwari + + + + + + Mayan languages + + + + + + Erzya + + + + + + Nahuatl + + + + + + North American Indian (Other) + + + + + + Neapolitan Italian + + + + + + Nauru + + + + + + Navajo + + + + + + Ndebele (South Africa) + + + + + + Ndebele (Zimbabwe) + + + + + + Ndonga + + + + + + Low German + + + + + + Nepali + + + + + + Newari + + + + + + Nias + + + + + + Niger-Kordofanian (Other) + + + + + + Niuean + + + + + + Norwegian Nynorsk + + + + + + Norwegian Bokmål + + + + + + Nogai + + + + + + Old Norse + + + + + + Norwegian + + + + + + Northern Sotho + + + + + + Nubian languages + + + + + + Classical Newari; Old Newari + + + + + + Nyanja + + + + + + Nyamwezi + + + + + + Nyankole + + + + + + Nyoro + + + + + + Nzima + + + + + + Occitan (post-1500) + + + + + + Ojibwa + + + + + + Oriya + + + + + + Oromo + + + + + + Osage + + + + + + Ossetic + + + + + + Turkish, Ottoman + + + + + + Otomian languages + + + + + + Papuan (Other) + + + + + + Pangasinan + + + + + + Pahlavi + + + + + + Pampanga + + + + + + Panjabi + + + + + + Papiamento + + + + + + Palauan + + + + + + Old Persian (ca. 600-400 B.C.) + + + + + + Persian + + + + + + Philippine (Other) + + + + + + Phoenician + + + + + + Pali + + + + + + Polish + + + + + + Ponape + + + + + + Portuguese + + + + + + Prakrit languages + + + + + + Provençal (to 1500) + + + + + + Pushto + + + + + + Aranés + ONIX local code + + + + + Valencian + ONIX local code + + + + + Quechua + + + + + + Rajasthani + + + + + + Rapanui + + + + + + Rarotongan + + + + + + Romance (Other) + + + + + + Raeto-Romance + + + + + + Romany + + + + + + Romanian + + + + + + Rundi + + + + + + Aromanian; Arumanian; Macedo-Romanian + + + + + + Russian + + + + + + Sandawe + + + + + + Sango + + + + + + Yakut + + + + + + South American Indian (Other) + + + + + + Salishan languages + + + + + + Samaritan Aramaic + + + + + + Sanskrit + + + + + + Sasak + + + + + + Santali + + + + + + Serbian + + + + + + Sicilian + + + + + + Scots + + + + + + Croatian + + + + + + Selkup + + + + + + Semitic (Other) + + + + + + Irish, Old (to 1100) + + + + + + Sign languages + + + + + + Shan + + + + + + Sidamo + + + + + + Sinhalese + + + + + + Siouan (Other) + + + + + + Sino-Tibetan (Other) + + + + + + Slavic (Other) + + + + + + Slovak + + + + + + Slovenian + + + + + + Southern Sami + + + + + + Northern Sami + + + + + + Sami + + + + + + Lule Sami + + + + + + Inari Sami + + + + + + Samoan + + + + + + Skolt Sami + + + + + + Shona + + + + + + Sindhi + + + + + + Soninke + + + + + + Sogdian + + + + + + Somali + + + + + + Songhai + + + + + + Sotho + + + + + + Spanish + + + + + + Sardinian + + + + + + Serer + + + + + + Nilo-Saharan (Other) + + + + + + Swazi + + + + + + Sukuma + + + + + + Sundanese + + + + + + Susu + + + + + + Sumerian + + + + + + Swahili + + + + + + Swedish + + + + + + Syriac + + + + + + Tahitian + + + + + + Tai (Other) + + + + + + Tamil + + + + + + Tatar + + + + + + Telugu + + + + + + Temne + + + + + + Terena + + + + + + Tetum + + + + + + Tajik + + + + + + Tagalog + + + + + + Thai + + + + + + Tibetan + + + + + + Tigré + + + + + + Tigrinya + + + + + + Tiv + + + + + + Tokelauan + + + + + + Klingon; 
tlhIngan-Hol + + + + + + Tlingit + + + + + + Tamashek + + + + + + Tonga (Nyasa) + + + + + + Tongan + + + + + + Tok Pisin + + + + + + Tsimshian + + + + + + Tswana + + + + + + Tsonga + + + + + + Turkmen + + + + + + Tumbuka + + + + + + Tupi languages + + + + + + Turkish + + + + + + Altaic (Other) + + + + + + Tuvaluan + + + + + + Twi + + + + + + Tuvinian + + + + + + Udmurt + + + + + + Ugaritic + + + + + + Uighur + + + + + + Ukrainian + + + + + + Umbundu + + + + + + Undetermined + + + + + + Urdu + + + + + + Uzbek + + + + + + Vai + + + + + + Venda + + + + + + Vietnamese + + + + + + Volapük + + + + + + Votic + + + + + + Wakashan languages + + + + + + Walamo + + + + + + Waray + + + + + + Washo + + + + + + Welsh + + + + + + Sorbian languages + + + + + + Walloon + + + + + + Wolof + + + + + + Kalmyk + + + + + + Xhosa + + + + + + Yao + + + + + + Yapese + + + + + + Yiddish + + + + + + Yoruba + + + + + + Yupik languages + + + + + + Zapotec + + + + + + Zenaga + + + + + + Zhuang + + + + + + Zande + + + + + + Zulu + + + + + + Zuni + + + + + + + + Person date role + + + + + Date of birth + + + + + + Date of death + + + + + + + + Product form feature value - DVD region codes + + + + + All regions + + + + + + US & Canada + US, US Territories, Canada + + + + + Japan, Europe, S Africa, Middle East + Japan, Europe, South Africa and Middle East (including Egypt) + + + + + SE Asia and East Asia + Southeast Asia and East Asia (including Hong Kong) + + + + + Australia, NZ, Pacific Islands, Central America, Mexico, South America, Caribbean + Australia, New Zealand, Pacific Islands, Central America, Mexico, South America and the Caribbean + + + + + Eastern Europe, Indian subcontinent, Africa, North Korea, Mongolia + Eastern Europe (former Soviet Union), Indian subcontinent, Africa, North Korea and Mongolia + + + + + China + + + + + + Reserved + + + + + + Special international venues + Planes, cruise ships etc + + + + + + + US school or college grade + + + + + Preschool + Age typically 0-4 years + + + + + Kindergarten + Age typically 5 years + + + + + First Grade + Age typically 6 years + + + + + Second Grade + Age typically 7 years + + + + + Third Grade + Age typically 8 years + + + + + Fourth Grade + Age typically 9 years + + + + + Fifth Grade + Age typically 10 years + + + + + Sixth Grade + Age typically 11 years + + + + + Seventh Grade + Age typically 12 years + + + + + Eighth Grade + Age typically 13 years + + + + + Ninth Grade + High School Freshman - age typically 14 years + + + + + Tenth Grade + High School Sophomore - age typically 15 years + + + + + Eleventh Grade + High School Junior - age typically 16 years + + + + + Twelfth Grade + High School Senior - age typically 17 years + + + + + College Freshman + Age typically 18 years + + + + + College Sophomore + Age typically 19 years + + + + + College Junior + Age typically 20 years + + + + + College Senior + Age typically 21 years + + + + + College Graduate Student + Age typically 22+ years + + + + + + + Product form detail + + + + + CD standard audio format + CD 'red book' format + + + + + SACD super audio format + + + + + + MP3 format + + + + + + WAV format + + + + + + Real Audio format + + + + + + Mass market (rack) paperback + In North America, a category of paperback characterized partly by page size (typically 4¼ x 7 1/8 inches) and partly by target market and terms of trade. Use with Product Form code BC. 
+ + + + + Trade paperback (US) + In North America, a category of paperback characterized partly by page size and partly by target market and terms of trade. AKA 'quality paperback', and including textbooks. Most paperback books sold in North America except 'mass-market' (B101) and 'tall rack' (B107) are correctly described with this code. Use with Product Form code BC. + + + + + Digest format paperback + In North America, a category of paperback characterized by page size and generally used for children's books; use with Product Form code BC. Note: was wrongly shown as B102 (duplicate entry) in Issue 3. + + + + + A-format paperback + In UK, a category of paperback characterized by page size (normally 178 x 111 mm approx); use with Product Form code BC + + + + + B-format paperback + In UK, a category of paperback characterized by page size (normally 198 x 129 mm approx); use with Product Form code BC + + + + + Trade paperback (UK) + In UK, a category of paperback characterized partly by size (usually in traditional hardback dimensions), and often used for paperback originals; use with Product Form code BC (replaces 'C-format' from former List 8) + + + + + Tall rack paperback (US) + In North America, a category of paperback characterised partly by page size and partly by target market and terms of trade; use with Product Form code BC + + + + + A5: Tankobon + Japanese hardcover format + + + + + B5: Tankobon + Japanese hardcover format + + + + + B6: Tankobon + Japanese hardcover format + + + + + A6: Bunko + Japanese paperback format + + + + + B40-dori: Shinsho + Japanese paperback format + + + + + Coloring / join-the-dot book + + + + + + Lift-the-flap book + + + + + + Fuzzy book + + + + + + Miniature book + Note: was wrongly shown as B203 (duplicate entry) in Issue 3 + + + + + Moving picture / flicker book + + + + + + Pop-up book + + + + + + Scented / 'smelly' book + + + + + + Sound story / 'noisy' book + + + + + + Sticker book + + + + + + Touch-and-feel book + Incorporating different textures + + + + + Toy / die-cut book + Cut in a non-standard shape with moving parts or other features that make it function as a toy as well as a book: use with Product Form BK + + + + + Picture book + Children's picture book: use with applicable Product Form code + + + + + Loose leaf - sheets & binder + Use with Product Form code BD + + + + + Loose leaf - binder only + Use with Product Form code BD + + + + + Loose leaf - sheets only + Use with Product Form code BD + + + + + Sewn + AKA stitched; for 'saddle-sewn', see code B310 + + + + + Unsewn / adhesive bound + Including 'perfect bound', 'glued' + + + + + Library binding + Strengthened binding intended for libraries + + + + + Reinforced binding + Strengthened binding, not specifically intended for libraries + + + + + Half bound + Must be accompanied by a code specifiying a material, eg 'half-bound real leather' + + + + + Quarter bound + Must be accompanied by a code specifiying a material, eg 'quarter bound real leather' + + + + + Saddle-sewn + AKA 'saddle-stitched' or 'wire-stitched' + + + + + Comb bound + Round or oval plastic forms in a clamp-like configuration: use with code BE from List 7 + + + + + Wire-O + Twin loop metal or plastic spine: use with code BE from List 7 + + + + + Concealed wire + Cased over Wire-O binding: use with code BE from List 7 + + + + + Cloth over boards + AKA fabric, linen over boards + + + + + Paper over boards + + + + + + Leather, real + + + + + + Leather, imitation + + + + + + Leather, bonded + + + + + + Vellum + + + + + + 
Plastic + + + + + + Vinyl + + + + + + Cloth + Cloth, not necessarily over boards – cf B401 + + + + + Imitation cloth + Spanish 'simil-tela' + + + + + With dust jacket + Type unspecified + + + + + With printed dust jacket + Used to distinguish from B503 + + + + + With translucent dust cover + With translucent paper or plastic protective cover + + + + + With flaps + For paperback with flaps + + + + + With thumb index + + + + + + With ribbon marker(s) + If the number of markers is significant, it can be stated as free text in <ProductFormDescription> + + + + + With zip fastener + + + + + + With button snap fastener + + + + + + With leather edge lining + AKA yapp edge? + + + + + Real Video format + + + + + + Quicktime format + + + + + + AVI format + + + + + + Windows Media format + + + + + + MPEG-4 + + + + + + MS-DOS + Use with an applicable Product Form code D* + + + + + Windows + Use with an applicable Product Form code D* + + + + + Macintosh + Use with an applicable Product Form code D* + + + + + UNIX / LINUX + Use with an applicable Product Form code D* + + + + + Other operating system(s) + Use with an applicable Product Form code D* + + + + + Palm OS + Use with an applicable Product Form code D* + + + + + Windows Mobile + Use with an applicable Product Form code D* + + + + + Microsoft XBox + Use with Product Form code DE or DB as applicable + + + + + Nintendo Gameboy Color + Use with Product Form code DE or DB as applicable + + + + + Nintendo Gameboy Advanced + Use with Product Form code DE or DB as applicable + + + + + Nintendo Gameboy + Use with Product Form code DE or DB as applicable + + + + + Nintendo Gamecube + Use with Product Form code DE or DB as applicable + + + + + Nintendo 64 + Use with Product Form code DE or DB as applicable + + + + + Sega Dreamcast + Use with Product Form code DE or DB as applicable + + + + + Sega Genesis/Megadrive + Use with Product Form code DE or DB as applicable + + + + + Sega Saturn + Use with Product Form code DE or DB as applicable + + + + + Sony Playstation 1 + Use with Product Form code DE or DB as applicable + + + + + Sony Playstation 2 + Use with Product Form code DE or DB as applicable + + + + + Nintendo Dual Screen + + + + + + Desk calendar + Use with Product Form code PC + + + + + Mini calendar + Use with Product Form code PC + + + + + Engagement calendar + Use with Product Form code PC + + + + + Day by day calendar + Use with Product Form code PC + + + + + Poster calendar + Use with Product Form code PC + + + + + Wall calendar + Use with Product Form code PC + + + + + Perpetual calendar + Use with Product Form code PC + + + + + Advent calendar + Use with Product Form code PC + + + + + PAL + TV standard for video or DVD + + + + + NTSC + TV standard for video or DVD + + + + + SECAM + TV standard for video or DVD + + + + + + + Product form feature type + + + + + Color of cover + For Product Form Feature values see code list 98 + + + + + Color of page edge + For Product Form Feature values see code list 98 + + + + + Text font + The principal font used for body text, when this is a significant aspect of product description, eg for some Bibles. The accompanying Product Form Feature value is text specifying font size and, if desired, style. 
+ + + + + Special cover material + For Product Form Feature values see code list 99 + + + + + DVD region + For Product Form Feature values see code list 76 + + + + + + + Product packaging type + + + + + Slip-sleeve + + + + + + Clamshell + + + + + + Keep case + + + + + + Jewel case + + + + + + In box + Individual item or set in box with lid: not to be confused with the commonly-used 'boxed set' - see below + + + + + Slip-cased + Slip-case for single item only: German 'Schuber' + + + + + Slip-cased set + Slip-case for multi-volume set: German 'Kassette'; also commonly referred to as 'boxed set' + + + + + Tube + Rolled in tube: sheet map or poster + + + + + Binder + Use for miscellaneous items such as slides, microfiche, when presented in a binder + + + + + In wallet or folder + Use for miscellaneous items such as slides, microfiche, when presented in a wallet or folder + + + + + + + Product content type + + + + + Audiobook + Audio recording of a reading of a book or other text + + + + + Performance - spoken word + Audio recording of a drama or other spoken word performance + + + + + Music recording + Audio recording of a music performance, including musical drama and opera + + + + + Other audio + Audio recording of other sound, eg birdsong + + + + + Game + + + + + + Moving images + Film, video etc + + + + + Still images / graphics + + + + + + Software + + + + + + Data + Data files + + + + + + + Bible contents + + + + + Apocrypha + The seven portions of the Apocrypha added to the Catholic canon at the Council of Trent in 1546: Tobit; Judith; Wisdom of Solomon; Sirach (Ecclesiasticus); Baruch, including the Letter of Jeremiah; I & II Maccabees; Extra portions of Esther and Daniel (Additions to Esther; the Prayer of Azariah; Song of the Three Jews; Susannah; Bel and the Dragon). These are not generally included in the Protestant canon. + + + + + Additional Apocryphal texts: Greek Orthodox canon + I Esdras; Prayer of Manasseh; Psalm 151; III Maccabees. + + + + + Additional Apocryphal texts: Slavonic Orthodox canon + I & II Esdras; Prayer of Manasseh; Psalm 151; III & IV Maccabees. + + + + + Additional Apocryphal texts + Additional Apocryphal texts included in some Bible versions: I & II Esdras; Prayer of Manasseh. + + + + + General canon with Apocrypha + The 66 books included in the Protestant, Catholic and Orthodox canons, together with the seven portions of the Apocrypha included in the Catholic canon. + + + + + General canon + The 66 books included in the Protestant, Catholic and Orthodox canons, 39 from the Old Testament and 27 from the New Testament. The sequence of books may differ in different canons. + + + + + Gospels + The books of Matthew, Mark, Luke and John. + + + + + Old Testament + Those 39 books which were included in the Jewish canon by the rabbinical academy established at Jamma in 90 CE. Also known as the Jewish or Hebrew scriptures. + + + + + New Testament + The 27 books included in the Christian canon through the Easter Letter of Athanasius, Bishop of Alexandria and also by a general council of the Christian church held near the end of the 4th century CE. + + + + + New Testament with Psalms and Proverbs + Includes the 27 books of the New Testament plus Psalms and Proverbs from the Old Testament. + + + + + Paul’s Epistles + The books containing the letters of Paul to the various early Christian churches. + + + + + Psalms and Proverbs + The book of Psalms and the book of Proverbs combined. + + + + + Psalms + The book of Psalms. 
+ + + + + Pentateuch + The first five books of the Bible: Genesis, Exodus, Numbers, Leviticus, Deuteronomy. Also applied to the Torah. + + + + + Other portions + Selected books of either the OT or NT not otherwise noted. + + + + + + + Bible version + + + + + Amplified + A translation based on the American Standard Version and showing multiple options for the translation of ancient text. Published in full in 1965. Sponsored by the Lockman Foundation. + + + + + American Standard + A 1901 translation using verbal equivalence techniques with the purpose of Americanizing the King James version. + + + + + Contemporary English + A translation completed in 1995 and sponsored by the American Bible Society under the leadership of Barclay Newman. + + + + + Douay-Rheims + An early (1580-1609) English translation from the Latin Vulgate designed for Catholics and performed by George Martin. + + + + + English Standard + An update of the Revised Standard Version that makes 'modest' use of gender-free terminology. + + + + + God’s Word + A 1995 translation by the World Bible Publishing Company using the English language in a manner to communicate to the late 20th century American. + + + + + Geneva + An early (1560) English version of the Bible translated by William Whittingham with strong Protestant leanings. + + + + + Good News + A translation sponsored by the American Bible Society. The New Testament was first published (as “Today’s English Version” TEV) in 1966. The Old Testament was completed in 1976, and the whole was published as the “Good News Bible”. + + + + + Original Greek + New Testament text in an original Greek version + + + + + Original Hebrew + Old Testament text in an original Hebrew version + + + + + Holman Christian Standard + Published by Broadman and Holman this translation rejects all forms of gender-neutral wording and is written with strong influences from the Southern Baptist perspective of biblical scholarship. + + + + + International Children’s + A translation completed in 1986 targeting readability at the US third grade level. + + + + + Jerusalem + A translation designed for English speaking Catholics based on the original languages. It is based on French as well as ancient texts and was first published in 1966. + + + + + King James + A translation commissioned by King James I of England and first published in 1611. + + + + + 21st Century King James + A verbal translation led by William Prindele. Published in 1994, it was designed to modernize the language of the King James Version based on Webster’s New International Dictionary, 2nd edition, unabridged. + + + + + Living Bible + A paraphrase translation led by Kenneth N Taylor and first published in 1972. + + + + + Message Bible + A paraphrase translation of the New Testament by Eugene Peterson first published in 1993. + + + + + New American + A translation aimed at Catholic readers first published in its entirely in 1970. A revised New Testament was issued in 1986. + + + + + New American Standard + A translation commissioned by the Lockman Foundation. The New Testament was published in 1960 followed by the entire Bible in 1971. + + + + + New American Standard, Updated + A 1995 translation using more modern language than the NASB. 
+ + + + + Bibelen 1895 + Norwegian Bible translation + + + + + Bibelen 1930 + Norwegian Bible translation + + + + + Bibelen 1938 + Norwegian Bible translation + + + + + Bibelen 1978-85 + Norwegian Bible translation + + + + + Bibelen 1978 + Norwegian Bible translation + + + + + Bibelen 1985 + Norwegian Bible translation + + + + + Bibelen 1988 + Norwegian Bible translation + + + + + Bibelen 1978-85/rev. 2005 + Norwegian Bible translation + + + + + New Century + A translation inspired by the International Children’s version. First published by World Publishing in 1991. + + + + + New English + A translation first issued in 1970 as a result of a proposal at the 1946 General Assembly of the Church of Scotland. + + + + + Bibelen Guds ord + Norwegian Bible translation + + + + + New International + A translation underwritten by the International Bible Society (formerly New York Bible Society). The New Testament was published in 1973 followed by the entire Bible in 1978. + + + + + New International Reader’s + A 1996 translation designed for people with limited literacy in English and based on the NIV. + + + + + New Jerusalem + A revision of the Jerusalem Bible. First published in 1986. + + + + + New King James + A version issued by Thomas Nelson Publishers in 1982-83 designed to update the language of the King James Version while maintaining the phrasing and rhythm and using the same sources as its predecessor. + + + + + Bibelen, nynorsk + Norwegian 'nynorsk' Bible translation + + + + + New Living + A translation sponsored by Tyndale House and first released in 1996. It is considered a revision and updating of the Living Bible. + + + + + New Revised Standard + A revision of the Revised Standard based on ancient texts but updating language to American usage of the 1980s. + + + + + Nueva Version Internacional + A Spanish translation underwritten by the International Bible Society. + + + + + New Testament in Modern English (Phillips) + An idiomatic translation by J B Phillips, first completed in 1966 + + + + + Revised English + A 1989 revision of the NEB. A significant effort was made to reduce the British flavor present in the NEB. + + + + + Revised Version + The first major revision of the King James Version, the Revised Version incorporates insights from early manuscripts discovered between 1611 and 1870, and corrects readings in the KJV which nineteenth-century scholarship deemed mistaken. The New Testament was published in 1881, the Old Testament in 1885, and the Apocrypha in 1895. + + + + + Revised Standard + A translation authorized by the National Council of Churches of Christ in the USA. The New Testament was published in 1946 followed by a complete Protestant canon in 1951. + + + + + Reina Valera + A Spanish translation based on the original texts. + + + + + Bibelen, samisk + Norwegian 'samisk' Bible translation + + + + + Today’s English + A translation of the New Testament sponsored by the American Bible Society and first published in 1966. It was incorporated into the “Good News Bible” GNB in 1976. + + + + + Today’s New International + An updating of the New International Version. The New Testament was published in 2002, and the entire Bible is scheduled for 2005. + + + + + Other + Other translations not otherwise noted. + + + + + + + Study Bible type + + + + + Cambridge Annotated + Contains the work of Howard Clark Kee including a summary of the development of the canon, introductions to the books, notes and cross references. Originally published in 1993, NRSV. 
+ + + + + Life Application + A project of Tyndale House Publishers and Zondervan intended to help readers apply the Bible to daily living. Living Bible, King James, New International, NASB + + + + + Macarthur + A King James version study Bible with notes by James Macarthur first published in 1997. + + + + + Oxford Annotated + A study Bible originally published in the 1960s and based on the RSV / NRSV. + + + + + Studiebibel, Det Nye testamentet + Norwegian study Bible, New Testament + + + + + New Oxford Annotated + Published in 1991 and based on the New Revised Standard version. + + + + + Norsk studiebibel + Norwegian study Bible + + + + + Ryrie + Based on the work of Charles C. Ryrie. King James, NI, NASB + + + + + Scofield + A study Bible based on the early 20th century work of C.I. Scofield. Based on the King James version. + + + + + Spirit Filled + A transdenominational study Bible for persons from the Pentecostal/Charismatic traditions. + + + + + + + Bible purpose + + + + + Award + A Bible designed for presentation from a religious organization. + + + + + Baby + A Bible designed to be a gift to commemorate a child’s birth. + + + + + Bride + A special gift Bible designed for the bride on her wedding day. Usually white. + + + + + Confirmation + A Bible designed to be used in the confirmation reading or as a gift to a confirmand + + + + + Children’s + A text Bible designed in presentation and readability for a child. + + + + + Compact + A small Bible with a trim height of five inches or less. + + + + + Cross-reference + A Bible which includes text conveying cross-references to related scripture passages. + + + + + Daily readings + A Bible laid out to provide readings for each day of the year. + + + + + Devotional + A Bible containing devotional content together with the scripture. + + + + + Family + A Bible containing family record pages and/or additional study material for family devotion + + + + + General/Text + A standard Bible of any version with no distinguishing characteristics beyond the canonical text. + + + + + Gift + A Bible designed for gift or presentation, often including a presentation page. + + + + + Lectern/Pulpit + A large Bible with large print designed for use in reading scriptures in public worship from either the pulpit or lectern. + + + + + Men’s + A Bible especially designed with helps and study guides oriented to the adult male. + + + + + Primary school + A Bible designed for use in primary school + + + + + Pew + Usually inexpensive but sturdy, a Bible designed for use in church pews. + + + + + Scholarly + A Bible including texts in Greek and/or Hebrew and designed for scholarly study. + + + + + Slimline + + + + + + Student + A Bible with study articles and helps especially for use in the classroom. + + + + + Study + A Bible with many extra features, e.g. book introductions, dictionary, concordance, references, maps, etc., to help readers better understand the scripture. + + + + + Wedding gift + A special gift Bible designed as a gift to the couple on their wedding day + + + + + Women’s + A devotional or study Bible with helps targeted at the adult woman. + + + + + Youth + A Bible containing special study and devotional helps designed specifically for the needs of teenagers. + + + + + + + Bible text organization + + + + + Chronological + A Bible with the text organized in the order in which events are believed to have happened. + + + + + Chain reference + A Bible which explores keywords or themes by referring text to preceding or following text. 
+ + + + + Interlinear + A Bible or other text in which different versions are printed one line above the other, so that the variations can easily be detected. + + + + + Parallel + A Bible with two or more versions printed side by side. + + + + + Standard + A Bible in which the text is presented in the traditional order. + + + + + + + Bible reference location + + + + + Center column + References are printed in a narrow column in the center of the page between two columns of text. + + + + + Page end + References are printed at the foot of the page. + + + + + Side column + References are printed in a column to the side of the scripture. + + + + + Verse end + References are printed at the end of the applicable verse. + + + + + Unknown + The person creating the ONIX record does not know where the references are located. + + + + + Other + Other locations not otherwise identified + + + + + + + Religious text identifier + + + + + + Religious text feature type + + + + + + Religious text feature code + + + + + + Country code - ISO 3166-1 + + + + + Andorra + + + + + + United Arab Emirates + + + + + + Afghanistan + + + + + + Antigua and Barbuda + + + + + + Anguilla + + + + + + Albania + + + + + + Armenia + + + + + + Netherlands Antilles + + + + + + Angola + + + + + + Antarctica + + + + + + Argentina + + + + + + American Samoa + + + + + + Austria + + + + + + Australia + + + + + + Aruba + + + + + + Aland Islands + + + + + + Azerbaijan + + + + + + Bosnia and Herzegovina + + + + + + Barbados + + + + + + Bangladesh + + + + + + Belgium + + + + + + Burkina Faso + + + + + + Bulgaria + + + + + + Bahrain + + + + + + Burundi + + + + + + Benin + + + + + + Bermuda + + + + + + Brunei Darussalam + + + + + + Bolivia + + + + + + Brazil + + + + + + Bahamas + + + + + + Bhutan + + + + + + Bouvet Island + + + + + + Botswana + + + + + + Belarus + + + + + + Belize + + + + + + Canada + + + + + + Cocos (Keeling) Islands + + + + + + Congo, Democratic Republic of the + + + + + + Central African Republic + + + + + + Congo + + + + + + Switzerland + + + + + + Cote D'Ivoire + + + + + + Cook Islands + + + + + + Chile + + + + + + Cameroon + + + + + + China + + + + + + Colombia + + + + + + Costa Rica + + + + + + Serbia and Montenegro + Replaces YU – Yugoslavia + + + + + Cuba + + + + + + Cape Verde + + + + + + Christmas Island + + + + + + Cyprus + + + + + + Czech Republic + + + + + + Germany + + + + + + Djibouti + + + + + + Denmark + + + + + + Dominica + + + + + + Dominican Republic + + + + + + Algeria + + + + + + Ecuador + + + + + + Estonia + + + + + + Egypt + + + + + + Western Sahara + + + + + + Eritrea + + + + + + Spain + + + + + + Ethiopia + + + + + + Finland + + + + + + Fiji + + + + + + Falkland Islands (Malvinas) + + + + + + Micronesia, Federated States of + + + + + + Faroe Islands + + + + + + France + + + + + + Gabon + + + + + + United Kingdom + + + + + + Grenada + + + + + + Georgia + + + + + + French Guiana + + + + + + Ghana + + + + + + Gibraltar + + + + + + Greenland + + + + + + Gambia + + + + + + Guinea + + + + + + Guadeloupe + + + + + + Equatorial Guinea + + + + + + Greece + + + + + + South Georgia and the South Sandwich Islands + + + + + + Guatemala + + + + + + Guam + + + + + + Guinea-Bissau + + + + + + Guyana + + + + + + Hong Kong + + + + + + Heard Island and McDonald Islands + + + + + + Honduras + + + + + + Croatia + + + + + + Haiti + + + + + + Hungary + + + + + + Indonesia + + + + + + Ireland + + + + + + Israel + + + + + + India + + + + + + British Indian Ocean Territory + + + + + + Iraq + + + + + + Iran, Islamic 
Republic of + + + + + + Iceland + + + + + + Italy + + + + + + Jamaica + + + + + + Jordan + + + + + + Japan + + + + + + Kenya + + + + + + Kyrgyzstan + + + + + + Cambodia + + + + + + Kiribati + + + + + + Comoros + + + + + + Saint Kitts and Nevis + + + + + + Korea, Democratic People's Republic of + + + + + + Korea, Republic of + + + + + + Kuwait + + + + + + Cayman Islands + + + + + + Kazakhstan + + + + + + Lao People's Democratic Republic + + + + + + Lebanon + + + + + + Saint Lucia + + + + + + Liechtenstein + + + + + + Sri Lanka + + + + + + Liberia + + + + + + Lesotho + + + + + + Lithuania + + + + + + Luxembourg + + + + + + Latvia + + + + + + Libyan Arab Jamahiriya + + + + + + Morocco + + + + + + Monaco + + + + + + Moldova, Republic of + + + + + + Madagascar + + + + + + Marshall Islands + + + + + + Macedonia, the former Yugoslav Republic of + + + + + + Mali + + + + + + Myanmar + + + + + + Mongolia + + + + + + Macao + + + + + + Northern Mariana Islands + + + + + + Martinique + + + + + + Mauritania + + + + + + Montserrat + + + + + + Malta + + + + + + Mauritius + + + + + + Maldives + + + + + + Malawi + + + + + + Mexico + + + + + + Malaysia + + + + + + Mozambique + + + + + + Namibia + + + + + + New Caledonia + + + + + + Niger + + + + + + Norfolk Island + + + + + + Nigeria + + + + + + Nicaragua + + + + + + Netherlands + + + + + + Norway + + + + + + Nepal + + + + + + Nauru + + + + + + Niue + + + + + + New Zealand + + + + + + Oman + + + + + + Panama + + + + + + Peru + + + + + + French Polynesia + + + + + + Papua New Guinea + + + + + + Philippines + + + + + + Pakistan + + + + + + Poland + + + + + + Saint Pierre and Miquelon + + + + + + Pitcairn + + + + + + Puerto Rico + + + + + + Palestinian Territory, Occupied + + + + + + Portugal + + + + + + Palau + + + + + + Paraguay + + + + + + Qatar + + + + + + Reunion + + + + + + Romania + + + + + + Russian Federation + + + + + + Rwanda + + + + + + Saudi Arabia + + + + + + Solomon Islands + + + + + + Seychelles + + + + + + Sudan + + + + + + Sweden + + + + + + Singapore + + + + + + Saint Helena + + + + + + Slovenia + + + + + + Svalbard and Jan Mayen + + + + + + Slovakia + + + + + + Sierra Leone + + + + + + San Marino + + + + + + Senegal + + + + + + Somalia + + + + + + Suriname + + + + + + Sao Tome and Principe + + + + + + El Salvador + + + + + + Syrian Arab Republic + + + + + + Swaziland + + + + + + Turks and Caicos Islands + + + + + + Chad + + + + + + French Southern Territories + + + + + + Togo + + + + + + Thailand + + + + + + Tajikistan + + + + + + Tokelau + + + + + + Timor-Leste + + + + + + Turkmenistan + + + + + + Tunisia + + + + + + Tonga + + + + + + Turkey + + + + + + Trinidad and Tobago + + + + + + Tuvalu + + + + + + Taiwan, Province of China + + + + + + Tanzania, United Republic of + + + + + + Ukraine + + + + + + Uganda + + + + + + United States Minor Outlying Islands + + + + + + United States + + + + + + Uruguay + + + + + + Uzbekistan + + + + + + Holy See (Vatican City State) + + + + + + Saint Vincent and the Grenadines + + + + + + Venezuela + + + + + + Virgin Islands, British + + + + + + Virgin Islands, US + + + + + + Viet Nam + + + + + + Vanuatu + + + + + + Wallis and Futuna + + + + + + Samoa + + + + + + Yemen + + + + + + Mayotte + + + + + + Yugoslavia + DEPRECATED, replaced by CS – Serbia and Montenegro + + + + + South Africa + + + + + + Zambia + + + + + + Zimbabwe + + + + + + + + Supplier identifier type + + + + + Proprietary + + + + + + Börsenverein Verkehrsnummer + + + + + + German ISBN Agency publisher identifier + + + + + + EAN-UCC GLN + Global 
location number (formerly EAN location number) + + + + + SAN + Book trade Standard Address Number - US, UK etc + + + + + + + Supplier role + + + + + Unspecified + Default + + + + + Publisher + + + + + + Publisher's exclusive distributor + In a specified supply territory + + + + + Publisher's non-exclusive distributor + In a specified supply territory + + + + + Wholesaler + + + + + + Sales agent + DEPRECATED - use <MarketRepresentation> to specify a sales agent + + + + + Publisher's distributor + In a specified supply territory. Use only where exclusive/non-exclusive status is not known. Prefer 02 or 03 as appropriate, where possible. + + + + + POD supplier + Where a POD product is supplied to retailers and/or consumers direct from a POD source. + + + + + + + Default linear unit + + + + + Centimeters + + + + + + Inches (US) + + + + + + Millimeters + + + + + + + + Default unit of weight + + + + + Pounds (US) + + + + + + Grams + + + + + + Ounces (US) + + + + + + + + Currency code - ISO 4217 + + + + + UAE Dirham + United Arab Emirates + + + + + Afghani + DEPRECATED, replaced by AFN + + + + + Afghani + Afghanistan + + + + + Lek + Albania + + + + + Armenian Dram + Armenia + + + + + Netherlands Antillian Guilder + Netherlands Antilles + + + + + Kwanza + Angola + + + + + Argentine Peso + Argentina + + + + + Austria, Schilling + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Australian Dollar + Australia, Christmas Island, Cocos (Keeling) Islands, Heard Island and McDonald Islands, Kiribati, Nauru, Norfolk Island, Tuvalu + + + + + Aruban Guilder + Aruba + + + + + Azerbaijanian Manat + Azerbaijan + + + + + Convertible Marks + Bosnia & Herzegovina + + + + + Barbados Dollar + Barbados + + + + + Taka + Bangladesh + + + + + Belgium, Franc + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Lev + DEPRECATED, replaced by BGN + + + + + Lev + Bulgaria + + + + + Bahraini Dinar + Bahrain + + + + + Burundi Franc + Burundi + + + + + Bermuda Dollar + Bermuda + + + + + Brunei Dollar + Brunei Darussalam + + + + + Boliviano + Bolivia + + + + + Brazilian Real + Brazil + + + + + Bahamian Dollar + Bahamas + + + + + Ngultrun + Bhutan + + + + + Pula + Botswana + + + + + Belarussian Ruble + Belarus + + + + + Belize Dollar + Belize + + + + + Canadian Dollar + Canada + + + + + Franc Congolais + Congo (Democratic Republic of the) + + + + + Swiss Franc + Switzerland, Liechtenstein + + + + + Chilean Peso + Chile + + + + + Yuan Renminbi + China + + + + + Colombian Peso + Colombia + + + + + Costa Rican Colon + Costa Rica + + + + + Serbian Dinar + Serbia + + + + + Cuban Convertible Peso + Cuba (alternative currency) + + + + + Cuban Peso + Cuba + + + + + Cape Verde Escudo + Cape Verde + + + + + Cyprus Pound + Cyprus + + + + + Czech Koruna + Czech Republic + + + + + Germany, Mark + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Djibouti Franc + Djibouti + + + + + Danish Krone + Denmark, Faroe Islands, Greenland + + + + + Dominican Peso + Dominican Republic + + + + + Algerian Dinar + Algeria + + + + + Kroon + Estonia + + + + + Egyptian Pound + Egypt + + + + + Nakfa + Eritrea + + + + + Spain, Peseta + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Ethiopian Birr + Ethiopia + + + + + Euro + Andorra, Austria, Belgium, Finland, France, Fr Guiana, Fr S Territories, 
Germany, Greece, Guadeloupe, Holy See (Vatican City), Ireland, Italy, Luxembourg, Martinique, Mayotte, Monaco, Netherlands, Portugal, Réunion, St Pierre & Miquelon, San Marino, Spain + + + + + Finland, Markka + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Fiji Dollar + Fiji + + + + + Falkland Islands Pound + Falkland Islands (Malvinas) + + + + + France, Franc + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Pound Sterling + United Kingdom + + + + + Lari + Georgia + + + + + Cedi + Ghana + + + + + Gibraltar Pound + Gibraltar + + + + + Dalasi + Gambia + + + + + Guinea Franc + Guinea + + + + + Greece, Drachma + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Quetzal + Guatemala + + + + + Guinea-Bissau Peso + Guinea-Bissau + + + + + Guyana Dollar + Guyana + + + + + Hong Kong Dollar + Hong Kong + + + + + Lempira + Honduras + + + + + Croatian Kuna + Croatia + + + + + Gourde + Haiti + + + + + Forint + Hungary + + + + + Rupiah + Indonesia + + + + + Ireland, Punt + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Israeli Sheqel + Israel + + + + + Indian Rupee + India + + + + + Iraqi Dinar + Iraq + + + + + Iranian Rial + Iran (Islamic Republic of) + + + + + Iceland Krona + Iceland + + + + + Italy, Lira + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Jamaican Dollar + Jamaica + + + + + Jordanian Dinar + Jordan + + + + + Yen + Japan + + + + + Kenyan Shilling + Kenya + + + + + Som + Kyrgyzstan + + + + + Riel + Cambodia + + + + + Comoro Franc + Comoros + + + + + North Korean Won + Korea (Democratic People’s Republic of) + + + + + Won + Korea (Republic of) + + + + + Kuwaiti Dinar + Kuwait + + + + + Cayman Islands Dollar + Cayman Islands + + + + + Tenge + Kazakstan + + + + + Kip + Lao People’s Democratic Republic + + + + + Lebanese Pound + Lebanon + + + + + Sri Lanka Rupee + Sri Lanka + + + + + Liberian Dollar + Liberia + + + + + Loti + Lesotho + + + + + Lithuanian Litus + Lithuania + + + + + Luxembourg, Franc + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Latvian Lats + Latvia + + + + + Libyan Dinar + Libyan Arab Jamahiriya + + + + + Moroccan Dirham + Morocco, Western Sahara + + + + + Moldovan Leu + Moldova, Republic of + + + + + Ariary + Madagascar + + + + + Malagasy Franc + Madagascar + + + + + Denar + Macedonia (former Yugoslav Republic of) + + + + + Kyat + Myanmar + + + + + Tugrik + Mongolia + + + + + Pataca + Macau + + + + + Ouguiya + Mauritania + + + + + Maltese Lira + Malta + + + + + Mauritius Rupee + Mauritius + + + + + Rufiyaa + Maldives + + + + + Kwacha + Malawi + + + + + Mexican Peso + Mexico + + + + + Malaysian Ringgit + Malaysia + + + + + Metical + Mozambique + + + + + Namibia Dollar + Namibia + + + + + Naira + Nigeria + + + + + Cordoba Oro + Nicaragua + + + + + Netherlands, Guilder + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Norwegian Krone + Norway, Bouvet Island, Svalbard and Jan Mayen + + + + + Nepalese Rupee + Nepal + + + + + New Zealand Dollar + New Zealand, Cook Islands, Niue, Pitcairn, Tokelau + + + + + Rial Omani + Oman + + + + + Balboa + Panama + + + + + Nuevo Sol + Peru + + + + + Kina + Papua New 
Guinea + + + + + Philippine Peso + Philippines + + + + + Pakistan Rupee + Pakistan + + + + + Zloty + Poland + + + + + Portugal, Escudo + Now replaced by the Euro (EUR): use only for historical prices that pre-date the introduction of the Euro + + + + + Guarani + Paraguay + + + + + Qatari Rial + Qatar + + + + + Leu + Romania + + + + + Russian Ruble + Russian Federation + + + + + Russian Ruble + DEPRECATED, replaced by RUB + + + + + Rwanda Franc + Rwanda + + + + + Saudi Riyal + Saudi Arabia + + + + + Solomon Islands Dollar + Solomon Islands + + + + + Seychelles Rupee + Seychelles + + + + + Sudanese Dinar + Sudan + + + + + Swedish Krona + Sweden + + + + + Singapore Dollar + Singapore + + + + + Saint Helena Pound + Saint Helena + + + + + Tolar + Slovenia + + + + + Slovak Koruna + Slovakia + + + + + Leone + Sierra Leone + + + + + Somali Shilling + Somalia + + + + + Suriname Guilder + Suriname + + + + + Suriname Guilder + DEPRECATED, replaced by SRD + + + + + Dobra + São Tome and Principe + + + + + El Salvador Colon + El Salvador + + + + + Syrian Pound + Syrian Arab Republic + + + + + Lilangeni + Swaziland + + + + + Baht + Thailand + + + + + Somoni + Tajikistan + + + + + Manat + Turkmenistan + + + + + Tunisian Dinar + Tunisia + + + + + Pa’anga + Tonga + + + + + Timor Escudo + East Timor + + + + + Turkish Lira (old) + Turkey + + + + + Turkish Lira (new) + Turkey, from 1 January 2005 + + + + + Trinidad and Tobago Dollar + Trinidad and Tobago + + + + + New Taiwan Dollar + Taiwan (Province of China) + + + + + Tanzanian Shilling + Tanzania (United Republic of) + + + + + Hryvnia + Ukraine + + + + + Uganda Shilling + Uganda + + + + + US Dollar + United States, American Samoa, British Indian Ocean Territory, Ecuador, Guam, Marshall Is, Micronesia (Federated States of), Northern Mariana Is, Palau, Puerto Rico, Turks & Caicos Is, US Minor Outlying Is, Virgin Is (British), Virgin Is (US) + + + + + Peso Uruguayo + Uruguay + + + + + Uzbekistan Sum + Uzbekistan + + + + + Bolivar + Venezuela + + + + + Dong + Viet Nam + + + + + Vatu + Vanuatu + + + + + Tala + Samoa + + + + + CFA Franc BEAC + Cameroon, Central African Republic, Chad, Congo, Equatorial Guinea, Gabon + + + + + East Caribbean Dollar + Anguilla, Antigua and Barbuda, Dominica, Grenada, Montserrat, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines + + + + + CFA Franc BCEAO + Benin, Burkina Faso, Côte D'Ivoire, Mali, Niger, Senegal, Togo + + + + + CFP Franc + French Polynesia, New Caledonia, Wallis and Futuna + + + + + Yemeni Rial + Yemen + + + + + Yugoslavian Dinar + DEPRECATED, replaced by CSD + + + + + Rand + South Africa + + + + + Kwacha + Zambia + + + + + Zimbabwe Dollar + Zimbabwe + + + + + + + Bible text feature + + + + + Red letter + Words spoken by Christ are printed in red + + + + + + + Product form feature value - binding or page edge color + + + + + Black + + + + + + Blue + + + + + + Brown + + + + + + Burgundy/maroon + + + + + + Cream + + + + + + Gold + + + + + + Green + + + + + + Grey + + + + + + Multicolor + + + + + + Navy + + + + + + Orange + + + + + + Pink + + + + + + Purple + + + + + + Red + + + + + + Silver + + + + + + Tan + + + + + + White + + + + + + Yellow + + + + + + Other + + + + + + + + Product form feature value - special cover material + + + + + Berkshire leather + Pigskin + + + + + Calfskin + + + + + + French Morocco + Calf split or sheep split + + + + + Morocco + Goatskin + + + + + Bonded buffalo grain + + + + + + Bonded calf grain + + + + + + Bonded Cordova + + + + + + Bonded eelskin + + + + + + Bonded 
Ostraleg + + + + + + Bonded ostrich + + + + + + Bonded reptile grain + + + + + + Bonded leather + + + + + + Cowhide + + + + + + Eelskin + + + + + + Kivar + + + + + + Leatherflex + + + + + + Moleskin + + + + + + Softhide leather + + + + + + Metal + + + + + + Velvet + German 'Samt' + + + + + Mother-of-pearl + Spanish 'nácar' + + + + + Papyrus + + + + + + Géltex + An imitation cloth binding material + + + + + Guaflex + An imitation leather binding material + + + + + + + Discount code type + + + + + BIC discount group code + UK publisher's or distributor's discount group code in a format specified by BIC to ensure uniqueness + + + + + Proprietary + + + + + + Boeksoort + Terms code used in the Netherlands book trade + + + + + German terms code + Terms code used in German ONIX applications + + + + + + + Person name identifier type + + + + + Proprietary + + + + + + DNB-PND + Deutsche Bibliothek Personennormdatei - person name code of the German national library + + + + + + + Sales outlet identifier type + + + + + Proprietary + + + + + + BIC sales outlet identifier code + + + + + + + + + + + + + + + + + + diff --git a/ia-legacy-importer/onix/ONIX_BookProduct_Release2.1_reference.xsd new file mode 100644 index 00000000..2b93df7e --- /dev/null +++ b/ia-legacy-importer/onix/ONIX_BookProduct_Release2.1_reference.xsd @@ -0,0 +1,6396 @@
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ia-legacy-importer/onix/README b/ia-legacy-importer/onix/README new file mode 100644 index 00000000..92ea16cc --- /dev/null +++ b/ia-legacy-importer/onix/README @@ 
-0,0 +1,4 @@ + +to test basic onix handling, you can run test-onix.sh +with an onix file on standard input. + diff --git a/ia-legacy-importer/onix/__init__.py b/ia-legacy-importer/onix/__init__.py new file mode 100644 index 00000000..bccec2ef --- /dev/null +++ b/ia-legacy-importer/onix/__init__.py @@ -0,0 +1 @@ +"""onix""" diff --git a/ia-legacy-importer/onix/config.sh b/ia-legacy-importer/onix/config.sh new file mode 100644 index 00000000..cb97d4d0 --- /dev/null +++ b/ia-legacy-importer/onix/config.sh @@ -0,0 +1,7 @@ +#!/bin/sh -e + +export URL_CACHE_DIR=urlcache # a temporary directory +export PHAROS_REPO="../.." # the root of the Open Library repository +export PYTHONPATH="$PHAROS_REPO" +export PYTHON_INTERPRETER=python2.5 + diff --git a/ia-legacy-importer/onix/import-log.sh b/ia-legacy-importer/onix/import-log.sh new file mode 100755 index 00000000..8ec7c7af --- /dev/null +++ b/ia-legacy-importer/onix/import-log.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +exec env PHAROS_DBNAME=dbglog PHAROS_DBUSER=pharos PHAROS_DBPASS=pharos PHAROS_SITE=site0 PHAROS_LOGFILE=/1/dbg/import-logs/dbglog URL_CACHE_DIR=urlcache python2.4 onix-import.py diff --git a/ia-legacy-importer/onix/import.sh b/ia-legacy-importer/onix/import.sh new file mode 100755 index 00000000..a1656065 --- /dev/null +++ b/ia-legacy-importer/onix/import.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +export PHAROS_DBNAME=dbgtest +export PHAROS_DBUSER=dbg +export PHAROS_SITE=site1 + +export PHAROS_EDITION_PREFIX="b/" +export PHAROS_AUTHOR_PREFIX="a/" + +export URL_CACHE_DIR=urlcache +export PYTHONPATH=/home/dbg/lib/python + +exec python2.4 onix-import.py diff --git a/ia-legacy-importer/onix/onix-import.py b/ia-legacy-importer/onix/onix-import.py new file mode 100644 index 00000000..5961ea10 --- /dev/null +++ b/ia-legacy-importer/onix/onix-import.py @@ -0,0 +1,229 @@ +import web +import infogami.tdb as tdb +from infogami.tdb import NotFound, Things, LazyThing +from items import * +from onix import parser +import sys +import unicodedata +import re +import os +from lang import * +from types import * + +source_name = None +source_path = None +edition_prefix = None +author_prefix = None + +edition_records = set ([]) +item_names = {} +#edition_names = set ([]) +#author_names = {} + +def setup (): + def getvar (name, required=True): + val = os.getenv (name) + if required and val is None: + raise Exception ("found no environment variable %s" % name) + return val + dbname = getvar ("PHAROS_DBNAME") + dbuser = getvar ("PHAROS_DBUSER") + dbpass = getvar ("PHAROS_DBPASS") + web.config.db_parameters = dict(dbn='postgres', db=dbname, user=dbuser, pw=dbpass) + web.db._hasPooling = False + web.config.db_printing = False + web.load() + tdb.setup() + logfile = getvar ("PHAROS_LOGFILE", False) + if logfile: + tdb.logger.set_logfile (open (logfile, "a")) + sys.stderr.write ("logging to %s\n" % logfile) + + global source_name, source_path + source_dir = getvar ("PHAROS_SOURCE_DIR") + source_name = sys.argv[1] + source_path = "%s/%s" % (source_dir, source_name) + + global edition_prefix, author_prefix + edition_prefix = getvar ("PHAROS_EDITION_PREFIX", False) or "" + author_prefix = getvar ("PHAROS_AUTHOR_PREFIX", False) or "" + + setup_names () + +def setup_names (): + global item_names, edition_records, source_name + + warn ("walking the length and breadth of the database ...") + author_type = Author.type () + edition_type = Edition.type () + walked = 0 + parent_id = site_object().id + for r in web.query ("SELECT id,name FROM thing WHERE parent_id = $parent_id", 
vars=locals()): + item_names[r.name] = r.id + + for r in web.query ("SELECT d1.value FROM datum AS d1, datum AS d2 WHERE d1.version_id=d2.version_id AND d1.key='source_record_lineno' AND d2.key='source_name' AND d2.value=$source_name", { 'source_name': source_name }): + edition_records.add (int (r.value)) + + warn ("noted %d items" % len (item_names)) + if len (edition_records) > 0: + warn ("already have %d records from this source; they will be ignored" % len (edition_records)) + +def import_file (input): + n = 0 + for x in parser (input): + n += 1 + import_item (x) + if n % 100 == 0: + sys.stderr.write ("." * 30 + " read %d records\n" % n) + sys.stderr.write ("\nread %d records\n" % n) + +skipped = 0 +imported = 0 + +def import_author (x): + name = author_prefix + name_string (x["name"]) + a = None + + global item_names + aid = item_names.get (name, None) + if aid: + a = LazyThing (aid) + # warn ("---------------------------> already author %s" % name) + else: + a = Author (name, d=massage_dict (x)) + a.save () + item_names[name] = a.id + # warn ("AUTHOR %s" % name) + return a + +def import_item (x): + global skipped, imported + + global edition_records + lineno = x["source_record_lineno"] + if lineno in edition_records: + skipped += 1 + if skipped % 100 == 0: + warn ("skipped %d" % skipped) + return + + # import the authors + authors = map (import_author, x.get ("authors") or []) + if x.get ("authors"): + del x["authors"] + + # find a unique name for the edition + global item_names + name = None + for n in edition_name_choices (x): + nn = edition_prefix + n + if nn not in item_names: + name = nn + break + + if not name: + raise Exception ("couldn't find a unique name for %s" % x) + + e = Edition (name, d=massage_dict (x)) + global source_name + e.source_name = source_name + e.authors = authors + e.save () + item_names[name] = e.id + edition_records.add (e.source_record_lineno) + imported += 1 + if imported % 100 == 0: + warn ("imported %d" % imported) + + # sys.stderr.write ("EDITION %s\n" % name) + +ignore_title_words = ['a', 'the'] +tsep = '_' + +def edition_name_choices (x): + # use up to 25 chars of title, including last word + title = name_safe (x['title']) + title_words = [ w for w in title.split() if w.lower() not in ignore_title_words ] + if len (title_words) == 0: + raise Exception ("no usable title chars") + ttail = title_words.pop (-1) + tlen = len (ttail) + name = "" + nlen = 1 + tlen + if title_words: + name = title_words.pop (0) + nlen = len (name) + 1 + tlen + while title_words: + w = title_words.pop (0) + wlen = len (w) + if nlen + 1 + wlen < 25: + name += "_" + w + nlen += 1 + wlen + if name: + name += "_" + name += ttail + name = name[0:30] + yield name + + ed_number = x.get ('edition_number') + if ed_number: + name = tsep.join ([name, name_string (ed_number)]) + yield name + + ed_type = x.get ('edition_type') + if ed_type: + name = tsep.join ([name, name_string (ed_type)]) + yield name + + ed = x.get ('edition') + if ed: + name = tsep.join ([name, name_string (ed)]) + yield name + + format = x.get ('physical_format') + if format: + name = tsep.join ([name, name_string (format)]) + yield name + + nlen = len (name) + n = 0 + while True: + name = name[:nlen] + tsep + "%d" % n + yield name + n += 1 + + return + +re_name_safe = re.compile (r'[^a-zA-Z0-9]') +def name_safe (s): + s = asciify (s) + s = s.replace ("'", "") + return re.sub (re_name_safe, ' ', s) + +def name_string (s): + s = name_safe (s) + words = s.split () + return '_'.join (words) + +def asciify (s): + 
return unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore') + +def massage_value (v): + if (isinstance (v, UnicodeType)): + return v.encode ('utf8') + elif (isinstance (v, ListType)): + return map (massage_value, v) + else: + return v + +def massage_dict (d): + dd = {} + for (k, v) in d.iteritems (): + dd[k] = massage_value (v) + return dd + +if __name__ == "__main__": + setup() + sys.stderr.write ("--> setup finished\n") + import_file (open (source_path, "r")) + sys.stderr.write ("--> import finished\n") diff --git a/ia-legacy-importer/onix/onix.py b/ia-legacy-importer/onix/onix.py new file mode 100644 index 00000000..c0a2c2aa --- /dev/null +++ b/ia-legacy-importer/onix/onix.py @@ -0,0 +1,190 @@ +from __future__ import print_function +# wrapper code for easier handling of ONIX files: +# +# OnixHandler -- a sax ContentHandler that produces a stream of ONIX "product" data in xmltramp objects +# +# OnixProduct -- a wrapper for the objects produced by OnixHandler, providing human-friendly field access +# (mostly just providing a dictionary interface where long ("reference") names can be used even when the +# data is encoded with opaque ("short") names.) + +from xml.sax.handler import * +from catalog.onix.sax_utils import * +from catalog.onix import xmltramp + +repo_path = os.getenv ("PHAROS_REPO") +codelists_path = "%s/%s" % (repo_path, "catalog/onix/ONIX_BookProduct_CodeLists.xsd") +ref_dtd_path = "%s/%s" % (repo_path, "catalog/onix/ONIX_BookProduct_Release2.1_reference.xsd") + +# for testing, also set URL_CACHE_DIR; see bottom. + +onix_codelists = None +onix_shortnames = None + +def init (): + f = open (codelists_path, "r") + onix_codelists = parse_codelists (f) + f.close () + f = open (ref_dtd_path, "r") + onix_shortnames = parse_shortnames (f) + f.close () + +class OnixProduct: + # N.B.: this only works when using the "short" names of elements. + # we should check that the document uses the short DTD, and if not, + # use the reference names to access field values. 
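+    #
+    # hedged usage sketch (mirrors how parse.py drives this class; the values are illustrative):
+    #
+    #     op = OnixProduct (p)        # p is a product element produced by OnixHandler
+    #     titles = op["Title":]       # slice syntax: every <Title> composite, as a list
+    #     t = titles[0]
+    #     t["TitleType"]              # plain indexing: exactly one value, else KeyError/Exception
+    #
+    # reference names such as "Title" are mapped to the short tags via get_shortname().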
+ + def __init__ (self, p): + self.p = p + + @staticmethod + def reify_child (v): + if len (v._dir) == 1 and isinstance (v._dir[0], StringTypes): + return v._dir[0] + else: + return OnixProduct (v) + + def __getitem__ (self, n): + slicing = False + if isinstance (n, SliceType): + slicing = True + reference_name = n.start + else: + reference_name = n + name = OnixProduct.get_shortname (reference_name) # or reference_name.lower () + values = self.p[name:] + if slicing: + return map (OnixProduct.reify_child, values) + else: + if len (values) == 0: + raise KeyError ("no value for %s (%s)" % (reference_name, name)) + elif len (values) > 1: + raise Exception ("more than one value for %s (%s)" % (reference_name, name)) + return OnixProduct.reify_child (values[0]) + + def get (self, n): + try: + return self.__getitem__ (n) + except KeyError: + return None + + def getLineNumber (self): + return self.p.getLineNumber () + + def __unicode__ (self): + return self.p.__unicode__ () + + def __str__ (self): + return self.__unicode__ () + + def pi_type_name (code): + return onix_codelists["List5"][code][0] + + @staticmethod + def contributor_role (code): + return onix_codelists["List17"][code][0] + + @staticmethod + def get_shortname (reference_name): + try: + return onix_shortnames[reference_name] + except KeyError: + raise Exception ("unknown reference name: %s" % reference_name) + +class OnixHandler (ContentHandler): + + def __init__ (self, parser, receiver): + self.parser = parser + self.receiver = receiver + self.subhandler = None + ContentHandler.__init__ (self) + + def startElementNS (self, name, qname, attrs): + if self.subhandler: + self.subhandler.startElementNS (name, qname, attrs) + self.subdepth += 1 + else: + (uri, localname) = name + if localname == "product": + self.subhandler = xmltramp.Seeder (self.parser) + self.subhandler.startElementNS (name, qname, attrs) + self.subdepth = 1 + + def endElementNS (self, name, qname): + if self.subhandler: + self.subhandler.endElementNS (name, qname) + self.subdepth -= 1 + if self.subdepth == 0: + self.receiver (self.subhandler.result) + self.subhandler = None + + def characters (self, content): + if self.subhandler: + self.subhandler.characters (content) + +def parse_shortnames (input): + def schema (name, attrs): + def element (name, attrs): + def typespec (name, attrs): + def attribute (name, attrs): + if (attrs.getValueByQName ('name') == "shortname"): + shortname = attrs.getValueByQName ('fixed') + return CollectorValue (shortname) + else: + return CollectorNone () + return NodeCollector ({ 'attribute': attribute, collector_any: typespec }) + elt_name = attrs.getValueByQName ('name') + return NamedCollector (elt_name, { collector_any: typespec }) + return DictCollector ({ 'element': element }) + return collector_parse (input, { 'schema': schema }) + +def parse_codelists (input): + def schema (name, attrs): + def simpleType (name, attrs): + def restriction (name, attrs): + def enumeration (name, attrs): + def annotation (name, attrs): + def documentation (name, attrs): + return TextCollector () + return ListCollector ({ 'documentation': documentation }) + return NamedCollector (attrs.getValueByQName (u'value'), { 'annotation': annotation }) + return DictCollector ({ 'enumeration': enumeration }) + return NamedCollector (attrs.getValueByQName (u'name'), { 'restriction': restriction }) + return DictCollector ({ 'simpleType': simpleType }) + return collector_parse (input, { 'schema': schema }) + +init () + +### testing + +from xml.sax.saxutils import 
prepare_input_source + +class TestErrorHandler: + def error (self, exn): + raise exn + def fatalError (self, exn): + raise exn + def warning (self, exn): + sys.stderr.write ("warning: %s\n" % exn.getMessage) + +def produce_items (input, produce): + source = prepare_input_source (input) + + parser = xml.sax.make_parser () + parser.setFeature (xml.sax.handler.feature_namespaces, 1) + parser.setContentHandler (OnixHandler (parser, process_item)) + url_cache_dir = os.getenv ("URL_CACHE_DIR") + if url_cache_dir: + sys.stderr.write ("using url cache in %s\n" % url_cache_dir) + parser.setEntityResolver (CachingEntityResolver (parser, url_cache_dir)) + else: + sys.stderr.write ("no url_cache_dir; XML resources will always be loaded from network\n") + parser.setErrorHandler (TestErrorHandler ()) + parser.parse (source) + +def process_item (i): + print(OnixProduct (i)) + +if __name__ == "__main__": + from sys import stdin + print("Reading ONIX data from standard input ...") + produce_items (stdin, process_item) diff --git a/ia-legacy-importer/onix/parse.py b/ia-legacy-importer/onix/parse.py new file mode 100644 index 00000000..ccf771db --- /dev/null +++ b/ia-legacy-importer/onix/parse.py @@ -0,0 +1,276 @@ +# provides a parser from ONIX files to Open Library items + +import re +import sys +import os +from types import * +from lang import * + +import xml.sax +from xml.sax.handler import * +from xml.sax.saxutils import prepare_input_source + +from thread_utils import AsyncChannel, threaded_generator +from onix import OnixProduct, OnixHandler, onix_codelists + +import six + +def parser (input): + # returns a generator that produces dicts representing Open Library items + + def produce_items (produce): + source = prepare_input_source (input) + + parser = xml.sax.make_parser () + parser.setFeature (xml.sax.handler.feature_namespaces, 1) + parser.setContentHandler (OnixHandler (parser, process_product)) + url_cache_dir = os.getenv ("URL_CACHE_DIR") + if url_cache_dir: + sys.stderr.write ("using url cache in %s\n" % url_cache_dir) + parser.setEntityResolver (CachingEntityResolver (parser, url_cache_dir)) + else: + sys.stderr.write ("no url_cache_dir; XML resources will always be loaded from network\n") + parser.setErrorHandler (MyErrorHandler ()) + parser.parse (source) + + return threaded_generator (produce_items, 50) + +def process_product (p): + op = OnixProduct (p) # the incoming record + o = {} # the Open Library item we're producing + + # record id + o['source_record_lineno'] = p.getLineNumber () + + # title, subtitle + tt = [ t for t in op["Title":] if t["TitleType"] == '01' ] + if len (tt) > 1: + raise Exception ("more than one distinctive title") + elif len(tt) == 0: + raise Exception ("no distinctive title") + t = tt[0] + prefix = t.get ("TitlePrefix") + if prefix: + prefix = prefix.strip () + o['title_prefix_len'] = len (prefix) + 1 # prefix plus space + o['title'] = prefix + " " + t["TitleWithoutPrefix"].strip () + else: + title = t.get ("TitleText") + if title: + o['title'] = title + subtitle = t.get ("Subtitle") + if subtitle: + o['subtitle'] = subtitle + + # id codes (ISBN, etc.) 
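+    # (ProductIDType values come from ONIX code list 5; type '01' appears to be the
+    #  proprietary/internal identifier, which is why it is skipped below -- only named
+    #  types such as ISBN are copied onto the item under their list-5 name.)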
+ for pi in op["ProductIdentifier":]: + pi_type = pi["ProductIDType"] + pi_val = pi["IDValue"] + if pi_type != '01': + type_name = str (OnixProduct.pi_type_name (pi_type)).replace ("-", "_") + o[type_name] = pi_val + + # author, contributors + for c in op["Contributor":]: + role_codes = sorted(c["ContributorRole":]) + role_code = role_codes[0] + + name = person_name (c) + if not name: + warn ("=====> no name for contributor at line %d" % c.getLineNumber ()) + continue + + if role_code != 'A01': + role = OnixProduct.contributor_role (role_code) + add_val (o, "contributions", role + ": " + name) + continue + + author = {} + author["name"] = name + add_val (o, "authors", author) + + # iname = c.get ("PersonNameInverted") + # if iname: + # author["inverted_name"] = iname + # # XXX else construct inverted name from name parts + + pnis = c["PersonNameIdentifier":] + if len (pnis) > 0: + warn ("got PersonNameIdentifier(s): %s" % pnis[0]["IDValue"]) + + # other_names = c["Name":] + # XX: for pseudonyms, etc. ... should stash this somewhere + + for pdate in c["PersonDate":]: + role = pdate["PersonDateRole"] + # fmt = None + # fmt_code = pdate.get ("DateFormat") + # if fmt_code: + # fmt = onix_codelists["List55"][fmt_code] + date = pdate["Date"] + if role == "007": author["birth_date"] = date + elif role == "008": author["death_date"] = date + else: die ("bad date role: %s" % role) + + bio = c.get ("BiographicalNote") + if bio: + author["bio"] = bio + + # website + # country + # region + + contrib = op.get ("ContributorStatement") + if not o.get ("authors"): + # XXX: shouldn't do this: the ContributorStatement could have anything in it + # ... but this is the only way to get author names for one of the catalogs + if contrib: + author = {} + author["name"] = re_by.sub ('', contrib) + add_val (o, "authors", author) + + # edition + ed_type = op.get ("EditionTypeCode") + if ed_type: + o["edition_type"] = self.codelists["List21"][ed_type][0] + ed_number = op.get ("EditionNumber") + if ed_number: + ed_vers_num = op.get ("EditionVersionNumber") + if ed_vers_num: + ed_number += "-" + ed_vers_num + o["edition_number"] = ed_number + edition = op.get ("EditionStatement") + if edition: + o["edition"] = edition + + # format + format = op.get ("ProductFormDescription") + if format: + o["physical_format"] = format + npages = op.get ("NumberOfPages") + if npages: + o["number_of_pages"] = npages + nillus = op.get ("NumberOfIllustrations") + if nillus: + o["number_of_illustrations"] = nillus + ill_note = op.get ("IllustrationsNote") + if ill_note: + add_val (o, "notes", ill_note) + # see also composite + + # dimensions + + # language + # (see also composite) + lang_code = op.get ("LanguageOfText") + if lang_code: + o["language_code"] = lang_code + o["language"] = self.codelists["List74"][lang_code][0] + + # subject + bisac = op.get ("BISACMainSubject") + if bisac: + add_val (o, "BISAC_subject_categories", bisac) + for subject in op["Subject":]: + scheme = subject.get ("SubjectSchemeIdentifier") + if scheme and scheme == "10": + code = subject.get ("SubjectCode") + if code: + add_val (o, "BISAC_subject_categories", code) + + # description + for text in op["OtherText":]: + # type = text["TextTypeCode"] + format = text["TextFormat"] + if format not in ("00", "02", "07"): # ASCII, HTML, Basic ASCII + raise Exception ("unsupported description format: %s" % self.codelists["List34"][format][0]) + if o.get ("description"): + o["description"] += "\n" + text["Text"] + else: + o["description"] = text["Text"] + if not o.get 
("description"): + descr = op.get ("MainDescription") + if descr: + o["description"] = descr + + self.receiver (o) + + # publisher + for pub in op["Publisher":]: + role = pub.get ("PublishingRole") + if role is None or role == "01": + name = pub.get ("PublisherName") + if name: + o["publisher"] = name + break + if not o.get ("publisher"): + pub = op.get ("PublisherName") + if pub: + o["publisher"] = pub + + # imprint + imprint = op.get ("Imprint") + if imprint: + name = imprint.get ("ImprintName") + if name: + o["imprint"] = name + if not o.get ("imprint"): + imprint = op.get ("ImprintName") + if imprint: + o["imprint"] = imprint + + # publish_status + pstat = op.get ("PublishingStatus") + if pstat and pstat != "??": + status = self.codelists["List64"][pstat][0] + pstatnote = op.get ("PublishingStatusNote") + if pstatnote: + stats += ": " + pstatnote + o["publish_status"] = status + + # publish_date + pdate = op.get ("PublicationDate") + if pdate: + o["publish_date"] = pdate # YYYY[MM[DD]] + # XXX: need to convert + +class MyErrorHandler: + def error (self, exn): + raise exn + def fatalError (self, exn): + raise exn + def warning (self, exn): + sys.stderr.write ("warning: %s\n" % exn.getMessage) + +name_parts = ["TitlesBeforeNames", "NamesBeforeKey", "PrefixToKey", "KeyNames", "NamesAfterKey", "SuffixToKey"] +def person_name (x): + global name_parts + name = x.get ("PersonName") + if not name: + parts = [ p for p in map (lambda p: x.get (p), name_parts) if p ] + name = " ".join (parts) + if not name: + iname = x.get ("PersonNameInverted") + if iname: + # XXX this often works, but is not reliable; + # shouldn't really mess with unstructured names + m = re_iname.match (iname) + if m: + name = m.group (2) + " " + m.group (1) + else: + name = iname + if not name: + name = x.get ("CorporateName") + return name + +def elt_get (e, tag, reference_name): + ee = e.get (tag) or e.get (reference_name.lower ()) + return six.text_type(ee) if ee else None + + +re_by = re.compile ("^\s*by\s+", re.IGNORECASE) +re_iname = re.compile ("^(.*),\s*(.*)$") + +def add_val (o, key, val): + if val is not None: + o.setdefault (key, []).append (val) diff --git a/ia-legacy-importer/onix/sax_utils.py b/ia-legacy-importer/onix/sax_utils.py new file mode 100644 index 00000000..9aa01aa1 --- /dev/null +++ b/ia-legacy-importer/onix/sax_utils.py @@ -0,0 +1,177 @@ +import os +from types import * +from urlcache import URLCache +import xml.sax +from xml.sax.handler import * +import sys + +from six.moves import urllib + + +class CachingEntityResolver (EntityResolver): + def __init__ (self, parser, dir): + self.parser = parser + if not os.path.isdir (dir): + raise Exception ("CachingEntityResolver: no such directory: %s" % dir) + self.cache = URLCache (dir) + + def resolveEntity (self, pubid, sysid): + parser_sysid = self.parser.getSystemId () + src = None + if sysid.startswith ("http:"): + src = self.resolveURL (sysid) + elif isinstance (parser_sysid, StringTypes) and parser_sysid.startswith ("http:"): + src = self.resolveURL (sysid, parser_sysid) + if not src: + src = EntityResolver.resolveEntity (self, p, s) + return src + + def resolveURL (self, sysid, base = ""): + url = urllib.parse.urljoin (base, sysid) + source = xml.sax.xmlreader.InputSource (url) + f = self.cache.get (url) + source.setByteStream (f) + return source + +def collector_parse (input, dispatch): + parser = xml.sax.make_parser () + parser.setFeature (xml.sax.handler.feature_namespaces, 1) + handler = CollectorHandler (parser, dispatch) + # 
parser.setContentHandler (handler) # CollectorHandler sets ContentHandler + parser.parse (input) + return handler.get_value () + +class CollectorHandler: + def __init__ (self, parser, base): + self.parser = parser + base_collector = None + if isinstance (base, Collector): + base_collector = base + else: + base_collector = NodeCollector (base) + self.collectors = [base_collector] + base_collector.start (None, self) + self.set_handler () + + def get_value (self): + if len (self.collectors) == 1: + return self.collectors[0].finish () + else: + raise Exception ("CollectorHandler.get_value(): collection not finished") + + def top_collector (self): + if not len (self.collectors): + return None + else: + return self.collectors[-1] + + def push_collector (self, collector): + self.collectors.append (collector) + self.set_handler () + + def pop_collector (self): + self.collectors.pop () + self.set_handler () + + def set_handler (self): + self.parser.setContentHandler (self.top_collector ()) + +class Collector (ContentHandler): + def start (self, parent, handler): + self.parent = parent + self.handler = handler + def end (self): + self.handler.pop_collector () + self.handler = None + value = self.finish () + if not isinstance (value, CollectorNoneValue): + self.parent.collect (value) + self.parent = None + def finish (self): + pass + def endElementNS (self, name, qname): + self.end () + +class TextCollector (Collector): + def __init__ (self): + self.value = None + def characters (self, content): + self.value = content + def finish (self): + return self.value + +class NodeCollector (Collector): + def __init__ (self, collector_table, strict=False): + self.collector_table = collector_table + self.strict = strict + self.ignoring = 0 + self.value = collector_none + def startElementNS (self, name, qname, attrs): + if self.ignoring: + self.ignoring += 1 + else: + (uri, localname) = name + c_maker = self.collector_table.get (localname) or self.collector_table.get (collector_any) + if c_maker: + c = c_maker (name, attrs) + c.start (self, self.handler) + self.handler.push_collector (c) + else: + if self.strict: + raise Exception ("no handler for element '%s'; handlers: %s" % (localname, self.collector_table.keys ())) + else: + self.ignoring += 1 + def endElementNS (self, name, qname): + if self.ignoring: + self.ignoring -= 1 + else: + self.end () + def collect (self, value): + self.value = value + def finish (self): + return self.value + +class NamedCollector (NodeCollector): + def __init__ (self, name, collector_table): + NodeCollector.__init__ (self, collector_table) + self.name = name + def finish (self): + if self.value is collector_none: + return collector_none + else: + return (self.name, self.value) + +class ListCollector (NodeCollector): + def __init__ (self, collector_table): + NodeCollector.__init__ (self, collector_table) + self.values = [] + def collect (self, value): + self.values.append (value) + def finish (self): + return self.values + +class DictCollector (NodeCollector): + def __init__ (self, collector_table): + NodeCollector.__init__ (self, collector_table) + self.values = {} + def collect (self, key_value): + (key, value) = key_value + if self.values.get (key): + raise Exception ("dictionary key '%s' is already mapped" % key) + else: + self.values[key] = value + def finish (self): + return self.values + +class CollectorValue (NodeCollector): + def __init__ (self, val): + NodeCollector.__init__ (self, {}, strict=False) + self.collect (val) + +class CollectorNoneValue: pass +collector_none = 
CollectorNoneValue () +def CollectorNone (): + return CollectorValue (collector_none) + +class CollectorAnyElement: pass +collector_any = CollectorAnyElement () diff --git a/ia-legacy-importer/onix/test-onix.sh b/ia-legacy-importer/onix/test-onix.sh new file mode 100755 index 00000000..b33eb8f4 --- /dev/null +++ b/ia-legacy-importer/onix/test-onix.sh @@ -0,0 +1,6 @@ +#!/bin/sh -e + +. ./config.sh + +exec $PYTHON_INTERPRETER onix.py + diff --git a/ia-legacy-importer/onix/thread_utils.py b/ia-legacy-importer/onix/thread_utils.py new file mode 100644 index 00000000..f8167212 --- /dev/null +++ b/ia-legacy-importer/onix/thread_utils.py @@ -0,0 +1,85 @@ +# 2007 dbg for the Internet Archive + +import sys +from threading import Thread, Lock, Condition + +from six import reraise + +class AsyncChannel: + # yes, i believe this is just Queue ... i was new to python and couldn't find it + + def __init__ (self, buffer_size=1): + self.buffer = [] + self.max_items = buffer_size + self.lock = Lock () + self.not_empty = Condition (self.lock) + self.not_full = Condition (self.lock) + + def get (self): + self.lock.acquire () + while len (self.buffer) == 0: + self.not_empty.wait () + val = self.buffer.pop (0) + self.not_full.notifyAll () + self.lock.release () + return val + + def put (self, val): + self.lock.acquire () + while len (self.buffer) == self.max_items: + self.not_full.wait () + self.buffer.append (val) + self.not_empty.notifyAll () + self.lock.release () + +class ForeignException: + + def __init__ (self, exc_type, exc_value, exc_traceback): + self.exc_type = exc_type + self.exc_value = exc_value + self.exc_traceback = exc_traceback + + def re_raise (self): + reraise(self.exc_type, self.exc_value, self.exc_traceback) + +def ForeignException_extract (): + (exc_type, exc_value, exc_traceback) = sys.exc_info() + return ForeignException (exc_type, exc_value, exc_traceback) + +def threaded_generator (producer, buffer_size=1): + # the producer function will be invoked with a single argument, a "produce" function. + # the producer may pass an object to this "produce" function any number of times before + # returning. the values thus passed will, in turn, be produced by the generator which + # is the return value of threaded_generator(). + # + # this provides a sort of coroutine facility, because python's generators can't do that: + # they can only yield values from the bottom of the call stack. sometimes you need to + # keep control context between producing values. 
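+    #
+    # a minimal usage sketch (the producer below is made up for illustration):
+    #
+    #     def produce_numbers (produce):
+    #         for i in range (3):
+    #             produce (i)
+    #
+    #     for v in threaded_generator (produce_numbers, buffer_size=2):
+    #         print v    # 0, 1, 2 -- each value is handed across the AsyncChannel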
+ + t = None + chan = AsyncChannel (buffer_size) + + def produce (val): + chan.put (val) + + def main (): + try: + producer (produce) + chan.put (StopIteration ()) + except: + chan.put (ForeignException_extract ()) + + def generator (): + while True: + v = chan.get () + if isinstance (v, StopIteration): + break + if isinstance (v, ForeignException): + v.re_raise () + else: + yield v + + t = Thread (target=main) + t.setDaemon (True) + t.start () + return generator () diff --git a/ia-legacy-importer/onix/urlcache.py b/ia-legacy-importer/onix/urlcache.py new file mode 100644 index 00000000..ac5d9298 --- /dev/null +++ b/ia-legacy-importer/onix/urlcache.py @@ -0,0 +1,76 @@ +import sys +import os +import time +import shutil +import traceback +from fcntl import * + +from six.moves import urllib + + +class URLCache: + def __init__ (self, dir): + self.dir = dir + + def get_entries (self): + entries = {} + index_file = self.dir + "/index" + next = 0 + index = open (index_file, "a") # create index file if it doesn't exist + index.close () + index = open (index_file, "r+") + flock (index, LOCK_EX) + + for url in index: + entries[url.rstrip ()] = next + next += 1 + return (entries, next, index) + + def get (self, url): + url = url.strip () + (entries, next, index) = self.get_entries () + id = entries.get (url) + if id is None: + # with index locked, add an entry for this url and + # open a locked, temporary file to load its data + index.seek (0, 2) + index.write ("%s\n" % url) + data_file = self.dir + "/" + str (next) + tmp_data_file = data_file + "-fetching" + tmp_data = open (tmp_data_file, "w") + flock (tmp_data, LOCK_EX) + index.close () + + # having released the lock on the index, suck data + # into the temporary file + sys.stderr.write ("URLCache: fetching %s\n" % url) + net_data = urllib.request.urlopen (url) + shutil.copyfileobj (net_data, tmp_data) + tmp_data.flush () + os.link (tmp_data_file, data_file) # the fetch is good: attach it + tmp_data.close () # drop lock on temporary file + os.unlink (tmp_data_file) + id = next + + else: + # there is already an entry for this url, so release the lock on the index + index.close () + + data_file = self.dir + "/" + str (id) + if os.path.exists (data_file): + return open (data_file, "r") + else: + # wait for fetch to finish + tmp_data_file = data_file + "-fetching" + sys.stderr.write ("URLCache: waiting for %s\n" % data_file) + try: + try: + tmp_data = open (tmp_data_file) + flock (tmp_data, LOCK_SH) + tmp_data.close () + except OSError as e: + pass + return open (data_file, "r") + except Exception as exn: + # in case this happens, just blow away your cache + raise Exception ("URLCache: sorry, corrupted state for url '%s': %s" % (url, str (exn))) diff --git a/ia-legacy-importer/onix/xmltramp.py b/ia-legacy-importer/onix/xmltramp.py new file mode 100644 index 00000000..7d681cde --- /dev/null +++ b/ia-legacy-importer/onix/xmltramp.py @@ -0,0 +1,375 @@ +"""xmltramp: Make XML documents easily accessible.""" + +__version__ = "2.17" +__author__ = "Aaron Swartz" +__credits__ = "Many thanks to pjz, bitsko, and DanC." +__copyright__ = "(C) 2003-2006 Aaron Swartz. GNU GPL 2." 
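+
+# hedged usage sketch (the XML snippet is made up for illustration):
+#
+#     doc = parse ("<doc><author>Aaron Swartz</author></doc>")
+#     doc.author          # first child element named 'author'
+#     doc['author':]      # all children named 'author', as a list
+#     str (doc.author)    # -> 'Aaron Swartz'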
+ +import six + +def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u'')) +def islst(f): return isinstance(f, type(())) or isinstance(f, type([])) + +empty = {'http://www.w3.org/1999/xhtml': ['img', 'br', 'hr', 'meta', 'link', 'base', 'param', 'input', 'col', 'area']} + +def quote(x, elt=True): + if elt and '<' in x and len(x) > 24 and x.find(']]>') == -1: return "" + else: x = x.replace('&', '&').replace('<', '<').replace(']]>', ']]>') + if not elt: x = x.replace('"', '"') + return x + +@six.python_2_unicode_compatible +class Element: + def __init__(self, name, attrs=None, children=None, prefixes=None, line=None): + if islst(name) and name[0] == None: name = name[1] + if attrs: + na = {} + for k in attrs.keys(): + if islst(k) and k[0] == None: na[k[1]] = attrs[k] + else: na[k] = attrs[k] + attrs = na + + self._name = name + self._attrs = attrs or {} + self._dir = children or [] + + prefixes = prefixes or {} + self._prefixes = dict(zip(prefixes.values(), prefixes.keys())) + + if prefixes: self._dNS = prefixes.get(None, None) + else: self._dNS = None + + self._line = line + + def __repr__(self, recursive=0, multiline=0, inprefixes=None): + def qname(name, inprefixes): + if islst(name): + if inprefixes[name[0]] is not None: + return inprefixes[name[0]]+':'+name[1] + else: + return name[1] + else: + return name + + def arep(a, inprefixes, addns=1): + out = '' + + for p in self._prefixes.keys(): + if not p in inprefixes.keys(): + if addns: out += ' xmlns' + if addns and self._prefixes[p]: out += ':'+self._prefixes[p] + if addns: out += '="'+quote(p, False)+'"' + inprefixes[p] = self._prefixes[p] + + for k in a.keys(): + out += ' ' + qname(k, inprefixes)+ '="' + quote(a[k], False) + '"' + + return out + + inprefixes = inprefixes or {u'http://www.w3.org/XML/1998/namespace':'xml'} + + # need to call first to set inprefixes: + attributes = arep(self._attrs, inprefixes, recursive) + out = '<' + qname(self._name, inprefixes) + attributes + + if not self._dir and (self._name[0] in empty.keys() + and self._name[1] in empty[self._name[0]]): + out += ' />' + return out + + out += '>' + + if recursive: + content = 0 + for x in self._dir: + if isinstance(x, Element): content = 1 + + pad = '\n' + ('\t' * recursive) + for x in self._dir: + if multiline and content: out += pad + if isstr(x): out += quote(x) + elif isinstance(x, Element): + out += x.__repr__(recursive+1, multiline, inprefixes.copy()) + else: + raise TypeError("I wasn't expecting "+ repr(x) +".") + if multiline and content: out += '\n' + ('\t' * (recursive-1)) + else: + if self._dir: out += '...' 
+ + out += '' + + return out + + def __str__(self): + text = '' + for x in self._dir: + text += six.text_type(x) + return ' '.join(text.split()) + + def __getattr__(self, n): + if n[0] == '_': raise AttributeError("Use foo['"+n+"'] to access the child element.") + if self._dNS: n = (self._dNS, n) + for x in self._dir: + if isinstance(x, Element) and x._name == n: return x + raise AttributeError('No child element named %s' % repr(n)) + + def __hasattr__(self, n): + for x in self._dir: + if isinstance(x, Element) and x._name == n: return True + return False + + def __setattr__(self, n, v): + if n[0] == '_': self.__dict__[n] = v + else: self[n] = v + + + def __getitem__(self, n): + if isinstance(n, type(0)): # d[1] == d._dir[1] + return self._dir[n] + elif isinstance(n, slice(0).__class__): + # numerical slices + if isinstance(n.start, type(0)): return self._dir[n.start:n.stop] + + # d['foo':] == all s + n = n.start + if self._dNS and not islst(n): n = (self._dNS, n) + out = [] + for x in self._dir: + if isinstance(x, Element) and x._name == n: out.append(x) + return out + else: # d['foo'] == first + if self._dNS and not islst(n): n = (self._dNS, n) + for x in self._dir: + if isinstance(x, Element) and x._name == n: return x + raise KeyError + + def __setitem__(self, n, v): + if isinstance(n, type(0)): # d[1] + self._dir[n] = v + elif isinstance(n, slice(0).__class__): + # d['foo':] adds a new foo + n = n.start + if self._dNS and not islst(n): n = (self._dNS, n) + + nv = Element(n) + self._dir.append(nv) + + else: # d["foo"] replaces first and dels rest + if self._dNS and not islst(n): n = (self._dNS, n) + + nv = Element(n); nv._dir.append(v) + replaced = False + + todel = [] + for i in range(len(self)): + if self[i]._name == n: + if replaced: + todel.append(i) + else: + self[i] = nv + replaced = True + if not replaced: self._dir.append(nv) + for i in todel: del self[i] + + def __delitem__(self, n): + if isinstance(n, type(0)): del self._dir[n] + elif isinstance(n, slice(0).__class__): + # delete all s + n = n.start + if self._dNS and not islst(n): n = (self._dNS, n) + + for i in range(len(self)): + if self[i]._name == n: del self[i] + else: + # delete first foo + for i in range(len(self)): + if self[i]._name == n: del self[i] + break + + def __call__(self, *_pos, **_set): + if _set: + for k in _set.keys(): self._attrs[k] = _set[k] + if len(_pos) > 1: + for i in range(0, len(_pos), 2): + self._attrs[_pos[i]] = _pos[i+1] + if len(_pos) == 1: + return self._attrs[_pos[0]] + if len(_pos) == 0: + return self._attrs + + def __len__(self): return len(self._dir) + + def get(self, n): + try: + return self.__getitem__(n) + except KeyError: + return None + + def getLineNumber (self): + return self._line + +class Namespace: + def __init__(self, uri): self.__uri = uri + def __getattr__(self, n): return (self.__uri, n) + def __getitem__(self, n): return (self.__uri, n) + +from xml.sax.handler import EntityResolver, DTDHandler, ContentHandler, ErrorHandler + +class Seeder(EntityResolver, DTDHandler, ContentHandler, ErrorHandler): + def __init__(self, parser=None): + if parser: + self.getLineNumber = lambda: parser.getLineNumber () + else: + self.getLineNumber = lambda: None + self.stack = [] + self.ch = '' + self.prefixes = {} + ContentHandler.__init__(self) + + def startPrefixMapping(self, prefix, uri): + if prefix not in self.prefixes: self.prefixes[prefix] = [] + self.prefixes[prefix].append(uri) + def endPrefixMapping(self, prefix): + self.prefixes[prefix].pop() + + def startElementNS(self, name, 
qname, attrs): + ch = self.ch; self.ch = '' + if ch and not ch.isspace(): self.stack[-1]._dir.append(ch) + + attrs = dict(attrs) + newprefixes = {} + for k in self.prefixes.keys(): newprefixes[k] = self.prefixes[k][-1] + + self.stack.append(Element(name, attrs, prefixes=newprefixes.copy(), line=self.getLineNumber ())) + + def characters(self, ch): + self.ch += ch + + def endElementNS(self, name, qname): + ch = self.ch; self.ch = '' + if ch and not ch.isspace(): self.stack[-1]._dir.append(ch) + + element = self.stack.pop() + if self.stack: + self.stack[-1]._dir.append(element) + else: + self.result = element + +from xml.sax import make_parser +from xml.sax.handler import feature_namespaces + +def seed(fileobj): + seeder = Seeder() + parser = make_parser() + parser.setFeature(feature_namespaces, 1) + parser.setContentHandler(seeder) + parser.parse(fileobj) + return seeder.result + +def parse(text): + from six import StringIO + return seed(StringIO(text)) + +def load(url): + import urllib + return seed(urllib.urlopen(url)) + +def unittest(): + parse('afoobara').__repr__(1,1) == \ + '\n\ta\n\t\tfoobar\n\ta\n' + + assert str(parse("")) == "" + assert str(parse("I love you.")) == "I love you." + assert parse("\nmom\nwow\n")[0].strip() == "mom\nwow" + assert str(parse(' center ')) == "center" + assert str(parse('\xcf\x80')) == '\xcf\x80' + + d = Element('foo', attrs={'foo':'bar'}, children=['hit with a', Element('bar'), Element('bar')]) + + try: + d._doesnotexist + raise Exception("ExpectedError but found success. Damn.") + except AttributeError: pass + assert d.bar._name == 'bar' + try: + d.doesnotexist + raise Exception("ExpectedError but found success. Damn.") + except AttributeError: pass + + assert hasattr(d, 'bar') == True + + assert d('foo') == 'bar' + d(silly='yes') + assert d('silly') == 'yes' + assert d() == d._attrs + + assert d[0] == 'hit with a' + d[0] = 'ice cream' + assert d[0] == 'ice cream' + del d[0] + assert d[0]._name == "bar" + assert len(d[:]) == len(d._dir) + assert len(d[1:]) == len(d._dir) - 1 + assert len(d['bar':]) == 2 + d['bar':] = 'baz' + assert len(d['bar':]) == 3 + assert d['bar']._name == 'bar' + + d = Element('foo') + + doc = Namespace("http://example.org/bar") + bbc = Namespace("http://example.org/bbc") + dc = Namespace("http://purl.org/dc/elements/1.1/") + d = parse(""" + John Polk and John Palfrey + John Polk + John Palfrey + Buffy + """) + + assert repr(d) == '...' + assert d.__repr__(1) == 'John Polk and John PalfreyJohn PolkJohn PalfreyBuffy' + assert d.__repr__(1,1) == '\n\tJohn Polk and John Palfrey\n\tJohn Polk\n\tJohn Palfrey\n\tBuffy\n' + + assert repr(parse("")) == '' + + assert str(d.author) == str(d['author']) == "John Polk and John Palfrey" + assert d.author._name == doc.author + assert str(d[dc.creator]) == "John Polk" + assert d[dc.creator]._name == dc.creator + assert str(d[dc.creator:][1]) == "John Palfrey" + d[dc.creator] = "Me!!!" + assert str(d[dc.creator]) == "Me!!!" + assert len(d[dc.creator:]) == 1 + d[dc.creator:] = "You!!!" + assert len(d[dc.creator:]) == 2 + + assert d[bbc.show](bbc.station) == "4" + d[bbc.show](bbc.station, "5") + assert d[bbc.show](bbc.station) == "5" + + e = Element('e') + e.c = '' + assert e.__repr__(1) == '<img src="foo">' + e.c = '2 > 4' + assert e.__repr__(1) == '2 > 4' + e.c = 'CDATA sections are closed with ]]>.' + assert e.__repr__(1) == 'CDATA sections are <em>closed</em> with ]]>.' + e.c = parse('
    i
    love
    you
    ') + assert e.__repr__(1) == '
    i
    love
    you
    ' + + e = Element('e') + e('c', 'that "sucks"') + assert e.__repr__(1) == '' + + + assert quote("]]>") == "]]>" + assert quote('< dkdkdsd dkd sksdksdfsd fsdfdsf]]> kfdfkg >') == '< dkdkdsd dkd sksdksdfsd fsdfdsf]]> kfdfkg >' + + assert parse('').__repr__(1) == '' + assert parse('').__repr__(1) == '' + +if __name__ == '__main__': unittest() diff --git a/ia-legacy-importer/people/build_object.py b/ia-legacy-importer/people/build_object.py new file mode 100644 index 00000000..9fcc6a8d --- /dev/null +++ b/ia-legacy-importer/people/build_object.py @@ -0,0 +1,64 @@ +from openlibrary.catalog.utils import flip_name, pick_first_date + + +def build_person_object(p, marc_alt): + ab = [(k, v.strip(' /,;:')) for k, v in p if k in 'ab'] + + has_b = any(k=='b' for k, v in p) + + orig_name = ' '.join(v if k == 'a' else v for k, v in ab) + c = ' '.join(v for k, v in p if k == 'c') + name = flip_name(orig_name) + if name[0].isdigit(): + name = orig_name + else: + of_count = c.count('of ') + # if of_count == 1 and not has_b and 'of the ' not in c: + # if c.startswith('King') + # + # if c.startswith('Queen'): + # name += ' ' + c[c.find('of '):] + # + if of_count == 1 and 'of the ' not in c and 'Emperor of ' not in c: + name += ' ' + c[c.find('of '):] + elif ' ' not in name and of_count > 1: + name += ', ' + c + elif c.endswith(' of') or c.endswith(' de') and any(k == 'a' and ', ' in v for k, v in p): + name = ' '.join(v for k, v in ab) + c += ' ' + name[:name.find(', ')] + name = name[name.find(', ') + 2:] + ', ' + c + + person = {} + d = [v for k, v in p if k =='d'] + if d: + person = pick_first_date(d) + person['name'] = name + person['sort'] = orig_name + + if any(k=='b' for k, v in p): + person['enumeration'] = ' '.join(v for k, v in p if k == 'b') + + if c: + person['title'] = c + person['marc'] = [p] + list(marc_alt) + + return person + +def test_consort(): + line = (('a', u'Elizabeth'), ('c', u'Empress, consort of Franz Joseph, Emperor of Austria')) + p = build_person_object(marc, []) + p['name'] == u'Empress Elizabeth, consort of Franz Joseph, Emperor of Austria', + + line = (('a', u'Mary'), ('c', u'Queen, Consort of George V, King of Great Britain'), ('d', u'1867-1953')) + p = build_person_object(marc, []) + p['name'] == u'Queen Mary, Consort of George V, King of Great Britain' + +def test_king_no_number(): + marc = (('a', u'Henry'), ('b', u'IV'), ('c', u'King of England'), ('d', u'1367-1413')) + p = build_person_object(marc, []) + assert p['name'] == u'Henry IV of England' + + marc = (('a', u'John'), ('c', u'King of England'), ('d', u'1167-1216')) + p = build_person_object(marc, []) + assert p['name'] == 'King John of England' + diff --git a/ia-legacy-importer/people/from_works.py b/ia-legacy-importer/people/from_works.py new file mode 100644 index 00000000..58530672 --- /dev/null +++ b/ia-legacy-importer/people/from_works.py @@ -0,0 +1,143 @@ +from __future__ import print_function +from openlibrary.catalog.utils.query import query_iter, set_staging, get_mc +from openlibrary.catalog.get_ia import get_data +from openlibrary.catalog.marc.fast_parse import get_tag_lines, get_all_subfields, get_subfields + +from pprint import pprint +from identify_people import read_people +from build_object import build_person_object +import sys +from collections import defaultdict + +set_staging(True) + +def work_and_marc(): + i = 0 + skip = True + for w in query_iter({'type': '/type/work', 'title': None}): + if skip: + if w['key'] == '/w/OL56814W': + skip = False + else: + continue + marc = set() + q = {'type': 
'/type/edition', 'works': w['key'], 'title': None, 'source_records': None} + for e in query_iter(q): + if e.get('source_records', []): + marc.update(i[5:] for i in e['source_records'] if i.startswith('marc:')) + mc = get_mc(e['key']) + if mc and not mc.startswith('ia:') and not mc.startswith('amazon:'): + marc.add(mc) + if marc: + yield w, marc + else: + print('no marc:', w) + + +def read_works(): + i = 0 + pages = {} + page_marc = {} + + for work, marc in work_and_marc(): + lines = [] + for loc in marc: + data = get_data(loc) + if not data: + continue + found = [v for k, v in get_tag_lines(data, set(['600']))] + if found: + lines.append((loc, found)) + if not lines: + continue + work['lines'] = lines + i += 1 + print(i, work['key'], work['title']) + + try: + people, marc_alt = read_people(j[1] for j in lines) + except AssertionError: + print(work['lines']) + continue + except KeyError: + print(work['lines']) + continue + + marc_alt_reverse = defaultdict(set) + for k, v in marc_alt.items(): + marc_alt_reverse[v].add(k) + + w = ol.get(work['key']) + w['subject_people'] = [] + for p, num in people.iteritems(): + print(' %2d %s' % (num, ' '.join("%s: %s" % (k, v) for k, v in p))) + print(' ', p) + if p in page_marc: + w['subject_people'].append({'key': '/subjects/people/' + page_marc[p]}) + continue + obj = build_person_object(p, marc_alt_reverse.get(p, [])) + key = obj['name'].replace(' ', '_') + full_key = '/subjects/people/' + key + w['subject_people'].append({'key': full_key}) + + if key in pages: + print(key) + pages[key]['marc'].append(p) + continue + + for m in obj['marc']: + page_marc[m] = key + + pages[key] = obj + obj_for_db = obj.copy() + del obj_for_db['marc'] + obj_for_db['key'] = full_key + obj_for_db['type'] = '/type/person' + print(ol.save(full_key.encode('utf-8'), obj_for_db, 'create a new person page')) + + print(w) + print(ol.save(w['key'], w, 'add links to people that this work is about')) + +def from_sample(): + i = 0 + pages = {} + page_marc = {} + for line in open('work_and_marc5'): + i += 1 + w = eval(line) +# print i, w['key'], w['title'] +# print w['lines'] + try: + people, marc_alt = read_people(j[1] for j in w['lines']) + except AssertionError: + print([j[1] for j in w['lines']]) + raise + marc_alt_reverse = defaultdict(set) + for k, v in marc_alt.items(): + marc_alt_reverse[v].add(k) + + for p, num in people.iteritems(): + if p in page_marc: + continue + obj = build_person_object(p, marc_alt_reverse.get(p, [])) + key = obj['name'].replace(' ', '_') + for m in obj['marc']: + page_marc[m] = key + if key in pages: +# print key +# print p +# for m in pages[key]['marc']: +# print m +# print + pages[key]['marc'].append(p) + else: + pages[key] = obj +# pprint(obj) +# continue + if obj['name'][1].isdigit(): + print([j[1] for j in w['lines']]) + pprint(obj) +# assert not obj['name'][1].isdigit() + +from_sample() +#read_works() diff --git a/ia-legacy-importer/people/identify_people.py b/ia-legacy-importer/people/identify_people.py new file mode 100644 index 00000000..7c8126b1 --- /dev/null +++ b/ia-legacy-importer/people/identify_people.py @@ -0,0 +1,997 @@ +from __future__ import print_function +from openlibrary.catalog.marc.cmdline import fmt_subfields +from openlibrary.catalog.marc.fast_parse import get_subfields, get_all_subfields +from openlibrary.catalog.utils import remove_trailing_dot, remove_trailing_number_dot, match_with_bad_chars, pick_first_date +import openlibrary.catalog.utils.authority as authority +from openlibrary.catalog.merge.normalize import normalize 
+from collections import defaultdict +import re + +def strip_death(date): + return date[:date.rfind('-')+1] + +def test_strip_death(): + assert strip_death("1850-1910") == "1850-" + +re_dates = re.compile(' (\d{2,4}-(?:\d{2,4})?)$') +re_dates_in_field = re.compile('^(\d{2,4})-((?:\d{2,4})?)$') +re_dates_in_field_bc = re.compile('^(\d{2,4}) B\.C\.-((?:\d{2,4}) B\.C\.?)$') + +def test_transpose_date(): + assert transpose_date(u'1452') == u'1425' + +def transpose_date(date): + return date[:-2] + date[-1] + date[-2] + +def is_date_transposed(d1, d2): + m1 = re_dates_in_field.match(d1) + m2 = re_dates_in_field.match(d2) + if not m1 and not m2: + m1 = re_dates_in_field_bc.match(d1) + m2 = re_dates_in_field_bc.match(d2) + if not m1 or not m2: + return False + + if m1.group(1) == m2.group(1): + death1 = m1.group(2) + death2 = m2.group(2) + if not death1 or not death2: + return False + return transpose_date(death1) == death2 + if m1.group(2) == m2.group(2): + birth1 = m1.group(1) + birth2 = m2.group(1) + return transpose_date(birth1) == birth2 + return False + +def test_is_date_transposed(): + assert is_date_transposed(u'1452-1485', u'1425-1485') + +def dates_not_close(d1, d2): + m1 = re_dates_in_field.match(d1) + if not m1: + return False + m2 = re_dates_in_field.match(d2) + if not m2: + return False + + birth1 = int(m1.group(1)) + birth2 = int(m2.group(1)) + if abs(birth1 - birth2) >= 10: + return True + + if not m1.group(2) or not m2.group(2): + return False + + death1 = int(m1.group(2)) + death2 = int(m2.group(2)) + return abs(death1 - death2) >= 10 + +def test_dates_not_close(): + assert dates_not_close('1825-1899', '1804-1849') + assert not dates_not_close(u'1907-2003', u'1909-') + assert not dates_not_close('1825-1899', '1826-1898') + +def combinations(iterable, r): + # combinations('ABCD', 2) --> AB AC AD BC BD CD + # combinations(range(4), 3) --> 012 013 023 123 + pool = tuple(iterable) + n = len(pool) + if r > n: + return + indices = range(r) + yield tuple(pool[i] for i in indices) + while True: + for i in reversed(range(r)): + if indices[i] != i + n - r: + break + else: + return + indices[i] += 1 + for j in range(i+1, r): + indices[j] = indices[j-1] + 1 + yield tuple(pool[i] for i in indices) + +def tidy_subfield(v): + return remove_trailing_dot(v.strip(' /,;:')) + +re_bc_date = re.compile('(\d+)B\.C\.') + +def clean_subfield(k, v): + if k in 'abc': + v = tidy_subfield(v) + elif k == 'd': + v = remove_trailing_number_dot(v.strip(' ,')) + v = re_bc_date.sub(lambda m: m.group(1) + " B.C.", v) + return (k, v) + +def has_subtag(a, subfields): + return any(k==a for k, v in subfields) + +def question_date(p1, p2): + marc_date1 = tuple(v for k, v in p1 if k =='d') + if not marc_date1: + return + marc_date2 = tuple(v for k, v in p2 if k =='d') + if not marc_date1 or not marc_date2 or marc_date1 == marc_date2: + return + + assert len(marc_date1) == 1 and len(marc_date2) == 1 + + name1 = tuple((k, v) for k, v in p1 if k !='d') + name2 = tuple((k, v) for k, v in p2 if k !='d') + if name1 != name2: + return + + marc_date1 = marc_date1[0] + question1 = '?' in marc_date1 + + marc_date2 = marc_date2[0] + question2 = '?' 
in marc_date2 + + if (not question1 and not question2) or (question1 and question2): + return # xor + + if marc_date1.replace('?', '') != marc_date2.replace('?', ''): + return + return 1 if question1 else 2 + +def get_marc_date(p): + marc_date = tuple(v for k, v in p if k =='d') + if not marc_date: + return + assert len(marc_date) == 1 + return marc_date[0].strip() + +def build_by_name(found): + by_name = defaultdict(set) + for p in found: + if has_subtag('d', p): + without_date = tuple(i for i in p if i[0] != 'd') + by_name[without_date].add(p) + + return by_name + +def build_name_and_birth(found): + # one author missing death date + name_and_birth = defaultdict(set) + for p in found: + d = get_marc_date(p) + if not d or d[-1] == '-' or '-' not in d: + continue + without_death = tuple((k, (v if k!='d' else strip_death(v))) for k, v in p) +# assert without_death not in name_and_birth + name_and_birth[without_death].add(p) + return name_and_birth + +def authority_lookup(to_check, found, marc_alt): + found_matches = False + for person_key, match in to_check.items(): + if len(match) == 1: + continue + if len(match) == 2: + d1, d2 = [get_marc_date(p) for p in match] + if dates_not_close(d1, d2) and not is_date_transposed(d1, d2): + continue + + name = ' '.join(v.strip() for k, v in person_key if k != 'd') + search_results = authority.search(name) + match_dates = dict((get_marc_date(p), p) for p in match) + norm_name = normalize(name) + authority_match = None + for i in search_results: + if i['type'] != 'personal name' or i['a'] == 'reference': + continue + if norm_name not in normalize(i['heading']): + continue + for d, p in match_dates.items(): + if i['heading'].endswith(d): + if authority_match: # more than one match + print('dups:', match_dates.items()) + authority_match = None + break + authority_match = p + if authority_match: + for p in match: + if p == authority_match: + continue + found[authority_match] += found.pop(p) + marc_alt[p] = authority_match + found_matches = True + return found_matches + +def subtag_should_be_c(found, marc_alt): + merge = [] + for p1, p2 in combinations(found, 2): + if len(p1) != len(p2): + continue + + subtag1 = [k for k, v in p1 if k in 'abcdq'] + subtag2 = [k for k, v in p2 if k in 'abcdq'] + + if subtag1 == subtag2: + continue + + def no_question_if_d(p): + return [v.replace('?', '') if k == 'd' else tidy_subfield(v) for k, v in p] + if no_question_if_d(p1) != no_question_if_d(p2): + continue + + for i in range(len(subtag1)): + if subtag1[i] == subtag2[i]: + continue + assert len(p1[i][1]) >= 5 + + if subtag1[i] == 'c': + assert subtag2[i] in 'bq' + merge.append((p1, p2)) + else: + assert subtag1[i] in 'bq' and subtag2[i] == 'c' + merge.append((p2, p1)) + break + + for a, b in merge: + if b not in found: + continue + found[a] += found.pop(b) + marc_alt[b] = a + +def merge_question_date(found, marc_alt): + merge = [] + for p1, p2 in combinations(found, 2): + primary = question_date(p1, p2) + if primary is None: + continue + if primary == 1: + merge.append((p1, p2)) + else: + assert primary == 2 + merge.append((p2, p1)) + + for a, b in merge: + found[a] += found.pop(b) + marc_alt[b] = a + +re_bad_marc = re.compile(' ?\$ ?[a-z] ') +def remove_bad_marc_subtag(s): + s = re_bad_marc.sub(' ', s) + return s + +def test_remove_bad_marc_subtag(): + assert remove_bad_marc_subtag('John, $ c King of England') == 'John, King of England' + + +def just_abcdq(p): + return tuple((k, v) for k, v in p if k in 'abcdq') + +def similar_dates(found, marc_alt): + # 1516 == d. 
1516 + merge = [] + for p1, p2 in combinations(found, 2): + subtag1 = [k for k, v in p1] + subtag2 = [k for k, v in p2] + if subtag1 != subtag2: + continue + if [(k, v) for k, v in p1 if k != 'd'] != [(k, v) for k, v in p2 if k != 'd']: + continue + d1 = [v for k, v in p1 if k == 'd'] + d2 = [v for k, v in p2 if k == 'd'] + if d1 == d2: + continue + assert len(d1) == 1 and len(d2) == 1 + d1, d2 = d1[0], d2[0] + if d1 == 'd. ' + d2: + merge.append((p1, p2)) + continue + if d2 == 'd. ' + d1: + merge.append((p2, p1)) + continue + + for a, b in merge: + if b not in found: + continue + found[a] += found.pop(b) + marc_alt[b] = a + +re_simple_date = re.compile('^(\d+)-(\d+)?\.?$') + +def fix_bad_subtags(found, marc_alt): + just_values = defaultdict(lambda:defaultdict(int)) + for p, num in found.items(): + just_values[tuple(v.strip(',') for k, v in p)][p] += num + + for a, b in just_values.items(): + if len(b) == 1: + continue + b = sorted(b.items(), key=lambda i:i[1]) + if b[-1][1] == b[-2][1]: + continue + new = b.pop()[0] + for i, j in b: + found[new] += found.pop(i) + marc_alt[i] = new + +def wrong_subtag_on_date(found, marc_alt): + for p in found.keys(): + found_simple_date = False + for k, v in p: + if k != 'd' and re_simple_date.match(v): + found_simple_date = True + break + if not found_simple_date: + continue + new = tuple((('d' if k !='d' and re_simple_date.match(v) else k), v) for k, v in p) + if new in found: + found[new] += found.pop(p) + marc_alt[p] = new + +def missing_subtag(found, marc_alt): + merge = defaultdict(set) + for p1, p2 in combinations(found, 2): + subtag1 = [k for k, v in p1 if k in 'abcdq'] + subtag2 = [k for k, v in p2 if k in 'abcdq'] + + if subtag1 == subtag2: + continue + + name1 = ' '.join(v.strip() for k, v in p1) + name2 = ' '.join(v.strip() for k, v in p2) + + if not match_with_bad_chars(name1, name2) \ + and normalize(name1) != normalize(name2) \ + and normalize(remove_bad_marc_subtag(name1)) != normalize(remove_bad_marc_subtag(name2)) \ + and normalize(name1.lower().replace(' the', '')) != normalize(name2.lower().replace(' the', '')): + continue + + if len(subtag1) > len(subtag2): + merge[p2].add(just_abcdq(p1)) + else: + merge[p1].add(just_abcdq(p2)) + + def flat_len(p): + return len(' '.join(v for k, v in p)) + + for old, new in merge.items(): + by_size = sorted((flat_len(p), p) for p in new) + if len(by_size) > 1: + assert by_size[-1][0] > by_size[-2][0] + new_marc = by_size[-1][1] + + found[new_marc] += found.pop(old) + marc_alt[old] = new_marc + +def date_field_missing(p): + if has_subtag('d', p): + return p + assert has_subtag('a', p) + for k, v in p: + if k == 'a': + a = v + break + m = re_dates.search(a) + if not m: + return p + d = m.group(1) + a = tidy_subfield(a[:m.start()]) + new = [] + for k, v in p: + if k == 'a' and a: + new.append(('a', a)) + a = None + continue + if k not in ('b', 'c') and d: + new.append(('d', d)) + d = None + new.append((k, v)) + if d: + new.append(('d', d)) + return tuple(new) + +def bad_char_name_match(found, marc_alt): + merge = [] + for p1, p2 in combinations(found, 2): + if p1 == p2: + continue + if get_marc_date(p1) != get_marc_date(p2): + continue + p1, p2 = sorted([p1, p2], key=lambda i:found[i]) + if found[p1] != found[p2]: + name1 = ' '.join(v for k, v in p1 if k in 'abc') + name2 = ' '.join(v for k, v in p2 if k in 'abc') + if match_with_bad_chars(name1, name2): + found[p2] += found.pop(p1) + marc_alt[p1] = p2 + + for a, b in merge: + if b not in found: + continue + found[a] += found.pop(b) + marc_alt[b] = a + 
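+
+# Shared convention for the merge helpers above and for read_people() below
+# (a summarizing comment, not part of the original code): `found` maps a
+# person heading -- a tuple of (subtag, value) pairs such as
+# (('a', u'Frisch, Max'), ('d', u'1911-1991')) -- to the number of times that
+# heading was seen, and `marc_alt` maps every heading that gets merged away to
+# the canonical heading it was folded into.  Each helper mutates both dicts in
+# place, so occurrence counts accumulate on the surviving heading.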
+def check_for_dup_a(p): + for a1, a2 in combinations((v for k, v in p if k == 'a'), 2): + assert a1 != a2 + +def read_people(people): + found = defaultdict(int) + marc_alt = {} + people = list(people) + + for lines in people: + for line in lines: + p = tuple(clean_subfield(k, v) for k, v in get_all_subfields(line)) + #check_for_dup_a(p) + found[date_field_missing(p)]+=1 + + for p in found.keys(): + c = None + for k, v in p: + if k == 'c': + c = v + break + if not c or c.lower() != 'family': + continue + new = tuple((k, v + ' family' if k == 'a' else v) for k, v in p if k != 'c') + if new in found: + found[new] += found.pop(p) + marc_alt[p] = new + + fix_bad_subtags(found, marc_alt) + + wrong_subtag_on_date(found, marc_alt) + + try: + missing_subtag(found, marc_alt) + except AssertionError: + print(people) + raise + + found_name = defaultdict(int) + for p, num in found.items(): + found_name[just_abcdq(p)] += num + found = found_name + + assert found + + if len(found) == 1: + return dict(found), marc_alt + + #for func in subtag_should_be_c, merge_question_date: + #for func in subtag_should_be_c, merge_question_date, missing_subtag, bad_char_name_match: + for func in subtag_should_be_c, merge_question_date, bad_char_name_match, similar_dates: + func(found, marc_alt) + + if len(found) == 1: + return dict(found), marc_alt + + assert found + + # one author missing death date + name_and_birth = build_name_and_birth(found) + + assert found + + try: + if authority_lookup(name_and_birth, found, marc_alt): + if len(found) == 1: + return dict(found), marc_alt + + name_and_birth = build_name_and_birth(found) + except AssertionError: + print(people) + raise + + assert found + + for p, num in found.items(): + if p not in name_and_birth: + continue + assert len(name_and_birth[p]) == 1 + new_name = list(name_and_birth[p])[0] + found[new_name] += found.pop(p) + marc_alt[p] = new_name + + assert found + + if len(found) == 1: + return dict(found), marc_alt + + # match up authors with the same name + # where one has dates and the other doesn't + by_name = build_by_name(found) + + try: + if authority_lookup(by_name, found, marc_alt): + if len(found) == 1: + return dict(found), marc_alt + by_name = build_by_name(found) # rebuild + except AssertionError: + print(people) + raise + + for p, num in found.items(): + if p not in by_name: + continue + if len(by_name[p]) != 1: + for i in by_name[p]: + print(i) + print(people) + assert len(by_name[p]) == 1 + new_name = list(by_name[p])[0] + found[new_name] += found.pop(p) + marc_alt[p] = new_name + assert found + + if len(found) == 1: + return dict(found), marc_alt + + by_date = defaultdict(set) + for p in found: + if not has_subtag('d', p): + continue + d = tuple(v for k, v in p if k=='d') + by_date[d].add(p) +# for k, v in by_date.iteritems(): +# print len(v), k, v + + return dict(found), marc_alt + +def read_files(): + read_file('work_and_marc2') + read_file('work_and_marc3') + +def read_file(filename): + for file_line in open(filename): + w = eval(file_line) + if len(w['lines']) == 1: + continue + lines = [i[1] for i in w['lines']] + print(w['key'], w['title']) + print(lines) + people, marc_alt = read_people(lines) +# for p, num in people.iteritems(): +# if any(k=='d' for k, v in people): +# continue + for p, num in people.iteritems(): + print(' %2d %s' % (num, ' '.join("%s: %s" % (k, v) for k, v in p))) + print(' ', p) + print() +#read_file() + +def test_accents(): + lines = [ + ['00\x1faB\xe5adar\xe5aya\xf2na.\x1ftBrahmas\xe5utra.\x1e'], + 
['00\x1faB\xe5adar\xe5aya\xf2na.\x1ftBrahmas\xe5utra.\x1e'], + ['00\x1faB\xe5adar\xe5aya\xf2na.\x1ftBrahmas\xe5utra.\x1e'], + ['00\x1faB\xe5adar\xe5aya\xf2na.\x1ftBrahmas\xe5utra.\x1e'], + ['00\x1faB\xe5adar\xe5aya\xf2na.\x1ftBrahmas\xe5utra.\x1e'], + ['00\x1faB\xe5adar\xe5ayana.\x1ftBrahmas\xe5utra.\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', u'B\u0101dar\u0101ya\u1e47a'),): 6} + assert b == { (('a', u'B\u0101dar\u0101yana'),): (('a', u'B\u0101dar\u0101ya\u1e47a'),)} + +def test_same_name_one_date_missing(): + lines = [ + ['10\x1faAbedin, Zainul\x1fxCriticism and interpretation.\x1e'], + ['10\x1faAbedin, Zainul,\x1fd1914-1976\x1fxCriticism and interpretation.\x1e'], + + ['10\x1faAbedin, Zainul\x1fxCriticism and interpretation.\x1e'], + ['10\x1faAbedin, Zainul,\x1fd1914-1976\x1fxCriticism and interpretation.\x1e'] + ] + a, b = read_people(lines) + + assert a == {(('a', u'Abedin, Zainul'), ('d', u'1914-1976')): 4} + assert b == {(('a', u'Abedin, Zainul'),): (('a', u'Abedin, Zainul'), ('d', u'1914-1976'))} + +def test_matching_name_missing_death(): + lines = [ + ['10\x1faFrisch, Max,\x1fd1911-1991\x1e'], + ['10\x1faFrisch, Max,\x1fd1911-\x1e'], + ['10\x1faFrisch, Max,\x1fd1911-\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', u'Frisch, Max'), ('d', u'1911-1991')): 3} + assert b == {(('a', u'Frisch, Max'), ('d', u'1911-')): (('a', u'Frisch, Max'), ('d', u'1911-1991'))} + +def test_matching_dates(): + lines = [ + ['00\x1faMichelangelo Buonarroti,\x1fd1475-1564.\x1e'], + ['00\x1faMichelangelo Buonarroti,\x1fd1475-1564.\x1e'], + ['16\x1faBuonarroti, Michel Angelo,\x1fd1475-1564.\x1e'] + ] + a, b = read_people(lines) + +def test_harold_osman_kelly(): + lines = [ + ['10\x1faKelly, Harold Osman,\x1fd1884-1955.\x1e'], + ['10\x1faKelly, Harold Osman,\x1fd1884-1956.\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', u'Kelly, Harold Osman'), ('d', u'1884-1955')): 2} + assert b == {(('a', u'Kelly, Harold Osman'), ('d', u'1884-1956')): (('a', u'Kelly, Harold Osman'), ('d', u'1884-1955'))} + +def test_question_date(): + lines = [ + ['10\x1faBurke, Edmund,\x1fd1729?-1797.\x1ftReflections on the revolution in France.\x1e', '10\x1faCalonne,\x1fcM. de\x1fq(Charles Alexandre de),\x1fd1734-1802.\x1e'], + ['10\x1faBurke, Edmund,\x1fd1729-1797.\x1ftReflections on the Revolution in France.\x1e'] + ] + a, b = read_people(lines) + assert a == { + (('a', u'Burke, Edmund'), ('d', u'1729?-1797')): 2, + (('a', u'Calonne'), ('c', u'M. 
de'), ('q', u'(Charles Alexandre de),'), ('d', u'1734-1802')): 1 + } + + assert b == { + (('a', u'Burke, Edmund'), ('d', u'1729-1797')): (('a', u'Burke, Edmund'), ('d', u'1729?-1797')) + } + + +def test_pope_sixtus(): + lines = [ + ['00\x1faSixtus\x1fbV,\x1fcPope,\x1fd1521-1590.\x1e'], + ['04\x1faSixtus\x1fbV,\x1fcPope.\x1e'], + ['00\x1faSixtus\x1fbV,\x1fcPope,\x1fd1520-1590.\x1e'] + ] + a, b = read_people(lines) + assert a == { + ((u'a', u'Sixtus'), (u'b', u'V'), (u'c', u'Pope'), (u'd', u'1520-1590')): 3 + } + + assert b == { + (('a', u'Sixtus'), ('b', u'V'), ('c', u'Pope')): (('a', u'Sixtus'), ('b', u'V'), ('c', u'Pope'), ('d', u'1520-1590')), + (('a', u'Sixtus'), ('b', u'V'), ('c', u'Pope'), ('d', u'1521-1590')): (('a', u'Sixtus'), ('b', u'V'), ('c', u'Pope'), ('d', u'1520-1590')) + } + +def test_william_the_conqueror(): + lines = [ + ['00\x1faWilliam\x1fbI,\x1fcKing of England,\x1fd1027 or 8-1087.\x1e'], ['04\x1faWilliam\x1fbI,\x1fcKing of England,\x1fd1027?-1087.\x1e'], + ['00\x1faWilliam\x1fbI,\x1fcKing of England,\x1fd1027 or 8-1087.\x1e'], ['00\x1faWilliam\x1fbI,\x1fcKing of England,\x1fd1027 or 8-1087\x1e'], + ['00\x1faWilliam\x1fbI,\x1fcKing of England,\x1fd1027 or 8-1087.\x1e'], ['00\x1faWilliam\x1fbI,\x1fcKing of England,\x1fd1027 or 8-1087.\x1e'] + ] + a, b = read_people(lines) + + assert a == {(('a', u'William'), ('b', u'I'), ('c', u'King of England'), ('d', u'1027 or 8-1087')): 6} + assert b == {(('a', u'William'), ('b', u'I'), ('c', u'King of England'), ('d', u'1027?-1087')): (('a', u'William'), ('b', u'I'), ('c', u'King of England'), ('d', u'1027 or 8-1087'))} + +def test_missing_d(): + lines = [ + [' 0\x1faDickens, Charles, 1812-1870\x1fxManuscripts\x1fxFacsimiles.\x1e'], + ['10\x1faDickens, Charles,\x1fd1812-1870\x1fxManuscripts\x1fxFacsimiles.\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', u'Dickens, Charles'), ('d', u'1812-1870')): 2} + #assert b == {(('a', u'Dickens, Charles, 1812-1870'),): (('a', u'Dickens, Charles'), ('d', u'1812-1870'))} + assert b == {} + +def test_missing_c(): + return # skip for now + lines = [ + ['00\x1faMuhammad Quli Qutb Shah,\x1fcSultan of Golkunda,\x1fd1565-1612.\x1e'], + ['00\x1faMuhammad Quli Qutb Shah,\x1fcSultan of Golkunda,\x1fd1565-1612.\x1e'], + ['10\x1faMuhammad Quli Qutb Shah, Sultan of Golconda,\x1fd1565-1612\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', u'Muhammad Quli Qutb Shah'), ('c', u'Sultan of Golkunda'), ('d', u'1565-1612')): 3} + +def test_same_len_subtag(): + lines = [ + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1e', '10\x1faShakespeare, William,\x1fd1564-1616\x1fxStage history\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e', '10\x1faShakespeare, William,\x1fd1564-1616\x1fxStage history.\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fxDrama.\x1e', '10\x1faShakespeare, William,\x1fd1564-1616\x1fxStage history.\x1e'] + ] + a, b = read_people(lines) + assert a == { + (('a', u'John'), ('c', u'King of England'), ('d', u'1167-1216')): 3, + (('a', u'Shakespeare, William'), ('d', u'1564-1616')): 3 + } + +def test_king_john(): + lines = [ + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama\x1e', '10\x1faKean, Charles John,\x1fd1811?-1868\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fxDrama.\x1e'], + ['00\x1faJohn\x1fbKing 
of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fvDrama.\x1e', '00\x1faHenry\x1fbVIII,\x1fcKing of England,\x1fd1491-1547\x1fvDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['14\x1faShakespeare, William,\x1fd1564-1616.\x1ftKing John.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e', '10\x1faShakespeare, William,\x1fd1564-1616.\x1ftKing John.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fvDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama\x1e', '00\x1faHenry\x1fbVIII,\x1fcKing of England,\x1fd1491-1547\x1fxDrama\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167?-1216\x1fxDrama.\x1e'], + ['00\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e', '10\x1faShakespeare, William,\x1fd1564-1616.\x1fxKing John\x1fxProblems, exercises, etc.\x1e', '01\x1faJohn,\x1fcKing of England,\x1fd1167-1216\x1fxDrama.\x1e'], + ['00\x1faJohn, $ c King of England,\x1fd1167-1216\x1fxDrama\x1e'], + ['00\x1faJohn, $ c King of England,\x1fd1167-1216\x1fxDrama\x1e'], + ['00\x1faJohn\x1fbKing of England,\x1fd1167-1216\x1fxDrama.\x1e'] + ] + a, b = read_people(lines) + assert a == { + (('a', u'Shakespeare, William'), ('d', u'1564-1616')): 3, + (('a', u'Kean, Charles John'), ('d', u'1811?-1868')): 1, + (('a', u'John'), ('c', u'King of England'), ('d', u'1167?-1216')): 35, + (('a', u'Henry'), ('b', u'VIII'),('c', u'King of England'), ('d', u'1491-1547')): 2 + } + +def test_non_ascii(): + lines = [ + ['00\x1faA\xe2soka,\x1fcKing of Magadha,\x1fdfl. 259 B.C.\x1e'], + ['00\x1faA{acute}soka,\x1fcKing of Magadha\x1fdfl. 259 B.C.\x1e'], + ['00\x1faAsoka,\x1fcKing of Magadha,\x1fdfl. 259 B.C..\x1e', '30\x1faMaurya family.\x1e'], + ['04\x1faAs\xcc\x81oka,\x1fcKing of Magadha,\x1fdca. 274-232 B.C.\x1e'], + ['00\x1faA\xe2soka,\x1fcKing of Magadha,\x1fdfl. 259 B.C.\x1e', '30\x1faMaurya dynasty.\x1e'], + ['04\x1faAsoka,\x1fcKing of Magadha,\x1fdca. 274-232 B.C.\x1e'], + ['00\x1faA\xe2soka,\x1fcKing of Magadha,\x1fdfl. 259 B.C\x1e', '30\x1faMaurya dynasty\x1e'], + ['00\x1faAs\xcc\x81oka,\x1fcKing of Magadha,\x1fdfl. 
259 B.C.\x1e', '30\x1faMaurya family.\x1e'] + ] + a, b = read_people(lines) + print(a) + +def test_q_should_be_c(): + lines = [ + ['10\x1faLafayette, Marie Joseph Paul Yves Roch Gilbert Du Motier,\x1fcmarquis de,\x1fd1757-1834\x1fxTravel\x1fzNew York (State)\x1fzNew York.\x1e'], + ['10\x1faLafayette, Marie Joseph Paul Yves Roch Gilbert Du Motier,\x1fcmarquis de,\x1fd1757-1834\x1fxTravel\x1fzNew York (State)\x1fzNew York.\x1e'], + ['10\x1faLafayette, Marie Joseph Paul Yves Roch Gilbert Du Motier,\x1fqmarquis de,\x1fd1757-1834.\x1e'] + ] + a, b = read_people(lines) + +def test_date_in_a(): + lines = [ + ['10\x1faMachiavelli, Niccol\xe1o,\x1fd1469-1527\x1fxFiction.\x1e', '10\x1faBorgia, Cesare,\x1fd1476?-1507\x1fxFiction.\x1e'], + [' 0\x1faBorgia, Cesare, 1476?-1507\x1fxFiction.\x1e', ' 0\x1faMachiavelli, Niccolo, 1469-1527\x1fxFiction.\x1e'], + ['10\x1faMachiavelli, Niccol\xe1o,\x1fd1469-1527\x1fxFiction.\x1e', '10\x1faBorgia, Cesare,\x1fd1476?-1507\x1fxFiction.\x1e'], + ['10\x1faMachiavelli, Niccol\xe1o,\x1fd1469-1527\x1fxFiction.\x1e', '10\x1faBorgia, Cesare,\x1fd1476?-1507\x1fxFiction.\x1e'], ['10\x1faMachiavelli, Niccol\xe1o,\x1fd1469-1527\x1fxFiction.\x1e', '10\x1faBorgia, Cesare,\x1fd1476?-1507\x1fxFiction.\x1e'], ['10\x1faMachiavelli, Niccol\xe1o,\x1fd1469-1527\x1fxFiction\x1e', '10\x1faBorgia, Cesare,\x1fd1476?-1507\x1fxFiction\x1e'], + ['10\x1faMachiavelli, Niccol\xe1o,\x1fd1469-1527\x1fxFiction.\x1e', '10\x1faBorgia, Cesare,\x1fd1476?-1507\x1fxFiction.\x1e'] + ] + a, b = read_people(lines) + print(a) + assert a == {(('a', u'Borgia, Cesare'), ('d', u'1476?-1507')): 7, (('a', u'Machiavelli, Niccol\xf2'), ('d', u'1469-1527')): 7} + +def test_king_asoka(): + return + lines = [ + ['00\x1faA\xe2soka,\x1fcKing of Magadha,\x1fdfl. 259 B.C.\x1e'], + ['00\x1faA{acute}soka,\x1fcKing of Magadha\x1fdfl. 259 B.C.\x1e'], + ['00\x1faAsoka,\x1fcKing of Magadha,\x1fdfl. 259 B.C..\x1e', '30\x1faMaurya family.\x1e'], + ['04\x1faAs\xcc\x81oka,\x1fcKing of Magadha,\x1fdca. 274-232 B.C.\x1e'], + ['00\x1faA\xe2soka,\x1fcKing of Magadha,\x1fdfl. 259 B.C.\x1e', '30\x1faMaurya dynasty.\x1e'], + ['04\x1faAsoka,\x1fcKing of Magadha,\x1fdca. 274-232 B.C.\x1e'], + ['00\x1faA\xe2soka,\x1fcKing of Magadha,\x1fdfl. 259 B.C\x1e', '30\x1faMaurya dynasty\x1e'], + ['00\x1faAs\xcc\x81oka,\x1fcKing of Magadha,\x1fdfl. 259 B.C.\x1e', '30\x1faMaurya family.\x1e'] + ] + a, b = read_people(lines) + print(a) + # (('a', u'Asoka'), ('c', u'King of Magadha'), ('d', u'fl. 259 B.C..')): 1 + assert a == { + (('a', u'A\u015boka'), ('c', u'King of Magadha'), ('d', u'fl. 259 B.C.')): 7, + (('a', u'Maurya dynasty'),): 2, + (('a', u'Maurya family'),): 2, + (('a', u'Asoka'), ('c', u'King of Magadha'), ('d', u'ca. 274-232 B.C.')): 1 + } + +def test_name_lookup(): + lines = [ + ['10\x1faBellini, Giovanni,\x1fd1516.\x1e'], + ['10\x1faBellini, Giovanni,\x1fdd. 1516\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', 'Bellini, Giovanni'), ('d', 'd. 1516')): 2} + assert b == {((u'a', u'Bellini, Giovanni'), (u'd', u'1516')): ((u'a', u'Bellini, Giovanni'), (u'd', u'd. 1516'))} + +def test_cleopatra(): + return + lines = [ + ['00\x1faCleopatra,\x1fcQueen of Egypt,\x1fdd. 30 B.C\x1fxFiction.\x1e'], + ['00\x1faCleopatra,\x1fcQueen of Egypt,\x1fdd. 30 B.C.\x1fxFiction\x1e'], + [' 0\x1faCleopatra, Queen of Egypt, d. 30 B.C.\x1fxFiction.\x1e'], + ['00\x1faCleopatra,\x1fcQueen of Egypt,\x1fdd. 30 B.C.\x1fxFiction\x1e'], + ['00\x1faCleopatra,\x1fcqueen of Egypt,\x1fdd. B.C. 30\x1fxFiction\x1e'], + ['00\x1faCleopatra,\x1fcQueen of Egypt,\x1fdd. 
30 B.C.\x1fxFiction\x1e'], + ['00\x1faCleopatra,\x1fcQueen of Egypt,\x1fdd. 30 B.C.\x1fvFiction.\x1e'], + ['00\x1faCleopatra,\x1fcQueen of Egypt,\x1fdd. 30 B.C.\x1fxFiction.\x1e'] + ] + a, b = read_people(lines) + assert a == { + (('a', u'Cleopatra'), ('c', u'Queen of Egypt'), ('d', u'd. 30 B.C.')): 8, + } + +def test_date_field_missing(): + lines = [[' 0\x1faMoore, Henry Spencer, 1898-\x1e']] + a, b = read_people(lines) + assert a == { + (('a', u'Moore, Henry Spencer'), ('d', u'1898-')): 1 + } + assert b == {} + +def test_numbers_in_name(): + lines = [ + [' 0\x1faFielding, Henry, 1707-1754. The history of the adventures of Joseph Andrews.\x1e'], + ['14\x1faFielding, Henry,\x1fd1707-1754.\x1ftJoseph Andrews.\x1e'], + ['10\x1faFielding, Henry,\x1fd1707-1754.\x1ftHistory of the adventures of Joseph Andrews.\x1e'], + ['14\x1faFielding, Henry,\x1fd1707-1754.\x1ftJoseph Andrews.\x1e'], + ['10\x1faFielding, Henry,\x1fd1707-1754.\x1ftHistory of the adventures of Joseph Andrews.\x1e'], + ['10\x1faFielding, Henry,\x1fd1707-1754.\x1ftHistory of the adventures of Joseph Andrews.\x1e'], + ['10\x1faFielding, Henry,\x1fd1707-1754.\x1ftHistory of the adventures of Joseph Andrews\x1e'] + ] + a, b = read_people(lines) + assert a == { + (('a', u'Fielding, Henry'), ('d', u'1707-1754')): 7 + } + +def test_caesar(): + lines = [ + ['10\x1faCaesar, Julius.\x1e'], + ['14\x1faCaesar, Julius,\x1fd100 B.C.-44B.C.\x1e'], + ['14\x1faCaesar, Julius,\x1fd100 B.C.-44 B.C.\x1e'], + ['10\x1faCaesar, Julius\x1e'], + ['14\x1faCaesar, Julius,\x1fd100 B.C.-44 B.C.\x1e'] + ] + a, b = read_people(lines) + assert a == {(('a', 'Caesar, Julius'), ('d', '100 B.C.-44 B.C.')): 5} + +def test_salvador_dali(): + lines = [ + ['14\x1faDali\xcc\x81, Salvador,\x1fd1904-1989\x1fvCatalogs.\x1e'], + ['10\x1faDali, Salvador,\x1fd1904-\x1e'], + ['10\x1faDal\xe2i, Salvador,\x1fd1904-\x1e'], + ['10\x1faDal\xe2i, Salvador,\x1fd1904-\x1e'], + ['10\x1faDal\xe2i, Salvador,\x1fd1904-\x1fxCatalogs.\x1e', '10\x1faMorse, Albert Reynolds,\x1fd1914-\x1fxArt collections\x1fxCatalogs.\x1e'], + ['10\x1faDal\xe2i, Salvador\x1fy1904-\x1e'], + ['14\x1faDali\xcc\x81, Salvador,\x1fd1904- \x1fvexhibitions.\x1e'], + ['14\x1faDali\xcc\x81, Salvador,\x1fd1904- \x1fvexhibitions.\x1e'] + ] + a, b = read_people(lines) + assert a == { + ((u'a', u'Dal\xed, Salvador'), (u'd', u'1904-1989')): 8, + ((u'a', u'Morse, Albert Reynolds'), (u'd', u'1914-')): 1 + } + +def test_date_in_y(): + lines = [ + ['10\x1faShakespeare, William,\x1fd1564-1616\x1fxStage history\x1fy1800-1950.\x1e'], + ['10\x1faShakespeare, William,\x1fd1564-1616\x1fxStage history\x1fy1800-\x1e'], + ['10\x1faShakespeare, William,\x1fd1564-1616.\x1e'], + ['10\x1faShakespeare, William,\x1fd1564-1616\x1fxDramatic production\x1e'] + ] + a, b = read_people(lines) + assert a == {((u'a', u'Shakespeare, William'), (u'd', u'1564-1616')): 4} + +def test_subtags_swapped(): + lines = [ + ['20\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969.\x1e'], + ['10\x1faCompton-Burnett, I.\x1fd(Ivy),\x1fq1884-1969\x1fxCriticism and interpretation.\x1e'], + ['20\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969.\x1e'], + ['14\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969.\x1e'], + ['20\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969\x1fxCriticism and interpretation.\x1e'], + ['20\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969\x1fxCriticism and interpretation.\x1e'], + ['20\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969\x1fxCriticism and interpretation.\x1e'], + ['14\x1faCompton-Burnett, I.\x1fq(Ivy),\x1fd1884-1969.\x1e'] + ] + a, 
b = read_people(lines) + assert a == {((u'a', u'Compton-Burnett, I.'), (u'q', u'(Ivy),'), (u'd', u'1884-1969')): 8} + +def test(): + lines = [ + [ + '10\x1faWashington, George,\x1fd1732-1799\x1fxFamily\x1fvJuvenile fiction.\x1e', + '10\x1faJudge, Oney\x1fvJuvenile fiction.\x1e', + '11\x1faWashington, George,\x1fd1732-1799\x1fxFamily\x1fxFiction.\x1e', + '11\x1faJudge, Oney\x1fvFiction.\x1e' + ], [ + '10\x1faJudge, Oney\x1fvJuvenile fiction.\x1e', + '10\x1faWashington, George,\x1fd1732-1799\x1fxFamily\x1fvJuvenile fiction.\x1e', + '11\x1faJudge, Oney\x1fvFiction.\x1e', + '11\x1faWashington, George,\x1fc1732-1799\x1fxFamily\x1fvFiction.\x1e' + ] + ] + + a, b = read_people(lines) + assert a == {((u'a', u'Judge, Oney'),): 4, ((u'a', u'Washington, George'), (u'd', u'1732-1799')): 4} + +# lines = [ +# [' 0\x1faHadrian, Emperor of Rome, 76-138\x1fxFiction.\x1e'], +# ['00\x1faHadrianus,\x1fcEmperor of Rome,\x1fd76-138\x1fxFiction.\x1e'] +# ] +# lines = [[' 0\x1faGreene, Graham, 1904- . The basement room. 1971.\x1e']] + +# lines = [ +# [' 0\x1faGyllembourg-Ehrensvard, Thomasine Christine Buntzen, 1773-1856. To tidsaldre.\x1e'], +# ['10\x1faGyllembourg, Thomasine,\x1fd1773-1856.\x1ftTo tidsaldre.\x1e'], +# ['14\x1faGyllembourg-Ehrensva\xcc\x88rd, Thomasine.\x1ftTo tidsaldre.\x1e'] +# ] + +# lines = [['10\x1faClifford, Henry de Clifford, 14th lord,\x1fd1455?-1523\x1fxFiction.\x1e',]] + +def test_same_name_different_dates(): + lines = [ + ['10\x1faStrauss, Johann,\x1fd1825-1899.\x1e', '10\x1faStrauss, Johann,\x1fd1804-1849.\x1e'], + ['10\x1faStrauss, Johann,\x1fd1825-1899.\x1e', '10\x1faStrauss, Johann,\x1fd1804-1849.\x1e'], + ['10\x1faStrauss, Johann,\x1fd1825-1899.\x1e', '10\x1faStrauss, Johann,\x1fd1804-1849.\x1e'] + ] + a, b = read_people(lines) + assert a == { + ((u'a', u'Strauss, Johann'), (u'd', u'1804-1849')): 3, + ((u'a', u'Strauss, Johann'), (u'd', u'1825-1899')): 3 + } + +def test_king_richard_iii(): + lines = [ + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1425-1485\x1fvDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'], + ['10\x1faShakespeare, William,\x1fd1564-1616.\x1ftKing Richard III.\x1e', '00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fxDrama\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fxDrama\x1e'], + ['04\x1faRichard\x1fbIII,\x1fcKing of England\x1fxDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fxDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fxDrama\x1e'], + ['10\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fxDrama.\x1e'], + ['00\x1faRichard\x1fbIII,\x1fcKing of England,\x1fd1452-1485\x1fvDrama.\x1e'] + ] + a, b = read_people(lines) + assert a == { + ((u'a', u'Richard'), (u'b', u'III'), (u'c', u'King of England'), (u'd', u'1452-1485')): 14, + ((u'a', u'Shakespeare, William'), (u'd', u'1564-1616')): 1 + } + +def test_churchill_family(): + lines = [ + ['30\x1faChurchill family\x1e'], + ['30\x1faChurchill family.\x1e'], + ['34\x1faChurchill,\x1fcfamily.\x1e'], + ['30\x1faChurchill family\x1e'], + 
['34\x1faChurchill,\x1fcFamily.\x1e'] + ] + a, b = read_people(lines) + assert a == { (('a', 'Churchill family'),): 5} + +def test_william_thompson(): + lines = [ + ['10\x1faHodgskin, Thomas,\x1fd1787-1869.\x1e', '10\x1faThompson, William,\x1fd1785?-1833.\x1e'], + ['10\x1faHodgskin, Thomas,\x1fd1787-1869.\x1e', '10\x1faThompson, William,\x1fd1775-1833.\x1e'], + ['10\x1faHodgskin, Thomas,\x1fd1787-1869.\x1e', '10\x1faThompson, William,\x1fd1775-1833.\x1e'] + ] + a, b = read_people(lines) + assert a == { # better if we could merge the William Thompson subjects + ((u'a', u'Hodgskin, Thomas'), (u'd', u'1787-1869')): 3, + ((u'a', u'Thompson, William'), (u'd', u'1775-1833')): 2, + ((u'a', u'Thompson, William'), (u'd', u'1785?-1833')): 1 + } + +def test_marcus_porcius(): + lines = [ + ['10\x1faCato, Marcus Porcius,\x1fd95-46 B.C.\x1fxDrama.\x1e'], + ['10\x1faCato, Marcus Porcius,\x1fd95-46 B.C.\x1fxDrama\x1e'], + ['10\x1faCato, Marcus Porcius,\x1fd95-46 B.C.\x1fxDrama.\x1e'], + ['10\x1faCato, Marcus Porcius,\x1fd95-46 B.C.\x1fxDrama.\x1e'], + ['10\x1faCato, Marcus Porcius,\x1fd95-46 B.C.\x1fvDrama.\x1e'], + ['10\x1faCato, Marcus Porcius,\x1fd234-149 B.C.\x1fvDrama.\x1e'], + ['10\x1faCato, Marcus Porcius,\x1fd95-46 B.C.\x1fxDrama\x1fxEarly works to 1800\x1e'] + ] + a, b = read_people(lines) + assert a == { + ((u'a', u'Cato, Marcus Porcius'), (u'd', u'234-149 B.C.')): 1, + ((u'a', u'Cato, Marcus Porcius'), (u'd', u'95-46 B.C.')): 6 + } diff --git a/ia-legacy-importer/read_rc.py b/ia-legacy-importer/read_rc.py new file mode 100644 index 00000000..14e69bb8 --- /dev/null +++ b/ia-legacy-importer/read_rc.py @@ -0,0 +1,16 @@ +import os.path + +# ~/.olrc looks like this: +# +# db='' +# user='' +# pw= '' +# host = '' +# secret_key = '' + +def read_rc(): + rc_file = os.path.expanduser('~/.olrc') + if not os.path.exists(rc_file): + return {} + f = open(rc_file) + return eval('dict(' + ', '.join(i for i in f if i) + ')') diff --git a/ia-legacy-importer/scratch/add_source_records.py b/ia-legacy-importer/scratch/add_source_records.py new file mode 100644 index 00000000..667be4dd --- /dev/null +++ b/ia-legacy-importer/scratch/add_source_records.py @@ -0,0 +1,41 @@ +from __future__ import print_function +import os +import re +import sys +import codecs +from openlibrary.catalog.read_rc import read_rc +from openlibrary.catalog.importer.db_read import get_mc + +sys.path.append('/home/edward/src/olapi') +from olapi import OpenLibrary, unmarshal, marshal + +rc = read_rc() +ol = OpenLibrary("http://dev.openlibrary.org") +ol.login('EdwardBot', rc['EdwardBot']) + +test_dir = '/home/edward/ol/test_data' + +re_edition = re.compile('^/b/OL\d+M$') + +re_meta_mrc = re.compile('^([^/]*)_meta.mrc:0:\d+$') + +for f in os.listdir(test_dir): + key = f.replace('_', '/') + if not re_edition.match(key): + continue + print(key) + continue + mc = get_mc(key) + print(key, mc) + if not mc: + continue + e = ol.get(key) + if e.get('source_records', []): + continue + if mc.startswith('ia:') or mc.startswith('amazon:'): + sr = mc + else: + m = re_meta_mrc.match(mc) + sr = 'marc:' + mc if not m else 'ia:' + m.group(1) + e['source_records'] = [sr] + print(ol.save(key, e, 'add source record')) diff --git a/ia-legacy-importer/scratch/count_41.py b/ia-legacy-importer/scratch/count_41.py new file mode 100644 index 00000000..be889d67 --- /dev/null +++ b/ia-legacy-importer/scratch/count_41.py @@ -0,0 +1,85 @@ +from __future__ import print_function +import web +import os.path +from catalog.get_ia import read_marc_file +from catalog.read_rc import 
read_rc +from catalog.marc.fast_parse import get_first_tag, get_all_subfields +from catalog.utils.query import query_iter + +marc_index = web.database(dbn='postgres', db='marc_index') +marc_index.printing = False + +rc = read_rc() + +def get_keys(loc): + assert loc.startswith('marc:') + vars = {'loc': loc[5:]} + db_iter = marc_index.query('select k from machine_comment where v=$loc', vars) + mc = list(db_iter) + if mc: + return [r.k for r in mc] + iter = query_iter({'type': '/type/edition', 'source_records': loc}) + return [e['key'] for e in iter] + +def files(): + endings = ['.mrc', '.marc', '.out', '.dat', '.records.utf8'] + def good(filename): + return any(filename.endswith(e) for e in endings) + + dir = rc['marc_path'] + dir_len = len(dir) + 1 + for dirpath, dirnames, filenames in os.walk(dir): + for f in sorted(f for f in filenames if good(f)): + name = dirpath + "/" + f + yield name, name[dir_len:], os.path.getsize(name) + +def percent(a, b): + return "%.2f%%" % (float(a * 100.0) / b) + +chunk = 10000 + +books = 0 +has_041 = 0 +has_a = 0 +has_h = 0 +has_2 = 0 +i2 = 0 +i1_0 = 0 +i1_1 = 0 +for name, part, size in files(): + f = open(name) + print(part) + for pos, loc, data in read_marc_file(part, f): + if str(data)[6:8] != 'am': # only want books + continue + books += 1 + line = get_first_tag(data, set(['041'])) + if not line: + continue + has_041 += 1 + if line[0] == '0': + i1_0 += 1 + if line[0] == '1': + i1_1 += 1 + subfields = list(get_all_subfields(line)) + print(loc) + keys = get_keys(loc) + print(keys, line[0:2], subfields) + continue + if line[1] != ' ': + i2 += 1 + print('i2:', line[0:2], subfields) + if '\x1fa' in line: + has_a +=1 + else: + print('no a:', line[0:2], subfields) + if '\x1fh' in line: + has_h +=1 + if '\x1f2' in line: + has_2 +=1 + print('has 2:', line[0:2], subfields) + if has_041 % chunk == 0: + print(books, percent(has_041, books), percent(i1_0, has_041), \ + percent(i1_1, has_041), i2, percent(has_a, has_041), \ + percent(has_h, has_041), has_2) +# print total, line[0:2], list(get_all_subfields(line)) diff --git a/ia-legacy-importer/scratch/get_651.py b/ia-legacy-importer/scratch/get_651.py new file mode 100644 index 00000000..6e2275c2 --- /dev/null +++ b/ia-legacy-importer/scratch/get_651.py @@ -0,0 +1,34 @@ +from catalog.importer.db_read import get_mc, withKey +from catalog.get_ia import get_from_local +from catalog.marc.fast_parse import get_tag_lines, get_all_subfields +import sys +import web +import simplejson as json + +def get_src(key): + e = withKey(key) + if 'source_records' in e: + return e['source_records'] + src = get_mc(key) + if src: + return [src] + +def get_651(key): + found = [] + for src in get_src(key): + data = get_from_local(src) + for tag, line in get_tag_lines(data, ['651']): + found.append(list(get_all_subfields(line))) + return found + +urls = ( + '^(/b/OL\d+M)$', 'lookup' +) +app = web.application(urls, globals()) + +class lookup: + def GET(self, key): + return json.dumps(get_651(key)) + +if __name__ == "__main__": + app.run() diff --git a/ia-legacy-importer/scratch/remove_subject_period.py b/ia-legacy-importer/scratch/remove_subject_period.py new file mode 100644 index 00000000..09c583a0 --- /dev/null +++ b/ia-legacy-importer/scratch/remove_subject_period.py @@ -0,0 +1,54 @@ +from __future__ import print_function +from catalog.utils.query import query_iter, set_staging, withKey +import sys +import codecs +import re +sys.path.append('/home/edward/src/olapi') +from olapi import OpenLibrary, Reference +from catalog.read_rc import 
read_rc + +import six + + +rc = read_rc() + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +set_staging(True) + +ol = OpenLibrary("http://dev.openlibrary.org") +ol.login('EdwardBot', rc['EdwardBot']) + +re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$') + +def has_dot(s): + return s.endswith('.') and not re_skip.search(s) + +q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None } +queue = [] +count = 0 +for e in query_iter(q): + if not e.get('subjects', None) or not any(has_dot(s) for s in e['subjects']): + continue + subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']] + q = { + 'key': e['key'], + 'subjects': {'connect': 'update_list', 'value': subjects }, + } + # need to fix table_of_contents to pass validation + toc = e['table_of_contents'] + if toc and (isinstance(toc[0], six.string_types) or toc[0]['type'] == '/type/text'): + if isinstance(toc[0], six.string_types): + assert all(isinstance(i, six.string_types) for i in toc) + new_toc = [{'title': i, 'type': '/type/toc_item'} for i in toc] + else: + assert all(i['type'] == '/type/text' for i in toc) + new_toc = [{'title': i['value'], 'type': '/type/toc_item'} for i in toc] + q['table_of_contents'] = {'connect': 'update_list', 'value': new_toc } + queue.append(q) + count += 1 + if len(queue) == 100: + print(count, 'writing to db') + print(ol.write(queue, "remove trailing period from subjects")) + queue = [] + +print(ol.write(queue, "remove trailing period from subjects")) diff --git a/ia-legacy-importer/scratch/work_author_role.py b/ia-legacy-importer/scratch/work_author_role.py new file mode 100644 index 00000000..696d6851 --- /dev/null +++ b/ia-legacy-importer/scratch/work_author_role.py @@ -0,0 +1,55 @@ +from __future__ import print_function +import sys +import codecs +from openlibrary.catalog.utils.query import query_iter, set_staging, query +from openlibrary.api import OpenLibrary, Reference +from openlibrary.catalog.read_rc import read_rc +from time import sleep + +set_staging(True) +rc = read_rc() + +ol = OpenLibrary("http://dev.openlibrary.org") +ol.login('EdwardBot', rc['EdwardBot']) +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +work_q = { + 'type': '/type/work', + 'authors': None, + 'title': None, +} + +queue = [] + +for w in query_iter(work_q): + if not w.get('authors'): + print('no authors') + continue + if any(isinstance(a, dict) and 'author' in a for a in w['authors']): + continue + print(len(queue), w['key'], w['title']) # , ol.get(w['authors'][0]['key'])['name'] + full = ol.get(w['key']) + authors = full['authors'] + assert all(isinstance(a, Reference) for a in authors) + full['authors'] = [{'author':a} for a in authors] + queue.append(full) + if len(queue) > 1000: + print('saving') + print(ol.save_many(queue, 'update format of authors in works to provide roles')) + queue = [] + print('two second pause') + sleep(2) + continue + work_e = { + 'type': '/type/edition', + 'works': w['key'], + 'by_statement': None, + } + for e in query_iter(work_e): + by = e['by_statement'] + if by: + print(' ', e['key'], by) + +print('saving') +print(ol.save_many(queue, 'update format of authors in works to provide roles')) + diff --git a/ia-legacy-importer/solr/solr.py b/ia-legacy-importer/solr/solr.py new file mode 100755 index 00000000..d2cda9f8 --- /dev/null +++ b/ia-legacy-importer/solr/solr.py @@ -0,0 +1,41 @@ +#!/usr/bin/python + +from __future__ import print_function +from time import sleep, time +import web +import subprocess +import sys +from catalog.read_rc 
import read_rc + +from six.moves import urllib + + +rc = read_rc() + +def solr_query(q, start=0, rows=None, sort_by="publicdate desc"): + q += " AND NOT collection:test_collection AND NOT collection:opensource AND NOT collection:microfilm" +# q += " AND NOT collection:test_collection AND collection:gutenberg" + url = rc['solr_url'] + "?q=%s;%s&wt=json&start=%d" % (urllib.parse.quote(q), urllib.parse.quote_plus(sort_by), start) + if rows: + url += "&rows=%d" % rows + ret = eval(urllib.request.urlopen(url).read()) + return ret['response'] + +def get_books(**args): + ret = solr_query("mediatype:texts AND format:scandata", **args) + #ret = solr_query("mediatype:texts", **args) + return [d['identifier'] for d in ret['docs']] + +if __name__ == '__main__': + rows = 1000 + out = open(sys.argv[1], 'w') + for i in range(20): + print(i) + books = list(get_books(rows=rows, start=i * rows)) + if not books: + break + for b in books: + print(b, file=out) + out.close() + + print("finished") diff --git a/ia-legacy-importer/talis/isbn_and_author_date.py b/ia-legacy-importer/talis/isbn_and_author_date.py new file mode 100644 index 00000000..de324bdb --- /dev/null +++ b/ia-legacy-importer/talis/isbn_and_author_date.py @@ -0,0 +1,67 @@ +from __future__ import print_function +# read Talis, find books with ISBN and author date, add date to author + +from catalog.read_rc import read_rc +from catalog.marc.fast_parse import * +from catalog.infostore import get_site +from catalog.merge.names import match_name +from catalog.marc.build_record import read_author_person + +import re + +site = get_site() + +re_author_date_subfield = re.compile('\x1f[az]') +re_isbn_subfield = re.compile('\x1f[az]') + +rc = read_rc() +filename = rc['marc_path'] + 'talis_openlibrary_contribution/talis-openlibrary-contribution.mrc' + +seen = set() + +def build_fields(data): + fields = {} + for tag, line in get_tag_lines(data, ['020', '100']): + if tag in fields: + return {} + fields[tag] = line + if '020' not in fields or '100' not in fields: + return {} + if fields['100'].find('\x1fd') == -1: + return {} + if not re_isbn_subfield.search(fields['020']): + return {} + return fields + +def find_authors(isbn_list, name): + edition_keys = [] + for isbn in isbn_list: + edition_keys.extend(site.things({'type': '/type/edition', 'isbn_10': isbn})) + authors = set() + for k in edition_keys: + t = site.withKey(k) + if t.authors: + authors.update(t.authors) + for a in authors: + if not match_name(a.name, name, last_name_only_ok=False): + continue + books = site.things({'type': '/type/edition', 'authors': a.key}) + print(repr(a.key, a.name, a.birth_date, a.death_date, len(books))) + +for data, length in read_file(open(filename)): + fields = build_fields(data) + if not fields: + continue + isbn_list = read_isbn(fields['020']) + if not isbn_list: + continue + + if any(isbn in seen for isbn in isbn_list): + continue + seen.update(isbn_list) + person = read_author_person(fields['100']) + print(list(get_all_subfields(fields['100']))) + print(person) + print(isbn_list) + find_authors(isbn_list, person['personal_name']) +# fields.append(tag, list(get_all_subfields(line))) diff --git a/ia-legacy-importer/title_page_img/__init__.py b/ia-legacy-importer/title_page_img/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ia-legacy-importer/title_page_img/load.py b/ia-legacy-importer/title_page_img/load.py new file mode 100644 index 00000000..4f612da4 --- /dev/null +++ b/ia-legacy-importer/title_page_img/load.py @@ -0,0 +1,31 @@ +from 
openlibrary.catalog.read_rc import read_rc +import httplib +import json + +from six.moves import urllib + + +rc = read_rc() + +def add_cover_image(ekey, ia): + h1 = httplib.HTTPConnection('openlibrary.org') + body = json.dumps(dict(username='ImportBot', password=rc['ImportBot'])) + headers = {'Content-Type': 'application/json'} + h1.request('POST', 'http://openlibrary.org/account/login', body, headers) + + res = h1.getresponse() + + res.read() + assert res.status == 200 + cookies = res.getheader('set-cookie').split(',') + cookie = ';'.join([c.split(';')[0] for c in cookies]) + + cover_url = 'http://www.archive.org/download/' + ia + '/page/' + ia + '_preview.jpg' + body = urllib.parse.urlencode({"url": cover_url}) + assert ekey.startswith('/books/') + add_cover_url = 'http://openlibrary.org' + ekey + '/add-cover.json' + headers = {'Content-type': 'application/x-www-form-urlencoded', 'Cookie': cookie} + h1.request('POST', add_cover_url, body, headers) + res = h1.getresponse() + res.read() + return diff --git a/ia-legacy-importer/title_page_img/replace_cover_with_title.py b/ia-legacy-importer/title_page_img/replace_cover_with_title.py new file mode 100644 index 00000000..9c5b6ace --- /dev/null +++ b/ia-legacy-importer/title_page_img/replace_cover_with_title.py @@ -0,0 +1,248 @@ +from __future__ import print_function +from openlibrary.utils.ia import find_item +from openlibrary.catalog.read_rc import read_rc +from openlibrary.catalog.utils.query import query, withKey, has_cover +from subprocess import Popen, PIPE +import web +import re +import sys +import xml.etree.ElementTree as et +import xml.parsers.expat +import socket # for exceptions +import httplib +from time import sleep + +from six.moves import urllib + + +re_single_cover = re.compile('^\[(\d+)\]$') +re_remove_xmlns = re.compile(' xmlns="[^"]+"') + +fh_log = open('/1/edward/logs/covers2', 'a') + +def write_log(ol, ia, url): + print((ol, ia, url), file=fh_log) + fh_log.flush() + +def parse_scandata_xml(f): + xml = f.read() + xml = re_remove_xmlns.sub('', xml) + #tree = et.parse(f) + tree = et.fromstring(xml) + leaf = None + leafNum = None + cover = None + title = None + for e in tree.find('pageData'): + assert e.tag == 'page' + leaf = int(e.attrib['leafNum']) + if leaf > 25: # enough + break + page_type = e.findtext('pageType') + if page_type == 'Cover': + cover = leaf + elif page_type == 'Title Page' or page_type == 'Title': + title = leaf + break + return (cover, title) + +def find_title_leaf_et(ia_host, ia_path, url): + f = urllib.request.urlopen(url) + try: + return parse_scandata_xml(f) + except xml.parsers.expat.ExpatError: + print(url) + return (None, None) + +def jp2_zip_test(ia_host, ia_path, ia): + conn = httplib.HTTPConnection(ia_host) + conn.request('HEAD', ia_path + "/" + ia + "_jp2.zip") + r1 = conn.getresponse() + try: + assert r1.status in (200, 403, 404) + except AssertionError: + print(r1.status, r1.reason) + raise + return r1.status + +def scandata_url(ia_host, ia_path, item_id): + conn = httplib.HTTPConnection(ia_host) + conn.request('HEAD', ia_path + "/scandata.zip") + r = conn.getresponse() + try: + assert r.status in (200, 403, 404) + except AssertionError: + print(r.status, r.reason) + raise + if r.status == 200: + None + conn = httplib.HTTPConnection(ia_host) + path = ia_path + "/" + item_id + "_scandata.xml" + conn.request('HEAD', path) + r = conn.getresponse() + try: + assert r.status in (200, 403, 404) + except AssertionError: + print(ia_host, path) + print(r.status, r.reason) + raise + return 'http://' + 
ia_host + path if r.status == 200 else None + +def scandata_zip_test(ia_host, ia_path): + conn = httplib.HTTPConnection(ia_host) + conn.request('HEAD', ia_path + "/scandata.zip") + r1 = conn.getresponse() + try: + assert r1.status in (200, 403, 404) + except AssertionError: + print(r1.status, r1.reason) + raise + return r1.status + + + +def urlread(url): + return urllib.request.urlopen(url).read() + +def post_cover(ol, source_url): + param = urllib.parse.urlencode({'olid': ol[3:], 'source_url': source_url}) + headers = {"Content-type": "application/x-www-form-urlencoded"} + conn = httplib.HTTPConnection("covers.openlibrary.org", timeout=20) + conn.request("POST", "/b/upload", param, headers) + r1 = conn.getresponse() + print(r1.status, r1.reason) + if r1.status not in (200, 303, 500): + open('upload.html', 'w').write(r1.read()) + print(r1.getheaders()) + print(r1.msg) + sys.exit() + conn.close() + +def post(ol, ia, ia_host, ia_path, cover, title): + use_cover = False + if title is None: + if cover is None: + return + use_cover = True +# http://covers.openlibrary.org/b/query?olid=OL7232120M + if False and not use_cover: + data = urlread('http://openlibrary.org/query.json?key=/b/OL7232119M&publish_date=') + try: + ret = eval(data) + except: + print(repr(data)) + pub_date = ret[0]['publish_date'] + use_cover = pub_date.isdigit() and int(pub_date) > 1955 + leaf = cover if use_cover else title + source_url = "http://%s/GnuBook/GnuBookImages.php?zip=%s/%s_jp2.zip&file=%s_jp2/%s_%04d.jp2" % (ia_host, ia_path, ia, ia, ia, leaf) +# print leaf, source_url + query = 'https://covers.openlibrary.org/b/query?olid=' + ol[3:] + #print query + if use_cover: + print('use_cover', end=' ') + print('https://openlibrary.org' + ol) + for attempt in range(5): + if attempt > 0: + print('trying again (%d)' % attempt) + try: + ret = urlread(query).strip() + except IOError: + continue + print(ret) + if not re_single_cover.match(ret): + print("unexpected reply: '%s'" % ret) + break + try: + write_log(ol, ia, source_url) + post_cover(ol, source_url) + except socket.timeout: + print('socket timeout') + break + except httplib.BadStatusLine: + print('bad status line') + continue + break + +bad_hosts = set() + +def find_img(item_id): + e = query({'type':'/type/edition', 'source_records':'ia:' + item_id}) + if len(e) != 1: + print('no source_records:', e) + e = query({'type':'/type/edition', 'ocaid': item_id}) + if len(e) != 1: + print('no ocaid:', e) + return + ol = e[0]['key'] + (ia_host, ia_path) = find_item(item_id) + + if not ia_host: + print('no host', item_id, ia_host) + return + if ia_host in bad_hosts: + print('bad_host') + try: + url = scandata_url(ia_host, ia_path, item_id) + if not url: + return + except socket.error: + print('socket error:', ia_host) + bad_hosts.add(ia_host) + return + + try: + status = jp2_zip_test(ia_host, ia_path, item_id) + except socket.error: + print('socket error:', ia_host) + bad_hosts.add(ia_host) + return + if status in (403, 404): + print('jp2 not found:', (ol, item_id)) + return + + try: + (cover, title) = find_title_leaf_et(ia_host, ia_path, url) + except (KeyboardInterrupt, SystemExit, NameError): + raise + if not cover or not title: + return +# except: +# print 'skip error:', ol, item_id, ia_host, ia_path +# return + print((ol, item_id, ia_host, ia_path, cover, title)) + post(ol, item_id, ia_host, ia_path, cover, title) + +def has_cover_retry(key): + for attempt in range(5): + try: + return has_cover(key) + except KeyboardInterrupt: + raise + except: + pass + sleep(2) + +skip = 
True +skip = False +for line in open('/1/edward/jsondump/2009-07-29/has_ocaid'): + key = line[:-1] + if key == '/b/OL6539962M': # the end + break + if skip: + if key == '/b/OL6539962M': + skip = False + else: + continue + if not has_cover_retry(key): + print('no cover') + continue + print(key) + e = withKey(key) + if not e.get('ocaid', None): + print('no ocaid') + continue + find_img(e['ocaid'].strip()) + +fh_log.close() + +print('finished') diff --git a/ia-legacy-importer/treasury/parse.py b/ia-legacy-importer/treasury/parse.py new file mode 100644 index 00000000..cdf415ad --- /dev/null +++ b/ia-legacy-importer/treasury/parse.py @@ -0,0 +1,71 @@ +from __future__ import print_function +import re +import sys +import xml.etree.ElementTree as et +from pprint import pprint + +def parse_catrecord(catrecord): + record = {} + re_bad_tag = re.compile(r'(<[^>]*?\s[^>]*?>)') + re_white = re.compile(r'\s') + catrecord = re_bad_tag.sub(lambda m: re_white.sub('', m.group(1)), catrecord) + tree = et.fromstring(catrecord) + record = {} + for e in tree: + f = e.tag.lower() + if e.tag == 'AUTHORS': + assert f not in record + record[f] = [(a.tag.lower(), a.text) for a in e] + continue + if e.tag == 'SEGMENT': + d = dict([(a.tag.lower(), a.text) for a in e]) + record.setdefault(f, []).append(d) + continue + elif e.tag in ('SUBJ', 'COLL', 'ALTTI', 'SERIES'): + record.setdefault(f, []).append(e.text) + continue + assert len(e) == 0 + assert f not in record + record[f] = e.text + return record + +def parse_file(): + cur = '' + expect = 'start' + i = 0 + re_call = re.compile('^(.*)\r\n$') + re_itemid = re.compile('^(.*)\r\n$') + for line in open(sys.argv[1]): + i+=1 + assert expect != 'end_file' + if expect == 'start': + assert line == 'Department of Treasury\r\n' + expect = 'start_catrecord' + continue + if expect == 'start_catrecord': + if line == '\r\n': + print("skipping duplicate CATRECORD") + continue + assert line == '\r\n' + cur += line + expect = 'end_catrecord' + continue + if expect == 'end_catrecord': + if line.startswith(''): + cur += '' + yield parse_catrecord(cur) + + cur = '' + if line == '\r\n': + expect = 'end_file' + else: + assert line == '\r\n' + expect = 'start_catrecord' + continue + else: + cur += line + + assert expect == 'end_file' + +for rec in parse_file(): + pprint(rec) diff --git a/ia-legacy-importer/update_count.py b/ia-legacy-importer/update_count.py new file mode 100644 index 00000000..c36edb00 --- /dev/null +++ b/ia-legacy-importer/update_count.py @@ -0,0 +1,71 @@ +from olwrite import Infogami, add_to_database +import web +import dbhash +from read_rc import read_rc +import cjson +import re +import sys +from time import time + +def commify(n): + """ +Add commas to an integer repr(n). 
+ +>>> commify(1) +'1' +>>> commify(123) +'123' +>>> commify(1234) +'1,234' +>>> commify(1234567890) +'1,234,567,890' +>>> commify(None) +>>> +""" + if n is None: return None + r = [] + for i, c in enumerate(reversed(str(n))): + if i and (not (i % 3)): + r.insert(0, ',') + r.insert(0, c) + return ''.join(r) + +def count_books(): + rows = list(web.query("select count(*) as num from thing where type=52")) + return rows[0].num + +def count_fulltext(): + rows = list(web.query("select count(*) as num from edition_str where key_id=40")) + return commify(rows[0].num) + +def get_macro(): + rows = list(web.query("select data from data, thing where thing_id=thing.id and key='/macros/BookCount' and revision=latest_revision")) + return cjson.decode(rows[0].data)['macro']['value'] + +rc = read_rc() +web.config.db_parameters = dict(dbn='postgres', db=rc['db'], user=rc['user'], pw=rc['pw'], host=rc['host']) +web.config.db_printing = False +web.ctx.ip = '127.0.0.1' +web.load() + +book_count = count_books() +open('/home/edward/book_count', 'a').write("%d %d\n" % (time(), book_count)) + +infogami = Infogami(rc['infogami']) +infogami.login('edward', rc['edward']) + +macro = get_macro() +re_books = re.compile(r'books = "[\d,]+"') +books = commify(book_count) +macro = re_books.sub('books = "' + books + '"', macro) + +# full text count is disabled so that the number stays about 1 million +# fulltext = count_fulltext() +# re_fulltext = re.compile(r'fulltext = "[\d,]+"') +# macro = re_fulltext.sub('fulltext = "' + fulltext + '"', macro) + +q = { + 'key': '/macros/BookCount', + 'macro': { 'connect': 'update', 'type': '/type/text', 'value': macro } +} +infogami.write(q, comment='update book count') diff --git a/ia-legacy-importer/utils/__init__.py b/ia-legacy-importer/utils/__init__.py new file mode 100644 index 00000000..abba2342 --- /dev/null +++ b/ia-legacy-importer/utils/__init__.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- +import re +import web +from unicodedata import normalize +import openlibrary.catalog.merge.normalize as merge + +import six + +try: + cmp = cmp # Python 2 +except NameError: + def cmp(x, y): # Python 3 + return (x > y) - (x < y) + + +re_date = map (re.compile, [ + r'(?P\d+\??)-(?P\d+\??)', + r'(?P\d+\??)-', + r'b\.? (?P(?:ca\. )?\d+\??)', + r'd\.? (?P(?:ca\. )?\d+\??)', + r'(?P.*\d+.*)-(?P.*\d+.*)', + r'^(?P[^-]*\d+[^-]+ cent\.[^-]*)$']) + +re_ad_bc = re.compile(r'\b(B\.C\.?|A\.D\.?)') +re_date_fl = re.compile('^fl[., ]') +re_number_dot = re.compile(r'\d{2,}[- ]*(\.+)$') +re_l_in_date = re.compile(r'(l\d|\dl)') +re_end_dot = re.compile(r'[^ .][^ .]\.$', re.UNICODE) +re_marc_name = re.compile('^(.*?),+ (.*)$') +re_year = re.compile(r'\b(\d{4})\b') + +re_brackets = re.compile(r'^(.+)\[.*?\]$') + + +def key_int(rec): + # extract the number from a key like /a/OL1234A + return int(web.numify(rec['key'])) + + +def author_dates_match(a, b): + """ + Checks if the years of two authors match. Only compares years, + not names or keys. Works by returning False if any year specified in one record + does not match that in the other, otherwise True. If any one author does not have + dates, it will return True. 
+ + :param dict a: Author import dict {"name": "Some One", "birth_date": "1960"} + :param dict b: Author import dict {"name": "Some One"} + :rtype: bool + """ + for k in ['birth_date', 'death_date', 'date']: + if k not in a or a[k] is None or k not in b or b[k] is None: + continue + if a[k] == b[k] or a[k].startswith(b[k]) or b[k].startswith(a[k]): + continue + m1 = re_year.search(a[k]) + if not m1: + return False + m2 = re_year.search(b[k]) + if m2 and m1.group(1) == m2.group(1): + continue + return False + return True + + +def flip_name(name): + """ + Flip author name about the comma, stripping the comma, and removing non + abbreviated end dots. Returns name with end dot stripped if no comma+space found. + The intent is to convert a Library indexed name to natural name order. + + :param str name: e.g. "Smith, John." or "Smith, J." + :rtype: str + :return: e.g. "John Smith" or "J. Smith" + """ + + m = re_end_dot.search(name) + if m: + name = name[:-1] + if name.find(', ') == -1: + return name + m = re_marc_name.match(name) + return m.group(2) + ' ' + m.group(1) + + +def remove_trailing_number_dot(date): + m = re_number_dot.search(date) + if m: + return date[:-len(m.group(1))] + else: + return date + +def remove_trailing_dot(s): + if s.endswith(" Dept."): + return s + m = re_end_dot.search(s) + if m: + s = s[:-1] + return s + +def fix_l_in_date(date): + if not 'l' in date: + return date + return re_l_in_date.sub(lambda m:m.group(1).replace('l', '1'), date) + +re_ca = re.compile('ca\.([^ ])') + +def parse_date(date): + if re_date_fl.match(date): + return {} + date = remove_trailing_number_dot(date) + date = re_ca.sub(lambda m:'ca. ' + m.group(1), date) + if date.find('-') == -1: + for r in re_date: + m = r.search(date) + if m: + return dict((k, fix_l_in_date(v)) for k, v in m.groupdict().items()) + return {} + + parts = date.split('-') + i = { 'birth_date': parts[0].strip() } + if len(parts) == 2: + parts[1] = parts[1].strip() + if parts[1]: + i['death_date'] = fix_l_in_date(parts[1]) + if not re_ad_bc.search(i['birth_date']): + m = re_ad_bc.search(i['death_date']) + if m: + i['birth_date'] += ' ' + m.group(1) + if 'birth_date' in i and 'l' in i['birth_date']: + i['birth_date'] = fix_l_in_date(i['birth_date']) + return i + +re_cent = re.compile('^[\dl][^-]+ cent\.$') + +def pick_first_date(dates): + # this is to handle this case: + # 100: $aLogan, Olive (Logan), $cSikes, $dMrs., $d1839- + # see http://archive.org/download/gettheebehindmes00logaiala/gettheebehindmes00logaiala_meta.mrc + # or http://pharosdb.us.archive.org:9090/show-marc?record=gettheebehindmes00logaiala/gettheebehindmes00logaiala_meta.mrc:0:521 + + dates = list(dates) + if len(dates) == 1 and re_cent.match(dates[0]): + return { 'date': fix_l_in_date(dates[0]) } + + for date in dates: + result = parse_date(date) + if result != {}: + return result + + return { 'date': fix_l_in_date(' '.join([remove_trailing_number_dot(d) for d in dates])) } + +def strip_accents(s): + return normalize('NFKD', six.text_type(s)).encode('ASCII', 'ignore') + +re_drop = re.compile('[?,]') + +def match_with_bad_chars(a, b): + if six.text_type(a) == six.text_type(b): + return True + a = normalize('NFKD', six.text_type(a)).lower() + b = normalize('NFKD', six.text_type(b)).lower() + if a == b: + return True + a = a.encode('ASCII', 'ignore') + b = b.encode('ASCII', 'ignore') + if a == b: + return True + def drop(s): + return re_drop.sub('', s) + return drop(a) == drop(b) + +def accent_count(s): + return len([c for c in norm(s) if ord(c) > 127]) + +def 
norm(s): + return normalize('NFC', s) if isinstance(s, six.text_type) else s + +def pick_best_name(names): + names = [norm(n) for n in names] + n1 = names[0] + assert all(match_with_bad_chars(n1, n2) for n2 in names[1:]) + names.sort(key=lambda n:accent_count(n), reverse=True) + assert '?' not in names[0] + return names[0] + +def pick_best_author(authors): + n1 = authors[0]['name'] + assert all(match_with_bad_chars(n1, a['name']) for a in authors[1:]) + authors.sort(key=lambda a:accent_count(a['name']), reverse=True) + assert '?' not in authors[0]['name'] + return authors[0] + +def tidy_isbn(input): + output = [] + for i in input: + i = i.replace('-', '') + if len(i) in (10, 13): + output.append(i) + continue + if len(i) == 20 and all(c.isdigit() for c in i): + output.extend([i[:10], i[10:]]) + continue + if len(i) == 21 and not i[10].isdigit(): + output.extend([i[:10], i[11:]]) + continue + if i.find(';') != -1: + no_semicolon = i.replace(';', '') + if len(no_semicolon) in (10, 13): + output.append(no_semicolon) + continue + split = i.split(';') + if all(len(j) in (10, 13) for j in split): + output.extend(split) + continue + output.append(i) + return output + +def strip_count(counts): + foo = {} + for i, j in counts: + foo.setdefault(i.rstrip('.').lower() if isinstance(i, six.string_types) else i, []).append((i, j)) + ret = {} + for k, v in foo.iteritems(): + m = max(v, key=lambda x: len(x[1]))[0] + bar = [] + for i, j in v: + bar.extend(j) + ret[m] = bar + return sorted(ret.iteritems(), cmp=lambda x,y: cmp(len(y[1]), len(x[1]) )) + +def fmt_author(a): + if 'birth_date' in a or 'death_date' in a: + return "%s (%s-%s)" % ( a['name'], a.get('birth_date', ''), a.get('death_date', '') ) + return a['name'] + +def get_title(e): + if e.get('title_prefix', None) is not None: + prefix = e['title_prefix'] + if prefix[-1] != ' ': + prefix += ' ' + title = prefix + e['title'] + else: + title = e['title'] + return title + + +def mk_norm(s): + """ + Normalizes titles and strips ALL spaces and small words + to aid with string comparisons of two titles. + + :param str s: A book title to normalize and strip. + :rtype: str + :return: a lowercase string with no spaces, containg the main words of the title. 
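+
+    For example (illustrative): mk_norm("The Adventures of Tom Sawyer") would give
+    roughly 'adventuresoftomsawyer' (leading article, punctuation and spaces stripped).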
+ """ + + m = re_brackets.match(s) + if m: + s = m.group(1) + norm = merge.normalize(s).strip(' ') + norm = norm.replace(' and ', ' ') + if norm.startswith('the '): + norm = norm[4:] + elif norm.startswith('a '): + norm = norm[2:] + return norm.replace(' ', '') + + +def error_mail(msg_from, msg_to, subject, body): + assert isinstance(msg_to, list) + msg = 'From: %s\nTo: %s\nSubject: %s\n\n%s' % (msg_from, ', '.join(msg_to), subject, body) + + import smtplib + server = smtplib.SMTP('mail.archive.org') + server.sendmail(msg_from, msg_to, msg) + server.quit() diff --git a/ia-legacy-importer/utils/authority.py b/ia-legacy-importer/utils/authority.py new file mode 100644 index 00000000..a0fcbee4 --- /dev/null +++ b/ia-legacy-importer/utils/authority.py @@ -0,0 +1,107 @@ +from __future__ import print_function +from mechanize import Browser +import re +import os.path +from openlibrary.catalog.read_rc import read_rc + +rc = read_rc() + +start = "http://authorities.loc.gov/cgi-bin/Pwebrecon.cgi?DB=local&PAGE=First" +def get_table_rows(fh): + cur = '' + expect = 'thesauri' + for line in fh: + if expect == 'thesauri': + if line == 'Type of Heading\n': + expect = 'headings_close_tr' + continue + if expect == 'headings_close_tr': + assert line == '\n' + expect = 'tr' + continue + if expect == 'tr': + assert line == '\n' + expect = 'center' + continue + if expect == 'center': + if line == '\n': + yield cur.decode('utf-8') + cur = '' + elif line == '\n': + yield cur.decode('utf-8') + break + else: + cur += line + continue + +re_row = re.compile('^\n(?:[^\n(\d+)\n\n\n(\d+)\n\n\n(.+)\n\n\n(.+)\n\n$') +re_no_link = re.compile('^\n\n\d+\n\n') + +def read_serp(fh): + cur_row = 0 + for row in get_table_rows(fh): + cur_row += 1 + if re_no_link.match(row): + continue + m = re_row.match(row) + if not m: + print(row) + (param, a, row_num, bib_records, heading, type_of_heading) = m.groups() + assert str(cur_row) == row_num + yield { + 'a': a, + 'bib_records': bib_records, + 'heading': heading, + 'type': type_of_heading + } + +def search(arg): + assert '/' not in arg # because we use it in a filename + cache = rc['authority_cache'] + filename = cache + '/' + arg + if os.path.exists(filename): + return [eval(i) for i in open(filename)] + br = Browser() + br.set_handle_robots(False) + br.open(start) + br.select_form(name="querybox") + br['Search_Arg'] = arg.encode('utf-8') + br['Search_Code'] = ['NHED_'] + res = br.submit() + found = list(read_serp(res)) + br.close() + out = open(filename, 'w') + for i in found: + print(i, file=out) + out.close() + return found + +def test_harold_osman_kelly(): + arg = 'Kelly, Harold Osman' + found = search(arg) + assert found[0]['heading'] == 'Kelly, Harold Osman, 1884-1955' + +def test_jesus(): + arg = 'Jesus Christ' + found = search(arg) + assert found[0]['heading'] == 'Jesus Christ' + +def test_pope_sixtus(): + arg = 'Sixtus V Pope' + found = search(arg) + assert found[0]['heading'] == 'Sixtus V, Pope, 1520-1590' + +def test_william_the_conqueror(): + arg = 'William I King of England' + found = search(arg) + assert found[0]['heading'] == 'William I, King of England, 1027 or 8-1087' + +def test_non_ascii_result(): + arg = 'Asoka King of Magadha' + found = search(arg) + assert found[0]['heading'] == u'As\u0301oka, King of Magadha, fl. 259 B.C.' + +def test_non_ascii_param(): + arg = u'A\u015boka King of Magadha' + found = search(arg) + assert found[0]['heading'] == u'As\u0301oka, King of Magadha, fl. 259 B.C.' 
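As an illustrative sketch (not part of any file in this import), the name/date helpers defined in utils/__init__.py above are typically combined like this when normalising a MARC 100 heading; the example values are made up:

    from openlibrary.catalog.utils import flip_name, pick_first_date, mk_norm

    flip_name('Twain, Mark.')                # -> 'Mark Twain'
    pick_first_date(['1835-1910.'])          # -> {'birth_date': '1835', 'death_date': '1910'}
    mk_norm('The Adventures of Tom Sawyer')  # -> roughly 'adventuresoftomsawyer'
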
diff --git a/ia-legacy-importer/utils/del.py b/ia-legacy-importer/utils/del.py new file mode 100644 index 00000000..e01bfca3 --- /dev/null +++ b/ia-legacy-importer/utils/del.py @@ -0,0 +1,25 @@ +from __future__ import print_function +from catalog.infostore import get_site +from catalog.olwrite import Infogami +from catalog.read_rc import read_rc + +rc = read_rc() +infogami = Infogami(rc['infogami']) + +site = get_site() + +# throwaway bit of code for deleting bad scan records +# BPL can't scan microtext + +keys = site.things({'type': '/type/scan_record', 'locations': '/scanning_center/MBMBN/BPL1MI', 'scan_status': 'NOT_SCANNED'}) +while keys: + for key in keys: + sr = site.withKey(key) + print(key) + q = { + 'key': key, + 'type': { 'connect': 'update', 'value': '/type/delete' }, + } + ret = infogami.write(q, comment="can't scan microtext") + assert ret['status'] == 'ok' + keys = site.things({'type': '/type/scan_record', 'locations': '/scanning_center/MBMBN/BPL1MI', 'scan_status': 'NOT_SCANNED'}) diff --git a/ia-legacy-importer/utils/edit.py b/ia-legacy-importer/utils/edit.py new file mode 100644 index 00000000..ec94e755 --- /dev/null +++ b/ia-legacy-importer/utils/edit.py @@ -0,0 +1,105 @@ +from __future__ import print_function +import re +import web +import json +from openlibrary.catalog.importer.db_read import get_mc +from openlibrary.api import unmarshal +from time import sleep + +import six +from six.moves import urllib + +re_meta_mrc = re.compile('([^/]+)_(meta|marc).(mrc|xml)') +re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$') + +db_amazon = web.database(dbn='postgres', db='amazon') +db_amazon.printing = False + +def query_with_retry(ol, q): + for attempt in range(50): + try: + return ol.query(q) + except: + sleep(5) + print('retry attempt', attempt) + +def get_with_retry(ol, k): + for attempt in range(50): + try: + return ol.get(k) + except: + sleep(5) + print('retry attempt', attempt) + +def amazon_source_records(asin): + iter = db_amazon.select('amazon', where='asin = $asin', vars={'asin':asin}) + return ["amazon:%s:%s:%d:%d" % (asin, r.seg, r.start, r.length) for r in iter] + +def has_dot(s): + return s.endswith('.') and not re_skip.search(s) + +def fix_toc(e): + toc = e.get('table_of_contents', None) + if not toc: + return + if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item': + if len(toc) == 1 and 'title' not in toc[0]: + del e['table_of_contents'] # remove empty toc + return + new_toc = [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i] + e['table_of_contents'] = new_toc + +def fix_subject(e): + if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']): + subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']] + e['subjects'] = subjects + +def undelete_author(a, ol): + key = a['key'] + assert a['type'] == '/type/delete' + url = 'http://openlibrary.org' + key + '.json?v=' + str(a['revision'] - 1) + prev = unmarshal(json.load(urllib.request.urlopen(url))) + assert prev['type'] == '/type/author' + ol.save(key, prev, 'undelete author') + +def undelete_authors(authors, ol): + for a in authors: + if a['type'] == '/type/delete': + undelete_author(a, ol) + else: + assert a['type'] == '/type/author' + +def fix_authors(e, ol): + if 'authors' not in e: + return + authors = [get_with_retry(ol, akey) for akey in e['authors']] + while any(a['type'] == '/type/redirect' for a in authors): + print('following redirects') + authors = [get_with_retry(ol, a['location']) if a['type'] == '/type/redirect' else a 
for a in authors] + e['authors'] = [{'key': a['key']} for a in authors] + undelete_authors(authors, ol) + +def fix_edition(key, e, ol): + existing = get_mc(key) + if 'source_records' not in e and existing: + amazon = 'amazon:' + if existing.startswith('ia:'): + sr = [existing] + elif existing.startswith(amazon): + sr = amazon_source_records(existing[len(amazon):]) or [existing] + else: + print('existing:', existing) + m = re_meta_mrc.search(existing) + sr = ['marc:' + existing if not m else 'ia:' + m.group(1)] + e['source_records'] = sr + if 'ocaid' in e: + ia = 'ia:' + e['ocaid'] + if 'source_records' not in e: + e['source_records'] = [ia] + elif ia not in e['source_records']: + e['source_records'].append(ia) + + fix_toc(e) + fix_subject(e) + fix_authors(e, ol) + return e diff --git a/ia-legacy-importer/utils/query.py b/ia-legacy-importer/utils/query.py new file mode 100644 index 00000000..d42f4190 --- /dev/null +++ b/ia-legacy-importer/utils/query.py @@ -0,0 +1,162 @@ +from __future__ import print_function +import web +import simplejson as json +from time import sleep +import sys + +from six.moves import urllib + + +query_host = 'openlibrary.org' + +def urlopen(url, data=None): + version = "%s.%s.%s" % sys.version_info[:3] + user_agent = 'Mozilla/5.0 (openlibrary; %s) Python/%s' % (__name__, version) + headers = { + 'User-Agent': user_agent + } + req = urllib.request.Request(url, data, headers) + return urllib.request.urlopen(req) + +def jsonload(url): + return json.load(urlopen(url)) + +def urlread(url): + return urlopen(url).read() + +def set_query_host(host): + global query_host + query_host = host + +def has_cover(key): + url = 'https://covers.openlibrary.org/' + key[1] + '/query?olid=' + key[3:] + return urlread(url).strip() != '[]' + +def has_cover_retry(key): + for attempt in range(5): + try: + return has_cover(key) + except KeyboardInterrupt: + raise + except: + pass + sleep(2) + +def base_url(): + return "http://" + query_host + +def query_url(): + return base_url() + "/query.json?query=" + +def get_all_ia(): + print('c') + q = {'source_records~': 'ia:*', 'type': '/type/edition'} + limit = 10 + q['limit'] = limit + q['offset'] = 0 + + while True: + url = base_url() + "/api/things?query=" + web.urlquote(json.dumps(q)) + ret = jsonload(url)['result'] + for i in ret: + yield i + if not ret: + return + q['offset'] += limit + +def query(q): + url = query_url() + urllib.parse.quote(json.dumps(q)) + ret = None + for i in range(20): + try: + ret = urlread(url) + while ret.startswith('canceling statement due to statement timeout'): + ret = urlread(url) + if not ret: + print('ret == None') + except IOError: + pass + if ret: + try: + data = json.loads(ret) + if isinstance(data, dict): + if 'error' in data: + print('error:') + print(ret) + assert 'error' not in data + return data + except: + print(ret) + print(url) + sleep(20) + +def query_iter(q, limit=500, offset=0): + q['limit'] = limit + q['offset'] = offset + while True: + ret = query(q) + if not ret: + return + for i in ret: + yield i + # We haven't got as many we have requested. 
No point making one more request + if len(ret) < limit: + break + q['offset'] += limit + +def get_editions_with_covers_by_author(author, count): + q = {'type': '/type/edition', 'title_prefix': None, 'subtitle': None, 'title': None, 'authors': author} + with_covers = [] + for e in query_iter(q, limit=count): + if not has_cover(e['key']): + continue + with_covers.append(e) + if len(with_covers) == count: + return with_covers + return with_covers + +def version_iter(q, limit=500, offset=0): + q['limit'] = limit + q['offset'] = offset + while True: + url = base_url() + '/version' + v = jsonload(url) + if not v: + return + for i in query(q): + yield i + q['offset'] += limit + +def withKey(key): + url = base_url() + key + '.json' + for i in range(20): + try: + return jsonload(url) + except: + pass + print('retry:', i) + print(url) + +def get_marc_src(e): + mc = get_mc(e['key']) + if mc: + yield mc + if not e.get('source_records', []): + return + for src in e['source_records']: + if src.startswith('marc:') and src != 'marc:' + mc: + yield src[5:] + +def get_mc(key): # get machine comment + v = jsonload(base_url() + key + '.json?m=history') + + comments = [i['machine_comment'] for i in v if i.get('machine_comment', None) and ':' in i['machine_comment']] + if len(comments) == 0: + return None + if len(set(comments)) != 1: + print(key) + print(comments) + assert len(set(comments)) == 1 + if comments[0] == 'initial import': + return None + return comments[0] diff --git a/ia-legacy-importer/wikipedia/__init__.py b/ia-legacy-importer/wikipedia/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ia-legacy-importer/wikipedia/find_ol_authors.py b/ia-legacy-importer/wikipedia/find_ol_authors.py new file mode 100644 index 00000000..78d01bce --- /dev/null +++ b/ia-legacy-importer/wikipedia/find_ol_authors.py @@ -0,0 +1,28 @@ +from __future__ import print_function +from catalog.utils import pick_first_date +import web +import re +import sys +import codecs +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +re_marc_name = re.compile('^(.*), (.*)$') +re_end_dot = re.compile('[^ ][^ ]\.$', re.UNICODE) + +def flip_name(name): + # strip end dots like this: "Smith, John." but not like this: "Smith, J." 
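+    # e.g. "Smith, John." -> "John Smith", but "Smith, J." -> "J. Smith" (the initial keeps its dot)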
+ m = re_end_dot.search(name) + if m: + name = name[:-1] + + m = re_marc_name.match(name) + return m.group(2) + ' ' + m.group(1) + +for wikipedia, marc in (eval(i) for i in open("matches4")): + dates = pick_first_date(v for k, v in marc if k == 'd') + name = ' '.join(v for k, v in marc if k in 'abc') + print(name) + if ', ' in name: + print(flip_name(name)) + print(dates) + diff --git a/ia-legacy-importer/wikipedia/find_people.pl b/ia-legacy-importer/wikipedia/find_people.pl new file mode 100644 index 00000000..243a7b75 --- /dev/null +++ b/ia-legacy-importer/wikipedia/find_people.pl @@ -0,0 +1,66 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use lib '/home/edward/lib/perl5'; +use JSON::XS; +use Parse::MediaWikiDump; + +#my $coder = JSON::XS->new->ascii; +my $coder = JSON::XS->new->utf8; + +#binmode STDOUT, ":utf8"; + +open my $fh, "-|", "curl http://download.wikimedia.org/enwiki/20081008/enwiki-20081008-pages-articles.xml.bz2 | bzip2 -dc -" or die $!; +my $pages = Parse::MediaWikiDump::Pages->new($fh); + +sub get_template { + my ($template, $text) = @_; + $text =~ /({{\s*$template)/igc or return; + my $depth = 1; + my $infobox = $1; + while ($depth) { + unless ($text =~ /\G(.*?({{|}}))/sgc) { + return; + } + $infobox .= $1; + $2 eq '}}' and do { $depth--; next }; + $2 eq '{{' and do { $depth++; next }; + } + return $infobox; +} + +my $page; +open my $redirect, ">", 'redirects' or die; +open my $people, ">", 'people' or die; +while(defined($page = $pages->next)) { + $page->namespace and next; + if ($page->redirect) { + print $redirect $coder->encode([$page->title, $page->redirect]), "\n"; + next; + } + my $cats = $page->categories; + $cats or next; + my $text = ${$page->text}; + my $len = length($text); + my $skip = 1; + for (@$cats) { + /(writer|people|birth|death)/ or next; + $skip = 0; + last; + } + $skip and next; + my %out = ( + title => $page->title, + cats => $cats, + len => $len, + ); + for (qw(persondata defaultsort infobox lifetime)) { + my $template = get_template($_, $text); + $template and $out{$_} = $template; + } + print $people $coder->encode(\%out), "\n"; +} + +close $redirect; +close $people; diff --git a/ia-legacy-importer/wikipedia/lookup.py b/ia-legacy-importer/wikipedia/lookup.py new file mode 100644 index 00000000..7407d261 --- /dev/null +++ b/ia-legacy-importer/wikipedia/lookup.py @@ -0,0 +1,521 @@ +from __future__ import print_function +import web +import re +import codecs +import sys +from time import time +from catalog.marc.fast_parse import get_subfields, get_all_subfields, get_subfield_values +from catalog.utils import pick_first_date +from unicodedata import normalize +from pprint import pprint + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +# bad cats: +# ... animal births +# ... animal deaths +# ... peoples + +db = web.database(dbn='postgres', db='wiki_people') +db.printing = False + +re_comma = re.compile(', *') + +re_marc_name = re.compile('^(.*), (.*)$') + +def flip_name(name): + m = re_marc_name.match(name) + if m: + return m.group(2) + ' ' + m.group(1) + return name + +re_title_of = re.compile('^(.*) (of .*)$') + +re_digit = re.compile('\d+') +re_decade = re.compile('^(\d+)s$') +re_bc_date = re.compile('^(.*) B\.C\.?$') +re_cent = re.compile('^(?:fl\.? ?)?(\d+)[a-z]{0,2}\.? cent\.$') +# fl. 13th cent/14th cent. +re_cent_range = re.compile('^(?:fl\.? ?)?(\d+)[a-z]{0,2}\.?(?: cent)?[-/](\d+)[a-z]{0,2}\.? 
cent\.$') +re_century = re.compile('^(\d+)[a-z][a-z] century$') + +def decade_match(a, start, ca): + end = start + 10 + if ca: + start -= 9 + end += 9 + if a.isdigit(): + return start <= int(a) < end + return any((start <= int(c) < end) for c in re_digit.findall(a)) + +def year_approx_match(a, b): + approx_century_match = False + if a.startswith('ca. '): + ca = True + a = a[4:] + range = 15 + else: + ca = False + range = 9 + if a == b: + return True + if a.replace('.', '') == b: + return True # ca. 440 B.C. + if a.endswith(' cent.') and b.endswith(' century') and b.startswith(a[:-1]): + return True + + bc = False + if b.endswith(' BC'): + m = re_bc_date.match(a) + if m: + a = m.group(1) + b = b[:-3] + bc = True + if approx_century_match and a.isdigit() and b.endswith(' century'): + a = int(a) + m = re_century.match(b) + assert m + cent = int(m.group(1)) + start = cent - 1 if not bc else cent + end = cent if not bc else cent + 1 + if start * 100 <= a < end * 100: + return True + + if b.isdigit(): + b = int(b) + if a.isdigit() and (bc or b < 1850) and abs(int(a) - b) <= range: + return True + if approx_century_match and a.endswith(' cent.'): + m = re_cent.match(a) + if m: + cent = int(m.group(1)) + start = cent - 1 if not bc else cent + end = cent if not bc else cent + 1 + if start * 100 <= b < end * 100: + return True + for c in re_digit.findall(a): + c = int(c) + if c == b: + return True + if (bc or b < 1850) and abs(c - b) <= range: + return True + return False + m = re_decade.match(b) + if not m: + return False + start = int(m.group(1)) + return decade_match(a, start, ca) + +def test_year_approx_match(): + assert not year_approx_match('1939', '1940') + assert year_approx_match('582', '6th century') + assert year_approx_match('13th cent.', '1240') + assert year_approx_match('ca. 360 B.C.', '365 BC') + assert year_approx_match('1889', '1890') + assert year_approx_match('1883?', '1882') + assert year_approx_match('1328?', '1320s') + assert year_approx_match('11th cent.', '11th century') + assert not year_approx_match('1330', '1320s') + assert not year_approx_match('245 B.C.', '3rd century BC') + +#test_year_approx_match() + +# fl. 13th cent/14th cent. +def cent_range(c): + m = re_cent_range.match(c) + if m: + a, b = int(m.group(1)), int(m.group(2)) + assert b == a + 1 + return ((a-1) * 100, b * 100) + m = re_cent.match(c) + assert m + a = int(m.group(1)) + return ((a-1) * 100, a * 100) + +re_fl = re.compile('^fl\.? 
?(\d+)\.?$') + +def get_birth_and_death(cats): + birth = None + death = None + for c in cats: + if c.endswith(' births'): + birth = c[:-7] + continue + elif c.endswith(' deaths'): + death = c[:-7] + continue + return birth, death + +re_century_writers_cat = re.compile('(\d+)[a-z]{2}-century.* writers') + +def date_match(dates, cats): + match_found = False + if len(dates) == 1 and 'date' in dates: + marc = dates['date'] + if marc.startswith('fl.'): + m = re_fl.match(marc) + if m: + birth, death = get_birth_and_death(cats) + if birth and death and birth.isdigit() and death.isdigit(): + return int(birth) < int(m.group(1)) < int(death) + if marc.endswith(' cent.'): + m = re_cent.match(marc) + if m: + cent = marc[:-6] + '-century' + if any(c.endswith(' writers') and cent in c for c in cats): + return True + m = re_cent_range.match(marc) + if m: + if any(cm.group(1) in m.groups() for cm in (re_century_writers_cat.match(c) for c in cats) if cm): + return True + + try: + (a, b) = cent_range(marc) + except: + print(marc) + raise + for c in cats: + for f in (' births', ' deaths'): + if not c.endswith(f): + continue + date = c[:-len(f)] + if date.isdigit(): + if a < int(date) < b: + match_found = True + else: + return False + else: + if year_approx_match(marc, date): + match_found = True + else: + return False + + return match_found + + for f in ['birth', 'death']: + if f + '_date' not in dates: + continue + marc = dates[f + '_date'] + this_cats = [i[:-(len(f)+2)] for i in cats if i.endswith(' %ss' % f)] + if not this_cats: + continue + m = any(year_approx_match(marc, i) for i in this_cats) + if m: + match_found = True + else: + return False + return match_found + +def norm_name(n): + return re_comma.sub(' ', n).lower() + +# example: "Ibn Daud, Abraham ben David," -> "Ibn Daud" +re_name_comma = re.compile('^([^, ]+ [^, ]+)?, [^ ]') + +def name_lookup(fields): + def join_fields(fields, want): + return ' '.join(v for k, v in fields if k in want) + + fields = [(k, v.lower()) for k, v in fields] + + if not any(k == 'd' for k, v in fields): + return [] + ab = [v for k, v in fields if k in 'ab'] + name = ' '.join(ab) + flipped = flip_name(name) + names = set([name, flipped]) + + a = join_fields(fields, 'a') + m = re_name_comma.match(a) + if m: + names.add(m.group(1)) + + #names = set([flipped]) + if any(k == 'c' for k, v in fields): + name = join_fields(fields, 'abc') + names.update([name, flip_name(name)]) + title = [v for k, v in fields if k in 'c'] + names.update([' '.join(title + ab), ' '.join(title + [flipped])]) + title = ' '.join(title) + names.update(["%s (%s)" % (name, title), "%s (%s)" % (flipped, title)]) + sp = title.find(' ') + if sp != -1: + m = re_title_of.search(title) + if m: + role, of_place = m.groups() + names.update([' '.join(ab + [of_place]), ' '.join([flipped, of_place])]) + names.update([' '.join([role] + ab + [of_place]), ' '.join([role, flipped, of_place])]) + + t = title[:sp] + names.update([' '.join([t] + ab), ' '.join([t, flipped])]) + if 'of st. ' in title: # for "Richard of St. 
Victor" + names.update([i.replace('of st.', 'of saint') for i in names]) + + found = [] + for n in set(re_comma.sub(' ', n) for n in names): + iter = db.query("select title, cats, name, persondata from names, people where people.id = names.person_id and name=$n", {'n':n}) + x = [(i.title, eval(i.cats), i.name, i.persondata) for i in iter if not i.title.startswith('Personal life of ')] + found += x + return found + +noble_or_clergy = ['King', 'Queen', 'Prince', 'Princess', 'Duke', 'Archduke', 'Baron', 'Pope', 'Antipope', 'Bishop', 'Archbishop'] +re_noble_or_clergy = re.compile('(' + '|'.join( noble_or_clergy ) + ')') + +def strip_brackets(line): + if line[4] == '[' and line[-2] == ']': + return line[0:4] + line[5:-2] + line[-1] + else: + return line + +def fmt_line(fields): + def bold(s): + return ''.join(i + '\b' + i for i in s) + def norm(s): + return normalize('NFC', s) + return ''.join(bold("$" + k) + norm(v) for k, v in fields) + +def pick_from_match(match): + l = [(norm_name(k), v) for k, v in match.items()] + good = [(k, v) for k, v in l if any(k == m for m in v['match_name'])] + if len(good) == 1: + return dict(good) + exact_date = [(k, v) for k, v in l if v['exact_dates']] + if len(exact_date) == 1: + return dict(exact_date) + if len(exact_date) > 1 and len(good) > 1: + exact_date = [(k, v) for k, v in good if v['exact_dates']] + if len(exact_date) == 1: + return dict(exact_date) + return match + +def more_than_one_match(match): + return [("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'), i) for name, i in match.items()] + +def test_date_match(): + # $aAngelico,$cfra,$dca. 1400-l455. + dates = {'birth_date': u'ca. 1400', 'death_date': u'1455'} + cats = [u'1395 births', u'1455 deaths'] + assert date_match(dates, cats) + + # $aAndocides,$dca. 440-ca. 390 B.C. + dates = {'birth_date': u'ca. 440 B.C.', 'death_date': u'ca. 390 B.C.'} + cats = [u'440 BC births', u'390 BC deaths', u'Ancient Athenians'] + assert date_match(dates, cats) + + # $aAlexander,$cof Hales,$dca. 1185-1245. + dates = {'birth_date': u'ca. 1185', 'death_date': u'1245'} + cats = [u'13th century philosophers', u'1245 deaths', u'Roman Catholic philosophers', u'English theologians', u'Franciscans', u'Scholastic philosophers', u'People from Gloucestershire'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1922'} + cats = [u'1830 births', u'1876 deaths'] + assert not date_match(dates, cats) + + dates = {'birth_date': u'1889', 'death_date': u'1947'} + cats = [u'1890 births', u'1947 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1889', 'death_date': u'1947'} + cats = [u'1890 births', u'1947 deaths'] + assert date_match(dates, cats) + + dates = {} + cats = [u'1890 births', u'1947 deaths'] + assert not date_match(dates, cats) + + dates = {'birth_date': u'1883?', 'death_date': u'1963'} + cats = [u'1882 births', u'1963 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1328?', 'death_date': u'1369'} + cats = [u'Karaite rabbis', u'1320s births', u'1369 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'ca. 1110', 'death_date': u'ca. 1180'} + cats = [u'1120s births', u'1198 deaths'] + assert date_match(dates, cats) + + # $aAbu Nuwas,$dca. 756-ca. 810. # Abu Nuwas + dates = {'birth_date': u'ca. 756', 'death_date': u'ca. 
810'} + cats = [u'750 births', u'810 deaths'] + assert date_match(dates, cats) + +date_cats = (' births', ' deaths', 'century writers', 'century Latin writers', 'century women writers', 'century French writers') # time for an regexp + +def exact_date_match(dates, cats): + if 'date' in dates or not all(i in dates for i in ('birth_date', 'death_date')): + return False + if any('ca.' in i for i in dates.values()): + return False + birth, death = get_birth_and_death(cats) + return dates['birth_date'] == birth and dates['death_date'] == death + +def look_for_match(found, dates, verbose): + match = {} + for name, cats, match_name, pd in found: + found_name_match = norm_name(name) == match_name + #seen.add(name) + if not any(any(cat.endswith(i) for i in date_cats) for cat in cats): + if False and not found_name_match: + print('name match, but no date cats') + print(name, cats, match_name) + print(dates) + print() + continue + exact_dm = exact_date_match(dates, cats) + dm = exact_dm or date_match(dates, cats) + if not dm and found_name_match: + if 'death_date' in dates: + death = dates['death_date'] + if death + ' deaths' in cats: + dm = True + elif 'birth_date' in dates: + birth = dates['birth_date'] + if birth.isdigit(): + assert birth + ' births' not in cats + if dm: + if name in match: + match[name]['match_name'].append(match_name) + else: + match[name] = {'cats': cats, 'exact_dates': exact_dm, 'match_name': [match_name]} + if not verbose: + continue + print((name, match_name)) + print("cats =", cats) + print(('match' if dm else 'no match')) + for field in ['birth', 'death']: + print(field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)], end=' ') + print() + if verbose: + print('---') + return match + +def test_lookup(): + line = '00\x1faEgeria,\x1fd4th/5th cent.\x1e' # count=3 + wiki = 'Egeria (pilgrim)' + print(fmt_line(get_subfields(line, 'abcd'))) + fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) + print(fields) + found = name_lookup(fields) + print(found) + dates = pick_first_date(v for k, v in fields if k == 'd') + assert dates.items()[0] != ('date', '') + print(dates) + print() + print(look_for_match(found, dates, True)) + +#test_lookup() + +def test_lookup2(): + line = '00\x1faRichard,\x1fcof St. Victor,\x1fdd. 
1173.\x1e' + print(fmt_line(get_subfields(line, 'abcd'))) + fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) + print(fields) + found = name_lookup(fields) + dates = pick_first_date(v for k, v in fields if k == 'd') + assert dates.items()[0] != ('date', '') + print(dates) + print() + match = look_for_match(found, dates, False) + pprint(match) + print() + match = pick_from_match(match) + pprint(match) + +def test_lookup3(): + line = '00\x1faJohn,\x1fcof Paris,\x1fd1240?-1306.\x1e' + print(fmt_line(get_subfields(line, 'abcd'))) + fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) + print(fields) + found = name_lookup(fields) +# print [i for i in found if 'Paris' in i[0]] +# found = [(u'John of Paris', [u'Christian philosophers', u'Dominicans', u'Roman Catholic theologians', u'13th-century Latin writers', u'1255 births', u'1306 deaths'], u'john of paris', None)] + dates = pick_first_date(v for k, v in fields if k == 'd') + match = look_for_match(found, dates, False) + match = pick_from_match(match) + pprint(match) + +def test_lookup4(): + fields = (('a', 'Forbes, George'), ('d', '1849-1936.')) + found = name_lookup(fields) + dates = pick_first_date(v for k, v in fields if k == 'd') + match = look_for_match(found, dates, False) + for k, v in match.iteritems(): + print(k, v) + match = pick_from_match(match) + pprint(match) + +#test_lookup4() + +def db_marc_lookup(): + verbose = False + articles = set() + count = 0 + count_with_date = 0 + t0 = time() + match_count = 0 + total = 3596802 + prev_fields = None + fh = open('matches', 'w') + bad = codecs.open('more_than_one_match', 'w', 'utf8') + for line in open('/1/edward/wikipedia/marc_authors2'): + count+=1 +# (author_count, line) = eval(line) + (line, author_count) = eval(line) +# line = strip_brackets(line) + if count % 5000 == 0: + t1 = time() - t0 + rec_per_sec = count / t1 + time_left = (total - count) / rec_per_sec + #print fmt_line(get_subfields(line, 'abcd')) +# print list(get_subfields(line, 'abcd')) + print(line) + print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (float(match_count * 100.0) / float(count_with_date), time_left / 60)) + fields = tuple((k, v.strip(' /,;:')) for k, v in line) + if prev_fields == fields: + continue + prev_fields = fields + dates = pick_first_date(v for k, v in fields if k == 'd') + if dates.items()[0] == ('date', ''): + continue + count_with_date += 1 + if verbose: + print(line) + print(dates) + is_noble_or_clergy = any(k =='c' and re_noble_or_clergy.search(v) for k, v in fields) + found = name_lookup(fields) + if not found: + continue + if is_noble_or_clergy: + print('noble or clergy not found:', line) + print() + continue + match = look_for_match(found, dates, verbose) + + if not match: + continue + if is_noble_or_clergy: + print('noble or clergy not found:') + print(fmt_line(line)) + print(found) + print() + continue + match_count+=1 +# articles.add(match.keys()[0]) + if len(match) != 1: + match = pick_from_match(match) + if len(match) != 1: + print("\n" + fmt_line(line), file=bad) + for i in more_than_one_match(match): + print(i, file=bad) + else: + #print (list(get_subfields(line, 'abcd')), match.keys()[0]) + cats = match.values()[0]['cats'] + exact = match.values()[0]['exact_dates'] + dc = [i for i in cats if any(i.endswith(j) for j in date_cats)] + print((match.keys()[0], fields, author_count, dc, exact, 'Living people' in cats), file=fh) + print(match_count) + fh.close() + +if __name__ == '__main__': + db_marc_lookup() diff 
--git a/ia-legacy-importer/wikipedia/process.py b/ia-legacy-importer/wikipedia/process.py new file mode 100644 index 00000000..8a5dadf8 --- /dev/null +++ b/ia-legacy-importer/wikipedia/process.py @@ -0,0 +1,686 @@ +# coding=utf8 +from __future__ import print_function +import bz2 +import codecs +import sys +import re +import simplejson as json +from catalog.marc.fast_parse import get_subfields, get_all_subfields, get_subfield_values +from unicodedata import normalize +import MySQLdb +from catalog.utils import pick_first_date +from time import time + +re_marc_name = re.compile('^(.*), (.*)$') + +def norm(s): + return normalize('NFC', s) + +def get_conn(): + return MySQLdb.connect(passwd='', user='', use_unicode=True, charset='utf8', db='wiki_people') + +def get_cursor(): + return get_conn().cursor() + +sys.stdout = codecs.getwriter('utf8')(sys.stdout) +re_skip = re.compile('^(History|Demograph(ics|y)|Lists?) of') + +def list_names(): + for line in bz2.BZ2File('people.bz2'): + cur = json.loads(line.decode('utf8')) + title = cur['title'] + if re_skip.match(title): + continue + print(title) + +def redirects(): + titles = set([line[:-1] for line in codecs.open('people_names', 'r', 'utf8')]) + + for line in bz2.BZ2File('redirects.bz2'): + (f, t) = json.loads(line.decode('utf8')) + t = t.replace('_', ' ') + if t in titles: + print((f, t)) + +def redirect_dict(): + redirects = {} + for line in open('people_redirects'): + (f, t) = eval(line) + t = t.replace('_', ' ') + redirects.setdefault(t, []).append(f) + print(redirects) + +def add_redirects(): + redirects = eval(open('redirect_dict').read()) + for line in bz2.BZ2File('people.bz2'): + cur = json.loads(line.decode('utf8')) + title = cur['title'] + if re_skip.match(title): + continue + if title in redirects: + cur['redirects'] = redirects[title] + print(cur) + +#add_redirects() +#redirect_dict() + +re_syntax = re.compile(r'(.*?)(\||{{|}}|\[\[|\]\])', re.DOTALL) +re_html_comment = re.compile('') +re_space_or_underscore = re.compile('[ _]') +re_infobox_template = re.compile('^infobox[_ ]books?(?:\s*)?\s*', re.I) +re_persondata = re.compile('^Persondata\s*', re.I) + +re_line = re.compile('^\s*\|\s*([A-Z ]+?)\s*=\s*(.*?)\s*$') +def parse_template2(s): + fields = {} + for l in s.split('\n'): + m = re_line.match(l) + if not m: + continue + name, value = m.groups() + fields[name.strip()] = value + return fields + +def parse_template(s, expected_name): + template_depth = 1 + link_depth = 0 + pos = 2 + buf = '' + + data = [] + while template_depth > 0: + m = re_syntax.match(s[pos:]) + + pos = pos+m.end() + buf += m.group(1) + if m.group(2) == '{{': + buf += m.group(2) + template_depth += 1 + continue + + if m.group(2) == '[[': + buf += m.group(2) + link_depth += 1 + continue + + if template_depth == 1 and link_depth == 0: + data.append(buf) + buf = '' + elif m.group(2) == '|': + buf += '|' + if m.group(2) == '}}': + buf += m.group(2) + template_depth -= 1 + continue + if m.group(2) == ']]': + buf += m.group(2) + if link_depth > 0: + link_depth -= 1 + continue + assert m.group(2) == '|' + if buf != '}}': + return parse_template2(s) + assert buf == '}}' + + template_name = data.pop(0) + try: + assert template_name.lstrip().lower().startswith(expected_name.lower()) + #assert re_persondata.match(infobox_template) + #assert re_infobox_template.match(infobox_template) + except AssertionError: + print(template_name) + raise + + fields = {} + for line in data: + line = line.strip(); + if line == '' or ((line.startswith('')) or line == 'PLEASE SEE [[WP:PDATA]]!': + 
continue + if '=' in line: + name, value = line.split('=', 1) + else: + m = re_missing_equals.match(line) + if not m: + return parse_template2(s) + name, value = m.groups() + fields[name.strip()] = value.strip() + return fields + +re_missing_equals = re.compile('^([A-Z ]+) (.+)$') + +def parse_pd(pd): + lines = pd.split('\n') + print(repr(lines[-1])) + assert lines[-1] == '}}' + +def read_person_data(): + expect = set([u'DATE OF DEATH', u'NAME', u'SHORT DESCRIPTION', u'ALTERNATIVE NAMES', u'PLACE OF BIRTH', u'DATE OF BIRTH', u'PLACE OF DEATH']) + for line in open('people'): + cur = eval(line) + if 'persondata' not in cur: + continue + title = cur['title'] + if title == 'Murray Bookchin': + continue +# print 'title:', title + pd = cur['persondata'] + k = set(parse_template(pd, 'persondata').keys()) + if k > expect: + print(title) + print(k) + +def iter_people(): + return (eval(line) for line in open('people')) + +def date_cats(): + re_date_cat = re.compile('^(.*\d.*) (birth|death)s$') + cats = {'birth': {}, 'death':{}} + for cur in iter_people(): + title = cur['title'] + #print [cat for cat in cur['cats'] if cat.endswith('births') or cat.endswith('deaths')] + for cat in cur['cats']: + m = re_date_cat.match(cat) + if not m: + continue + cats[m.group(2)].setdefault(m.group(1), set()).add(title) +# print 'birth:', [(i[0], len(i[1])) for i in sorted(cats['birth'].items(), reverse = True, key = lambda i: len(i[1]))[:5]] +# print 'death:', [(i[0], len(i[1])) for i in sorted(cats['death'].items(), reverse = True, key = lambda i: len(i[1]))[:5]] + print(cats) + +#read_person_data() +#date_cats() + +def fmt_line(fields): + def bold(s): + return ''.join(i + '\b' + i for i in s) + return ''.join(bold("$" + k) + norm(v) for k, v in fields) + +def strip_brackets(line): + if line[4] == '[' and line[-2] == ']': + return line[0:4] + line[5:-2] + line[-1] + else: + return line + +def read_marc(): + for line in bz2.BZ2File('marc_authors.bz2'): + line = eval(line) + if '[Sound recording]' in line: + continue + line = strip_brackets(line) + #print expr_in_utf8(get_all_subfields(line)) + print(fmt_line(get_subfields(line, 'abcd'))) + +#read_marc() + +# 528,859 wikipedia +# 3,596,802 MARC + + +def get_names(cur): + titles = [cur['title']] + cur.get('redirects', []) + if 'persondata' in cur: + pd = parse_template(cur['persondata'], 'persondata') + if 'NAME' in pd and pd['NAME']: + titles.append(pd['NAME']) + if 'ALTERNATIVE NAMES' in pd: + alt = pd['ALTERNATIVE NAMES'] + if len(alt) > 100 and ',' in alt and ';' not in alt: + alt = alt.split(',') + else: + alt = alt.split(';') + titles += [j for j in (i.strip() for i in alt) if j] + return set(i.lower() for i in titles) + +def read_people(): + from collections import defaultdict +# wiki = [] +# title_lookup = defaultdict(list) + maximum = 0 + for cur in iter_people(): +# wiki.append(cur) + titles = [cur['title']] + cur.get('redirects', []) + if 'persondata' in cur: + pd = parse_template(cur['persondata'], 'persondata') + if 'NAME' in pd and pd['NAME']: + titles.append(pd['NAME']) + if 'ALTERNATIVE NAMES' in pd: + alt = pd['ALTERNATIVE NAMES'] + if len(alt) > 100 and ',' in alt and ';' not in alt: + alt = alt.split(',') + else: + alt = alt.split(';') + titles += [j for j in (i.strip() for i in alt) if j] + cur_max = max(len(i) for i in titles) + if cur_max > maximum: + maximum = cur_max + print(maximum) + print(cur['title']) + print(titles) +# for t in set(titles): +# title_lookup[t].append(cur) + +# filter names: Robert Bob Adam Hincmar Anselm + +# Personal life 
of Marcus Tullius Cicero + +def load_db(): + c = get_cursor() + c.execute('truncate people') + c.execute('truncate names') + c.execute('truncate redirects') + for person in iter_people(): +# print person + c.execute('insert into people (title, len, infobox, defaultsort, persondata, cats) values (%s, %s, %s, %s, %s, %s)', (person['title'], person['len'], person.get('infobox', None), person.get('defaultsort', None), person.get('persondata', None), repr(person.get('cats', [])))) + id = conn.insert_id() + c.executemany('insert ignore into names (person_id, name) values (%s, %s)', [(id, n) for n in get_names(person)]) + if 'redirects' in person: + redirects = set(r.lower() for r in person['redirects']) + c.executemany('insert ignore into redirects (person_id, redirect) values (%s, %s)', [(id, r) for r in redirects]) + +re_lifetime = re.compile('\{\{lifetime\| *(\d+s?(?: BC)?|missing|unknown|\d\d?[a-z][a-z] century)? *(?:\| *(\d+s?(?: BC)?|living|unknown|missing|\d\d?[a-z][a-z] century)? *)?(?:\|([^|]*))?\}\}', re.I) + +def load_lifetime(): + c = get_cursor() + for person in iter_people(): + if 'lifetime' not in person: + continue + m = re_lifetime.match(person['lifetime']) + if not m: + continue + (birth, death, defaultsort) = m.groups() + cats = person.get('cats', []) +# print "select id from people where title='%s'" % person['title'] + c.execute("select id from people where title=%s", (person['title'],)) + (id,) = c.fetchone() + update_cats = False + if birth and birth.lower() not in ('missing', 'unknown'): + new_cat = birth + " births" + if new_cat not in cats: + cats.append(new_cat) + update_cats = True + if death and death.lower() not in ('missing', 'unknown', 'living'): + new_cat = death + " deaths" + if new_cat not in cats: + cats.append(new_cat) + update_cats = True + if update_cats: + print(person['title']) +# print 'update people set cats=%s where id=%s' % (repr(cats), id) + c.execute('update people set cats=%s where id=%s', (repr(cats), id)) + if defaultsort: + add_to_names(c, id, defaultsort) + +re_defaultsort = re.compile('^{{defaultsort(?:key)?[;:|]\n?(.*)\n?}}$', re.I) + +re_comma = re.compile(', *') +re_comma_and_space = re.compile('[, ]+') + +def add_to_names(c, id, name): + name = re_comma.sub(' ', name).lower().strip() + c.execute('insert ignore into names (person_id, name) values (%s, %s)', (id, name)) + +def add_default_sort(): + c = get_cursor() + c.execute("select id, title, defaultsort from people where defaultsort is not null") + for id, title, ds in c.fetchall(): +# print id, ds + if title == 'Omar Gooding': + ds = '{{DEFAULTSORT:Gooding, Omar}}' + m = re_defaultsort.match(ds) + if not m: + print("http://en.wikipedia.org/wiki/" + title.replace(' ', '_')) + print(ds) + if m.group(1): + add_to_names(c, id, m.group(1)) + +re_br_or_semicolon = re.compile('(?:|;)') +re_strip = re.compile("(?:\([^)]*\)||\[\[.*?\]\]|'''?)") +re_strip2 = re.compile('(?:<.*?>|\{\{.*?\}\})') + +def add_names_from_infobox(): + c = get_cursor() + c.execute("select id, title, infobox from people where infobox is not null") + for id, title, infobox in c.fetchall(): + try: + infobox = parse_template(infobox, 'infobox') + except AttributeError: + continue + for field in 'name', 'full name': + if field not in infobox or not infobox[field]: + continue + v = re_strip.sub('', infobox[field]) +# v = infobox[field] + v = [i for i in (re_strip2.sub('', i).strip(' ,:') for i in re_br_or_semicolon.split(v)) if i] + for i in v: + i = re_comma_and_space.sub(' ', i) + if title != i: + add_to_names(c, id, i) +# 
print title, ':', field, ':', infobox[field], v + +def strip_commas_from_names(): + c = get_cursor() + c.execute("select person_id, name from names where name like '%,%'") + for id, name in c.fetchall(): + new = re_comma.sub(' ', name) + if new == ' ' or new == name: + print((id, name, new)) + assert new != ' ' and new != name + c.execute("update ignore names set name=%s where person_id=%s and name=%s", (new, id, name)) + +#read_people() + +#load_db() + +def flip_name(name): + m = re_marc_name.match(name) + if m: + return m.group(2) + ' ' + m.group(1) + return name + +re_digit = re.compile('\d+') +re_decade = re.compile('^(\d+)s$') +re_bc_date = re.compile('^(.*) B\.C\.?$') +re_cent = re.compile('^(\d+)[a-z][a-z] cent\.$') +re_century = re.compile('^(\d+)[a-z][a-z] century$') + +def decade_match(a, start): + end = start + 10 + if a.isdigit(): + return start <= int(a) < end + return any((start <= int(c) < end) for c in re_digit.findall(a)) + +def year_approx_match(a, b): + approx_century_match = False + if a.startswith('ca. '): + ca = True + a = a[4:] + range = 20 + else: + ca = False + range = 9 + if a == b: + return True + if a.replace('.', '') == b: + return True # ca. 440 B.C. + if a.endswith(' cent.') and b.endswith(' century') and b.startswith(a[:-1]): + return True + + bc = False + if b.endswith(' BC'): + m = re_bc_date.match(a) + if m: + a = m.group(1) + b = b[:-3] + bc = True + if approx_century_match and a.isdigit() and b.endswith(' century'): + a = int(a) + m = re_century.match(b) + assert m + cent = int(m.group(1)) + start = cent - 1 if not bc else cent + end = cent if not bc else cent + 1 + #print cent, start, a, end + if start * 100 <= a < end * 100: + return True + + if b.isdigit(): + b = int(b) + if a.isdigit() and (bc or b < 1850) and abs(int(a) - b) <= range: + return True + if approx_century_match and a.endswith(' cent.'): + m = re_cent.match(a) + if m: + cent = int(m.group(1)) + start = cent - 1 if not bc else cent + end = cent if not bc else cent + 1 + if start * 100 <= b < end * 100: + return True + for c in re_digit.findall(a): + c = int(c) + if c == b: + return True + if (bc or b < 1850) and abs(c - b) <= range: + return True + return False + m = re_decade.match(b) + if not m: + return False + start = int(m.group(1)) + return decade_match(a, start) + +def test_year_approx_match(): + assert not year_approx_match('1939', '1940') + assert year_approx_match('582', '6th century') + assert year_approx_match('13th cent.', '1240') + assert year_approx_match('ca. 360 B.C.', '365 BC') + assert year_approx_match('1889', '1890') + assert year_approx_match('1883?', '1882') + assert year_approx_match('1328?', '1320s') + assert year_approx_match('11th cent.', '11th century') + assert not year_approx_match('1330', '1320s') + assert not year_approx_match('245 B.C.', '3rd century BC') + +def date_match(dates, cats): + match_found = False + for f in ['birth', 'death']: + if f + '_date' not in dates: + continue + marc = dates[f + '_date'] + this_cats = [i[:-(len(f)+2)] for i in cats if i.endswith(' %ss' % f)] + if not this_cats: + continue + m = any(year_approx_match(marc, i) for i in this_cats) + #print m, marc, this_cats + if m: + match_found = True + else: + return False + return match_found + +def test_date_match(): + # $aAngelico,$cfra,$dca. 1400-l455. + dates = {'birth_date': u'ca. 1400', 'death_date': u'1455'} + cats = [u'1395 births', u'1455 deaths'] + assert date_match(dates, cats) + + # $aAndocides,$dca. 440-ca. 390 B.C. + dates = {'birth_date': u'ca. 
440 B.C.', 'death_date': u'ca. 390 B.C.'} + cats = [u'440 BC births', u'390 BC deaths', u'Ancient Athenians'] + assert date_match(dates, cats) + + # $aAlexander,$cof Hales,$dca. 1185-1245. + dates = {'birth_date': u'ca. 1185', 'death_date': u'1245'} + cats = [u'13th century philosophers', u'1245 deaths', u'Roman Catholic philosophers', u'English theologians', u'Franciscans', u'Scholastic philosophers', u'People from Gloucestershire'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1922'} + cats = [u'1830 births', u'1876 deaths'] + assert not date_match(dates, cats) + + dates = {'birth_date': u'1889', 'death_date': u'1947'} + cats = [u'1890 births', u'1947 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1889', 'death_date': u'1947'} + cats = [u'1890 births', u'1947 deaths'] + assert date_match(dates, cats) + + dates = {} + cats = [u'1890 births', u'1947 deaths'] + assert not date_match(dates, cats) + + dates = {'birth_date': u'1883?', 'death_date': u'1963'} + cats = [u'1882 births', u'1963 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1328?', 'death_date': u'1369'} + cats = [u'Karaite rabbis', u'1320s births', u'1369 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'ca. 1110', 'death_date': u'ca. 1180'} + cats = [u'1120s births', u'1198 deaths'] + assert date_match(dates, cats) + + # $aAbu Nuwas,$dca. 756-ca. 810. # Abu Nuwas + dates = {'birth_date': u'ca. 756', 'death_date': u'ca. 810'} + cats = [u'750 births', u'810 deaths'] + assert date_match(dates, cats) + +re_title_of = re.compile('^(.*) (of .*)$') + +def name_lookup(c, fields): + def join_fields(fields, want): + return ' '.join(v for k, v in fields if k in want) + if not any(k == 'd' for k, v in fields): + return [] + ab = [v for k, v in fields if k in 'ab'] + name = ' '.join(ab) + flipped = flip_name(name) + names = set([name, flipped]) + #names = set([flipped]) + if any(k == 'c' for k, v in fields): + name = join_fields(fields, 'abc') + names.update([name, flip_name(name)]) + title = [v for k, v in fields if k in 'c'] + names.update([' '.join(title + ab), ' '.join(title + [flipped])]) + title = ' '.join(title) + names.update(["%s (%s)" % (name, title), "%s (%s)" % (flipped, title)]) + sp = title.find(' ') + if sp != -1: + m = re_title_of.search(title) + if m: + role, of_place = m.groups() + names.update([' '.join(ab + [of_place]), ' '.join([flipped, of_place])]) + names.update([' '.join([role] + ab + [of_place]), ' '.join([role, flipped, of_place])]) + + t = title[:sp] + names.update([' '.join([t] + ab), ' '.join([t, flipped])]) + + found = [] + for n in set(re_comma.sub(' ', n) for n in names): + c.execute("select title, cats, name, persondata from names, people where people.id = names.person_id and name=%s", (n,)) + found += c.fetchall() + return found + +# $aAleksandr Mikhaĭlovich,$cGrand Duke of Russia,$d1866-1933. 
+# == Grand Duke Alexander Mikhailovich of Russia + +def pick_from_match(match): + good = [(name, (cats, match_name)) for name, (cats, match_name) in match.items() if name.lower() == match_name] + if len(good) == 1: + return dict(good) + return match + +def more_than_one_match(match): + for name, (cats, match_name) in match.items(): + print(name, cats, match_name) + print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_')) + print() + +#$aSmith, William,$d1769-1839 +#William Smith (geologist) [u'English geologists', u'Canal engineers', u'People from Oxfordshire', u'Somerset coalfield', u'1769 births', u'1839 deaths', u'People from Scarborough, North Yorkshire', u'Wollaston Medal winners'] william smith +#http://en.wikipedia.org/wiki/William_Smith_(geologist) +#William Smith (South Carolina senator) [u'1762 births', u'1840 deaths', u'United States Senators from South Carolina', u'Democratic Party (United States) vice presidential nominees', u'South Carolina lawyers'] william smith +#http://en.wikipedia.org/wiki/William_Smith_(South_Carolina_senator) + +noble_or_clergy = ['King', 'Queen', 'Prince', 'Princess', 'Duke', 'Archduke', 'Baron', 'Pope', 'Antipope', 'Bishop', 'Archbishop'] +re_noble_or_clergy = re.compile('(' + '|'.join( noble_or_clergy ) + ')') + +def db_marc_lookup(): + verbose = False + c = get_cursor() + articles = set() + count = 0 + count_with_date = 0 + t0 = time() + match_count = 0 + total = 3596802 + prev_fields = None + fh = open('matches3', 'w') + for line in bz2.BZ2File('marc_authors.bz2'): + count+=1 + line = eval(line) + line = strip_brackets(line) + if count % 5000 == 0: + t1 = time() - t0 + rec_per_sec = count / t1 + time_left = (total - count) / rec_per_sec + print(fmt_line(get_subfields(line, 'abcd'))) + print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (float(match_count * 100.0) / float(count_with_date), time_left / 60)) + fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) + if prev_fields == fields: + continue + prev_fields = fields + dates = pick_first_date(v for k, v in fields if k == 'd') + if dates.items()[0] == ('date', ''): + continue + count_with_date += 1 + if verbose: + print(fmt_line(get_subfields(line, 'abcd'))) + print(dates) + is_noble_or_clergy = any(re_noble_or_clergy.search(v) \ + for v in get_subfield_values(line, 'c')) + found = name_lookup(c, fields) + if not found: + continue + if is_noble_or_clergy: + print('noble or clergy not found:') + print(fmt_line(get_subfields(line, 'abcd'))) + print() + continue + match = {} + seen = set() + for name, cats, match_name, pd in found: + if name in seen: + continue + seen.add(name) + cats = eval(cats) + if not any(cat.endswith(' births') or cat.endswith(' deaths') for cat in cats): + continue + dm = date_match(dates, cats) + if dm: + match[name] = (cats, match_name) + if not verbose: + continue + print((name, match_name)) + print("cats =", cats) + print(('match' if dm else 'no match')) + for field in ['birth', 'death']: + print(field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)], end=' ') + print() + if verbose: + print('---') + + if not match: + continue + if is_noble_or_clergy: + print('noble or clergy not found:') + print(fmt_line(get_subfields(line, 'abcd'))) + print(found) + print() + continue + match_count+=1 +# articles.add(match.keys()[0]) + if len(match) != 1: + match = pick_from_match(match) + if len(match) != 1: + print(count, match_count) + print(fmt_line(get_subfields(line, 'abcd'))) + more_than_one_match(match) 
+ else: + #print (list(get_subfields(line, 'abcd')), match.keys()[0]) + print((match.keys()[0], fields), file=fh) + continue +# print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd')) + assert len(match) == 1 + print(match_count) + fh.close() + +#add_names_from_infobox() +#test_year_approx_match() +db_marc_lookup() +#test_date_match() +#add_default_sort() +#strip_commas_from_names() + +#load_lifetime() diff --git a/ia-legacy-importer/wikipedia/read.py b/ia-legacy-importer/wikipedia/read.py new file mode 100644 index 00000000..9c7f4c47 --- /dev/null +++ b/ia-legacy-importer/wikipedia/read.py @@ -0,0 +1,40 @@ +from __future__ import print_function +import sys +import codecs +import re +from catalog.marc.fast_parse import translate +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +prev = None +cur_marc = [] + +trans = {'&':'&','<':'<','>':'>','\n':'
    '} +re_html_replace = re.compile('([&<>\n])') + +def esc(s): + return re_html_replace.sub(lambda m: trans[m.group(1)], s) + +def esc_sp(s): + return esc(s).replace(' ', ' ') + +print('\nAuthors\n') + +print('87 authors with 10 or more variants in MARC records
    ') + +def html_subfields(marc): + return ''.join('' + k + '' + esc(translate(v)) for k, v in marc) + +for line in open("matches4"): + wiki, marc = eval(line) + if prev and prev != wiki: + if len(cur_marc) > 9: + print('

    %s

    ' % (prev.replace(" ", "_"), prev)) + print("%d variants in MARC records
    " % len(cur_marc)) + print("
      ", ''.join("
    • %s
    • \n" % html_subfields(li) for li in cur_marc), "
    ") +# for i in cur_marc: +# print ' ', i + cur_marc = [] + cur_marc.append(marc) + prev = wiki + +print('\n') diff --git a/ia-legacy-importer/wikipedia/tidy.py b/ia-legacy-importer/wikipedia/tidy.py new file mode 100644 index 00000000..e4e51b7a --- /dev/null +++ b/ia-legacy-importer/wikipedia/tidy.py @@ -0,0 +1,536 @@ +from __future__ import print_function +import bz2 +import codecs +import sys +import re +import simplejson as json +from catalog.marc.fast_parse import get_subfields, get_all_subfields, get_subfield_values +from unicodedata import normalize +import MySQLdb +from catalog.utils import pick_first_date +from time import time + +re_marc_name = re.compile('^(.*), (.*)$') + +def norm(s): + return normalize('NFC', s) + +def get_conn(): + return MySQLdb.connect(passwd='', user='', use_unicode=True, charset='utf8', db='wiki_people') + +def get_cursor(): + return get_conn().cursor() + + + +sys.stdout = codecs.getwriter('utf8')(sys.stdout) +re_skip = re.compile('^(History|Demograph(ics|y)|Lists?) of') + +def names(): + + for line in bz2.BZ2File('people.bz2'): + cur = json.loads(line.decode('utf8')) + title = cur['title'] + if re_skip.match(title): + continue + print(title) + +def redirects(): + titles = set([line[:-1] for line in codecs.open('people_names', 'r', 'utf8')]) + + for line in bz2.BZ2File('redirects.bz2'): + (f, t) = json.loads(line.decode('utf8')) + t = t.replace('_', ' ') + if t in titles: + print((f, t)) + +def redirect_dict(): + redirects = {} + for line in open('people_redirects'): + (f, t) = eval(line) + t = t.replace('_', ' ') + redirects.setdefault(t, []).append(f) + print(redirects) + +def add_redirects(): + redirects = eval(open('redirect_dict').read()) + for line in bz2.BZ2File('people.bz2'): + cur = json.loads(line.decode('utf8')) + title = cur['title'] + if re_skip.match(title): + continue + if title in redirects: + cur['redirects'] = redirects[title] + print(cur) + +#add_redirects() +#redirect_dict() + +re_syntax = re.compile(r'(.*?)(\||{{|}}|\[\[|\]\])', re.DOTALL) +re_html_comment = re.compile('') +re_space_or_underscore = re.compile('[ _]') +re_infobox_template = re.compile('^infobox[_ ]books?(?:\s*)?\s*', re.I) +re_persondata = re.compile('^Persondata\s*', re.I) + +re_line = re.compile('^\s*\|\s*([A-Z ]+?)\s*=\s*(.*?)\s*$') +def parse_template2(s): + fields = {} + for l in s.split('\n'): + m = re_line.match(l) + if not m: + continue + name, value = m.groups() + fields[name.strip()] = value + return fields + +def parse_template(s): + template_depth = 1 + link_depth = 0 + pos = 2 + buf = '' + + data = [] + while template_depth > 0: + m = re_syntax.match(s[pos:]) + + pos = pos+m.end() + buf += m.group(1) + if m.group(2) == '{{': + buf += m.group(2) + template_depth += 1 + continue + + if m.group(2) == '[[': + buf += m.group(2) + link_depth += 1 + continue + + if template_depth == 1 and link_depth == 0: + data.append(buf) + buf = '' + if m.group(2) == '}}': + buf += m.group(2) + template_depth -= 1 + continue + if m.group(2) == ']]': + buf += m.group(2) + if link_depth > 0: + link_depth -= 1 + continue + assert m.group(2) == '|' + if buf != '}}': + return parse_template2(s) + assert buf == '}}' + + infobox_template = data.pop(0) + try: + assert re_persondata.match(infobox_template) + #assert re_infobox_template.match(infobox_template) + except AssertionError: + print(infobox_template) + raise + + fields = {} + for line in data: + line = line.strip(); + if line == '' or ((line.startswith('')) or line == 'PLEASE SEE [[WP:PDATA]]!': + continue + if '=' in line: 
+ name, value = line.split('=', 1) + else: + m = re_missing_equals.match(line) + if not m: + return parse_template2(s) + name, value = m.groups() + fields[name.strip()] = value.strip() + return fields + +re_missing_equals = re.compile('^([A-Z ]+) (.+)$') + +def parse_pd(pd): + lines = pd.split('\n') + print(repr(lines[-1])) + assert lines[-1] == '}}' + +def read_person_data(): + expect = set([u'DATE OF DEATH', u'NAME', u'SHORT DESCRIPTION', u'ALTERNATIVE NAMES', u'PLACE OF BIRTH', u'DATE OF BIRTH', u'PLACE OF DEATH']) + for line in open('people'): + cur = eval(line) + if 'persondata' not in cur: + continue + title = cur['title'] + if title == 'Murray Bookchin': + continue +# print 'title:', title + pd = cur['persondata'] + k = set(parse_template(pd).keys()) + if k > expect: + print(title) + print(k) + +def iter_people(): + return (eval(line) for line in open('people')) + +def date_cats(): + re_date_cat = re.compile('^(.*\d.*) (birth|death)s$') + cats = {'birth': {}, 'death':{}} + for cur in iter_people(): + title = cur['title'] + #print [cat for cat in cur['cats'] if cat.endswith('births') or cat.endswith('deaths')] + for cat in cur['cats']: + m = re_date_cat.match(cat) + if not m: + continue + cats[m.group(2)].setdefault(m.group(1), set()).add(title) +# print 'birth:', [(i[0], len(i[1])) for i in sorted(cats['birth'].items(), reverse = True, key = lambda i: len(i[1]))[:5]] +# print 'death:', [(i[0], len(i[1])) for i in sorted(cats['death'].items(), reverse = True, key = lambda i: len(i[1]))[:5]] + print(cats) + +#read_person_data() +#date_cats() + +def fmt_line(fields): + def bold(s): + return ''.join(i + '\b' + i for i in s) + return ''.join(bold("$" + k) + norm(v) for k, v in fields) + +def strip_brackets(line): + if line[4] == '[' and line[-2] == ']': + return line[0:4] + line[5:-2] + line[-1] + else: + return line + +def read_marc(): + for line in bz2.BZ2File('marc_authors.bz2'): + line = eval(line) + if '[Sound recording]' in line: + continue + line = strip_brackets(line) + #print expr_in_utf8(get_all_subfields(line)) + print(fmt_line(get_subfields(line, 'abcd'))) + +#read_marc() + +# 528,859 wikipedia +# 3,596,802 MARC + + +def get_names(cur): + titles = [cur['title']] + cur.get('redirects', []) + if 'persondata' in cur: + pd = parse_template(cur['persondata']) + if 'NAME' in pd and pd['NAME']: + titles.append(pd['NAME']) + if 'ALTERNATIVE NAMES' in pd: + alt = pd['ALTERNATIVE NAMES'] + if len(alt) > 100 and ',' in alt and ';' not in alt: + alt = alt.split(',') + else: + alt = alt.split(';') + titles += [j for j in (i.strip() for i in alt) if j] + return set(i.lower() for i in titles) + +def read_people(): + from collections import defaultdict +# wiki = [] +# title_lookup = defaultdict(list) + maximum = 0 + for cur in iter_people(): +# wiki.append(cur) + titles = [cur['title']] + cur.get('redirects', []) + if 'persondata' in cur: + pd = parse_template(cur['persondata']) + if 'NAME' in pd and pd['NAME']: + titles.append(pd['NAME']) + if 'ALTERNATIVE NAMES' in pd: + alt = pd['ALTERNATIVE NAMES'] + if len(alt) > 100 and ',' in alt and ';' not in alt: + alt = alt.split(',') + else: + alt = alt.split(';') + titles += [j for j in (i.strip() for i in alt) if j] + cur_max = max(len(i) for i in titles) + if cur_max > maximum: + maximum = cur_max + print(maximum) + print(cur['title']) + print(titles) +# for t in set(titles): +# title_lookup[t].append(cur) + +def load_db(): + c = get_cursor() + c.execute('truncate people') + c.execute('truncate names') + c.execute('truncate redirects') + for 
person in iter_people(): +# print person + c.execute('insert into people (title, len, infobox, defaultsort, persondata, cats) values (%s, %s, %s, %s, %s, %s)', (person['title'], person['len'], person.get('infobox', None), person.get('defaultsort', None), person.get('persondata', None), repr(person.get('cats', [])))) + id = conn.insert_id() + c.executemany('insert ignore into names (person_id, name) values (%s, %s)', [(id, n) for n in get_names(person)]) + if 'redirects' in person: + redirects = set(r.lower() for r in person['redirects']) + c.executemany('insert ignore into redirects (person_id, redirect) values (%s, %s)', [(id, r) for r in redirects]) + +# print 'insert into + +#read_people() + +#load_db() + +def flip_name(name): + m = re_marc_name.match(name) + if m: + return m.group(2) + ' ' + m.group(1) + return name + +re_digit = re.compile('\d+') +re_decade = re.compile('^(\d+)s$') +re_bc_date = re.compile('^(.*) B\.C\.?$') +re_cent = re.compile('^(\d+)[a-z][a-z] cent\.$') +re_century = re.compile('^(\d+)[a-z][a-z] century$') + +def decade_match(a, start): + end = start + 10 + if a.isdigit(): + return start <= int(a) < end + return any((start <= int(c) < end) for c in re_digit.findall(a)) + +def year_approx_match(a, b): + approx_century_match = False + if a.startswith('ca. '): + ca = True + a = a[4:] + range = 20 + else: + ca = False + range = 9 + if a == b: + return True + if a.replace('.', '') == b: + return True # ca. 440 B.C. + if a.endswith(' cent.') and b.endswith(' century') and b.startswith(a[:-1]): + return True + + bc = False + if b.endswith(' BC'): + m = re_bc_date.match(a) + if m: + a = m.group(1) + b = b[:-3] + bc = True + if approx_century_match and a.isdigit() and b.endswith(' century'): + a = int(a) + m = re_century.match(b) + assert m + cent = int(m.group(1)) + start = cent - 1 if not bc else cent + end = cent if not bc else cent + 1 + #print cent, start, a, end + if start * 100 <= a < end * 100: + return True + + if b.isdigit(): + b = int(b) + if a.isdigit() and (bc or b < 1900) and abs(int(a) - b) <= range: + return True + if approx_century_match and a.endswith(' cent.'): + m = re_cent.match(a) + if m: + cent = int(m.group(1)) + start = cent - 1 if not bc else cent + end = cent if not bc else cent + 1 + if start * 100 <= b < end * 100: + return True + for c in re_digit.findall(a): + c = int(c) + if c == b: + return True + if (bc or b < 1900) and abs(c - b) <= range: + return True + return False + m = re_decade.match(b) + if not m: + return False + start = int(m.group(1)) + return decade_match(a, start) + +def test_year_approx_match(): + assert not year_approx_match('1939', '1940') + assert year_approx_match('582', '6th century') + assert year_approx_match('13th cent.', '1240') + assert year_approx_match('ca. 
360 B.C.', '365 BC') + assert year_approx_match('1889', '1890') + assert year_approx_match('1883?', '1882') + assert year_approx_match('1328?', '1320s') + assert year_approx_match('11th cent.', '11th century') + assert not year_approx_match('1330', '1320s') + assert not year_approx_match('245 B.C.', '3rd century BC') + +def date_match(dates, cats): + match_found = False + for f in ['birth', 'death']: + if f + '_date' not in dates: + continue + marc = dates[f + '_date'] + this_cats = [i[:-(len(f)+2)] for i in cats if i.endswith(' %ss' % f)] + if not this_cats: + continue + m = any(year_approx_match(marc, i) for i in this_cats) + #print m, marc, this_cats + if m: + match_found = True + else: + return False + return match_found + +def test_date_match(): + # $aAngelico,$cfra,$dca. 1400-l455. + dates = {'birth_date': u'ca. 1400', 'death_date': u'1455'} + cats = [u'1395 births', u'1455 deaths'] + assert date_match(dates, cats) + + # $aAndocides,$dca. 440-ca. 390 B.C. + dates = {'birth_date': u'ca. 440 B.C.', 'death_date': u'ca. 390 B.C.'} + cats = [u'440 BC births', u'390 BC deaths', u'Ancient Athenians'] + assert date_match(dates, cats) + + # $aAlexander,$cof Hales,$dca. 1185-1245. + dates = {'birth_date': u'ca. 1185', 'death_date': u'1245'} + cats = [u'13th century philosophers', u'1245 deaths', u'Roman Catholic philosophers', u'English theologians', u'Franciscans', u'Scholastic philosophers', u'People from Gloucestershire'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1922'} + cats = [u'1830 births', u'1876 deaths'] + assert not date_match(dates, cats) + + dates = {'birth_date': u'1889', 'death_date': u'1947'} + cats = [u'1890 births', u'1947 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1889', 'death_date': u'1947'} + cats = [u'1890 births', u'1947 deaths'] + assert date_match(dates, cats) + + dates = {} + cats = [u'1890 births', u'1947 deaths'] + assert not date_match(dates, cats) + + dates = {'birth_date': u'1883?', 'death_date': u'1963'} + cats = [u'1882 births', u'1963 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'1328?', 'death_date': u'1369'} + cats = [u'Karaite rabbis', u'1320s births', u'1369 deaths'] + assert date_match(dates, cats) + + dates = {'birth_date': u'ca. 1110', 'death_date': u'ca. 1180'} + cats = [u'1120s births', u'1198 deaths'] + assert date_match(dates, cats) + + # $aAbu Nuwas,$dca. 756-ca. 810. # Abu Nuwas + dates = {'birth_date': u'ca. 756', 'death_date': u'ca. 
810'} + cats = [u'750 births', u'810 deaths'] + assert date_match(dates, cats) + +re_title_of = re.compile(' (of .*)$') + +def name_lookup(c, fields): + def join_fields(fields, want): + return ' '.join(v for k, v in fields if k in want) + if not any(k == 'd' for k, v in fields): + return [] + ab = [v for k, v in fields if k in 'ab'] + name = ' '.join(ab) + flipped = flip_name(name) + #names.update([name, flipped]) + names = set([flipped]) + if any(k == 'c' for k, v in fields): + name = join_fields(fields, 'abc') + names.update([name, flip_name(name)]) + title = [v for k, v in fields if k in 'c'] + names.update([' '.join(title + ab), ' '.join(title + [flipped])]) + + title = ' '.join(title) + sp = title.find(' ') + if sp != -1: + m = re_title_of.search(title) + if m: + t = m.group(1) + names.update([' '.join(ab + [t]), ' '.join([flipped, t])]) + + t = title[:sp] + names.update([' '.join([t] + ab), ' '.join([t, flipped])]) + + found = [] + names.update(n.replace(',', '') for n in names.copy() if ',' in n) + for n in names: + c.execute("select title, cats, name, persondata from names, people where people.id = names.person_id and name=%s", (n,)) + found += c.fetchall() + return found + +def db_marc_lookup(): + c = get_cursor() + articles = set() + count = 0 + t0 = time() + match_count = 0 + total = 3596802 + for line in bz2.BZ2File('marc_authors.bz2'): + count+=1 + if count % 1000 == 0: + t1 = time() - t0 + rec_per_sec = count / t1 + time_left = (total - count) / rec_per_sec + print(count, match_count, "%.2f%% %.2f mins left" % ((match_count * 100) / count, time_left / 60)) + line = eval(line) + line = strip_brackets(line) + fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')] + dates = pick_first_date(v for k, v in fields if k == 'd') + if dates.items()[0] == ('date', ''): + continue + found = name_lookup(c, fields) + if not found: + continue + match = {} + seen = set() +# print fmt_line(get_subfields(line, 'abcd')) +# print dates + for name, cats, match_name, pd in found: + if name in seen: + continue + seen.add(name) + cats = eval(cats) + if not any(cat.endswith(' births') or cat.endswith(' deaths') for cat in cats): + continue + dm = date_match(dates, cats) + if dm: + match[name] = (cats, match_name) + continue + print((name, match_name)) + print("cats =", cats) + print(('match' if dm else 'no match')) + for field in ['birth', 'death']: + print(field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)], end=' ') + print() +# print '---' + + if not match: + continue + match_count+=1 +# articles.add(match.keys()[0]) + if len(match) != 1: + print(count, match_count) + print(fmt_line(get_subfields(line, 'abcd'))) + for name, (cats, match_name) in match.items(): + print(name, cats, match_name) + print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_')) + print() + continue +# print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd')) + assert len(match) == 1 + print(match_count) + +#test_year_approx_match() +#db_marc_lookup() +#test_date_match() diff --git a/ia-legacy-importer/wikipedia/uniq.py b/ia-legacy-importer/wikipedia/uniq.py new file mode 100644 index 00000000..95009e6a --- /dev/null +++ b/ia-legacy-importer/wikipedia/uniq.py @@ -0,0 +1,2 @@ +from __future__ import print_function +print(len(set(eval(l)[1] for l in open('matches2')))) diff --git a/ia-legacy-importer/works/__init__.py b/ia-legacy-importer/works/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ia-legacy-importer/works/add_fields_to_works.py 
b/ia-legacy-importer/works/add_fields_to_works.py new file mode 100755 index 00000000..b9904434 --- /dev/null +++ b/ia-legacy-importer/works/add_fields_to_works.py @@ -0,0 +1,186 @@ +#!/usr/local/bin/python2.5 +from __future__ import print_function +import sys +import re +import codecs +sys.path.append('/home/edward/src/olapi') +from olapi import OpenLibrary +import simplejson as json +from collections import defaultdict +from catalog.read_rc import read_rc +from catalog.utils.query import query, query_iter, set_staging, base_url +from catalog.utils import mk_norm, get_title +from six.moves import urllib + +import six + + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +set_staging(True) + +rc = read_rc() + +ol = OpenLibrary(base_url()) +ol.login('EdwardBot', rc['EdwardBot']) + +re_year = re.compile('(\d{3,})$') + +queue = [] + +def iter_works(fields): + q = { 'type':'/type/work', 'key': None } + for f in fields: q[f] = None + return query_iter(q) + +def dates(): + global queue + f = 'first_publish_date' + for w in iter_works([f, 'title']): + if f in w: + continue + q = { 'type':'/type/edition', 'works': w['key'], 'publish_date': None } + years = defaultdict(list) + for e in query_iter(q): + date = e.get('publish_date', None) + if not date or date == '0000': + continue + m = re_year.match(date) + if not m: + continue + year = int(m.group(1)) + years[year].append(e['key']) + if not years: + continue + first = min(years.keys()) + assert first != 0 + print((w['key'], repr(w['title']), first)) + q = { + 'key': w['key'], + f: { 'connect': 'update', 'value': str(first)} + } + queue.append(q) + if len(queue) == 200: + print(ol.write(queue, comment='add first publish date')) + queue = [] + print(ol.write(queue, comment='add first publish date')) + +def lang(): + f = 'original_languages' + queue = [] + for w in iter_works([f, 'title']): + if f in w and w[f]: + continue + q = { + 'type':'/type/edition', + 'works': w['key'], + 'languages': None, + 'title': None, + 'title_prefix': None + } + editions = [e for e in query_iter(q) if e['languages']] + title = mk_norm(w['title']) + if not editions or any(len(e['languages']) != 1 for e in editions): + continue + lang = [e['languages'][0]['key'] for e in editions if mk_norm(get_title(e)) == title] + if len(lang) < 2: + continue + first = lang[0] + if any(l != first for l in lang): + continue + print((w['key'], repr(w['title']), first, len(lang))) + q = { + 'key': w['key'], + f: { 'connect': 'update_list', 'value': [first]} + } + queue.append(q) + if len(queue) == 200: + print(ol.write(queue, comment='add original language')) + queue = [] + print(ol.write(queue, comment='add original language')) + +def toc_items(toc_list): + return [{'title': item, 'type': '/type/toc_item'} for item in toc_list] + +def add_fields(): + comment = 'add fields to works' + queue = [] + seen = set() + fields = ['genres', 'first_sentence', 'dewey_number', \ + 'lc_classifications', 'publish_date'] #, 'table_of_contents'] + for w in iter_works(fields + ['title']): + if w['key'] in seen or all(w.get(f, None) for f in fields): + continue + seen.add(w['key']) + q = { 'type':'/type/edition', 'works': w['key']} + for f in fields: q[f] = None + editions = list(query_iter(q)) + + found = {} + + for f in fields: + if not w.get(f, None): + if f == 'publish_date': + years = defaultdict(list) + for e in editions: + date = e.get(f, None) + if not date or date == '0000': + continue + m = re_year.match(date) + if not m: + continue + year = int(m.group(1)) + years[year].append(e['key']) + if 
years: + found[f] = str(min(years.keys())) + continue + if f == 'genres': + found_list = [[g.strip('.') for g in e[f]] for e in editions \ + if e.get(f, None) and not any('translation' in i for i in e[f])] + if f == 'table_of_contents': + found_list = [] + for e in query_iter(q): + if not e.get(f, None): + continue + toc = e[f] + print(e['key'], toc) + print(e) + print() + if isinstance(toc[0], six.string_types): + found_list.append(toc_items(toc)) + else: + assert isinstance(toc[0], dict) + if toc[0]['type'] == '/type/text': + found_list.append(toc_items([i['value'] for i in toc])) + else: + assert toc[0]['type']['key'] == '/type/toc_item' + found_list.append(toc) + else: + found_list = [e[f] for e in query_iter(q) if e.get(f, None)] + if found_list: + first = found_list[0] + if all(i == first for i in found_list): + found[f] = first + + if not found: + continue + + print(len(queue) + 1, w['key'], len(editions), w['title']) + print(found) + + q = { 'key': w['key'], } + for f in fields: + if not f in found: + continue + if f == 'publish_date': + q['first_publish_date'] = { 'connect': 'update', 'value': found[f]} + elif f == 'first_sentence': + q[f] = { 'connect': 'update', 'value': found[f]} + else: + q[f] = { 'connect': 'update_list', 'value': found[f]} + queue.append(q) + if len(queue) == 200: + print(ol.write(queue, comment=comment)) + queue = [] + print(ol.write(queue, comment=comment)) + +add_fields() diff --git a/ia-legacy-importer/works/by_author.py b/ia-legacy-importer/works/by_author.py new file mode 100755 index 00000000..668d3968 --- /dev/null +++ b/ia-legacy-importer/works/by_author.py @@ -0,0 +1,284 @@ +#!/usr/local/bin/python2.5 +from __future__ import print_function +import re +import sys +import codecs +import web +from openlibrary.catalog.get_ia import get_from_archive +from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields +from openlibrary.catalog.utils.query import query_iter, set_staging, query +from openlibrary.catalog.utils import cmp, mk_norm +from openlibrary.catalog.read_rc import read_rc +from collections import defaultdict + +from catalog.utils.edit import fix_edition +from olapi import OpenLibrary, Reference +import olapi +from six.moves import urllib + +import six + + +rc = read_rc() + +ol = OpenLibrary("http://dev.openlibrary.org") +ol.login('EdwardBot', rc['EdwardBot']) + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$') + +base_url = "http://dev.openlibrary.org" +query_url = base_url + "/query.json?query=" + +work_num = 184076 + +set_staging(True) + +def withKey(key): + url = base_url + key + ".json" + return urllib.request.urlopen(url).read() + +def find_new_work_key(): + global work_num + while True: + key = "/w/OL%dW" % work_num + ret = withKey(key) + if ret.startswith("Not Found:"): + return work_num + work_num += 1 + +def next_work_key(): + global work_num + key = "/w/OL%dW" % work_num + ret = withKey(key) + while not ret.startswith("Not Found:"): + work_num += 1 + key = "/w/OL%dW" % work_num + ret = withKey(key) + work_num += 1 + return key + +# sample title: The Dollar Hen (Illustrated Edition) (Dodo Press) +re_parens = re.compile('^(.*?)(?: \(.+ (?:Edition|Press)\))+$') + +def top_rev_wt(d): + d_sorted = sorted(d.keys(), cmp=lambda i, j: cmp(d[j], d[i]) or cmp(len(j), len(i))) + return d_sorted[0] + +def books_query(akey): # live version + q = { + 'type':'/type/edition', + 'authors': akey, + '*': None + } + 
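# fields set to None are returned by the query API; '*': None appears to ask for every field of each matching edition (an assumption based on how query_iter is used elsewhere in this importer)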
return query_iter(q) + +def freq_dict_top(d): + return sorted(d.keys(), reverse=True, key=lambda i:d[i])[0] + + +def get_work_title(e): + if e['key'] not in marc: + assert not e.get('work_titles', []) + return +# assert e.get('work_titles', []) + data = marc[e['key']][1] + line = get_first_tag(data, set(['240'])) + if not line: + assert not e.get('work_titles', []) + return + return ' '.join(get_subfield_values(line, ['a'])).strip('. ') + +def get_books(akey): + for e in books_query(akey): + if not e.get('title', None): + continue + if len(e.get('authors', [])) != 1: + continue +# if 'works' in e: +# continue + if 'title_prefix' in e and e['title_prefix']: + prefix = e['title_prefix'] + if prefix[-1] != ' ': + prefix += ' ' + title = prefix + e['title'] + else: + title = e['title'] + + title = title.strip(' ') + if has_dot(title): + title = title[:-1] + if title.strip('. ') in ['Publications', 'Works', 'Report', \ + 'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence']: + continue + + m = re_parens.match(title) + if m: + title = m.group(1) + + n = mk_norm(title) + + book = { + 'title': title, + 'norm_title': n, + 'key': e['key'], + } + + if 'languages' in e: + book['lang'] = [l['key'][3:] for l in e['languages']] + + if e.get('table_of_contents', None): + if isinstance(e['table_of_contents'][0], six.string_types): + book['table_of_contents'] = e['table_of_contents'] + else: + assert isinstance(e['table_of_contents'][0], dict) + if e['table_of_contents'][0]['type'] == '/type/text': + book['table_of_contents'] = [i['value'] for i in e['table_of_contents']] + + wt = get_work_title(e) + if not wt: + yield book + continue + if wt in ('Works', 'Selections'): + yield book + continue + n_wt = mk_norm(wt) + book['work_title'] = wt + book['norm_wt'] = n_wt + yield book + +def build_work_title_map(equiv, norm_titles): + # map of book titles to work titles + title_to_work_title = defaultdict(set) + for (norm_title, norm_wt), v in equiv.items(): + if v != 1: + title_to_work_title[norm_title].add(norm_wt) + + title_map = {} + for title, v in title_to_work_title.items(): + if len(v) == 1: + title_map[title] = list(v)[0] + continue + most_common_title = max(v, key=lambda i:norm_titles[i]) + if title != most_common_title: + title_map[title] = most_common_title + for i in v: + if i != most_common_title: + title_map[i] = most_common_title + return title_map + +def find_works(akey): + equiv = defaultdict(int) # title and work title pairs + norm_titles = defaultdict(int) # frequency of titles + books_by_key = {} + books = [] + rev_wt = defaultdict(lambda: defaultdict(int)) + + for book in get_books(akey): + if 'norm_wt' in book: + pair = (book['norm_title'], book['norm_wt']) + equiv[pair] += 1 + rev_wt[book['norm_wt']][book['work_title']] +=1 + norm_titles[book['norm_title']] += 1 + books_by_key[book['key']] = book + books.append(book) + + title_map = build_work_title_map(equiv, norm_titles) + + works = defaultdict(lambda: defaultdict(list)) + work_titles = defaultdict(list) + for b in books: + if 'eng' not in b.get('lang', []) and 'norm_wt' in b: + work_titles[b['norm_wt']].append(b['key']) + continue + n = b['norm_title'] + title = b['title'] + if n in title_map: + n = title_map[n] + title = top_rev_wt(rev_wt[n]) + works[n][title].append(b['key']) + + works = sorted([(sum(map(len, w.values() + [work_titles[n]])), n, w) for n, w in works.items()]) + + for work_count, norm, w in works: + if work_count < 2: + continue + first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0] + 
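# 'first' is the title variant attached to the largest number of editions in this group; it becomes the work title when the work is yielded below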
titles = defaultdict(int) + for key_list in w.values(): + for ekey in key_list: + b = books_by_key[ekey] + title = b['title'] + titles[title] += 1 + keys = work_titles[norm] + for values in w.values(): + keys += values + assert work_count == len(keys) + title = max(titles.keys(), key=lambda i:titles[i]) + toc = [(k, books_by_key[k].get('table_of_contents', None)) for k in keys] + yield {'title': first, 'editions': keys, 'toc': dict((k, v) for k, v in toc if v)} + +def print_works(works): + for w in works: + print(len(w['editions']), w['title']) + +def toc_items(toc_list): + return [{'title': six.text_type(item), 'type': Reference('/type/toc_item')} for item in toc_list] + +def add_works(akey, works): + queue = [] + for w in works: + w['key'] = next_work_key() + q = { + 'authors': [akey], + 'create': 'unless_exists', + 'type': '/type/work', + 'key': w['key'], + 'title': w['title'] + } + #queue.append(q) + print(ol.write(q, comment='create work')) + for ekey in w['editions']: + e = ol.get(ekey) + fix_edition(ekey, e, ol) + e['works'] = [Reference(w['key'])] + try: + ol.save(ekey, e, 'found a work') + except olapi.OLError: + print(ekey) + print(e) + raise + +def by_authors(): + find_new_work_key() + + skipping = False + skipping = True + q = { 'type':'/type/author', 'name': None, 'works': None } + for a in query_iter(q, offset=215000): + akey = a['key'] + if skipping: + print('skipping:', akey, a['name']) + if akey == '/a/OL218496A': + skipping = False + continue + + q = { + 'type':'/type/work', + 'authors': akey, + } + if query(q): + print((akey, repr(a['name']), 'has works')) + continue + + # print akey, a['name'] + found = find_works(akey) + works = [i for i in found if len(i['editions']) > 2] + if works: + #open('found/' + akey[3:], 'w').write(repr(works)) + print((akey, repr(a['name']))) + #print_works(works) + add_works(akey, works) + print() + +by_authors() \ No newline at end of file diff --git a/ia-legacy-importer/works/find.py b/ia-legacy-importer/works/find.py new file mode 100755 index 00000000..2a54d9ef --- /dev/null +++ b/ia-legacy-importer/works/find.py @@ -0,0 +1,136 @@ +from __future__ import print_function +import web +import re +import sys +from catalog.read_rc import read_rc +from catalog.infostore import get_site +#from catalog.db_read import get_things, withKey +from catalog.amazon.other_editions import find_others + +rc = read_rc() + +re_translation_of = re.compile('^Translation of\b[: ]*([^\n]*?)\.?$', re.I | re.M) + +site = get_site() + +def isbn_link(i): + return '%s (Amazon.com)' % (i, i, i) + +def ol_link(key): + return '%s' % (key, key) + +def search(title, author): + q = { 'type': '/type/author', 'name': author } + print(q) + authors = site.things(q) + print(authors) + seen = set() + pool = set() +# for a in authors: +# q = { 'type': '/type/edition', 'authors': a, 'title': title } +# pool.update(site.things(q)) + found_titles = {} + found_isbn = {} + author_keys = ','.join("'%s'" % a for a in authors) + + print(author_keys) + iter = web.query("select id, key from thing where thing.id in (select thing_id from edition_ref, thing where edition_ref.key_id=11 and edition_ref.value = thing.id and thing.key in (" + author_keys + "))") + key_to_id = {} + id_to_key = {} + for row in iter: + print(row) + key_to_id[row.key] = row.id + id_to_key[row.id] = row.key + + + iter = web.query("select thing_id, edition_str.value as title from edition_str where key_id=3 and thing_id in (select thing_id from edition_ref, thing where edition_ref.key_id=11 and edition_ref.value = 
thing.id and thing.key in (" + author_keys + "))") + id_to_title = {} + title_to_key = {} + for row in iter: + print(row) + t = row.title.lower().strip('.') + id_to_title[row.thing_id] = row.title + title_to_key.setdefault(t, []).append(id_to_key[row.thing_id]) + + if title.lower() not in title_to_key: + print('title not found') + return + + pool = set(title_to_key[title.lower()]) + + editions = [] + while pool: + key = pool.pop() + print(key) + seen.add(key) + e = site.withKey(key) + translation_of = None + if e.notes: + m = re_translation_of.search(e.notes) + if m: + translation_of = m.group(1).lower() + pool.update(k for k in title_to_key[translation_of] if k not in seen) + found_titles.setdefault(translation_of, []).append(key) + if e.isbn_10: + for i in e.isbn_10: + found_isbn.setdefault(i, []).append(key) + join_isbn = ', '.join(map(isbn_link, e.isbn_10)) + else: + join_isbn = '' + rec = { + 'key': key, + 'publish_date': e.publish_date, + 'publishers': ', '.join(p.encode('utf-8') for p in (e.publishers or [])), + 'isbn': join_isbn, + } + editions.append(rec) + + if e.work_titles: + for t in e.work_titles: + t=t.strip('.') + pool.update(k for k in title_to_key.get(t.lower(), []) if k not in seen) + found_titles.setdefault(t, []).append(key) + if e.other_titles: + for t in e.other_titles: + t=t.strip('.') + pool.update(k for k in title_to_key.get(t.lower(), []) if k not in seen) + found_titles.setdefault(t, []).append(key) + + print('') + for e in sorted(editions, key=lambda e: e['publish_date'] and e['publish_date'][-4:]): + print('') + print('') + print('') + print('') + print('
', ol_link(e['key']))
+        print('', e['publish_date'], '', e['publishers'], '', e['isbn'], '')
+
+    if found_titles:
+        print('Other titles')
+        print('')
+        for k, v in found_titles.iteritems():
+            if k == title:
+                continue
+            print('• %s' % (k, author, k), end=' ')
+            print('from', ', '.join(ol_link(i) for i in v))
+        print('')
+
+    extra_isbn = {}
+    for k, v in found_isbn.iteritems():
+        for isbn, note in find_others(k, rc['amazon_other_editions']):
+            if note.lower().find('audio') != -1:
+                continue
+            if isbn not in found_isbn:
+                extra_isbn.setdefault(isbn, []).extend(v)
+
+    if extra_isbn:
+        print('Other ISBN')
+        print('')
+        for k in sorted(extra_isbn):
+            print('• ', isbn_link(k), end=' ')
+            print('from', ', '.join(ol_link(i) for i in extra_isbn[k]))
+        print('
    ') + +title = 'Journey to the centre of the earth' +author = 'Jules Verne' +search(title, author) diff --git a/ia-legacy-importer/works/find_other_editions.py b/ia-legacy-importer/works/find_other_editions.py new file mode 100755 index 00000000..be8dfc1a --- /dev/null +++ b/ia-legacy-importer/works/find_other_editions.py @@ -0,0 +1,65 @@ +#!/usr/local/bin/python2.5 +from __future__ import print_function +import sys +import codecs +from catalog.merge.names import match_name +from catalog.utils import fmt_author, get_title, mk_norm +from catalog.utils.query import query_iter, set_staging, withKey + +# find duplicate authors and other editions of works + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +set_staging(True) + +def other_editions(title, wkey, work_author): + # look for other editions with the same title + wakey = work_author['key'] + q = { 'type': '/type/edition', 'title': title } + for k in 'works', 'title_prefix', 'key', 'authors': + q[k] = None + found = [] + for e in query_iter(q): + if not e.get('authors', None): + continue + if e.get('works', None) and any(i['key'] == wkey for i in e['works']): + continue + if any(i['key'] == wakey for i in e['authors']): + continue + for akey in (a['key'] for a in e.get('authors', [])): + a = withKey(akey) + name = a.get('name', '') + if match_name(name, work_author['name'], last_name_only_ok=True): + yield (e, a) + +q = { 'type':'/type/work' } +for k in 'key', 'title', 'authors': + q[k] = None + +for w in query_iter(q): + wkey = w['key'] + titles = set([w['title']]) + q = { 'type': '/type/edition', 'works': wkey } + for k in 'title', 'title_prefix', 'key', 'authors': + q[k] = None + + wakey = w['authors'][0]['key'] + work_author = withKey(wakey) + + for e in query_iter(q): + if not e.get('title', None): + continue + titles.update([get_title(e), e['title']]) + + found = [] + for title in titles: + found += list(other_editions(title, wkey, work_author)) + + if not found: + continue + print(w) + print(titles) + print(wakey + ':', fmt_author(work_author)) + for e, a in found: + print(' ', a['key'] + ": ", fmt_author(a)) + print(' ', e) + print() diff --git a/ia-legacy-importer/works/find_work_for_edition.py b/ia-legacy-importer/works/find_work_for_edition.py new file mode 100644 index 00000000..a108adb1 --- /dev/null +++ b/ia-legacy-importer/works/find_work_for_edition.py @@ -0,0 +1,43 @@ +from __future__ import print_function +# try and find an existing work for a book + +from openlibrary.api import OpenLibrary +from openlibrary.catalog.utils import mk_norm +import sys +from time import time + +ol = OpenLibrary("http://openlibrary.org") + +def find_matching_work(e): + norm_title = mk_norm(e['title']) + + seen = set() + for akey in e['authors']: + q = { + 'type':'/type/work', + 'authors': {'author': {'key': akey}}, + 'limit': 0, + 'title': None, + } + t0 = time() + work_keys = list(ol.query(q)) + t1 = time() - t0 + print('time to find books by author: %.1f seconds' % t1) + for w in work_keys: + wkey = w['key'] + if wkey in seen: + continue + seen.add(wkey) + if not w.get('title'): + continue + if mk_norm(w['title']) == norm_title: + assert ol.query({'key': wkey, 'type': None})[0]['type'] == '/type/work' + return wkey + +def test_book(): + ekey = '/books/OL24335218M' + wkey = find_matching_work(ekey) + if wkey: + print('found match:', wkey) + else: + print('no match') diff --git a/ia-legacy-importer/works/find_works.py b/ia-legacy-importer/works/find_works.py new file mode 100755 index 00000000..cf158cb1 --- /dev/null +++ 
b/ia-legacy-importer/works/find_works.py @@ -0,0 +1,799 @@ +#!/usr/bin/python +# find works and create pages on production + +from __future__ import print_function +import re +import simplejson as json +import sys +import web + +from collections import defaultdict +from lxml import etree +from time import sleep, time, strftime + +from openlibrary.api import OpenLibrary +from openlibrary.catalog.get_ia import get_from_archive, get_data +from openlibrary.catalog.importer.db_read import get_mc +from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary +from openlibrary.catalog.marc.marc_subject import get_work_subjects, four_types +from openlibrary.catalog.read_rc import read_rc +from openlibrary.catalog.utils import cmp, mk_norm +from openlibrary.catalog.utils.edit import fix_edition +from openlibrary.catalog.utils.query import query_iter, withKey +from openlibrary.solr.update_work import update_work, solr_update, update_author + +import six +from six.moves import urllib +from six.moves.urllib.request import urlopen + + +ol = OpenLibrary("http://openlibrary.org") + +re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$') +re_work_key = re.compile(r'^/works/OL(\d+)W$') +re_lang_key = re.compile('^/(?:l|languages)/([a-z]{3})$') +re_author_key = re.compile(r'^/(?:a|authors)/(OL\d+A)$') + +re_ia_marc = re.compile(r'^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$') + +ns = '{http://www.loc.gov/MARC21/slim}' +ns_leader = ns + 'leader' +ns_data = ns + 'datafield' + +def has_dot(s): + return s.endswith('.') and not re_skip.search(s) + +def get_with_retry(k): + for attempt in range(50): + try: + return ol.get(k) + except: + pass + print('retry') + sleep(5) + return ol.get() + +#set_staging(True) + +# sample title: The Dollar Hen (Illustrated Edition) (Dodo Press) +re_parens = re.compile(r'^(.*?)(?: \(.+ (?:Edition|Press|Print|Plays|Collection|Publication|Novels|Mysteries|Book Series|Classics Library|Classics|Books)\))+$', re.I) + +def top_rev_wt(d): + d_sorted = sorted(d.keys(), cmp=lambda i, j: cmp(d[j], d[i]) or cmp(len(j), len(i))) + return d_sorted[0] + +def books_query(akey): # live version + q = { + 'type':'/type/edition', + 'authors': akey, + 'source_records': None, + 'title': None, + 'work_title': None, + 'table_of_contents': None, + 'languages': None, + 'title_prefix': None, + 'subtitle': None, + } + return query_iter(q) + +def freq_dict_top(d): + return sorted(d.keys(), reverse=True, key=lambda i:d[i])[0] + +def get_marc_src(e, mc): + if mc and mc.startswith('amazon:'): + mc = None + if mc and mc.startswith('ia:'): + yield 'ia', mc[3:] + elif mc: + m = re_ia_marc.match(mc) + if m: + yield 'ia', m.group(1) + else: + yield 'marc', mc + source_records = e.get('source_records', []) + if not source_records: + return + for src in source_records: + if src.startswith('ia:'): + if not mc or src != mc: + yield 'ia', src[3:] + continue + if src.startswith('marc:'): + if not mc or src != 'marc:' + mc: + yield 'marc', src[5:] + continue + +def get_ia_work_title(ia): + # FIXME: rewrite to use MARC binary + url = 'http://www.archive.org/download/' + ia + '/' + ia + '_marc.xml' + try: + root = etree.parse(urlopen(url)).getroot() + except KeyboardInterrupt: + raise + except: + return + e = root.find(ns_data + "[@tag='240']") + if e is None: + return + wt = ' '.join(s.text for s in e if s.attrib['code'] == 'a' and s.text) + return wt + +def get_work_title(e, mc): + # use first work title we find in source MARC 
records + wt = None + for src_type, src in get_marc_src(e, mc): + if src_type == 'ia': + wt = get_ia_work_title(src) + if wt: + wt = wt.strip('. ') + if wt: + break + continue + assert src_type == 'marc' + data = None + try: + data = get_data(src) + except ValueError: + print('bad record source:', src) + print('http://openlibrary.org' + e['key']) + continue + except urllib.error.HTTPError as error: + print('HTTP error:', error.code, error.msg) + print(e['key']) + if not data: + continue + is_marc8 = data[9] != 'a' + try: + line = get_first_tag(data, set(['240'])) + except BadDictionary: + print('bad dictionary:', src) + print('http://openlibrary.org' + e['key']) + continue + if line: + wt = ' '.join(get_subfield_values(line, ['a'], is_marc8)).strip('. ') + break + if wt: + return wt + for f in 'work_titles', 'work_title': + e_wt = e.get(f, []) + if e_wt: + assert isinstance(e_wt, list) + return e_wt[0].strip('. ') + +# don't use any of these as work titles +bad_titles = ['Publications', 'Works. English', 'Missal', 'Works', 'Report', \ + 'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence', \ + 'Bill', 'Bills', 'Selections', 'Selected works', 'Selected works. English', \ + 'The Novels', 'Laws, etc'] + +def get_books(akey, query, do_get_mc=True): + for e in query: + try: + if not e.get('title', None): + continue + except: + print(e) +# if len(e.get('authors', [])) != 1: +# continue + if 'title_prefix' in e and e['title_prefix']: + prefix = e['title_prefix'] + if prefix[-1] != ' ': + prefix += ' ' + title = prefix + e['title'] + else: + title = e['title'] + + title = title.strip(' ') + if has_dot(title): + title = title[:-1] + + m = re_parens.match(title) + if m: + title = m.group(1) + + n = mk_norm(title) + + book = { + 'title': title, + 'norm_title': n, + 'key': e['key'], + } + + lang = e.get('languages', []) + if lang: + book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang] + + if e.get('table_of_contents', None): + if isinstance(e['table_of_contents'][0], six.string_types): + book['table_of_contents'] = e['table_of_contents'] + else: + assert isinstance(e['table_of_contents'][0], dict) + if e['table_of_contents'][0].get('type', None) == '/type/text': + book['table_of_contents'] = [i['value'] for i in e['table_of_contents']] + if 'subtitle' in e: + book['subtitle'] = e['subtitle'] + + if 'source_records' in e: + book['source_records'] = e['source_records'] + + mc = get_mc(e['key']) if do_get_mc else None + wt = get_work_title(e, mc) + if not wt: + yield book + continue + if wt in bad_titles: + yield book + continue + n_wt = mk_norm(wt) + book['work_title'] = wt + book['norm_wt'] = n_wt + yield book + +def build_work_title_map(equiv, norm_titles): + # map of normalized book titles to normalized work titles + if not equiv: + return {} + title_to_work_title = defaultdict(set) + for (norm_title, norm_wt), v in equiv.items(): + if v != 1: + title_to_work_title[norm_title].add(norm_wt) + + title_map = {} + for norm_title, work_titles in title_to_work_title.items(): + if len(work_titles) == 1: + title_map[norm_title] = list(work_titles)[0] + continue + most_common_title = max(work_titles, key=lambda i:norm_titles[i]) + if norm_title != most_common_title: + title_map[norm_title] = most_common_title + for work_title in work_titles: + if work_title != most_common_title: + title_map[work_title] = most_common_title + return title_map + +def get_first_version(key): + url = 'http://openlibrary.org' + key + '.json?v=1' + try: + return json.load(urlopen(url)) + except: + 
print(url) + raise + +def get_existing_works(akey): + q = { + 'type':'/type/work', + 'authors': {'author': {'key': akey}}, + 'limit': 0, + } + seen = set() + for wkey in ol.query(q): + if wkey in seen: + continue # skip dups + if wkey.startswith('DUP'): + continue + try: + w = get_with_retry(wkey) + except: + print(wkey) + raise + if w['type'] in ('/type/redirect', '/type/delete'): + continue + if w['type'] != '/type/work': + print('infobase error, should only return works') + print(q) + print(w['key']) + assert w['type'] == '/type/work' + yield w + +def find_title_redirects(akey): + title_redirects = {} + for w in get_existing_works(akey): + try: + norm_wt = mk_norm(w['title']) + except: + print(w['key']) + raise + q = {'type':'/type/redirect', 'location': str(w['key']), 'limit': 0} + try: + query_iter = ol.query(q) + except: + print(q) + raise + for r in map(get_first_version, query_iter): + redirect_history = json.load(urlopen('http://openlibrary.org%s.json?m=history' % r['key'])) + if any(v['author'].endswith('/WorkBot') and v['comment'] == "merge works" for v in redirect_history): + continue + #print 'redirect:', r + if mk_norm(r['title']) == norm_wt: + continue + if r['title'] in title_redirects: + assert title_redirects[r['title']] == w['title'] + #print 'redirect:', r['key'], r['title'], 'work:', w['key'], w['title'] + title_redirects[r['title']] = w['title'] + return title_redirects + +def find_works2(book_iter): + var = {} + var['equiv'] = defaultdict(int) # normalized title and work title pairs + var['norm_titles'] = defaultdict(int) # frequency of titles + var['books_by_key'] = {} + var['books'] = [] + # normalized work title to regular title + var['rev_wt'] = defaultdict(lambda: defaultdict(int)) + + for book in book_iter: + if 'norm_wt' in book: + pair = (book['norm_title'], book['norm_wt']) + var['equiv'][pair] += 1 + var['rev_wt'][book['norm_wt']][book['work_title']] +=1 + var['norm_titles'][book['norm_title']] += 1 # used to build title_map + var['books_by_key'][book['key']] = book + var['books'].append(book) + + return var + +def find_works3(var, existing={}): + title_map = build_work_title_map(var['equiv'], var['norm_titles']) + + for a, b in existing.items(): + norm_a = mk_norm(a) + norm_b = mk_norm(b) + var['rev_wt'][norm_b][norm_a] +=1 + title_map[norm_a] = norm_b + + var['works'] = defaultdict(lambda: defaultdict(list)) + var['work_titles'] = defaultdict(list) + for b in var['books']: + if 'eng' not in b.get('lang', []) and 'norm_wt' in b: + var['work_titles'][b['norm_wt']].append(b['key']) + n = b['norm_title'] + title = b['title'] + if n in title_map: + n = title_map[n] + title = top_rev_wt(var['rev_wt'][n]) + var['works'][n][title].append(b['key']) + +def find_work_sort(var): + def sum_len(n, w): + # example n: 'magic' + # example w: {'magic': ['/books/OL1M', ... 
'/books/OL4M']} + # example work_titles: {'magic': ['/books/OL1M', '/books/OL3M']} + return sum(len(i) for i in w.values() + [var['work_titles'][n]]) + return sorted([(sum_len(n, w), n, w) for n, w in var['works'].items()]) + +def find_works(book_iter, existing={}, do_get_mc=True): + + var = find_works2(book_iter) + find_works3(var, existing) + + works = find_work_sort(var) + + for work_count, norm, w in works: + first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0] + titles = defaultdict(int) + for key_list in w.values(): + for ekey in key_list: + b = var['books_by_key'][ekey] + title = b['title'] + titles[title] += 1 + keys = var['work_titles'][norm] + for values in w.values(): + keys += values + assert work_count == len(keys) + title = max(titles.keys(), key=lambda i:titles[i]) + toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None)) for k in keys) + toc = dict((k, v) for k, v in toc_iter if v) + # sometimes keys contains duplicates + editions = [var['books_by_key'][k] for k in set(keys)] + subtitles = defaultdict(lambda: defaultdict(int)) + edition_count = 0 + with_subtitle_count = 0 + for e in editions: + edition_count += 1 + subtitle = e.get('subtitle') or '' + if subtitle != '': + with_subtitle_count += 1 + norm_subtitle = mk_norm(subtitle) + if norm_subtitle != norm: + subtitles[norm_subtitle][subtitle] += 1 + use_subtitle = None + for k, v in subtitles.iteritems(): + lc_k = k.strip(' .').lower() + if lc_k in ('', 'roman') or 'edition' in lc_k: + continue + num = sum(v.values()) + overall = float(num) / float(edition_count) + ratio = float(num) / float(with_subtitle_count) + if overall > 0.2 and ratio > 0.5: + use_subtitle = freq_dict_top(v) + w = {'title': first, 'editions': editions} + if use_subtitle: + w['subtitle'] = use_subtitle + if toc: + w['toc'] = toc + try: + subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc)) + except: + print(w) + raise + if subjects: + w['subjects'] = subjects + yield w + +def print_works(works): + for w in works: + print(len(w['editions']), w['title']) + print(' ', [e['key'] for e in w['editions']]) + print(' ', w.get('subtitle', None)) + print(' ', w.get('subjects', None)) + + +def books_from_cache(): + for line in open('book_cache'): + yield eval(line) + +def add_subjects_to_work(subjects, w): + mapping = { + 'subject': 'subjects', + 'place': 'subject_places', + 'time': 'subject_times', + 'person': 'subject_people', + } + for k, v in subjects.items(): + k = mapping[k] + subjects = [i[0] for i in sorted(v.items(), key=lambda i:i[1], reverse=True) if i != ''] + existing_subjects = set(w.get(k, [])) + w.setdefault(k, []).extend(s for s in subjects if s not in existing_subjects) + if w.get(k): + w[k] = [six.text_type(i) for i in w[k]] + try: + assert all(i != '' and not i.endswith(' ') for i in w[k]) + except AssertionError: + print('subjects end with space') + print(w) + print(subjects) + raise + +def add_detail_to_work(i, j): + if 'subtitle' in i: + j['subtitle'] = i['subtitle'] + if 'subjects' in i: + add_subjects_to_work(i['subjects'], j) + +def fix_up_authors(w, akey, editions): + print('looking for author:', akey) + #print (w, akey, editions) + seen_akey = False + need_save = False + for a in w.get('authors', []): + print('work:', w['key']) + obj = withKey(a['author']['key']) + if obj['type']['key'] == '/type/redirect': + a['author']['key'] = obj['location'] + print(obj['key'], 'redirects to', obj['location']) + #a['author']['key'] = '/authors/' + re_author_key.match(a['author']['key']).group(1) + 
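# after following the redirect the author key should already be in canonical /authors/ form; the assert below enforces that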
assert a['author']['key'].startswith('/authors/') + obj = withKey(a['author']['key']) + assert obj['type']['key'] == '/type/author' + need_save = True + if akey == a['author']['key']: + seen_akey = True + if seen_akey: + if need_save: + print('need save:', a) + return need_save + try: + ekey = editions[0]['key'] + except: + print('editions:', editions) + raise + #print 'author %s missing. copying from first edition %s' % (akey, ekey) + #print 'before:' + for a in w.get('authors', []): + print(a) + e = withKey(ekey) + #print e + if not e.get('authors', None): + print('no authors in edition') + return + print('authors from first edition', e['authors']) + w['authors'] = [{'type':'/type/author_role', 'author':a} for a in e['authors']] + #print 'after:' + #for a in w['authors']: + # print a + return True + +def new_work(akey, w, do_updates, fh_log): + ol_work = { + 'title': w['title'], + 'type': '/type/work', + 'authors': [{'type':'/type/author_role', 'author': akey}], + } + add_detail_to_work(w, ol_work) + print(ol_work, file=fh_log) + if do_updates: + for attempt in range(5): + try: + wkey = ol.new(ol_work, comment='work found') + break + except: + if attempt == 4: + raise + print('retrying: %d attempt' % attempt) + print('new work:', wkey, repr(w['title']), file=fh_log) + else: + print('new work:', repr(w['title']), file=fh_log) + update = [] + for e in w['editions']: + try: + e = ol.get(e['key']) + except: + print('edition:', e['key']) + raise + if do_updates: + e['works'] = [{'key': wkey}] + assert e['type'] == '/type/edition' + update.append(e) + if do_updates: + print(ol.save_many(update, "add editions to new work"), file=fh_log) + return [wkey] + return [] + +def fix_toc(e): + toc = e.get('table_of_contents') + if not toc: + return + try: + if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item': + return + except: + print('toc') + print(toc) + print(repr(toc)) + return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i] + +def update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log): + work_updated = [] + best = w['best_match']['key'] + update = [] + subjects_from_existing_works = defaultdict(set) + for wkey in w['existing_works'].iterkeys(): + if wkey == best: + continue + existing = get_with_retry(wkey) + for k in 'subjects', 'subject_places', 'subject_times', 'subject_people': + if existing.get(k): + subjects_from_existing_works[k].update(existing[k]) + + update.append({'type': '/type/redirect', 'location': best, 'key': wkey}) + work_updated.append(wkey) + + for wkey in w['existing_works'].iterkeys(): + editions = set(work_to_edition[wkey]) + editions.update(e['key'] for e in w['editions']) + for ekey in editions: + e = get_with_retry(ekey) + e['works'] = [{'key': best}] + authors = [] + for akey in e['authors']: + a = get_with_retry(akey) + if a['type'] == '/type/redirect': + m = re_author_key.match(a['location']) + akey = '/authors/' + m.group(1) + authors.append({'key': str(akey)}) + e['authors'] = authors + new_toc = fix_toc(e) + if new_toc: + e['table_of_contents'] = new_toc + update.append(e) + + cur_work = w['best_match'] + need_save = fix_up_authors(cur_work, akey, w['editions']) + if any(subjects_from_existing_works.values()): + need_save = True + if need_save or cur_work['title'] != w['title'] \ + or ('subtitle' in w and 'subtitle' not in cur_work) \ + or ('subjects' in w and 'subjects' not in cur_work): + if cur_work['title'] != w['title']: + print(( 'update work title:', best, repr(cur_work['title']), '->', 
repr(w['title']))) + existing_work = get_with_retry(best) + assert existing_work['type'] == '/type/work', "{type} == '/type/work'".format(**existing_work) + existing_work['title'] = w['title'] + for k, v in subjects_from_existing_works.items(): + existing_subjects = set(existing_work.get(k, [])) + existing_work.setdefault(k, []).extend(s for s in v if s not in existing_subjects) + add_detail_to_work(w, existing_work) + for a in existing_work.get('authors', []): + obj = withKey(a['author']) + if obj['type']['key'] != '/type/redirect': + continue + new_akey = obj['location'] + a['author'] = {'key': new_akey} + assert new_akey.startswith('/authors/') + obj = withKey(new_akey) + assert obj['type']['key'] == '/type/author' + print('existing:', existing_work, file=fh_log) + print('subtitle:', repr(existing_work['subtitle']) if 'subtitle' in existing_work else 'n/a', file=fh_log) + update.append(existing_work) + work_updated.append(best) + if do_updates: + try: + print(ol.save_many(update, 'merge works'), file=fh_log) + except: + for page in update: + print(page) + raise + return work_updated + +def update_works(akey, works, do_updates=False): + # we can now look up all works by an author + if do_updates: + rc = read_rc() + ol.login('WorkBot', rc['WorkBot']) + assert do_updates + + fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w') + works = list(works) + print(akey, file=fh_log) + print('works:', file=fh_log) + + while True: # until redirects repaired + q = {'type':'/type/edition', 'authors': akey, 'works': None} + work_to_edition = defaultdict(set) + edition_to_work = defaultdict(set) + for e in query_iter(q): + if not isinstance(e, dict): + continue + if e.get('works', None): + for w in e['works']: + work_to_edition[w['key']].add(e['key']) + edition_to_work[e['key']].add(w['key']) + + work_by_key = {} + fix_redirects = [] + for k, editions in work_to_edition.items(): + w = withKey(k) + if w['type']['key'] == '/type/redirect': + wkey = w['location'] + print('redirect found', w['key'], '->', wkey, editions, file=fh_log) + assert re_work_key.match(wkey) + for ekey in editions: + e = get_with_retry(ekey) + e['works'] = [{'key': wkey}] + fix_redirects.append(e) + continue + work_by_key[k] = w + if not fix_redirects: + print('no redirects left', file=fh_log) + break + print('save redirects', file=fh_log) + try: + ol.save_many(fix_redirects, "merge works") + except: + for r in fix_redirects: + print(r) + raise + + all_existing = set() + work_keys = [] + print('edition_to_work:', file=fh_log) + print(repr(dict(edition_to_work)), file=fh_log) + print(file=fh_log) + print('work_to_edition', file=fh_log) + print(repr(dict(work_to_edition)), file=fh_log) + print(file=fh_log) + +# open('edition_to_work', 'w').write(repr(dict(edition_to_work))) +# open('work_to_edition', 'w').write(repr(dict(work_to_edition))) +# open('work_by_key', 'w').write(repr(dict(work_by_key))) + + work_title_match = {} + works_by_title = {} + for w in works: # 1st pass + for e in w['editions']: + ekey = e['key'] if isinstance(e, dict) else e + for wkey in edition_to_work.get(ekey, []): + try: + wtitle = work_by_key[wkey]['title'] + except: + print('bad work:', wkey) + raise + if wtitle == w['title']: + work_title_match[wkey] = w['title'] + + wkey_to_new_title = defaultdict(set) + + for w in works: # 2nd pass + works_by_title[w['title']] = w + w['existing_works'] = defaultdict(int) + for e in w['editions']: + ekey = e['key'] if isinstance(e, dict) else e + for wkey in edition_to_work.get(ekey, []): + if 
wkey in work_title_match and work_title_match[wkey] != w['title']: + continue + wtitle = work_by_key[wkey]['title'] + w['existing_works'][wkey] += 1 + wkey_to_new_title[wkey].add(w['title']) + + existing_work_with_conflict = defaultdict(set) + + for w in works: # 3rd pass + for wkey, v in w['existing_works'].iteritems(): + if any(title != w['title'] for title in wkey_to_new_title[wkey]): + w['has_conflict'] = True + existing_work_with_conflict[wkey].add(w['title']) + break + + for wkey, v in existing_work_with_conflict.iteritems(): + cur_work = work_by_key[wkey] + existing_titles = defaultdict(int) + for ekey in work_to_edition[wkey]: + e = withKey(ekey) + title = e['title'] + if e.get('title_prefix', None): + title = e['title_prefix'].strip() + ' ' + e['title'] + existing_titles[title] += 1 + best_match = max(v, key=lambda wt: existing_titles[wt]) + works_by_title[best_match]['best_match'] = work_by_key[wkey] + for wtitle in v: + del works_by_title[wtitle]['has_conflict'] + if wtitle != best_match: + works_by_title[wtitle]['existing_works'] = {} + + def other_matches(w, existing_wkey): + return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']] + + works_updated_this_session = set() + + for w in works: # 4th pass + assert 'has_conflict' not in w, 'w: {}'.format(w) + if len(w['existing_works']) == 1: + existing_wkey = w['existing_works'].keys()[0] + if not other_matches(w, existing_wkey): + w['best_match'] = work_by_key[existing_wkey] + if 'best_match' in w: + updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log) + for wkey in updated: + if wkey in works_updated_this_session: + print(wkey, 'already updated!', file=fh_log) + print(wkey, 'already updated!') + works_updated_this_session.update(updated) + continue + if not w['existing_works']: + updated = new_work(akey, w, do_updates, fh_log) + for wkey in updated: + assert wkey not in works_updated_this_session + works_updated_this_session.update(updated) + continue + + assert not any(other_matches(w, wkey) for wkey in w['existing_works'].iterkeys()) + best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0] + w['best_match'] = work_by_key[best_match] + updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log) + for wkey in updated: + if wkey in works_updated_this_session: + print(wkey, 'already updated!', file=fh_log) + print(wkey, 'already updated!') + works_updated_this_session.update(updated) + + #if not do_updates: + # return [] + + return [withKey(key) for key in works_updated_this_session] + +if __name__ == '__main__': + akey = '/authors/' + sys.argv[1] + + title_redirects = find_title_redirects(akey) + works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) + to_update = update_works(akey, works, do_updates=True) + + requests = [] + for w in to_update: + requests += update_work(w) + + if to_update: + solr_update(requests + [''], debug=True) + + requests = update_author(akey) + solr_update(requests + [''], debug=True) diff --git a/ia-legacy-importer/works/from_sample.py b/ia-legacy-importer/works/from_sample.py new file mode 100644 index 00000000..397f52b9 --- /dev/null +++ b/ia-legacy-importer/works/from_sample.py @@ -0,0 +1,230 @@ +from __future__ import print_function +import web +import re +import sys +import codecs +from catalog.marc.fast_parse import * +from catalog.utils import pick_first_date +import catalog.marc.new_parser as parser + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +re_recording 
= re.compile('\x1f(hsound ?record|[hn] ?\[\[?(sound|video|phonodisc))', re.I) +re_end_dot = re.compile('[^ ][^ ]\.$', re.UNICODE) +re_marc_name = re.compile('^(.*), (.*)$') + +authors = {} +family_names = {} +by_author = {} +by_contrib = {} + +def remove_trailing_dot(s): + m = re_end_dot.search(s) + if m: + s = s[:-1] + return s + +def strip_q(q): + if q.endswith(').'): + q = q[:-1] + q = q.strip(' ()/,;:') + return q + +def read(data): + want = ['008', '041', '100', '110', '111', '130', '240', '245', '500', '700', '710', '711'] + fields = get_tag_lines(data, ['006', '008', '245', '260'] + want) + seen_008 = False + found = [] + for tag, line in fields: + if tag in want: + found.append((tag, line)) + if tag == '006': + if line[0] == 'm': # don't want electronic resources + return (fields, None) + continue + if tag == '008': + if seen_008: # dup + return (fields, None) + seen_008 = True + continue + if tag in ('240', '245', '260'): + if re_recording.search(line): # sound recording + return (fields, None) + continue + return (fields, found) + +def initials(s): + return [i[0] for i in s.split(' ')] + +def parse_person(line): + contents = get_person_content(line) + marc_orig = list(get_all_subfields(line)), + if not ('a' in contents or 'c' in contents): + return marc_orig, {} + assert 'a' in contents or 'c' in contents + + if 'd' in contents: + author = pick_first_date(contents['d']) + else: + author = {} + #author['marc_orig'] = list(get_all_subfields(line)), + for tag, f in [ ('b', 'numeration'), ('c', 'title') ]: + if tag in contents: + author[f] = ' '.join(x.strip(' /,;:') for x in contents[tag]) + + if 'a' in contents: + name = ' '.join(x.strip(' /,;:') for x in contents['a']) + name = remove_trailing_dot(name) + m = re_marc_name.match(name) + if m: + author['family_name'] = m.group(1) + author['given_names'] = m.group(2) + author['name'] = m.group(2) + ' ' + m.group(1) + else: + author['name'] = name + name_subfields = get_subfield_values(line, ['a', 'b', 'c']) + author['sort'] = ' '.join(v.strip(' /,;:') for v in name_subfields) + + + if 'q' in contents: + if len(contents['q']) != 1: + print(marc_orig) + assert len(contents['q']) == 1 + q = strip_q(contents['q'][0]) + if 'given_names' in authors: + assert initials(q) == initials(author['given_names']) \ + or q.startswith(author['given_names']) + author['given_names'] = q + return marc_orig, author + +def test_parse_person(): + line = '1 \x1faMoeran, E. J.\x1fq(Ernest John)\x1fq(1894-1950)\x1e' + person = ([('a', u'Moeran, E. J.'), ('q', u'(Ernest John)'), ('q', u'(1894-1950)')],) + parse_person(line) + +#test_parse_person() + +def full_title(line): + title = ' '.join(v for k, v in line if k in ('a', 'b')).strip(' /,;:') + return remove_trailing_dot(title) + +def test_strip_q(): + for i in ['(%s),', '(%s)', '(%s,']: + k = i % ('foo') + j = strip_q(k) + print(k, j) + assert j == 'foo' + + name = 'John X.' 
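+    # the trailing dot inside the parentheses is not in strip_q's strip set, so it must survive unchanged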
+ assert name == strip_q('(%s)' % name) + +def print_author(a): + for k in ('name', 'sort', 'numeration', 'title', 'given_names', 'family_name', 'birth_date', 'death_date'): + print("%12s: %s" % (k, author.get(k, ''))) + + +def person_as_tuple(p): + return tuple(p.get(i, None) for i in ('sort', 'birth_date', 'death_date')) + +def family_name(a): + if 'family_name' not in a: + return + this = a['family_name'] + family_names.setdefault(this, {}) + as_tuple = tuple(a.get(i, None) for i in ('sort', 'birth_date', 'death_date')) + as_tuple = person_as_tuple(a) + family_names[this][as_tuple] = family_names[this].get(as_tuple, 0) + 1 + +interested = set(['Rowling', 'Shakespeare', 'Sagan', 'Darwin', 'Verne', 'Beckett', 'Churchill', 'Dickens', 'Twain', 'Doyle']) +sorted_interest = sorted(interested) + +def edition_list(l): + for e in l: + print(e['loc']) + for k in sorted((k for k in e.keys() if k.isdigit()), key=int): + if k == '245': + t = ' '.join(v.strip(' /,;:') for k, v in e[k][0] if k == 'a') + title = remove_trailing_dot(t) + full = full_title(e[k][0]) + print(' title:', title) + if title != full: + print('full title:', full) + print(' ', k, e[k]) + print('---') + +def print_interest(): + for k in sorted_interest: + if k not in family_names: + continue + print(k) + for a in sorted(family_names[k].keys()): + if family_names[k][a] > 5: + print(" %3d %s" % (family_names[k][a], a)) + if a in by_author: + print(" by: ") + for i in sorted(by_author[a].keys()): + print(' WORK: %s (%d)' % (i, len(by_author[a][i]))) + edition_list(by_author[a][i]) +# if a in by_contrib: +# print " contrib: " +# edition_list(by_contrib[a]) + print() + +def work_title(edition): + if '240' in edition: + t = ' '.join(v for k, v in edition['240'][0] if k in ('a', 'm', 'n', 'p', 'r')) + else: + t = ' '.join(v.strip(' /,;:') for k, v in edition['245'][0] if k == 'a') + return remove_trailing_dot(t) + +#for line in open(sys.argv[1]): +for line in sys.stdin: + loc, data = eval(line) + (orig_fields, fields) = read(data) + if not fields: + continue + new_interest = False + edition = {} + for tag, l in fields: + #if tag in ('100', '700'): + if tag == '100': + try: + marc, person = parse_person(l) + except: + print(loc) + raise + if not person: + continue + #print author['marc_orig'] +# print marc + if person.get('family_name', None) in interested: +# family_name(person) + new_interest = True +# print_author(author) + continue + tag_map = { '100': 'authors', '700': 'contribs' } + person['marc'] = marc + edition.setdefault(tag_map[tag], []).append(person) + continue + if tag == '008': + lang = str(l)[35:38] + edition['lang'] = lang + continue + edition.setdefault(tag, []).append(list(get_all_subfields(line))) + #for k in sorted(family_names.keys()): + + if new_interest: + edition['loc'] = loc + print((loc, data)) + continue + title = work_title(edition) +# rec = parser.read_edition(loc, data) + for p in edition.get('authors', []): + a = by_author.setdefault(person_as_tuple(p), {}) + a.setdefault(title, []).append(edition) +# for p in edition.get('contribs', []): +# by_contrib.setdefault(person_as_tuple(p), []).append(edition) +for k, v in by_author.items(): + print((k, v)) +#print_interest() + diff --git a/ia-legacy-importer/works/live.py b/ia-legacy-importer/works/live.py new file mode 100755 index 00000000..35c3a67e --- /dev/null +++ b/ia-legacy-importer/works/live.py @@ -0,0 +1,474 @@ +#!/usr/bin/python + +# find works and create pages on production + +from __future__ import print_function +import re +import sys +import codecs 
+import web +from openlibrary.catalog.get_ia import get_from_archive, get_data +from openlibrary.catalog.marc.fast_parse import get_subfield_values, get_first_tag, get_tag_lines, get_subfields, BadDictionary +from openlibrary.catalog.utils.query import query_iter, set_staging, query +from openlibrary.catalog.utils import cmp, mk_norm +from openlibrary.catalog.read_rc import read_rc +from collections import defaultdict +from pprint import pformat +from openlibrary.catalog.utils.edit import fix_edition +from openlibrary.catalog.importer.db_read import get_mc +from openlibrary.api import OpenLibrary, Reference +from lxml import etree +from time import sleep, time + +import six +from six.moves import urllib + + +rc = read_rc() + +ol = OpenLibrary("http://openlibrary.org") +ol.login('WorkBot', rc['WorkBot']) + +def write_log(cat, key, title): + print((("%.2f" % time()), cat, key, title), file=fh_log) + fh_log.flush() + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon|etc)\.$') + +re_ia_marc = re.compile('^(?:.*/)?([^/]+)_(marc\.xml|meta\.mrc)(:0:\d+)?$') + +ns = '{http://www.loc.gov/MARC21/slim}' +ns_leader = ns + 'leader' +ns_data = ns + 'datafield' + +def has_dot(s): + return s.endswith('.') and not re_skip.search(s) + +#set_staging(True) + +# sample title: The Dollar Hen (Illustrated Edition) (Dodo Press) +re_parens = re.compile('^(.*?)(?: \(.+ (?:Edition|Press)\))+$') + +def key_int(key): + # extract the number from a key like /a/OL1234A + return int(web.numify(key)) + +def update_work_edition(ekey, wkey, use): + print((ekey, wkey, use)) + e = ol.get(ekey) + works = [] + for w in e['works']: + if w == wkey: + if use not in works: + works.append(Reference(use)) + else: + if w not in works: + works.append(w) + + if e['works'] == works: + return + print('before:', e['works']) + print('after:', works) + e['works'] = works + print(ol.save(e['key'], e, 'remove duplicate work page')) + +def top_rev_wt(d): + d_sorted = sorted(d.keys(), cmp=lambda i, j: cmp(d[j], d[i]) or cmp(len(j), len(i))) + return d_sorted[0] + +def books_query(akey): # live version + q = { + 'type':'/type/edition', + 'authors': akey, + 'source_records': None, + 'title': None, + 'work_title': None, + 'languages': None, + 'title_prefix': None, + 'subtitle': None, + } + return query_iter(q) + +def freq_dict_top(d): + return sorted(d.keys(), reverse=True, key=lambda i:d[i])[0] + +def get_marc_src(e): + mc = get_mc(e['key']) + if mc and mc.startswith('amazon:'): + mc = None + if mc and mc.startswith('ia:'): + yield 'ia', mc[3:] + elif mc: + m = re_ia_marc.match(mc) + if m: + #print 'IA marc match:', m.group(1) + yield 'ia', m.group(1) + else: + yield 'marc', mc + source_records = e.get('source_records', []) + if not source_records: + return + for src in source_records: + if src.startswith('ia:'): + if not mc or src != mc: + yield 'ia', src[3:] + continue + if src.startswith('marc:'): + if not mc or src != 'marc:' + mc: + yield 'marc', src[5:] + continue + +def get_ia_work_title(ia): + url = 'http://www.archive.org/download/' + ia + '/' + ia + '_marc.xml' + try: + root = etree.parse(urllib.request.urlopen(url)).getroot() + except KeyboardInterrupt: + raise + except: + #print 'bad XML', ia + #print url + return + #print etree.tostring(root) + e = root.find(ns_data + "[@tag='240']") + if e is None: + return + #print e.tag + wt = ' '.join(s.text for s in e if s.attrib['code'] == 'a' and s.text) + return wt + +def get_work_title(e): + # use first work title we 
find in source MARC records + wt = None + for src_type, src in get_marc_src(e): + if src_type == 'ia': + wt = get_ia_work_title(src) + if wt: + break + continue + assert src_type == 'marc' + data = None + #print 'get from archive:', src + try: + data = get_data(src) + except ValueError: + print('bad record source:', src) + print('http://openlibrary.org' + e['key']) + continue + except urllib.error.HTTPError as error: + print('HTTP error:', error.code, error.msg) + print(e['key']) + if not data: + continue + try: + line = get_first_tag(data, set(['240'])) + except BadDictionary: + print('bad dictionary:', src) + print('http://openlibrary.org' + e['key']) + continue + if line: + wt = ' '.join(get_subfield_values(line, ['a'])).strip('. ') + break + if wt: + return wt + if not e.get('work_titles', []): + return + print('work title in MARC, but not in OL') + print('http://openlibrary.org' + e['key']) + return e['work_titles'][0] + +def get_books(akey, query): + for e in query: + if not e.get('title', None): + continue +# if len(e.get('authors', [])) != 1: +# continue + if 'title_prefix' in e and e['title_prefix']: + prefix = e['title_prefix'] + if prefix[-1] != ' ': + prefix += ' ' + title = prefix + e['title'] + else: + title = e['title'] + + title = title.strip(' ') + if has_dot(title): + title = title[:-1] + if title.strip('. ') in ['Publications', 'Works', 'Report', \ + 'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence']: + continue + + m = re_parens.match(title) + if m: + title = m.group(1) + + n = mk_norm(title) + + book = { + 'title': title, + 'norm_title': n, + 'key': e['key'], + } + + lang = e.get('languages', []) + if lang: + book['lang'] = [l['key'][3:] for l in lang] + + if e.get('table_of_contents', None): + if isinstance(e['table_of_contents'][0], six.string_types): + book['table_of_contents'] = e['table_of_contents'] + else: + assert isinstance(e['table_of_contents'][0], dict) + if e['table_of_contents'][0]['type'] == '/type/text': + book['table_of_contents'] = [i['value'] for i in e['table_of_contents']] + + wt = get_work_title(e) + if not wt: + yield book + continue + if wt in ('Works', 'Selections'): + yield book + continue + n_wt = mk_norm(wt) + book['work_title'] = wt + book['norm_wt'] = n_wt + yield book + +def build_work_title_map(equiv, norm_titles): + # map of book titles to work titles + title_to_work_title = defaultdict(set) + for (norm_title, norm_wt), v in equiv.items(): + if v != 1: + title_to_work_title[norm_title].add(norm_wt) + + title_map = {} + for title, v in title_to_work_title.items(): + if len(v) == 1: + title_map[title] = list(v)[0] + continue + most_common_title = max(v, key=lambda i:norm_titles[i]) + if title != most_common_title: + title_map[title] = most_common_title + for i in v: + if i != most_common_title: + title_map[i] = most_common_title + return title_map + +def find_works(akey, book_iter): + equiv = defaultdict(int) # title and work title pairs + norm_titles = defaultdict(int) # frequency of titles + books_by_key = {} + books = [] + rev_wt = defaultdict(lambda: defaultdict(int)) + + for book in book_iter: + if 'norm_wt' in book: + pair = (book['norm_title'], book['norm_wt']) + equiv[pair] += 1 + rev_wt[book['norm_wt']][book['work_title']] +=1 + norm_titles[book['norm_title']] += 1 + books_by_key[book['key']] = book + books.append(book) + + title_map = build_work_title_map(equiv, norm_titles) + + works = defaultdict(lambda: defaultdict(list)) + work_titles = defaultdict(list) + for b in books: + if 'eng' not in b.get('lang', 
[]) and 'norm_wt' in b: + work_titles[b['norm_wt']].append(b['key']) + continue + n = b['norm_title'] + title = b['title'] + if n in title_map: + n = title_map[n] + title = top_rev_wt(rev_wt[n]) + works[n][title].append(b['key']) + + works = sorted([(sum(map(len, w.values() + [work_titles[n]])), n, w) for n, w in works.items()]) + + for work_count, norm, w in works: +# if work_count < 2: +# continue + first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0] + titles = defaultdict(int) + for key_list in w.values(): + for ekey in key_list: + b = books_by_key[ekey] + title = b['title'] + titles[title] += 1 + keys = work_titles[norm] + for values in w.values(): + keys += values + assert work_count == len(keys) + title = max(titles.keys(), key=lambda i:titles[i]) + toc = [(k, books_by_key[k].get('table_of_contents', None)) for k in keys] + yield {'title': first, 'editions': keys, 'toc': dict((k, v) for k, v in toc if v)} + +def print_works(works): + for w in works: + print(len(w['editions']), w['title']) + +def toc_items(toc_list): + return [{'title': six.text_type(item), 'type': Reference('/type/toc_item')} for item in toc_list] + +def add_works(works): + q = [] + for w in works: + cur = { + 'authors': [{'author': Reference(w['author'])}], + 'type': '/type/work', + 'title': w['title'] + } + if 'subjects' in w: + cur['subjects'] = w['subjects'] + q.append(cur) + try: + return ol.new(q, comment='create work page') + except: + print(q) + raise + +def add_work(akey, w): + q = { + 'authors': [{'author': Reference(akey)}], + 'type': '/type/work', + 'title': w['title'] + } + try: + wkey = ol.new(q, comment='create work page') + except: + print(q) + raise + write_log('work', wkey, w['title']) + assert isinstance(wkey, six.string_types) + for ekey in w['editions']: + e = ol.get(ekey) + fix_edition(ekey, e, ol) + #assert 'works' not in e + write_log('edition', ekey, e.get('title', 'title missing')) + e['works'] = [Reference(wkey)] + yield e + +def save_editions(queue): + print('saving') + try: + print(ol.save_many(queue, 'add edition to work page')) + except: + print('ol.save_many() failed, trying again in 30 seconds') + sleep(30) + print(ol.save_many(queue, 'add edition to work page')) + print('saved') + +def merge_works(work_keys): + use = "/works/OL%dW" % min(key_int(w) for w in work_keys) + for wkey in work_keys: + if wkey == use: + continue + w_query = {'type':'/type/edition', 'works':wkey, 'limit':False} + for e in ol.query(w_query): # returns strings? 
+ print(e) + update_work_edition(e, wkey, use) + w = ol.get(wkey) + assert w['type'] == '/type/work' + w['type'] = '/type/redirect' + w['location'] = use + print(ol.save(wkey, w, 'delete duplicate work page')) + +def update_edition(ekey, wkey): + e = ol.get(ekey) + fix_edition(ekey, e, ol) + write_log('edition', ekey, e.get('title', 'title missing')) + if e.get('works', []): + assert len(e['works']) == 1 + if e['works'][0] != wkey: + print('e:', e) + print('wkey:', wkey) + print('ekey:', ekey) + print('e["works"]:', e['works']) + #merge_works([e['works'][0], wkey]) + #assert e['works'][0] == wkey + return None + e['works'] = [Reference(wkey)] + return e + +def run_queue(queue): + work_keys = add_works(queue) + for w, wkey in zip(queue, work_keys): + w['key'] = wkey + write_log('work', wkey, w['title']) + for ekey in w['editions']: + e = update_edition(ekey, wkey) + if e: + yield e + +def get_work_key(title, akey): + q = { + 'type': '/type/work', + 'title': title, + 'authors': None, + } + matches = [w for w in ol.query(q) if any(a['author'] == akey for a in w['authors'])] + if not matches: + return None + if len(matches) != 1: + print('time to fix duplicate works') + print(repr(title)) + print('http://openlibrary.org' + akey) + print(matches) + assert len(matches) == 1 + return matches[0]['key'] + +def by_authors(): + skip = '/a/OL25755A' + q = { 'type':'/type/author', 'name': None } + for a in query_iter(q): + akey = a['key'] + if skip: + if akey == skip: + skip = None + else: + continue + write_log('author', akey, a.get('name', 'name missing')) + + works = find_works(akey, get_books(akey, books_query(akey))) + print((akey, repr(a['name']))) + + for w in works: + w['author'] = akey + wkey = get_work_key(w['title'], akey) + if wkey: + w['key'] = wkey + yield w + +if __name__ == '__main__': + fh_log = open('/1/edward/logs/WorkBot', 'a') + edition_queue = [] + work_queue = [] + + for w in by_authors(): + if 'key' in w: + for ekey in w['editions']: + e = update_edition(ekey, w['key']) + if e: + edition_queue.append(e) + continue + + work_queue.append(w) + if len(work_queue) > 1000: + for e in run_queue(work_queue): + print(e['key'], repr(e['title'])) + edition_queue.append(e) + if len(edition_queue) > 1000: + save_editions(edition_queue) + edition_queue = [] + sleep(5) + work_queue = [] + + print('almost finished') + for e in run_queue(work_queue): + edition_queue.append(e) + save_editions(edition_queue) + print('finished') + + fh_log.close() diff --git a/ia-legacy-importer/works/load_to_staging.py b/ia-legacy-importer/works/load_to_staging.py new file mode 100644 index 00000000..d5046826 --- /dev/null +++ b/ia-legacy-importer/works/load_to_staging.py @@ -0,0 +1,47 @@ +from __future__ import print_function +import sys +sys.path.remove('/usr/local/lib/python2.5/site-packages/web.py-0.23-py2.5.egg') +from staging_save import Infogami +from catalog.read_rc import read_rc +import catalog.importer.db_read as db_read +import re +import sys +import codecs + +db_read.set_staging(True) + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) + +rc = read_rc() +infogami = Infogami() +infogami.login('edward', rc['edward']) + +for line in open('works_for_staging'): + work_key, title, authors, editions = eval(line) + if not all(db_read.withKey('/a/' + a) for a in authors): + continue + work = db_read.withKey(work_key) + print(work_key) + if work: + continue + if not work: + q = { + 'create': 'unless_exists', + 'type': { 'key': '/type/work' }, + 'key': work_key, + 'title': title, + 'authors': [{'key': '/a/' + 
a} for a in authors], + } + ret = infogami.write(q, comment='create work') + print(ret) + for edition_key in editions: + edition = db_read.withKey(edition_key) + if not edition: continue + if 'works' in edition: continue + q = { + 'key': edition_key, + 'works': { 'connect': 'update_list', 'value': [{'key': work_key}]} + } + ret = infogami.write(q, comment='add work to edition') + print(edition_key, ret) + assert ret['result']['updated'] diff --git a/ia-legacy-importer/works/sample_marc.py b/ia-legacy-importer/works/sample_marc.py new file mode 100644 index 00000000..cf2f1c81 --- /dev/null +++ b/ia-legacy-importer/works/sample_marc.py @@ -0,0 +1,19 @@ +from __future__ import print_function +from catalog.marc.all import iter_marc +import re + +# random authors and subjects +terms = [ + 'rowling', 'harry potter', 'shakespeare', 'hamlet', 'twain', 'darwin', + 'sagan', 'huckleberry finn', 'tom sawyer', 'verne', 'waiting for godot', + 'beckett', 'churchill', 'darwin', 'dickens', 'doyle', 'leonardo', + 'da vinci', +] + +re_terms = re.compile('(' + '|'.join(terms) + ')', re.I) + +out = open('/1/pharos/edward/sample_marc2', 'w') +for rec_no, pos, loc, data in iter_marc(): + if re_terms.search(data): + print((loc, data), file=out) +out.close() diff --git a/ia-legacy-importer/works/test_find_works.py b/ia-legacy-importer/works/test_find_works.py new file mode 100644 index 00000000..1f82cae7 --- /dev/null +++ b/ia-legacy-importer/works/test_find_works.py @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- +from openlibrary.catalog.merge.normalize import normalize +from openlibrary.catalog.works.find_works import top_rev_wt, has_dot, freq_dict_top, find_works, get_books, find_works2, build_work_title_map, find_works3, find_work_sort + + +def test_has_dot(): + assert has_dot('Magic.') + assert not has_dot('Magic') + assert not has_dot('Magic etc.') + +def test_top_rev_wt(): + input_data = { + 'aaa': 'test data', + 'aaaa': 'more test data', + 'bbbb': 'test date', + 'cc': 'some more test data', + } + assert top_rev_wt(input_data) == 'bbbb' + +def test_freq_dict_top(): + assert freq_dict_top({'a': 0}) == 'a' + assert freq_dict_top({'a': 3, 'b': 6, 'c': 4}) == 'b' + +def test_find_works(): + works = list(find_works([])) + assert works == [] + + books = [{'title': 'Magic', 'key': '/books/OL1M'}] + book_iter = get_books('', books, do_get_mc=False) + + books2 = list(book_iter) + assert books2 == [{'key': '/books/OL1M', 'norm_title': 'magic', 'title': 'Magic'}] + + var = find_works2(books2) + assert var['equiv'] == {} + assert var['norm_titles'] == {'magic': 1} + assert var['books_by_key'] == {'/books/OL1M': books2[0]} + assert var['books'] == books2 + assert var['rev_wt'] == {} + + assert build_work_title_map({}, {'magic': 1}) == {} + assert build_work_title_map({}, {'magic': 2, 'test': 0}) == {} + + works = list(find_works(books2, do_get_mc=False)) + expect = [ + { 'title': 'Magic', + 'editions': [{ + 'key': '/books/OL1M', + 'norm_title': 'magic', + 'title': 'Magic'}], + }] + assert works == expect + + + books = [ + {'title': 'Magic', 'key': '/books/OL1M'}, + {'title': 'Magic', 'key': '/books/OL2M'}, + ] + book_iter = get_books('', books, do_get_mc=False) + books2 = list(book_iter) + + var = find_works2(books2) + assert var['equiv'] == {} + assert var['norm_titles'] == {'magic': 2} + assert var['books_by_key'] == {'/books/OL1M': books2[0], '/books/OL2M': books2[1]} + assert var['books'] == books2 + assert var['rev_wt'] == {} + + works = list(find_works(books2, do_get_mc=False)) + expect = [ + { 'title': 'Magic', + 
'editions': [ + { 'key': '/books/OL1M', 'norm_title': 'magic', 'title': 'Magic'}, + { 'key': '/books/OL2M', 'norm_title': 'magic', 'title': 'Magic'}, + ], + }] + assert works == expect + + magico = u'm\xe1gico' + + assert normalize(magico) == magico + + books = [ + {'title': magico, 'work_title': ['magic'], 'key': '/books/OL1M'}, + {'title': 'magic', 'key': '/books/OL2M'}, + {'title': magico, 'work_title': ['magic'], 'key': '/books/OL3M'}, + {'title': 'magic', 'key': '/books/OL4M'}, + ] + expect_keys = sorted(e['key'] for e in books) + book_iter = get_books('', books, do_get_mc=False) + books2 = list(book_iter) + + expect = [ + {'key': '/books/OL1M', 'norm_title': magico, 'work_title': 'magic', 'norm_wt': 'magic', 'title': magico}, + {'key': '/books/OL2M', 'norm_title': 'magic', 'title': 'magic'}, + {'key': '/books/OL3M', 'norm_title': magico, 'work_title': 'magic', 'norm_wt': 'magic', 'title': magico}, + {'key': '/books/OL4M', 'norm_title': 'magic', 'title': 'magic'}, + ] + + assert len(books2) == 4 + for i in range(4): + assert books2[i] == expect[i] + + var = find_works2(books2) + assert var['equiv'] == {(magico, 'magic'): 2} + assert var['norm_titles'] == {magico: 2, 'magic': 2} + assert len(var['books_by_key']) == 4 + bk = var['books_by_key'] + assert bk['/books/OL1M'] == books2[0] + assert bk['/books/OL2M'] == books2[1] + assert bk['/books/OL3M'] == books2[2] + assert bk['/books/OL4M'] == books2[3] + assert var['books'] == books2 + assert var['rev_wt'] == {'magic': {'magic': 2}} + + title_map = build_work_title_map(var['equiv'], var['norm_titles']) + + assert title_map == {magico: 'magic'} + + find_works3(var) + assert var['works'] == {'magic': {'magic': expect_keys}} + assert var['work_titles'] == {'magic': ['/books/OL1M', '/books/OL3M']} + + sorted_works = find_work_sort(var) + assert sorted_works == [(6, 'magic', {'magic': expect_keys})] + + works = list(find_works(books2, do_get_mc=False)) + expect = [{ + 'title': u'Magic', + 'editions': [ + {'key': '/books/OL2M', 'norm_title': 'magic', 'title': 'magic'}, + {'key': '/books/OL1M', 'norm_title': u'mágico', 'norm_wt': 'magic', 'title': u'Mágico'}, + ], + }] + + work_count = len(works) + assert work_count == 1 + editions = works[0]['editions'] + edition_count = len(works[0]['editions']) + edition_keys = sorted(e['key'] for e in editions) + assert edition_keys == expect_keys + assert edition_count == 4 + del works[0]['editions'] + assert works[0] == {'title': 'magic'} + #assert works == expect + + diff --git a/ia-legacy-importer/works/tests.py b/ia-legacy-importer/works/tests.py new file mode 100644 index 00000000..059afa7b --- /dev/null +++ b/ia-legacy-importer/works/tests.py @@ -0,0 +1,143 @@ +from __future__ import print_function +from collections import defaultdict +import re +import catalog.merge.normalize as merge + +def freq_dict_top(d): + return sorted(d.keys(), reverse=True, key=lambda i:d[i])[0] + +re_brackets = re.compile('^(.*)\[.*?\]$') +re_parens = re.compile('^(.*?)(?: \(.+ (?:Edition|Press)\))+$') + +def mk_norm(title): + m = re_brackets.match(title) + if m: + title = m.group(1) + norm = merge.normalize(title).strip(' ') + norm = norm.replace(' and ', ' ') + if norm.startswith('the '): + norm = norm[4:] + elif norm.startswith('a '): + norm = norm[2:] + return norm.replace('-', '').replace(' ', '') + +def build_work_title_map(equiv, norm_titles): + title_to_work_title = defaultdict(set) + for (norm_title, norm_wt), v in equiv.items(): + if v != 1: + title_to_work_title[norm_title].add(norm_wt) + + title_map = {} + for 
title, v in title_to_work_title.items(): + if len(v) == 1: + title_map[title] = list(v)[0] + continue + most_common_title = max(v, key=lambda i:norm_titles[i]) + if title != most_common_title: + title_map[title] = most_common_title + for i in v: + if i != most_common_title: + title_map[i] = most_common_title + return title_map + + +milo_m_hastings = [ + {'lang': ['eng'], 'key': '/b/OL7009753M', 'title': 'The dollar hen'}, + {'lang': ['eng'], 'key': '/b/OL9563276M', 'title': 'The Dollar Hen (Large Print Edition)'}, + {'lang': ['eng'], 'key': '/b/OL9636071M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL15083244M', 'title': 'The dollar hen'}, + {'lang': ['eng'], 'key': '/b/OL8566971M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL9353753M', 'title': 'City of Endless Night'}, + {'lang': ['eng'], 'key': '/b/OL9462083M', 'title': 'City of Endless Night (Large Print Edition)'}, + {'lang': ['eng'], 'key': '/b/OL9642528M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL9736536M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL9735362M', 'title': 'The Dollar Hen (Illustrated Edition) (Dodo Press)'}, + {'lang': ['eng'], 'key': '/b/OL9800490M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL11676559M', 'title': 'City of Endless Night (Dodo Press)'}, + {'lang': ['eng'], 'key': '/b/OL11752220M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL11985500M', 'title': 'The Dollar Hen'}, + {'lang': ['eng'], 'key': '/b/OL11985503M', 'title': 'The Dollar Hen'} +] + +aaron_bancroft = [ # /a/OL17005A + {'lang': ['eng'], 'key': '/b/OL595471M', 'title': 'A sermon preached before His Excellency Caleb Strong, Esq., Governour, the Honourable the Council, Senate, and House of Representatives of the commonwealth of Massachusetts, May 27, 1801'}, + {'lang': ['eng'], 'key': '/b/OL1247387M', 'title': 'A discourse delivered before the convention of Congregational ministers of Massachusetts, at their annual meeting in Boston, June 1, 1820'}, + {'lang': ['eng'], 'key': '/b/OL6472976M', 'title': 'The importance of a religious education illustrated and enforced'}, + {'lang': ['eng'], 'key': '/b/OL6919451M', 'title': 'A discourse delivered at Windsor, in the state of Vermont, on the 23rd of June, MDCCXC'}, + {'lang': ['eng'], 'key': '/b/OL6950265M', 'title': 'A sermon delivered in Worcester, January 31, 1836'}, + {'key': '/b/OL7048038M', 'title': 'Sermons on those doctrines of the gospel, and on those constituent principles of the church, which Christian professors have made the subject of controversy. ..'}, + {'key': '/b/OL7197334M', 'title': 'The life of George Washington ....'}, + {'lang': ['eng'], 'key': '/b/OL14572992M', 'title': 'A sermon, delivered at Worcester, on the eleventh of June, 1793'}, + {'lang': ['eng'], 'key': '/b/OL14588026M', 'title': 'An eulogy on the character of the late Gen. 
George Washington.'}, + {'lang': ['eng'], 'key': '/b/OL14601446M', 'title': 'A sermon, delivered at Brimfield, on the 20th of June, 1798'}, + {'lang': ['eng'], 'key': '/b/OL14608347M', 'title': 'The importance of a religious education illustrated and enforced.'}, + {'lang': ['eng'], 'key': '/b/OL14702050M', 'title': 'The nature and worth of Christian liberty'}, + {'lang': ['eng'], 'key': '/b/OL14981988M', 'title': 'A vindication of the result of the late Mutual Council convened in Princeton'}, + {'lang': ['eng'], 'key': '/b/OL14992328M', 'title': 'An essay on the life of George Washington'}, + {'lang': ['eng'], 'key': '/b/OL15054440M', 'title': 'Importance of education'}, + {'lang': ['eng'], 'key': '/b/OL15070888M', 'title': 'The leaf an emblem of human life'}, + {'lang': ['eng'], 'key': '/b/OL15075529M', 'title': 'The world passeth away, but the children of God abide forever'}, + {'lang': ['eng'], 'key': '/b/OL15085786M', 'title': 'The doctrine of immortality'}, + {'lang': ['eng'], 'key': '/b/OL15093560M', 'title': 'The comparative advantages of the ministerial profession'}, + {'lang': ['eng'], 'key': '/b/OL15115706M', 'title': 'The duties enjoined by the Fourth commandment'}, + {'lang': ['eng'], 'key': '/b/OL15120201M', 'title': 'A discourse on conversion'}, + {'lang': ['eng'], 'key': '/b/OL15120290M', 'title': 'The nature and worth of Christian liberty'}, + {'lang': ['eng'], 'key': '/b/OL17052663M', 'title': 'An eulogy on the character of the late Gen. George Washington'}, + {'lang': ['eng'], 'key': '/b/OL17704747M', 'title': 'The doctrine of immortality'}, + {'lang': ['eng'], 'key': '/b/OL17707429M', 'title': 'Importance of education'}, + {'lang': ['eng'], 'key': '/b/OL17709244M', 'title': 'A vindication of the result of the late mutual council convened in Princeton'}, + {'lang': ['eng'], 'key': '/b/OL18776110M', 'title': 'Sermons on those doctrines of the gospel, and on those constituent principles of the church, which Christian professors have made the subject of controversy'}, + {'lang': ['eng'], 'key': '/b/OL6573411M', 'title': 'The life of George Washington, commander in chief of the American army, through the revolutionary war'}, + {'lang': ['eng'], 'key': '/b/OL15592993M', 'title': 'A discourse on conversion'}, + {'lang': ['eng'], 'key': '/b/OL17712475M', 'title': 'A discourse on conversion'}, + {'lang': ['eng'], 'key': '/b/OL6290214M', 'title': 'The life of George Washington'}, + {'lang': ['eng'], 'key': '/b/OL6571503M', 'title': 'The life of George Washington'}, + {'lang': ['eng'], 'key': '/b/OL6573412M', 'title': 'Life of George Washington'}, + {'work_title': 'Essay on the life of George Washington', 'key': '/b/OL7168113M', 'title': 'Life of George Washington, commander in chief of the American army through the revolutionary war, and the first president of the United States.'}, + {'work_title': 'Essay on the life of George Washington', 'key': '/b/OL7243025M', 'title': 'The life of George Washington, commander in chief of the American army, through the revolutionary war, and the first president of the United States'}, + {'lang': ['eng'], 'key': '/b/OL28289M', 'title': 'The life of George Washington, commander-in-chief of the American Army through the Revolutionary War and the first President of the United States'}, + {'lang': ['eng'], 'key': '/b/OL6354818M', 'title': 'The life of George Washington, commander-in-chief of the American Army through the revolutionary war, and the first president of the United States.'}, + {'key': '/b/OL7113589M', 'title': 'The life of George 
Washington, Commander-in-Chief of the American Army, through the Revolutionary War; and the first President of the United States.'} +] + +def find_works(books): + for book in books: + m = re_parens.match(book['title']) + if m: + book['title'] = m.group(1) + n = mk_norm(book['title']) + book['norm_title'] = n + + books_by_key = dict((b['key'], b) for b in books) + norm_titles = defaultdict(int) + + for book in books: + norm_titles[book['norm_title']] += 1 + + title_map = build_work_title_map({}, norm_titles) + + works = defaultdict(lambda: defaultdict(list)) + work_titles = defaultdict(list) + for b in books: + if 'eng' not in b.get('lang', []) and 'norm_wt' in b: + work_titles[b['norm_wt']].append(b['key']) + continue + n = b['norm_title'] + title = b['title'] + if n in title_map: + n = title_map[n] + title = freq_dict_top(rev_wt[n]) + works[n][title].append(b['key']) + + #for k, v in works.items(): + # print k + # print ' ', sum(len(i) for i in v.values()), dict(v) + #print + + works = sorted([(sum(map(len, w.values() + [work_titles[n]])), n, w) for n, w in works.items()]) + + for a, b, c in works: + print(a, b, dict(c)) + +find_works(milo_m_hastings) +find_works(aaron_bancroft) diff --git a/ia-legacy-importer/works/use_amazon.py b/ia-legacy-importer/works/use_amazon.py new file mode 100755 index 00000000..dee37cf4 --- /dev/null +++ b/ia-legacy-importer/works/use_amazon.py @@ -0,0 +1,78 @@ +from __future__ import print_function +import os +import re +import sys +import codecs +import dbhash +from catalog.amazon.other_editions import find_others +from catalog.infostore import get_site +from catalog.read_rc import read_rc +from catalog.get_ia import get_data +from catalog.marc.build_record import build_record +from catalog.marc.fast_parse import get_tag_lines, get_all_subfields + +sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +rc = read_rc() +db = dbhash.open(rc['index_path'] + 'isbn_to_marc.dbm', 'r') + +site = get_site() + +def get_records_from_marc(isbn): + if isbn not in db: + return +# for loc in db[isbn].split(' '): +# data = get_data(loc) +# print loc +# want = ['100', '110', '111', '240', '245', '260'] + [str(i) for i in range(500,600) if i not in (505, 520)] +# for tag, line in get_tag_lines(data, set(want)): +# sub = list(get_all_subfields(line)) +# if tag.startswith('5'): +# assert len(sub) == 1 and sub[0][0] == 'a' +# note = sub[0][1] +# if note.find('ublish') != -1 or note.find('riginal') != -1: +# print ' note:', note +# continue +# print ' ', tag, sub +# print + recs = [(loc, build_record(get_data(loc))) for loc in db[isbn].split(' ')] + keys = set() + print() + for loc, rec in recs: + print(' ', loc) +# keys.update([k for k in rec.keys() if k.find('title') != -1 or k in ('authors', 'title', 'contributions', 'work_title')]) + keys.update(rec.keys()) + print() + for k in keys: + print(k) + for loc, rec in recs: + print(" ", rec.get(k, '###')) + print() + print() + +dir = sys.argv[1] +for filename in os.listdir(dir): + if not filename[0].isdigit(): + continue + l = find_others(filename, dir) + if not l: + continue + print(filename) + for k in site.things({'isbn_10': filename, 'type': '/type/edition'}): + t = site.withKey(k) + num = len(t.isbn_10) + if num == 1: + num = '' + print(' OL:', k, t.title, num) + get_records_from_marc(filename) + for asin, extra in l: + print(asin, extra) + things = site.things({'isbn_10': asin, 'type': '/type/edition'}) + if things: + for k in things: + t = site.withKey(k) + num = len(t.isbn_10) + if num == 1: + num = '' + print(' OL:', k, 
t.title, num)
+        get_records_from_marc(asin)
+    print("----")
diff --git a/ia-legacy-importer/works/use_amazon2.py b/ia-legacy-importer/works/use_amazon2.py
new file mode 100755
index 00000000..369b0176
--- /dev/null
+++ b/ia-legacy-importer/works/use_amazon2.py
@@ -0,0 +1,27 @@
+from __future__ import print_function
+import os
+import re
+import sys
+import codecs
+import dbhash
+from catalog.amazon.other_editions import find_others
+from catalog.infostore import get_site
+from catalog.read_rc import read_rc
+from catalog.get_ia import get_data
+from catalog.marc.build_record import build_record
+from catalog.marc.fast_parse import get_tag_lines, get_all_subfields
+
+sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
+rc = read_rc()
+db = dbhash.open(rc['index_path'] + 'isbn_to_marc.dbm', 'r')
+
+site = get_site()
+
+dir = sys.argv[1]
+for filename in os.listdir(dir):
+    if not filename[0].isdigit():
+        continue
+    l = find_others(filename, dir)
+    if len(l) < 8:
+        continue
+    print(filename, len(l))
diff --git a/ia-legacy-importer/works/web_ui.py b/ia-legacy-importer/works/web_ui.py
new file mode 100644
index 00000000..10227b48
--- /dev/null
+++ b/ia-legacy-importer/works/web_ui.py
@@ -0,0 +1,178 @@
+from __future__ import print_function
+import web
+import re
+from time import time
+from catalog.read_rc import read_rc
+from catalog.infostore import get_site
+#from catalog.db_read import get_things, withKey
+from catalog.amazon.other_editions import find_others
+from catalog.merge.normalize import normalize
+
+rc = read_rc()
+
+re_translation_of = re.compile(r'^Translation of\b[: ]*([^\n]*?)\.?$', re.I | re.M)
+
+site = get_site()
+
+def isbn_link(i):
+    # ISBN linked on Open Library, with an Amazon product page link in parentheses
+    return '<a href="http://openlibrary.org/isbn/%s">%s</a> (<a href="http://www.amazon.com/dp/%s">Amazon.com</a>)' % (i, i, i)
+
+def ol_link(key):
+    return '<a href="http://openlibrary.org%s">%s</a>' % (key, key)
+
+def get_author_keys(name):
+    authors = site.things({ 'type': '/type/author', 'name': name })
+    if authors:
+        return ','.join("'%s'" % a for a in authors)
+    else:
+        return None
+
+def get_title_to_key(author):
+    # get id to key mapping of all editions by author
+    author_keys = get_author_keys(author)
+    if not author_keys:
+        return {}
+
+    # get title to key mapping of all editions by author
+    t0 = time()
+    sql = "select key, value as title from thing, edition_str " \
+        + "where thing.id = thing_id and key_id=3 and thing_id in (" \
+        + "select thing_id from edition_ref, thing " \
+        + "where edition_ref.key_id=11 and edition_ref.value = thing.id and thing.key in (" + author_keys + "))"
+    print(sql)
+    return {}
+    title_to_key = {}
+    for r in web.query(sql):
+        t = normalize(r.title).strip('.')
+        title_to_key.setdefault(t, []).append(r.key)
+    return title_to_key
+
+def search(title, author):
+
+    title_to_key = get_title_to_key(author)
+    norm_title = normalize(title).strip('.')
+
+    if norm_title not in title_to_key:
+        print('title not found')
+        return
+
+    pool = set(title_to_key[norm_title])
+
+    editions = []
+    seen = set()
+    found_titles = {}
+    found_isbn = {}
+    while pool:
+        key = pool.pop()
+        seen.add(key)
+        e = site.withKey(key)
+        translation_of = None
+        if False and e.notes:
+            m = re_translation_of.search(e.notes)
+            if m:
+                translation_of = m.group(1).lower()
+                pool.update(k for k in title_to_key[translation_of] if k not in seen)
+                found_titles.setdefault(translation_of, []).append(key)
+        if False and e.isbn_10:
+            for i in e.isbn_10:
+                found_isbn.setdefault(i, []).append(key)
+            join_isbn = ', '.join(map(isbn_link, e.isbn_10))
+        else:
+            join_isbn = ''
+        rec = {
+            'key': key,
+            'publish_date': e.publish_date,
+            'publishers': ', '.join(p.encode('utf-8') for p in (e.publishers or [])),
+            'isbn': join_isbn,
+        }
+        editions.append(rec)
+
+        if e.work_titles:
+            for t in e.work_titles:
+                t = t.strip('.')
+                pool.update(k for k in title_to_key.get(t.lower(), []) if k not in seen)
+                found_titles.setdefault(t, []).append(key)
+        if e.other_titles:
+            for t in e.other_titles:
+                t = t.strip('.')
+                pool.update(k for k in title_to_key.get(t.lower(), []) if k not in seen)
+                found_titles.setdefault(t, []).append(key)
+
+    print('<table>')
+    for e in sorted(editions, key=lambda e: e['publish_date'] and e['publish_date'][-4:]):
+        print('<tr>')
+        print('<td>', ol_link(e['key']), '</td>')
+        print('<td>', e['publish_date'], '</td><td>', e['publishers'], '</td><td>', e['isbn'], '</td>')
+        print('</tr>')
+    print('</table>')
+
+    if found_titles:
+        print('<h2>Other titles</h2>')
+        print('<ul>')
+        for k, v in found_titles.iteritems():
+            if k == title:
+                continue
+            print('<li><a href="?title=%s&author=%s">%s</a>' % (k, author, k), end=' ')
+            print('from', ', '.join(ol_link(i) for i in v))
+        print('</ul>')
+
+    extra_isbn = {}
+    for k, v in found_isbn.iteritems():
+        for isbn, note in find_others(k, rc['amazon_other_editions']):
+            if note.lower().find('audio') != -1:
+                continue
+            if isbn not in found_isbn:
+                extra_isbn.setdefault(isbn, []).extend(v)
+
+    if extra_isbn:
+        print('<h2>Other ISBN</h2>')
+        print('<ul>')
+        for k in sorted(extra_isbn):
+            print('<li>', isbn_link(k), end=' ')
+            print('from', ', '.join(ol_link(i) for i in extra_isbn[k]))
+        print('</ul>')
+
+urls = (
+    '/', 'index'
+)
+
+def textbox(name, input):
+    if name in input:
+        return '<input type="text" name="%s" value="%s">' % (name, web.htmlquote(input[name]))
+    else:
+        return '<input type="text" name="%s">' % (name)
+
+class index:
+    def GET(self):
+        web.header('Content-Type','text/html; charset=utf-8', unique=True)
+        input = web.input()
+        title = None
+        author = None
+        if 'title' in input:
+            title = input.title
+        if 'author' in input:
+            author = input.author
+        html_title = 'Work finder'
+        print("<html>\n<head>\n<title>%s</title>" % html_title)
+        print('</head>')
+        print('<body>')
+        print('<form method="get">')
+        print('<table>')
+        print('<tr><td>Title</td><td>', textbox('title', input), '</td></tr>')
+        print('<tr><td>Author</td><td>', textbox('author', input), '</td></tr>')
+        print('</table>')
+        print('<input type="submit" value="search">')
+        print('</form>')
+        if title and author:
+            search(title, author)
+        print('</body>\n</html>')
+
+if __name__ == "__main__": web.run(urls, globals(), web.reloader)