Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ejhumphrey/ismir18 archive #26

Merged
merged 32 commits into from
Mar 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
6b26ff5
working through polish to upload ISMIR18 to zenodo
Nov 11, 2018
fe8c51f
Merge branch 'master' of https://github.com/ismir/conference-archive …
Nov 11, 2018
ebc8c48
Added explicit data models for different entity types
Nov 12, 2018
3f2c6b5
added data model fields and merge method
Nov 13, 2018
97df040
Added current archive flow
Nov 13, 2018
f44e3d9
Updated models and dev/prod defines
Nov 13, 2018
eaa6802
Cleaned conference names
Nov 13, 2018
ca30858
updated zenodo uploader for strict data models
Nov 13, 2018
97e2b43
added database subdir for different conferences
Nov 13, 2018
bf5395e
Moved conference JSON database
Nov 13, 2018
22abebe
Removing stale data
Nov 13, 2018
334e4ff
Updated readme
Nov 16, 2018
a35712b
Renamed zenodo upload script
Nov 16, 2018
6f71820
Fixed naming for zenodo uploader
Nov 16, 2018
d68a47a
renamed metadata exporter
Nov 16, 2018
9debb6e
Updated naming in markdown exporter
Nov 16, 2018
68f91fb
overhauled readme
Nov 16, 2018
d464960
Adds pages to IsmirPaper, updates 2018 records
Nov 20, 2018
7e97a7e
Fixed markdown export sorting to key on pages
Nov 20, 2018
623e990
Added pagesort flag
Nov 20, 2018
6c0bb13
Successful zenodo prod upload
Nov 20, 2018
9480ab8
manually verified export to md works
Nov 21, 2018
c31cb63
rm'ed old scripts
Nov 21, 2018
e1e6567
updates to zen.api and added starter tests
Nov 21, 2018
791cef7
placeholders for zen.models tests
Nov 21, 2018
f463652
Forgot to add conftest
Nov 21, 2018
7d994e5
Finished out models tests
Nov 23, 2018
3f1d3e7
Added placeholder tests for api, need internetz
Nov 23, 2018
b6054b6
Rounded out tests for zen.api
Nov 23, 2018
9ce33f6
first pass at uploader tests, need internetz
Nov 23, 2018
d56316f
Working through remaining tests
Nov 26, 2018
554a7b8
Manually resolved merge conflicts.
Mar 19, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,15 @@ $ ./scripts/export_to_markdown.py \
proceedings.md
```

TODO[@ejhumphrey]: This is forward facing, and the export tools must be updated for the modern record schema.
TODO[@ejhumphrey]: This is forward facing, and the export tools must be updated for the modern record schema.


## Development

### Running Tests

After installing `pytest` and `pytest-cov`, run the tests and check coverage locally:

```bash
$ PYTHONPATH=.:scripts py.test -vs tests --cov zen --cov scripts
```
936 changes: 520 additions & 416 deletions database/proceedings/2018.json

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions scripts/export_to_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@

Or, this can be used with `parallel` to bulk export a number of pages:

$ seq -w 00 17 | \
$ seq -w 00 18 | \
parallel -j4 -v "./scripts/export_to_markdown.py \
data/proceedings-20181003.json \
proceedings/ismir20{}.md --year 20{}"
database/proceedings/2018.json \
assets/md/ismir20{}.md --page_sort"
"""
import argparse
import copy
Expand Down Expand Up @@ -44,38 +44,38 @@ def render_one(record):
else:
authors = record['author']

return ('|{0}<br>**[{title}]({url})** [[pdf]({ee})]|'
.format(authors, **record))
pages = record.pop('pages', '') + ' '

return ('|{0}<br>**[{title}]({url})** {1}[[pdf]({ee})]|'
.format(authors, pages, **record))

def render(records, year=None):

def render(records, year=None, page_sort=False):
if year is not None:
records = filter(lambda x: x['year'] == year, records)

records = sorted(records, key=lambda x: x['@key'])
if page_sort:
records = sorted(records, key=lambda x: int(x['pages'].split('-')[0]))

lines = [render_one(record) for record in records]
return '\n'.join(TEMPLATE + lines)
return '\n'.join([TEMPLATE] + lines)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__)

# Inputs
parser.add_argument("proceedings",
metavar="proceedings", type=str,
parser.add_argument("proceedings", type=str,
help="Path to proceedings records.")
parser.add_argument("output_file",
metavar="output_file", type=str,
parser.add_argument("output_file", type=str,
help="Path to output markdown file.")
parser.add_argument("--page_sort", dest="page_sort", action='store_true',
help="Path to output markdown file.")
parser.add_argument("--year",
metavar="year", type=str, default=None,
help="Year filter for records")

args = parser.parse_args()
proceedings = json.load(open(args.proceedings))

with open(args.output_file, 'w') as fp:
fp.write(render(proceedings.values(), year=args.year))
fp.write(render(proceedings, page_sort=args.page_sort))

sys.exit(0 if os.path.exists(args.output_file) else 1)
21 changes: 11 additions & 10 deletions scripts/extract_pdf_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@
./path/to/abstracts.json

"""
import os
import json
import io
import tempfile
import argparse
import tqdm
from joblib import Parallel, delayed
import json
import io
import os
import pdfminer.high_level
import pdfminer.layout
import pdfminer.settings
from pdfrw import PdfReader, PdfWriter
from pdfrw.findobjs import page_per_xobj
import tempfile
import tqdm


pdfminer.settings.STRICT = False


Expand Down Expand Up @@ -75,10 +77,8 @@ def extract_abstract(raw_text):
if intro_index == -1:
intro_index = raw_text.find('1. INTRODUCTION')

try:
# if no intro index was found, return empty abstract
assert intro_index != -1
except AssertionError:
# if no intro index was found, return empty abstract
if intro_index == -1:
return ''

# post-processing
Expand Down Expand Up @@ -113,8 +113,9 @@ def extract(key, path_pdf):
print('{}: Could not extract abstract.'.format(path_pdf))

# clean up temp file
os.remove(path_tmp_pdf)
os.unlink(path_tmp_pdf)

# TODO: Fix this return object
out = {'@key': key, 'abstract': abstract}

return out
Expand Down
19 changes: 12 additions & 7 deletions scripts/upload_to_zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
$ ./scripts/upload_to_zenodo.py \
data/proceedings.json \
data/conferences.json \
--output_file updated-proceedings.json \
uploaded-proceedings.json \
--stage dev \
--verbose 50 \
--num_cpus -2 \
Expand Down Expand Up @@ -72,14 +72,20 @@ def upload(ismir_paper, conferences, stage=zen.DEV):
upload_response = zen.upload_file(zid, ismir_paper['ee'], stage=stage)
ismir_paper['ee'] = upload_response['links']['download']

# TODO: Should be a package function
zenodo_meta = zen.models.merge(
zen.models.Zenodo, ismir_paper, conf,
creators=zen.models.author_to_creators(ismir_paper['author']),
partof_pages=ismir_paper['pages'],
description=ismir_paper['abstract'])

zen.update_metadata(zid, zenodo_meta.dropna(), stage=stage)
publish_response = zen.publish(zid, stage=stage)
ismir_paper.update(doi=publish_response['doi'], url=publish_response['doi_url'])

ismir_paper.update(doi=publish_response['doi'],
url=publish_response['doi_url'],
zenodo_id=zid)

return ismir_paper


Expand All @@ -100,10 +106,9 @@ def archive(proceedings, conferences, stage=zen.DEV, num_cpus=-2, verbose=0):
parser.add_argument("conferences",
metavar="conferences", type=str,
help="Path to a JSON file of conference metadata.")
parser.add_argument("--output_file",
metavar="--output_file", type=str, default=None,
help="Path to log updated records; if unspecified, "
"will overwrite the input.")
parser.add_argument("output_file",
metavar="output_file", type=str,
help="Path to an output JSON file for writing updated records.")
parser.add_argument("--stage",
metavar="stage", type=str, default=zen.DEV,
help="Stage to execute.")
Expand All @@ -127,7 +132,7 @@ def archive(proceedings, conferences, stage=zen.DEV, num_cpus=-2, verbose=0):

results = archive(proceedings, conferences, args.stage, args.num_cpus, args.verbose)

with open(args.output_file or args.proceedings, 'w') as fp:
with open(args.output_file, 'w') as fp:
json.dump(results, fp, indent=2)

sys.exit(0 if os.path.exists(args.output_file) else 1)
75 changes: 0 additions & 75 deletions scripts/uploader.py

This file was deleted.

23 changes: 23 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest

import os


@pytest.fixture()
def root_dir():
    """Return the parent of the directory containing this file.

    The path is built by appending ``os.path.pardir`` and is not normalised.
    """
    here = os.path.dirname(__file__)
    return os.path.join(here, os.path.pardir)


@pytest.fixture()
def resources_dir():
    """Return the path of the ``resources`` directory next to this file."""
    here = os.path.dirname(__file__)
    return os.path.join(here, 'resources')


@pytest.fixture()
def scripts_dir(root_dir):
    """Return the path of the ``scripts`` directory under ``root_dir``."""
    scripts_path = os.path.join(root_dir, 'scripts')
    return scripts_path


@pytest.fixture()
def pdf_file(resources_dir):
    """Return the path of ``sample.pdf`` inside ``resources_dir``."""
    sample_path = os.path.join(resources_dir, 'sample.pdf')
    return sample_path
18 changes: 18 additions & 0 deletions tests/resources/sample-confs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"1995": {
"conference_dates": "Smarch 13, 1995",
"conference_place": "The Cloud",
"imprint_place": "The Cloud",
"conference_title": "International Society for Music Information Retrieval",
"partof_title": "Proceedings of the International Society for Music Information Retrieval Conference that never happened",
"publication_date": "1995-13-13",
"imprint_isbn": "foo bar",
"conference_acronym": "ISMIR Integration Tests",
"conference_url": "http://github.com/ismir/conference-archive",
"imprint_publisher": "ISMIR",
"upload_type": "publication",
"publication_type": "conferencepaper",
"access_right": "open",
"license": "CC-BY-4.0"
}
}
13 changes: 13 additions & 0 deletions tests/resources/sample-papers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
{
"title": "Sample ISMIR Upload",
"author": "ISMIR Webmaster",
"year": "1995",
"doi": null,
"url": "",
"ee": "./tests/resources/sample.pdf",
"abstract": "This is a sample pdf uploaded via the conference-archive integration tests. Please contact webmaster@ismir.net if something bad or unexpected has occurred.",
"zenodo_id": null,
"dblp_key": "conf/ismir/Sample1995"
}
]
Binary file added tests/resources/sample.pdf
Binary file not shown.
36 changes: 36 additions & 0 deletions tests/test_extract_pdf_abstract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pytest

import os
import shutil

import extract_pdf_abstract


def test_extract_pdf_abstract_extract_first_page(pdf_file, tmpdir):
    """Check that ``extract_first_page`` produces a file for the sample PDF.

    The produced file is copied into pytest's ``tmpdir`` and then removed
    from its original location so repeated runs do not leave stray files.
    """
    tmp_file = extract_pdf_abstract.extract_first_page(pdf_file)
    try:
        assert os.path.exists(tmp_file)
        # Keep a copy under pytest's managed tmpdir.
        shutil.copy(tmp_file, str(tmpdir))
    finally:
        # NOTE(review): extract_first_page appears to leave cleanup to its
        # caller (the extraction script unlinks the file itself) -- confirm.
        if tmp_file and os.path.exists(tmp_file):
            os.remove(tmp_file)


def test_extract_pdf_abstract_extract_text(pdf_file, tmpdir):
    """The text extracted from the sample PDF must be longer than 1000."""
    extracted_length = len(extract_pdf_abstract.extract_text(pdf_file))
    assert extracted_length > 1000


def test_extract_pdf_abstract_extract_abstract():
    """Check abstract extraction on text with and without an abstract.

    The first input has an ABSTRACT marker, a hyphenated line break and an
    introduction heading; the second has none of these and must yield ''.
    """
    with_abstract = 'foo barr ABSTRACT here\nis the abst-\nract 1. INTRODUCTION and the rest'
    without_abstract = 'there is no abstract'

    extracted = extract_pdf_abstract.extract_abstract(with_abstract)
    assert extracted == 'here is the abstract'

    assert extract_pdf_abstract.extract_abstract(without_abstract) == ''


def test_extract_pdf_abstract_extract_extract():
    """Placeholder: no assertions yet (presumably meant to cover ``extract``)."""
    return None


def test_extract_pdf_abstract_main():
    """Placeholder: no assertions yet (presumably meant to cover ``main``)."""
    return None


def test_extract_pdf_abstract_cli():
    """Placeholder: no assertions yet (presumably meant to cover the CLI)."""
    return None
Loading