ismir · stefan-balke · Mar 27, 2019 · Nov 11, 2018 · Nov 11, 2018 · Nov 12, 2018
diff --git a/README.md b/README.md
@@ -128,4 +128,15 @@ $ ./scripts/export_to_markdown.py \
     proceedings.md
 ```
 
-TODO[@ejhumphrey]: This is forward facing, and the export tools must be updated for the modern record schema.
+TODO[@ejhumphrey]: This is forward facing, and the export tools must be updated for the modern record schema.
+
+
+## Development
+
+### Running Tests
+
+After installing `pytest` and `pytest-cov`, run the tests and check coverage locally.
+
+```bash
+$ PYTHONPATH=.:scripts py.test -vs tests --cov zen scripts
+```
diff --git a/database/proceedings/2018.json b/database/proceedings/2018.json
diff --git a/scripts/export_to_markdown.py b/scripts/export_to_markdown.py
@@ -11,10 +11,10 @@
 
 Or, this can be used with `parallel` to bulk export a number of pages:
 
-$ seq -w 00 17 | \
+$ seq -w 00 18 | \
     parallel -j4 -v "./scripts/metadata_to_markdown.py \
-        data/proceedings-20181003.json \
-        proceedings/ismir20{}.md --year 20{}"
+        database/proceedings/2018.json \
+        assets/md/ismir20{}.md --page_sort"
 """
 import argparse
 import copy
@@ -44,38 +44,38 @@ def render_one(record):
     else:
         authors = record['author']
 
-    return ('|{0}<br>**[{title}]({url})** [[pdf]({ee})]|'
-            .format(authors, **record))
+    pages = record.pop('pages', '') + ' '
 
+    return ('|{0}<br>**[{title}]({url})** {1}[[pdf]({ee})]|'
+            .format(authors, pages, **record))
 
-def render(records, year=None):
+
+def render(records, year=None, page_sort=False):
     if year is not None:
         records = filter(lambda x: x['year'] == year, records)
 
-    records = sorted(records, key=lambda x: x['@key'])
+    if page_sort:
+        records = sorted(records, key=lambda x: int(x['pages'].split('-')[0]))
 
     lines = [render_one(record) for record in records]
-    return '\n'.join(TEMPLATE + lines)
+    return '\n'.join([TEMPLATE] + lines)
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description=__doc__)
 
     # Inputs
-    parser.add_argument("proceedings",
-                        metavar="proceedings", type=str,
+    parser.add_argument("proceedings", type=str,
                         help="Path to proceedings records.")
-    parser.add_argument("output_file",
-                        metavar="output_file", type=str,
+    parser.add_argument("output_file", type=str,
+                        help="Path to output markdown file.")
+    parser.add_argument("--page_sort", dest="page_sort", action='store_true',
                         help="Path to output markdown file.")
-    parser.add_argument("--year",
-                        metavar="year", type=str, default=None,
-                        help="Year filter for records")
 
     args = parser.parse_args()
     proceedings = json.load(open(args.proceedings))
 
     with open(args.output_file, 'w') as fp:
-        fp.write(render(proceedings.values(), year=args.year))
+        fp.write(render(proceedings, page_sort=args.page_sort))
 
     sys.exit(0 if os.path.exists(args.output_file) else 1)
diff --git a/scripts/extract_pdf_abstract.py b/scripts/extract_pdf_abstract.py
@@ -11,18 +11,20 @@
     ./path/to/abstracts.json
 
 """
-import os
-import json
-import io
-import tempfile
 import argparse
-import tqdm
 from joblib import Parallel, delayed
+import json
+import io
+import os
 import pdfminer.high_level
 import pdfminer.layout
 import pdfminer.settings
 from pdfrw import PdfReader, PdfWriter
 from pdfrw.findobjs import page_per_xobj
+import tempfile
+import tqdm
+
+
 pdfminer.settings.STRICT = False
 
 
@@ -75,10 +77,8 @@ def extract_abstract(raw_text):
     if intro_index == -1:
         intro_index = raw_text.find('1.  INTRODUCTION')
 
-    try:
-        # if no intro index was found, return empty abstract
-        assert intro_index != -1
-    except AssertionError:
+    # if no intro index was found, return empty abstract
+    if intro_index == -1:
         return ''
 
     # post-processing
@@ -113,8 +113,9 @@ def extract(key, path_pdf):
         print('{}: Could not extract abstract.'.format(path_pdf))
 
     # clean up temp file
-    os.remove(path_tmp_pdf)
+    os.unlink(path_tmp_pdf)
 
+    # TODO: Fix this return object
     out = {'@key': key, 'abstract': abstract}
 
     return out

diff --git a/scripts/upload_to_zenodo.py b/scripts/upload_to_zenodo.py
@@ -17,7 +17,7 @@
 $ ./scripts/upload_to_zenodo.py \
     data/proceedings.json \
     data/conferences.json \
-    --output_file updated-proceedings.json \
+    uploaded-proceedings.json \
     --stage dev \
     --verbose 50 \
     --num_cpus -2 \
@@ -72,14 +72,20 @@ def upload(ismir_paper, conferences, stage=zen.DEV):
     upload_response = zen.upload_file(zid, ismir_paper['ee'], stage=stage)
     ismir_paper['ee'] = upload_response['links']['download']
 
+    # TODO: Should be a package function
     zenodo_meta = zen.models.merge(
         zen.models.Zenodo, ismir_paper, conf,
         creators=zen.models.author_to_creators(ismir_paper['author']),
+        partof_pages=ismir_paper['pages'],
         description=ismir_paper['abstract'])
 
     zen.update_metadata(zid, zenodo_meta.dropna(), stage=stage)
     publish_response = zen.publish(zid, stage=stage)
-    ismir_paper.update(doi=publish_response['doi'], url=publish_response['doi_url'])
+
+    ismir_paper.update(doi=publish_response['doi'],
+                       url=publish_response['doi_url'],
+                       zenodo_id=zid)
+
     return ismir_paper
 
 
@@ -100,10 +106,9 @@ def archive(proceedings, conferences, stage=zen.DEV, num_cpus=-2, verbose=0):
     parser.add_argument("conferences",
                         metavar="conferences", type=str,
                         help="Path to a JSON file of conference metadata.")
-    parser.add_argument("--output_file",
-                        metavar="--output_file", type=str, default=None,
-                        help="Path to log updated records; if unspecified, "
-                             "will overwrite the input.")
+    parser.add_argument("output_file",
+                        metavar="output_file", type=str,
+                        help="Path to an output JSON file for writing updated records.")
     parser.add_argument("--stage",
                         metavar="stage", type=str, default=zen.DEV,
                         help="Stage to execute.")
@@ -127,7 +132,7 @@ def archive(proceedings, conferences, stage=zen.DEV, num_cpus=-2, verbose=0):
 
     results = archive(proceedings, conferences, args.stage, args.num_cpus, args.verbose)
 
-    with open(args.output_file or args.proceedings, 'w') as fp:
+    with open(args.output_file, 'w') as fp:
         json.dump(results, fp, indent=2)
 
     sys.exit(0 if os.path.exists(args.output_file) else 1)
diff --git a/scripts/uploader.py b/scripts/uploader.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,23 @@
+import pytest
+
+import os
+
+
+@pytest.fixture()
+def root_dir():
+    return os.path.join(os.path.dirname(__file__), os.path.pardir)
+
+
+@pytest.fixture()
+def resources_dir():
+    return os.path.join(os.path.dirname(__file__), 'resources')
+
+
+@pytest.fixture()
+def scripts_dir(root_dir):
+    return os.path.join(root_dir, 'scripts')
+
+
+@pytest.fixture()
+def pdf_file(resources_dir):
+    return os.path.join(resources_dir, 'sample.pdf')
diff --git a/tests/resources/sample-confs.json b/tests/resources/sample-confs.json
@@ -0,0 +1,18 @@
+{
+    "1995": {
+        "conference_dates": "Smarch 13, 1995",
+        "conference_place": "The Cloud",
+        "imprint_place": "The Cloud",
+        "conference_title": "International Society for Music Information Retrieval",
+        "partof_title": "Proceedings of the International Society for Music Information Retrieval Conference that never happened",
+        "publication_date": "1995-13-13",
+        "imprint_isbn": "foo bar",
+        "conference_acronym": "ISMIR Integration Tests",
+        "conference_url": "http://github.com/ismir/conference-archive",
+        "imprint_publisher": "ISMIR",
+        "upload_type": "publication",
+        "publication_type": "conferencepaper",
+        "access_right": "open",
+        "license": "CC-BY-4.0"
+    }
+}
diff --git a/tests/resources/sample-papers.json b/tests/resources/sample-papers.json
@@ -0,0 +1,13 @@
+[
+  {
+    "title": "Sample ISMIR Upload",
+    "author": "ISMIR Webmaster",
+    "year": "1995",
+    "doi": null,
+    "url": "",
+    "ee": "./tests/resources/sample.pdf",
+    "abstract": "This is a sample pdf uploaded via the conference-archive integration tests. Please contact webmaster@ismir.net if something bad or unexpected has occurred.",
+    "zenodo_id": null,
+    "dblp_key": "conf/ismir/Sample1995"
+  }
+]
diff --git a/tests/resources/sample.pdf b/tests/resources/sample.pdf
diff --git a/tests/test_extract_pdf_abstract.py b/tests/test_extract_pdf_abstract.py
@@ -0,0 +1,36 @@
+import pytest
+
+import os
+import shutil
+
+import extract_pdf_abstract
+
+
+def test_extract_pdf_abstract_extract_first_page(pdf_file, tmpdir):
+    tmp_file = extract_pdf_abstract.extract_first_page(pdf_file)
+    assert os.path.exists(tmp_file)
+    shutil.copy(tmp_file, str(tmpdir))
+
+
+def test_extract_pdf_abstract_extract_text(pdf_file, tmpdir):
+    all_text = extract_pdf_abstract.extract_text(pdf_file)
+    assert len(all_text) > 1000
+
+
+def test_extract_pdf_abstract_extract_abstract():
+    raw_text = 'foo barr ABSTRACT here\nis the abst-\nract 1. INTRODUCTION and the rest'
+    abstract = extract_pdf_abstract.extract_abstract(raw_text)
+    assert abstract == 'here is the abstract'
+    assert extract_pdf_abstract.extract_abstract('there is no abstract') == ''
+
+
+def test_extract_pdf_abstract_extract_extract():
+    pass
+
+
+def test_extract_pdf_abstract_main():
+    pass
+
+
+def test_extract_pdf_abstract_cli():
+    pass