Adds WARC upload backend, combining indexing, sorting, and replacement for replay for #436
oduwsdl · Aug 10, 2018 · 797a091 · ibnesayeed · Aug 10, 2018 · machawk1
1 parent 8d90351
commit 797a091
Show file tree

Hide file tree

Showing 4 changed files with 122 additions and 12 deletions.
diff --git a/ipwb/indexer.py b/ipwb/indexer.py
@@ -31,6 +31,7 @@
 from six.moves import input
 
 from util import IPFSAPI_HOST, IPFSAPI_PORT
+from util import generateCDXJMetadata
 
 # from warcio.archiveiterator import ArchiveIterator
 
@@ -278,18 +279,6 @@ def getCDXJLinesFromFile(warcPath, **encCompOpts):
         return cdxjLines
 
 
-def generateCDXJMetadata(cdxjLines=None):
-    metadata = ['!context ["http://tools.ietf.org/html/rfc7089"]']
-    metaVals = {
-        'generator': "InterPlanetary Wayback v.{0}".format(ipwbVersion),
-        'created_at': '{0}'.format(datetime.datetime.now().isoformat())
-    }
-    metaVals = '!meta {0}'.format(json.dumps(metaVals))
-    metadata.append(metaVals)
-
-    return metadata
-
-
 def askUserForEncryptionKey():
     if DEBUG:  # Allows testing instead of requiring a user prompt
         return 'ipwb'

diff --git a/ipwb/replay.py b/ipwb/replay.py
@@ -19,6 +19,8 @@
 import surt
 import re
 import signal
+import random
+import string
 
 from pywb.utils.canonicalize import unsurt
 
@@ -40,6 +42,8 @@
 from util import IPFSAPI_HOST, IPFSAPI_PORT, IPWBREPLAY_HOST, IPWBREPLAY_PORT
 from util import INDEX_FILE
 
+import indexer
+
 from base64 import b64decode
 from Crypto.Cipher import AES
 from Crypto.Util.Padding import pad
@@ -50,7 +54,15 @@
 from __init__ import __version__ as ipwbVersion
 
 
+from flask import flash, url_for
+from werkzeug.utils import secure_filename
+from flask import send_from_directory
+
+# Directory where uploaded WARC files are saved (exposed to the app below
+# as app.config['UPLOAD_FOLDER'])
+UPLOAD_FOLDER = '/tmp'
+# File name extensions, without the leading dot, that allowed_file()
+# checks an uploaded file name against
+ALLOWED_EXTENSIONS = set(['warc', 'warc.gz'])
+
 app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 app.debug = False
 
 IPFS_API = ipfsapi.Client(IPFSAPI_HOST, IPFSAPI_PORT)
@@ -62,6 +74,62 @@ def setServerHeader(response):
     return response
 
 
+def allowed_file(filename):
+    """Report whether filename ends with one of ALLOWED_EXTENSIONS.
+
+    The comparison is case-insensitive and matches the whole dotted suffix,
+    so compound extensions such as 'warc.gz' are recognized. Splitting on
+    the last dot alone would only ever yield 'gz', which is not an allowed
+    extension on its own.
+
+    Args:
+        filename: The file name supplied with the upload.
+
+    Returns:
+        True if filename ends with '.' followed by an allowed extension,
+        False otherwise.
+    """
+    lowered = filename.lower()
+    return any(lowered.endswith('.' + ext) for ext in ALLOWED_EXTENSIONS)
+
+
+@app.route('/upload', methods=['GET', 'POST'])
+def upload_file():
+    """Accept a WARC upload, index it, and merge it into the replay index.
+
+    On a POST carrying a 'file' part whose name passes allowed_file(), the
+    upload is saved under app.config['UPLOAD_FOLDER'], indexed into a new
+    CDXJ file, joined with the current index (app.cdxjFilePath) into a
+    combined CDXJ file, and the replay state (app.cdxjFilePath,
+    app.cdxjFileContents and the stored replay index path) is pointed at
+    the combined file.
+
+    Returns:
+        A redirect to '/' on success; a redirect back to the request URL
+        when the file part is missing or has an empty file name; otherwise
+        (non-POST request or disallowed file name) the plain message
+        'Upload failed, send POST'.
+    """
+    # Local import keeps this handler self-contained
+    import tempfile
+
+    if request.method != 'POST':
+        return 'Upload failed, send POST'
+
+    # check if the post request has the file part
+    if 'file' not in request.files:
+        flash('No file part')
+        return redirect(request.url)
+    uploaded = request.files['file']
+    # if user does not select file, browser also
+    # submit an empty part without filename
+    if uploaded.filename == '':
+        flash('No selected file')
+        return redirect(request.url)
+    if not (uploaded and allowed_file(uploaded.filename)):
+        return 'Upload failed, send POST'
+
+    uploadDir = app.config['UPLOAD_FOLDER']
+    filename = secure_filename(uploaded.filename)
+    warcPath = os.path.join(uploadDir, filename)
+    uploaded.save(warcPath)
+
+    # mkstemp guarantees unique, freshly created files; hand-rolled random
+    # names can collide and are predictable in a shared temp directory.
+    # Only the paths are needed here, so the descriptors are closed at once.
+    fd, cdxjPath = tempfile.mkstemp(suffix='.cdxj', dir=uploadDir)
+    os.close(fd)
+    fd, combinedcdxjPath = tempfile.mkstemp(suffix='.cdxj', dir=uploadDir)
+    os.close(fd)
+
+    # Check if semaphore lock exists
+    # Index file, produce new.cdxj
+    print('Indexing file from uploaded WARC at {0} to {1}'.format(
+        warcPath, cdxjPath))
+    indexer.indexFileAt(warcPath, outfile=cdxjPath)
+    print('index created at {0}'.format(cdxjPath))
+
+    # Create semaphore
+    # Join current.cdxj w/ new.cdxj, write to combined.cdxj
+    print('* Prior index file: ' + app.cdxjFilePath)
+    print('* Index file of new WARC: ' + cdxjPath)
+    print('* Combined index file (to-write): ' + combinedcdxjPath)
+    ipwbUtils.joinCDXJFiles(
+        app.cdxjFilePath, cdxjPath, combinedcdxjPath)
+    print('Setting ipwb replay index variables')
+
+    # Set replay.index to path of combined.cdxj
+    ipwbUtils.setIPWBReplayIndexPath(combinedcdxjPath)
+    app.cdxjFilePath = combinedcdxjPath
+    app.cdxjFileContents = getIndexFileContents(combinedcdxjPath)
+
+    # Release lock
+    # Restart replay system?
+
+    return redirect('/')
+
+
 @app.route('/webui/<path:path>')
 def showWebUI(path):
     """ Handle requests for the IPWB replay Web interface and requests

diff --git a/ipwb/util.py b/ipwb/util.py
@@ -18,6 +18,7 @@
 import datetime
 import logging
 import platform
+import shutil
 
 import urllib2
 import json
@@ -70,6 +71,48 @@ def isValidCDXJ(stringIn):  # TODO: Check specific strict syntax
     return True
 
 
+def generateCDXJMetadata(cdxjLines=None):
+    """Build the metadata header lines for a CDXJ file.
+
+    Args:
+        cdxjLines: Accepted for callers that pass the index lines; it is
+            not consulted when building the header.
+
+    Returns:
+        A two-element list: the '!context' line, followed by a '!meta'
+        line whose JSON payload holds the generator string (with the ipwb
+        version) and the current local time in ISO format.
+    """
+    contextLine = '!context ["http://tools.ietf.org/html/rfc7089"]'
+
+    generator = "InterPlanetary Wayback v.{0}".format(ipwbVersion)
+    createdAt = '{0}'.format(datetime.datetime.now().isoformat())
+    metaJSON = json.dumps({
+        'generator': generator,
+        'created_at': createdAt
+    })
+
+    return [contextLine, '!meta {0}'.format(metaJSON)]
+
+
+def joinCDXJFiles(cdxjPath1, cdxjPath2, outputFilePath):
+    """Merge two CDXJ index files into one sorted, de-duplicated file.
+
+    Blank lines and existing metadata lines (those starting with '!') from
+    both inputs are dropped, the remaining lines are de-duplicated and
+    sorted, and a fresh metadata header from generateCDXJMetadata() is
+    prepended. The result is written to outputFilePath joined by '\n',
+    with no trailing newline.
+
+    NOTE: only byte-identical lines are de-duplicated. Two lines sharing a
+    SURT URI and datetime but differing in their JSON are both kept, so
+    cdxjPath2 does not take precedence over cdxjPath1.
+
+    Args:
+        cdxjPath1: Path of the first CDXJ file to read.
+        cdxjPath2: Path of the second CDXJ file to read.
+        outputFilePath: Path the merged CDXJ is written to. Both inputs
+            are read in full before this file is opened for writing.
+    """
+    # Read each input on its own instead of concatenating raw bytes: when
+    # the first file lacks a trailing newline (as files written by this
+    # function do), concatenation would fuse its last line with the first
+    # line of the second file.
+    uniqueLines = set()
+    for cdxjPath in (cdxjPath1, cdxjPath2):
+        with open(cdxjPath, 'r') as fd:
+            uniqueLines.update(fd.read().split('\n'))
+
+    # De-dupe (via the set) and sort, needed for CDXJ adherence; blank
+    # lines and old metadata lines are discarded
+    cdxjLines = sorted(line for line in uniqueLines
+                       if line and not line.startswith('!'))
+
+    # Prepend metadata
+    cdxjLines = generateCDXJMetadata(cdxjLines) + cdxjLines
+
+    with open(outputFilePath, 'w') as wfd:
+        wfd.write('\n'.join(cdxjLines))
+
+
 def isValidCDXJLine(cdxjLine):
     try:
         (surtURI, datetime, jsonData) = cdxjLine.split(' ', 2)

diff --git a/ipwb/webui/index.html b/ipwb/webui/index.html
@@ -17,6 +17,7 @@
 </script>
 </head>
 <body>
+
 <div id="wrapper">
   <div>
     <h1><img src="./webui/logo.png" alt="ipwb" /></h1>
@@ -35,6 +36,15 @@ <h1><img src="./webui/logo.png" alt="ipwb" /></h1>
       <p class="centered topSpace"><a id="webui" target="_blank">IPFS WebUI</a></p>
       <p class="centered"><a href="https://github.com/oduwsdl/ipwb/" target="_blank">IPWB Help</a></p>
   </details>
+      <div style="margin: auto; background-color: #eee; border: 1px solid #999; width: 400px;">
+          <label style="width: 100%; background-color: #999; display: block;">Upload WARC</label>
+        <form method="post" action="/upload" enctype="multipart/form-data">
+          <input type="file" name="file" style="display: inline-block;">
+          <input type="submit" value="Upload" style="display: inline-block;">
+        </form>
+      </div>
+
+
   </footer>
     <div id="uris" class="hidden">
     <h3 id="urisHeader"><abbr title="Uniform Resource Identifiers">URIs</abbr> locally available</h3>