-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
…t for replay for #436
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
from six.moves import input | ||
|
||
from util import IPFSAPI_HOST, IPFSAPI_PORT | ||
from util import generateCDXJMetadata | ||
|
||
# from warcio.archiveiterator import ArchiveIterator | ||
|
||
|
@@ -278,18 +279,6 @@ def getCDXJLinesFromFile(warcPath, **encCompOpts): | |
return cdxjLines | ||
|
||
|
||
def generateCDXJMetadata(cdxjLines=None):
    """Return the metadata header lines that open a CDXJ index.

    The cdxjLines argument is accepted but not consulted.

    The result is a two-item list: the fixed '!context' line and a '!meta'
    line whose JSON object carries the generator string (built from
    ipwbVersion) and the creation timestamp in ISO format.
    """
    generatorName = "InterPlanetary Wayback v.{0}".format(ipwbVersion)
    timestamp = datetime.datetime.now().isoformat()

    metaJSON = json.dumps({
        'generator': generatorName,
        'created_at': '{0}'.format(timestamp)
    })

    return [
        '!context ["http://tools.ietf.org/html/rfc7089"]',
        '!meta {0}'.format(metaJSON),
    ]
|
||
|
||
def askUserForEncryptionKey(): | ||
if DEBUG: # Allows testing instead of requiring a user prompt | ||
return 'ipwb' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,8 @@ | |
import surt | ||
import re | ||
import signal | ||
import random | ||
import string | ||
|
||
from pywb.utils.canonicalize import unsurt | ||
|
||
|
@@ -40,6 +42,8 @@ | |
from util import IPFSAPI_HOST, IPFSAPI_PORT, IPWBREPLAY_HOST, IPWBREPLAY_PORT | ||
from util import INDEX_FILE | ||
|
||
import indexer | ||
|
||
from base64 import b64decode | ||
from Crypto.Cipher import AES | ||
from Crypto.Util.Padding import pad | ||
|
@@ -50,7 +54,15 @@ | |
from __init__ import __version__ as ipwbVersion | ||
|
||
|
||
from flask import flash, url_for | ||
from werkzeug.utils import secure_filename | ||
from flask import send_from_directory | ||
|
||
# Directory where uploaded WARC files are saved; exposed to the upload view
# through app.config['UPLOAD_FOLDER'] below.
UPLOAD_FOLDER = '/tmp'
# File-name extensions that allowed_file() checks uploaded names against.
ALLOWED_EXTENSIONS = set(['warc', 'warc.gz'])

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.debug = False
|
||
IPFS_API = ipfsapi.Client(IPFSAPI_HOST, IPFSAPI_PORT) | ||
|
@@ -62,6 +74,62 @@ def setServerHeader(response): | |
return response | ||
|
||
|
||
def allowed_file(filename):
    """Return True if `filename` ends with one of ALLOWED_EXTENSIONS.

    The comparison is case-insensitive and is made against the whole
    suffix ('.' + extension) rather than only the text after the last dot.
    Looking at the last component alone turns 'x.warc.gz' into 'gz', which
    can never equal a multi-part entry such as 'warc.gz'.
    """
    lowered = filename.lower()
    return any(lowered.endswith('.' + ext.lower())
               for ext in ALLOWED_EXTENSIONS)
This comment has been minimized.
Sorry, something went wrong.
ibnesayeed
Member
|
||
|
||
|
||
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Accept a WARC upload, index it and merge it into the replay index.

    On a POST whose 'file' part passes allowed_file(), the WARC is saved
    under app.config['UPLOAD_FOLDER'], indexer.indexFileAt() is asked to
    write its index to a new CDXJ file, and that file is joined with the
    current index (app.cdxjFilePath) into a combined CDXJ file, which then
    becomes the active replay index.

    Returns:
        A redirect to the request URL when the 'file' part is missing or
        has an empty filename; a redirect to '/' after a successful upload;
        otherwise the plain string 'Upload failed, send POST'.
    """
    import tempfile  # local import: only this view needs it

    if request.method != 'POST':
        return 'Upload failed, send POST'

    # check if the post request has the file part
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.url)

    upload = request.files['file']
    # if user does not select file, browser also
    # submit an empty part without filename
    if upload.filename == '':
        flash('No selected file')
        return redirect(request.url)

    if not (upload and allowed_file(upload.filename)):
        return 'Upload failed, send POST'

    uploadDir = app.config['UPLOAD_FOLDER']
    filename = secure_filename(upload.filename)
    warcPath = os.path.join(uploadDir, filename)
    upload.save(warcPath)

    # Reserve two unique .cdxj paths. mkstemp creates each file atomically,
    # so concurrent uploads cannot collide on a guessed random name.
    # NOTE(review): assumes indexer.indexFileAt overwrites an existing
    # outfile -- confirm.
    reservedPaths = []
    for _ in range(2):
        fd, reservedPath = tempfile.mkstemp(suffix='.cdxj', dir=uploadDir)
        os.close(fd)  # only the path is needed; writers reopen it
        reservedPaths.append(reservedPath)
    cdxjPath, combinedcdxjPath = reservedPaths

    # Check if semaphore lock exists
    # Index file, produce new.cdxj
    print('Indexing file from uploaded WARC at {0} to {1}'.format(
        warcPath, cdxjPath))
    indexer.indexFileAt(warcPath, outfile=cdxjPath)
    print('index created at {0}'.format(cdxjPath))

    # Create semaphore
    # Join current.cdxj w/ new.cdxj, write to combined.cdxj
    print('* Prior index file: ' + app.cdxjFilePath)
    print('* Index file of new WARC: ' + cdxjPath)
    print('* Combined index file (to-write): ' + combinedcdxjPath)
    ipwbUtils.joinCDXJFiles(
        app.cdxjFilePath, cdxjPath, combinedcdxjPath)
    print('Setting ipwb replay index variables')

    ipwbUtils.setIPWBReplayIndexPath(combinedcdxjPath)
    app.cdxjFilePath = combinedcdxjPath
    app.cdxjFileContents = getIndexFileContents(combinedcdxjPath)

    # Set replay.index to path of combined.cdxj
    # Release lock
    # Restart replay system?

    return redirect('/')
|
||
|
||
@app.route('/webui/<path:path>') | ||
def showWebUI(path): | ||
""" Handle requests for the IPWB replay Web interface and requests | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
import datetime | ||
import logging | ||
import platform | ||
import shutil | ||
|
||
import urllib2 | ||
import json | ||
|
@@ -70,6 +71,48 @@ def isValidCDXJ(stringIn): # TODO: Check specific strict syntax | |
return True | ||
|
||
|
||
def generateCDXJMetadata(cdxjLines=None):
    """Build the two metadata header lines for a CDXJ index.

    cdxjLines is accepted but not consulted.

    Returns a list holding the '!context' line followed by a '!meta' line;
    the latter's JSON payload records the generator (including ipwbVersion)
    and the creation time as an ISO-formatted string.
    """
    contextLine = '!context ["http://tools.ietf.org/html/rfc7089"]'

    metaFields = {
        'generator': "InterPlanetary Wayback v.{0}".format(ipwbVersion),
        'created_at': '{0}'.format(datetime.datetime.now().isoformat())
    }
    metaLine = '!meta {0}'.format(json.dumps(metaFields))

    return [contextLine, metaLine]
|
||
|
||
def joinCDXJFiles(cdxjPath1, cdxjPath2, outputFilePath):
    """Merge two CDXJ files into one sorted, de-duplicated CDXJ file.

    Both inputs are read in full before the output is opened, so
    outputFilePath may safely be the same path as either input. Each
    input is split into lines on its own: concatenating the raw bytes
    first would fuse the last record of cdxjPath1 with the first line of
    cdxjPath2 whenever cdxjPath1 lacks a trailing newline (as the files
    written by this function do).

    Empty lines and existing metadata lines (those starting with '!') are
    dropped, and fresh metadata from generateCDXJMetadata() is prepended.

    Args:
        cdxjPath1: path of the first CDXJ file.
        cdxjPath2: path of the second CDXJ file.
        outputFilePath: path the merged index is written to (overwritten).
    """
    # NOTE(review): the intent recorded here was that CDXJ2 takes
    # precedence in surt uri and datetimes identity; only exact duplicate
    # lines are collapsed, so records differing solely in their JSON
    # payload are both kept.
    uniqueLines = set()
    for path in (cdxjPath1, cdxjPath2):
        with open(path, 'r') as fd:
            uniqueLines.update(fd.read().split('\n'))

    # De-dupe and sort, needed for CDXJ adherence (pulled from indexer.py)
    records = sorted(line for line in uniqueLines
                     if len(line) > 0 and line[0] != '!')

    # Prepend metadata
    cdxjLines = generateCDXJMetadata(records) + records

    with open(outputFilePath, 'w') as wfd:
        wfd.write('\n'.join(cdxjLines))
|
||
|
||
def isValidCDXJLine(cdxjLine): | ||
try: | ||
(surtURI, datetime, jsonData) = cdxjLine.split(' ', 2) | ||
|
1 comment
on commit 797a091
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ibnesayeed Please have a look at these additions relative to #436. I am going to wait until I have fresh eyes/brain before I submit a PR.
Also, disregard styling, layout, and such. This effort was mainly to get the backend in place.
Do we need to move this function into the utils file? We are already importing indexer in replay, so we might be able to use it directly from there.