Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #41

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions SpiderKeeper/app/proxy/contrib/scrapy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime, time
import datetime
import time

import requests

Expand Down Expand Up @@ -31,7 +32,9 @@ def get_project_list(self):

def delete_project(self, project_name):
    """Ask the scrapyd server to remove *project_name*; return True on success."""
    payload = dict(project=project_name)
    response = request(
        "post", self._scrapyd_url() + "/delproject.json", data=payload, return_type="json"
    )
    # A missing/failed response or a non-'ok' status both count as failure.
    return bool(response and response['status'] == 'ok')

def get_spider_list(self, project_name):
Expand All @@ -57,22 +60,32 @@ def get_job_list(self, project_name, spider_status=None):
for item in data[self.spider_status_name_dict[_status]]:
start_time, end_time = None, None
if item.get('start_time'):
start_time = datetime.datetime.strptime(item['start_time'], '%Y-%m-%d %H:%M:%S.%f')
start_time = datetime.datetime.strptime(
item['start_time'], '%Y-%m-%d %H:%M:%S.%f'
)
if item.get('end_time'):
end_time = datetime.datetime.strptime(item['end_time'], '%Y-%m-%d %H:%M:%S.%f')
result[_status].append(dict(id=item['id'], start_time=start_time, end_time=end_time))
end_time = datetime.datetime.strptime(
item['end_time'], '%Y-%m-%d %H:%M:%S.%f'
)
result[_status].append(
dict(id=item['id'], start_time=start_time, end_time=end_time)
)
return result if not spider_status else result[spider_status]

def start_spider(self, project_name, spider_name, arguments):
    """Schedule *spider_name* of *project_name* on scrapyd; return the job id, or None on failure."""
    payload = dict(project=project_name, spider=spider_name)
    payload.update(arguments)
    response = request(
        "post", self._scrapyd_url() + "/schedule.json", data=payload, return_type="json"
    )
    if response and response['status'] == 'ok':
        return response['jobid']
    return None

def cancel_spider(self, project_name, job_id):
    """Cancel a running scrapyd job.

    :param project_name: name of the project the job belongs to.
    :param job_id: scrapyd job id to cancel.
    :return: True if the scrapyd server returned any JSON response, else False.
    """
    post_data = dict(project=project_name, job=job_id)
    data = request(
        "post", self._scrapyd_url() + "/cancel.json", data=post_data, return_type="json"
    )
    # Fix: compare with None by identity (`is not`), never with `!=`.
    return data is not None

def deploy(self, project_name, file_path):
with open(file_path, 'rb') as f:
Expand Down
19 changes: 11 additions & 8 deletions SpiderKeeper/app/proxy/spiderctrl.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import datetime
import random
from functools import reduce

from SpiderKeeper.app import db
from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority
from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, \
JobPriority


class SpiderServiceProxy(object):
Expand Down Expand Up @@ -87,7 +87,8 @@ def delete_project(self, project):
spider_service_instance.delete_project(project.project_name)

def get_spider_list(self, project):
    """Fetch the spider list for *project* from the first service instance.

    Each returned spider instance is tagged with the project's id.
    """
    first_service = self.spider_service_instances[0]
    spiders = first_service.get_spider_list(project.project_name)
    for spider in spiders:
        spider.project_id = project.id
    return spiders
Expand Down Expand Up @@ -119,7 +120,7 @@ def sync_job_status(self, project):
db.session.commit()

def start_spider(self, job_instance):
project = Project.find_project_by_id(job_instance.project_id)
project = Project.query.get(job_instance.project_id)
spider_name = job_instance.spider_name
arguments = {}
if job_instance.spider_arguments:
Expand Down Expand Up @@ -154,7 +155,7 @@ def start_spider(self, job_instance):

def cancel_spider(self, job_execution):
job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
project = Project.find_project_by_id(job_instance.project_id)
project = Project.query.get(job_instance.project_id)
for spider_service_instance in self.spider_service_instances:
if spider_service_instance.server == job_execution.running_on:
if spider_service_instance.cancel_spider(project.project_name, job_execution.service_job_execution_id):
Expand All @@ -171,11 +172,13 @@ def deploy(self, project, file_path):

def log_url(self, job_execution):
    """Return the log URL of *job_execution* from the service it ran on.

    Implicitly returns None when no registered service matches ``running_on``.
    """
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.query.get(job_instance.project_id)
    for service in self.spider_service_instances:
        if service.server != job_execution.running_on:
            continue
        return service.log_url(
            project.project_name,
            job_instance.spider_name,
            job_execution.service_job_execution_id,
        )

@property
def servers(self):
Expand Down
14 changes: 9 additions & 5 deletions SpiderKeeper/app/schedulers/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import threading
import time

from SpiderKeeper.app import scheduler, app, agent, db
from SpiderKeeper.app import scheduler, app, agent
from SpiderKeeper.app.spider.model import Project, JobInstance, SpiderInstance


Expand Down Expand Up @@ -51,7 +50,9 @@ def reload_runnable_spider_job_execution():
available_job_ids = set()
# add new job to schedule
for job_instance in JobInstance.query.filter_by(enabled=0, run_type="periodic").all():
job_id = "spider_job_%s:%s" % (job_instance.id, int(time.mktime(job_instance.date_modified.timetuple())))
job_id = "spider_job_%s:%s" % (
job_instance.id, int(time.mktime(job_instance.date_modified.timetuple()))
)
available_job_ids.add(job_id)
if job_id not in running_job_ids:
scheduler.add_job(run_spider_job,
Expand All @@ -67,8 +68,11 @@ def reload_runnable_spider_job_execution():
max_instances=999,
misfire_grace_time=60 * 60,
coalesce=True)
app.logger.info('[load_spider_job][project:%s][spider_name:%s][job_instance_id:%s][job_id:%s]' % (
job_instance.project_id, job_instance.spider_name, job_instance.id, job_id))
app.logger.info(
'[load_spider_job][project:%s][spider_name:%s][job_instance_id:%s][job_id:%s]' % (
job_instance.project_id, job_instance.spider_name, job_instance.id, job_id
)
)
# remove invalid jobs
for invalid_job_id in filter(lambda job_id: job_id.startswith("spider_job_"),
running_job_ids.difference(available_job_ids)):
Expand Down
18 changes: 6 additions & 12 deletions SpiderKeeper/app/spider/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ class SpiderCtrl(flask_restful.Resource):
"dataType": 'int'
}])
def get(self, project_id):
    """Return every spider instance of *project_id* as a list of dicts."""
    spiders = SpiderInstance.query.filter_by(project_id=project_id).all()
    return [spider.to_dict() for spider in spiders]

Expand Down Expand Up @@ -452,7 +451,7 @@ def inject_project():
project = Project.query.first()
session['project_id'] = project.id
if session.get('project_id'):
project_context['project'] = Project.find_project_by_id(session['project_id'])
project_context['project'] = Project.query.get(session['project_id'])
project_context['spider_list'] = [spider_instance.to_dict() for spider_instance in
SpiderInstance.query.filter_by(project_id=session['project_id']).all()]
else:
Expand Down Expand Up @@ -517,7 +516,7 @@ def project_create():

@app.route("/project/<project_id>/delete")
def project_delete(project_id):
project = Project.find_project_by_id(project_id)
project = Project.query.get(project_id)
agent.delete_project(project)
db.session.delete(project)
db.session.commit()
Expand All @@ -536,7 +535,6 @@ def job_dashboard(project_id):

@app.route("/project/<project_id>/job/periodic")
def job_periodic(project_id):
project = Project.find_project_by_id(project_id)
job_instance_list = [job_instance.to_dict() for job_instance in
JobInstance.query.filter_by(run_type="periodic", project_id=project_id).all()]
return render_template("job_periodic.html",
Expand All @@ -545,7 +543,6 @@ def job_periodic(project_id):

@app.route("/project/<project_id>/job/add", methods=['post'])
def job_add(project_id):
project = Project.find_project_by_id(project_id)
job_instance = JobInstance()
job_instance.spider_name = request.form['spider_name']
job_instance.project_id = project_id
Expand Down Expand Up @@ -602,9 +599,9 @@ def job_run(project_id, job_instance_id):
return redirect(request.referrer, code=302)


@app.route("/project/<project_id>/job/<job_instance_id>/remove")
def job_remove(project_id, job_instance_id):
job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first()
@app.route("/job/<job_instance_id>/remove")
def job_remove(job_instance_id):
job_instance = JobInstance.query.get(job_instance_id)
db.session.delete(job_instance)
db.session.commit()
return redirect(request.referrer, code=302)
Expand All @@ -627,13 +624,12 @@ def spider_dashboard(project_id):

@app.route("/project/<project_id>/spider/deploy")
def spider_deploy(project_id):
project = Project.find_project_by_id(project_id)
return render_template("spider_deploy.html")


@app.route("/project/<project_id>/spider/upload", methods=['post'])
def spider_egg_upload(project_id):
project = Project.find_project_by_id(project_id)
project = Project.query.get(project_id)
if 'file' not in request.files:
flash('No file part')
return redirect(request.referrer)
Expand All @@ -654,13 +650,11 @@ def spider_egg_upload(project_id):

@app.route("/project/<project_id>/project/stats")
def project_stats(project_id):
project = Project.find_project_by_id(project_id)
run_stats = JobExecution.list_run_stats_by_hours(project_id)
return render_template("project_stats.html", run_stats=run_stats)


@app.route("/project/<project_id>/server/stats")
def service_stats(project_id):
project = Project.find_project_by_id(project_id)
run_stats = JobExecution.list_run_stats_by_hours(project_id)
return render_template("server_stats.html", run_stats=run_stats)
17 changes: 8 additions & 9 deletions SpiderKeeper/app/spider/model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
from sqlalchemy import desc
from sqlalchemy.orm import relation
from SpiderKeeper.app import db, Base


Expand All @@ -16,10 +17,6 @@ def load_project(cls, project_list):
db.session.add(project)
db.session.commit()

@classmethod
def find_project_by_id(cls, project_id):
    # Look up a Project row by primary key; returns None when no row matches.
    return Project.query.filter_by(id=project_id).first()

def to_dict(self):
return {
"project_id": self.id,
Expand All @@ -36,8 +33,9 @@ class SpiderInstance(Base):
@classmethod
def update_spider_instances(cls, project_id, spider_instance_list):
for spider_instance in spider_instance_list:
existed_spider_instance = cls.query.filter_by(project_id=project_id,
spider_name=spider_instance.spider_name).first()
existed_spider_instance = cls.query.filter_by(
project_id=project_id, spider_name=spider_instance.spider_name
).first()
if not existed_spider_instance:
db.session.add(spider_instance)
db.session.commit()
Expand Down Expand Up @@ -152,15 +150,16 @@ class JobExecution(Base):

project_id = db.Column(db.INTEGER, nullable=False, index=True)
service_job_execution_id = db.Column(db.String(50), nullable=False, index=True)
job_instance_id = db.Column(db.INTEGER, nullable=False, index=True)
job_instance_id = db.Column(db.INTEGER,
db.ForeignKey('sk_job_instance.id'), nullable=False, index=True)
job_instance = relation(JobInstance)
create_time = db.Column(db.DATETIME)
start_time = db.Column(db.DATETIME)
end_time = db.Column(db.DATETIME)
running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING)
running_on = db.Column(db.Text)

def to_dict(self):
job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first()
return {
'project_id': self.project_id,
'job_execution_id': self.id,
Expand All @@ -171,7 +170,7 @@ def to_dict(self):
'end_time': self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else None,
'running_status': self.running_status,
'running_on': self.running_on,
'job_instance': job_instance.to_dict() if job_instance else {}
'job_instance': self.job_instance
}

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion SpiderKeeper/app/templates/job_dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ <h1>Job Dashboard</h1>
top: 15px;
right: 10px;">
<button type="button" class="btn btn-success btn-flat" style="margin-top: -10px;" data-toggle="modal"
data-target="#job-run-modal">RunOnce
data-target="#job-run-modal">Run Once
</button>
</ol>
{% endblock %}
Expand Down
4 changes: 2 additions & 2 deletions SpiderKeeper/app/templates/job_periodic.html
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ <h3 class="box-title">Periodic jobs (Spiders)</h3>
<td>
<a href="/project/{{ project.id }}/job/{{ job_instance.job_instance_id }}/run"><span
class="label label-info">Run</span></a>
<a href="/project/{{ project.id }}/job/{{ job_instance.job_instance_id }}/remove"><span
class="label label-danger">Remove</span></a>
<a href="{{ url_for('job_remove', job_instance_id=job_instance.job_instance_id) }}">
<span class="label label-danger">Remove</span></a>
</td>
</tr>
{% endfor %}
Expand Down
5 changes: 3 additions & 2 deletions SpiderKeeper/app/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
def project_path():
    """Return the absolute path of the package root.

    Resolves the directory containing this file via ``inspect`` and steps
    one level up, so the result is stable regardless of the caller's cwd.
    """
    # One import per line (PEP 8); kept function-local as in the original.
    import os
    import inspect
    this_file = inspect.getfile(inspect.currentframe())
    return os.path.abspath(os.path.dirname(this_file) + '/../')
6 changes: 4 additions & 2 deletions SpiderKeeper/app/util/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@ def request(request_type, url, data=None, retry_times=5, return_type="text"):
res = request_get(url, retry_times)
if request_type == 'post':
res = request_post(url, data, retry_times)
if not res: return res
if return_type == 'text': return res.text
if not res:
return res
if return_type == 'text':
return res.text
if return_type == 'json':
try:
res = res.json()
Expand Down