-
Notifications
You must be signed in to change notification settings - Fork 0
/
ThePyScraper.py
101 lines (75 loc) · 4.27 KB
/
ThePyScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import cherrypy
import os.path
import json
import sqlite3
import queue
import threading
import time
from RSS.DBUtils import *
from time import strftime
import feedparser
import concurrent.futures
# Module-wide settings.
DATABASE = "RSS.sqlite"  # File name of the SQLite database.
THREAD_LIMIT = 20  # Upper bound on worker threads used for feed updates.

# First-run bootstrap: the schema is only created when the database file is
# absent. An existing file is left untouched, whatever it contains.
if not os.path.isfile(DATABASE):
    # RSSFeeds is listed (and committed) first because RSSEntries declares a
    # foreign key that references it.
    _schema_statements = (
        "CREATE TABLE IF NOT EXISTS RSSFeeds (feed_id INTEGER PRIMARY KEY AUTOINCREMENT, title VARCHAR(1000), url VARCHAR(1000), category VARCHAR(1000) DEFAULT 'Uncategorized', date DEFAULT (datetime('now')));",
        "CREATE TABLE IF NOT EXISTS RSSEntries (entry_id INTEGER PRIMARY KEY AUTOINCREMENT, feed_id INTEGER, url VARCHAR(1000), title, content, date, FOREIGN KEY(feed_id) REFERENCES RSSFeeds(feed_id));",
    )
    DBConnection = openSQLConnection(DATABASE, 1)  # Open the database connection.
    for _statement in _schema_statements:
        SQLExecute(DBConnection.cursor(), _statement)
        commitSQLConnection(DBConnection)  # Commit each table before moving on.
    closeSQLConnection(DBConnection)  # Done with the bootstrap connection.
else:
    print("Database already exist skipping creation...")
class ThePyScraper(object):
    """CherryPy application exposing the RSS scraper's HTTP endpoints.

    Every handler opens its own database connection through the
    ``openSQLConnection`` helper and closes it in a ``finally`` clause, so a
    failing query or feed fetch no longer leaks the connection.

    NOTE(review): the meaning of the second argument to ``openSQLConnection``
    (``1`` vs ``0``) is not visible in this file - confirm against
    RSS.DBUtils before changing it.
    """

    @cherrypy.expose
    def index(self):
        """Serve the default page.

        Returns the open file object for ``ThePyScraper.html``; the path is
        relative, so it is resolved against the process working directory.
        """
        return open('ThePyScraper.html')  # Return default page.

    @cherrypy.expose
    def addAFeed(self, feed_url):
        """Register a new feed and redirect to its first update.

        Args:
            feed_url: URL supplied by the user; it is handed to
                ``feedparser.parse`` to decide whether it is a feed.

        Raises:
            cherrypy.HTTPRedirect: always. To ``/updateFeeds?feed_id=<id>``
                when a new feed row was inserted, otherwise to ``/`` (the URL
                did not parse cleanly, or a feed with the same title is
                already stored).
        """
        # Parse before opening the connection so that no database handle is
        # held while the feed is being fetched.
        feed_url_parsed = feedparser.parse(feed_url)
        if feed_url_parsed.bozo:  # Non-zero bozo: feed_url was not parsable.
            raise cherrypy.HTTPRedirect('/')
        feed_title = feed_url_parsed.feed.title

        feed_id = None
        DBConnection = openSQLConnection(DATABASE, 1)
        try:
            cursor = DBConnection.cursor()
            # Duplicates are detected by title rather than URL to bypass any
            # URL formatting differences.
            existing = cursor.execute(
                'SELECT feed_id from RSSFeeds WHERE title=?',
                (feed_title,),
            ).fetchone()
            if existing is None:
                cursor.execute(
                    "INSERT INTO RSSFeeds(url, title) VALUES(?,?);",
                    (feed_url, feed_title),
                )
                feed_id = cursor.execute(
                    'SELECT feed_id from RSSFeeds WHERE url=?',
                    (feed_url,),
                ).fetchone()[0]
                commitSQLConnection(DBConnection)  # Persist the new feed.
        finally:
            closeSQLConnection(DBConnection)

        if feed_id is None:
            raise cherrypy.HTTPRedirect('/')
        # Redirect to updateFeeds so the new feed's entries get fetched.
        raise cherrypy.HTTPRedirect('/updateFeeds?feed_id=' + str(feed_id))

    @cherrypy.expose
    @cherrypy.tools.json_out()
    def getPosts(self, LIM):
        """Return one page of stored entries as a JSON string.

        Args:
            LIM: row offset into RSSEntries (ordered by date, descending);
                at most 10 rows are returned per call.

        Returns:
            ``json.dumps`` of a list of dicts with the keys ``url``,
            ``title``, ``content`` and ``date``.

        NOTE(review): ``json_out`` serialises the return value as well, so
        the response body is a JSON-encoded *string*. Kept as-is because the
        front end presumably decodes twice - confirm before simplifying.
        """
        DBConnection = openSQLConnection(DATABASE, 1)
        try:
            rows = DBConnection.cursor().execute(
                'SELECT url, title, content, date FROM RSSEntries ORDER BY date DESC LIMIT 10 OFFSET ?',
                (LIM,),
            ).fetchall()
            commitSQLConnection(DBConnection)
        finally:
            closeSQLConnection(DBConnection)
        # dict(row) assumes the helper yields mapping-style rows
        # (e.g. sqlite3.Row) - TODO confirm in RSS.DBUtils.
        return json.dumps([dict(row) for row in rows])

    @cherrypy.expose
    @cherrypy.tools.json_out()
    def getFeeds(self, LIM=0):
        """Return one page of registered feeds as a JSON string.

        The previous implementation referenced an undefined ``LIM`` and a
        ``content`` column that RSSFeeds does not have, so every call failed.
        ``LIM`` is now an optional parameter and only columns declared in
        the RSSFeeds schema are selected.

        Args:
            LIM: row offset into RSSFeeds (ordered by date, descending);
                at most 10 rows are returned per call. Defaults to 0 so
                existing callers that pass nothing keep working.

        Returns:
            ``json.dumps`` of a list of dicts with the keys ``feed_id``,
            ``url``, ``title``, ``category`` and ``date``.
        """
        DBConnection = openSQLConnection(DATABASE, 1)
        try:
            rows = DBConnection.cursor().execute(
                'SELECT feed_id, url, title, category, date FROM RSSFeeds ORDER BY date DESC LIMIT 10 OFFSET ?',
                (LIM,),
            ).fetchall()
            commitSQLConnection(DBConnection)
        finally:
            closeSQLConnection(DBConnection)
        return json.dumps([dict(row) for row in rows])

    @cherrypy.expose
    def updateFeeds(self, feed_id):
        """Refresh one feed, or all feeds, then redirect to the home page.

        Args:
            feed_id: the string ``"0"`` to update every feed, otherwise the
                id of the single feed to update.

        Raises:
            cherrypy.HTTPRedirect: to ``/`` once all workers have finished
                and the connection has been committed.
        """
        DBConnection = openSQLConnection(DATABASE, 0)
        try:
            cursor = DBConnection.cursor()
            if feed_id == "0":  # If feed_id is 0 update all feeds.
                feeds = cursor.execute(
                    'SELECT feed_id, url, date FROM RSSFeeds'
                ).fetchall()
            else:  # Otherwise update only the feed matching feed_id.
                feeds = cursor.execute(
                    'SELECT feed_id, url, date FROM RSSFeeds WHERE feed_id=?',
                    (feed_id,),
                ).fetchall()

            submitted = []
            # Leaving the ``with`` block waits for every submitted job.
            with concurrent.futures.ThreadPoolExecutor(max_workers=THREAD_LIMIT) as threadPool:
                for info in feeds:
                    print("Launching Thread ID#:{} = {}, {}.".format(
                        info['feed_id'], info['url'], info['date']))
                    # NOTE(review): the same connection object is handed to
                    # every worker; whether that is safe depends on how
                    # openSQLConnection configures it - confirm.
                    future = threadPool.submit(
                        fetchAndProcessFeed,
                        info['feed_id'], info['url'], info['date'],
                        DBConnection,
                    )
                    submitted.append((info['feed_id'], future))

            # Surface worker failures instead of dropping them silently; one
            # broken feed must not stop the others from being committed.
            for worker_feed_id, future in submitted:
                error = future.exception()
                if error is not None:
                    print("Feed ID#:{} failed to update: {!r}".format(
                        worker_feed_id, error))

            commitSQLConnection(DBConnection)
        finally:
            closeSQLConnection(DBConnection)
        raise cherrypy.HTTPRedirect('/')

    @cherrypy.expose
    def quit(self):
        """Shut the server down by calling ``cherrypy.engine.exit()``."""
        cherrypy.engine.exit()
# Locate scripts/server.conf next to this file. The directory is joined as a
# separate path component: the previous string concatenation
# (dirname + "scripts/") produced "<dir>scripts/server.conf" whenever
# __file__ carried a directory part, which is not a valid location.
conf = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts", "server.conf")
# Start the CherryPy server with the application mounted at the root.
cherrypy.quickstart(ThePyScraper(), config=conf)