-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
94 lines (77 loc) · 2.71 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import threading
import time
import feedparser
import requests
import base64
import io
from flask import Flask
from bs4 import BeautifulSoup
from flask import render_template
from wordcloud import WordCloud
# Flask application object; the route below is registered on it.
app = Flask(__name__)

# Module-level state. These containers live for the lifetime of the process
# and are shared by every request and every worker thread.

# hrefs of the navigation links, appended by getCategoryAndTitleList.
categoryList= []
# Link texts of the same anchors, appended alongside categoryList.
categoryTitleList=[]
# URLs in the order in which parse_article was entered for them.
categoryListInThreadOrder=[]
# Titles collected by home() for links found in both categorydict and cloudDict.
categoryTitleListInThreadOrder=[]
# Encoded word clouds collected by home() for those same links.
cloudListInThreadOrder=[]
# Maps href to link text; filled by getCategoryAndTitleList.
categorydict={}
# Maps url to its base64-encoded PNG word cloud; filled by get_wordcloud.
cloudDict ={}
# Paragraph text of every page processed by parse_article.
texts = []
# Every base64-encoded word cloud produced by get_wordcloud.
wordClouds = []
# Not referenced anywhere in the visible part of this file.
LIMIT = 10
def get_wordcloud(text, url):
    """Render *text* as a word cloud and store it as a base64-encoded PNG.

    On success the encoded image is stored in ``cloudDict[url]`` and appended
    to ``wordClouds``. Text that is empty, whitespace-only, or contains no
    usable words is skipped and both containers are left untouched.

    Args:
        text: Plain text to visualise.
        url: Key under which the encoded image is stored in ``cloudDict``.

    Returns:
        None.
    """
    # Joining empty paragraphs with "\n" yields whitespace-only text: it is
    # not '' but there is still nothing to draw.
    if not text or not text.strip():
        print('Passing this text')
        return

    print(' Turning into word cloud')
    try:
        image = WordCloud().generate(text=text).to_image()
    except ValueError:
        # WordCloud raises ValueError when no words are left to plot
        # (for example when the text consists of stopwords only).
        print('Passing this text')
        return

    with io.BytesIO() as buffer:
        image.save(buffer, "PNG")
        img_b64 = base64.b64encode(buffer.getvalue()).decode()

    cloudDict[url] = img_b64
    wordClouds.append(img_b64)
def parse_article(url):
    """Fetch *url*, gather its paragraph text and build its word cloud.

    Side effects: records *url* in ``categoryListInThreadOrder`` on entry,
    appends the extracted text to ``texts`` and hands the text to
    ``get_wordcloud``.
    """
    # Recorded first, before any network work is attempted.
    categoryListInThreadOrder.append(url)
    print("Downloading {}".format(url))

    page = getContentOfSite(url)
    paragraphs = [tag.get_text() for tag in page.find_all('p')]
    body_text = "\n".join(paragraphs)

    texts.append(body_text)
    get_wordcloud(body_text, url)
def getContentOfSite(url, timeout=10):
    """Download *url* and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can wait indefinitely, which would block any
            caller that joins on the thread doing the download.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    response = requests.get(url=url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
# Get all categories of news
def getCategoryAndTitleList(categoriesSoup, id):
    """Collect the navigation links found under the element with the given id.

    Every anchor inside the element whose ``id`` attribute equals *id* is
    recorded in the module-level ``categoryList`` (href), ``categoryTitleList``
    (link text) and ``categorydict`` (href to link text). An href is recorded
    only once, so calling this function repeatedly does not grow the lists
    with duplicates. The ``#orb-footer`` link and anchors without an ``href``
    are ignored.

    Args:
        categoriesSoup: Parsed page to search.
        id: Value of the ``id`` attribute of the container element. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        The tuple ``(categoryList, categoryTitleList)``; these are the
        module-level lists themselves, not copies.
    """
    container = categoriesSoup.find(id=id)
    if container is None:
        # find() returns None when nothing matches; report it instead of
        # failing with an AttributeError on the next call.
        print("No element with id {!r} found".format(id))
        return categoryList, categoryTitleList

    for anchor in container.find_all("a"):
        href = anchor.get('href')
        if href is None or href == "#orb-footer":
            continue
        if href in categorydict:
            # Already recorded, either earlier on this page or by a
            # previous call.
            continue
        # .string is None when the anchor wraps more than one child node.
        title = anchor.string if anchor.string is not None else anchor.get_text()
        categoryTitleList.append(title)
        categoryList.append(href)
        categorydict[href] = title
    return categoryList, categoryTitleList
@app.route('/')
def home():
    """Render the front page with one word cloud per BBC navigation link.

    The navigation links are read from https://www.bbc.com/, each distinct
    link is downloaded and turned into a word cloud on its own thread, and
    the results are passed to the ``home.html`` template as three lists of
    equal length: ``clouds``, ``categoryList`` and ``categoryTitleList``.
    Links for which no title or no cloud is available are reported on stdout
    and left out of all three lists.

    Returns:
        The rendered ``home.html`` template.
    """
    categoriesSoup = getContentOfSite("https://www.bbc.com/")
    categoryList, _ = getCategoryAndTitleList(categoriesSoup, "orb-nav-links")

    # The order list is module-level and only ever appended to by
    # parse_article; empty it in place so this request does not render the
    # entries of earlier requests again.
    categoryListInThreadOrder.clear()

    # One download per distinct link; dict.fromkeys keeps first-seen order.
    urls = list(dict.fromkeys(categoryList))

    start = time.time()
    threads = [threading.Thread(target=parse_article, args=(url,)) for url in urls]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    links = []
    titles = []
    clouds = []
    for categoryLink in list(categoryListInThreadOrder):
        if categoryLink not in categorydict or categoryLink not in cloudDict:
            print(categoryLink, " is not in List.")
            continue
        links.append(categoryLink)
        titles.append(categorydict[categoryLink])
        clouds.append(cloudDict[categoryLink])

    # Keep the module-level result lists in step with this request.
    categoryTitleListInThreadOrder[:] = titles
    cloudListInThreadOrder[:] = clouds

    print("Elapsed Time: %s" % (time.time() - start))
    return render_template('home.html', clouds=clouds, categoryList=links, categoryTitleList=titles)
# When executed as a script, start Flask's built-in server bound to 0.0.0.0.
if __name__ == "__main__":
    app.run(host='0.0.0.0')