-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
174 lines (130 loc) · 5.96 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import requests
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
import urllib.request, json
from tensorflow import keras
from bs4 import BeautifulSoup
from keras_preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""
# Fake News Classifier
This is a simple classifier. There are two options:
1. General News: With this option, the classifier takes in an article and tells you if the article is fake news or not. It was trained on a data set including articles of real and fake news.
2. Covid News/Claims: With this option, the classifier takes in either text input about a covid claim or a covid-related news article url. Based on the claim or url, the classifier will tell you the truthfulness of the claim/article. This classifier was trained on a data set including covid-related articles/claims.
"""
def predict(url, inference):
    """Fetch the news article at *url*, classify it, and render the verdict.

    Args:
        url: Article URL entered by the user; an empty string is a no-op.
        inference: Loaded Keras model whose ``predict`` returns the
            probability that the article is real news.

    Uses the module-level ``tokenizer`` to preprocess the text exactly as the
    training data was preprocessed (maxlen=512, post-padding/truncation).
    """
    if url == "":
        return
    try:
        # Fetch and parse the page once. (The original fetched the URL three
        # times — twice via requests and once via urllib — and discarded two
        # of the responses.)
        html = urllib.request.urlopen(url)
        htmlParse = BeautifulSoup(html, 'html.parser')
        # Getting the title
        title = htmlParse.find('title').get_text()
        # The article body is usually split across <p> tags; join them all.
        final_article = " ".join(p.get_text() for p in htmlParse.find_all("p"))
        url_data = pd.DataFrame({'text': [final_article]})
        # Pad the data similarly to how it was padded with the training data
        sequences = tokenizer.texts_to_sequences(url_data['text'])
        padded = pad_sequences(sequences, maxlen=512, padding='post', truncating='post')
        # Get the model's prediction on url's news article
        pred = inference.predict(padded)
        # Render the verdict as markdown (equivalent to Streamlit's "magic"
        # rendering of a bare f-string, but explicit).
        st.write(f"""
# Article: {title}.
## This article **{'is' if pred < 0.5 else 'is not'}** Fake News.
### Probability of Real News: {pred.item()*100: .2f}%
""")
    except Exception:
        # st.text takes a single body string; the original passed url as a
        # second positional argument, which itself raises a TypeError.
        st.text(f"Problem reading article from {url}")
def covid_predict(url, inference):
    """Classify the covid-related claim/headline found in *url*'s <title>.

    Args:
        url: Covid-related article URL; an empty string is a no-op.
        inference: Loaded Keras model whose ``predict`` returns the
            probability that the claim is truthful.

    Only the page title is classified (the model was trained on short
    claims, hence maxlen=80). Uses the module-level ``tokenizer``.
    """
    if url == "":
        return
    try:
        # Fetch and parse the page once (the original also issued an unused
        # requests.get and a debug print).
        html = urllib.request.urlopen(url)
        htmlParse = BeautifulSoup(html, 'html.parser')
        # Getting the article title/claim
        title = htmlParse.find('title').get_text()
        df = pd.DataFrame({'text': [title]})
        # Pad the data similarly to how it was padded with the training data
        sequences = tokenizer.texts_to_sequences(df['text'])
        padded = pad_sequences(sequences, maxlen=80, padding='post', truncating='post')
        # Get the model's prediction on url's news article
        pred = inference.predict(padded)
        st.write(f"""
# Article/Claim: {title}.
## This article/claim **{'is' if pred < 0.5 else 'is not'}** False.
### Probability of Truthfulness: {pred.item()*100: .2f}%
""")
    except Exception:
        # st.text accepts one body argument; build the message with an
        # f-string instead of passing url positionally (which raised).
        st.text(f"Problem reading article/claim from {url}")
def covid_claim_pred(claim, inference):
    """Classify a user-typed covid claim and render the verdict.

    Args:
        claim: Free-text claim from the user; an empty string is a no-op.
        inference: Loaded Keras model whose ``predict`` returns the
            probability that the claim is truthful.

    Uses the module-level ``tokenizer``; padding mirrors the training
    preprocessing (maxlen=80, post-padding/truncation).
    """
    if claim == "":
        return
    try:
        # Turning the claim into a data frame
        df = pd.DataFrame({'text': [claim]})
        # Pad the data similarly to how it was padded with the training data
        sequences = tokenizer.texts_to_sequences(df['text'])
        padded = pad_sequences(sequences, maxlen=80, padding='post', truncating='post')
        # Get the model's prediction on the claim
        pred = inference.predict(padded)
        st.write(f"""
# Claim: {claim}.
## This claim **{'is' if pred < 0.5 else 'is not'}** False.
### Probability of Truthfulness: {pred.item()*100: .2f}%
""")
    except Exception:
        # st.text takes a single string; the original's two-argument call
        # raised a TypeError instead of showing the message.
        st.text(f"Problem reading user input from {claim}")
# Application Interface: pick a classifier, load its model + tokenizer,
# then dispatch to the matching prediction helper. The tokenizer is bound
# at module level because the predict helpers read it as a global.
option = st.radio("", ["General News", "COVID News"])

if option == "General News":
    url = st.text_input("Please input a url.")
    # General-news model and its fitted tokenizer.
    general_news_inference = keras.models.load_model("code-general/rcnn2-model")
    with open('code-general/rcnn2-tokenizer.json') as f:
        tokenizer = tokenizer_from_json(json.load(f))
    # Classify the article behind the URL.
    predict(url, general_news_inference)
else:
    covid_options = st.radio("", ["URL", "Claim"])
    # Covid model and its fitted tokenizer.
    covid_inference = keras.models.load_model("code-covid/covid-model-1")
    with open('code-covid/covid1-tokenizer.json') as f:
        tokenizer = tokenizer_from_json(json.load(f))
    if covid_options == "URL":
        covid_url = st.text_input("Please input a covid-related url.")
        # Classify the claim in the article's title.
        covid_predict(covid_url, covid_inference)
    else:
        user_input = st.text_input("Please input a covid-related claim.")
        # Classify the claim typed by the user.
        covid_claim_pred(user_input, covid_inference)