-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnews_clf_app.py
119 lines (104 loc) · 3.97 KB
/
news_clf_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import numpy as np
import tensorflow as tf
from tensorflow import keras
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import STOPWORDS
import streamlit as st
# Fetch only the NLTK resources needed by the code in this file: the sole NLTK
# component used is WordNetLemmatizer, which reads the WordNet corpus.
# 'omw-1.4' is included because some NLTK releases require it when loading
# WordNet. Requesting 'all' would pull the entire NLTK collection on every run.
for _nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)
# st components
# The longer page texts live in named constants so the layout code below
# stays short; their content is emitted unchanged.
_INTRO_APPLICATIONS = """Text is one of the most widespread forms of sequence data.
It can be understood as either a sequence of charactors or a sequence of words,
it's most common to work at level of words. Text-sequence processing
includes following applications:
Applications of deep learning for text data:
1. Document classification
2. Articles lebelling
3. Sentiment analysis
4. Author identification
5. Question-answering
6. Language detection
7. Translation Tasks
"""
_INTRO_WORKFLOW = """Typical workflow to prepare text data for machine learning models:
1. Tokenization
2. One-Hot encoding or word indexing
3. Pad sequencing
4. Embedding layer (Word2Vec)
5. Corresponding word vector
"""
_DATASET_FIELDS = """category: Category article belongs to
headline: Headline of the article
authors: Person authored the article
link: Link to the post
short_description: Short description of the article
date: Date the article was published"""

# Create the four page sections up front, in the same order as before.
header, intro, data_info, text_input = (st.container() for _ in range(4))

# Title and tagline.
with header:
    st.title("Welcome to News descriptions classification web application")
    st.text("This is the most simple text classification demo app")

# Background on text-sequence tasks and the usual preprocessing steps.
with intro:
    st.subheader("Introduction to the projecrt")
    st.text(_INTRO_APPLICATIONS)
    st.text(_INTRO_WORKFLOW)

# Description of the dataset and its fields.
with data_info:
    st.subheader("Dataset information and feature description")
    st.text("The dataset file contains 202,372 records. Each json record contains following attributes:")
    st.text(_DATASET_FIELDS)

# Free-text box; its value is classified further down the script.
with text_input:
    input_t = st.text_input("Enter news description here:", value=" ")
# clean the input sequences:
# Regular expressions used by data_cleaning, compiled once next to the function.
_WHITESPACE_RE = re.compile(r"\s+")
_MENTION_RE = re.compile(r"(?i)@[a-z0-9_]+")
_BRACKETED_RE = re.compile(r"\[[^()]*\]")
_DIGITS_RE = re.compile(r"\d+")
_HANDLE_TAG_URL_RE = re.compile(r"(?:@\S*|#\S*|http(?=.*://)\S*)")
_PUNCTUATION_RE = re.compile(r"[^\w\s]")


def data_cleaning(text: str) -> str:
    """Normalise a news description into a space-separated string of lemmas.

    Steps, in order: collapse whitespace runs to a single space, remove
    ``@user`` mentions, remove ``[...]`` spans, remove digits, remove
    remaining ``@``/``#`` tokens and ``http...://`` URLs, strip punctuation,
    lowercase, drop words found in ``STOPWORDS`` and lemmatize each
    remaining word as a verb.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned words joined by single spaces; an empty string when no
        word survives the cleaning.
    """
    text = _WHITESPACE_RE.sub(' ', text)
    text = _MENTION_RE.sub('', text)
    text = _BRACKETED_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # This substitution must run BEFORE punctuation is stripped: once '@',
    # '#' and '://' have been removed the pattern can no longer match
    # anything, which made it a no-op in its previous position.
    text = _HANDLE_TAG_URL_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = text.lower()
    # One lemmatizer for the whole text, and membership is tested directly
    # against STOPWORDS instead of rebuilding a list for every word.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word, 'v')
        for word in text.split()
        if word not in STOPWORDS
    )
# load the dataset to tokenize the sentence
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# One JSON record per line.
df = pd.read_json("News_Category_Dataset_v2.json", lines=True)

# Merge headline and short description into one text column, then discard the
# two source columns.
# NOTE(review): the two fields are joined without a separator; left exactly as
# is because the fitted vocabulary presumably has to match the one the saved
# model was trained with -- confirm before changing.
df['length_of_news'] = df['headline'] + df['short_description']
df.drop(columns=['headline', 'short_description'], inplace=True)

# Row labels of the POLITICS category; the first 16000 of them (at most) are
# left out of the working frame.
politics_list = list(df.index[df['category'] == 'POLITICS'])
list_16 = politics_list[:16000]
ndf = df.drop(index=list_16)

# tokenize the input_t into integer list
tokenizer = keras.preprocessing.text.Tokenizer(num_words=100000, oov_token='<00V>')
tokenizer.fit_on_texts(ndf['length_of_news'])

# encode the labels
encoder = LabelEncoder()
cats = encoder.fit_transform(ndf['category'])
list_cat = list(encoder.classes_)
# Map each encoded label index to its category name.
cat_dict = dict(enumerate(list_cat))
# take input text, clean the text, tokenize it, pad it and feed it to the model
# A missing or empty input is treated as "nothing to clean".
seq = data_cleaning(input_t) if input_t else ""
# load the model for our prediction task
model = tf.keras.models.load_model('new_clf_exp06.h5')
if not seq:
    # Blank input (the text box defaults to a single space), or input with no
    # words left after cleaning: there is nothing to classify, so prompt
    # instead of predicting on an empty batch and indexing into it.
    st.text("Please entered any news description")
else:
    # texts_to_sequences takes a list of texts. Passing the bare string made
    # it treat every character as a separate text, so the prediction was
    # based on the first character only.
    token_seq = tokenizer.texts_to_sequences([seq])
    token_pad = keras.preprocessing.sequence.pad_sequences(token_seq, maxlen=130)
    st.write("You have entered news description as following:\n", input_t)
    output = model.predict(token_pad)
    # Single input text, so the first row holds its class scores.
    output2 = output[0]
    # Index of the highest score, as a plain int for the dictionary lookup.
    arg_max = int(np.argmax(output2))
    output_cat = cat_dict[arg_max]
    st.write(f"News category for your description {input_t} is {output_cat}")