import pandas as pd
import numpy as np
import os
import re
import operator
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
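# One-time NLTK resource downloads (assuming they are not already installed):
# word_tokenize needs 'punkt', pos_tag needs the perceptron tagger, and the
# stopword list / WordNet lemmatizer need their corpora.
for pkg in ['punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet', 'omw-1.4']:
    nltk.download(pkg, quiet=True)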
df_news = pd.read_json('https://raw.githubusercontent.com/zayedrais/DocumentSearchEngine/master/data/newsgroups.json')
for i, txt in enumerate(df_news['content']):
    # Pull the 'Subject:' header line; the regex is loose, so the match count
    # per post is worth checking (see the diagnostic below).
    subject = re.findall(r'Subject:(.*\n)', txt)
    # Prefix the row number so duplicate subjects stay distinguishable.
    df_news.loc[i, 'Subject'] = (str(i) + ' ' + subject[0]) if subject else 'NA'
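# Diagnostic for the question above (a sketch, not part of the original pipeline):
# count 'Subject:' lines per post. Most posts should have exactly one; 0 falls
# back to 'NA' and >1 keeps only the first match.
match_counts = df_news['content'].str.count(r'Subject:.*\n')
print(match_counts.value_counts())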
df_news['content'] = df_news['content'].str.lower()  # standardize case
def content_cleaner(x):  # clean the content column (x is a pandas Series)
    # Header/boilerplate tokens are removed outright; the '  ' entry strips double spaces.
    sw = ['subject:', 'organization:', 'thanks', 'thank', 're:', r'from:(.*\n)', r'lines:(.*\n)', '  ']
    for sw_item in sw:
        x = x.replace(to_replace=sw_item, value='', regex=True)
    # Punctuation and runs of whitespace become a single space.
    sw = ['-', r'\s+', '[!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~]']
    for sw_item in sw:
        x = x.replace(to_replace=sw_item, value=' ', regex=True)
    return x.str.strip()  # trim leading/trailing whitespace
df_news.content = content_cleaner(df_news.content)
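# Illustrative spot-check (not in the original): confirm header tokens and
# punctuation are gone from one cleaned post.
print(df_news.content.iloc[0][:200])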
def subject_cleaner(x):  # clean the Subject column (x is a pandas Series)
    sw = ['re:', '  ']
    for sw_item in sw:
        x = x.replace(to_replace=sw_item, value='', regex=True)
    sw = [r'\s+', '[!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~]']
    for sw_item in sw:
        x = x.replace(to_replace=sw_item, value=' ', regex=True)
    return x.str.strip()
df_news.Subject = subject_cleaner(df_news.Subject)
# Drop posts whose cleaned content ended up empty, then renumber the index.
df_news = df_news[df_news.content.str.strip().str.len() > 0].reset_index(drop=True)
df_news['Tokens'] = [word_tokenize(entry) for entry in df_news.content]
def wordLemmatizer(data):
    # Map Penn Treebank tag prefixes to WordNet POS, defaulting to noun
    # (e.g. 'JJ' -> ADJ, 'VBD' -> VERB, 'RB' -> ADV, everything else -> NOUN).
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'], tag_map['V'], tag_map['R'] = wn.ADJ, wn.VERB, wn.ADV
    lemmatizer = WordNetLemmatizer()  # build once, not per entry
    stop_words = set(stopwords.words('english'))  # set lookup instead of a list scan per word
    file_clean_k = pd.DataFrame()
    for index, entry in enumerate(data):
        final_words = []
        for word, tag in pos_tag(entry):
            if len(word) > 1 and word not in stop_words and word.isalpha():
                final_words.append(lemmatizer.lemmatize(word, tag_map[tag[0]]))
        # Stored as str() of a list; lemma_cleaner below strips the brackets/quotes.
        file_clean_k.loc[index, 'Keyword_final'] = str(final_words)
    return file_clean_k
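# Tiny usage example (illustrative, not in the original): lemmatize one tokenized sentence.
# Keyword_final should come out as "['cat', 'run', 'quickly']".
print(wordLemmatizer([word_tokenize('the cats are running quickly')]))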
df_clean = wordLemmatizer(df_news['Tokens'][0:10])  # demo on the first 10 posts; the full corpus is slow (precomputed result loaded below)
def lemma_cleaner(x):
    # Turn each stringified list into comma-separated keywords,
    # e.g. "['cat', 'run']" -> "cat,run" (the '\[.' pattern eats the
    # opening bracket plus the quote that follows it).
    sw = [r"\[.", "'", " ", r'\]']
    for sw_item in sw:
        x = x.replace(to_replace=sw_item, value='', regex=True)
    return x
df_clean = lemma_cleaner(df_clean)
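# Spot-check the demo rows: each entry should now read like 'cat,run,quickly'
# rather than a stringified Python list.
print(df_clean['Keyword_final'].head())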
# Lemmatizing the whole corpus takes a while, so instead of attaching the
# 10-row demo above, load the precomputed keywords from the repo.
df = pd.read_json('https://raw.githubusercontent.com/zayedrais/DocumentSearchEngine/master/data/WordLemmatize20NewsGroup.json')
df_news.insert(loc=3, column='Clean_Keyword', value=df['Clean_Keyword'].tolist())
df_news_save = df_news.copy().drop(['Tokens', 'Clean_Keyword'], axis=1)
df_news_save.to_csv("df_news_index.csv", index=False, header=True)
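# TfidfVectorizer is imported above but never used in this excerpt. A minimal
# sketch of the likely next step: fit TF-IDF over the cleaned keywords and rank
# documents against a query by cosine similarity. The search() helper and its
# parameters are illustrative, not part of the original script.
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_news['Clean_Keyword'])

def search(query, top_n=5):
    query_vec = vectorizer.transform([query.lower()])
    scores = cosine_similarity(query_vec, tfidf_matrix).ravel()
    top = np.argsort(scores)[::-1][:top_n]  # indices of the best-scoring posts
    return df_news['Subject'].iloc[top]

print(search('computer graphics'))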
Above is an edited version of the code with the recommended fixes applied.