-
Notifications
You must be signed in to change notification settings - Fork 0
/
quotes.py
52 lines (40 loc) · 1.72 KB
/
quotes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import streamlit as st
import pandas as pd
from bs4 import BeautifulSoup
import requests
# --- Page chrome: title, rainbow-underlined header, and a blank spacer line ---
st.title("MX screaning test")
st.header(
    "Web scraping streamlit application",
    divider="rainbow",
)
st.write("")  # empty write, used only as vertical spacing below the header
def get_tags():
    """Return the tag names linked from the tags box on the quotes.toscrape.com home page.

    Returns:
        list[str]: The stripped text of every ``<a>`` inside the
        ``div.col-md-4.tags-box`` element, in document order. An empty list
        is returned when that element is not present in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get('https://quotes.toscrape.com/', timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    code = BeautifulSoup(response.text, 'html.parser')
    tags_div = code.find('div', {'class': 'col-md-4 tags-box'})
    if tags_div is None:
        # Markup changed or the box is missing: no tags rather than an
        # AttributeError on ``None.find_all``.
        return []
    return [x.text.strip() for x in tags_div.find_all('a')]
# Fetch the available tags and let the user pick one from a dropdown.
all_tag_names = get_tags()
choosen_tag = st.selectbox(
    label='Choose a tag',
    options=all_tag_names,
    index=None,  # no option is preselected; the guard below handles the empty case
)

# Build and echo the tag-specific URL only once a tag has been chosen.
if choosen_tag:
    to_scrape_url = f"https://quotes.toscrape.com/tag/{choosen_tag}/"
    st.write(f'scraping this link >> {to_scrape_url}')
def scraper(pass_link):
    """Scrape the quotes found on the page at *pass_link* into a DataFrame.

    The module-level ``choosen_tag`` is read to fill the ``tag_choosen``
    column, so it must be set before this function is called.

    Args:
        pass_link: URL of the page to download and parse.

    Returns:
        pandas.DataFrame: Columns ``tag_choosen``, ``author``, ``quote`` and
        ``about_author`` (absolute URL of the link following each quote).
        An empty frame with those columns is returned when the page does not
        contain the expected quote container.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    columns = ['tag_choosen', 'author', 'quote', 'about_author']

    # A timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get(pass_link, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    code = BeautifulSoup(response.text, 'html.parser')

    # The quotes are taken from the *second* ``div.col-md-8`` on the page
    # (presumably the first one is the page header -- verify against the
    # site markup). Guard the index so a layout change yields an empty
    # result instead of an IndexError.
    containers = code.find_all('div', {'class': 'col-md-8'})
    if len(containers) < 2:
        return pd.DataFrame(columns=columns)
    quote_div = containers[1]

    # Look the quote spans up once and reuse them for both text and links.
    quote_spans = quote_div.find_all('span', {'class': 'text'})
    all_quotes = [x.get_text(strip=True) for x in quote_spans]
    author = [
        x.get_text(strip=True)
        for x in quote_div.find_all('small', {'class': 'author'})
    ]
    # The first <a> after each quote span is used as the author-page link;
    # its href is site-relative, so prefix it with the site root.
    about_author_urls = [
        'https://quotes.toscrape.com' + x.find_next('a')['href']
        for x in quote_spans
    ]

    return pd.DataFrame({
        'tag_choosen': choosen_tag,  # scalar, broadcast to every row
        'author': author,
        'quote': all_quotes,
        'about_author': about_author_urls,
    })
# Once a tag is selected: scrape its page, show the table, and offer the
# result as a CSV download.
if choosen_tag:
    try:
        scraped_quotes_by_tag = scraper(to_scrape_url)
    except requests.RequestException as exc:
        # Network/HTTP problems are reported in the UI instead of a traceback.
        st.error(f'Could not scrape {to_scrape_url}: {exc}')
    else:
        data_frame = st.dataframe(scraped_quotes_by_tag)

        get_csv = False
        try:
            # Serialise in memory: writing to disk would create the file on
            # the machine running Streamlit, not on the user's computer.
            csv_bytes = scraped_quotes_by_tag.to_csv(index=False).encode('utf-8')
        except Exception:
            # UI boundary: keep the page alive and tell the user, rather than
            # swallowing everything (incl. KeyboardInterrupt) with a bare except.
            st.write('Error in generating CSV')
        else:
            # download_button streams the bytes to the browser as a file.
            get_csv = st.download_button(
                'Download CSV',
                data=csv_bytes,
                file_name=f'{choosen_tag}.csv',
                mime='text/csv',
            )