-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
93 lines (63 loc) · 3.15 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import requests
from bs4 import BeautifulSoup
def fetch_page_content(page_url):
"""Fetch the content of a web page."""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
}
try:
# Make a GET request to the specified URL.
response = requests.get(page_url, headers=headers, timeout=10)
response.raise_for_status() # Raises an error for non-successful requests
return response.content # Return page content if successful
except requests.exceptions.HTTPError as http_err:
print(f"HTTP error occurred: {http_err}")
except requests.exceptions.Timeout as timeout_err:
print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.ConnectionError:
print("Connection error occurred. Please check your internet connection.")
except requests.exceptions.RequestException as err:
print(f"Other error occurred: {err}")
return None # Explicitly return None if an error occurs
def extract_quotes_and_authors(page_content):
"""Extract quotes and their authors from the page content."""
# Make a soup object
soup = BeautifulSoup(page_content, 'lxml')
# Find all quotes
quotes = soup.find_all('span', class_='text')
# Find author's names
authors = soup.find_all('small', class_='author')
quotes_and_authors = zip(quotes, authors) # Use zip to combine iterables
return quotes_and_authors
def get_number_of_pages():
"""Get the number of pages to scrape from the user."""
while True:
try:
# Get user input for number of pages to scrape
num_of_pages = int(input("How many pages would you like to scrape? "))
return num_of_pages
except ValueError:
print("Invalid input, please enter a valid number.\n")
continue
def scrape_quotes():
"""Main function to scrape quotes from the specified number of pages."""
pages = get_number_of_pages()
# Get the file name from user
file_name = input("\nEnter the name of the file to save the data: ")
# Get the data for each page
for current_page_num in range(1, pages + 1):
page_url = f'https://quotes.toscrape.com/page/{current_page_num}'
print(f"Scraping page {current_page_num}...") # Feedback on progress
# Get the page content
page_content = fetch_page_content(page_url)
# Check if page content was fetched successfully
if page_content:
# Parse or scrape the content
quotes_and_authors = extract_quotes_and_authors(page_content)
# Save all quotes along their author's name to a text file
with open(f"{file_name}.txt", "a") as f: # Use "a" to append to the file
for quote, author in quotes_and_authors:
f.write(f"{quote.text}\n- {author.text}\n\n")
print(f"\nSaving data to {file_name}.txt ...")
if __name__ == '__main__':
scrape_quotes()