-
Notifications
You must be signed in to change notification settings - Fork 0
/
tests_bookwyrm.py
117 lines (97 loc) · 5.2 KB
/
tests_bookwyrm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import feedparser
import logging
import requests
from typing import List
import datetime
from bs4 import BeautifulSoup, NavigableString
from rich.console import Console
from rich.logging import RichHandler
from urllib.parse import urlparse, urljoin
import re
from bookwyrm import *
from classes import Review, BookUser
# Logging setup: records at DEBUG and above are emitted through rich's
# RichHandler (markup and rich tracebacks enabled); only the message text is
# part of the format string.
FORMAT = "%(message)s"
logging.basicConfig(
    level="DEBUG",
    format=FORMAT,
    datefmt="[%X]",
    handlers=[RichHandler(markup=True, rich_tracebacks=True)],
)
log = logging.getLogger("rich")
console = Console()

# BookWyrm profile URL used as this script's target.
# NOTE(review): presumably passed to parse_user_profile further down the
# file -- the call site is not visible here.
profile_url = 'https://bookwyrm.social/user/potajito'

# Log the URL that is actually configured above. The previous message named a
# hard-coded Goodreads RSS feed that nothing in this module requests.
log.debug("Trying for %s", profile_url)
def append_to_url(url: str, path_to_append: str) -> str:
    """Concatenate ``path_to_append`` onto ``url`` with a slash at the seam.

    A trailing ``/`` is added to ``url`` when it does not already end with
    one, and a single leading ``/`` is dropped from ``path_to_append`` when
    present. Only one slash is ever added or removed; any further slashes in
    either argument are kept as they are.

    Args:
        url: Base URL (or URL prefix).
        path_to_append: Path fragment to attach to ``url``.

    Returns:
        The combined string.
    """
    base = url if url.endswith('/') else f"{url}/"
    suffix = path_to_append[1:] if path_to_append.startswith('/') else path_to_append
    return base + suffix
# Cover image used when a review card contains no ``img.book-cover`` element.
_FALLBACK_COVER_URL = 'https://cover2coverbookdesign.com/site/wp-content/uploads/2019/03/geometric1.jpg'

# Upper bound (seconds) on the profile-page request so a stalled server
# cannot block the scraper indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _find_book_url(section_tag, profile_url: str):
    """Return the absolute URL of the first ``/book/`` link in ``section_tag``.

    Links are resolved against ``profile_url`` with ``urljoin`` so that the
    instance's scheme, host and port are all kept and an already-absolute
    href is left intact. Anchors without an ``href`` are ignored.

    Returns:
        The resolved URL, or ``None`` when the section has no book link.
    """
    for a_tag in section_tag.find_all('a'):
        href = a_tag.get('href')
        if href and "/book/" in href:
            return urljoin(profile_url, href)
    return None


def _parse_entry(entry, verb: str, profile_url: str, user_image_url):
    """Build one review from a ``media-content`` header entry.

    Args:
        entry: The header ``div`` describing the activity.
        verb: ``'rated'`` or ``'reviewed'``; selects where the score and the
            author are read from, and is used in the log line.
        profile_url: Profile URL; passed through to ``fill_review`` and used
            as the base for resolving the book link.
        user_image_url: Avatar URL taken from the page, passed to
            ``fill_review``.

    Returns:
        The object produced by ``fill_review``, or ``None`` when the card has
        no ``/book/`` link (the entry is then skipped with a warning).
    """
    username = entry.find('span', itemprop='name').text.strip()
    book_name = find_book_title(entry)
    review_time_stamp = convert_elapsed_to_timestamp(find_time_elapsed(entry))
    section_tag = entry.find_next('section', class_='card-content')

    # The two activity kinds are read differently: for a rating the score
    # comes from the header entry and the author from the card section; for a
    # review it is the other way round.
    if verb == 'rated':
        score_in_stars = entry.select_one('.stars .is-sr-only').text.strip()
        author = find_book_author(section_tag)
    else:
        author = find_book_author(entry)
        score_in_stars = section_tag.find('span', class_='is-sr-only').text.strip()
    # The screen-reader text holds the star count; take its first number.
    score = int(re.findall(r'\d+', score_in_stars)[0])

    cover_tag = section_tag.find("img", class_="book-cover")
    image_url = cover_tag.get('src') if cover_tag is not None else _FALLBACK_COVER_URL

    book_url = _find_book_url(section_tag, profile_url)
    if book_url is None:
        # Without this guard the URL of the previously parsed entry would be
        # reused (or a NameError raised on the first entry).
        log.warning(f"No book link found for {book_name}; skipping entry")
        return None

    review = fill_review(book_name, score, author,
                         book_url, image_url, profile_url,
                         username, user_image_url, review_time_stamp)
    log.info(f"{username} {verb} {book_name} by {author}: {score}")
    return review


def parse_user_profile(profile_url: str) -> List[Review]:
    """Scrape the ratings and reviews listed on a BookWyrm profile.

    Fetches ``<profile_url>/reviews-comments``, walks every
    ``div.media-content`` header entry and turns each one whose text contains
    `` rated `` or `` reviewed `` into a review via ``fill_review``.

    Args:
        profile_url: URL of the user's profile page.

    Returns:
        The parsed reviews in page order. Entries without a book link are
        skipped. If fetching or parsing fails, the error is printed and an
        empty list is returned.
    """
    reviews: List[Review] = []
    # Bound before the ``try`` so the error handler can always report a URL,
    # even when the failure happens while building ``reviews_url``.
    reviews_url = profile_url
    try:
        reviews_url = append_to_url(profile_url, '/reviews-comments')
        response = requests.get(reviews_url, timeout=_REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(response.text, "html.parser")
        user_image_url = soup.find('img', class_=re.compile(r'avatar image*')).get('src')

        for entry in soup.find_all('div', class_='media-content'):
            entry_text = entry.text
            # Both phrases are tested independently (not ``elif``), so an
            # entry whose text contains both is parsed once per match.
            for verb in ('rated', 'reviewed'):
                if f' {verb} ' not in entry_text:
                    continue
                review = _parse_entry(entry, verb, profile_url, user_image_url)
                if review is not None:
                    reviews.append(review)

        log.info(f"Found {len(reviews)} reviews")
        return reviews
    except Exception:
        # Top-level boundary: report the failure and return no reviews.
        print('Could not parse:', reviews_url)
        console.print_exception()
        return []