-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_opengraph.py
executable file
·99 lines (80 loc) · 2.87 KB
/
read_opengraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python3
import html
import os.path
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup
# Matches any character that is not an ASCII letter, a digit or whitespace.
NON_ALPHANUMERIC = re.compile('[^a-zA-Z0-9\\s]')
# Matches one whitespace character; each occurrence is replaced separately.
WHITESPACES = re.compile('\\s')


def get_id(title):
    """Build a lowercase, dash-separated identifier from a title.

    Characters other than ASCII letters, digits and whitespace are dropped,
    every remaining whitespace character becomes a single ``-`` (so runs of
    whitespace produce runs of dashes), and the result is lowercased.

    Args:
        title: The human-readable title to convert.

    Returns:
        The identifier string; empty if nothing in ``title`` survives.
    """
    without_symbols = NON_ALPHANUMERIC.sub('', title)
    dashed = WHITESPACES.sub('-', without_symbols)
    return dashed.lower()
def read_metadata(url):
    """Fetch a page and extract the data needed for its OpenGraph preview.

    Downloads ``url``, reads the page title and the ``og:description``,
    ``og:site_name`` and ``og:image`` properties from its ``<head>``, and
    stores the preview image under ``html/assets/opengraph/``.

    Args:
        url: Address of the page to read.

    Returns:
        A dict with the keys ``title``, ``id``, ``by``, ``url``, ``site``,
        ``image`` and ``description``. ``site`` and ``description`` are empty
        strings when the page does not declare them; ``by`` is empty when
        the page title contains no " by " separator.

    Raises:
        requests.HTTPError: If the page or its image answers with an error
            status.
        requests.Timeout: If either request does not answer in time.
    """
    # Without a timeout a stalled server would hang the whole build.
    timeout_seconds = 30

    req = requests.get(
        url,
        headers={'User-Agent': 'purkkafi-opengraph-parser'},
        timeout=timeout_seconds,
    )
    req.encoding = 'utf-8'
    print('parsing OG for', url)
    req.raise_for_status()

    head = BeautifulSoup(req.text, 'html.parser').head

    # Page titles look like "<title> by <author>". Split on the LAST " by "
    # so a title that itself contains " by " does not break the unpacking,
    # and fall back to an empty author when the separator is missing.
    page_title = head.title.string
    title, separator, by = page_title.rpartition(' by ')
    if not separator:
        title, by = page_title, ''

    description = head.find(property='og:description')
    site = head.find(property='og:site_name')
    link_id = get_id(title)

    itch_image_url = head.find(property='og:image')['content']
    extension = itch_image_url.split('.')[-1]
    image_path = f'/assets/opengraph/{link_id}.{extension}'

    # Download and validate first, so a failed request neither leaves an
    # empty file behind nor stores an error page as the image.
    image_response = requests.get(itch_image_url, timeout=timeout_seconds)
    image_response.raise_for_status()
    image_file = Path(f'html/{image_path}')
    image_file.parent.mkdir(parents=True, exist_ok=True)
    image_file.write_bytes(image_response.content)

    return {
        'title': title,
        'id': link_id,
        'by': by,
        'url': url,
        'site': site['content'] if site is not None else '',
        'image': image_path,
        'description': description['content'] if description is not None else '',
    }
# Collect every URL referenced through an \opengraph_preview{...} macro in
# the .bgc files below ``pages``.
urls = set()
for file in Path('pages').rglob('*.bgc'):
    # Explicit encoding keeps the result independent of the platform default.
    with open(file, encoding='utf-8') as f:
        # Non-greedy capture: stop at the first closing brace, so further
        # braces on the same line do not end up inside the URL.
        urls.update(re.findall(r'\\opengraph_preview\{(.+?)\}', f.read()))
# De-duplicated list in a deterministic order.
urls = sorted(urls)
CACHE_FILE = 'beagic/opengraph/.cache'
# The cache is the repr of the sorted URL list the previews were built for.
cache = str(sorted(urls))

# Start from an empty cache so that a first run (no cache file yet) builds
# every preview instead of failing on an undefined name in the loop below.
in_cache = ''
if os.path.isfile(CACHE_FILE):
    with open(CACHE_FILE, 'r', encoding='utf-8') as f:
        in_cache = f.read()
    if cache == in_cache:
        print('OG previews up to date')
        # SystemExit works even when the ``site`` helpers (``exit``) are absent.
        raise SystemExit(0)

for url in urls:
    # The cache stores URLs as single-quoted list items, so a quoted
    # substring match tells whether this URL was already processed.
    if f"'{url}'" in in_cache:
        continue

    # Fetch before opening the output file: a failed download must not leave
    # a truncated, empty preview behind.
    meta = read_metadata(url)
    # Text taken from the remote page is untrusted; escape it before it is
    # embedded in HTML text and attribute values.
    safe = {key: html.escape(value) for key, value in meta.items()}
    site_suffix = f" on {safe['site']}" if safe['site'] else ''

    path = Path('beagic/opengraph', url)
    path.parent.mkdir(parents=True, exist_ok=True)
    # UTF-8 is required for the link emoji regardless of the platform default.
    with path.open('w', encoding='utf-8') as f:
        f.write(f"""
<a id="{safe['id']}" class="opengraphLink" href="{safe['url']}">
<div class="opengraphPreview" style="--og-img: url('{safe['image']}')">
<img class="opengraphThumb" src="{safe['image']}" loading="lazy" />
<div class="opengraphContent">
<p class="opengraphText">
<span class="opengraphTitle">{safe['title']}</span>
<br>
by <span class="opengraphBy">{safe['by']}</span>
<br>
<span class="opengraphDescription">{safe['description']}</span>
</p>
<div class="opengraphSiteButton">
View{site_suffix}
</div>
</div>
</div>
</a>
<div class="opengraphShare"><a href="#{safe['id']}">🔗</a></div>
""")

# Only reached when every preview was written; make sure the cache directory
# exists even when there were no URLs to process.
Path(CACHE_FILE).parent.mkdir(parents=True, exist_ok=True)
with open(CACHE_FILE, 'w', encoding='utf-8') as f:
    f.write(cache)