-
Notifications
You must be signed in to change notification settings - Fork 0
/
article.py
56 lines (40 loc) · 1.39 KB
/
article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import datetime
import logging
import re
import httpx
from bs4 import BeautifulSoup
from markdown2 import markdown
from telegraph.aio import Telegraph
# Module-level async HTTP client, created once at import time and used by
# fetch_markdown_from_url for its GET request.
# NOTE(review): timeout=600 is presumably in seconds (10 minutes) — confirm
# against the httpx timeout documentation.
client = httpx.AsyncClient(timeout=600)
async def write_to_telegraph(html: str) -> str:
    """
    Publish HTML content as a new Telegraph page.

    A fresh Telegraph client is created on every call. If that client
    reports no access token, an account with the short name "anonymous"
    is created first and the result of that call is logged.

    Parameters:
    - html (str): The HTML content to publish as the page body.

    Returns:
    - str: The value stored under the "url" key of the create_page response.
    """
    tg = Telegraph()

    has_token = bool(tg.get_access_token())
    if not has_token:
        account = await tg.create_account(short_name="anonymous")
        logging.info(account)

    # The page title embeds the current timezone-aware UTC timestamp.
    page_title = f"article {datetime.datetime.now(datetime.UTC)}"
    page = await tg.create_page(page_title, html_content=html)
    logging.info(page)

    return page["url"]
def markdown_to_text(markdown_string: str) -> str:
    """
    Convert a markdown string to plaintext, dropping code snippets.

    The markdown is rendered to HTML, every <pre>...</pre> and
    <code>...</code> element is replaced by a single space, and the
    remaining text nodes are concatenated.

    Parameters:
    - markdown_string (str): The markdown source to convert.

    Returns:
    - str: The text content of the rendered markdown without code snippets.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove code snippets. Block-level <pre> goes first so a nested <code>
    # disappears together with its wrapper. re.DOTALL lets "." cross the
    # newlines inside multi-line code blocks, and [^>]* tolerates attributes
    # such as class="..." on the opening tag.
    html = re.sub(r"<pre\b[^>]*>.*?</pre>", " ", html, flags=re.DOTALL)
    # The closing tag must be exactly "</code>"; a pattern with a space
    # before ">" never matches rendered HTML, leaving inline code in place.
    html = re.sub(r"<code\b[^>]*>.*?</code>", " ", html, flags=re.DOTALL)

    # Extract text: join every remaining text node in document order.
    soup = BeautifulSoup(html, "html.parser")
    return "".join(soup.find_all(string=True))
async def fetch_markdown_from_url(source_url: str) -> str:
    """
    Retrieve the markdown rendition of a web page asynchronously.

    The request goes to https://r.jina.ai/ with the source URL appended
    verbatim, using the module-level HTTP client. The response status is
    not checked; the body text is returned as-is.

    Parameters:
    - source_url (str): The URL to get the markdown for.

    Returns:
    - str: The text of the response body.
    """
    reader_response = await client.get(f"https://r.jina.ai/{source_url}")
    return reader_response.text