Skip to content

Commit

Permalink
idk
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxBittker committed Nov 30, 2023
1 parent d48b78c commit 979ffc8
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 34 deletions.
26 changes: 0 additions & 26 deletions parsers/atproto.py

This file was deleted.

6 changes: 5 additions & 1 deletion parsers/nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ class NYTParser(BaseParser):

def _parse(self, html):
# print("html: " + html)
soup = BeautifulSoup(html.decode("utf-8"), "html5lib")
# if it's not a str, decode it:
if not isinstance(html, str):
html = html.decode("utf-8")

soup = BeautifulSoup(html, "html5lib")

for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
comment.extract()
Expand Down
14 changes: 8 additions & 6 deletions parsers/simple_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from api_check import check_api
from nyt import NYTParser
from datetime import date
from atproto import bloot, bloot2
from bsky import bloot, bloot2
from sentry_sdk import capture_exception, capture_message

today = date.today()
Expand Down Expand Up @@ -74,10 +74,11 @@ def check_word(word, article_url, word_context):

def tweet_word(word, article_url, word_context):
try:
firstPost = bloot(word).json()
firstPost = bloot(word)
bloot2(
'"{}" occurred in: {}'.format(word_context, article_url),
{"root": firstPost, "parent": firstPost},
article_url,
firstPost,
)
data = {"status": word}
url = "%s/api/v1/statuses" % "https://botsin.space"
Expand All @@ -93,6 +94,7 @@ def tweet_word(word, article_url, word_context):
)

status = api.PostUpdate(word)
return
contextApi.PostUpdate(
'@{} "{}" occurred in: {}'.format(
status.user.screen_name, word_context, article_url
Expand Down Expand Up @@ -147,7 +149,7 @@ def context(content, word):
loc = content.find(word)
to_period = content[loc:].find(".")
prev_period = content[:loc].rfind(".")
allowance = 82
allowance = 70
if to_period < allowance:
end = content[loc : loc + to_period + 1]
else:
Expand Down Expand Up @@ -199,7 +201,7 @@ def process_links(links):
# seen = False
# unseen article
if not seen:
time.sleep(1)
time.sleep(30)
sentry_sdk.set_context("link", {"link": link})
capture_message("Getting Article")

Expand All @@ -211,7 +213,7 @@ def process_links(links):


start_time = time.time()
# tweet_word("testing", "context", "a")
#tweet_word("testing", "http://example.com", "a")
# process_links(['https://www.nytimes.com/2022/04/01/learning/word-of-the-day-oblivionaire.html'])
process_links(parser.feed_urls())
# process_links(['https://www.nytimes.com/2019/11/06/magazine/turtleneck-man-bbc-question-time-brexit.html'])
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ python-twitter
regex
redis
langid
atprototools
atproto

0 comments on commit 979ffc8

Please sign in to comment.