Commit

Black
Format with black!
edsu committed Sep 14, 2023
1 parent e57ad4a commit 68e1e86
Showing 6 changed files with 95 additions and 47 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
@@ -23,5 +23,8 @@ jobs:
pip install poetry
poetry install
- name: Check formatting
run: black --check .

- name: Test with pytest
run: poetry run pytest -v
14 changes: 7 additions & 7 deletions memento_cli/__init__.py
@@ -9,7 +9,7 @@ def cli():


@cli.command()
@click.argument('url')
@click.argument("url")
def list(url):
# auto-detect the timemap if it's a memento supporting web archive
timemap_url = memento.get_timemap_url(url)
@@ -23,15 +23,15 @@ def list(url):


@cli.command()
@click.argument('start-url')
@click.argument('end-url')
@click.option('--text', help='text to look for on the page')
@click.option('--missing', is_flag=True, help='missing text to look for on the page')
@click.option('--show-browser', is_flag=True, help='see the browser')
@click.argument("start-url")
@click.argument("end-url")
@click.option("--text", help="text to look for on the page")
@click.option("--missing", is_flag=True, help="missing text to look for on the page")
@click.option("--show-browser", is_flag=True, help="see the browser")
def bisect(start_url, end_url, text, missing, show_browser):
print()
url = memento.bisect_urls(start_url, end_url, text, missing, show_browser)
click.echo(f'\rFound your archive snapshot: {url}')
click.echo(f"\rFound your archive snapshot: {url}")


def main():
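
For context, the click commands defined above can be exercised without installing the console script by using click's test runner. A minimal sketch (the archived URL is borrowed from the test suite below and needs network access):

    from click.testing import CliRunner

    from memento_cli import cli

    runner = CliRunner()
    result = runner.invoke(
        cli,
        [
            "list",
            "https://web.archive.org/web/20230621094005/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy",
        ],
    )
    print(result.output)
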
1 change: 1 addition & 0 deletions memento_cli/browser.py
@@ -2,6 +2,7 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


class Browser:
"""
A class for fetching text from a web page using a browser. This ensures
48 changes: 30 additions & 18 deletions memento_cli/memento.py
@@ -9,15 +9,20 @@
from .browser import Browser


Memento = namedtuple('Memento', ['url', 'datetime'])
Memento = namedtuple("Memento", ["url", "datetime"])


def get_timemap_url(url):
"""
Look for a Memento Timemap URL in the response headers for a web resource.
"""
resp = requests.get(url)
if resp.status_code == 200 and 'timemap' in resp.links and 'url' in resp.links['timemap']:
return resp.links['timemap']['url']
if (
resp.status_code == 200
and "timemap" in resp.links
and "url" in resp.links["timemap"]
):
return resp.links["timemap"]["url"]
return None
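
For reference, a Memento-aware archive advertises its TimeMap in an HTTP Link header, and requests exposes that header as resp.links, keyed by rel, which is what the check above relies on. A minimal sketch of the parsing (the header value and URL are made up for illustration):

    import requests

    # An illustrative Link header of the kind a Memento-aware archive returns:
    header = '<https://archive.example/timemap/link/https://example.com/>; rel="timemap"'

    # requests.utils.parse_header_links is the same parser that populates resp.links.
    links = requests.utils.parse_header_links(header)
    by_rel = {link["rel"]: link for link in links}
    print(by_rel["timemap"]["url"])
    # https://archive.example/timemap/link/https://example.com/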


@@ -27,14 +32,18 @@ def get_mementos(timemap_url) -> list[Memento]:
"""
resp = requests.get(timemap_url)
mementos = []
if resp.headers.get('content-type') == 'application/link-format':
if resp.headers.get("content-type") == "application/link-format":
for link in parse_links(resp.text):
if link.get('rel') == 'memento':
mementos.append(Memento(
link['url'],
datetime.datetime.strptime(link['datetime'], "%a, %d %b %Y %H:%M:%S GMT")
))

if link.get("rel") == "memento":
mementos.append(
Memento(
link["url"],
datetime.datetime.strptime(
link["datetime"], "%a, %d %b %Y %H:%M:%S GMT"
),
)
)

return mementos
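
To illustrate the parsing step above: each TimeMap entry carries an RFC 1123 style datetime string, which strptime turns into a datetime for sorting. A small self-contained example (the link dict mirrors an entry from the test data below):

    import datetime
    from collections import namedtuple

    Memento = namedtuple("Memento", ["url", "datetime"])

    link = {
        "url": "https://web.archive.org/web/19961219002950/http://www.nytimes.com:80/",
        "rel": "memento",
        "datetime": "Thu, 19 Dec 1996 00:29:50 GMT",
    }
    m = Memento(
        link["url"],
        datetime.datetime.strptime(link["datetime"], "%a, %d %b %Y %H:%M:%S GMT"),
    )
    print(m.datetime)  # 1996-12-19 00:29:50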


@@ -45,14 +54,18 @@ def parse_links(text) -> list[dict]:
# lean on requests for the parsing, but prep the text to allow for
# whitespace, since parse_header_links is designed for a single-line header

text = re.sub(r'^\s+', '', text) # strip leading whitespace
text = re.sub(r',\s*$', '', text) # strip trailing comma and any optional whitespace
text = re.sub(r'",\r?\n', ', ', text) # remove dos/unix newlines between links
text = re.sub(r"^\s+", "", text) # strip leading whitespace
text = re.sub(
r",\s*$", "", text
) # strip trailing comma and any optional whitespace
text = re.sub(r'",\r?\n', ", ", text) # remove dos/unix newlines between links

return requests.utils.parse_header_links(text)
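
To show what that cleanup enables, here is a small made-up TimeMap in application/link-format and the dicts it parses into (URLs and dates are illustrative; the import path assumes the package layout shown in this commit):

    from memento_cli.memento import parse_links

    timemap = """<http://example.com/>; rel="original",
    <http://archive.example/web/19970101000000/http://example.com/>; rel="memento"; datetime="Wed, 01 Jan 1997 00:00:00 GMT",
    <http://archive.example/web/19980101000000/http://example.com/>; rel="memento"; datetime="Thu, 01 Jan 1998 00:00:00 GMT"
    """

    links = parse_links(timemap)
    print(len(links))            # 3
    print(links[1]["rel"])       # memento
    print(links[1]["datetime"])  # Wed, 01 Jan 1997 00:00:00 GMT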


def bisect_urls(start_url, end_url, text=None, missing=False, show_browser=False) -> str:
def bisect_urls(
start_url, end_url, text=None, missing=False, show_browser=False
) -> str:
timemap_url = get_timemap_url(start_url)
mementos = sorted(get_mementos(timemap_url), key=lambda m: m.datetime)
memento_urls = [m.url for m in mementos]
@@ -68,8 +81,7 @@ def bisect_urls(start_url, end_url, text=None, missing=False, show_browser=False
return bisect(start, end, memento_urls, text, missing, browser)


def bisect(start, end, memento_urls, text, missing, browser) -> str:

def bisect(start, end, memento_urls, text, missing, browser) -> str:
mid = start + int((end - start) / 2)
if mid == start:
return memento_urls[end]
@@ -85,7 +97,7 @@ def bisect(start, end, memento_urls, text, missing, browser) -> str:
text_in_page = False
# look in the page text
else:
print('\r' + meter(start, end, len(memento_urls)), end='')
print("\r" + meter(start, end, len(memento_urls)), end="")
text_in_page = text in page_text

# do we want to find the page where the text went missing?
Expand All @@ -111,4 +123,4 @@ def meter(start, end, n):
b = int((end - start + 1) * scale)
c = int((n - end + 1) * scale)

return f'[{n - (end - start)}/{n}]: ' + a * '█' + b * '░' + c * '█'
return f"[{n - (end - start)}/{n}]: " + a * "█" + b * "░" + c * "█"
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -19,6 +19,9 @@ pytest = "^7.4.2"
[tool.poetry.scripts]
memento = "memento_cli:main"

[tool.black]
include = ".py$"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
73 changes: 51 additions & 22 deletions test_memento_cli.py
@@ -5,23 +5,43 @@
get_mementos,
parse_links,
bisect,
bisect_urls
bisect_urls,
)

from memento_cli.browser import Browser


def test_get_timemap_url():
assert get_timemap_url('https://web.archive.org/web/20230621094005/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy') == 'https://web.archive.org/web/timemap/link/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy'
assert get_timemap_url('https://perma.cc/7CN8-NJNV') == 'https://perma.cc/timemap/html/http://arboretum.harvard.edu'
assert get_timemap_url('https://swap.stanford.edu/was/20230524140954/http://news.stanford.edu/') == 'https://swap.stanford.edu/was/timemap/link/http://news.stanford.edu/'
assert get_timemap_url('https://nytimes.com') is None
assert (
get_timemap_url(
"https://web.archive.org/web/20230621094005/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy"
)
== "https://web.archive.org/web/timemap/link/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy"
)
assert (
get_timemap_url("https://perma.cc/7CN8-NJNV")
== "https://perma.cc/timemap/html/http://arboretum.harvard.edu"
)
assert (
get_timemap_url(
"https://swap.stanford.edu/was/20230524140954/http://news.stanford.edu/"
)
== "https://swap.stanford.edu/was/timemap/link/http://news.stanford.edu/"
)
assert get_timemap_url("https://nytimes.com") is None


def test_get_mementos():
mementos = list(get_mementos('https://web.archive.org/web/timemap/link/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy'))
mementos = list(
get_mementos(
"https://web.archive.org/web/timemap/link/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy"
)
)
assert len(mementos) > 2000
assert mementos[0].url == 'https://web.archive.org/web/20171229054051/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy'
assert (
mementos[0].url
== "https://web.archive.org/web/20171229054051/https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy"
)
assert mementos[0].datetime == datetime.datetime(2017, 12, 29, 5, 40, 51)
assert mementos[-1].datetime.year >= 2023

@@ -39,33 +59,42 @@ def test_parse_links():
"""
links = parse_links(text)
assert len(links) == 8
assert links[0]['rel'] == 'original'
assert links[0]['url'] == 'http://www.nytimes.com:80/'
assert links[7]['url'] == 'https://web.archive.org/web/19961219002950/http://www.nytimes.com:80/'
assert links[7]['rel'] == 'memento'
assert links[7]['datetime'] == 'Thu, 19 Dec 1996 00:29:50 GMT'
assert links[0]["rel"] == "original"
assert links[0]["url"] == "http://www.nytimes.com:80/"
assert (
links[7]["url"]
== "https://web.archive.org/web/19961219002950/http://www.nytimes.com:80/"
)
assert links[7]["rel"] == "memento"
assert links[7]["datetime"] == "Thu, 19 Dec 1996 00:29:50 GMT"


def test_bisect_urls():
start_url = 'http://web.archive.org/web/20200102102511/https://inkdroid.org/'
end_url = 'http://web.archive.org/web/20230902020134/https://inkdroid.org/'
url = bisect_urls(start_url, end_url, 'ReSpec Writing')
assert url == 'http://web.archive.org/web/20230601013229/https://inkdroid.org/'
start_url = "http://web.archive.org/web/20200102102511/https://inkdroid.org/"
end_url = "http://web.archive.org/web/20230902020134/https://inkdroid.org/"

url = bisect_urls(start_url, end_url, "ReSpec Writing")
assert url == "http://web.archive.org/web/20230601013229/https://inkdroid.org/"


def test_bisect():
timemap = get_timemap_url('http://web.archive.org/web/20230902020134/https://inkdroid.org/')
timemap = get_timemap_url(
"http://web.archive.org/web/20230902020134/https://inkdroid.org/"
)
mementos = sorted(get_mementos(timemap), key=lambda m: m.datetime)
mementos = [m.url for m in mementos]
browser = Browser(headless=True)

url = bisect(0, len(mementos), mementos, 'ReSpec Writing', missing=False, browser=browser)
assert url == 'http://web.archive.org/web/20230601013229/https://inkdroid.org/'
url = bisect(
0, len(mementos), mementos, "ReSpec Writing", missing=False, browser=browser
)
assert url == "http://web.archive.org/web/20230601013229/https://inkdroid.org/"


def test_browser():
browser = Browser(headless=True)
text = browser.get('https://swap.stanford.edu/was/20230524140954/https://library.stanford.edu/node/172367')
text = browser.get(
"https://swap.stanford.edu/was/20230524140954/https://library.stanford.edu/node/172367"
)
# This text appears in an iframe provided by pywb
assert 'East Asian telegraph codes' in text
assert "East Asian telegraph codes" in text
