UrlStore: add ternary state and discard function (#44)
* UrlStore: prune()

* add discard

* add state info to domains

* test consistency

* consistency

* rearrange
adbar authored Jun 15, 2023
1 parent 260ef1a commit 8e97548
Showing 2 changed files with 119 additions and 73 deletions.
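For orientation, here is a minimal usage sketch of the behaviour this commit introduces, pieced together from the diff and the updated tests below; the names it relies on (UrlStore, State, discard, is_exhausted_domain) appear in the diff, but treat the snippet as an illustration rather than official documentation.

from courlan import UrlStore
from courlan.urlstore import State

store = UrlStore()
store.add_urls(["https://www.example.org/1", "https://test.org/1"])

# a freshly added host starts in the OPEN state
assert store.urldict["https://www.example.org"].state is State.OPEN

# discard() empties a host, marks it BUSTED and treats it as exhausted
store.discard(["https://test.org"])
assert store.urldict["https://test.org"].state is State.BUSTED
assert store.is_exhausted_domain("https://test.org")

# URLs added for a BUSTED host are dropped silently by _store_urls()
store.add_urls(["https://test.org/2"])
assert not store.find_unvisited_urls("https://test.org")

# once every known host is ALL_VISITED or BUSTED, _set_done() flips the flag
store.discard(["https://www.example.org"])
assert store.done is True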
144 changes: 80 additions & 64 deletions courlan/urlstore.py
@@ -12,6 +12,7 @@

from collections import defaultdict, deque
from datetime import datetime, timedelta
from enum import Enum
from threading import Lock
from typing import (
Any,
@@ -35,14 +36,21 @@
LOGGER = logging.getLogger(__name__)


class State(Enum):
"Record state information about a domain or host."
OPEN = 1
ALL_VISITED = 2
BUSTED = 3


class DomainEntry:
"Class to record host-related information and URL paths."
__slots__ = ("all_visited", "count", "rules", "timestamp", "total", "tuples")
__slots__ = ("count", "rules", "state", "timestamp", "total", "tuples")

def __init__(self) -> None:
self.all_visited: bool = False
self.count: int = 0
self.rules: Optional[RobotFileParser] = None
self.state: State = State.OPEN
self.timestamp: Optional[Any] = None
self.total: int = 0
self.tuples: Deque[UrlPathTuple] = deque()
@@ -88,14 +96,6 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
signal.signal(signal.SIGINT, dump_unvisited_urls)
signal.signal(signal.SIGTERM, dump_unvisited_urls)

def reset(self) -> None:
"Re-initialize the URL store."
with self._lock:
self.urldict = defaultdict(DomainEntry)
clear_caches()
num = gc.collect()
LOGGER.debug("UrlStore reset, %s objects in GC", num)

def _buffer_urls(
self, data: List[str], visited: bool = False
) -> DefaultDict[str, Deque[UrlPathTuple]]:
Expand Down Expand Up @@ -128,6 +128,11 @@ def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
return self.urldict[domain].tuples
return deque()

def _set_done(self) -> None:
if not self.done and all(self.is_exhausted_domain(d) for d in self.urldict):
with self._lock:
self.done = True

def _store_urls(
self,
domain: str,
@@ -150,6 +155,9 @@ def _store_urls(

# load URLs or create entry
if domain in self.urldict:
# discard if busted
if self.urldict[domain].state is State.BUSTED:
return
urls = self._load_urls(domain)
known = {u.urlpath for u in urls}
else:
@@ -162,24 +170,24 @@
if to_left is not None:
urls.extendleft(t for t in to_left if not is_known_link(t.urlpath, known))

# use lock
with self._lock:
self.urldict[domain].total = len(urls)
# compression
if self.compressed:
self.urldict[domain].tuples = bz2.compress( # type: ignore[assignment]
pickle.dumps(urls, protocol=4)
)
else:
self.urldict[domain].tuples = urls
# adjust all_visited status
self.urldict[domain].all_visited = all(u.visited for u in urls)
# timestamp/backoff value
self.urldict[domain].total = len(urls)

if timestamp is not None:
self.urldict[domain].timestamp = timestamp
# adjust general state
if self.done and not self.urldict[domain].all_visited:
self.done = False

if all(u.visited for u in urls):
self.urldict[domain].state = State.ALL_VISITED
else:
self.urldict[domain].state = State.OPEN
if self.done:
self.done = False

def _search_urls(
self, urls: List[str], switch: Optional[int] = None
@@ -194,12 +202,7 @@ def _search_urls(
# examine domain
if hostinfo != last_domain:
last_domain = hostinfo
if switch == 1:
known_paths = {u.urlpath: None for u in self._load_urls(hostinfo)}
elif switch == 2:
known_paths = {
u.urlpath: u.visited for u in self._load_urls(hostinfo)
}
known_paths = {u.urlpath: u.visited for u in self._load_urls(hostinfo)}
# run checks: case 1: the path matches, case 2: visited URL
if urlpath in known_paths and (
switch == 1 or (switch == 2 and known_paths[urlpath])
@@ -211,7 +214,7 @@ def _search_urls(
def _timestamp(self, domain: str) -> Optional[datetime]:
return self.urldict[domain].timestamp

# URL MANIPULATION AND INFO
# ADDITIONS AND DELETIONS

def add_urls(
self,
@@ -251,51 +254,55 @@ def add_from_html(
)
self.add_urls(urls=links, appendleft=links_priority)

def is_known(self, url: str) -> bool:
"Check if the given URL has already been stored."
hostinfo, urlpath = get_host_and_path(url)
# returns False if domain or URL is new
return urlpath in {u.urlpath for u in self._load_urls(hostinfo)}

def find_known_urls(self, domain: str) -> List[str]:
"""Get all already known URLs for the given domain (ex. "https://example.org")."""
return [domain + u.urlpath for u in self._load_urls(domain)]
def discard(self, domains: List[str]) -> None:
"Declare domains void and prune the store."
with self._lock:
for d in domains:
self.urldict[d] = DomainEntry()
self.urldict[d].state = State.BUSTED
self._set_done()
num = gc.collect()
LOGGER.debug("%s objects in GC after UrlStore.discard", num)

def filter_unknown_urls(self, urls: List[str]) -> List[str]:
"Take a list of URLs and return the currently unknown ones."
return self._search_urls(urls, switch=1)
def reset(self) -> None:
"Re-initialize the URL store."
with self._lock:
self.urldict = defaultdict(DomainEntry)
clear_caches()
num = gc.collect()
LOGGER.debug("UrlStore reset, %s objects in GC", num)

# DOMAINS / HOSTNAMES

def get_known_domains(self) -> List[str]:
"Return all known domains as a list."
return list(self.urldict)

def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
if domain in self.urldict:
return self.urldict[domain].all_visited
raise KeyError("website not in store")

def get_unvisited_domains(self) -> List[str]:
"""Find all domains for which there are unvisited URLs
and potentially adjust done meta-information."""
unvisited = []
with self._lock:
if not self.done:
unvisited = [d for d in self.urldict if not self.urldict[d].all_visited]
if not unvisited:
self.done = True
if not self.done:
unvisited = [d for d in self.urldict if not self.is_exhausted_domain(d)]
if not unvisited:
self._set_done()
return unvisited

def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
if domain in self.urldict:
return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
raise KeyError("website not in store")

def unvisited_websites_number(self) -> int:
"Return the number of websites for which there are still URLs to visit."
return len(self.get_unvisited_domains())

# URL-BASED QUERIES

def has_been_visited(self, url: str) -> bool:
"Check if the given URL has already been visited.."
hostinfo, urlpath = get_host_and_path(url)
known_urlpaths = {u.urlpath: u.visited for u in self._load_urls(hostinfo)}
# defaults to None, thus False
return known_urlpaths.get(urlpath) or False
def find_known_urls(self, domain: str) -> List[str]:
"""Get all already known URLs for the given domain (ex. "https://example.org")."""
return [domain + u.urlpath for u in self._load_urls(domain)]

def find_unvisited_urls(self, domain: str) -> List[str]:
"Get all unvisited URLs for the given domain."
@@ -305,13 +312,23 @@ def find_unvisited_urls(self, domain: str) -> List[str]:
]
return []

def filter_unknown_urls(self, urls: List[str]) -> List[str]:
"Take a list of URLs and return the currently unknown ones."
return self._search_urls(urls, switch=1)

def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
"Take a list of URLs and return the currently unvisited ones."
return self._search_urls(urls, switch=2)

def unvisited_websites_number(self) -> int:
"Return the number of websites for which there are still URLs to visit."
return len(self.get_unvisited_domains())
def has_been_visited(self, url: str) -> bool:
"Check if the given URL has already been visited."
return not bool(self.filter_unvisited_urls([url]))

def is_known(self, url: str) -> bool:
"Check if the given URL has already been stored."
hostinfo, urlpath = get_host_and_path(url)
# returns False if domain or URL is new
return urlpath in {u.urlpath for u in self._load_urls(hostinfo)}

# DOWNLOADS

@@ -332,15 +349,14 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
return domain + url.urlpath
# nothing to draw from
with self._lock:
self.urldict[domain].all_visited = True
self.urldict[domain].state = State.ALL_VISITED
self._set_done()
return None

def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
"""Get a list of immediately downloadable URLs according to the given
time limit per domain."""
potential = self.get_unvisited_domains()
if not potential:
return []
targets = []
for domain in potential:
timestamp = self._timestamp(domain)
@@ -401,6 +417,7 @@ def establish_download_schedule(
# store new info
self._store_urls(domain, url_tuples, timestamp=total_diff)
# sort by first tuple element (time in secs)
self._set_done()
return sorted(targets, key=lambda x: x[0]) # type: ignore[arg-type]

# CRAWLING
@@ -455,9 +472,8 @@ def dump_urls(self) -> List[str]:

def print_unvisited_urls(self) -> None:
"Print all unvisited URLs in store."
with self._lock:
for domain in self.urldict:
print("\n".join(self.find_unvisited_urls(domain)))
for domain in self.urldict:
print("\n".join(self.find_unvisited_urls(domain)))

def print_urls(self) -> None:
"Print all URLs in store (URL + TAB + visited or not)."
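As a quick, hedged walk-through of how the reworked get_url() path interacts with the new state field (inferred from the diff above and mirrored by the test changes below, not taken from the documentation): drawing the last unvisited URL of a host flips that host to ALL_VISITED, and draining the whole store sets done.

from courlan import UrlStore
from courlan.urlstore import State

store = UrlStore()
store.add_urls(["http://tovisit.com/page"])

# drawing the only URL marks it visited, so the host is exhausted
assert store.get_url("http://tovisit.com") == "http://tovisit.com/page"
assert store.urldict["http://tovisit.com"].state is State.ALL_VISITED

# nothing left to draw: get_url() returns None and _set_done() marks the store done
assert store.get_url("http://tovisit.com") is None
assert store.done is True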
48 changes: 39 additions & 9 deletions tests/urlstore_tests.py
@@ -16,6 +16,7 @@

from courlan import UrlStore
from courlan.core import filter_links
from courlan.urlstore import State


def test_urlstore():
@@ -92,13 +93,22 @@ def test_urlstore():
["https://www.sitemaps.org/es/1"], appendleft=["https://www.sitemaps.org/fi/2"]
)
assert len(my_urls.urldict["https://www.sitemaps.org"].tuples) == 1
# pruning
assert not my_urls.done
my_urls.urldict["https://www.sitemaps.org"].state = State.ALL_VISITED
my_urls._set_done()
assert my_urls.done

# try example URLs
example_domain = "https://www.example.org"
example_urls = [f"{example_domain}/{str(a)}" for a in range(10000)]
test_urls = [f"https://test.org/{str(uuid.uuid4())[:20]}" for _ in range(10000)]
urls = example_urls + test_urls

# test loading
url_buffer = UrlStore()._buffer_urls(urls)
assert sum(len(v) for _, v in url_buffer.items()) == len(urls)

# compression 1
my_urls = UrlStore(compressed=True)
url_buffer = UrlStore()._buffer_urls(example_urls)
@@ -117,9 +127,29 @@ def test_urlstore():
my_urls._lock = None
assert len(pickle.dumps(my_urls)) < len(pickle.dumps(url_buffer))

# test loading
url_buffer = UrlStore()._buffer_urls(urls)
assert sum(len(v) for _, v in url_buffer.items()) == len(urls)
# test discard
my_urls = UrlStore()
my_urls.add_urls(urls)
ref_num_domains = my_urls.get_known_domains()

assert my_urls.total_url_number() != 0
my_urls.discard(my_urls.get_known_domains())
assert (
my_urls.total_url_number() == 0
and my_urls.get_known_domains() == ref_num_domains
and not my_urls.get_unvisited_domains()
and my_urls.done is True
)
my_urls.add_urls(
["https://www.example.org/1", "https://test.org/1", "https://www.other.org/1"]
)
assert (
my_urls.total_url_number() == 1
and len(my_urls.get_known_domains()) == 3
and my_urls.get_unvisited_domains() == ["https://www.other.org"]
and my_urls.done is False
)

my_urls = UrlStore()
my_urls.add_urls(urls)
assert sum(len(my_urls._load_urls(k)) for k, _ in my_urls.urldict.items()) == len(
@@ -132,19 +162,19 @@ def test_urlstore():
assert sum(len(v.tuples) for _, v in my_urls.urldict.items()) == len(urls)
my_urls.add_urls(["https://visited.com/visited"], visited=True)
assert my_urls.urldict["https://visited.com"].tuples[0].visited is True
assert my_urls.urldict["https://visited.com"].all_visited is True
assert my_urls.urldict["https://visited.com"].state is State.ALL_VISITED
assert not my_urls.find_unvisited_urls("https://visited.com")
assert my_urls.is_exhausted_domain("https://visited.com") is True
# new unvisited URLs
my_urls.add_urls(["https://visited.com/1"], visited=False)
assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
assert my_urls.urldict["https://visited.com"].all_visited is False
assert my_urls.urldict["https://visited.com"].state is State.OPEN
assert my_urls.is_exhausted_domain("https://visited.com") is False
with pytest.raises(KeyError):
assert my_urls.is_exhausted_domain("https://visited2.com") is True
# revert changes for further tests
del my_urls.urldict["https://visited.com"].tuples[1]
my_urls.urldict["https://visited.com"].all_visited = True
my_urls.urldict["https://visited.com"].state = State.ALL_VISITED

# test extension
extension_urls = [f"{example_domain}/1/{str(a)}" for a in range(10)]
@@ -193,10 +223,10 @@ def test_urlstore():
# timestamp
assert my_urls.urldict[example_domain].timestamp is not None
# nothing left
assert my_urls.urldict[example_domain].all_visited is False
assert my_urls.urldict[example_domain].state is State.OPEN
my_urls.add_urls(["http://tovisit.com/page"])
assert my_urls.get_url("http://tovisit.com") == "http://tovisit.com/page"
assert my_urls.urldict["http://tovisit.com"].all_visited is True
assert my_urls.urldict["http://tovisit.com"].state is State.ALL_VISITED
assert my_urls.get_url("http://tovisit.com") is None

# known domains
Expand Down Expand Up @@ -227,7 +257,7 @@ def test_urlstore():
and url_tuples[2].visited is False
)
assert my_urls.has_been_visited("http://tovisit.com/page") is True
assert my_urls.urldict["http://tovisit.com"].all_visited is True
assert my_urls.urldict["http://tovisit.com"].state is State.ALL_VISITED
assert not my_urls.filter_unvisited_urls(["http://tovisit.com/page"])
assert my_urls.filter_unvisited_urls(["http://tovisit.com/otherpage"]) == [
"http://tovisit.com/otherpage"
