Skip to content

Commit

Permalink
rearrange
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jun 13, 2023
1 parent c7f02e0 commit a176186
Showing 1 changed file with 31 additions and 34 deletions.
65 changes: 31 additions & 34 deletions courlan/urlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,6 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
signal.signal(signal.SIGINT, dump_unvisited_urls)
signal.signal(signal.SIGTERM, dump_unvisited_urls)

def reset(self) -> None:
"""Re-initialize the URL store.

Replaces ``self.urldict`` with a fresh ``defaultdict(DomainEntry)`` while
holding ``self._lock``, calls ``clear_caches()``, runs a garbage-collection
pass and logs the value returned by ``gc.collect()`` at debug level.
"""
with self._lock:
self.urldict = defaultdict(DomainEntry)
clear_caches()
# gc.collect() returns the number of unreachable objects it found
num = gc.collect()
LOGGER.debug("UrlStore reset, %s objects in GC", num)

def _buffer_urls(
self, data: List[str], visited: bool = False
) -> DefaultDict[str, Deque[UrlPathTuple]]:
Expand Down Expand Up @@ -222,7 +214,7 @@ def _search_urls(
def _timestamp(self, domain: str) -> Optional[datetime]:
    """Return the ``timestamp`` attribute of the entry stored for *domain*."""
    entry = self.urldict[domain]
    return entry.timestamp

# URL MANIPULATION AND INFO
# ADDITIONS AND DELETIONS

def add_urls(
self,
Expand Down Expand Up @@ -272,32 +264,20 @@ def discard(self, domains: List[str]) -> None:
num = gc.collect()
LOGGER.debug("%s objects in GC after UrlStore.discard", num)

def is_known(self, url: str) -> bool:
    """Check if the given URL has already been stored.

    The URL is split with ``get_host_and_path`` and its path is compared
    with the paths of the entries loaded for that host.

    Returns:
        True if an entry with the same path exists, False if the domain
        or the URL is new.
    """
    hostinfo, urlpath = get_host_and_path(url)
    # stop at the first match instead of materializing a set of all paths
    return any(entry.urlpath == urlpath for entry in self._load_urls(hostinfo))

def find_known_urls(self, domain: str) -> List[str]:
    """Get all already known URLs for the given domain (ex. "https://example.org").

    Each result is the domain string followed by the path of one loaded entry.
    """
    known = []
    for entry in self._load_urls(domain):
        known.append(domain + entry.urlpath)
    return known

def filter_unknown_urls(self, urls: List[str]) -> List[str]:
    """Take a list of URLs and return the currently unknown ones.

    Delegates to ``_search_urls`` with ``switch=1``.
    """
    unknown = self._search_urls(urls, switch=1)
    return unknown
def reset(self) -> None:
"""Re-initialize the URL store.

Replaces ``self.urldict`` with a fresh ``defaultdict(DomainEntry)`` while
holding ``self._lock``, calls ``clear_caches()``, runs a garbage-collection
pass and logs the value returned by ``gc.collect()`` at debug level.
"""
with self._lock:
self.urldict = defaultdict(DomainEntry)
clear_caches()
# gc.collect() returns the number of unreachable objects it found
num = gc.collect()
LOGGER.debug("UrlStore reset, %s objects in GC", num)

# DOMAINS / HOSTNAMES

def get_known_domains(self) -> List[str]:
    """Return the keys of ``self.urldict`` (the known domains) as a new list."""
    return [*self.urldict]

def is_exhausted_domain(self, domain: str) -> bool:
    """Tell if all known URLs for the website have been visited.

    Returns True when the stored state of the domain is ``State.ALL_VISITED``
    or ``State.BUSTED``.

    Raises:
        KeyError: if the domain is not in the store.
    """
    # explicit membership test: only domains already present are looked up
    if domain not in self.urldict:
        raise KeyError("website not in store")
    return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)

def get_unvisited_domains(self) -> List[str]:
"""Find all domains for which there are unvisited URLs
and potentially adjust done meta-information."""
Expand All @@ -308,18 +288,21 @@ def get_unvisited_domains(self) -> List[str]:
self._set_done()
return unvisited

def is_exhausted_domain(self, domain: str) -> bool:
    """Tell if all known URLs for the website have been visited.

    Returns True when the stored state of the domain is ``State.ALL_VISITED``
    or ``State.BUSTED``.

    Raises:
        KeyError: if the domain is not in the store.
    """
    # explicit membership test: only domains already present are looked up
    if domain not in self.urldict:
        raise KeyError("website not in store")
    return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)

def unvisited_websites_number(self) -> int:
    """Return the number of websites for which there are still URLs to visit.

    This is the length of the list returned by ``get_unvisited_domains``.
    """
    pending_domains = self.get_unvisited_domains()
    return len(pending_domains)

# URL-BASED QUERIES

def has_been_visited(self, url: str) -> bool:
    """Check if the given URL has already been visited.

    Looks up the URL's path among the entries loaded for its host and
    returns that entry's ``visited`` value, or False when the path is absent.
    """
    hostinfo, urlpath = get_host_and_path(url)
    visited_by_path = {}
    for entry in self._load_urls(hostinfo):
        visited_by_path[entry.urlpath] = entry.visited
    return visited_by_path.get(urlpath) or False
def find_known_urls(self, domain: str) -> List[str]:
    """Get all already known URLs for the given domain (ex. "https://example.org").

    Each result is the domain string followed by the path of one loaded entry.
    """
    known = []
    for entry in self._load_urls(domain):
        known.append(domain + entry.urlpath)
    return known

def find_unvisited_urls(self, domain: str) -> List[str]:
"Get all unvisited URLs for the given domain."
Expand All @@ -329,10 +312,24 @@ def find_unvisited_urls(self, domain: str) -> List[str]:
]
return []

def filter_unknown_urls(self, urls: List[str]) -> List[str]:
    """Take a list of URLs and return the currently unknown ones.

    Delegates to ``_search_urls`` with ``switch=1``.
    """
    unknown = self._search_urls(urls, switch=1)
    return unknown

def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
    """Take a list of URLs and return the currently unvisited ones.

    Delegates to ``_search_urls`` with ``switch=2``.
    """
    unvisited = self._search_urls(urls, switch=2)
    return unvisited

def has_been_visited(self, url: str) -> bool:
    """Check if the given URL has already been visited.

    Returns True when ``filter_unvisited_urls`` yields nothing for this
    single URL, False otherwise.
    """
    still_unvisited = self.filter_unvisited_urls([url])
    return not still_unvisited

def is_known(self, url: str) -> bool:
    """Check if the given URL has already been stored.

    The URL is split with ``get_host_and_path`` and its path is compared
    with the paths of the entries loaded for that host.

    Returns:
        True if an entry with the same path exists, False if the domain
        or the URL is new.
    """
    hostinfo, urlpath = get_host_and_path(url)
    # stop at the first match instead of materializing a set of all paths
    return any(entry.urlpath == urlpath for entry in self._load_urls(hostinfo))

# DOWNLOADS

def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
Expand Down

0 comments on commit a176186

Please sign in to comment.