
Sourcery refactored master branch #78

Closed · wants to merge 1 commit into master from sourcery/master

Conversation

sourcery-ai[bot] commented Jan 16, 2024

Branch master refactored by Sourcery.

If you're happy with these changes, merge this Pull Request using the Squash and merge strategy.

See our documentation here.

Run Sourcery locally

Reduce the feedback loop during development by using the Sourcery editor plugin.

Review changes via command line

To manually merge these changes, make sure you're on the master branch, then run:

# fetch the refactored branch
git fetch origin sourcery/master
# fast-forward master onto it (no merge commit is created)
git merge --ff-only FETCH_HEAD
# step back one commit, keeping Sourcery's changes in the working tree for review
git reset HEAD^


sourcery-ai[bot] requested a review from adbar on January 16, 2024, 14:10
Comment on lines -94 to +97

-    # too long and garbled URLs e.g. due to quotes URLs
-    # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
-    # if len(url) > 500:  # arbitrary choice
-    match = TRAILING_PARTS.match(url)
-    if match:
+    if match := TRAILING_PARTS.match(url):
         url = match[1]
     if len(url) > 500:
-        LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))
+        LOGGER.debug("invalid-looking link %s of length %d", f"{url[:50]}…", len(url))
Function scrub_url refactored with the following changes: replace the match assignment and truthiness test with a named expression, and build the log string with an f-string instead of concatenation.

This removes the following comments (why?):

# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500:  # arbitrary choice
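For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the named-expression rewrite; TRAILING and strip_trailing are hypothetical stand-ins, not coURLan's actual names:

import re

TRAILING = re.compile(r"(.*?)[?#]+$")  # hypothetical pattern

def strip_trailing(url: str) -> str:
    # Python 3.8+: the walrus operator binds the match object and
    # tests its truthiness in a single expression
    if match := TRAILING.match(url):
        url = match[1]
    return url

print(strip_trailing("https://example.org/page?#"))  # https://example.org/page

The f-string change is purely cosmetic: f"{url[:50]}…" produces the same string as url[:50] + "…".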

Comment on lines -121 to +123

-        if strict:
-            if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
-                continue
-        # get rid of trackers
-        elif TRACKERS_RE.search(teststr):
+        if (
+            strict
+            and teststr not in ALLOWED_PARAMS
+            and teststr not in CONTROL_PARAMS
+            or not strict
+            and TRACKERS_RE.search(teststr)
+        ):
Function clean_query refactored with the following changes: merge the nested if/elif, whose branches share the same body, into a single boolean condition.

This removes the following comments (why?):

# get rid of trackers
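Because `and` binds tighter than `or`, the merged condition reads as (strict and not allowed) or (not strict and tracker). A reduced model of the control flow, not the library's actual predicate, can verify the equivalence exhaustively:

from itertools import product

def original(strict: bool, allowed: bool, tracker: bool) -> bool:
    # old control flow: nested if and elif, both ending in the same action
    if strict:
        if not allowed:
            return True  # drop the parameter
    elif tracker:
        return True
    return False

def merged(strict: bool, allowed: bool, tracker: bool) -> bool:
    # merged condition; `and` binds tighter than `or`
    return strict and not allowed or not strict and tracker

# exhaustive check over all boolean combinations
assert all(
    original(s, a, t) == merged(s, a, t)
    for s, a, t in product([False, True], repeat=3)
)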

Comment on lines -110 to -113
batch = [line.strip() for line in islice(inputfh, 10**5)]
if not batch:
if batch := [line.strip() for line in islice(inputfh, 10**5)]:
yield batch
else:
return
yield batch
Function _batch_lines refactored with the following changes: replace the assignment, emptiness test and early return with a named expression.
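The same idea can be pushed one step further: if the surrounding code loops until the file is exhausted, the walrus expression can serve directly as the loop condition. A sketch with hypothetical names:

from itertools import islice
from typing import Iterator, List, TextIO

def batch_lines(inputfh: TextIO, size: int = 10**5) -> Iterator[List[str]]:
    # keep yielding fixed-size batches; the loop ends as soon as
    # islice() returns no more lines and the list is empty (falsy)
    while batch := [line.strip() for line in islice(inputfh, size)]:
        yield batch

import io
fh = io.StringIO("a\nb\nc\n")
print(list(batch_lines(fh, size=2)))  # [['a', 'b'], ['c']]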

Comment on lines -141 to +178
-    with ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
+    with (ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
         args.outputfile, "w", encoding="utf-8"
     ) as outputfh, open(
         args.inputfile, "r", encoding="utf-8", errors="ignore"
-    ) as inputfh:
+    ) as inputfh):
         while True:
             batches = []  # type: List[List[str]]
             while len(batches) < 1000:
-                line_batch = list(islice(inputfh, 1000))
-                if not line_batch:
-                    break
-                batches.append(line_batch)
-
-            if batches:
-                futures = (
-                    executor.submit(
-                        _cli_check_urls,
-                        batch,
-                        strict=args.strict,
-                        with_redirects=args.redirects,
-                        language=args.language,
-                    )
-                    for batch in batches
-                )
-
-                for future in as_completed(futures):
-                    for valid, url in future.result():
-                        if valid:
-                            outputfh.write(url + "\n")
-                        # proceed with discarded URLs. to be rewritten
-                        elif args.discardedfile is not None:
-                            with open(
-                                args.discardedfile, "a", encoding="utf-8"
-                            ) as discardfh:
-                                discardfh.write(url)
-
-                batches = []
-            else:
-                break
+                if line_batch := list(islice(inputfh, 1000)):
+                    batches.append(line_batch)
+                else:
+                    break
+            if not batches:
+                break
+            futures = (
+                executor.submit(
+                    _cli_check_urls,
+                    batch,
+                    strict=args.strict,
+                    with_redirects=args.redirects,
+                    language=args.language,
+                )
+                for batch in batches
+            )
+
+            for future in as_completed(futures):
+                for valid, url in future.result():
+                    if valid:
+                        outputfh.write(url + "\n")
+                    # proceed with discarded URLs. to be rewritten
+                    elif args.discardedfile is not None:
+                        with open(
+                            args.discardedfile, "a", encoding="utf-8"
+                        ) as discardfh:
+                            discardfh.write(url)
+
+            batches = []
Function _cli_process refactored with the following changes: use a named expression when reading line batches, replace the nested `if batches:` block with an early `break` (de-nesting the processing code), and parenthesize the group of context managers.
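One portability note on the rewritten `with` statement: parentheses around multiple context managers only became official syntax in Python 3.10 (CPython's new PEG parser happens to accept it in 3.9); on 3.8 it is a SyntaxError. A minimal sketch with placeholder file names:

from concurrent.futures import ProcessPoolExecutor

# Python 3.10+ syntax: the parentheses allow one context manager
# per line, with an optional trailing comma
with (
    ProcessPoolExecutor(max_workers=4) as executor,
    open("urls.txt", "r", encoding="utf-8", errors="ignore") as inputfh,
    open("checked.txt", "w", encoding="utf-8") as outputfh,
):
    pass  # submit batches to the executor here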

Comment on lines -177 to +192

-        if no_filter is False and language is not None and "hreflang" in link:
+        if not no_filter and language is not None and "hreflang" in link:
             langmatch = HREFLANG_REGEX.search(link)
             if langmatch and (
                 langmatch[1].startswith(language) or langmatch[1] == "x-default"
             ):
-                linkmatch = LINK_REGEX.search(link)
-                if linkmatch:
+                if linkmatch := LINK_REGEX.search(link):
                     candidates.add(linkmatch[1])
-        # default
-        else:
-            linkmatch = LINK_REGEX.search(link)
-            if linkmatch:
-                candidates.add(linkmatch[1])
+        elif linkmatch := LINK_REGEX.search(link):
+            candidates.add(linkmatch[1])
     # filter candidates
     for link in candidates:
         # repair using base
         if not link.startswith("http"):
             link = fix_relative_urls(url, link)
         # check
-        if no_filter is False:
+        if not no_filter:
Function extract_links refactored with the following changes: replace `is False` tests with `not`, use named expressions for the link matches, and fold the duplicated default branch into an elif.

This removes the following comments (why?):

# default
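The `is False` → `not` change is safe when the flag is a genuine bool, but it is not a pure identity transform: `x is False` matches only the boolean False, while `not x` also catches None, 0 and empty strings. A tiny illustration with a hypothetical function:

def should_filter(no_filter) -> bool:
    # `not no_filter` treats every falsy value as "apply the filter",
    # whereas `no_filter is False` would match only the bool False
    return not no_filter

assert should_filter(False) and should_filter(None) and should_filter(0)
assert not should_filter(True)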

Comment on lines -86 to +80
if parsed_url.scheme:
scheme = parsed_url.scheme + "://"
else:
scheme = ""
scheme = f"{parsed_url.scheme}://" if parsed_url.scheme else ""
Function get_base_url refactored with the following changes: collapse the four-line if/else assignment into a conditional expression with an f-string.
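A self-contained sketch of the resulting pattern, reduced from (and not identical to) the library's get_base_url:

from urllib.parse import urlsplit

def base_url(url: str) -> str:
    parsed = urlsplit(url)
    # conditional expression plus f-string replace the four-line if/else
    scheme = f"{parsed.scheme}://" if parsed.scheme else ""
    return f"{scheme}{parsed.netloc}"

print(base_url("https://example.org/path?q=1"))  # https://example.org
print(base_url("//example.org/path"))            # example.org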

Comment on lines -147 to +138
if ignore_suffix:
return stripped_domain != stripped_ref
return domain != ref
return stripped_domain != stripped_ref if ignore_suffix else domain != ref
Function is_external refactored with the following changes: merge the two return statements into a single conditional expression.
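Worth noting for the one-liner: the conditional expression binds looser than `!=`, so the return reads as (stripped_domain != stripped_ref) if ignore_suffix else (domain != ref). A reduced sketch in which the stripping is simplified to dropping a "www." prefix (in the library it relates to the domain suffix):

def is_external(domain: str, ref: str, ignore_suffix: bool = True) -> bool:
    # simplified stand-in for the real stripping logic (Python 3.9+)
    stripped_domain = domain.removeprefix("www.")
    stripped_ref = ref.removeprefix("www.")
    # parsed as: (stripped comparison) if ignore_suffix else (exact comparison)
    return stripped_domain != stripped_ref if ignore_suffix else domain != ref

print(is_external("www.example.org", "example.org"))                       # False
print(is_external("www.example.org", "example.org", ignore_suffix=False))  # True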

-    # not elegant, but will do the trick
-    match = re.match(r'(.+?)/', line)
-    if match:
+    if match := re.match(r'(.+?)/', line):
Lines 41-63 refactored with the following changes: replace the match assignment and test with a named expression.

This removes the following comments (why?):

# not elegant, but will do the trick
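As the removed comment conceded, the regex is only one way to take everything before the first slash; a split does the same job. A sketch:

import re

line = "example.org/path/page.html"

# named expression keeps the binding and the test on one line
if match := re.match(r'(.+?)/', line):
    host = match[1]
    # equivalent without a regex, assuming the line contains a slash
    assert host == line.split('/', 1)[0] == "example.org"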

Comment on lines -38 to +53
# files.wordpress.com hack
url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url)
if wphost:
return wphost.group(1).rstrip('/') + '/'

# K.O. victory
ko_string = re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url)
if ko_string:
# files.wordpress.com hack
url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
if wphost := re.match(r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url):
return wphost.group(1).rstrip('/') + '/'

if ko_string := re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url):
return ko_string.group(1).rstrip('/') + '/'

# /tag/ /category/
tagcat = re.match(r'(.+?)(/tag/|/category/|\?cat=)', url)
if tagcat:
if tagcat := re.match(r'(.+?)(/tag/|/category/|\?cat=)', url):
return tagcat.group(1).rstrip('/') + '/'

# query parameters
if re.search(r'/\?p=|\?page_id=|\?paged=/', url):
mquery = re.match(r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url)
if mquery:
if mquery := re.match(
r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url
):
Function find_target refactored with the following changes: replace each match assignment and test with a named expression.

This removes the following comments (why?):

# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
# K.O. victory
# /tag/ /category/
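The rewritten function is a chain of guard clauses: each `if pattern := ...` returns as soon as it matches, so the order of the tests encodes their priority. A trimmed, runnable sketch of two of the rules:

import re
from typing import Optional

def find_target(url: str) -> Optional[str]:
    # wordpress.* hosts win over everything else
    if wphost := re.match(r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url):
        return wphost.group(1).rstrip('/') + '/'
    # then /tag/, /category/ and ?cat= pages
    if tagcat := re.match(r'(.+?)(/tag/|/category/|\?cat=)', url):
        return tagcat.group(1).rstrip('/') + '/'
    return None

print(find_target("https://someblog.wordpress.com/tag/python"))
# https://someblog.wordpress.com/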

-    if my_urls.compressed is False:
+    if not my_urls.compressed:
Function test_urlstore refactored with the following changes: replace the `is False` comparison with `not`.

adbar closed this Jan 16, 2024
adbar deleted the sourcery/master branch January 16, 2024 14:29