Skip to content

Commit

Permalink
修复tldextract
Browse files Browse the repository at this point in the history
  • Loading branch information
beikejinmiao committed Apr 8, 2023
1 parent 06462b1 commit 723bdd3
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 8 deletions.
8 changes: 4 additions & 4 deletions libs/regex.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import tldextract
from utils.hostsplit import domextract


ipv4 = re.compile(r"^((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)(\.((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)){3}$")
Expand Down Expand Up @@ -47,13 +47,13 @@ def is_valid_ip(text):


def is_valid_domain(text):
if domain.match(text) and "." in text[-7:] and tldextract.extract(text).suffix != "":
if domain.match(text) and "." in text[-7:] and domextract(text).suffix != "":
return True
return False


def maybe_url(text):
if url.match(text) and tldextract.extract(text).suffix != "":
if url.match(text) and domextract(text).suffix != "":
return True
return False

Expand All @@ -68,7 +68,7 @@ def find_domains(text):
domains = set()
for item in domain_find_regex.findall(text):
# In general, a domain suffix is at most 6 characters long, so a dot must appear within the last 7 characters.
if "." in item[-7:] and tldextract.extract(item).suffix != "":
if "." in item[-7:] and domextract(item).suffix != "":
domains.add(item)
return list(domains)

Expand Down
4 changes: 2 additions & 2 deletions libs/web/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# -*- coding:utf-8 -*-
import os
import re
import tldextract
import html as htmlparser
from urllib.parse import unquote
from collections import namedtuple
from urllib.parse import urlparse
from libs.regex import html, common_dom
from utils.hostsplit import domextract


def normal_url(url):
Expand All @@ -25,7 +25,7 @@ def urlsite(url):
if re.match(r'^\w+://', url):
site = urlparse(url).netloc
#
ext = tldextract.extract(url)
ext = domextract(url)
if not ext.registered_domain:
return UrlSiteResult(subdomain='', domain='', suffix='',
reg_domain='', hostname=site)
Expand Down
4 changes: 2 additions & 2 deletions tools/alexa_bloom.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import tldextract
from pybloom_live import BloomFilter
from utils.filedir import reader
from utils.hostsplit import domextract
from conf.paths import PRIVATE_RESOURCE_HOME
from conf.paths import ALEXA_BLOOM_FILTER_PATH

Expand All @@ -25,7 +25,7 @@ def check(hosts):
bloom = BloomFilter.fromfile(fopen)
for host in hosts:
host = host.lower()
reg_domain = tldextract.extract(host).registered_domain
reg_domain = domextract(host).registered_domain
if not reg_domain:
reg_domain = host
if reg_domain in bloom:
Expand Down
11 changes: 11 additions & 0 deletions utils/hostsplit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import tldextract


# Shared extractor instance: other modules import `domextract` and call it
# the same way as `tldextract.extract` (e.g. `domextract(host).suffix`,
# `domextract(host).registered_domain`).
# NOTE(review): cache_dir=False presumably turns off tldextract's on-disk
# suffix-list cache -- confirm against the installed tldextract version.
domextract = tldextract.TLDExtract(cache_dir=False)





0 comments on commit 723bdd3

Please sign in to comment.