From 723bdd387d7b8ff91d9fe20aa183e73302db6c6a Mon Sep 17 00:00:00 2001 From: lijinmiao Date: Sat, 8 Apr 2023 23:14:44 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dtldextract=20=20cache?= =?UTF-8?q?=E9=97=AE=E9=A2=98(https://github.com/john-kurkowski/tldextract?= =?UTF-8?q?/issues/254)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libs/regex.py | 8 ++++---- libs/web/url.py | 4 ++-- tools/alexa_bloom.py | 4 ++-- utils/hostsplit.py | 11 +++++++++++ 4 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 utils/hostsplit.py diff --git a/libs/regex.py b/libs/regex.py index b1e1243..eb1ee7f 100644 --- a/libs/regex.py +++ b/libs/regex.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- import re -import tldextract +from utils.hostsplit import domextract ipv4 = re.compile(r"^((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)(\.((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)){3}$") @@ -47,13 +47,13 @@ def is_valid_ip(text): def is_valid_domain(text): - if domain.match(text) and "." in text[-7:] and tldextract.extract(text).suffix != "": + if domain.match(text) and "." in text[-7:] and domextract(text).suffix != "": return True return False def maybe_url(text): - if url.match(text) and tldextract.extract(text).suffix != "": + if url.match(text) and domextract(text).suffix != "": return True return False @@ -68,7 +68,7 @@ def find_domains(text): domains = set() for item in domain_find_regex.findall(text): # in general, domain suffix length less than 6. - if "." in item[-7:] and tldextract.extract(item).suffix != "": + if "." in item[-7:] and domextract(item).suffix != "": domains.add(item) return list(domains) diff --git a/libs/web/url.py b/libs/web/url.py index 4b7a293..4012711 100644 --- a/libs/web/url.py +++ b/libs/web/url.py @@ -2,12 +2,12 @@ # -*- coding:utf-8 -*- import os import re -import tldextract import html as htmlparser from urllib.parse import unquote from collections import namedtuple from urllib.parse import urlparse from libs.regex import html, common_dom +from utils.hostsplit import domextract def normal_url(url): @@ -25,7 +25,7 @@ def urlsite(url): if re.match(r'^\w+://', url): site = urlparse(url).netloc # - ext = tldextract.extract(url) + ext = domextract(url) if not ext.registered_domain: return UrlSiteResult(subdomain='', domain='', suffix='', reg_domain='', hostname=site) diff --git a/tools/alexa_bloom.py b/tools/alexa_bloom.py index 9a62711..6d9dba4 100644 --- a/tools/alexa_bloom.py +++ b/tools/alexa_bloom.py @@ -1,9 +1,9 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- import os -import tldextract from pybloom_live import BloomFilter from utils.filedir import reader +from utils.hostsplit import domextract from conf.paths import PRIVATE_RESOURCE_HOME from conf.paths import ALEXA_BLOOM_FILTER_PATH @@ -25,7 +25,7 @@ def check(hosts): bloom = BloomFilter.fromfile(fopen) for host in hosts: host = host.lower() - reg_domain = tldextract.extract(host).registered_domain + reg_domain = domextract(host).registered_domain if not reg_domain: reg_domain = host if reg_domain in bloom: diff --git a/utils/hostsplit.py b/utils/hostsplit.py new file mode 100644 index 0000000..f74a77c --- /dev/null +++ b/utils/hostsplit.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +import tldextract + + +domextract = tldextract.TLDExtract(cache_dir=False) + + + + +