修复tldextract cache问题 (john-kurkowski/tldextract#254)
beikejinmiao · Apr 8, 2023 · 723bdd3 · 723bdd3
1 parent 06462b1
commit 723bdd3
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 8 deletions.
diff --git a/libs/regex.py b/libs/regex.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 import re
-import tldextract
+from utils.hostsplit import domextract
 
 
 ipv4 = re.compile(r"^((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)(\.((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)){3}$")
@@ -47,13 +47,13 @@ def is_valid_ip(text):
 
 
 def is_valid_domain(text):
-    if domain.match(text) and "." in text[-7:] and tldextract.extract(text).suffix != "":
+    if domain.match(text) and "." in text[-7:] and domextract(text).suffix != "":
         return True
     return False
 
 
 def maybe_url(text):
-    if url.match(text) and tldextract.extract(text).suffix != "":
+    if url.match(text) and domextract(text).suffix != "":
         return True
     return False
 
@@ -68,7 +68,7 @@ def find_domains(text):
     domains = set()
     for item in domain_find_regex.findall(text):
         # in general, domain suffix length less than 6.
-        if "." in item[-7:] and tldextract.extract(item).suffix != "":
+        if "." in item[-7:] and domextract(item).suffix != "":
             domains.add(item)
     return list(domains)
 

diff --git a/libs/web/url.py b/libs/web/url.py
@@ -2,12 +2,12 @@
 # -*- coding:utf-8 -*-
 import os
 import re
-import tldextract
 import html as htmlparser
 from urllib.parse import unquote
 from collections import namedtuple
 from urllib.parse import urlparse
 from libs.regex import html, common_dom
+from utils.hostsplit import domextract
 
 
 def normal_url(url):
@@ -25,7 +25,7 @@ def urlsite(url):
     if re.match(r'^\w+://', url):
         site = urlparse(url).netloc
     #
-    ext = tldextract.extract(url)
+    ext = domextract(url)
     if not ext.registered_domain:
         return UrlSiteResult(subdomain='', domain='', suffix='',
                              reg_domain='', hostname=site)

diff --git a/tools/alexa_bloom.py b/tools/alexa_bloom.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 import os
-import tldextract
 from pybloom_live import BloomFilter
 from utils.filedir import reader
+from utils.hostsplit import domextract
 from conf.paths import PRIVATE_RESOURCE_HOME
 from conf.paths import ALEXA_BLOOM_FILTER_PATH
 
@@ -25,7 +25,7 @@ def check(hosts):
         bloom = BloomFilter.fromfile(fopen)
     for host in hosts:
         host = host.lower()
-        reg_domain = tldextract.extract(host).registered_domain
+        reg_domain = domextract(host).registered_domain
         if not reg_domain:
             reg_domain = host
         if reg_domain in bloom:

diff --git a/utils/hostsplit.py b/utils/hostsplit.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+import tldextract
+
+
+domextract = tldextract.TLDExtract(cache_dir=False)
+
+
+
+
+