From 723bdd387d7b8ff91d9fe20aa183e73302db6c6a Mon Sep 17 00:00:00 2001
From: lijinmiao <beikejinmiao@gmail.com>
Date: Sat, 8 Apr 2023 23:14:44 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dtldextract=20=20cache?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98(https://github.com/john-kurkowski/tldextract?=
 =?UTF-8?q?/issues/254)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 libs/regex.py        |  8 ++++----
 libs/web/url.py      |  4 ++--
 tools/alexa_bloom.py |  4 ++--
 utils/hostsplit.py   | 11 +++++++++++
 4 files changed, 19 insertions(+), 8 deletions(-)
 create mode 100644 utils/hostsplit.py

diff --git a/libs/regex.py b/libs/regex.py
index b1e1243..eb1ee7f 100644
--- a/libs/regex.py
+++ b/libs/regex.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 import re
-import tldextract
+from utils.hostsplit import domextract
 
 
 ipv4 = re.compile(r"^((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)(\.((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)){3}$")
@@ -47,13 +47,13 @@ def is_valid_ip(text):
 
 
 def is_valid_domain(text):
-    if domain.match(text) and "." in text[-7:] and tldextract.extract(text).suffix != "":
+    if domain.match(text) and "." in text[-7:] and domextract(text).suffix != "":
         return True
     return False
 
 
 def maybe_url(text):
-    if url.match(text) and tldextract.extract(text).suffix != "":
+    if url.match(text) and domextract(text).suffix != "":
         return True
     return False
 
@@ -68,7 +68,7 @@ def find_domains(text):
     domains = set()
     for item in domain_find_regex.findall(text):
         # in general, domain suffix length less than 6.
-        if "." in item[-7:] and tldextract.extract(item).suffix != "":
+        if "." in item[-7:] and domextract(item).suffix != "":
             domains.add(item)
     return list(domains)
 
diff --git a/libs/web/url.py b/libs/web/url.py
index 4b7a293..4012711 100644
--- a/libs/web/url.py
+++ b/libs/web/url.py
@@ -2,12 +2,12 @@
 # -*- coding:utf-8 -*-
 import os
 import re
-import tldextract
 import html as htmlparser
 from urllib.parse import unquote
 from collections import namedtuple
 from urllib.parse import urlparse
 from libs.regex import html, common_dom
+from utils.hostsplit import domextract
 
 
 def normal_url(url):
@@ -25,7 +25,7 @@ def urlsite(url):
     if re.match(r'^\w+://', url):
         site = urlparse(url).netloc
     #
-    ext = tldextract.extract(url)
+    ext = domextract(url)
     if not ext.registered_domain:
         return UrlSiteResult(subdomain='', domain='', suffix='',
                              reg_domain='', hostname=site)
diff --git a/tools/alexa_bloom.py b/tools/alexa_bloom.py
index 9a62711..6d9dba4 100644
--- a/tools/alexa_bloom.py
+++ b/tools/alexa_bloom.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 import os
-import tldextract
 from pybloom_live import BloomFilter
 from utils.filedir import reader
+from utils.hostsplit import domextract
 from conf.paths import PRIVATE_RESOURCE_HOME
 from conf.paths import ALEXA_BLOOM_FILTER_PATH
 
@@ -25,7 +25,7 @@ def check(hosts):
         bloom = BloomFilter.fromfile(fopen)
     for host in hosts:
         host = host.lower()
-        reg_domain = tldextract.extract(host).registered_domain
+        reg_domain = domextract(host).registered_domain
         if not reg_domain:
             reg_domain = host
         if reg_domain in bloom:
diff --git a/utils/hostsplit.py b/utils/hostsplit.py
new file mode 100644
index 0000000..f74a77c
--- /dev/null
+++ b/utils/hostsplit.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+import tldextract
+
+
+# Shared extractor built with cache_dir=False so tldextract's cache dir is
+# not used (see https://github.com/john-kurkowski/tldextract/issues/254).
+# Used in place of tldextract.extract(): domextract(host).suffix, etc.
+# NOTE(review): tldextract documents cache_dir=None for disabling the cache;
+# False is presumably accepted as a falsy stand-in -- confirm on upgrade.
+domextract = tldextract.TLDExtract(cache_dir=False)