Cache entire public suffix list. Select at runtime. #144

Closed
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ tldextract_app/tldextract
 tldextract_app/web
 tldextract.egg-info
 .tox
+tldextract/.suffix_cache/*
+.pytest_cache
21 changes: 7 additions & 14 deletions README.md
@@ -106,7 +106,7 @@ tldextract http://forums.bbc.co.uk
 
 Beware when first running the module, it updates its TLD list with a live HTTP
 request. This updated TLD set is cached indefinitely in
-`/path/to/tldextract/.tld_set`.
+`/path/to/tldextract/.suffix_cache`.
 
 (Arguably runtime bootstrapping like that shouldn't be the default behavior,
 like for production systems. But I want you to have the latest TLDs, especially
@@ -122,11 +122,11 @@ no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
 no_fetch_extract('http://www.google.com')
 
 # extract callable that reads/writes the updated TLD set to a different path
-custom_cache_extract = tldextract.TLDExtract(cache_file='/path/to/your/cache/file')
+custom_cache_extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/dir')
 custom_cache_extract('http://www.google.com')
 
 # extract callable that doesn't use caching
-no_cache_extract = tldextract.TLDExtract(cache_file=False)
+no_cache_extract = tldextract.TLDExtract(cache_dir=False)
 no_cache_extract('http://www.google.com')
 ```
 
@@ -169,9 +169,8 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
 The following overrides this.
 
 ```python
->>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
->>> extract.update() # necessary until #66 is fixed
->>> extract('waiterrant.blogspot.com')
+>>> extract = tldextract.TLDExtract()
+>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
 ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
 ```
 
@@ -185,11 +184,7 @@ behavior of other, PSL-based libraries.
 You can specify your own input data in place of the default Mozilla Public Suffix List:
 
 ```python
-extract = tldextract.TLDExtract(
-    suffix_list_urls=["http://foo.bar.baz"],
-    # Recommended: Specify your own cache file, to minimize ambiguities about where
-    # tldextract is getting its data, or cached data, from.
-    cache_file='/path/to/your/cache/file')
+extract = tldextract.TLDExtract(suffix_list_urls=["http://foo.bar.baz"])
 ```
 
 The above snippet will fetch from the URL *you* specified, upon first need to download the
@@ -198,9 +193,7 @@ suffix list (i.e. if the cache_file doesn't exist).
 
 If you want to use input data from your local filesystem, just use the `file://` protocol:
 
 ```python
-extract = tldextract.TLDExtract(
-    suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
-    cache_file='/path/to/your/cache/file')
+extract = tldextract.TLDExtract(suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"])
 ```
 
 Use an absolute path when specifying the `suffix_list_urls` keyword argument.
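Taken together, the README changes above boil down to: one extractor, one cache of the entire public suffix list, and per-call selection of private domains. A quick sketch of that usage, assuming the API behaves as the updated examples show:

```python
import tldextract

# One extractor caches the entire public suffix list once.
extract = tldextract.TLDExtract()

# Default: private PSL entries (like blogspot.com) are not treated as suffixes.
extract('waiterrant.blogspot.com')
# ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')

# Private domains are now selected per call, against the same cached list.
extract('waiterrant.blogspot.com', include_psl_private_domains=True)
# ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```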
2 changes: 1 addition & 1 deletion setup.py
@@ -33,7 +33,7 @@
     "Please upgrade your Python or use an older "
     "version of tldextract.")
 
-INSTALL_REQUIRES = ["setuptools", "idna", "requests>=2.1.0", "requests-file>=1.4"]
+INSTALL_REQUIRES = ["setuptools", "idna", "requests>=2.1.0", "requests-file>=1.4", "filelock>=3.0.8"]
 
 setup(
     name="tldextract",
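The new `filelock` dependency suggests the shared cache directory is guarded against concurrent writers, e.g. several processes bootstrapping the suffix list at once. A minimal sketch of that pattern with hypothetical file names; the PR's actual locking code isn't part of this diff:

```python
import os
import tempfile

from filelock import FileLock  # the dependency added above

# Hypothetical cache layout, for illustration only.
cache_dir = os.path.join(tempfile.gettempdir(), '.suffix_cache')
os.makedirs(cache_dir, exist_ok=True)

list_path = os.path.join(cache_dir, 'publicsuffix.dat')
with FileLock(list_path + '.lock'):  # blocks while another process holds the lock
    if not os.path.exists(list_path):
        with open(list_path, 'w') as f:
            f.write('com\nco.uk\nblogspot.com\n')  # stand-in for the fetched list
```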
8 changes: 4 additions & 4 deletions tests/custom_suffix_test.py
@@ -4,7 +4,7 @@
 
 import tldextract
 
-from .helpers import temporary_file
+from .helpers import temporary_dir
 
 
 FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
@@ -15,15 +15,15 @@
 
 # pylint: disable=invalid-name
 extract_using_fake_suffix_list = tldextract.TLDExtract(
-    cache_file=temporary_file(),
+    cache_dir=temporary_dir(),
     suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
 )
 extract_using_fake_suffix_list_no_cache = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
     suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
 )
 extract_using_extra_suffixes = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
     suffix_list_urls=[FAKE_SUFFIX_LIST_URL],
     extra_suffixes=EXTRA_SUFFIXES
 )
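For context, the last extractor above exercises `extra_suffixes`, which supplements whatever suffix list is in use. A sketch with invented values (the test's real `EXTRA_SUFFIXES` are in a collapsed part of this diff):

```python
import tldextract

# Invented private suffix for illustration.
extract = tldextract.TLDExtract(
    cache_dir=None,                   # no cache for this one-off extractor
    extra_suffixes=['foo.internal'],  # treated like any public suffix
)

extract('server.team.foo.internal')
# expected: ExtractResult(subdomain='server', domain='team', suffix='foo.internal')
```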
4 changes: 2 additions & 2 deletions tests/helpers.py
@@ -19,7 +19,7 @@ def check_output(*popenargs, **kwargs):
     return output
 
 
-def temporary_file():
+def temporary_dir():
     """ Make a writable temporary directory and return its absolute path.
     """
-    return tempfile.mkstemp()[1]
+    return tempfile.mkdtemp()
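One caveat with the new helper: `tempfile.mkdtemp` creates the directory but never removes it, so cleanup is the caller's job. The usual pattern, not part of this PR:

```python
import shutil
import tempfile

cache_dir = tempfile.mkdtemp()  # mkdtemp never deletes what it creates
try:
    pass  # exercise code that writes cache files under cache_dir
finally:
    shutil.rmtree(cache_dir, ignore_errors=True)
```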
23 changes: 1 addition & 22 deletions tests/integration_test.py
@@ -1,33 +1,12 @@
 '''tldextract integration tests.'''
 
-import logging
-import os
-import traceback
-
 import pytest
 
 import tldextract
 
 
-def test_log_snapshot_diff(mocker):
-    mocker.patch.object(logging.getLogger(), 'level', logging.DEBUG)
-    debug_mock = mocker.patch.object(logging.getLogger('tldextract'), 'debug')
-
-    extractor = tldextract.TLDExtract()
-    try:
-        os.remove(extractor.cache_file)
-    except (IOError, OSError):
-        logging.warning(traceback.format_exc())
-
-    extractor('ignore.com')
-
-    assert debug_mock.call_count == 1
-    log_str = debug_mock.call_args[0][0]
-    assert log_str.startswith('computed TLD diff')
-
-
 def test_bad_kwargs():
     with pytest.raises(ValueError):
         tldextract.TLDExtract(
-            cache_file=False, suffix_list_urls=False, fallback_to_snapshot=False
+            cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
         )
21 changes: 12 additions & 9 deletions tests/main_test.py
@@ -4,21 +4,24 @@
 import sys
 
 import responses
+
 import tldextract
-from .helpers import temporary_file
+from .helpers import temporary_dir
 
 if sys.version_info >= (3,):  # pragma: no cover
     unicode = str  # pylint: disable=invalid-name,redefined-builtin
 
 
 # pylint: disable=invalid-name
-extract = tldextract.TLDExtract(cache_file=temporary_file())
-extract_no_cache = tldextract.TLDExtract(cache_file=False)
-extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_file=temporary_file())
-extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_file=False)
+extract = tldextract.TLDExtract(cache_dir=temporary_dir())
+extract_no_cache = tldextract.TLDExtract(cache_dir=False)
+extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_dir=temporary_dir())
+extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_dir=False)
 extract_using_fallback_to_snapshot_no_cache = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
     suffix_list_urls=None
 )
+
+
 # pylint: enable=invalid-name
@@ -90,7 +93,7 @@ def test_qualified_local_host():
 def test_ip():
     assert_extract('http://216.22.0.192/',
                    ('', '', '216.22.0.192', ''),
-                   expected_ip_data='216.22.0.192',)
+                   expected_ip_data='216.22.0.192', )
     assert_extract('http://216.22.project.coop/',
                    ('216.22.project.coop', '216.22', 'project', 'coop'))
 
@@ -223,7 +226,7 @@ def test_result_as_dict():
     )
     expected_dict = {'subdomain': 'www',
                      'domain': 'google',
-                     'suffix': 'com'}
+                     'suffix': 'com', }
     assert result._asdict() == expected_dict
 