Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Retry well-known lookup before expiry. #5844

Merged
merged 2 commits into from
Aug 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/5844.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Retry well-known lookup before the cache expires, giving a grace period where the remote well-known can be down but we still use the old result.
82 changes: 60 additions & 22 deletions synapse/http/federation/well_known_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
# lower bound for .well-known cache period
WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60

# Attempt to refetch a cached well-known N% of the TTL before it expires.
# e.g. if set to 0.2 and we have a cached entry with a TTL of 5mins, then
# we'll start trying to refetch 1 minute before it expires.
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -80,15 +86,38 @@ def get_well_known(self, server_name):
Deferred[WellKnownLookupResult]: The result of the lookup
"""
try:
result = self._well_known_cache[server_name]
prev_result, expiry, ttl = self._well_known_cache.get_with_expiry(
server_name
)

now = self._clock.time()
if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl:
return WellKnownLookupResult(delegated_server=prev_result)
except KeyError:
# TODO: should we linearise so that we don't end up doing two .well-known
# requests for the same server in parallel?
prev_result = None

# TODO: should we linearise so that we don't end up doing two .well-known
# requests for the same server in parallel?
try:
with Measure(self._clock, "get_well_known"):
result, cache_period = yield self._do_get_well_known(server_name)

if cache_period > 0:
self._well_known_cache.set(server_name, result, cache_period)
except _FetchWellKnownFailure as e:
if prev_result and e.temporary:
# This is a temporary failure and we have a still valid cached
# result, so lets return that. Hopefully the next time we ask
# the remote will be back up again.
return WellKnownLookupResult(delegated_server=prev_result)

result = None

# add some randomness to the TTL to avoid a stampeding herd every hour
# after startup
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)

if cache_period > 0:
self._well_known_cache.set(server_name, result, cache_period)

return WellKnownLookupResult(delegated_server=result)

Expand All @@ -99,40 +128,42 @@ def _do_get_well_known(self, server_name):
Args:
server_name (bytes): name of the server, from the requested url

Raises:
_FetchWellKnownFailure if we fail to lookup a result

Returns:
Deferred[Tuple[bytes|None|object],int]:
result, cache period, where result is one of:
- the new server name from the .well-known (as a `bytes`)
- None if there was no .well-known file.
- INVALID_WELL_KNOWN if the .well-known was invalid
Deferred[Tuple[bytes,int]]: The lookup result and cache period.
"""
uri = b"https://%s/.well-known/matrix/server" % (server_name,)
uri_str = uri.decode("ascii")
logger.info("Fetching %s", uri_str)

# We do this in two steps to differentiate between possibly transient
# errors (e.g. can't connect to host, 503 response) and more permenant
# errors (such as getting a 404 response).
try:
response = yield make_deferred_yieldable(
self._well_known_agent.request(b"GET", uri)
)
body = yield make_deferred_yieldable(readBody(response))

if 500 <= response.code < 600:
raise Exception("Non-200 response %s" % (response.code,))
except Exception as e:
logger.info("Error fetching %s: %s", uri_str, e)
raise _FetchWellKnownFailure(temporary=True)

try:
if response.code != 200:
raise Exception("Non-200 response %s" % (response.code,))

parsed_body = json.loads(body.decode("utf-8"))
logger.info("Response from .well-known: %s", parsed_body)
if not isinstance(parsed_body, dict):
raise Exception("not a dict")
if "m.server" not in parsed_body:
raise Exception("Missing key 'm.server'")

result = parsed_body["m.server"].encode("ascii")
except Exception as e:
logger.info("Error fetching %s: %s", uri_str, e)

# add some randomness to the TTL to avoid a stampeding herd every hour
# after startup
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER)
return (None, cache_period)

result = parsed_body["m.server"].encode("ascii")
raise _FetchWellKnownFailure(temporary=False)

cache_period = _cache_period_from_headers(
response.headers, time_now=self._reactor.seconds
Expand Down Expand Up @@ -185,3 +216,10 @@ def _parse_cache_control(headers):
v = splits[1] if len(splits) > 1 else None
cache_controls[k] = v
return cache_controls


@attr.s()
class _FetchWellKnownFailure(Exception):
# True if we didn't get a non-5xx HTTP response, i.e. this may or may not be
# a temporary failure.
temporary = attr.ib()
8 changes: 5 additions & 3 deletions synapse/util/caches/ttlcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def set(self, key, value, ttl):
if e != SENTINEL:
self._expiry_list.remove(e)

entry = _CacheEntry(expiry_time=expiry, key=key, value=value)
entry = _CacheEntry(expiry_time=expiry, ttl=ttl, key=key, value=value)
self._data[key] = entry
self._expiry_list.add(entry)

Expand Down Expand Up @@ -87,7 +87,8 @@ def get_with_expiry(self, key):
key: key to look up

Returns:
Tuple[Any, float]: the value from the cache, and the expiry time
Tuple[Any, float, float]: the value from the cache, the expiry time
and the TTL

Raises:
KeyError if the entry is not found
Expand All @@ -99,7 +100,7 @@ def get_with_expiry(self, key):
self._metrics.inc_misses()
raise
self._metrics.inc_hits()
return e.value, e.expiry_time
return e.value, e.expiry_time, e.ttl

def pop(self, key, default=SENTINEL):
"""Remove a value from the cache
Expand Down Expand Up @@ -158,5 +159,6 @@ class _CacheEntry(object):

# expiry_time is the first attribute, so that entries are sorted by expiry.
expiry_time = attr.ib()
ttl = attr.ib()
key = attr.ib()
value = attr.ib()
69 changes: 69 additions & 0 deletions tests/http/federation/test_matrix_federation_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,75 @@ def test_well_known_cache(self):
r = self.successResultOf(fetch_d)
self.assertEqual(r.delegated_server, b"other-server")

def test_well_known_cache_with_temp_failure(self):
"""Test that we refetch well-known before the cache expires, and that
it ignores transient errors.
"""

well_known_resolver = WellKnownResolver(
self.reactor,
Agent(self.reactor, contextFactory=self.tls_factory),
well_known_cache=self.well_known_cache,
)

self.reactor.lookups["testserv"] = "1.2.3.4"

fetch_d = well_known_resolver.get_well_known(b"testserv")

# there should be an attempt to connect on port 443 for the .well-known
clients = self.reactor.tcpClients
self.assertEqual(len(clients), 1)
(host, port, client_factory, _timeout, _bindAddress) = clients.pop(0)
self.assertEqual(host, "1.2.3.4")
self.assertEqual(port, 443)

well_known_server = self._handle_well_known_connection(
client_factory,
expected_sni=b"testserv",
response_headers={b"Cache-Control": b"max-age=1000"},
content=b'{ "m.server": "target-server" }',
)

r = self.successResultOf(fetch_d)
self.assertEqual(r.delegated_server, b"target-server")

# close the tcp connection
well_known_server.loseConnection()

# Get close to the cache expiry, this will cause the resolver to do
# another lookup.
self.reactor.pump((900.0,))

fetch_d = well_known_resolver.get_well_known(b"testserv")
clients = self.reactor.tcpClients
(host, port, client_factory, _timeout, _bindAddress) = clients.pop(0)

# fonx the connection attempt, this will be treated as a temporary
# failure.
client_factory.clientConnectionFailed(None, Exception("nope"))

# attemptdelay on the hostnameendpoint is 0.3, so takes that long before the
# .well-known request fails.
self.reactor.pump((0.4,))

# Resolver should return cached value, despite the lookup failing.
r = self.successResultOf(fetch_d)
self.assertEqual(r.delegated_server, b"target-server")

# Expire the cache and repeat the request
self.reactor.pump((100.0,))

# Repated the request, this time it should fail if the lookup fails.
fetch_d = well_known_resolver.get_well_known(b"testserv")

clients = self.reactor.tcpClients
(host, port, client_factory, _timeout, _bindAddress) = clients.pop(0)
client_factory.clientConnectionFailed(None, Exception("nope"))
self.reactor.pump((0.4,))

r = self.successResultOf(fetch_d)
self.assertEqual(r.delegated_server, None)


class TestCachePeriodFromHeaders(TestCase):
def test_cache_control(self):
Expand Down
4 changes: 2 additions & 2 deletions tests/util/caches/test_ttlcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_get(self):
self.assertTrue("one" in self.cache)
self.assertEqual(self.cache.get("one"), "1")
self.assertEqual(self.cache["one"], "1")
self.assertEqual(self.cache.get_with_expiry("one"), ("1", 110))
self.assertEqual(self.cache.get_with_expiry("one"), ("1", 110, 10))
self.assertEqual(self.cache._metrics.hits, 3)
self.assertEqual(self.cache._metrics.misses, 0)

Expand Down Expand Up @@ -77,7 +77,7 @@ def test_expiry(self):
self.assertEqual(self.cache["two"], "2")
self.assertEqual(self.cache["three"], "3")

self.assertEqual(self.cache.get_with_expiry("two"), ("2", 120))
self.assertEqual(self.cache.get_with_expiry("two"), ("2", 120, 20))

self.assertEqual(self.cache._metrics.hits, 5)
self.assertEqual(self.cache._metrics.misses, 0)