diff --git a/CHANGES/9203.misc.rst b/CHANGES/9203.misc.rst new file mode 100644 index 00000000000..766fdc01a57 --- /dev/null +++ b/CHANGES/9203.misc.rst @@ -0,0 +1,3 @@ +Significantly improved performance of expiring cookies -- by :user:`bdraco`. + +Expiring cookies has been redesigned to use :mod:`heapq` instead of a linear search, to better scale. diff --git a/aiohttp/cookiejar.py b/aiohttp/cookiejar.py index 43e701bfe59..6a687531767 100644 --- a/aiohttp/cookiejar.py +++ b/aiohttp/cookiejar.py @@ -1,6 +1,7 @@ import calendar import contextlib import datetime +import heapq import itertools import os # noqa import pathlib @@ -10,13 +11,13 @@ import warnings from collections import defaultdict from http.cookies import BaseCookie, Morsel, SimpleCookie -from math import ceil from typing import ( DefaultDict, Dict, FrozenSet, Iterable, Iterator, + List, Mapping, Optional, Set, @@ -40,6 +41,11 @@ _FORMAT_PATH = "{}/{}".format _FORMAT_DOMAIN_REVERSED = "{1}.{0}".format +# The minimum number of scheduled cookie expirations before we start cleaning up +# the expiration heap. This is a performance optimization to avoid cleaning up the +# heap too often when there are only a few scheduled expirations. +_MIN_SCHEDULED_COOKIE_EXPIRATION = 100 + class CookieJar(AbstractCookieJar): """Implements cookie storage adhering to RFC 6265.""" @@ -106,7 +112,7 @@ def __init__( for url in treat_as_secure_origin } ) - self._next_expiration: float = ceil(time.time()) + self._expire_heap: List[Tuple[float, Tuple[str, str, str]]] = [] self._expirations: Dict[Tuple[str, str, str], float] = {} def save(self, file_path: PathLike) -> None: @@ -121,34 +127,25 @@ def load(self, file_path: PathLike) -> None: def clear(self, predicate: Optional[ClearCookiePredicate] = None) -> None: if predicate is None: - self._next_expiration = ceil(time.time()) + self._expire_heap.clear() self._cookies.clear() self._host_only_cookies.clear() self._expirations.clear() return - to_del = [] now = time.time() - for (domain, path), cookie in self._cookies.items(): - for name, morsel in cookie.items(): - key = (domain, path, name) - if ( - key in self._expirations and self._expirations[key] <= now - ) or predicate(morsel): - to_del.append(key) - - for domain, path, name in to_del: - self._host_only_cookies.discard((domain, name)) - key = (domain, path, name) - if key in self._expirations: - del self._expirations[(domain, path, name)] - self._cookies[(domain, path)].pop(name, None) - - self._next_expiration = ( - min(*self._expirations.values(), self.SUB_MAX_TIME) + 1 - if self._expirations - else self.MAX_TIME - ) + to_del = [ + key + for (domain, path), cookie in self._cookies.items() + for name, morsel in cookie.items() + if ( + (key := (domain, path, name)) in self._expirations + and self._expirations[key] <= now + ) + or predicate(morsel) + ] + if to_del: + self._delete_cookies(to_del) def clear_domain(self, domain: str) -> None: self.clear(lambda x: self._is_domain_match(domain, x["domain"])) @@ -167,11 +164,61 @@ def __len__(self) -> int: return sum(len(cookie.values()) for cookie in self._cookies.values()) def _do_expiration(self) -> None: - self.clear(lambda x: False) + """Remove expired cookies.""" + if not (expire_heap_len := len(self._expire_heap)): + return + + # If the expiration heap grows larger than the number expirations + # times two, we clean it up to avoid keeping expired entries in + # the heap and consuming memory. We guard this with a minimum + # threshold to avoid cleaning up the heap too often when there are + # only a few scheduled expirations. + if ( + expire_heap_len > _MIN_SCHEDULED_COOKIE_EXPIRATION + and expire_heap_len > len(self._expirations) * 2 + ): + # Remove any expired entries from the expiration heap + # that do not match the expiration time in the expirations + # as it means the cookie has been re-added to the heap + # with a different expiration time. + self._expire_heap = [ + entry + for entry in self._expire_heap + if self._expirations.get(entry[1]) == entry[0] + ] + heapq.heapify(self._expire_heap) + + now = time.time() + to_del: List[Tuple[str, str, str]] = [] + # Find any expired cookies and add them to the to-delete list + while self._expire_heap: + when, cookie_key = self._expire_heap[0] + if when > now: + break + heapq.heappop(self._expire_heap) + # Check if the cookie hasn't been re-added to the heap + # with a different expiration time as it will be removed + # later when it reaches the top of the heap and its + # expiration time is met. + if self._expirations.get(cookie_key) == when: + to_del.append(cookie_key) + + if to_del: + self._delete_cookies(to_del) + + def _delete_cookies(self, to_del: List[Tuple[str, str, str]]) -> None: + for domain, path, name in to_del: + self._host_only_cookies.discard((domain, name)) + self._cookies[(domain, path)].pop(name, None) + self._expirations.pop((domain, path, name), None) def _expire_cookie(self, when: float, domain: str, path: str, name: str) -> None: - self._next_expiration = min(self._next_expiration, when) - self._expirations[(domain, path, name)] = when + cookie_key = (domain, path, name) + if self._expirations.get(cookie_key) == when: + # Avoid adding duplicates to the heap + return + heapq.heappush(self._expire_heap, (when, cookie_key)) + self._expirations[cookie_key] = when def update_cookies(self, cookies: LooseCookies, response_url: URL = URL()) -> None: """Update cookies.""" diff --git a/tests/test_cookiejar.py b/tests/test_cookiejar.py index bfe6fe27751..0821bcc5fcb 100644 --- a/tests/test_cookiejar.py +++ b/tests/test_cookiejar.py @@ -1,9 +1,11 @@ import asyncio import datetime +import heapq import itertools import pickle import unittest from http.cookies import BaseCookie, Morsel, SimpleCookie +from operator import not_ from pathlib import Path from typing import List, Set, Tuple, Union from unittest import mock @@ -876,9 +878,95 @@ async def test_cookie_jar_clear_expired() -> None: with freeze_time("1980-01-01"): sut.update_cookies(cookie) - sut.clear(lambda x: False) - with freeze_time("1980-01-01"): - assert len(sut) == 0 + for _ in range(2): + sut.clear(not_) + with freeze_time("1980-01-01"): + assert len(sut) == 0 + + +async def test_cookie_jar_expired_changes() -> None: + """Test that expire time changes are handled as expected.""" + jar = CookieJar() + + cookie_eleven_am = SimpleCookie() + cookie_eleven_am["foo"] = "bar" + cookie_eleven_am["foo"]["expires"] = "Tue, 1 Jan 1990 11:00:00 GMT" + + cookie_noon = SimpleCookie() + cookie_noon["foo"] = "bar" + cookie_noon["foo"]["expires"] = "Tue, 1 Jan 1990 12:00:00 GMT" + + cookie_one_pm = SimpleCookie() + cookie_one_pm["foo"] = "bar" + cookie_one_pm["foo"]["expires"] = "Tue, 1 Jan 1990 13:00:00 GMT" + + cookie_two_pm = SimpleCookie() + cookie_two_pm["foo"] = "bar" + cookie_two_pm["foo"]["expires"] = "Tue, 1 Jan 1990 14:00:00 GMT" + + with freeze_time() as freezer: + freezer.move_to("1990-01-01 10:00:00+00:00") + jar.update_cookies(cookie_noon) + assert len(jar) == 1 + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + + jar.update_cookies(cookie_eleven_am) + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + + jar.update_cookies(cookie_one_pm) + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + + jar.update_cookies(cookie_two_pm) + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + + freezer.move_to("1990-01-01 13:00:00+00:00") + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + + freezer.move_to("1990-01-01 14:00:00+00:00") + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 0 + + +async def test_cookie_jar_duplicates_with_expire_heap() -> None: + """Test that duplicate cookies do not grow the expires heap.""" + jar = CookieJar() + + cookie_eleven_am = SimpleCookie() + cookie_eleven_am["foo"] = "bar" + cookie_eleven_am["foo"]["expires"] = "Tue, 1 Jan 1990 11:00:00 GMT" + + cookie_two_pm = SimpleCookie() + cookie_two_pm["foo"] = "bar" + cookie_two_pm["foo"]["expires"] = "Tue, 1 Jan 1990 14:00:00 GMT" + + with freeze_time() as freezer: + freezer.move_to("1990-01-01 10:00:00+00:00") + + for _ in range(10): + jar.update_cookies(cookie_eleven_am) + + assert len(jar) == 1 + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + + assert len(jar._expire_heap) == 1 + + freezer.move_to("1990-01-01 16:00:00+00:00") + jar.update_cookies(cookie_two_pm) + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 0 + assert len(jar._expire_heap) == 0 async def test_cookie_jar_filter_cookies_expires() -> None: @@ -902,6 +990,84 @@ async def test_cookie_jar_filter_cookies_expires() -> None: assert len(jar) == 0 +async def test_cookie_jar_heap_cleanup() -> None: + """Test that the heap gets cleaned up when there are many old expirations.""" + jar = CookieJar() + # The heap should not be cleaned up when there are less than 100 expiration changes + min_cookies_to_cleanup = 100 + + with freeze_time() as freezer: + freezer.move_to("1990-01-01 09:00:00+00:00") + + start_time = datetime.datetime( + 1990, 1, 1, 10, 0, 0, tzinfo=datetime.timezone.utc + ) + for i in range(min_cookies_to_cleanup): + cookie = SimpleCookie() + cookie["foo"] = "bar" + cookie["foo"]["expires"] = ( + start_time + datetime.timedelta(seconds=i) + ).strftime("%a, %d %b %Y %H:%M:%S GMT") + jar.update_cookies(cookie) + assert len(jar._expire_heap) == i + 1 + + assert len(jar._expire_heap) == min_cookies_to_cleanup + + # Now that we reached the minimum number of cookies to cleanup, + # add one more cookie to trigger the cleanup + cookie = SimpleCookie() + cookie["foo"] = "bar" + cookie["foo"]["expires"] = ( + start_time + datetime.timedelta(seconds=i + 1) + ).strftime("%a, %d %b %Y %H:%M:%S GMT") + jar.update_cookies(cookie) + + # Verify that the heap has been cleaned up + assert len(jar) == 1 + matched_cookies = jar.filter_cookies(URL("/")) + assert len(matched_cookies) == 1 + assert "foo" in matched_cookies + # The heap should have been cleaned up + assert len(jar._expire_heap) == 1 + + +async def test_cookie_jar_heap_maintains_order_after_cleanup() -> None: + """Test that order is maintained after cleanup.""" + jar = CookieJar() + # The heap should not be cleaned up when there are less than 100 expiration changes + min_cookies_to_cleanup = 100 + + with freeze_time() as freezer: + freezer.move_to("1990-01-01 09:00:00+00:00") + + for hour in (12, 13): + for i in range(min_cookies_to_cleanup): + cookie = SimpleCookie() + cookie["foo"] = "bar" + cookie["foo"]["domain"] = f"example{i}.com" + cookie["foo"]["expires"] = f"Tue, 1 Jan 1990 {hour}:00:00 GMT" + jar.update_cookies(cookie) + + # Get the jar into a state where the next cookie will trigger the cleanup + assert len(jar._expire_heap) == min_cookies_to_cleanup * 2 + assert len(jar._expirations) == min_cookies_to_cleanup + + cookie = SimpleCookie() + cookie["foo"] = "bar" + cookie["foo"]["domain"] = "example0.com" + cookie["foo"]["expires"] = "Tue, 1 Jan 1990 14:00:00 GMT" + jar.update_cookies(cookie) + + assert len(jar) == 100 + # The heap should have been cleaned up + assert len(jar._expire_heap) == 100 + + # Verify that the heap is still ordered + heap_before = jar._expire_heap.copy() + heapq.heapify(jar._expire_heap) + assert heap_before == jar._expire_heap + + async def test_cookie_jar_clear_domain() -> None: sut = CookieJar() cookie = SimpleCookie()