Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PR #9203/6e70c0a backport][3.10] Implement heapq for cookie expire times #9208

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES/9203.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Significantly improved performance of expiring cookies -- by :user:`bdraco`.

Expiring cookies has been redesigned to use :mod:`heapq` instead of a linear search, to better scale.
100 changes: 73 additions & 27 deletions aiohttp/cookiejar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import calendar
import contextlib
import datetime
import heapq
import itertools
import os # noqa
import pathlib
Expand All @@ -10,7 +11,6 @@
import time
from collections import defaultdict
from http.cookies import BaseCookie, Morsel, SimpleCookie
from math import ceil
from typing import (
DefaultDict,
Dict,
Expand Down Expand Up @@ -40,6 +40,11 @@
_FORMAT_PATH = "{}/{}".format
_FORMAT_DOMAIN_REVERSED = "{1}.{0}".format

# The minimum number of scheduled cookie expirations before we start cleaning up
# the expiration heap. This is a performance optimization to avoid cleaning up the
# heap too often when there are only a few scheduled expirations.
_MIN_SCHEDULED_COOKIE_EXPIRATION = 100


class CookieJar(AbstractCookieJar):
"""Implements cookie storage adhering to RFC 6265."""
Expand Down Expand Up @@ -105,7 +110,7 @@ def __init__(
for url in treat_as_secure_origin
]
self._treat_as_secure_origin = treat_as_secure_origin
self._next_expiration: float = ceil(time.time())
self._expire_heap: List[Tuple[float, Tuple[str, str, str]]] = []
self._expirations: Dict[Tuple[str, str, str], float] = {}

def save(self, file_path: PathLike) -> None:
Expand All @@ -120,34 +125,25 @@ def load(self, file_path: PathLike) -> None:

def clear(self, predicate: Optional[ClearCookiePredicate] = None) -> None:
    """Remove cookies matching *predicate*; with no predicate, empty the jar.

    Expired cookies are always removed as well when a predicate is given.
    """
    if predicate is None:
        # Fast path: wipe all cookies and every piece of bookkeeping,
        # including the scheduled-expiration heap.
        self._expire_heap.clear()
        self._cookies.clear()
        self._host_only_cookies.clear()
        self._expirations.clear()
        return

    now = time.time()
    to_del: List[Tuple[str, str, str]] = []
    for (domain, path), cookie in self._cookies.items():
        for name, morsel in cookie.items():
            key = (domain, path, name)
            # A cookie is dropped if its scheduled expiry has passed,
            # or (short-circuit) if the caller's predicate matches it.
            is_expired = key in self._expirations and self._expirations[key] <= now
            if is_expired or predicate(morsel):
                to_del.append(key)
    if to_del:
        self._delete_cookies(to_del)

def clear_domain(self, domain: str) -> None:
self.clear(lambda x: self._is_domain_match(domain, x["domain"]))
Expand All @@ -166,11 +162,61 @@ def __len__(self) -> int:
return sum(len(cookie.values()) for cookie in self._cookies.values())

def _do_expiration(self) -> None:
    """Remove expired cookies."""
    heap = self._expire_heap
    heap_size = len(heap)
    if not heap_size:
        return

    # Compact the heap when it holds more than twice the number of
    # tracked expirations: stale entries accumulate whenever a cookie
    # is re-added with a different expiry time.  A minimum threshold
    # avoids churning on jars with only a few scheduled expirations.
    if (
        heap_size > _MIN_SCHEDULED_COOKIE_EXPIRATION
        and heap_size > len(self._expirations) * 2
    ):
        # Keep only entries whose time still matches the current
        # expiration for that cookie; everything else is stale.
        live = [
            entry for entry in heap if self._expirations.get(entry[1]) == entry[0]
        ]
        heapq.heapify(live)
        self._expire_heap = heap = live

    now = time.time()
    expired_keys: List[Tuple[str, str, str]] = []
    # Pop entries whose expiry has passed; stop at the first future one.
    while heap and heap[0][0] <= now:
        when, cookie_key = heapq.heappop(heap)
        # Only delete when this entry still reflects the cookie's current
        # expiry; a re-added cookie has a newer entry deeper in the heap
        # that will be handled when it reaches the top.
        if self._expirations.get(cookie_key) == when:
            expired_keys.append(cookie_key)

    if expired_keys:
        self._delete_cookies(expired_keys)

def _delete_cookies(self, to_del: List[Tuple[str, str, str]]) -> None:
for domain, path, name in to_del:
self._host_only_cookies.discard((domain, name))
self._cookies[(domain, path)].pop(name, None)
self._expirations.pop((domain, path, name), None)

def _expire_cookie(self, when: float, domain: str, path: str, name: str) -> None:
self._next_expiration = min(self._next_expiration, when)
self._expirations[(domain, path, name)] = when
cookie_key = (domain, path, name)
if self._expirations.get(cookie_key) == when:
# Avoid adding duplicates to the heap
return
heapq.heappush(self._expire_heap, (when, cookie_key))
self._expirations[cookie_key] = when

def update_cookies(self, cookies: LooseCookies, response_url: URL = URL()) -> None:
"""Update cookies."""
Expand Down
174 changes: 170 additions & 4 deletions tests/test_cookiejar.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import asyncio
import datetime
import heapq
import itertools
import pathlib
import pickle
import unittest
from http.cookies import BaseCookie, Morsel, SimpleCookie
from operator import not_
from unittest import mock

import pytest
Expand Down Expand Up @@ -847,12 +849,98 @@ async def test_cookie_jar_clear_expired():
with freeze_time("1980-01-01"):
sut.update_cookies(cookie)

sut.clear(lambda x: False)
with freeze_time("1980-01-01"):
assert len(sut) == 0
for _ in range(2):
sut.clear(not_)
with freeze_time("1980-01-01"):
assert len(sut) == 0


async def test_cookie_jar_expired_changes() -> None:
    """Test that expire time changes are handled as expected."""
    jar = CookieJar()

    # Four versions of the same cookie differing only in expiry time.
    def make_cookie(expires: str) -> SimpleCookie:
        c = SimpleCookie()
        c["foo"] = "bar"
        c["foo"]["expires"] = expires
        return c

    cookies = {
        "eleven_am": make_cookie("Tue, 1 Jan 1990 11:00:00 GMT"),
        "noon": make_cookie("Tue, 1 Jan 1990 12:00:00 GMT"),
        "one_pm": make_cookie("Tue, 1 Jan 1990 13:00:00 GMT"),
        "two_pm": make_cookie("Tue, 1 Jan 1990 14:00:00 GMT"),
    }

    with freeze_time() as freezer:
        freezer.move_to("1990-01-01 10:00:00+00:00")

        jar.update_cookies(cookies["noon"])
        assert len(jar) == 1

        # Repeatedly re-add the cookie with different expiries (earlier and
        # later); the cookie must stay visible each time.
        for label in ("noon", "eleven_am", "one_pm", "two_pm"):
            if label != "noon":
                jar.update_cookies(cookies[label])
            matched = jar.filter_cookies(URL("/"))
            assert len(matched) == 1
            assert "foo" in matched

        # The last expiry written (14:00) wins: still present at 13:00...
        freezer.move_to("1990-01-01 13:00:00+00:00")
        matched = jar.filter_cookies(URL("/"))
        assert len(matched) == 1
        assert "foo" in matched

        # ...and gone once 14:00 is reached.
        freezer.move_to("1990-01-01 14:00:00+00:00")
        matched = jar.filter_cookies(URL("/"))
        assert len(matched) == 0


async def test_cookie_jar_duplicates_with_expire_heap() -> None:
    """Test that duplicate cookies do not grow the expires heap."""
    jar = CookieJar()

    def make_cookie(expires: str) -> SimpleCookie:
        c = SimpleCookie()
        c["foo"] = "bar"
        c["foo"]["expires"] = expires
        return c

    cookie_eleven_am = make_cookie("Tue, 1 Jan 1990 11:00:00 GMT")
    cookie_two_pm = make_cookie("Tue, 1 Jan 1990 14:00:00 GMT")

    with freeze_time() as freezer:
        freezer.move_to("1990-01-01 10:00:00+00:00")

        # Re-adding an identical cookie ten times must leave exactly one
        # entry in the expiration heap.
        for _ in range(10):
            jar.update_cookies(cookie_eleven_am)

        assert len(jar) == 1
        matched = jar.filter_cookies(URL("/"))
        assert len(matched) == 1
        assert "foo" in matched
        assert len(jar._expire_heap) == 1

        # After moving past the new expiry, the cookie and its heap entry
        # are both purged.
        freezer.move_to("1990-01-01 16:00:00+00:00")
        jar.update_cookies(cookie_two_pm)
        matched = jar.filter_cookies(URL("/"))
        assert len(matched) == 0
        assert len(jar._expire_heap) == 0


async def test_cookie_jar_filter_cookies_expires() -> None:
"""Test that calling filter_cookies will expire stale cookies."""
jar = CookieJar()
assert len(jar) == 0
Expand All @@ -873,6 +961,84 @@ async def test_cookie_jar_filter_cookies_expires():
assert len(jar) == 0


async def test_cookie_jar_heap_cleanup() -> None:
    """Test that the heap gets cleaned up when there are many old expirations."""
    jar = CookieJar()
    # The heap is not compacted below this many scheduled expirations.
    min_cookies_to_cleanup = 100

    with freeze_time() as freezer:
        freezer.move_to("1990-01-01 09:00:00+00:00")

        start_time = datetime.datetime(
            1990, 1, 1, 10, 0, 0, tzinfo=datetime.timezone.utc
        )

        def add_cookie(offset_seconds: int) -> None:
            # Re-add the same cookie with an expiry shifted by the offset.
            cookie = SimpleCookie()
            cookie["foo"] = "bar"
            cookie["foo"]["expires"] = (
                start_time + datetime.timedelta(seconds=offset_seconds)
            ).strftime("%a, %d %b %Y %H:%M:%S GMT")
            jar.update_cookies(cookie)

        # Below the threshold every re-add leaves a stale heap entry behind.
        for i in range(min_cookies_to_cleanup):
            add_cookie(i)
            assert len(jar._expire_heap) == i + 1

        assert len(jar._expire_heap) == min_cookies_to_cleanup

        # One more re-add crosses the threshold and triggers the cleanup.
        add_cookie(min_cookies_to_cleanup)

        # Only a single live cookie (and heap entry) must remain.
        assert len(jar) == 1
        matched = jar.filter_cookies(URL("/"))
        assert len(matched) == 1
        assert "foo" in matched
        assert len(jar._expire_heap) == 1


async def test_cookie_jar_heap_maintains_order_after_cleanup() -> None:
    """Test that order is maintained after cleanup."""
    jar = CookieJar()
    # The heap is not compacted below this many scheduled expirations.
    min_cookies_to_cleanup = 100

    def add_cookie(domain: str, expires: str) -> None:
        cookie = SimpleCookie()
        cookie["foo"] = "bar"
        cookie["foo"]["domain"] = domain
        cookie["foo"]["expires"] = expires
        jar.update_cookies(cookie)

    with freeze_time() as freezer:
        freezer.move_to("1990-01-01 09:00:00+00:00")

        # Schedule every domain twice (12:00 then 13:00) so the heap holds
        # exactly one stale entry per tracked expiration.
        for hour in (12, 13):
            for i in range(min_cookies_to_cleanup):
                add_cookie(f"example{i}.com", f"Tue, 1 Jan 1990 {hour}:00:00 GMT")

        assert len(jar._expire_heap) == min_cookies_to_cleanup * 2
        assert len(jar._expirations) == min_cookies_to_cleanup

        # The next re-add crosses the cleanup threshold.
        add_cookie("example0.com", "Tue, 1 Jan 1990 14:00:00 GMT")

        assert len(jar) == 100
        # Stale entries were dropped by the cleanup.
        assert len(jar._expire_heap) == 100

        # The compacted heap must still satisfy the heap invariant:
        # re-heapifying it must be a no-op.
        heap_before = jar._expire_heap.copy()
        heapq.heapify(jar._expire_heap)
        assert heap_before == jar._expire_heap


async def test_cookie_jar_clear_domain() -> None:
sut = CookieJar()
cookie = SimpleCookie()
Expand Down
Loading