From 7d46709eaccf4e6db96804163645fb379eef59d7 Mon Sep 17 00:00:00 2001 From: Neel Shah Date: Thu, 8 Aug 2024 13:44:59 +0200 Subject: [PATCH] Serialize vars early to avoid living references (#3409) --- sentry_sdk/client.py | 16 +++--- sentry_sdk/integrations/pure_eval.py | 3 +- sentry_sdk/scope.py | 10 ++++ sentry_sdk/serializer.py | 74 ++++++++++------------------ sentry_sdk/utils.py | 4 +- tests/test_scrubber.py | 17 +++++++ 6 files changed, 67 insertions(+), 57 deletions(-) diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py index 6698ee527d..d22dd1c0a4 100644 --- a/sentry_sdk/client.py +++ b/sentry_sdk/client.py @@ -5,12 +5,12 @@ from collections.abc import Mapping from datetime import datetime, timezone from importlib import import_module +from typing import cast from sentry_sdk._compat import PY37, check_uwsgi_thread_support from sentry_sdk.utils import ( capture_internal_exceptions, current_stacktrace, - disable_capture_event, format_timestamp, get_sdk_name, get_type_name, @@ -525,10 +525,13 @@ def _prepare_event( # Postprocess the event here so that annotated types do # generally not surface in before_send if event is not None: - event = serialize( - event, - max_request_body_size=self.options.get("max_request_body_size"), - max_value_length=self.options.get("max_value_length"), + event = cast( + "Event", + serialize( + cast("Dict[str, Any]", event), + max_request_body_size=self.options.get("max_request_body_size"), + max_value_length=self.options.get("max_value_length"), + ), ) before_send = self.options["before_send"] @@ -726,9 +729,6 @@ def capture_event( :returns: An event ID. May be `None` if there is no DSN set or of if the SDK decided to discard the event for other reasons. In such situations setting `debug=True` on `init()` may help. """ - if disable_capture_event.get(False): - return None - if hint is None: hint = {} event_id = event.get("event_id") diff --git a/sentry_sdk/integrations/pure_eval.py b/sentry_sdk/integrations/pure_eval.py index 9af4831b32..d5325be384 100644 --- a/sentry_sdk/integrations/pure_eval.py +++ b/sentry_sdk/integrations/pure_eval.py @@ -131,7 +131,8 @@ def start(n): atok = source.asttokens() expressions.sort(key=closeness, reverse=True) - return { + vars = { atok.get_text(nodes[0]): value for nodes, value in expressions[: serializer.MAX_DATABAG_BREADTH] } + return serializer.serialize(vars, is_vars=True) diff --git a/sentry_sdk/scope.py b/sentry_sdk/scope.py index 4e07e818c9..69037758a2 100644 --- a/sentry_sdk/scope.py +++ b/sentry_sdk/scope.py @@ -31,6 +31,7 @@ capture_internal_exception, capture_internal_exceptions, ContextVar, + disable_capture_event, event_from_exception, exc_info_from_error, logger, @@ -1130,6 +1131,9 @@ def capture_event(self, event, hint=None, scope=None, **scope_kwargs): :returns: An `event_id` if the SDK decided to send the event (see :py:meth:`sentry_sdk.client._Client.capture_event`). """ + if disable_capture_event.get(False): + return None + scope = self._merge_scopes(scope, scope_kwargs) event_id = self.get_client().capture_event(event=event, hint=hint, scope=scope) @@ -1157,6 +1161,9 @@ def capture_message(self, message, level=None, scope=None, **scope_kwargs): :returns: An `event_id` if the SDK decided to send the event (see :py:meth:`sentry_sdk.client._Client.capture_event`). """ + if disable_capture_event.get(False): + return None + if level is None: level = "info" @@ -1182,6 +1189,9 @@ def capture_exception(self, error=None, scope=None, **scope_kwargs): :returns: An `event_id` if the SDK decided to send the event (see :py:meth:`sentry_sdk.client._Client.capture_event`). """ + if disable_capture_event.get(False): + return None + if error is not None: exc_info = exc_info_from_error(error) else: diff --git a/sentry_sdk/serializer.py b/sentry_sdk/serializer.py index ff243eeadc..010c1a963f 100644 --- a/sentry_sdk/serializer.py +++ b/sentry_sdk/serializer.py @@ -25,7 +25,7 @@ from typing import Type from typing import Union - from sentry_sdk._types import NotImplementedType, Event + from sentry_sdk._types import NotImplementedType Span = Dict[str, Any] @@ -95,7 +95,25 @@ def __exit__( def serialize(event, **kwargs): - # type: (Event, **Any) -> Event + # type: (Dict[str, Any], **Any) -> Dict[str, Any] + """ + A very smart serializer that takes a dict and emits a json-friendly dict. + Currently used for serializing the final Event and also prematurely while fetching the stack + local variables for each frame in a stacktrace. + + It works internally with 'databags' which are arbitrary data structures like Mapping, Sequence and Set. + The algorithm itself is a recursive graph walk down the data structures it encounters. + + It has the following responsibilities: + * Trimming databags and keeping them within MAX_DATABAG_BREADTH and MAX_DATABAG_DEPTH. + * Calling safe_repr() on objects appropriately to keep them informative and readable in the final payload. + * Annotating the payload with the _meta field whenever trimming happens. + + :param max_request_body_size: If set to "always", will never trim request bodies. + :param max_value_length: The max length to strip strings to, defaults to sentry_sdk.consts.DEFAULT_MAX_VALUE_LENGTH + :param is_vars: If we're serializing vars early, we want to repr() things that are JSON-serializable to make their type more apparent. For example, it's useful to see the difference between a unicode-string and a bytestring when viewing a stacktrace. + + """ memo = Memo() path = [] # type: List[Segment] meta_stack = [] # type: List[Dict[str, Any]] @@ -104,6 +122,7 @@ def serialize(event, **kwargs): kwargs.pop("max_request_body_size", None) == "always" ) # type: bool max_value_length = kwargs.pop("max_value_length", None) # type: Optional[int] + is_vars = kwargs.pop("is_vars", False) def _annotate(**meta): # type: (**Any) -> None @@ -118,56 +137,17 @@ def _annotate(**meta): meta_stack[-1].setdefault("", {}).update(meta) - def _should_repr_strings(): - # type: () -> Optional[bool] - """ - By default non-serializable objects are going through - safe_repr(). For certain places in the event (local vars) we - want to repr() even things that are JSON-serializable to - make their type more apparent. For example, it's useful to - see the difference between a unicode-string and a bytestring - when viewing a stacktrace. - - For container-types we still don't do anything different. - Generally we just try to make the Sentry UI present exactly - what a pretty-printed repr would look like. - - :returns: `True` if we are somewhere in frame variables, and `False` if - we are in a position where we will never encounter frame variables - when recursing (for example, we're in `event.extra`). `None` if we - are not (yet) in frame variables, but might encounter them when - recursing (e.g. we're in `event.exception`) - """ - try: - p0 = path[0] - if p0 == "stacktrace" and path[1] == "frames" and path[3] == "vars": - return True - - if ( - p0 in ("threads", "exception") - and path[1] == "values" - and path[3] == "stacktrace" - and path[4] == "frames" - and path[6] == "vars" - ): - return True - except IndexError: - return None - - return False - def _is_databag(): # type: () -> Optional[bool] """ A databag is any value that we need to trim. + True for stuff like vars, request bodies, breadcrumbs and extra. - :returns: Works like `_should_repr_strings()`. `True` for "yes", - `False` for :"no", `None` for "maybe soon". + :returns: `True` for "yes", `False` for :"no", `None` for "maybe soon". """ try: - rv = _should_repr_strings() - if rv in (True, None): - return rv + if is_vars: + return True is_request_body = _is_request_body() if is_request_body in (True, None): @@ -253,7 +233,7 @@ def _serialize_node_impl( if isinstance(obj, AnnotatedValue): should_repr_strings = False if should_repr_strings is None: - should_repr_strings = _should_repr_strings() + should_repr_strings = is_vars if is_databag is None: is_databag = _is_databag() @@ -387,7 +367,7 @@ def _serialize_node_impl( disable_capture_event.set(True) try: serialized_event = _serialize_node(event, **kwargs) - if meta_stack and isinstance(serialized_event, dict): + if not is_vars and meta_stack and isinstance(serialized_event, dict): serialized_event["_meta"] = meta_stack[0] return serialized_event diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py index 08d2768cde..8b718a1f92 100644 --- a/sentry_sdk/utils.py +++ b/sentry_sdk/utils.py @@ -616,7 +616,9 @@ def serialize_frame( ) if include_local_variables: - rv["vars"] = frame.f_locals.copy() + from sentry_sdk.serializer import serialize + + rv["vars"] = serialize(dict(frame.f_locals), is_vars=True) return rv diff --git a/tests/test_scrubber.py b/tests/test_scrubber.py index 2c4bd3aa90..5034121b83 100644 --- a/tests/test_scrubber.py +++ b/tests/test_scrubber.py @@ -187,3 +187,20 @@ def test_recursive_event_scrubber(sentry_init, capture_events): (event,) = events assert event["extra"]["deep"]["deeper"][0]["deepest"]["password"] == "'[Filtered]'" + + +def test_recursive_scrubber_does_not_override_original(sentry_init, capture_events): + sentry_init(event_scrubber=EventScrubber(recursive=True)) + events = capture_events() + + data = {"csrf": "secret"} + try: + raise RuntimeError("An error") + except Exception: + capture_exception() + + (event,) = events + frames = event["exception"]["values"][0]["stacktrace"]["frames"] + (frame,) = frames + assert data["csrf"] == "secret" + assert frame["vars"]["data"]["csrf"] == "[Filtered]"