Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ dependencies = [
"sentry-forked-email-reply-parser>=0.5.12.post1",
"sentry-kafka-schemas>=2.1.15",
"sentry-ophio>=1.1.3",
"sentry-protos>=0.4.6",
"sentry-protos>=0.4.7",
"sentry-redis-tools>=0.5.0",
"sentry-relay>=0.9.22",
"sentry-sdk[http2]>=2.43.0",
Expand Down
173 changes: 109 additions & 64 deletions src/sentry/api/endpoints/organization_trace_item_attributes_ranked.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,26 @@

from rest_framework.request import Request
from rest_framework.response import Response
from sentry_protos.snuba.v1.endpoint_trace_item_attributes_pb2 import TraceItemAttributeNamesRequest
from sentry_protos.snuba.v1.endpoint_trace_item_stats_pb2 import (
AttributeDistributionsRequest,
StatsType,
TraceItemStatsRequest,
)
from sentry_protos.snuba.v1.trace_item_attribute_pb2 import ExtrapolationMode
from sentry_protos.snuba.v1.request_common_pb2 import TraceItemType
from sentry_protos.snuba.v1.trace_item_attribute_pb2 import AttributeKey, ExtrapolationMode

from sentry import features
from sentry import features, options
from sentry.api.api_owners import ApiOwner
from sentry.api.api_publish_status import ApiPublishStatus
from sentry.api.base import region_silo_endpoint
from sentry.api.bases import NoProjects, OrganizationEventsEndpointBase
from sentry.api.endpoints.organization_trace_item_attributes import adjust_start_end_window
from sentry.api.utils import handle_query_errors
from sentry.exceptions import InvalidSearchQuery
from sentry.models.organization import Organization
from sentry.search.eap.resolver import SearchResolver
from sentry.search.eap.spans.attributes import SPANS_STATS_EXCLUDED_ATTRIBUTES
from sentry.search.eap.spans.definitions import SPAN_DEFINITIONS
from sentry.search.eap.types import SearchResolverConfig, SupportedTraceItemType
from sentry.search.eap.utils import can_expose_attribute, translate_internal_to_public_alias
Expand All @@ -28,10 +33,13 @@
from sentry.seer.workflows.compare import keyed_rrf_score
from sentry.snuba.referrer import Referrer
from sentry.snuba.spans_rpc import Spans
from sentry.utils import snuba_rpc
from sentry.utils.snuba_rpc import trace_item_stats_rpc

logger = logging.getLogger(__name__)

PARALLELIZATION_FACTOR = 2


@region_silo_endpoint
class OrganizationTraceItemsAttributesRankedEndpoint(OrganizationEventsEndpointBase):
Expand Down Expand Up @@ -121,75 +129,111 @@ def get(self, request: Request, organization: Organization) -> Response:
return Response({"rankedAttributes": []})

cohort_1, _, _ = resolver.resolve_query(query_1)
cohort_1_request = TraceItemStatsRequest(
filter=cohort_1,
meta=meta,
stats_types=[
StatsType(
attribute_distributions=AttributeDistributionsRequest(
max_buckets=75,
)
)
],
cohort_2, _, _ = resolver.resolve_query(query_2)

# Fetch attribute names for parallelization
adjusted_start_date, adjusted_end_date = adjust_start_end_window(
snuba_params.start_date, snuba_params.end_date
)
attrs_snuba_params = snuba_params.copy()
attrs_snuba_params.start = adjusted_start_date
attrs_snuba_params.end = adjusted_end_date
attrs_resolver = SearchResolver(
params=attrs_snuba_params, config=resolver_config, definitions=SPAN_DEFINITIONS
)
attrs_meta = attrs_resolver.resolve_meta(
referrer=Referrer.API_SPANS_FREQUENCY_STATS_RPC.value
)
attrs_meta.trace_item_type = TraceItemType.TRACE_ITEM_TYPE_SPAN

cohort_2, _, _ = resolver.resolve_query(query_2)
cohort_2_request = TraceItemStatsRequest(
filter=cohort_2,
meta=meta,
stats_types=[
StatsType(
attribute_distributions=AttributeDistributionsRequest(
max_buckets=75,
)
attr_type = AttributeKey.Type.TYPE_STRING
max_attributes = options.get("explore.trace-items.keys.max")

with handle_query_errors():
attrs_request = TraceItemAttributeNamesRequest(
meta=attrs_meta,
limit=max_attributes,
type=attr_type,
intersecting_attributes_filter=cohort_2,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Suspect cohort attributes missed due to baseline-only attribute filtering

The TraceItemAttributeNamesRequest uses intersecting_attributes_filter=cohort_2 (the baseline cohort), which means only attributes present in the baseline are fetched and analyzed. These attributes are then used to query both the suspect cohort (cohort_1) and baseline cohort (cohort_2). Attributes unique to the suspect cohort are completely missed, which defeats the purpose of comparing cohorts to find differentiating attributes. The old code didn't pre-filter attributes, so it analyzed all attributes from both cohorts independently.

Fix in Cursor Fix in Web

)
attrs_response = snuba_rpc.attribute_names_rpc(attrs_request)

# Chunk attributes for parallel processing
chunked_attributes: defaultdict[int, list[AttributeKey]] = defaultdict(list)
for i, attr_proto in enumerate(attrs_response.attributes):
if attr_proto.name in SPANS_STATS_EXCLUDED_ATTRIBUTES:
continue

chunked_attributes[i % PARALLELIZATION_FACTOR].append(
AttributeKey(name=attr_proto.name, type=AttributeKey.TYPE_STRING)
)

# Worker submitted to the thread pool once per (cohort filter, attribute chunk)
# pair. Builds a TraceItemStatsRequest asking for attribute distributions and
# returns the result of trace_item_stats_rpc for it.
#   filter     -- used as the request's `filter`; the submit() calls pass
#                 cohort_1 or cohort_2.
#   attributes -- forwarded to AttributeDistributionsRequest(attributes=...);
#                 the submit() calls pass one value of chunked_attributes.
# NOTE(review): indentation is not preserved in this view, so whether the
# trace_item_stats_rpc call sits inside the handle_query_errors() block
# cannot be determined from here -- confirm against the real file.
def run_stats_request_with_error_handling(filter, attributes):
with handle_query_errors():
request = TraceItemStatsRequest(
filter=filter,
# `meta` is not a parameter; it is read from the enclosing scope.
meta=meta,
stats_types=[
StatsType(
attribute_distributions=AttributeDistributionsRequest(
max_buckets=75,
# Restricts the distribution request to this chunk's attribute keys.
attributes=attributes,
)
)
],
)
return trace_item_stats_rpc(request)

# Worker submitted to the thread pool for the totals queries (called with
# query_1 and query_2). Runs Spans.run_table_query under
# handle_query_errors() and returns its result.
#   query_string -- forwarded unchanged as the table query's query_string.
# `snuba_params` and `resolver_config` are read from the enclosing scope.
def run_table_query_with_error_handling(query_string):
with handle_query_errors():
return Spans.run_table_query(
params=snuba_params,
query_string=query_string,
# Only a single aggregate column is requested, with limit=1 below.
selected_columns=["count(span.duration)"],
orderby=None,
config=resolver_config,
offset=0,
limit=1,
sampling_mode=snuba_params.sampling_mode,
referrer=Referrer.API_SPAN_SAMPLE_GET_SPAN_DATA.value,
)
],
)

with ThreadPoolExecutor(
thread_name_prefix=__name__,
max_workers=4,
max_workers=PARALLELIZATION_FACTOR * 2 + 2, # 2 cohorts * threads + 2 totals queries
) as query_thread_pool:
cohort_1_future = query_thread_pool.submit(
trace_item_stats_rpc,
cohort_1_request,
)
totals_1_future = query_thread_pool.submit(
Spans.run_table_query,
params=snuba_params,
query_string=query_1,
selected_columns=["count(span.duration)"],
orderby=None,
config=resolver_config,
offset=0,
limit=1,
sampling_mode=snuba_params.sampling_mode,
referrer=Referrer.API_SPAN_SAMPLE_GET_SPAN_DATA.value,
)
cohort_1_futures = [
query_thread_pool.submit(
run_stats_request_with_error_handling, cohort_1, attributes
)
for attributes in chunked_attributes.values()
]
cohort_2_futures = [
query_thread_pool.submit(
run_stats_request_with_error_handling, cohort_2, attributes
)
for attributes in chunked_attributes.values()
]

cohort_2_future = query_thread_pool.submit(
trace_item_stats_rpc,
cohort_2_request,
)
totals_1_future = query_thread_pool.submit(run_table_query_with_error_handling, query_1)
totals_2_future = query_thread_pool.submit(run_table_query_with_error_handling, query_2)

totals_2_future = query_thread_pool.submit(
Spans.run_table_query,
params=snuba_params,
query_string=query_2,
selected_columns=["count(span.duration)"],
orderby=None,
config=resolver_config,
offset=0,
limit=1,
sampling_mode=snuba_params.sampling_mode,
referrer=Referrer.API_SPAN_SAMPLE_GET_SPAN_DATA.value,
)
# Merge cohort 1 results
cohort_1_data = []
for future in cohort_1_futures:
result = future.result()
if result.results:
cohort_1_data.extend(result.results[0].attribute_distributions.attributes)

cohort_1_data = cohort_1_future.result()
cohort_2_data = cohort_2_future.result()
# Merge cohort 2 results
cohort_2_data = []
for future in cohort_2_futures:
result = future.result()
if result.results:
cohort_2_data.extend(result.results[0].attribute_distributions.attributes)

totals_1_result = totals_1_future.result()
totals_2_result = totals_2_future.result()
totals_1_result = totals_1_future.result()
totals_2_result = totals_2_future.result()

cohort_1_distribution = []
cohort_1_distribution_map = defaultdict(list)
Expand All @@ -198,7 +242,7 @@ def get(self, request: Request, organization: Organization) -> Response:
cohort_2_distribution_map = defaultdict(list)
processed_cohort_2_buckets = set()

for attribute in cohort_2_data.results[0].attribute_distributions.attributes:
for attribute in cohort_2_data:
if not can_expose_attribute(attribute.attribute_name, SupportedTraceItemType.SPANS):
continue

Expand All @@ -207,7 +251,7 @@ def get(self, request: Request, organization: Organization) -> Response:
{"label": bucket.label, "value": bucket.value}
)

for attribute in cohort_1_data.results[0].attribute_distributions.attributes:
for attribute in cohort_1_data:
if not can_expose_attribute(attribute.attribute_name, SupportedTraceItemType.SPANS):
continue
for bucket in attribute.buckets:
Expand Down Expand Up @@ -287,7 +331,7 @@ def get(self, request: Request, organization: Organization) -> Response:
# Build the RRR order mapping (attribute name -> rank index) from the compare_distributions results.
# scored_attrs_rrr is a dict whose 'results' key holds a list of [attribute_name, score] pairs.
rrr_results = scored_attrs_rrr.get("results", [])
rrr_order_map = {attr: i for i, (attr, _) in enumerate(rrr_results)}
rrr_order_map = {attr_name: i for i, (attr_name, _) in enumerate(rrr_results)}

ranked_distribution: dict[str, Any] = {
"rankedAttributes": [],
Expand All @@ -300,7 +344,8 @@ def get(self, request: Request, organization: Organization) -> Response:
"cohort2Total": total_baseline,
}

for i, (attr, _) in enumerate(scored_attrs_rrf):
for i, scored_attr_tuple in enumerate(scored_attrs_rrf):
attr = scored_attr_tuple[0]

public_alias, _, _ = translate_internal_to_public_alias(
attr, "string", SupportedTraceItemType.SPANS
Expand Down
90 changes: 82 additions & 8 deletions src/sentry/api/endpoints/organization_trace_item_stats.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
import logging
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

from rest_framework import serializers
from rest_framework.request import Request
from rest_framework.response import Response
from sentry_protos.snuba.v1.endpoint_trace_item_attributes_pb2 import TraceItemAttributeNamesRequest
from sentry_protos.snuba.v1.request_common_pb2 import TraceItemType
from sentry_protos.snuba.v1.trace_item_attribute_pb2 import AttributeKey

from sentry import options
from sentry.api.api_owners import ApiOwner
from sentry.api.api_publish_status import ApiPublishStatus
from sentry.api.base import region_silo_endpoint
from sentry.api.bases import NoProjects, OrganizationEventsEndpointBase
from sentry.api.endpoints.organization_trace_item_attributes import adjust_start_end_window
from sentry.api.utils import handle_query_errors
from sentry.models.organization import Organization
from sentry.search.eap.constants import SUPPORTED_STATS_TYPES
from sentry.search.eap.resolver import SearchResolver
from sentry.search.eap.spans.attributes import SPANS_STATS_EXCLUDED_ATTRIBUTES
from sentry.search.eap.spans.definitions import SPAN_DEFINITIONS
from sentry.search.eap.types import SearchResolverConfig
from sentry.snuba.referrer import Referrer
from sentry.snuba.spans_rpc import Spans
from sentry.utils import snuba_rpc

logger = logging.getLogger(__name__)


MAX_THREADS = 4


class OrganizationTraceItemsStatsSerializer(serializers.Serializer):
query = serializers.CharField(required=False)
statsType = serializers.ListField(
Expand Down Expand Up @@ -49,13 +62,74 @@ def get(self, request: Request, organization: Organization) -> Response:
params=snuba_params, config=resolver_config, definitions=SPAN_DEFINITIONS
)

stats_results = Spans.run_stats_query(
params=snuba_params,
stats_types=serialized.get("statsType"),
query_string=serialized.get("query", ""),
referrer=Referrer.API_SPANS_FREQUENCY_STATS_RPC.value,
config=resolver_config,
search_resolver=resolver,
query_string = serialized.get("query")
query_filter, _, _ = resolver.resolve_query(query_string)

# Fetch attribute names
adjusted_start_date, adjusted_end_date = adjust_start_end_window(
snuba_params.start_date, snuba_params.end_date
)
attrs_snuba_params = snuba_params.copy()
attrs_snuba_params.start = adjusted_start_date
attrs_snuba_params.end = adjusted_end_date
attrs_resolver = SearchResolver(
params=attrs_snuba_params, config=resolver_config, definitions=SPAN_DEFINITIONS
)
attrs_meta = attrs_resolver.resolve_meta(
referrer=Referrer.API_SPANS_FREQUENCY_STATS_RPC.value
)
attrs_meta.trace_item_type = TraceItemType.TRACE_ITEM_TYPE_SPAN

attr_type = AttributeKey.Type.TYPE_STRING
max_attributes = options.get("explore.trace-items.keys.max")

with handle_query_errors():
attrs_request = TraceItemAttributeNamesRequest(
meta=attrs_meta,
limit=max_attributes,
type=attr_type,
intersecting_attributes_filter=query_filter,
)

attrs_response = snuba_rpc.attribute_names_rpc(attrs_request)

# Chunk attributes and run stats query in parallel
chunked_attributes: defaultdict[int, list[AttributeKey]] = defaultdict(list)
for i, attr in enumerate(attrs_response.attributes):
if attr.name in SPANS_STATS_EXCLUDED_ATTRIBUTES:
continue

chunked_attributes[i % MAX_THREADS].append(
AttributeKey(name=attr.name, type=AttributeKey.TYPE_STRING)
)

# Worker submitted to the thread pool once per attribute chunk. Runs
# Spans.run_stats_query under handle_query_errors() and returns its result.
#   attributes -- forwarded as run_stats_query(attributes=...); the submit()
#                 calls pass one value of chunked_attributes.
# `snuba_params`, `serialized`, `resolver_config` and `resolver` are read from
# the enclosing scope.
def run_stats_query_with_error_handling(attributes):
with handle_query_errors():
return Spans.run_stats_query(
params=snuba_params,
stats_types=serialized.get("statsType"),
# NOTE(review): the query is re-read here with a "" default, whereas the
# filter used for the attribute-name lookup was resolved from
# serialized.get("query") with no default -- confirm both paths treat a
# missing query the same way.
query_string=serialized.get("query", ""),
referrer=Referrer.API_SPANS_FREQUENCY_STATS_RPC.value,
config=resolver_config,
search_resolver=resolver,
attributes=attributes,
)

stats_results: dict[str, dict[str, dict]] = defaultdict(lambda: {"data": {}})
with ThreadPoolExecutor(
thread_name_prefix=__name__,
max_workers=MAX_THREADS,
) as query_thread_pool:

futures = [
query_thread_pool.submit(run_stats_query_with_error_handling, attributes)
for attributes in chunked_attributes.values()
]

for future in as_completed(futures):
result = future.result()
for stats in result:
for stats_type, data in stats.items():
stats_results[stats_type]["data"].update(data["data"])
Comment on lines +131 to +133
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: The organization_trace_item_stats.py endpoint's response format changed, breaking API compatibility.
Severity: CRITICAL | Confidence: High

🔍 Detailed Analysis

The organization_trace_item_stats.py endpoint's response format has changed from a dictionary structure, {"data": {stat_type: {...}}}, to a list of single-key dictionaries, {"data": [{stat_type: {...}}, ...]}. This transformation is a breaking API change that will cause clients expecting the original dictionary format to fail.

💡 Suggested Fix

Revert the response format in organization_trace_item_stats.py line 135 to return Response({"data": stats_results}) instead of Response({"data": [{k: v} for k, v in stats_results.items()]}).

🤖 Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent.
Verify if this is a real issue. If it is, propose a fix; if not, explain why it's not
valid.

Location: src/sentry/api/endpoints/organization_trace_item_stats.py#L131-L133

Potential issue: The `organization_trace_item_stats.py` endpoint's response format has
changed from a dictionary structure, `{"data": {stat_type: {...}}}`, to a list of
single-key dictionaries, `{"data": [{stat_type: {...}}, ...]}`. This transformation is a
breaking API change that will cause clients expecting the original dictionary format to
fail.

Did we get this right? 👍 / 👎 to inform future reviews.
Reference ID: 5302423


return Response({"data": stats_results})
return Response({"data": [{k: v} for k, v in stats_results.items()]})
12 changes: 12 additions & 0 deletions src/sentry/search/eap/spans/attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,18 @@ def is_starred_segment_context_constructor(params: SnubaParams) -> VirtualColumn
if definition.replacement
}

# Attribute names excluded from stats queries (e.g., attribute distributions).
# These are typically system-level identifiers that don't provide useful
# distribution insights. Callers test membership by attribute name
# (`name in SPANS_STATS_EXCLUDED_ATTRIBUTES`) and skip matching attributes
# before building the list of attribute keys for a stats request.
SPANS_STATS_EXCLUDED_ATTRIBUTES: set[str] = {
"sentry.item_id",
"sentry.trace_id",
"sentry.segment_id",
"sentry.parent_span_id",
"sentry.profile_id",
"sentry.event_id",
"sentry.group",
}


SPAN_VIRTUAL_CONTEXTS = {
"device.class": VirtualColumnDefinition(
Expand Down
Loading
Loading