From a67ed7a0e803dc27688458f5944556a7687206dd Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Sun, 17 May 2026 08:44:04 -0700 Subject: [PATCH 1/2] Tighten rules around metric-in-filter --- .../datajunction_server/api/sql.py | 17 ++++ .../construction/build_v3/builder.py | 25 ++++- .../construction/build_v3/metrics.py | 50 ++++++++-- .../construction/build_v3/utils.py | 54 ++++++++++- .../build_v3/having_clause_test.py | 95 +++++++++++++++++++ 5 files changed, 225 insertions(+), 16 deletions(-) diff --git a/datajunction-server/datajunction_server/api/sql.py b/datajunction-server/datajunction_server/api/sql.py index 7b1e7f7c6..6046f0f2f 100644 --- a/datajunction-server/datajunction_server/api/sql.py +++ b/datajunction-server/datajunction_server/api/sql.py @@ -247,6 +247,13 @@ async def get_measures_sql_v3( a cube with temporal partitions, and applies partition filters if so. lookback_window: Lookback window for temporal filters when applicable. + Filter contract: + Filters may only reference dimension attributes — the measures layer + is pre-aggregation and has no metrics to bind against. Filters that + reference a metric node are rejected with a 422; apply them + downstream after aggregating, or use ``/sql/metrics/v3`` which emits + them as ``HAVING`` clauses. + See also: `/sql/metrics/v3/` for the final combined query with metric expressions. """ merged_filters = list(filters) @@ -623,6 +630,16 @@ async def get_metrics_sql_v3( - Dimension references in metric expressions are resolved to their final column aliases. + Filter contract: + Each filter is classified by what it references: + - A dimension attribute → emitted as a ``WHERE`` predicate. + - A metric in the requested ``metrics`` list → emitted as a ``HAVING`` + predicate (post-aggregation). + - A metric *not* in the requested list → rejected with a 422, since + ``HAVING`` can only reference SELECT items and silently computing + an extra aggregate would be surprising/wasteful. Add the metric to + ``metrics`` to filter on it. + Args: metrics: List of metric names to include dimensions: List of dimensions to group by (the grain) diff --git a/datajunction-server/datajunction_server/construction/build_v3/builder.py b/datajunction-server/datajunction_server/construction/build_v3/builder.py index 8703c9c7a..221b7ff08 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/builder.py +++ b/datajunction-server/datajunction_server/construction/build_v3/builder.py @@ -44,6 +44,7 @@ from datajunction_server.construction.build_v3.utils import ( add_dimensions_from_filters, add_dimensions_from_metric_expressions, + preload_filter_metric_refs, ) from datajunction_server.database.partition import Partition from datajunction_server.errors import DJError, DJInvalidInputException, ErrorCode @@ -241,6 +242,7 @@ async def setup_build_context( include_temporal_filters: bool = False, lookback_window: str | None = None, matched_cube: Optional["NodeRevision"] = None, + disallow_metric_filters: bool = False, ) -> BuildContext: """ Create and initialize a BuildContext with all setup done. @@ -286,6 +288,11 @@ async def setup_build_context( lookback_window=lookback_window, ) + # Pre-resolve any filter refs that are themselves nodes (metrics in + # particular) so add_dimensions_from_filters doesn't mistake a metric + # filter ref for a dimension and produce a confusing join-path error. + await preload_filter_metric_refs(ctx) + # Add filter-driven dimensions upfront — this only parses filter strings and # needs no DB data, so we do it before load_nodes to include those dimension # nodes in the first (and ideally only) load. @@ -306,6 +313,18 @@ async def setup_build_context( if not ctx.metrics: raise DJInvalidInputException("At least one metric is required") + # Validate filter references against loaded node types BEFORE we try to use + # any auto-added "filter dimensions" — a metric ref in a filter would have + # been added to ctx.dimensions by add_dimensions_from_filters (it can't tell + # node types yet), and downstream join resolution would fail with a confusing + # message. Catching it here gives a precise error. + # Note: this is an early run; the WHERE/HAVING split below uses the result. + ctx.dimension_filters, ctx.metric_filters = classify_filters( + ctx.filters, + ctx, + disallow_metric_filters=disallow_metric_filters, + ) + # Decompose metrics and group by parent node ctx.metric_groups, ctx.decomposed_metrics = await decompose_and_group_metrics(ctx) @@ -327,11 +346,6 @@ async def setup_build_context( if missing_dim_nodes or internally_added_roots: await load_nodes(ctx) - # Classify filters into dimension filters (WHERE) and metric filters (HAVING) - # This MUST happen AFTER all nodes are loaded so we can correctly identify - # whether a filter references a dimension or a metric - ctx.dimension_filters, ctx.metric_filters = classify_filters(ctx.filters, ctx) - return ctx @@ -386,6 +400,7 @@ async def build_measures_sql( include_temporal_filters=include_temporal_filters, lookback_window=lookback_window, matched_cube=matched_cube, + disallow_metric_filters=True, ) # Build grain groups from context diff --git a/datajunction-server/datajunction_server/construction/build_v3/metrics.py b/datajunction-server/datajunction_server/construction/build_v3/metrics.py index 9f8a8975a..d071375dc 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/metrics.py +++ b/datajunction-server/datajunction_server/construction/build_v3/metrics.py @@ -58,6 +58,7 @@ def classify_filters( filters: list[str], ctx: BuildContext, + disallow_metric_filters: bool = False, ) -> tuple[list[str], list[str]]: """ Classify filters into dimension filters (WHERE) and metric filters (HAVING). @@ -65,13 +66,28 @@ def classify_filters( A filter is classified as a metric filter if it references any metric node. Otherwise, it's a dimension filter. + Validation: + - Any metric referenced by a filter must also appear in ``ctx.metrics`` + (i.e., be in the requested metrics list). Otherwise we'd be silently + computing an extra aggregate the caller didn't ask for, and HAVING + against a SELECT item that doesn't exist. + - If ``disallow_metric_filters`` is True, filters referencing metric + nodes are rejected outright. Used by ``/sql/measures/v3`` since the + measures layer is pre-aggregation and has no metrics to bind against. + Args: filters: List of filter strings ctx: Build context with nodes loaded + disallow_metric_filters: If True, raise on any filter that references + a metric node (for the measures endpoint). Returns: Tuple of (dimension_filters, metric_filters) + Raises: + DJInvalidInputException: if a filter references a metric not in + ``ctx.metrics``, or any metric at all when ``disallow_metric_filters``. + Example:: dimension_filters, metric_filters = classify_filters( @@ -86,20 +102,38 @@ def classify_filters( """ dimension_filters = [] metric_filters = [] + requested_metrics = set(ctx.metrics) for filter_str in filters: # Extract all column references from this filter refs = get_filter_column_references(filter_str) - # Check if any reference is a metric - is_metric_filter = False - for ref in refs: - node = ctx.nodes.get(ref) - if node and node.type == NodeType.METRIC: - is_metric_filter = True - break + # Identify metric refs in this filter + metric_refs = [ + ref + for ref in refs + if (node := ctx.nodes.get(ref)) is not None and node.type == NodeType.METRIC + ] - if is_metric_filter: + if metric_refs: + if disallow_metric_filters: + raise DJInvalidInputException( + f"Filter {filter_str!r} references metric " + f"{metric_refs[0]!r}, but filters on metric nodes are not " + "supported by /sql/measures/v3 — the measures layer is " + "pre-aggregation and has no metrics to bind against. " + "Apply the filter downstream after aggregating, or use " + "/sql/metrics/v3 which emits a HAVING clause for " + "metric filters.", + ) + missing = [m for m in metric_refs if m not in requested_metrics] + if missing: + raise DJInvalidInputException( + f"Filter {filter_str!r} references metric {missing[0]!r}, " + "which is not in the requested metrics list. Add it to " + "`metrics` to filter on it (HAVING clauses can only " + "reference metrics being selected).", + ) metric_filters.append(filter_str) else: dimension_filters.append(filter_str) diff --git a/datajunction-server/datajunction_server/construction/build_v3/utils.py b/datajunction-server/datajunction_server/construction/build_v3/utils.py index 074f97d18..fb1ff18de 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/utils.py +++ b/datajunction-server/datajunction_server/construction/build_v3/utils.py @@ -8,6 +8,7 @@ parse_filter, ) from datajunction_server.database.node import Node +from datajunction_server.models.node_type import NodeType from datajunction_server.sql.parsing import ast from datajunction_server.utils import SEPARATOR @@ -296,6 +297,45 @@ def add_dimensions_from_metric_expressions( ) +async def preload_filter_metric_refs(ctx: "BuildContext") -> None: + """ + Pre-load any nodes referenced by filters so we can detect metric filters + before ``add_dimensions_from_filters`` would otherwise treat them as + dimension refs (and produce confusing join-path errors downstream). + + Filter-refs that resolve to non-node names (true dimension attribute paths + like ``v3.product.category``) are simply absent from the result — those + continue to flow through ``add_dimensions_from_filters`` unchanged. + """ + # Local imports to avoid circulars. + from datajunction_server.construction.build_v3.filters import ( + get_filter_column_references, + ) + + if not ctx.filters: + return + + candidate_names: set[str] = set() + for filter_str in ctx.filters: + try: + refs = get_filter_column_references(filter_str) + except Exception: # pragma: no cover + continue + for ref in refs: + # Skip refs already loaded as part of ctx.metrics or anything we + # have on hand. + if ref in ctx.nodes: + continue + candidate_names.add(ref) + + if not candidate_names: + return + + nodes = await Node.get_by_names(ctx.session, list(candidate_names)) + for node in nodes: + ctx.nodes[node.name] = node + + def add_dimensions_from_filters(ctx: "BuildContext") -> None: """ Scan filter expressions for dimension references and add them to ctx.dimensions. @@ -415,11 +455,19 @@ def add_dimensions_from_filters(ctx: "BuildContext") -> None: # Already in dimensions, no need to add continue - # Skip if this is a metric reference, not a dimension - # Metrics in WHERE clauses should be treated as HAVING conditions, - # not dimension joins + # Skip if this is a metric reference, not a dimension. + # Metrics in filter expressions become HAVING conditions, not + # dimension joins. We catch both the (common) case where the metric + # is in the requested list, and the case where the user filtered on + # a metric that's NOT in the metrics list — the latter is a + # validation error raised later by classify_filters, but we must + # avoid auto-adding it as a dimension here or downstream join + # resolution would fail first with a confusing error. if full_name in ctx.metrics: continue + ref_node = ctx.nodes.get(full_name) + if ref_node is not None and ref_node.type == NodeType.METRIC: + continue # Check if any existing dimension already covers this (node, column) dim_ref = parse_dimension_ref(full_name) diff --git a/datajunction-server/tests/construction/build_v3/having_clause_test.py b/datajunction-server/tests/construction/build_v3/having_clause_test.py index 8bde5601f..38e6998b5 100644 --- a/datajunction-server/tests/construction/build_v3/having_clause_test.py +++ b/datajunction-server/tests/construction/build_v3/having_clause_test.py @@ -672,3 +672,98 @@ async def test_filter_only_dimension_with_metric_filter( column_names = [col["name"] for col in result["columns"]] assert "category" in column_names assert "subcategory" not in column_names + + +class TestMetricFilterValidation: + """ + Tests for validation rules on metric-referencing filters: + - On /sql/metrics/v3: the filtered metric must be in the metrics list. + - On /sql/measures/v3 (and its `combined` variant): metric filters are + rejected outright since the measures layer is pre-aggregation. + """ + + @pytest.mark.asyncio + async def test_metrics_v3_metric_filter_not_in_metrics_list_rejected( + self, + client_with_build_v3, + ): + """ + A filter on a metric that isn't in the requested metrics list should + be rejected with a 422, since HAVING can only reference SELECT items. + """ + response = await client_with_build_v3.get( + "/sql/metrics/v3/", + params={ + "metrics": ["v3.order_count"], + "dimensions": ["v3.product.category"], + "filters": ["v3.total_revenue > 10000"], + }, + ) + assert response.status_code == 422 + body = response.json() + message = body.get("message", "") + assert "v3.total_revenue" in message + assert "not in the requested metrics list" in message + + @pytest.mark.asyncio + async def test_measures_v3_metric_filter_rejected( + self, + client_with_build_v3, + ): + """ + /sql/measures/v3 should reject any filter referencing a metric node — + the measures layer is pre-aggregation, so there's nothing to HAVING + against. + """ + response = await client_with_build_v3.get( + "/sql/measures/v3/", + params={ + "metrics": ["v3.total_revenue"], + "dimensions": ["v3.product.category"], + "filters": ["v3.total_revenue > 10000"], + }, + ) + assert response.status_code == 422 + body = response.json() + message = body.get("message", "") + assert "v3.total_revenue" in message + assert "/sql/measures/v3" in message + + @pytest.mark.asyncio + async def test_measures_v3_combined_metric_filter_rejected( + self, + client_with_build_v3, + ): + """ + /sql/measures/v3/combined flows through build_measures_sql, so the + same rejection applies. + """ + response = await client_with_build_v3.get( + "/sql/measures/v3/combined", + params={ + "metrics": ["v3.total_revenue"], + "dimensions": ["v3.product.category"], + "filters": ["v3.total_revenue > 10000"], + }, + ) + assert response.status_code == 422 + body = response.json() + message = body.get("message", "") + assert "v3.total_revenue" in message + assert "/sql/measures/v3" in message + + @pytest.mark.asyncio + async def test_measures_v3_dimension_filter_still_allowed( + self, + client_with_build_v3, + ): + """Sanity check: dimension filters on /sql/measures/v3 still work.""" + response = await client_with_build_v3.get( + "/sql/measures/v3/", + params={ + "metrics": ["v3.total_revenue"], + "dimensions": ["v3.product.category"], + "filters": ["v3.product.category = 'Electronics'"], + }, + ) + assert response.status_code == 200, response.json() From ec7dc9f7a6e6d553fcba5c68440041b953d5ba38 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Sun, 17 May 2026 09:07:22 -0700 Subject: [PATCH 2/2] Fix coverage --- .../construction/build_v3/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/datajunction-server/datajunction_server/construction/build_v3/utils.py b/datajunction-server/datajunction_server/construction/build_v3/utils.py index fb1ff18de..059ef33c4 100644 --- a/datajunction-server/datajunction_server/construction/build_v3/utils.py +++ b/datajunction-server/datajunction_server/construction/build_v3/utils.py @@ -321,16 +321,10 @@ async def preload_filter_metric_refs(ctx: "BuildContext") -> None: refs = get_filter_column_references(filter_str) except Exception: # pragma: no cover continue - for ref in refs: - # Skip refs already loaded as part of ctx.metrics or anything we - # have on hand. - if ref in ctx.nodes: - continue - candidate_names.add(ref) - - if not candidate_names: - return + candidate_names.update(refs) + # Node.get_by_names early-returns on an empty list, and a no-op lookup is + # cheap, so we skip the explicit empty-check here for simpler coverage. nodes = await Node.get_by_names(ctx.session, list(candidate_names)) for node in nodes: ctx.nodes[node.name] = node