Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions datajunction-server/datajunction_server/api/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,13 @@ async def get_measures_sql_v3(
a cube with temporal partitions, and applies partition filters if so.
lookback_window: Lookback window for temporal filters when applicable.

Filter contract:
Filters may only reference dimension attributes — the measures layer
is pre-aggregation and has no metrics to bind against. Filters that
reference a metric node are rejected with a 422; apply them
downstream after aggregating, or use ``/sql/metrics/v3`` which emits
them as ``HAVING`` clauses.

See also: `/sql/metrics/v3/` for the final combined query with metric expressions.
"""
merged_filters = list(filters)
Expand Down Expand Up @@ -623,6 +630,16 @@ async def get_metrics_sql_v3(
- Dimension references in metric expressions are resolved to their
final column aliases.

Filter contract:
Each filter is classified by what it references:
- A dimension attribute → emitted as a ``WHERE`` predicate.
- A metric in the requested ``metrics`` list → emitted as a ``HAVING``
predicate (post-aggregation).
- A metric *not* in the requested list → rejected with a 422, since
``HAVING`` can only reference SELECT items and silently computing
an extra aggregate would be surprising/wasteful. Add the metric to
``metrics`` to filter on it.

Args:
metrics: List of metric names to include
dimensions: List of dimensions to group by (the grain)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from datajunction_server.construction.build_v3.utils import (
add_dimensions_from_filters,
add_dimensions_from_metric_expressions,
preload_filter_metric_refs,
)
from datajunction_server.database.partition import Partition
from datajunction_server.errors import DJError, DJInvalidInputException, ErrorCode
Expand Down Expand Up @@ -241,6 +242,7 @@ async def setup_build_context(
include_temporal_filters: bool = False,
lookback_window: str | None = None,
matched_cube: Optional["NodeRevision"] = None,
disallow_metric_filters: bool = False,
) -> BuildContext:
"""
Create and initialize a BuildContext with all setup done.
Expand Down Expand Up @@ -286,6 +288,11 @@ async def setup_build_context(
lookback_window=lookback_window,
)

# Pre-resolve any filter refs that are themselves nodes (metrics in
# particular) so add_dimensions_from_filters doesn't mistake a metric
# filter ref for a dimension and produce a confusing join-path error.
await preload_filter_metric_refs(ctx)

# Add filter-driven dimensions upfront — this only parses filter strings and
# needs no DB data, so we do it before load_nodes to include those dimension
# nodes in the first (and ideally only) load.
Expand All @@ -306,6 +313,18 @@ async def setup_build_context(
if not ctx.metrics:
raise DJInvalidInputException("At least one metric is required")

# Validate filter references against loaded node types BEFORE we try to use
# any auto-added "filter dimensions" — a metric ref in a filter would have
# been added to ctx.dimensions by add_dimensions_from_filters (it can't tell
# node types yet), and downstream join resolution would fail with a confusing
# message. Catching it here gives a precise error.
# Note: this is an early run; the WHERE/HAVING split below uses the result.
ctx.dimension_filters, ctx.metric_filters = classify_filters(
ctx.filters,
ctx,
disallow_metric_filters=disallow_metric_filters,
)

# Decompose metrics and group by parent node
ctx.metric_groups, ctx.decomposed_metrics = await decompose_and_group_metrics(ctx)

Expand All @@ -327,11 +346,6 @@ async def setup_build_context(
if missing_dim_nodes or internally_added_roots:
await load_nodes(ctx)

# Classify filters into dimension filters (WHERE) and metric filters (HAVING)
# This MUST happen AFTER all nodes are loaded so we can correctly identify
# whether a filter references a dimension or a metric
ctx.dimension_filters, ctx.metric_filters = classify_filters(ctx.filters, ctx)

return ctx


Expand Down Expand Up @@ -386,6 +400,7 @@ async def build_measures_sql(
include_temporal_filters=include_temporal_filters,
lookback_window=lookback_window,
matched_cube=matched_cube,
disallow_metric_filters=True,
)

# Build grain groups from context
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,36 @@
def classify_filters(
filters: list[str],
ctx: BuildContext,
disallow_metric_filters: bool = False,
) -> tuple[list[str], list[str]]:
"""
Classify filters into dimension filters (WHERE) and metric filters (HAVING).

A filter is classified as a metric filter if it references any metric node.
Otherwise, it's a dimension filter.

Validation:
- Any metric referenced by a filter must also appear in ``ctx.metrics``
(i.e., be in the requested metrics list). Otherwise we'd be silently
computing an extra aggregate the caller didn't ask for, and HAVING
against a SELECT item that doesn't exist.
- If ``disallow_metric_filters`` is True, filters referencing metric
nodes are rejected outright. Used by ``/sql/measures/v3`` since the
measures layer is pre-aggregation and has no metrics to bind against.

Args:
filters: List of filter strings
ctx: Build context with nodes loaded
disallow_metric_filters: If True, raise on any filter that references
a metric node (for the measures endpoint).

Returns:
Tuple of (dimension_filters, metric_filters)

Raises:
DJInvalidInputException: if a filter references a metric not in
``ctx.metrics``, or any metric at all when ``disallow_metric_filters``.

Example::

dimension_filters, metric_filters = classify_filters(
Expand All @@ -86,20 +102,38 @@ def classify_filters(
"""
dimension_filters = []
metric_filters = []
requested_metrics = set(ctx.metrics)

for filter_str in filters:
# Extract all column references from this filter
refs = get_filter_column_references(filter_str)

# Check if any reference is a metric
is_metric_filter = False
for ref in refs:
node = ctx.nodes.get(ref)
if node and node.type == NodeType.METRIC:
is_metric_filter = True
break
# Identify metric refs in this filter
metric_refs = [
ref
for ref in refs
if (node := ctx.nodes.get(ref)) is not None and node.type == NodeType.METRIC
]

if is_metric_filter:
if metric_refs:
if disallow_metric_filters:
raise DJInvalidInputException(
f"Filter {filter_str!r} references metric "
f"{metric_refs[0]!r}, but filters on metric nodes are not "
"supported by /sql/measures/v3 — the measures layer is "
"pre-aggregation and has no metrics to bind against. "
"Apply the filter downstream after aggregating, or use "
"/sql/metrics/v3 which emits a HAVING clause for "
"metric filters.",
)
missing = [m for m in metric_refs if m not in requested_metrics]
if missing:
raise DJInvalidInputException(
f"Filter {filter_str!r} references metric {missing[0]!r}, "
"which is not in the requested metrics list. Add it to "
"`metrics` to filter on it (HAVING clauses can only "
"reference metrics being selected).",
)
metric_filters.append(filter_str)
else:
dimension_filters.append(filter_str)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
parse_filter,
)
from datajunction_server.database.node import Node
from datajunction_server.models.node_type import NodeType
from datajunction_server.sql.parsing import ast
from datajunction_server.utils import SEPARATOR

Expand Down Expand Up @@ -296,6 +297,39 @@ def add_dimensions_from_metric_expressions(
)


async def preload_filter_metric_refs(ctx: "BuildContext") -> None:
"""
Pre-load any nodes referenced by filters so we can detect metric filters
before ``add_dimensions_from_filters`` would otherwise treat them as
dimension refs (and produce confusing join-path errors downstream).

Filter-refs that resolve to non-node names (true dimension attribute paths
like ``v3.product.category``) are simply absent from the result — those
continue to flow through ``add_dimensions_from_filters`` unchanged.
"""
# Local imports to avoid circulars.
from datajunction_server.construction.build_v3.filters import (
get_filter_column_references,
)

if not ctx.filters:
return

candidate_names: set[str] = set()
for filter_str in ctx.filters:
try:
refs = get_filter_column_references(filter_str)
except Exception: # pragma: no cover
continue
candidate_names.update(refs)

# Node.get_by_names early-returns on an empty list, and a no-op lookup is
# cheap, so we skip the explicit empty-check here for simpler coverage.
nodes = await Node.get_by_names(ctx.session, list(candidate_names))
for node in nodes:
ctx.nodes[node.name] = node


def add_dimensions_from_filters(ctx: "BuildContext") -> None:
"""
Scan filter expressions for dimension references and add them to ctx.dimensions.
Expand Down Expand Up @@ -415,11 +449,19 @@ def add_dimensions_from_filters(ctx: "BuildContext") -> None:
# Already in dimensions, no need to add
continue

# Skip if this is a metric reference, not a dimension
# Metrics in WHERE clauses should be treated as HAVING conditions,
# not dimension joins
# Skip if this is a metric reference, not a dimension.
# Metrics in filter expressions become HAVING conditions, not
# dimension joins. We catch both the (common) case where the metric
# is in the requested list, and the case where the user filtered on
# a metric that's NOT in the metrics list — the latter is a
# validation error raised later by classify_filters, but we must
# avoid auto-adding it as a dimension here or downstream join
# resolution would fail first with a confusing error.
if full_name in ctx.metrics:
continue
ref_node = ctx.nodes.get(full_name)
if ref_node is not None and ref_node.type == NodeType.METRIC:
continue

# Check if any existing dimension already covers this (node, column)
dim_ref = parse_dimension_ref(full_name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -672,3 +672,98 @@ async def test_filter_only_dimension_with_metric_filter(
column_names = [col["name"] for col in result["columns"]]
assert "category" in column_names
assert "subcategory" not in column_names


class TestMetricFilterValidation:
"""
Tests for validation rules on metric-referencing filters:
- On /sql/metrics/v3: the filtered metric must be in the metrics list.
- On /sql/measures/v3 (and its `combined` variant): metric filters are
rejected outright since the measures layer is pre-aggregation.
"""

@pytest.mark.asyncio
async def test_metrics_v3_metric_filter_not_in_metrics_list_rejected(
self,
client_with_build_v3,
):
"""
A filter on a metric that isn't in the requested metrics list should
be rejected with a 422, since HAVING can only reference SELECT items.
"""
response = await client_with_build_v3.get(
"/sql/metrics/v3/",
params={
"metrics": ["v3.order_count"],
"dimensions": ["v3.product.category"],
"filters": ["v3.total_revenue > 10000"],
},
)
assert response.status_code == 422
body = response.json()
message = body.get("message", "")
assert "v3.total_revenue" in message
assert "not in the requested metrics list" in message

@pytest.mark.asyncio
async def test_measures_v3_metric_filter_rejected(
self,
client_with_build_v3,
):
"""
/sql/measures/v3 should reject any filter referencing a metric node —
the measures layer is pre-aggregation, so there's nothing to HAVING
against.
"""
response = await client_with_build_v3.get(
"/sql/measures/v3/",
params={
"metrics": ["v3.total_revenue"],
"dimensions": ["v3.product.category"],
"filters": ["v3.total_revenue > 10000"],
},
)
assert response.status_code == 422
body = response.json()
message = body.get("message", "")
assert "v3.total_revenue" in message
assert "/sql/measures/v3" in message

@pytest.mark.asyncio
async def test_measures_v3_combined_metric_filter_rejected(
self,
client_with_build_v3,
):
"""
/sql/measures/v3/combined flows through build_measures_sql, so the
same rejection applies.
"""
response = await client_with_build_v3.get(
"/sql/measures/v3/combined",
params={
"metrics": ["v3.total_revenue"],
"dimensions": ["v3.product.category"],
"filters": ["v3.total_revenue > 10000"],
},
)
assert response.status_code == 422
body = response.json()
message = body.get("message", "")
assert "v3.total_revenue" in message
assert "/sql/measures/v3" in message

@pytest.mark.asyncio
async def test_measures_v3_dimension_filter_still_allowed(
self,
client_with_build_v3,
):
"""Sanity check: dimension filters on /sql/measures/v3 still work."""
response = await client_with_build_v3.get(
"/sql/measures/v3/",
params={
"metrics": ["v3.total_revenue"],
"dimensions": ["v3.product.category"],
"filters": ["v3.product.category = 'Electronics'"],
},
)
assert response.status_code == 200, response.json()
Loading