Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions benchmarks/test_named_tuple_factory_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# Copyright ScyllaDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Benchmarks for named_tuple_factory with and without namedtuple class caching.

Run with: pytest benchmarks/test_named_tuple_factory_benchmark.py -v
"""

import re
import warnings
from collections import namedtuple

import pytest

from cassandra.query import named_tuple_factory, _named_tuple_cache
from cassandra.util import _sanitize_identifiers


# ---------------------------------------------------------------------------
# Reference: original uncached implementation (copied from master)
# ---------------------------------------------------------------------------

# Memo of raw column name -> cleaned column name, filled lazily by
# _clean_column_name_old.
_clean_name_cache_old = {}

# Trailing run of characters that are neither ASCII alphanumerics nor "_".
END_BADCHAR_REGEX = re.compile("[^a-zA-Z0-9_]*$")
# Leading run of characters that are not ASCII alphanumerics ("_" included).
START_BADCHAR_REGEX = re.compile("^[^a-zA-Z0-9]*")
# Any single character that is not an ASCII alphanumeric.
NON_ALPHA_REGEX = re.compile("[^a-zA-Z0-9]")


def _clean_column_name_old(name):
    """Return the cleaned form of a column name, memoizing the result.

    The cleaned form is computed in three steps, in this order: strip the
    trailing run matched by END_BADCHAR_REGEX, strip the leading run matched
    by START_BADCHAR_REGEX, then replace every remaining character matched by
    NON_ALPHA_REGEX with "_". Results are stored in _clean_name_cache_old so
    each distinct name is only processed once.
    """
    if name in _clean_name_cache_old:
        return _clean_name_cache_old[name]

    without_tail = END_BADCHAR_REGEX.sub("", name)
    trimmed = START_BADCHAR_REGEX.sub("", without_tail)
    cleaned = NON_ALPHA_REGEX.sub("_", trimmed)

    _clean_name_cache_old[name] = cleaned
    return cleaned


def named_tuple_factory_uncached(colnames, rows):
    """Reference row factory that builds a fresh namedtuple class on every call.

    Kept only as the baseline for the benchmark comparison. Column names are
    cleaned with _clean_column_name_old; if namedtuple() rejects the cleaned
    names, they are passed through _sanitize_identifiers and the class is
    built from the result instead. A SyntaxError from namedtuple() is
    re-raised unchanged.

    Returns a list with one ``Row`` instance per entry of ``rows``.
    """
    # map() is lazy: the iterator is consumed by the first namedtuple() call,
    # so the fallback below has to clean the names again.
    cleaned_iter = map(_clean_column_name_old, colnames)
    try:
        row_cls = namedtuple("Row", cleaned_iter)
    except SyntaxError:
        # Explicitly kept out of the generic fallback below.
        raise
    except Exception:
        cleaned_names = list(map(_clean_column_name_old, colnames))
        row_cls = namedtuple("Row", _sanitize_identifiers(cleaned_names))
    return [row_cls(*values) for values in rows]


# ---------------------------------------------------------------------------
# Test data generators
# ---------------------------------------------------------------------------


def make_colnames(n):
    """Return a tuple of ``n`` column names: ``col_0`` .. ``col_{n-1}``."""
    names = [f"col_{i}" for i in range(n)]
    return tuple(names)


def make_rows(ncols, nrows):
    """Return ``nrows`` rows, each the tuple ``(0, 1, ..., ncols - 1)``."""
    column_values = range(ncols)
    return [tuple(column_values) for _ in range(nrows)]


# ---------------------------------------------------------------------------
# Correctness tests
# ---------------------------------------------------------------------------


class TestNamedTupleFactoryCorrectness:
    """Check that the caching factory agrees with the uncached reference."""

    @pytest.mark.parametrize("ncols", [1, 5, 10, 20])
    @pytest.mark.parametrize("nrows", [1, 10, 100])
    def test_results_match(self, ncols, nrows):
        """Both factories yield the same values and field names, row by row."""
        names = make_colnames(ncols)
        data = make_rows(ncols, nrows)

        # Start from an empty cache so the cached factory takes its miss path.
        _named_tuple_cache.clear()
        from_cached = named_tuple_factory(names, data)
        from_uncached = named_tuple_factory_uncached(names, data)

        assert len(from_cached) == len(from_uncached)
        for cached_row, uncached_row in zip(from_cached, from_uncached):
            assert tuple(cached_row) == tuple(uncached_row)
            assert cached_row._fields == uncached_row._fields

    def test_cache_hit_returns_same_class(self):
        """Two calls with identical column names reuse one Row class."""
        names = ("name", "age", "email")
        _named_tuple_cache.clear()

        first = named_tuple_factory(names, [("Alice", 30, "a@b.com")])
        second = named_tuple_factory(names, [("Bob", 25, "b@c.com")])

        first_cls = type(first[0])
        second_cls = type(second[0])
        assert first_cls is second_cls

    def test_different_schemas_get_different_classes(self):
        """Different column-name tuples produce distinct Row classes."""
        _named_tuple_cache.clear()

        row_ab = named_tuple_factory(("a", "b"), [(1, 2)])[0]
        row_xy = named_tuple_factory(("x", "y"), [(3, 4)])[0]

        assert type(row_ab) is not type(row_xy)
        assert row_ab._fields == ("a", "b")
        assert row_xy._fields == ("x", "y")


# ---------------------------------------------------------------------------
# Benchmarks
# ---------------------------------------------------------------------------


class TestNamedTupleFactoryBenchmark:
    """Benchmark cached vs uncached named_tuple_factory.

    Every (columns, rows) shape has one uncached and one cached test that
    carry the same benchmark group name. The shared set-up lives in the two
    private helpers below; the test methods only pick the shape.
    """

    @staticmethod
    def _bench_uncached(benchmark, ncols, nrows):
        """Benchmark the uncached reference factory on an ncols x nrows input."""
        names = make_colnames(ncols)
        data = make_rows(ncols, nrows)
        benchmark(named_tuple_factory_uncached, names, data)

    @staticmethod
    def _bench_cached(benchmark, ncols, nrows):
        """Benchmark the caching factory on an ncols x nrows input."""
        names = make_colnames(ncols)
        data = make_rows(ncols, nrows)
        # Reset the cache, then warm it with one call made before the
        # benchmark fixture is invoked.
        _named_tuple_cache.clear()
        named_tuple_factory(names, data)
        benchmark(named_tuple_factory, names, data)

    # --- 5 columns, 100 rows ---

    @pytest.mark.benchmark(group="ntf_5cols_100rows")
    def test_uncached_5cols_100rows(self, benchmark):
        self._bench_uncached(benchmark, 5, 100)

    @pytest.mark.benchmark(group="ntf_5cols_100rows")
    def test_cached_5cols_100rows(self, benchmark):
        self._bench_cached(benchmark, 5, 100)

    # --- 10 columns, 100 rows ---

    @pytest.mark.benchmark(group="ntf_10cols_100rows")
    def test_uncached_10cols_100rows(self, benchmark):
        self._bench_uncached(benchmark, 10, 100)

    @pytest.mark.benchmark(group="ntf_10cols_100rows")
    def test_cached_10cols_100rows(self, benchmark):
        self._bench_cached(benchmark, 10, 100)

    # --- 20 columns, 100 rows ---

    @pytest.mark.benchmark(group="ntf_20cols_100rows")
    def test_uncached_20cols_100rows(self, benchmark):
        self._bench_uncached(benchmark, 20, 100)

    @pytest.mark.benchmark(group="ntf_20cols_100rows")
    def test_cached_20cols_100rows(self, benchmark):
        self._bench_cached(benchmark, 20, 100)

    # --- 5 columns, 1000 rows ---

    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
    def test_uncached_5cols_1000rows(self, benchmark):
        self._bench_uncached(benchmark, 5, 1000)

    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
    def test_cached_5cols_1000rows(self, benchmark):
        self._bench_cached(benchmark, 5, 1000)

    # --- 10 columns, 1 row (measures class creation overhead most clearly) ---

    @pytest.mark.benchmark(group="ntf_10cols_1row")
    def test_uncached_10cols_1row(self, benchmark):
        self._bench_uncached(benchmark, 10, 1)

    @pytest.mark.benchmark(group="ntf_10cols_1row")
    def test_cached_10cols_1row(self, benchmark):
        self._bench_cached(benchmark, 10, 1)
59 changes: 35 additions & 24 deletions cassandra/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ def pseudo_namedtuple_factory(colnames, rows):
for od in ordered_dict_factory(colnames, rows)]


# Cache namedtuple Row classes to avoid repeated exec() calls in namedtuple()
# for the same column schema. Entries are keyed by the tuple of column names,
# so the cache is naturally bounded by the number of distinct column-name
# tuples (queries that return the same columns share a single entry).
_named_tuple_cache = {}


def named_tuple_factory(colnames, rows):
"""
Returns each row as a `namedtuple <https://docs.python.org/2/library/collections.html#collections.namedtuple>`_.
Expand Down Expand Up @@ -146,32 +152,37 @@ def named_tuple_factory(colnames, rows):
.. versionchanged:: 2.0.0
moved from ``cassandra.decoder`` to ``cassandra.query``
"""
clean_column_names = map(_clean_column_name, colnames)
key = tuple(colnames)
try:
Row = namedtuple('Row', clean_column_names)
except SyntaxError:
warnings.warn(
"Failed creating namedtuple for a result because there were too "
"many columns. This is due to a Python limitation that affects "
"namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
"created with {substitute_factory_name}, which lacks some namedtuple "
"features and is slower. To avoid slower performance accessing "
"values on row objects, Upgrade to Python 3.7, or use a different "
"row factory. (column names: {colnames})".format(
substitute_factory_name=pseudo_namedtuple_factory.__name__,
colnames=colnames
Row = _named_tuple_cache[key]
except KeyError:
clean_column_names = map(_clean_column_name, colnames)
try:
Row = namedtuple('Row', clean_column_names)
except SyntaxError:
warnings.warn(
"Failed creating namedtuple for a result because there were too "
"many columns. This is due to a Python limitation that affects "
"namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
"created with {substitute_factory_name}, which lacks some namedtuple "
"features and is slower. To avoid slower performance accessing "
"values on row objects, Upgrade to Python 3.7, or use a different "
"row factory. (column names: {colnames})".format(
substitute_factory_name=pseudo_namedtuple_factory.__name__,
colnames=colnames
)
)
)
return pseudo_namedtuple_factory(colnames, rows)
except Exception:
clean_column_names = list(map(_clean_column_name, colnames)) # create list because py3 map object will be consumed by first attempt
log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
"(see Python 'namedtuple' documentation for details on name rules). "
"Results will be returned with positional names. "
"Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
"or specifying a different row_factory on your Session" %
(colnames, clean_column_names))
Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
return pseudo_namedtuple_factory(colnames, rows)
except Exception:
clean_column_names = list(map(_clean_column_name, colnames)) # create list because py3 map object will be consumed by first attempt
log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
"(see Python 'namedtuple' documentation for details on name rules). "
"Results will be returned with positional names. "
"Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
"or specifying a different row_factory on your Session" %
(colnames, clean_column_names))
Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
_named_tuple_cache[key] = Row

return [Row(*row) for row in rows]

Expand Down
Loading