diff --git a/benchmarks/test_named_tuple_factory_benchmark.py b/benchmarks/test_named_tuple_factory_benchmark.py
new file mode 100644
index 0000000000..5edbf1e9c0
--- /dev/null
+++ b/benchmarks/test_named_tuple_factory_benchmark.py
@@ -0,0 +1,226 @@
+# Copyright ScyllaDB, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Benchmarks for named_tuple_factory with and without namedtuple class caching.
+
+Run with: pytest benchmarks/test_named_tuple_factory_benchmark.py -v
+"""
+
+import re
+from collections import namedtuple
+
+import pytest
+
+from cassandra.query import named_tuple_factory, _named_tuple_cache
+from cassandra.util import _sanitize_identifiers
+
+
+# ---------------------------------------------------------------------------
+# Reference: original uncached implementation (copied from master)
+# ---------------------------------------------------------------------------
+
+NON_ALPHA_REGEX = re.compile("[^a-zA-Z0-9]")
+START_BADCHAR_REGEX = re.compile("^[^a-zA-Z0-9]*")
+END_BADCHAR_REGEX = re.compile("[^a-zA-Z0-9_]*$")
+
+# Memo for _clean_column_name_old: raw column name -> cleaned name.
+_clean_name_cache_old = {}
+
+
+def _clean_column_name_old(name):
+    """Return a memoized, cleaned-up version of ``name``.
+
+    Trailing characters outside ``[a-zA-Z0-9_]`` and leading characters outside
+    ``[a-zA-Z0-9]`` are stripped; remaining non-alphanumerics become ``_``.
+    """
+    try:
+        return _clean_name_cache_old[name]
+    except KeyError:
+        clean = NON_ALPHA_REGEX.sub(
+            "_", START_BADCHAR_REGEX.sub("", END_BADCHAR_REGEX.sub("", name))
+        )
+        _clean_name_cache_old[name] = clean
+        return clean
+
+
+def named_tuple_factory_uncached(colnames, rows):
+    """Original implementation without caching (for benchmark comparison).
+
+    Builds a fresh ``Row`` namedtuple class on every call and returns one
+    instance per entry in ``rows``.
+    """
+    clean_column_names = map(_clean_column_name_old, colnames)
+    try:
+        Row = namedtuple("Row", clean_column_names)
+    except SyntaxError:
+        raise
+    except Exception:
+        # The map object above was consumed by the failed attempt; rebuild it.
+        clean_column_names = list(map(_clean_column_name_old, colnames))
+        Row = namedtuple("Row", _sanitize_identifiers(clean_column_names))
+    return [Row(*row) for row in rows]
+
+
+# ---------------------------------------------------------------------------
+# Test data generators
+# ---------------------------------------------------------------------------
+
+
+def make_colnames(n):
+    """Return a tuple of ``n`` column names: ``col_0``, ``col_1``, ..."""
+    return tuple(f"col_{i}" for i in range(n))
+
+
+def make_rows(ncols, nrows):
+    """Return ``nrows`` identical rows, each the tuple ``(0, ..., ncols - 1)``."""
+    return [tuple(range(ncols)) for _ in range(nrows)]
+
+
+# ---------------------------------------------------------------------------
+# Correctness tests
+# ---------------------------------------------------------------------------
+
+
+class TestNamedTupleFactoryCorrectness:
+    """Verify the cached implementation matches the uncached one."""
+
+    @pytest.mark.parametrize("ncols", [1, 5, 10, 20])
+    @pytest.mark.parametrize("nrows", [1, 10, 100])
+    def test_results_match(self, ncols, nrows):
+        colnames = make_colnames(ncols)
+        rows = make_rows(ncols, nrows)
+        _named_tuple_cache.clear()
+        cached_result = named_tuple_factory(colnames, rows)
+        uncached_result = named_tuple_factory_uncached(colnames, rows)
+        assert len(cached_result) == len(uncached_result)
+        for cr, ur in zip(cached_result, uncached_result):
+            assert tuple(cr) == tuple(ur)
+            assert cr._fields == ur._fields
+
+    def test_cache_hit_returns_same_class(self):
+        colnames = ("name", "age", "email")
+        rows1 = [("Alice", 30, "a@b.com")]
+        rows2 = [("Bob", 25, "b@c.com")]
+        _named_tuple_cache.clear()
+        result1 = named_tuple_factory(colnames, rows1)
+        result2 = named_tuple_factory(colnames, rows2)
+        # Same Row class should be reused
+        assert type(result1[0]) is type(result2[0])
+
+    def test_different_schemas_get_different_classes(self):
+        _named_tuple_cache.clear()
+        result1 = named_tuple_factory(("a", "b"), [(1, 2)])
+        result2 = named_tuple_factory(("x", "y"), [(3, 4)])
+        assert type(result1[0]) is not type(result2[0])
+        assert result1[0]._fields == ("a", "b")
+        assert result2[0]._fields == ("x", "y")
+
+    def test_cache_size_is_bounded(self, monkeypatch):
+        # Shrink the cap so overflow is reached with only a few schemas.
+        monkeypatch.setattr("cassandra.query._NAMED_TUPLE_CACHE_MAX_SIZE", 4)
+        _named_tuple_cache.clear()
+        for i in range(10):
+            named_tuple_factory((f"col_{i}",), [(i,)])
+        assert 0 < len(_named_tuple_cache) <= 4
+
+
+# ---------------------------------------------------------------------------
+# Benchmarks
+# ---------------------------------------------------------------------------
+
+
+class TestNamedTupleFactoryBenchmark:
+    """Benchmark cached vs uncached named_tuple_factory."""
+
+    # --- 5 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_5cols_100rows")
+    def test_uncached_5cols_100rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 100)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_5cols_100rows")
+    def test_cached_5cols_100rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 100)
+        _named_tuple_cache.clear()
+        # Warm the cache with one call
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 10 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_10cols_100rows")
+    def test_uncached_10cols_100rows(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 100)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_10cols_100rows")
+    def test_cached_10cols_100rows(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 100)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 20 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_20cols_100rows")
+    def test_uncached_20cols_100rows(self, benchmark):
+        colnames = make_colnames(20)
+        rows = make_rows(20, 100)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_20cols_100rows")
+    def test_cached_20cols_100rows(self, benchmark):
+        colnames = make_colnames(20)
+        rows = make_rows(20, 100)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 5 columns, 1000 rows ---
+
+    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
+    def test_uncached_5cols_1000rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 1000)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
+    def test_cached_5cols_1000rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 1000)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 10 columns, 1 row (measures class creation overhead most clearly) ---
+
+    @pytest.mark.benchmark(group="ntf_10cols_1row")
+    def test_uncached_10cols_1row(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 1)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_10cols_1row")
+    def test_cached_10cols_1row(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 1)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
diff --git a/cassandra/query.py b/cassandra/query.py
index 6c6878fdb4..a254b56127 100644
--- a/cassandra/query.py
+++ b/cassandra/query.py
@@ -117,6 +117,17 @@ def pseudo_namedtuple_factory(colnames, rows):
             for od in ordered_dict_factory(colnames, rows)]
 
 
+# Cache namedtuple Row classes to avoid repeated exec() calls in namedtuple()
+# for the same column schema. Keys are tuples of the raw column names, so the
+# cache holds at most one entry per distinct result schema. Aliases make the
+# set of possible column names open-ended, so the size is capped: once the cap
+# is reached the cache is cleared and refilled on demand.
+_named_tuple_cache = {}
+
+# Maximum number of Row classes kept in _named_tuple_cache.
+_NAMED_TUPLE_CACHE_MAX_SIZE = 1024
+
+
 def named_tuple_factory(colnames, rows):
     """
     Returns each row as a `namedtuple `_.
@@ -146,32 +157,42 @@ def named_tuple_factory(colnames, rows):
     .. versionchanged:: 2.0.0
         moved from ``cassandra.decoder`` to ``cassandra.query``
     """
-    clean_column_names = map(_clean_column_name, colnames)
+    key = tuple(colnames)
     try:
-        Row = namedtuple('Row', clean_column_names)
-    except SyntaxError:
-        warnings.warn(
-            "Failed creating namedtuple for a result because there were too "
-            "many columns. This is due to a Python limitation that affects "
-            "namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
-            "created with {substitute_factory_name}, which lacks some namedtuple "
-            "features and is slower. To avoid slower performance accessing "
-            "values on row objects, Upgrade to Python 3.7, or use a different "
-            "row factory. (column names: {colnames})".format(
-                substitute_factory_name=pseudo_namedtuple_factory.__name__,
-                colnames=colnames
+        Row = _named_tuple_cache[key]
+    except KeyError:
+        # Cache miss: build the Row class for this column schema once.
+        clean_column_names = map(_clean_column_name, colnames)
+        try:
+            Row = namedtuple('Row', clean_column_names)
+        except SyntaxError:
+            warnings.warn(
+                "Failed creating namedtuple for a result because there were too "
+                "many columns. This is due to a Python limitation that affects "
+                "namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
+                "created with {substitute_factory_name}, which lacks some namedtuple "
+                "features and is slower. To avoid slower performance accessing "
+                "values on row objects, Upgrade to Python 3.7, or use a different "
+                "row factory. (column names: {colnames})".format(
+                    substitute_factory_name=pseudo_namedtuple_factory.__name__,
+                    colnames=colnames
+                )
             )
-        )
-        return pseudo_namedtuple_factory(colnames, rows)
-    except Exception:
-        clean_column_names = list(map(_clean_column_name, colnames))  # create list because py3 map object will be consumed by first attempt
-        log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
-                    "(see Python 'namedtuple' documentation for details on name rules). "
-                    "Results will be returned with positional names. "
-                    "Avoid this by choosing different names, using SELECT \"\" AS aliases, "
-                    "or specifying a different row_factory on your Session" %
-                    (colnames, clean_column_names))
-        Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
+            # The pseudo-namedtuple fallback builds no Row class, so nothing is cached.
+            return pseudo_namedtuple_factory(colnames, rows)
+        except Exception:
+            clean_column_names = list(map(_clean_column_name, colnames))  # create list because py3 map object will be consumed by first attempt
+            log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
+                        "(see Python 'namedtuple' documentation for details on name rules). "
+                        "Results will be returned with positional names. "
+                        "Avoid this by choosing different names, using SELECT \"\" AS aliases, "
+                        "or specifying a different row_factory on your Session" %
+                        (colnames, clean_column_names))
+            Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
+        if len(_named_tuple_cache) >= _NAMED_TUPLE_CACHE_MAX_SIZE:
+            # Cheap eviction: drop everything instead of tracking recency.
+            _named_tuple_cache.clear()
+        _named_tuple_cache[key] = Row
 
     return [Row(*row) for row in rows]
 