scylladb · mykaul · Apr 3, 2026
diff --git a/benchmarks/test_named_tuple_factory_benchmark.py b/benchmarks/test_named_tuple_factory_benchmark.py
@@ -0,0 +1,206 @@
+# Copyright ScyllaDB, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Benchmarks for named_tuple_factory with and without namedtuple class caching.
+
+Run with (needs the pytest-benchmark plugin): pytest benchmarks/test_named_tuple_factory_benchmark.py -v
+"""
+
+import re
+import warnings
+from collections import namedtuple
+
+import pytest
+
+from cassandra.query import named_tuple_factory, _named_tuple_cache
+from cassandra.util import _sanitize_identifiers
+
+
+# ---------------------------------------------------------------------------
+# Reference: original uncached implementation (copied from master)
+# ---------------------------------------------------------------------------
+
+NON_ALPHA_REGEX = re.compile("[^a-zA-Z0-9]")  # one char that is not an ASCII letter/digit
+START_BADCHAR_REGEX = re.compile("^[^a-zA-Z0-9]*")  # leading run of such characters
+END_BADCHAR_REGEX = re.compile("[^a-zA-Z0-9_]*$")  # trailing run of non-alphanumerics other than "_"
+# Memo filled by _clean_column_name_old: raw column name -> cleaned name.
+_clean_name_cache_old = {}
+
+
+def _clean_column_name_old(name):
+    """Return the memoised cleaned form of ``name`` (edges stripped, rest -> "_")."""
+    cached = _clean_name_cache_old.get(name)
+    if cached is not None:
+        return cached
+    trimmed = START_BADCHAR_REGEX.sub("", END_BADCHAR_REGEX.sub("", name))
+    cleaned = NON_ALPHA_REGEX.sub("_", trimmed)
+    _clean_name_cache_old[name] = cleaned
+    return cleaned
+
+
+def named_tuple_factory_uncached(colnames, rows):
+    """Baseline for the benchmarks: build a new Row class on every call."""
+    lazy_names = map(_clean_column_name_old, colnames)
+    try:
+        row_cls = namedtuple("Row", lazy_names)
+    except SyntaxError:
+        raise  # propagated as-is; this baseline has no substitute factory
+    except Exception:
+        cleaned = [_clean_column_name_old(col) for col in colnames]  # map is spent
+        row_cls = namedtuple("Row", _sanitize_identifiers(cleaned))
+    return [row_cls(*values) for values in rows]
+
+
+# ---------------------------------------------------------------------------
+# Test data generators
+# ---------------------------------------------------------------------------
+
+
+def make_colnames(n):  # tuple of n names: "col_0", "col_1", ..., "col_<n-1>"
+    return tuple(f"col_{i}" for i in range(n))
+
+
+def make_rows(ncols, nrows):  # list of nrows tuples, each (0, 1, ..., ncols - 1)
+    return [tuple(range(ncols)) for _ in range(nrows)]
+
+
+# ---------------------------------------------------------------------------
+# Correctness tests
+# ---------------------------------------------------------------------------
+
+
+class TestNamedTupleFactoryCorrectness:
+    """Check that the caching factory yields the same rows as the reference."""
+
+    @pytest.mark.parametrize("ncols", [1, 5, 10, 20])
+    @pytest.mark.parametrize("nrows", [1, 10, 100])
+    def test_results_match(self, ncols, nrows):
+        """Both factories return equal values and field names, row by row."""
+        names, data = make_colnames(ncols), make_rows(ncols, nrows)
+        _named_tuple_cache.clear()
+        actual = named_tuple_factory(names, data)
+        expected = named_tuple_factory_uncached(names, data)
+        assert len(actual) == len(expected)
+        for got, want in zip(actual, expected):
+            assert tuple(got) == tuple(want)
+            assert got._fields == want._fields
+
+    def test_cache_hit_returns_same_class(self):
+        """Two calls with the same column names reuse one Row class."""
+        schema = ("name", "age", "email")
+        _named_tuple_cache.clear()
+        first = named_tuple_factory(schema, [("Alice", 30, "a@b.com")])[0]
+        second = named_tuple_factory(schema, [("Bob", 25, "b@c.com")])[0]
+        # Same Row class should be reused
+        assert type(first) is type(second)
+
+    def test_different_schemas_get_different_classes(self):
+        """Different column names yield distinct Row classes with their own fields."""
+        _named_tuple_cache.clear()
+        ab_row = named_tuple_factory(("a", "b"), [(1, 2)])[0]
+        xy_row = named_tuple_factory(("x", "y"), [(3, 4)])[0]
+        assert type(ab_row) is not type(xy_row)
+        assert ab_row._fields == ("a", "b")
+        assert xy_row._fields == ("x", "y")
+
+
+# ---------------------------------------------------------------------------
+# Benchmarks
+# ---------------------------------------------------------------------------
+
+
+class TestNamedTupleFactoryBenchmark:
+    """Timing comparison of the cached factory against the uncached baseline."""
+
+    # --- 5 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_5cols_100rows")
+    def test_uncached_5cols_100rows(self, benchmark):
+        """Baseline: Row class rebuilt on every call, 5 columns x 100 rows."""
+        names, data = make_colnames(5), make_rows(5, 100)
+        benchmark(named_tuple_factory_uncached, names, data)
+
+    @pytest.mark.benchmark(group="ntf_5cols_100rows")
+    def test_cached_5cols_100rows(self, benchmark):
+        """Cached factory after one warm-up call, 5 columns x 100 rows."""
+        names, data = make_colnames(5), make_rows(5, 100)
+        _named_tuple_cache.clear()
+        # Warm the cache with one call
+        named_tuple_factory(names, data)
+        benchmark(named_tuple_factory, names, data)
+
+    # --- 10 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_10cols_100rows")
+    def test_uncached_10cols_100rows(self, benchmark):
+        """Baseline: Row class rebuilt on every call, 10 columns x 100 rows."""
+        names, data = make_colnames(10), make_rows(10, 100)
+        benchmark(named_tuple_factory_uncached, names, data)
+
+    @pytest.mark.benchmark(group="ntf_10cols_100rows")
+    def test_cached_10cols_100rows(self, benchmark):
+        """Cached factory after one warm-up call, 10 columns x 100 rows."""
+        names, data = make_colnames(10), make_rows(10, 100)
+        _named_tuple_cache.clear()
+        named_tuple_factory(names, data)
+        benchmark(named_tuple_factory, names, data)
+
+    # --- 20 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_20cols_100rows")
+    def test_uncached_20cols_100rows(self, benchmark):
+        """Baseline: Row class rebuilt on every call, 20 columns x 100 rows."""
+        names, data = make_colnames(20), make_rows(20, 100)
+        benchmark(named_tuple_factory_uncached, names, data)
+
+    @pytest.mark.benchmark(group="ntf_20cols_100rows")
+    def test_cached_20cols_100rows(self, benchmark):
+        """Cached factory after one warm-up call, 20 columns x 100 rows."""
+        names, data = make_colnames(20), make_rows(20, 100)
+        _named_tuple_cache.clear()
+        named_tuple_factory(names, data)
+        benchmark(named_tuple_factory, names, data)
+
+    # --- 5 columns, 1000 rows ---
+
+    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
+    def test_uncached_5cols_1000rows(self, benchmark):
+        """Baseline: Row class rebuilt on every call, 5 columns x 1000 rows."""
+        names, data = make_colnames(5), make_rows(5, 1000)
+        benchmark(named_tuple_factory_uncached, names, data)
+
+    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
+    def test_cached_5cols_1000rows(self, benchmark):
+        """Cached factory after one warm-up call, 5 columns x 1000 rows."""
+        names, data = make_colnames(5), make_rows(5, 1000)
+        _named_tuple_cache.clear()
+        named_tuple_factory(names, data)
+        benchmark(named_tuple_factory, names, data)
+
+    # --- 10 columns, 1 row (measures class creation overhead most clearly) ---
+
+    @pytest.mark.benchmark(group="ntf_10cols_1row")
+    def test_uncached_10cols_1row(self, benchmark):
+        """Baseline: Row class rebuilt on every call, 10 columns x 1 row."""
+        names, data = make_colnames(10), make_rows(10, 1)
+        benchmark(named_tuple_factory_uncached, names, data)
+
+    @pytest.mark.benchmark(group="ntf_10cols_1row")
+    def test_cached_10cols_1row(self, benchmark):
+        """Cached factory after one warm-up call, 10 columns x 1 row."""
+        names, data = make_colnames(10), make_rows(10, 1)
+        _named_tuple_cache.clear()
+        named_tuple_factory(names, data)
+        benchmark(named_tuple_factory, names, data)
diff --git a/cassandra/query.py b/cassandra/query.py
@@ -117,6 +117,12 @@ def pseudo_namedtuple_factory(colnames, rows):
             for od in ordered_dict_factory(colnames, rows)]
 
 
+# Cache namedtuple Row classes to avoid repeated exec() calls in namedtuple() for
+# the same column schema. Keyed by tuple(colnames), so it holds one class per
+# distinct column-name tuple (typically no more than one per distinct query).
+_named_tuple_cache = {}
+
+
 def named_tuple_factory(colnames, rows):
     """
     Returns each row as a `namedtuple <https://docs.python.org/2/library/collections.html#collections.namedtuple>`_.
@@ -146,32 +152,37 @@ def named_tuple_factory(colnames, rows):
     .. versionchanged:: 2.0.0
         moved from ``cassandra.decoder`` to ``cassandra.query``
     """
-    clean_column_names = map(_clean_column_name, colnames)
+    key = tuple(colnames)
     try:
-        Row = namedtuple('Row', clean_column_names)
-    except SyntaxError:
-        warnings.warn(
-            "Failed creating namedtuple for a result because there were too "
-            "many columns. This is due to a Python limitation that affects "
-            "namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
-            "created with {substitute_factory_name}, which lacks some namedtuple "
-            "features and is slower. To avoid slower performance accessing "
-            "values on row objects, Upgrade to Python 3.7, or use a different "
-            "row factory. (column names: {colnames})".format(
-                substitute_factory_name=pseudo_namedtuple_factory.__name__,
-                colnames=colnames
+        Row = _named_tuple_cache[key]
+    except KeyError:
+        clean_column_names = map(_clean_column_name, colnames)
+        try:
+            Row = namedtuple('Row', clean_column_names)
+        except SyntaxError:
+            warnings.warn(
+                "Failed creating namedtuple for a result because there were too "
+                "many columns. This is due to a Python limitation that affects "
+                "namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
+                "created with {substitute_factory_name}, which lacks some namedtuple "
+                "features and is slower. To avoid slower performance accessing "
+                "values on row objects, Upgrade to Python 3.7, or use a different "
+                "row factory. (column names: {colnames})".format(
+                    substitute_factory_name=pseudo_namedtuple_factory.__name__,
+                    colnames=colnames
+                )
             )
-        )
-        return pseudo_namedtuple_factory(colnames, rows)
-    except Exception:
-        clean_column_names = list(map(_clean_column_name, colnames))  # create list because py3 map object will be consumed by first attempt
-        log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
-                    "(see Python 'namedtuple' documentation for details on name rules). "
-                    "Results will be returned with positional names. "
-                    "Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
-                    "or specifying a different row_factory on your Session" %
-                    (colnames, clean_column_names))
-        Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
+            return pseudo_namedtuple_factory(colnames, rows)
+        except Exception:
+            clean_column_names = list(map(_clean_column_name, colnames))  # create list because py3 map object will be consumed by first attempt
+            log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
+                        "(see Python 'namedtuple' documentation for details on name rules). "
+                        "Results will be returned with positional names. "
+                        "Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
+                        "or specifying a different row_factory on your Session" %
+                        (colnames, clean_column_names))
+            Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
+        _named_tuple_cache[key] = Row
 
     return [Row(*row) for row in rows]