From 6f7dc4a44da0784c89aaaafe96a869c2c00dd7b7 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Fri, 3 Apr 2026 18:20:03 +0300 Subject: [PATCH 1/6] (improvement) metadata: add lightweight _RowView and _row_factory Introduce _RowView, a __slots__-based read-only row wrapper that stores data as tuples with a shared column-name-to-index map, and _row_factory that creates these views. _RowView inherits from collections.abc.Mapping, providing a complete dict-like read interface. This eliminates per-row dict allocation during schema parsing. All rows from the same result set share a single index map object. --- cassandra/metadata.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/cassandra/metadata.py b/cassandra/metadata.py index 43399b7152..b49b47d0c2 100644 --- a/cassandra/metadata.py +++ b/cassandra/metadata.py @@ -46,6 +46,56 @@ from cassandra.tablets import Tablets from cassandra.util import maybe_add_timeout_to_query + +class _RowView(Mapping): + """ + Lightweight read-only view over a row tuple, supporting dict-like access. + Shares a single index map across all rows from the same result set, + avoiding per-row dict allocation overhead. + + Implements the :class:`collections.abc.Mapping` protocol: ``__getitem__``, + ``__iter__`` and ``__len__`` are defined here, ``get`` and ``__contains__`` + are overridden, and ``keys``, ``values`` and ``items`` are inherited.
+ """ + + __slots__ = ("_row", "_index_map") + + def __init__(self, row, index_map): + self._row = row + self._index_map = index_map + + def __getitem__(self, key): + return self._row[self._index_map[key]] + + def __iter__(self): + return iter(self._index_map) + + def __len__(self): + return len(self._index_map) + + def get(self, key, default=None): + idx = self._index_map.get(key) + if idx is not None: + return self._row[idx] + return default + + def __contains__(self, key): + return key in self._index_map + + def __repr__(self): + return repr({k: self._row[i] for k, i in self._index_map.items()}) + + +def _row_factory(colnames, rows): + """ + Lightweight replacement for dict_factory used internally by schema parsers. + Returns a list of _RowView objects that support row["key"] and row.get("key") + but store data as tuples with a shared column-name-to-index map. + """ + index_map = {name: i for i, name in enumerate(colnames)} + return [_RowView(row, index_map) for row in rows] + + log = logging.getLogger(__name__) cql_keywords = set(( From 4a8d055f240036a2eaa67a7753da0cb6ca1a905b Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Fri, 3 Apr 2026 18:20:38 +0300 Subject: [PATCH 2/6] (improvement) metadata: replace OrderedDict with dict Python 3.7+ guarantees dict preserves insertion order, making OrderedDict unnecessary. Replace OrderedDict() with {} in TableMetadata.columns, TableMetadata.triggers, and MaterializedViewMetadata.columns. Remove the now-unused OrderedDict import. 
--- cassandra/metadata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cassandra/metadata.py b/cassandra/metadata.py index b49b47d0c2..1c34a24988 100644 --- a/cassandra/metadata.py +++ b/cassandra/metadata.py @@ -40,7 +40,7 @@ from cassandra.marshal import varint_unpack from cassandra.protocol import QueryMessage from cassandra.query import dict_factory, bind_params -from cassandra.util import OrderedDict, Version +from cassandra.util import Version from cassandra.pool import HostDistance from cassandra.connection import EndPoint from cassandra.tablets import Tablets @@ -1380,11 +1380,11 @@ def __init__(self, keyspace_name, name, partition_key=None, clustering_key=None, self.name = name self.partition_key = [] if partition_key is None else partition_key self.clustering_key = [] if clustering_key is None else clustering_key - self.columns = OrderedDict() if columns is None else columns + self.columns = {} if columns is None else columns self.indexes = {} self.options = {} if options is None else options self.comparator = None - self.triggers = OrderedDict() if triggers is None else triggers + self.triggers = {} if triggers is None else triggers self.views = {} self.virtual = virtual @@ -2796,7 +2796,7 @@ def _build_table_columns(self, meta, col_rows, compact_static=False, is_dense=Fa partition_rows = sorted(partition_rows, key=lambda row: row.get('position')) for r in partition_rows: # we have to add meta here (and not in the later loop) because TableMetadata.columns is an - # OrderedDict, and it assumes keys are inserted first, in order, when exporting CQL + # dict (ordered since Python 3.7), and it assumes keys are inserted first, in order, when exporting CQL column_meta = self._build_column_metadata(meta, r) meta.columns[column_meta.name] = column_meta meta.partition_key.append(meta.columns[r.get('column_name')]) @@ -3378,7 +3378,7 @@ def __init__(self, keyspace_name, view_name, base_table_name, include_all_column 
self.base_table_name = base_table_name self.partition_key = [] self.clustering_key = [] - self.columns = OrderedDict() + self.columns = {} self.include_all_columns = include_all_columns self.where_clause = where_clause self.options = options or {} From d5f0ae0efc1a118bcd6c40f56a13bb2a511d2ab7 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Fri, 3 Apr 2026 18:21:14 +0300 Subject: [PATCH 3/6] (improvement) metadata: select only needed columns from system_schema.columns Replace SELECT * with an explicit column list for the system_schema.columns query in SchemaParserV3 (inherited by V4). Only the 7 columns actually consumed by the parser are fetched: keyspace_name, table_name, column_name, clustering_order, kind, position, type. This reduces network transfer and deserialization overhead during schema refresh. --- cassandra/metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cassandra/metadata.py b/cassandra/metadata.py index 1c34a24988..24b9901c06 100644 --- a/cassandra/metadata.py +++ b/cassandra/metadata.py @@ -2619,7 +2619,10 @@ class SchemaParserV3(SchemaParserV22): """ _SELECT_KEYSPACES = "SELECT * FROM system_schema.keyspaces" _SELECT_TABLES = "SELECT * FROM system_schema.tables" - _SELECT_COLUMNS = "SELECT * FROM system_schema.columns" + # Only fetch the columns used by _build_column_metadata / _build_table_columns. + # If _build_column_metadata or _build_table_columns needs more columns, this query + # should be updated accordingly. 
+ _SELECT_COLUMNS = "SELECT keyspace_name, table_name, column_name, clustering_order, kind, position, type FROM system_schema.columns" _SELECT_INDEXES = "SELECT * FROM system_schema.indexes" _SELECT_TRIGGERS = "SELECT * FROM system_schema.triggers" _SELECT_TYPES = "SELECT * FROM system_schema.types" From b23c7ae19d20d73182bf8e92c750e030f9107611 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Fri, 3 Apr 2026 18:22:15 +0300 Subject: [PATCH 4/6] (improvement) metadata: replace dict_factory with lightweight _RowView Replace dict_factory in _SchemaParser._handle_results and get_column_from_system_local with _row_factory, eliminating per-row dict allocation during schema parsing. Also refactor SchemaParserV4._build_keyspace_metadata_internal to read from the row without mutating it, since _RowView is read-only. Note: V22-only dict_factory call sites are left unchanged as they do not affect the V3/V4 code path (V3 and V4 fully override _query_all). --- cassandra/metadata.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cassandra/metadata.py b/cassandra/metadata.py index 24b9901c06..195a3fa50e 100644 --- a/cassandra/metadata.py +++ b/cassandra/metadata.py @@ -2057,7 +2057,7 @@ def get_next_pages(): yield next_result.parsed_rows result.parsed_rows += itertools.chain(*get_next_pages()) - return dict_factory(result.column_names, result.parsed_rows) if result else [] + return _row_factory(result.column_names, result.parsed_rows) if result else [] else: raise result @@ -3109,11 +3109,12 @@ def get_all_keyspaces(self): @staticmethod def _build_keyspace_metadata_internal(row): - # necessary fields that aren't int virtual ks - row["durable_writes"] = row.get("durable_writes", None) - row["replication"] = row.get("replication", {}) - row["replication"]["class"] = row["replication"].get("class", None) - return super(SchemaParserV4, SchemaParserV4)._build_keyspace_metadata_internal(row) + # Read without mutating the row, since _RowView is 
read-only + name = row["keyspace_name"] + durable_writes = row.get("durable_writes", None) + replication = dict(row.get("replication")) if 'replication' in row else {} + replication_class = replication.pop("class") if 'class' in replication else None + return KeyspaceMetadata(name, durable_writes, replication_class, replication) class SchemaParserDSE67(SchemaParserV4): @@ -3517,7 +3518,7 @@ def get_column_from_system_local(connection, column_name: str, timeout, metadata , timeout=timeout, fail_on_error=False) if not success or not local_result.parsed_rows: return "" - local_rows = dict_factory(local_result.column_names, local_result.parsed_rows) + local_rows = _row_factory(local_result.column_names, local_result.parsed_rows) local_row = local_rows[0] return local_row.get(column_name) From a38bbfe7e30b450e49b5adbac1369e303b6b28e2 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Fri, 3 Apr 2026 18:26:17 +0300 Subject: [PATCH 5/6] (improvement) metadata: single-pass _build_table_columns Replace three list comprehension passes over col_rows with a single classification loop that sorts columns into partition, clustering, and other buckets. Also use in-place sort() instead of sorted() and reuse the already-built column_meta instead of a redundant dict lookup. 
--- cassandra/metadata.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/cassandra/metadata.py b/cassandra/metadata.py index 195a3fa50e..f6ad7f55b5 100644 --- a/cassandra/metadata.py +++ b/cassandra/metadata.py @@ -2792,31 +2792,40 @@ def _build_table_options(self, row): return dict((o, row.get(o)) for o in self.recognized_table_options if o in row) def _build_table_columns(self, meta, col_rows, compact_static=False, is_dense=False, virtual=False): - # partition key - partition_rows = [r for r in col_rows - if r.get('kind', None) == "partition_key"] + # Single-pass classification of column rows by kind + partition_rows = [] + clustering_rows = [] + other_rows = [] + for r in col_rows: + kind = r.get('kind', None) + if kind == "partition_key": + partition_rows.append(r) + elif kind == "clustering": + if not compact_static: + clustering_rows.append(r) + # else: skip clustering rows entirely for compact_static tables + else: + other_rows.append(r) + + # partition key - must be inserted first into meta.columns for CQL export ordering if len(partition_rows) > 1: - partition_rows = sorted(partition_rows, key=lambda row: row.get('position')) + partition_rows.sort(key=lambda row: row.get('position')) for r in partition_rows: - # we have to add meta here (and not in the later loop) because TableMetadata.columns is an - # dict (ordered since Python 3.7), and it assumes keys are inserted first, in order, when exporting CQL column_meta = self._build_column_metadata(meta, r) meta.columns[column_meta.name] = column_meta - meta.partition_key.append(meta.columns[r.get('column_name')]) + meta.partition_key.append(column_meta) # clustering key - if not compact_static: - clustering_rows = [r for r in col_rows - if r.get('kind', None) == "clustering"] + if clustering_rows: if len(clustering_rows) > 1: - clustering_rows = sorted(clustering_rows, key=lambda row: row.get('position')) + clustering_rows.sort(key=lambda row: 
row.get('position')) for r in clustering_rows: column_meta = self._build_column_metadata(meta, r) meta.columns[column_meta.name] = column_meta - meta.clustering_key.append(meta.columns[r.get('column_name')]) + meta.clustering_key.append(column_meta) - for col_row in (r for r in col_rows - if r.get('kind', None) not in ('partition_key', 'clustering')): + # remaining columns (static, regular, etc.) + for col_row in other_rows: column_meta = self._build_column_metadata(meta, col_row) if is_dense and column_meta.cql_type == types.cql_empty_type: continue From eda10866497186efe97b15ead968a1a76a40adb1 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Fri, 3 Apr 2026 18:27:18 +0300 Subject: [PATCH 6/6] tests: add _RowView and _row_factory unit tests Cover __getitem__, get(), __contains__, __repr__, shared index map, read-only enforcement, empty input, single-column, and multi-row scenarios. --- tests/unit/test_metadata.py | 66 ++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_metadata.py b/tests/unit/test_metadata.py index dcbb840447..6d33ccec93 100644 --- a/tests/unit/test_metadata.py +++ b/tests/unit/test_metadata.py @@ -32,7 +32,8 @@ _UnknownStrategy, ColumnMetadata, TableMetadata, IndexMetadata, Function, Aggregate, Metadata, TokenMap, ReplicationFactor, - SchemaParserDSE68) + SchemaParserDSE68, + _RowView, _row_factory) from cassandra.policies import SimpleConvictionPolicy from cassandra.pool import Host from cassandra.protocol import QueryMessage @@ -846,3 +847,66 @@ def test_strip_frozen(self): for argument, expected_result in argument_to_expected_results: result = strip_frozen(argument) assert result == expected_result, "strip_frozen() arg: {}".format(argument) + +class RowViewTest(unittest.TestCase): + """Tests for the internal _RowView and _row_factory helpers.""" + + def test_getitem(self): + rv = _RowView(("a_val", "b_val"), {"a": 0, "b": 1}) + self.assertEqual(rv["a"], "a_val") + 
self.assertEqual(rv["b"], "b_val") + + def test_getitem_missing_key(self): + rv = _RowView(("a_val",), {"a": 0}) + with self.assertRaises(KeyError): + rv["missing"] + + def test_get_present(self): + rv = _RowView(("a_val", "b_val"), {"a": 0, "b": 1}) + self.assertEqual(rv.get("a"), "a_val") + self.assertEqual(rv.get("b"), "b_val") + + def test_get_missing_returns_default(self): + rv = _RowView(("a_val",), {"a": 0}) + self.assertIsNone(rv.get("missing")) + self.assertEqual(rv.get("missing", 42), 42) + + def test_contains(self): + rv = _RowView(("a_val",), {"a": 0}) + self.assertIn("a", rv) + self.assertNotIn("b", rv) + + def test_repr(self): + rv = _RowView(("a_val", "b_val"), {"a": 0, "b": 1}) + r = repr(rv) + self.assertIn("'a'", r) + self.assertIn("'a_val'", r) + + def test_shared_index_map(self): + """All _RowView objects from the same _row_factory call share one index map.""" + rows = _row_factory(["x", "y"], [("x1", "y1"), ("x2", "y2")]) + self.assertIs(rows[0]._index_map, rows[1]._index_map) + + def test_read_only(self): + """_RowView must not allow item assignment or deletion.""" + rv = _RowView(("val",), {"col": 0}) + with self.assertRaises(TypeError): + rv["col"] = "new" + with self.assertRaises(TypeError): + del rv["col"] + + def test_row_factory_empty(self): + result = _row_factory(["a", "b"], []) + self.assertEqual(result, []) + + def test_row_factory_single_column(self): + rows = _row_factory(["only"], [("v1",), ("v2",)]) + self.assertEqual(rows[0]["only"], "v1") + self.assertEqual(rows[1]["only"], "v2") + + def test_row_factory_values(self): + rows = _row_factory(["id", "name"], [(1, "alice"), (2, "bob")]) + self.assertEqual(rows[0]["id"], 1) + self.assertEqual(rows[0]["name"], "alice") + self.assertEqual(rows[1]["id"], 2) + self.assertEqual(rows[1]["name"], "bob")