From 811a1523c3f7cb82b4018dae84bc9fc7648d0978 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 30 Sep 2025 10:18:50 -0400 Subject: [PATCH 01/37] Refactor schema collection --- postgres/datadog_checks/postgres/schemas.py | 83 +++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 postgres/datadog_checks/postgres/schemas.py diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py new file mode 100644 index 0000000000000..af92b36bd0d21 --- /dev/null +++ b/postgres/datadog_checks/postgres/schemas.py @@ -0,0 +1,83 @@ +import time + + +class SchemaCollector: + def __init__(self, check, config): + self._check = check + self._log = check.log + self._config = config + + self._reset() + + def _reset(self): + self._collection_started_at = None + self._collection_payloads_count = 0 + self._queued_rows = [] + + def collect_schemas(self): + self._collection_started_at = time.time() * 1000 + databases = self._get_databases() + for database in databases: + with self._get_cursor(database) as cursor: + next = self._get_next(cursor) + while True: + self._queued_rows.append(next) + next = self._get_next(cursor) + is_last_payload = database is databases[-1] and next is None + self.maybe_flush(is_last_payload) + if next is None: + break + self._reset() + + def maybe_flush(self, is_last_payload): + if len(self._queued_rows) > 10 or is_last_payload: + event = { + "host": self._check.reported_hostname, + "agent_version": datadog_agent.get_version(), + "dbms": "postgres", + "kind": "pg_databases", + "collection_interval": self._config.schemas_metadata_config.get("collection_interval"), + "dbms_version": self._check.version, + "tags": self._check.tags, + "cloud_metadata": self._check.cloud_metadata, + "metadata": self._queued_rows, + "collection_started_at": self._collection_started_at, + } + self._collection_payloads_count += 1 + if is_last_payload: + event["collection_payloads_count"] = self._payloads_count + self._check.database_monitoring_metadata(json.dumps(event)) + + self._queued_rows = [] + + def _get_databases(self): + pass + + def _get_cursor(self, database): + pass + + def _get_next(self, cursor): + pass + + +class PostgresSchemaCollector(SchemaCollector): + def __init__(self, check, config): + super().__init__(check, config) + + def collect_schemas(self): + pass + + def _get_databases(self): + cursor = self._check.get_main_db().cursor() + cursor.execute("SELECT datname FROM pg_database") + return [row[0] for row in cursor.fetchall()] + + def _get_cursor(self): + cursor = self._check.db_pool.get_connection(self._config.dbname).cursor() + cursor.execute("SELECT nspname FROM pg_namespace" + "MONSTER SQL STATEMENT GOES HERE" + ) + return cursor + + def _get_next(self, cursor): + return cursor.fetchone() \ No newline at end of file From 2b79e95622b5060d001ee9b2f78bbff5b3261aa1 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 30 Sep 2025 14:44:22 -0400 Subject: [PATCH 02/37] WIP --- postgres/datadog_checks/postgres/schemas.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index af92b36bd0d21..23c6b825bc6e1 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -14,20 +14,22 @@ def _reset(self): self._collection_payloads_count = 0 self._queued_rows = [] - def collect_schemas(self): + def collect_schemas(self) -> bool: + if self._collection_started_at is not None: + 
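+            # A collection pass is already in progress (the start timestamp is
+            # only cleared by _reset() once a pass finishes), so skip this run.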
return False self._collection_started_at = time.time() * 1000 databases = self._get_databases() for database in databases: with self._get_cursor(database) as cursor: next = self._get_next(cursor) - while True: + while next: self._queued_rows.append(next) next = self._get_next(cursor) is_last_payload = database is databases[-1] and next is None self.maybe_flush(is_last_payload) - if next is None: - break + self._reset() + return True def maybe_flush(self, is_last_payload): if len(self._queued_rows) > 10 or is_last_payload: From fab49b16d2185e51b1c0864fb870a78cd646fd45 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 30 Sep 2025 15:41:29 -0400 Subject: [PATCH 03/37] Make tests fast --- .../tests/compose/docker-compose-replication.yaml | 4 ++++ postgres/tests/compose/docker-compose.yaml | 1 + postgres/tests/conftest.py | 13 +++++++------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/postgres/tests/compose/docker-compose-replication.yaml b/postgres/tests/compose/docker-compose-replication.yaml index 4b2b56ff077f7..172c980fb8bdd 100644 --- a/postgres/tests/compose/docker-compose-replication.yaml +++ b/postgres/tests/compose/docker-compose-replication.yaml @@ -15,6 +15,7 @@ services: volumes: - ./resources:/docker-entrypoint-initdb.d/ - ./etc/postgresql:/etc/postgresql/ + - /tmp/postgres_${POSTGRES_IMAGE}:/var/lib/postgresql/data environment: POSTGRES_PASSWORD: datad0g POSTGRES_INITDB_ARGS: "--data-checksums --locale=${POSTGRES_LOCALE}" @@ -34,6 +35,7 @@ services: volumes: - ./resources_replica:/docker-entrypoint-initdb.d/ - ./etc/postgresql_replica:/etc/postgresql/ + - /tmp/postgres_${POSTGRES_IMAGE}_replica:/var/lib/postgresql/data environment: POSTGRES_PASSWORD: datad0g POSTGRES_INITDB_ARGS: "--data-checksums --locale=${POSTGRES_LOCALE}" @@ -53,6 +55,7 @@ services: volumes: - ./resources_replica2:/docker-entrypoint-initdb.d/ - ./etc/postgresql_replica2:/etc/postgresql/ + - /tmp/postgres_${POSTGRES_IMAGE}_replica_2:/var/lib/postgresql/data environment: POSTGRES_PASSWORD: datad0g POSTGRES_INITDB_ARGS: "--data-checksums --locale=${POSTGRES_LOCALE}" @@ -72,6 +75,7 @@ services: volumes: - ./resources_logical:/docker-entrypoint-initdb.d/ - ./etc/postgresql_logical_replica:/etc/postgresql/ + - /tmp/postgres_${POSTGRES_IMAGE}_logical_replica:/var/lib/postgresql/data environment: POSTGRES_PASSWORD: datad0g POSTGRES_INITDB_ARGS: "--data-checksums --locale=${POSTGRES_LOCALE}" diff --git a/postgres/tests/compose/docker-compose.yaml b/postgres/tests/compose/docker-compose.yaml index dc5ab631bdc0d..f51da17eb56df 100644 --- a/postgres/tests/compose/docker-compose.yaml +++ b/postgres/tests/compose/docker-compose.yaml @@ -11,6 +11,7 @@ services: volumes: - ./resources:/docker-entrypoint-initdb.d/ - ./etc/postgresql:/etc/postgresql/ + - /tmp/postgres_${POSTGRES_IMAGE}:/var/lib/postgresql/data environment: POSTGRES_PASSWORD: datad0g POSTGRES_INITDB_ARGS: "--data-checksums --locale=${POSTGRES_LOCALE}" diff --git a/postgres/tests/conftest.py b/postgres/tests/conftest.py index 476f2342463fd..5a10d3f903484 100644 --- a/postgres/tests/conftest.py +++ b/postgres/tests/conftest.py @@ -63,12 +63,13 @@ def dd_environment(e2e_instance): compose_file = 'docker-compose.yaml' if float(POSTGRES_VERSION) >= 10.0: compose_file = 'docker-compose-replication.yaml' - with docker_run( - os.path.join(HERE, 'compose', compose_file), - conditions=[WaitFor(connect_to_pg)], - env_vars={"POSTGRES_IMAGE": POSTGRES_IMAGE, "POSTGRES_LOCALE": POSTGRES_LOCALE}, - ): - yield e2e_instance, E2E_METADATA + return 
e2e_instance, E2E_METADATA + # with docker_run( + # os.path.join(HERE, 'compose', compose_file), + # conditions=[WaitFor(connect_to_pg)], + # env_vars={"POSTGRES_IMAGE": POSTGRES_IMAGE, "POSTGRES_LOCALE": POSTGRES_LOCALE}, + # ): + # yield e2e_instance, E2E_METADATA @pytest.fixture From e79a1c5242a646168d2c1463dfc08061b9bd7d92 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 30 Sep 2025 15:57:14 -0400 Subject: [PATCH 04/37] Databases filter --- postgres/datadog_checks/postgres/schemas.py | 44 ++++++++++++++------ postgres/tests/conftest.py | 12 +++--- postgres/tests/test_schemas.py | 45 +++++++++++++++++++++ 3 files changed, 83 insertions(+), 18 deletions(-) create mode 100644 postgres/tests/test_schemas.py diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 23c6b825bc6e1..bba18e2f38f1f 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -1,11 +1,27 @@ import time +import orjson as json + +from datadog_checks.postgres.postgres import PostgreSql + +try: + import datadog_agent +except ImportError: + from datadog_checks.base.stubs import datadog_agent + class SchemaCollector: - def __init__(self, check, config): + def __init__(self, check: PostgreSql): self._check = check self._log = check.log - self._config = config + self._config = check._config.schemas_metadata_config + + self._include_databases = self._config.get("include_databases", []) + self._include_schemas = self._config.get("include_schemas", []) + self._include_tables = self._config.get("include_tables", []) + self._exclude_databases = self._config.get("exclude_databases", []) + self._exclude_schemas = self._config.get("exclude_schemas", []) + self._exclude_tables = self._config.get("exclude_tables", []) self._reset() @@ -27,7 +43,7 @@ def collect_schemas(self) -> bool: next = self._get_next(cursor) is_last_payload = database is databases[-1] and next is None self.maybe_flush(is_last_payload) - + self._reset() return True @@ -63,23 +79,27 @@ def _get_next(self, cursor): class PostgresSchemaCollector(SchemaCollector): - def __init__(self, check, config): - super().__init__(check, config) + def __init__(self, check): + super().__init__(check) def collect_schemas(self): pass def _get_databases(self): - cursor = self._check.get_main_db().cursor() - cursor.execute("SELECT datname FROM pg_database") - return [row[0] for row in cursor.fetchall()] + with self._check._get_main_db() as conn: + with conn.cursor() as cursor: + query = "SELECT datname FROM pg_database WHERE 1=1" + for exclude_regex in self._exclude_databases: + query += " AND datname !~ '{}'".format(exclude_regex) + for include_regex in self._include_databases: + query += " AND datname ~ '{}'".format(include_regex) + cursor.execute(query) + return [row[0] for row in cursor.fetchall()] def _get_cursor(self): cursor = self._check.db_pool.get_connection(self._config.dbname).cursor() - cursor.execute("SELECT nspname FROM pg_namespace" - "MONSTER SQL STATEMENT GOES HERE" - ) + cursor.execute("SELECT nspname FROM pg_namespaceMONSTER SQL STATEMENT GOES HERE") return cursor def _get_next(self, cursor): - return cursor.fetchone() \ No newline at end of file + return cursor.fetchone() diff --git a/postgres/tests/conftest.py b/postgres/tests/conftest.py index 5a10d3f903484..a1943c7133a32 100644 --- a/postgres/tests/conftest.py +++ b/postgres/tests/conftest.py @@ -8,7 +8,7 @@ import pytest from semver import VersionInfo -from datadog_checks.dev import WaitFor, docker_run 
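+# NOTE: the docker-compose environment is temporarily bypassed below so tests
+# can reuse an already-running Postgres (see "Make tests fast" above).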
+# from datadog_checks.dev import WaitFor, docker_run from datadog_checks.postgres import PostgreSql from datadog_checks.postgres.config import PostgresConfig from datadog_checks.postgres.metrics_cache import PostgresMetricsCache @@ -21,8 +21,8 @@ PORT_REPLICA, PORT_REPLICA2, PORT_REPLICA_LOGICAL, - POSTGRES_IMAGE, - POSTGRES_LOCALE, + # POSTGRES_IMAGE, + # POSTGRES_LOCALE, POSTGRES_VERSION, USER, ) @@ -60,9 +60,9 @@ def dd_environment(e2e_instance): """ Start a standalone postgres server requiring authentication. """ - compose_file = 'docker-compose.yaml' - if float(POSTGRES_VERSION) >= 10.0: - compose_file = 'docker-compose-replication.yaml' + # compose_file = 'docker-compose.yaml' + # if float(POSTGRES_VERSION) >= 10.0: + # compose_file = 'docker-compose-replication.yaml' return e2e_instance, E2E_METADATA # with docker_run( # os.path.join(HERE, 'compose', compose_file), diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py new file mode 100644 index 0000000000000..775f4fc794565 --- /dev/null +++ b/postgres/tests/test_schemas.py @@ -0,0 +1,45 @@ +# (C) Datadog, Inc. 2023-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import pytest + +from datadog_checks.postgres.schemas import PostgresSchemaCollector + +pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')] + + +@pytest.fixture +def dbm_instance(pg_instance): + pg_instance['dbm'] = True + pg_instance['min_collection_interval'] = 0.1 + pg_instance['query_samples'] = {'enabled': False} + pg_instance['query_activity'] = {'enabled': False} + pg_instance['query_metrics'] = {'enabled': False} + pg_instance['collect_resources'] = {'enabled': False, 'run_sync': True} + pg_instance['collect_settings'] = {'enabled': False, 'run_sync': True} + pg_instance['collect_schemas'] = {'enabled': True, 'run_sync': True} + return pg_instance + + +def test_get_databases(dbm_instance, integration_check): + check = integration_check(dbm_instance) + collector = PostgresSchemaCollector(check) + + databases = collector._get_databases() + assert 'postgres' in databases + assert 'dogs' in databases + assert 'dogs_23' in databases + assert 'nope' not in databases + + +def test_databases_filters(dbm_instance, integration_check): + dbm_instance['collect_schemas']['exclude_databases'] = ['^dogs$', 'dogs_2(\\d)+'] + check = integration_check(dbm_instance) + collector = PostgresSchemaCollector(check) + + databases = collector._get_databases() + assert 'postgres' in databases + assert 'dogs' not in databases + assert 'dogs_23' not in databases + assert 'dogs_34' in databases + assert 'nope' not in databases From 45386c707d13158b0dd2aabae91568f596b1a588 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 30 Sep 2025 16:22:08 -0400 Subject: [PATCH 05/37] WIP --- postgres/datadog_checks/postgres/schemas.py | 130 +++++++++++++++++++- postgres/tests/test_schemas.py | 25 ++++ 2 files changed, 151 insertions(+), 4 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index bba18e2f38f1f..eed8c1a31d955 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -1,6 +1,8 @@ +import contextlib import time import orjson as json +from psycopg.rows import dict_row from datadog_checks.postgres.postgres import PostgreSql @@ -15,6 +17,7 @@ def __init__(self, check: PostgreSql): self._check = check self._log = check.log self._config = check._config.schemas_metadata_config + 
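+        # Debug output of the resolved schema collection config.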
print(self._config) self._include_databases = self._config.get("include_databases", []) self._include_schemas = self._config.get("include_schemas", []) @@ -77,6 +80,106 @@ def _get_cursor(self, database): def _get_next(self, cursor): pass +PG_TABLES_QUERY_V10_PLUS = """ +SELECT c.oid AS id, + c.relname AS name, + c.relhasindex AS has_indexes, + c.relowner :: regrole AS owner, + ( CASE + WHEN c.relkind = 'p' THEN TRUE + ELSE FALSE + END ) AS has_partitions, + t.relname AS toast_table +FROM pg_class c + left join pg_class t + ON c.reltoastrelid = t.oid +WHERE c.relkind IN ( 'r', 'p', 'f' ) + AND c.relispartition != 't' + AND c.relnamespace = {schema_oid} + {filter}; +""" + +PG_TABLES_QUERY_V9 = """ +SELECT c.oid AS id, + c.relname AS name, + c.relhasindex AS has_indexes, + c.relowner :: regrole AS owner, + t.relname AS toast_table +FROM pg_class c + left join pg_class t + ON c.reltoastrelid = t.oid +WHERE c.relkind IN ( 'r', 'f' ) + AND c.relnamespace = {schema_oid} + {filter}; +""" + + +SCHEMA_QUERY = """ +SELECT nsp.oid AS schema_id, + nspname AS schema_name, + nspowner :: regrole AS schema_owner +FROM pg_namespace nsp + LEFT JOIN pg_roles r on nsp.nspowner = r.oid +WHERE nspname NOT IN ( 'information_schema', 'pg_catalog' ) + AND nspname NOT LIKE 'pg_toast%' + AND nspname NOT LIKE 'pg_temp_%' +""" + +PG_INDEXES_QUERY = """ +SELECT + c.relname AS name, + ix.indrelid AS table_id, + pg_get_indexdef(c.oid) AS definition, + ix.indisunique AS is_unique, + ix.indisexclusion AS is_exclusion, + ix.indimmediate AS is_immediate, + ix.indisclustered AS is_clustered, + ix.indisvalid AS is_valid, + ix.indcheckxmin AS is_checkxmin, + ix.indisready AS is_ready, + ix.indislive AS is_live, + ix.indisreplident AS is_replident, + ix.indpred IS NOT NULL AS is_partial +FROM + pg_index ix +JOIN + pg_class c +ON + c.oid = ix.indexrelid +WHERE + ix.indrelid IN ({table_ids}); +""" + +PG_CHECK_FOR_FOREIGN_KEY = """ +SELECT count(conname) +FROM pg_constraint +WHERE contype = 'f' + AND conrelid = {oid}; +""" + +PG_CONSTRAINTS_QUERY = """ +SELECT conname AS name, + pg_get_constraintdef(oid) AS definition, + conrelid AS id +FROM pg_constraint +WHERE contype = 'f' + AND conrelid IN ({table_ids}); +""" + +COLUMNS_QUERY = """ +SELECT attname AS name, + Format_type(atttypid, atttypmod) AS data_type, + NOT attnotnull AS nullable, + pg_get_expr(adbin, adrelid) AS default, + attrelid AS id +FROM pg_attribute + LEFT JOIN pg_attrdef ad + ON adrelid = attrelid + AND adnum = attnum +WHERE attrelid IN ({table_ids}) + AND attnum > 0 + AND NOT attisdropped; +""" class PostgresSchemaCollector(SchemaCollector): def __init__(self, check): @@ -96,10 +199,29 @@ def _get_databases(self): cursor.execute(query) return [row[0] for row in cursor.fetchall()] - def _get_cursor(self): - cursor = self._check.db_pool.get_connection(self._config.dbname).cursor() - cursor.execute("SELECT nspname FROM pg_namespaceMONSTER SQL STATEMENT GOES HERE") - return cursor + @contextlib.contextmanager + def _get_cursor(self, database_name): + with self._check.db_pool.get_connection(database_name) as conn: + with conn.cursor(row_factory=dict_row) as cursor: + schemas_query = self._get_schemas_query() + query = f""" + WITH schemas AS( + {schemas_query} + ) + + SELECT * FROM schemas + """ + print(query) + cursor.execute(query) + yield cursor + + def _get_schemas_query(self): + query = SCHEMA_QUERY + for exclude_regex in self._exclude_schemas: + query += " AND nspname !~ '{}'".format(exclude_regex) + for include_regex in self._include_schemas: + query += " AND 
nspname ~ '{}'".format(include_regex) + return query def _get_next(self, cursor): return cursor.fetchone() diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 775f4fc794565..fb6eed793fd58 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -43,3 +43,28 @@ def test_databases_filters(dbm_instance, integration_check): assert 'dogs_23' not in databases assert 'dogs_34' in databases assert 'nope' not in databases + +def test_get_cursor(dbm_instance, integration_check): + check = integration_check(dbm_instance) + collector = PostgresSchemaCollector(check) + + with collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + schemas = [] + for row in cursor: + schemas.append(row['schema_name']) + + assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} + +def test_schemas_filters(dbm_instance, integration_check): + dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test'] + check = integration_check(dbm_instance) + collector = PostgresSchemaCollector(check) + + with collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + schemas = [] + for row in cursor: + schemas.append(row['schema_name']) + + assert set(schemas) == {'datadog', 'hstore'} \ No newline at end of file From 3f20a6a652774d2996b4ddeb3553649de4c364d9 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 1 Oct 2025 14:56:12 -0400 Subject: [PATCH 06/37] Fix --- postgres/datadog_checks/postgres/schemas.py | 24 ++++++++------------- postgres/tests/conftest.py | 2 +- postgres/tests/test_schemas.py | 6 ++++-- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index eed8c1a31d955..96b435b25e086 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -16,15 +16,7 @@ class SchemaCollector: def __init__(self, check: PostgreSql): self._check = check self._log = check.log - self._config = check._config.schemas_metadata_config - print(self._config) - - self._include_databases = self._config.get("include_databases", []) - self._include_schemas = self._config.get("include_schemas", []) - self._include_tables = self._config.get("include_tables", []) - self._exclude_databases = self._config.get("exclude_databases", []) - self._exclude_schemas = self._config.get("exclude_schemas", []) - self._exclude_tables = self._config.get("exclude_tables", []) + self._config = check._config.collect_schemas self._reset() @@ -57,7 +49,7 @@ def maybe_flush(self, is_last_payload): "agent_version": datadog_agent.get_version(), "dbms": "postgres", "kind": "pg_databases", - "collection_interval": self._config.schemas_metadata_config.get("collection_interval"), + "collection_interval": self._config.collection_interval, "dbms_version": self._check.version, "tags": self._check.tags, "cloud_metadata": self._check.cloud_metadata, @@ -66,7 +58,7 @@ def maybe_flush(self, is_last_payload): } self._collection_payloads_count += 1 if is_last_payload: - event["collection_payloads_count"] = self._payloads_count + event["collection_payloads_count"] = self._collection_payloads_count self._check.database_monitoring_metadata(json.dumps(event)) self._queued_rows = [] @@ -80,6 +72,7 @@ def _get_cursor(self, database): def _get_next(self, cursor): pass + PG_TABLES_QUERY_V10_PLUS = """ SELECT c.oid AS id, c.relname AS name, @@ -181,6 +174,7 @@ def _get_next(self, cursor): AND NOT 
attisdropped; """ + class PostgresSchemaCollector(SchemaCollector): def __init__(self, check): super().__init__(check) @@ -192,9 +186,9 @@ def _get_databases(self): with self._check._get_main_db() as conn: with conn.cursor() as cursor: query = "SELECT datname FROM pg_database WHERE 1=1" - for exclude_regex in self._exclude_databases: + for exclude_regex in self._config.exclude_databases: query += " AND datname !~ '{}'".format(exclude_regex) - for include_regex in self._include_databases: + for include_regex in self._config.include_databases: query += " AND datname ~ '{}'".format(include_regex) cursor.execute(query) return [row[0] for row in cursor.fetchall()] @@ -217,9 +211,9 @@ def _get_cursor(self, database_name): def _get_schemas_query(self): query = SCHEMA_QUERY - for exclude_regex in self._exclude_schemas: + for exclude_regex in self._config.exclude_schemas: query += " AND nspname !~ '{}'".format(exclude_regex) - for include_regex in self._include_schemas: + for include_regex in self._config.include_schemas: query += " AND nspname ~ '{}'".format(include_regex) return query diff --git a/postgres/tests/conftest.py b/postgres/tests/conftest.py index a9e26e849a89a..497493de97b39 100644 --- a/postgres/tests/conftest.py +++ b/postgres/tests/conftest.py @@ -72,7 +72,7 @@ def dd_environment(e2e_instance): # env_vars={"POSTGRES_IMAGE": POSTGRES_IMAGE, "POSTGRES_LOCALE": POSTGRES_LOCALE}, # capture=True, # ): - # yield e2e_instance, E2E_METADATA + # yield e2e_instance, E2E_METADATA return e2e_instance, E2E_METADATA diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index fb6eed793fd58..af1be8a532eb6 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -44,6 +44,7 @@ def test_databases_filters(dbm_instance, integration_check): assert 'dogs_34' in databases assert 'nope' not in databases + def test_get_cursor(dbm_instance, integration_check): check = integration_check(dbm_instance) collector = PostgresSchemaCollector(check) @@ -54,7 +55,8 @@ def test_get_cursor(dbm_instance, integration_check): for row in cursor: schemas.append(row['schema_name']) - assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} + assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} + def test_schemas_filters(dbm_instance, integration_check): dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test'] @@ -67,4 +69,4 @@ def test_schemas_filters(dbm_instance, integration_check): for row in cursor: schemas.append(row['schema_name']) - assert set(schemas) == {'datadog', 'hstore'} \ No newline at end of file + assert set(schemas) == {'datadog', 'hstore'} From 62b9510c1b3dabd8b5142341e4c3372c56e8300f Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 1 Oct 2025 15:13:49 -0400 Subject: [PATCH 07/37] WIP --- postgres/datadog_checks/postgres/schemas.py | 35 +++++++++++----- postgres/tests/test_schemas.py | 45 ++++++++++++++++++++- 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 96b435b25e086..df3d9a3dfcd85 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -5,6 +5,7 @@ from psycopg.rows import dict_row from datadog_checks.postgres.postgres import PostgreSql +from datadog_checks.postgres.version_utils import VersionUtils try: import datadog_agent @@ -74,8 +75,9 @@ def _get_next(self, cursor): PG_TABLES_QUERY_V10_PLUS = """ 
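+-- Ordinary ('r'), partitioned ('p'), and foreign ('f') tables; partition
+-- children are excluded and any TOAST table name is resolved via the join.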
-SELECT c.oid AS id, - c.relname AS name, +SELECT c.oid AS table_id, + c.relnamespace AS schema_id, + c.relname AS table_name, c.relhasindex AS has_indexes, c.relowner :: regrole AS owner, ( CASE @@ -88,13 +90,12 @@ def _get_next(self, cursor): ON c.reltoastrelid = t.oid WHERE c.relkind IN ( 'r', 'p', 'f' ) AND c.relispartition != 't' - AND c.relnamespace = {schema_oid} - {filter}; """ PG_TABLES_QUERY_V9 = """ -SELECT c.oid AS id, - c.relname AS name, +SELECT c.oid AS table_id, + c.relnamespace AS schema_id, + c.relname AS table_name, c.relhasindex AS has_indexes, c.relowner :: regrole AS owner, t.relname AS toast_table @@ -102,8 +103,6 @@ def _get_next(self, cursor): left join pg_class t ON c.reltoastrelid = t.oid WHERE c.relkind IN ( 'r', 'f' ) - AND c.relnamespace = {schema_oid} - {filter}; """ @@ -198,12 +197,19 @@ def _get_cursor(self, database_name): with self._check.db_pool.get_connection(database_name) as conn: with conn.cursor(row_factory=dict_row) as cursor: schemas_query = self._get_schemas_query() + tables_query = self._get_tables_query() query = f""" WITH schemas AS( {schemas_query} + ), + + tables AS ( + {tables_query} ) - SELECT * FROM schemas + SELECT schemas.schema_name, tables.table_name + FROM schemas + LEFT JOIN tables ON schemas.schema_id = tables.schema_id """ print(query) cursor.execute(query) @@ -217,5 +223,16 @@ def _get_schemas_query(self): query += " AND nspname ~ '{}'".format(include_regex) return query + def _get_tables_query(self): + if VersionUtils.transform_version(str(self._check.version))["version.major"] == "9": + query = PG_TABLES_QUERY_V9 + else: + query = PG_TABLES_QUERY_V10_PLUS + for exclude_regex in self._config.exclude_tables: + query += " AND relname !~ '{}'".format(exclude_regex) + for include_regex in self._config.include_tables: + query += " AND relname ~ '{}'".format(include_regex) + return query + def _get_next(self, cursor): return cursor.fetchone() diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index af1be8a532eb6..636d73650813b 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -45,8 +45,10 @@ def test_databases_filters(dbm_instance, integration_check): assert 'nope' not in databases -def test_get_cursor(dbm_instance, integration_check): +@pytest.mark.parametrize("version", ["9", "10"]) +def test_get_cursor(dbm_instance, integration_check, version): check = integration_check(dbm_instance) + check.version = version collector = PostgresSchemaCollector(check) with collector._get_cursor('datadog_test') as cursor: @@ -58,9 +60,11 @@ def test_get_cursor(dbm_instance, integration_check): assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} -def test_schemas_filters(dbm_instance, integration_check): +@pytest.mark.parametrize("version", ["9", "10"]) +def test_schemas_filters(dbm_instance, integration_check, version): dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test'] check = integration_check(dbm_instance) + check.version = version collector = PostgresSchemaCollector(check) with collector._get_cursor('datadog_test') as cursor: @@ -70,3 +74,40 @@ def test_schemas_filters(dbm_instance, integration_check): schemas.append(row['schema_name']) assert set(schemas) == {'datadog', 'hstore'} + + +@pytest.mark.parametrize("version", ["9", "10"]) +def test_tables(dbm_instance, integration_check, version): + check = integration_check(dbm_instance) + check.version = version + collector = PostgresSchemaCollector(check) + + with 
collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + tables = [] + for row in cursor: + if row['table_name']: + tables.append(row['table_name']) + + assert set(tables) == { + 'persons', + 'personsdup1', + 'personsdup2', + 'personsdup3', + 'personsdup4', + 'personsdup5', + 'personsdup6', + 'personsdup7', + 'personsdup8', + 'personsdup9', + 'personsdup10', + 'personsdup11', + 'personsdup12', + 'personsdup13', + 'persons_indexed', + 'pgtable', + 'pg_newtable', + 'cities', + 'rds_admin_misc', + 'sample_foreign_d73a8c', + } From 09665f731d60481722fa92bba0952aec814302d1 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 1 Oct 2025 16:59:28 -0400 Subject: [PATCH 08/37] WIP --- postgres/datadog_checks/postgres/schemas.py | 21 +++++++++++++-------- postgres/tests/test_schemas.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index df3d9a3dfcd85..9dfd054ea173d 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -163,14 +163,13 @@ def _get_next(self, cursor): Format_type(atttypid, atttypmod) AS data_type, NOT attnotnull AS nullable, pg_get_expr(adbin, adrelid) AS default, - attrelid AS id + attrelid AS table_id FROM pg_attribute LEFT JOIN pg_attrdef ad ON adrelid = attrelid AND adnum = attnum -WHERE attrelid IN ({table_ids}) - AND attnum > 0 - AND NOT attisdropped; +WHERE attnum > 0 + AND NOT attisdropped """ @@ -198,20 +197,26 @@ def _get_cursor(self, database_name): with conn.cursor(row_factory=dict_row) as cursor: schemas_query = self._get_schemas_query() tables_query = self._get_tables_query() + columns_query = COLUMNS_QUERY query = f""" - WITH schemas AS( + WITH + schemas AS( {schemas_query} ), - tables AS ( {tables_query} + ), + columns AS ( + {columns_query} ) - SELECT schemas.schema_name, tables.table_name + SELECT schemas.schema_name, tables.table_name, array_agg(row_to_json(columns.*)) as columns FROM schemas LEFT JOIN tables ON schemas.schema_id = tables.schema_id + LEFT JOIN columns ON tables.table_id = columns.table_id + GROUP BY schemas.schema_name, tables.table_name """ - print(query) + # print(query) cursor.execute(query) yield cursor diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 636d73650813b..94623497cb76d 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -111,3 +111,17 @@ def test_tables(dbm_instance, integration_check, version): 'rds_admin_misc', 'sample_foreign_d73a8c', } + +@pytest.mark.parametrize("version", ["9", "10"]) +def test_columns(dbm_instance, integration_check, version): + check = integration_check(dbm_instance) + check.version = version + collector = PostgresSchemaCollector(check) + + with collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + for row in cursor: + if row['columns'] and row['columns'] != [None]: + for column in row['columns']: + assert column['name'] is not None + assert column['data_type'] is not None From 169a7d67e135fe67726a6b6681e1ed44ad991ba6 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Thu, 2 Oct 2025 15:18:35 -0400 Subject: [PATCH 09/37] WIP --- postgres/datadog_checks/postgres/schemas.py | 2 +- postgres/tests/test_schemas.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 9dfd054ea173d..b375f24822d72 100644 --- 
a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -199,7 +199,7 @@ def _get_cursor(self, database_name): tables_query = self._get_tables_query() columns_query = COLUMNS_QUERY query = f""" - WITH + WITH schemas AS( {schemas_query} ), diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 94623497cb76d..d589cd3466521 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -41,7 +41,7 @@ def test_databases_filters(dbm_instance, integration_check): assert 'postgres' in databases assert 'dogs' not in databases assert 'dogs_23' not in databases - assert 'dogs_34' in databases + assert 'dogs_14' in databases assert 'nope' not in databases @@ -112,6 +112,7 @@ def test_tables(dbm_instance, integration_check, version): 'sample_foreign_d73a8c', } + @pytest.mark.parametrize("version", ["9", "10"]) def test_columns(dbm_instance, integration_check, version): check = integration_check(dbm_instance) From 2d6510f8b8a0002e1ef2e92d9c678da1e8ffdfb4 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Thu, 2 Oct 2025 15:23:52 -0400 Subject: [PATCH 10/37] Fix tests --- postgres/tests/compose/docker-compose.yaml | 2 +- postgres/tests/test_schemas.py | 32 +++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/postgres/tests/compose/docker-compose.yaml b/postgres/tests/compose/docker-compose.yaml index f51da17eb56df..227e82a3b2636 100644 --- a/postgres/tests/compose/docker-compose.yaml +++ b/postgres/tests/compose/docker-compose.yaml @@ -11,7 +11,7 @@ services: volumes: - ./resources:/docker-entrypoint-initdb.d/ - ./etc/postgresql:/etc/postgresql/ - - /tmp/postgres_${POSTGRES_IMAGE}:/var/lib/postgresql/data + # - /tmp/postgres_${POSTGRES_IMAGE}:/var/lib/postgresql/data environment: POSTGRES_PASSWORD: datad0g POSTGRES_INITDB_ARGS: "--data-checksums --locale=${POSTGRES_LOCALE}" diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index d589cd3466521..3c2027770c15f 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -4,6 +4,7 @@ import pytest from datadog_checks.postgres.schemas import PostgresSchemaCollector +from .common import POSTGRES_VERSION pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')] @@ -28,27 +29,26 @@ def test_get_databases(dbm_instance, integration_check): databases = collector._get_databases() assert 'postgres' in databases assert 'dogs' in databases - assert 'dogs_23' in databases + assert 'dogs_3' in databases assert 'nope' not in databases def test_databases_filters(dbm_instance, integration_check): - dbm_instance['collect_schemas']['exclude_databases'] = ['^dogs$', 'dogs_2(\\d)+'] + dbm_instance['collect_schemas']['exclude_databases'] = ['^dogs$', 'dogs_[345]'] check = integration_check(dbm_instance) collector = PostgresSchemaCollector(check) databases = collector._get_databases() assert 'postgres' in databases assert 'dogs' not in databases - assert 'dogs_23' not in databases - assert 'dogs_14' in databases + assert 'dogs_3' not in databases + assert 'dogs_9' in databases assert 'nope' not in databases -@pytest.mark.parametrize("version", ["9", "10"]) -def test_get_cursor(dbm_instance, integration_check, version): +def test_get_cursor(dbm_instance, integration_check): check = integration_check(dbm_instance) - check.version = version + check.version = POSTGRES_VERSION collector = PostgresSchemaCollector(check) with collector._get_cursor('datadog_test') as cursor: @@ -60,11 +60,11 
@@ def test_get_cursor(dbm_instance, integration_check, version): assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} -@pytest.mark.parametrize("version", ["9", "10"]) -def test_schemas_filters(dbm_instance, integration_check, version): + +def test_schemas_filters(dbm_instance, integration_check): dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test'] check = integration_check(dbm_instance) - check.version = version + check.version = POSTGRES_VERSION collector = PostgresSchemaCollector(check) with collector._get_cursor('datadog_test') as cursor: @@ -76,10 +76,10 @@ def test_schemas_filters(dbm_instance, integration_check, version): assert set(schemas) == {'datadog', 'hstore'} -@pytest.mark.parametrize("version", ["9", "10"]) -def test_tables(dbm_instance, integration_check, version): + +def test_tables(dbm_instance, integration_check): check = integration_check(dbm_instance) - check.version = version + check.version = POSTGRES_VERSION collector = PostgresSchemaCollector(check) with collector._get_cursor('datadog_test') as cursor: @@ -113,10 +113,10 @@ def test_tables(dbm_instance, integration_check, version): } -@pytest.mark.parametrize("version", ["9", "10"]) -def test_columns(dbm_instance, integration_check, version): + +def test_columns(dbm_instance, integration_check): check = integration_check(dbm_instance) - check.version = version + check.version = POSTGRES_VERSION collector = PostgresSchemaCollector(check) with collector._get_cursor('datadog_test') as cursor: From baa464985db7967022db4b89c6d97d1fe5ade4ea Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Thu, 2 Oct 2025 15:49:51 -0400 Subject: [PATCH 11/37] Fix indexes and columns --- postgres/datadog_checks/postgres/schemas.py | 17 +++++++++--- postgres/tests/test_schemas.py | 29 ++++++++++++++++++--- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index b375f24822d72..3f5d36e117753 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -138,8 +138,6 @@ def _get_next(self, cursor): pg_class c ON c.oid = ix.indexrelid -WHERE - ix.indrelid IN ({table_ids}); """ PG_CHECK_FOR_FOREIGN_KEY = """ @@ -198,6 +196,8 @@ def _get_cursor(self, database_name): schemas_query = self._get_schemas_query() tables_query = self._get_tables_query() columns_query = COLUMNS_QUERY + indexes_query = PG_INDEXES_QUERY + limit = self._config.max_tables or 1_000_000 query = f""" WITH schemas AS( @@ -208,13 +208,22 @@ def _get_cursor(self, database_name): ), columns AS ( {columns_query} + ), + indexes AS ( + {indexes_query} ) - SELECT schemas.schema_name, tables.table_name, array_agg(row_to_json(columns.*)) as columns + SELECT schemas.schema_id, schemas.schema_name, + tables.table_id, tables.table_name, + array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns, + array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes FROM schemas LEFT JOIN tables ON schemas.schema_id = tables.schema_id LEFT JOIN columns ON tables.table_id = columns.table_id - GROUP BY schemas.schema_name, tables.table_name + LEFT JOIN indexes ON tables.table_id = indexes.table_id + GROUP BY schemas.schema_id, schemas.schema_name, tables.table_id, tables.table_name + LIMIT {limit} + ; """ # print(query) cursor.execute(query) diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 
3c2027770c15f..7e59beb6a691f 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -4,6 +4,7 @@ import pytest from datadog_checks.postgres.schemas import PostgresSchemaCollector + from .common import POSTGRES_VERSION pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')] @@ -60,7 +61,6 @@ def test_get_cursor(dbm_instance, integration_check): assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} - def test_schemas_filters(dbm_instance, integration_check): dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test'] check = integration_check(dbm_instance) @@ -76,7 +76,6 @@ def test_schemas_filters(dbm_instance, integration_check): assert set(schemas) == {'datadog', 'hstore'} - def test_tables(dbm_instance, integration_check): check = integration_check(dbm_instance) check.version = POSTGRES_VERSION @@ -113,7 +112,6 @@ def test_tables(dbm_instance, integration_check): } - def test_columns(dbm_instance, integration_check): check = integration_check(dbm_instance) check.version = POSTGRES_VERSION @@ -121,8 +119,31 @@ def test_columns(dbm_instance, integration_check): with collector._get_cursor('datadog_test') as cursor: assert cursor is not None + # Assert that at least one row has columns + assert any(row['columns'] for row in cursor) for row in cursor: - if row['columns'] and row['columns'] != [None]: + if row['columns']: for column in row['columns']: assert column['name'] is not None assert column['data_type'] is not None + if row['table_name'] == 'cities': + assert row['columns'] + assert row['columns'][0]['name'] + +def test_indexes(dbm_instance, integration_check): + check = integration_check(dbm_instance) + check.version = POSTGRES_VERSION + collector = PostgresSchemaCollector(check) + + with collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + # Assert that at least one row has indexes + assert any(row['indexes'] for row in cursor) + for row in cursor: + if row['indexes']: + for index in row['indexes']: + assert index['name'] is not None + assert index['definition'] is not None + if row['table_name'] == 'cities': + assert row['indexes'] + assert row['indexes'][0]['name'] From 978d13f1df1252da659a1f7ebbeb92256dd0e84b Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 10:18:26 -0400 Subject: [PATCH 12/37] Partitions --- postgres/datadog_checks/postgres/schemas.py | 91 ++++++++++++++++----- postgres/tests/test_schemas.py | 2 +- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 3f5d36e117753..cb7387744dd25 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -117,6 +117,23 @@ def _get_next(self, cursor): AND nspname NOT LIKE 'pg_temp_%' """ +COLUMNS_QUERY = """ +SELECT attname AS name, + Format_type(atttypid, atttypmod) AS data_type, + NOT attnotnull AS nullable, + pg_get_expr(adbin, adrelid) AS default, + attrelid AS table_id +FROM pg_attribute + LEFT JOIN pg_attrdef ad + ON adrelid = attrelid + AND adnum = attnum +WHERE attnum > 0 + AND NOT attisdropped +""" + + + + PG_INDEXES_QUERY = """ SELECT c.relname AS name, @@ -140,36 +157,40 @@ def _get_next(self, cursor): c.oid = ix.indexrelid """ -PG_CHECK_FOR_FOREIGN_KEY = """ -SELECT count(conname) -FROM pg_constraint -WHERE contype = 'f' - AND conrelid = {oid}; -""" PG_CONSTRAINTS_QUERY = """ SELECT conname AS name, 
pg_get_constraintdef(oid) AS definition, - conrelid AS id + conrelid AS table_id FROM pg_constraint WHERE contype = 'f' - AND conrelid IN ({table_ids}); """ -COLUMNS_QUERY = """ -SELECT attname AS name, - Format_type(atttypid, atttypmod) AS data_type, - NOT attnotnull AS nullable, - pg_get_expr(adbin, adrelid) AS default, - attrelid AS table_id -FROM pg_attribute - LEFT JOIN pg_attrdef ad - ON adrelid = attrelid - AND adnum = attnum -WHERE attnum > 0 - AND NOT attisdropped + +PARTITION_KEY_QUERY = """ +SELECT relname, + pg_get_partkeydef(oid) AS partition_key, + oid AS table_id +FROM pg_class """ +NUM_PARTITIONS_QUERY = """ +SELECT count(inhrelid :: regclass) AS num_partitions, inhparent as table_id +FROM pg_inherits +GROUP BY inhparent; +""" + +PARTITION_ACTIVITY_QUERY = """ +SELECT pi.inhparent :: regclass AS parent_table_name, + SUM(COALESCE(psu.seq_scan, 0) + COALESCE(psu.idx_scan, 0)) AS total_activity, + pi.inhparent as table_id +FROM pg_catalog.pg_stat_user_tables psu + join pg_class pc + ON psu.relname = pc.relname + join pg_inherits pi + ON pi.inhrelid = pc.oid +GROUP BY pi.inhparent +""" class PostgresSchemaCollector(SchemaCollector): def __init__(self, check): @@ -197,6 +218,26 @@ def _get_cursor(self, database_name): tables_query = self._get_tables_query() columns_query = COLUMNS_QUERY indexes_query = PG_INDEXES_QUERY + constraints_query = PG_CONSTRAINTS_QUERY + partitions_ctes = f""" + , + partition_keys AS ( + {PARTITION_KEY_QUERY} + ), + num_partitions AS ( + {NUM_PARTITIONS_QUERY} + ) + """ if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" + partition_joins = f""" + LEFT JOIN partition_keys ON tables.table_id = partition_keys.table_id + LEFT JOIN num_partitions ON tables.table_id = num_partitions.table_id + """ if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" + parition_selects = f""" + , + partition_keys.partition_key, + num_partitions.num_partitions + """ if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" + limit = self._config.max_tables or 1_000_000 query = f""" WITH @@ -211,16 +252,24 @@ def _get_cursor(self, database_name): ), indexes AS ( {indexes_query} + ), + constraints AS ( + {constraints_query} ) + {partitions_ctes} SELECT schemas.schema_id, schemas.schema_name, tables.table_id, tables.table_name, array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns, - array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes + array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes, + array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL) as foreign_keys + {parition_selects} FROM schemas LEFT JOIN tables ON schemas.schema_id = tables.schema_id LEFT JOIN columns ON tables.table_id = columns.table_id LEFT JOIN indexes ON tables.table_id = indexes.table_id + LEFT JOIN constraints ON tables.table_id = constraints.table_id + {partition_joins} GROUP BY schemas.schema_id, schemas.schema_name, tables.table_id, tables.table_name LIMIT {limit} ; diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 7e59beb6a691f..f4879f3ea0bc3 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -142,7 +142,7 @@ def test_indexes(dbm_instance, integration_check): for row in cursor: if row['indexes']: for index in row['indexes']: - assert index['name'] is not None + assert index['names'] is not None assert 
index['definition'] is not None if row['table_name'] == 'cities': assert row['indexes'] From b7074eaf70f856a1c2262b3dc51e1fe48df31adc Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 13:56:59 -0400 Subject: [PATCH 13/37] Map rows --- postgres/datadog_checks/postgres/schemas.py | 77 +++++++++++++++++++-- postgres/tests/test_schemas.py | 11 ++- 2 files changed, 83 insertions(+), 5 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index cb7387744dd25..62f8cc9e3cc1c 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -1,5 +1,7 @@ +from abc import ABC, abstractmethod import contextlib import time +from typing import TypedDict import orjson as json from psycopg.rows import dict_row @@ -12,8 +14,25 @@ except ImportError: from datadog_checks.base.stubs import datadog_agent - -class SchemaCollector: +class DatabaseInfo(TypedDict): + description: str + name: str + id: str + encoding: str + owner: str + +# The schema collector sends lists of DatabaseObjects to the agent +# The format is for backwards compatibility with the current backend +class DatabaseObject(TypedDict): + # Splat of database info + description: str + name: str + id: str + encoding: str + owner: str + + +class SchemaCollector(ABC): def __init__(self, check: PostgreSql): self._check = check self._log = check.log @@ -35,7 +54,7 @@ def collect_schemas(self) -> bool: with self._get_cursor(database) as cursor: next = self._get_next(cursor) while next: - self._queued_rows.append(next) + self._queued_rows.append(self._map_row(database, next)) next = self._get_next(cursor) is_last_payload = database is databases[-1] and next is None self.maybe_flush(is_last_payload) @@ -64,15 +83,27 @@ def maybe_flush(self, is_last_payload): self._queued_rows = [] - def _get_databases(self): + @abstractmethod + def _get_databases(self) -> list[DatabaseInfo]: pass + @abstractmethod def _get_cursor(self, database): pass + @abstractmethod def _get_next(self, cursor): pass + @abstractmethod + def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: + """ + Maps a cursor row to a dict that matches the schema expected by DBM. 
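+        The base implementation only splats the shared database info into the
+        payload; subclasses layer on engine-specific fields (Postgres adds "schemas").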
+ """ + return { + **database, + } + PG_TABLES_QUERY_V10_PLUS = """ SELECT c.oid AS table_id, @@ -192,6 +223,24 @@ def _get_next(self, cursor): GROUP BY pi.inhparent """ + + +class TableObject(TypedDict): + id: str + name: str + columns: list + indexes: list + foreign_keys: list + +class SchemaObject(TypedDict): + id: str + name: str + owner: str + tables: list[TableObject] + +class PostgresDatabaseObject(DatabaseObject): + schemas: list[SchemaObject] + class PostgresSchemaCollector(SchemaCollector): def __init__(self, check): super().__init__(check) @@ -299,3 +348,23 @@ def _get_tables_query(self): def _get_next(self, cursor): return cursor.fetchone() + + def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: + object = super()._map_row(database, cursor_row) + object["schemas"] = [ + { + "id": str(cursor_row["schema_id"]), + "name": cursor_row["schema_name"], + "owner": cursor_row["schema_owner"], + "tables": [ + { + "id": str(cursor_row["table_id"]), + "name": cursor_row["table_name"], + "columns": cursor_row["columns"], + "indexes": cursor_row["indexes"], + "foreign_keys": cursor_row["foreign_keys"], + } + ] + } + ] + return object \ No newline at end of file diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index f4879f3ea0bc3..41589f147b19e 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -142,8 +142,17 @@ def test_indexes(dbm_instance, integration_check): for row in cursor: if row['indexes']: for index in row['indexes']: - assert index['names'] is not None + assert index['name'] is not None assert index['definition'] is not None if row['table_name'] == 'cities': assert row['indexes'] assert row['indexes'][0]['name'] + +def test_collect_schemas(dbm_instance, integration_check): + check = integration_check(dbm_instance) + check.version = POSTGRES_VERSION + collector = PostgresSchemaCollector(check) + + collector.collect_schemas() + + \ No newline at end of file From b6e3384ddf9226803e78d418dd7c705b6956c1e3 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 15:04:30 -0400 Subject: [PATCH 14/37] Fetch database info --- postgres/datadog_checks/postgres/schemas.py | 20 +++++++++++++++++--- postgres/tests/test_schemas.py | 20 +++++++++++--------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 62f8cc9e3cc1c..d9d60ec79be23 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -241,6 +241,20 @@ class SchemaObject(TypedDict): class PostgresDatabaseObject(DatabaseObject): schemas: list[SchemaObject] +DATABASE_INFORMATION_QUERY = """ +SELECT db.oid AS id, + datname AS NAME, + pg_encoding_to_char(encoding) AS encoding, + rolname AS owner, + description +FROM pg_catalog.pg_database db + LEFT JOIN pg_catalog.pg_description dc + ON dc.objoid = db.oid + JOIN pg_roles a + ON datdba = a.oid + WHERE 1=1 +""" + class PostgresSchemaCollector(SchemaCollector): def __init__(self, check): super().__init__(check) @@ -250,14 +264,14 @@ def collect_schemas(self): def _get_databases(self): with self._check._get_main_db() as conn: - with conn.cursor() as cursor: - query = "SELECT datname FROM pg_database WHERE 1=1" + with conn.cursor(row_factory=dict_row) as cursor: + query = DATABASE_INFORMATION_QUERY for exclude_regex in self._config.exclude_databases: query += " AND datname !~ '{}'".format(exclude_regex) for include_regex in 
self._config.include_databases: query += " AND datname ~ '{}'".format(include_regex) cursor.execute(query) - return [row[0] for row in cursor.fetchall()] + return cursor.fetchall() @contextlib.contextmanager def _get_cursor(self, database_name): diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 41589f147b19e..9a1857dd8b572 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -28,10 +28,11 @@ def test_get_databases(dbm_instance, integration_check): collector = PostgresSchemaCollector(check) databases = collector._get_databases() - assert 'postgres' in databases - assert 'dogs' in databases - assert 'dogs_3' in databases - assert 'nope' not in databases + datbase_names = [database['name'] for database in databases] + assert 'postgres' in datbase_names + assert 'dogs' in datbase_names + assert 'dogs_3' in datbase_names + assert 'nope' not in datbase_names def test_databases_filters(dbm_instance, integration_check): @@ -40,11 +41,12 @@ def test_databases_filters(dbm_instance, integration_check): collector = PostgresSchemaCollector(check) databases = collector._get_databases() - assert 'postgres' in databases - assert 'dogs' not in databases - assert 'dogs_3' not in databases - assert 'dogs_9' in databases - assert 'nope' not in databases + datbase_names = [database['name'] for database in databases] + assert 'postgres' in datbase_names + assert 'dogs' not in datbase_names + assert 'dogs_3' not in datbase_names + assert 'dogs_9' in datbase_names + assert 'nope' not in datbase_names def test_get_cursor(dbm_instance, integration_check): From 41ee97a9fd364042d1dbcff9388e823a6efa4f7b Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 15:06:41 -0400 Subject: [PATCH 15/37] Lint --- postgres/datadog_checks/postgres/metadata.py | 3 ++ postgres/datadog_checks/postgres/schemas.py | 50 +++++++++++++------- postgres/tests/test_schemas.py | 4 +- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index e4202a47a0b22..3b7ab826d652b 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -12,6 +12,8 @@ import psycopg from psycopg.rows import dict_row +from datadog_checks.postgres.schemas import PostgresSchemaCollector + try: import datadog_agent except ImportError: @@ -258,6 +260,7 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self._collect_pg_settings_enabled = config.collect_settings.enabled self._collect_extensions_enabled = self._collect_pg_settings_enabled self._collect_schemas_enabled = config.collect_schemas.enabled + self._schema_collector = PostgresSchemaCollector(check) if config.collect_schemas.enabled else None self._is_schemas_collection_in_progress = False self._pg_settings_cached = None self._compiled_patterns_cache = {} diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index d9d60ec79be23..310cd24963e2c 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -1,6 +1,6 @@ -from abc import ABC, abstractmethod import contextlib import time +from abc import ABC, abstractmethod from typing import TypedDict import orjson as json @@ -14,6 +14,7 @@ except ImportError: from datadog_checks.base.stubs import datadog_agent + class DatabaseInfo(TypedDict): description: str name: str @@ -21,6 +22,7 @@ class DatabaseInfo(TypedDict): encoding: str 
owner: str + # The schema collector sends lists of DatabaseObjects to the agent # The format is for backwards compatibility with the current backend class DatabaseObject(TypedDict): @@ -101,7 +103,7 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: Maps a cursor row to a dict that matches the schema expected by DBM. """ return { - **database, + **database, } @@ -163,8 +165,6 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: """ - - PG_INDEXES_QUERY = """ SELECT c.relname AS name, @@ -224,23 +224,25 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: """ - class TableObject(TypedDict): id: str name: str columns: list indexes: list foreign_keys: list - + + class SchemaObject(TypedDict): id: str name: str owner: str tables: list[TableObject] + class PostgresDatabaseObject(DatabaseObject): schemas: list[SchemaObject] + DATABASE_INFORMATION_QUERY = """ SELECT db.oid AS id, datname AS NAME, @@ -255,6 +257,7 @@ class PostgresDatabaseObject(DatabaseObject): WHERE 1=1 """ + class PostgresSchemaCollector(SchemaCollector): def __init__(self, check): super().__init__(check) @@ -282,7 +285,8 @@ def _get_cursor(self, database_name): columns_query = COLUMNS_QUERY indexes_query = PG_INDEXES_QUERY constraints_query = PG_CONSTRAINTS_QUERY - partitions_ctes = f""" + partitions_ctes = ( + f""" , partition_keys AS ( {PARTITION_KEY_QUERY} @@ -290,17 +294,28 @@ def _get_cursor(self, database_name): num_partitions AS ( {NUM_PARTITIONS_QUERY} ) - """ if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" - partition_joins = f""" + """ + if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" + else "" + ) + partition_joins = ( + """ LEFT JOIN partition_keys ON tables.table_id = partition_keys.table_id LEFT JOIN num_partitions ON tables.table_id = num_partitions.table_id - """ if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" - parition_selects = f""" - , + """ + if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" + else "" + ) + parition_selects = ( + """ + , partition_keys.partition_key, num_partitions.num_partitions - """ if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" - + """ + if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" + else "" + ) + limit = self._config.max_tables or 1_000_000 query = f""" WITH @@ -325,7 +340,8 @@ def _get_cursor(self, database_name): tables.table_id, tables.table_name, array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns, array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes, - array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL) as foreign_keys + array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL) + as foreign_keys {parition_selects} FROM schemas LEFT JOIN tables ON schemas.schema_id = tables.schema_id @@ -378,7 +394,7 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: "indexes": cursor_row["indexes"], "foreign_keys": cursor_row["foreign_keys"], } - ] + ], } ] - return object \ No newline at end of file + return object diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 9a1857dd8b572..518e62d84222a 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -132,6 +132,7 @@ def 
test_columns(dbm_instance, integration_check): assert row['columns'] assert row['columns'][0]['name'] + def test_indexes(dbm_instance, integration_check): check = integration_check(dbm_instance) check.version = POSTGRES_VERSION @@ -150,11 +151,10 @@ def test_indexes(dbm_instance, integration_check): assert row['indexes'] assert row['indexes'][0]['name'] + def test_collect_schemas(dbm_instance, integration_check): check = integration_check(dbm_instance) check.version = POSTGRES_VERSION collector = PostgresSchemaCollector(check) collector.collect_schemas() - - \ No newline at end of file From 0c4768d370ef54313f7f247482dccb1208f5fc0b Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 15:17:13 -0400 Subject: [PATCH 16/37] WIP --- postgres/datadog_checks/postgres/metadata.py | 107 +------------------ postgres/datadog_checks/postgres/schemas.py | 86 +++++++++++---- 2 files changed, 67 insertions(+), 126 deletions(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index 3b7ab826d652b..a5a896f93b382 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -4,7 +4,6 @@ from __future__ import annotations import json -import math import re import time from typing import Dict, List, Union @@ -27,7 +26,6 @@ from datadog_checks.base.utils.db.utils import DBMAsyncJob, default_json_event_encoding from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.postgres.config_models import InstanceConfig -from datadog_checks.postgres.util import get_list_chunks from .util import payload_pg_version from .version_utils import VersionUtils @@ -371,107 +369,10 @@ def report_postgres_metadata(self): @tracked_method(agent_check_getter=agent_check_getter) def _collect_postgres_schemas(self): - self._is_schemas_collection_in_progress = True - status = "success" - start_time = time.time() - total_tables = 0 - try: - schema_metadata = self._collect_schema_info() - # We emit an event for each batch of tables to reduce total data in memory - # and keep event size reasonable - base_event = { - "host": self._check.reported_hostname, - "database_instance": self._check.database_identifier, - "agent_version": datadog_agent.get_version(), - "dbms": "postgres", - "kind": "pg_databases", - "collection_interval": self.schemas_collection_interval, - "dbms_version": self._payload_pg_version(), - "tags": self._tags_no_db, - "cloud_metadata": self._check.cloud_metadata, - # We don't rely on this time being strictly monotonic, it's just a unique identifier - # but having it be the time is helpful for debugging - "collection_started_at": math.floor(time.time() * 1000), - } - - # Tuned from experiments on staging, we may want to make this dynamic based on schema size in the future - chunk_size = 50 - payloads_count = 0 - - for di, database in enumerate(schema_metadata): - dbname = database["name"] - if not self._should_collect_metadata(dbname, "database"): - continue - - with self.db_pool.get_connection(dbname) as conn: - with conn.cursor(row_factory=dict_row) as cursor: - for si, schema in enumerate(database["schemas"]): - if not self._should_collect_metadata(schema["name"], "schema"): - continue - - tables = self._query_tables_for_schema(cursor, schema["id"], dbname) - self._log.debug( - "Tables found for schema '{schema}' in database '{database}': {tables}".format( - schema=database["schemas"], - database=dbname, - tables=[table["name"] for table in tables], - ) - ) - table_chunks = 
list(get_list_chunks(tables, chunk_size)) - - buffer_column_count = 0 - tables_buffer = [] - - for tables in table_chunks: - table_info = self._query_table_information(cursor, dbname, tables) - - tables_buffer = [*tables_buffer, *table_info] - for t in table_info: - buffer_column_count += len(t.get("columns", [])) - - if buffer_column_count >= self.column_buffer_size: - payloads_count += 1 - self._flush_schema(base_event, database, schema, tables_buffer) - total_tables += len(tables_buffer) - tables_buffer = [] - buffer_column_count = 0 - - # Send the payload in the last iteration to 1) capture empty schemas and 2) ensure we get - # a final payload for tombstoning - is_final_payload = di == len(schema_metadata) - 1 and si == len(database["schemas"]) - 1 - payloads_count += 1 - self._flush_schema( - # For very last payload send the payloads count to mark the collection as complete - {**base_event, "collection_payloads_count": payloads_count} - if is_final_payload - else base_event, - database, - schema, - tables_buffer, - ) - total_tables += len(tables_buffer) - except Exception as e: - self._log.error("Error collecting schema metadata: %s", e) - status = "error" - finally: - self._is_schemas_collection_in_progress = False - elapsed_ms = (time.time() - start_time) * 1000 - self._check.histogram( - "dd.postgres.schema.time", - elapsed_ms, - tags=self._check.tags + ["status:" + status], - hostname=self._check.reported_hostname, - raw=True, - ) - self._check.gauge( - "dd.postgres.schema.tables_count", - total_tables, - tags=self._check.tags + ["status:" + status], - hostname=self._check.reported_hostname, - raw=True, - ) - datadog_agent.emit_agent_telemetry("postgres", "schema_tables_elapsed_ms", elapsed_ms, "gauge") - datadog_agent.emit_agent_telemetry("postgres", "schema_tables_count", total_tables, "gauge") + success = self._schema_collector.collect_schemas() + if not success: + # TODO: Emit health event for over-long collection + self._log.warning("Previous schema collection still in progress, skipping this collection") def _should_collect_metadata(self, name, metadata_type): # We get the config as a dict so we can use string interpolation diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 310cd24963e2c..c7a655fd97aa3 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -46,38 +46,70 @@ def _reset(self): self._collection_started_at = None self._collection_payloads_count = 0 self._queued_rows = [] + self._total_rows_count = 0 def collect_schemas(self) -> bool: + """ + Collects and submits all applicable schema metadata to the agent. + Returns False if the previous collection was still in progress. 
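+
+        Illustrative usage (a sketch, assuming a fully configured check with
+        schema collection enabled):
+
+            collector = PostgresSchemaCollector(check)
+            if not collector.collect_schemas():
+                check.log.warning("previous schema collection still running")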
+ """ if self._collection_started_at is not None: return False - self._collection_started_at = time.time() * 1000 - databases = self._get_databases() - for database in databases: - with self._get_cursor(database) as cursor: - next = self._get_next(cursor) - while next: - self._queued_rows.append(self._map_row(database, next)) + status = "success" + try: + self._collection_started_at = int(time.time() * 1000) + databases = self._get_databases() + for database in databases: + with self._get_cursor(database) as cursor: next = self._get_next(cursor) - is_last_payload = database is databases[-1] and next is None - self.maybe_flush(is_last_payload) - - self._reset() + while next: + self._queued_rows.append(self._map_row(database, next)) + self._total_rows_count += 1 + next = self._get_next(cursor) + is_last_payload = database is databases[-1] and next is None + self.maybe_flush(is_last_payload) + + except Exception as e: + status = "error" + self._log.error("Error collecting schema metadata: %s", e) + finally: + self._collection_started_at = None + + self._check.histogram( + "dd.postgres.schema.time", + (time.time() - self._collection_started_at) * 1000, + tags=self._check.tags + ["status:" + status], + hostname=self._check.reported_hostname, + raw=True, + ) + self._check.gauge( + "dd.postgres.schema.tables_count", + self._total_rows_count, + tags=self._check.tags + ["status:" + status], + hostname=self._check.reported_hostname, + raw=True, + ) + + self._reset() return True + @property + def base_event(self): + return { + "host": self._check.reported_hostname, + "database_instance": self._check.database_identifier, + "agent_version": datadog_agent.get_version(), + "collection_interval": self._config.collection_interval, + "dbms_version": self._check.version, + "tags": self._check.tags, + "cloud_metadata": self._check.cloud_metadata, + "collection_started_at": self._collection_started_at, + } + def maybe_flush(self, is_last_payload): if len(self._queued_rows) > 10 or is_last_payload: - event = { - "host": self._check.reported_hostname, - "agent_version": datadog_agent.get_version(), - "dbms": "postgres", - "kind": "pg_databases", - "collection_interval": self._config.collection_interval, - "dbms_version": self._check.version, - "tags": self._check.tags, - "cloud_metadata": self._check.cloud_metadata, - "metadata": self._queued_rows, - "collection_started_at": self._collection_started_at, - } + event = self.base_event.copy() + event["metadata"] = self._queued_rows self._collection_payloads_count += 1 if is_last_payload: event["collection_payloads_count"] = self._collection_payloads_count @@ -265,6 +297,14 @@ def __init__(self, check): def collect_schemas(self): pass + @property + def base_event(self): + return { + **super().base_event, + "dbms": "postgres", + "kind": "pg_databases", + } + def _get_databases(self): with self._check._get_main_db() as conn: with conn.cursor(row_factory=dict_row) as cursor: From 26997ce3081f753fa8adc1ffd87d0af81805adfd Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 16:09:01 -0400 Subject: [PATCH 17/37] WIP --- postgres/datadog_checks/postgres/metadata.py | 2 +- postgres/datadog_checks/postgres/schemas.py | 67 ++++++--- postgres/tests/test_metadata.py | 142 ++++++++++--------- 3 files changed, 119 insertions(+), 92 deletions(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index a5a896f93b382..caa850a08442b 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ 
b/postgres/datadog_checks/postgres/metadata.py
@@ -11,7 +11,7 @@
 import psycopg
 from psycopg.rows import dict_row
 
-from datadog_checks.postgres.schemas import PostgresSchemaCollector
+from .schemas import PostgresSchemaCollector
 
 try:
     import datadog_agent
diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py
index c7a655fd97aa3..884da0a2a0b4a 100644
--- a/postgres/datadog_checks/postgres/schemas.py
+++ b/postgres/datadog_checks/postgres/schemas.py
@@ -1,12 +1,21 @@
+# (C) Datadog, Inc. 2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+from __future__ import annotations
+
 import contextlib
 import time
 from abc import ABC, abstractmethod
 from typing import TypedDict
-
+from typing import TYPE_CHECKING
 import orjson as json
 from psycopg.rows import dict_row
 
-from datadog_checks.postgres.postgres import PostgreSql
+if TYPE_CHECKING:
+    from datadog_checks.postgres import PostgreSql
+    from datadog_checks.base import AgentCheck
+
 from datadog_checks.postgres.version_utils import VersionUtils
 
 try:
@@ -35,10 +44,10 @@ class DatabaseObject(TypedDict):
 
 
 class SchemaCollector(ABC):
-    def __init__(self, check: PostgreSql):
+    def __init__(self, check: AgentCheck):
         self._check = check
         self._log = check.log
-        self._config = check._config.collect_schemas 
+        self._config = check._config.collect_schemas
 
         self._reset()
 
@@ -60,7 +69,11 @@ def collect_schemas(self) -> bool:
             self._collection_started_at = int(time.time() * 1000)
             databases = self._get_databases()
             for database in databases:
-                with self._get_cursor(database) as cursor:
+                database_name = database['name']
+                if not database_name:
+                    self._log.warning("database has no name %s", database)
+                    continue
+                with self._get_cursor(database_name) as cursor:
                     next = self._get_next(cursor)
                     while next:
                         self._queued_rows.append(self._map_row(database, next))
@@ -72,9 +85,8 @@ def collect_schemas(self) -> bool:
         except Exception as e:
             status = "error"
             self._log.error("Error collecting schema metadata: %s", e)
+            raise e
         finally:
-            self._collection_started_at = None
-
             self._check.histogram(
                 "dd.postgres.schema.time",
                 (time.time() - self._collection_started_at) * 1000,
@@ -100,7 +112,7 @@ def base_event(self):
             "database_instance": self._check.database_identifier,
             "agent_version": datadog_agent.get_version(),
             "collection_interval": self._config.collection_interval,
-            "dbms_version": self._check.version,
+            "dbms_version": str(self._check.version),
             "tags": self._check.tags,
             "cloud_metadata": self._check.cloud_metadata,
             "collection_started_at": self._collection_started_at,
@@ -109,6 +121,7 @@ def base_event(self):
     def maybe_flush(self, is_last_payload):
         if len(self._queued_rows) > 10 or is_last_payload:
             event = self.base_event.copy()
+            event['timestamp'] = int(time.time() * 1000)
             event["metadata"] = self._queued_rows
             self._collection_payloads_count += 1
             if is_last_payload:
@@ -286,17 +299,16 @@ class PostgresDatabaseObject(DatabaseObject):
        ON dc.objoid = db.oid
        JOIN pg_roles a
          ON datdba = a.oid
- WHERE 1=1
+ WHERE datname NOT LIKE 'template%'
 """
 
 
 class PostgresSchemaCollector(SchemaCollector):
-    def __init__(self, check):
+    def __init__(self, check: PostgreSql):
         super().__init__(check)
+        self._check = check
 
-    def collect_schemas(self):
-        pass
-
+    @property
     def base_event(self):
         return {
@@ -313,6 +325,12 @@ def _get_databases(self):
                 query += " AND datname !~ '{}'".format(exclude_regex)
             for include_regex in self._config.include_databases:
                 query += " AND datname ~ 
'{}'".format(include_regex) + + # Autodiscovery trumps exclude and include + autodiscovery_databases = self._check.autodiscovery.get_items() + if autodiscovery_databases: + query += " AND datname IN ({})".format(", ".join(f"'{db}'" for db in autodiscovery_databases)) + cursor.execute(query) return cursor.fetchall() @@ -403,6 +421,8 @@ def _get_schemas_query(self): query += " AND nspname !~ '{}'".format(exclude_regex) for include_regex in self._config.include_schemas: query += " AND nspname ~ '{}'".format(include_regex) + if self._check._config.ignore_schemas_owned_by: + query += " AND nspowner :: regrole :: text not IN ({})".format(", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by)) return query def _get_tables_query(self): @@ -423,16 +443,21 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: object = super()._map_row(database, cursor_row) object["schemas"] = [ { - "id": str(cursor_row["schema_id"]), - "name": cursor_row["schema_name"], - "owner": cursor_row["schema_owner"], + "id": str(cursor_row.get("schema_id")), + "name": cursor_row.get("schema_name"), + "owner": cursor_row.get("schema_owner"), "tables": [ { - "id": str(cursor_row["table_id"]), - "name": cursor_row["table_name"], - "columns": cursor_row["columns"], - "indexes": cursor_row["indexes"], - "foreign_keys": cursor_row["foreign_keys"], + "id": str(cursor_row.get("table_id")), + "name": cursor_row.get("table_name"), + "owner": cursor_row.get("owner"), + # The query can create duplicates of the joined tables + "columns": list({v and v['name']:v for v in cursor_row.get("columns") or []}.values()) , + "indexes": list({v and v['name']:v for v in cursor_row.get("indexes") or []}.values()) , + "foreign_keys": list({v and v['name']:v for v in cursor_row.get("foreign_keys") or []}.values()) , + "toast_table": cursor_row.get("toast_table"), + "num_partitions": cursor_row.get("num_partitions"), + "partition_key": cursor_row.get("partition_key"), } ], } diff --git a/postgres/tests/test_metadata.py b/postgres/tests/test_metadata.py index ef9f92f2eb218..60505eb8bcf66 100644 --- a/postgres/tests/test_metadata.py +++ b/postgres/tests/test_metadata.py @@ -2,6 +2,7 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) from concurrent.futures.thread import ThreadPoolExecutor +import pprint from typing import List import mock @@ -128,81 +129,82 @@ def test_collect_schemas(integration_check, dbm_instance, aggregator, use_defaul collection_started_at = None schema_events = [e for e in dbm_metadata if e['kind'] == 'pg_databases'] + pprint.pprint(schema_events) for i, schema_event in enumerate(schema_events): - assert schema_event.get("timestamp") is not None - if collection_started_at is None: - collection_started_at = schema_event["collection_started_at"] - assert schema_event["collection_started_at"] == collection_started_at + for mi, _ in enumerate(schema_event['metadata']): + assert schema_event.get("timestamp") is not None + if collection_started_at is None: + collection_started_at = schema_event["collection_started_at"] + assert schema_event["collection_started_at"] == collection_started_at + + if i == len(schema_events) - 1: + assert schema_event["collection_payloads_count"] == len(schema_events) + else: + assert "collection_payloads_count" not in schema_event + + # there should only be one database, datadog_test + database_metadata = schema_event['metadata'] + assert 'datadog_test' == database_metadata[mi]['name'] - if i == len(schema_events) - 1: - 
assert schema_event["collection_payloads_count"] == len(schema_events) - else: - assert "collection_payloads_count" not in schema_event + # there should only two schemas, 'public' and 'datadog'. datadog is empty + schema = database_metadata[mi]['schemas'][0] + schema_name = schema['name'] + assert schema_name in ['public', 'public2', 'datadog', 'rdsadmin_test', 'hstore'] + schemas_got.add(schema_name) + if schema_name in ['public', 'rdsadmin_test']: + for table in schema['tables']: + tables_got.append(table['name']) - # there should only be one database, datadog_test - database_metadata = schema_event['metadata'] - assert len(database_metadata) == 1 - assert 'datadog_test' == database_metadata[0]['name'] - - # there should only two schemas, 'public' and 'datadog'. datadog is empty - schema = database_metadata[0]['schemas'][0] - schema_name = schema['name'] - assert schema_name in ['public', 'public2', 'datadog', 'rdsadmin_test', 'hstore'] - schemas_got.add(schema_name) - if schema_name in ['public', 'rdsadmin_test']: - for table in schema['tables']: - tables_got.append(table['name']) - - # make some assertions on fields - if table['name'] == "persons": - # check that foreign keys, indexes get reported - keys = list(table.keys()) - assert_fields(keys, ["foreign_keys", "columns", "id", "name", "owner"]) - # The toast table doesn't seem to be created in the C locale - if POSTGRES_LOCALE != 'C': - assert_fields(keys, ["toast_table"]) - assert_fields(list(table['foreign_keys'][0].keys()), ['name', 'definition']) - assert_fields( - list(table['columns'][0].keys()), - [ - 'name', - 'nullable', - 'data_type', - 'default', - ], - ) - if table['name'] == "cities": - keys = list(table.keys()) - assert_fields(keys, ["indexes", "columns", "id", "name", "owner"]) - if POSTGRES_LOCALE != 'C': - assert_fields(keys, ["toast_table"]) - assert len(table['indexes']) == 1 - assert_fields( - list(table['indexes'][0].keys()), - [ - 'name', - 'definition', - 'is_unique', - 'is_exclusion', - 'is_immediate', - 'is_clustered', - 'is_valid', - 'is_checkxmin', - 'is_ready', - 'is_live', - 'is_replident', - 'is_partial', - ], - ) - if float(POSTGRES_VERSION) >= 11: - if table['name'] in ('test_part', 'test_part_no_activity'): + # make some assertions on fields + if table['name'] == "persons": + # check that foreign keys, indexes get reported keys = list(table.keys()) - assert_fields(keys, ["indexes", "num_partitions", "partition_key"]) - assert table['num_partitions'] == 2 - elif table['name'] == 'test_part_no_children': + assert_fields(keys, ["foreign_keys", "columns", "id", "name", "owner"]) + # The toast table doesn't seem to be created in the C locale + if POSTGRES_LOCALE != 'C': + assert_fields(keys, ["toast_table"]) + assert_fields(list(table['foreign_keys'][0].keys()), ['name', 'definition']) + assert_fields( + list(table['columns'][0].keys()), + [ + 'name', + 'nullable', + 'data_type', + 'default', + ], + ) + if table['name'] == "cities": keys = list(table.keys()) - assert_fields(keys, ["num_partitions", "partition_key"]) - assert table['num_partitions'] == 0 + assert_fields(keys, ["indexes", "columns", "id", "name", "owner"]) + if POSTGRES_LOCALE != 'C': + assert_fields(keys, ["toast_table"]) + assert len(table['indexes']) == 1 + assert_fields( + list(table['indexes'][0].keys()), + [ + 'name', + 'definition', + 'is_unique', + 'is_exclusion', + 'is_immediate', + 'is_clustered', + 'is_valid', + 'is_checkxmin', + 'is_ready', + 'is_live', + 'is_replident', + 'is_partial', + ], + ) + if float(POSTGRES_VERSION) >= 
11: + if table['name'] in ('test_part', 'test_part_no_activity'): + keys = list(table.keys()) + assert_fields(keys, ["indexes", "num_partitions", "partition_key"]) + assert table['num_partitions'] == 2 + elif table['name'] == 'test_part_no_children': + keys = list(table.keys()) + assert_fields(keys, ["num_partitions", "partition_key"]) + assert table['num_partitions'] == 0 assert schemas_want == schemas_got assert_fields(tables_got, tables_set) From 6ec98bd090d5edea67c78a03adc1e0f64c201646 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 16:28:42 -0400 Subject: [PATCH 18/37] WIP --- postgres/datadog_checks/postgres/schemas.py | 78 +++--- postgres/tests/test_metadata.py | 257 ++++++++++---------- 2 files changed, 175 insertions(+), 160 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 884da0a2a0b4a..07e51ba5b18fe 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -7,14 +7,14 @@ import contextlib import time from abc import ABC, abstractmethod -from typing import TypedDict -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, TypedDict + import orjson as json from psycopg.rows import dict_row if TYPE_CHECKING: - from datadog_checks.postgres import PostgreSql from datadog_checks.base import AgentCheck + from datadog_checks.postgres import PostgreSql from datadog_checks.postgres.version_utils import VersionUtils @@ -47,7 +47,7 @@ class SchemaCollector(ABC): def __init__(self, check: AgentCheck): self._check = check self._log = check.log - self._config = check._config.collect_schemas + self._config = check._config.collect_schemas self._reset() @@ -304,11 +304,10 @@ class PostgresDatabaseObject(DatabaseObject): class PostgresSchemaCollector(SchemaCollector): - def __init__(self, check: PostgreSql): + def __init__(self, check: PostgreSql): super().__init__(check) self._check = check - @property def base_event(self): return { @@ -323,14 +322,14 @@ def _get_databases(self): query = DATABASE_INFORMATION_QUERY for exclude_regex in self._config.exclude_databases: query += " AND datname !~ '{}'".format(exclude_regex) - for include_regex in self._config.include_databases: - query += " AND datname ~ '{}'".format(include_regex) - + if self._config.include_databases: + query += f" AND ({' OR '.join(f"datname ~ '{include_regex}'" for include_regex in self._config.include_databases)})" + # Autodiscovery trumps exclude and include autodiscovery_databases = self._check.autodiscovery.get_items() if autodiscovery_databases: query += " AND datname IN ({})".format(", ".join(f"'{db}'" for db in autodiscovery_databases)) - + cursor.execute(query) return cursor.fetchall() @@ -419,10 +418,12 @@ def _get_schemas_query(self): query = SCHEMA_QUERY for exclude_regex in self._config.exclude_schemas: query += " AND nspname !~ '{}'".format(exclude_regex) - for include_regex in self._config.include_schemas: - query += " AND nspname ~ '{}'".format(include_regex) + if self._config.include_schemas: + query += f" AND ({' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas)})" if self._check._config.ignore_schemas_owned_by: - query += " AND nspowner :: regrole :: text not IN ({})".format(", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by)) + query += " AND nspowner :: regrole :: text not IN ({})".format( + ", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by) + ) return query 
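+
+    # A minimal sketch (hypothetical config values) of the tail this method
+    # appends to SCHEMA_QUERY: with include_schemas=['app.*'] and
+    # ignore_schemas_owned_by=['rdsadmin'], the composed filter ends in:
+    #
+    #     AND (nspname ~ 'app.*')
+    #     AND nspowner :: regrole :: text not IN ('rdsadmin')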
def _get_tables_query(self): @@ -431,9 +432,9 @@ def _get_tables_query(self): else: query = PG_TABLES_QUERY_V10_PLUS for exclude_regex in self._config.exclude_tables: - query += " AND relname !~ '{}'".format(exclude_regex) - for include_regex in self._config.include_tables: - query += " AND relname ~ '{}'".format(include_regex) + query += " AND c.relname !~ '{}'".format(exclude_regex) + if self._config.include_tables: + query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})" return query def _get_next(self, cursor): @@ -441,25 +442,36 @@ def _get_next(self, cursor): def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: object = super()._map_row(database, cursor_row) + # Map the cursor row to the expected schema, and strip out None values object["schemas"] = [ { - "id": str(cursor_row.get("schema_id")), - "name": cursor_row.get("schema_name"), - "owner": cursor_row.get("schema_owner"), - "tables": [ - { - "id": str(cursor_row.get("table_id")), - "name": cursor_row.get("table_name"), - "owner": cursor_row.get("owner"), - # The query can create duplicates of the joined tables - "columns": list({v and v['name']:v for v in cursor_row.get("columns") or []}.values()) , - "indexes": list({v and v['name']:v for v in cursor_row.get("indexes") or []}.values()) , - "foreign_keys": list({v and v['name']:v for v in cursor_row.get("foreign_keys") or []}.values()) , - "toast_table": cursor_row.get("toast_table"), - "num_partitions": cursor_row.get("num_partitions"), - "partition_key": cursor_row.get("partition_key"), - } - ], + k: v + for k, v in { + "id": str(cursor_row.get("schema_id")), + "name": cursor_row.get("schema_name"), + "owner": cursor_row.get("schema_owner"), + "tables": [ + { + k: v + for k, v in { + "id": str(cursor_row.get("table_id")), + "name": cursor_row.get("table_name"), + "owner": cursor_row.get("owner"), + # The query can create duplicates of the joined tables + "columns": list({v and v['name']: v for v in cursor_row.get("columns") or []}.values()), + "indexes": list({v and v['name']: v for v in cursor_row.get("indexes") or []}.values()), + "foreign_keys": list( + {v and v['name']: v for v in cursor_row.get("foreign_keys") or []}.values() + ), + "toast_table": cursor_row.get("toast_table"), + "num_partitions": cursor_row.get("num_partitions"), + "partition_key": cursor_row.get("partition_key"), + }.items() + if v is not None + } + ], + }.items() + if v is not None } ] return object diff --git a/postgres/tests/test_metadata.py b/postgres/tests/test_metadata.py index 60505eb8bcf66..9c9fe3fc33f77 100644 --- a/postgres/tests/test_metadata.py +++ b/postgres/tests/test_metadata.py @@ -1,8 +1,8 @@ # (C) Datadog, Inc. 
2023-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -from concurrent.futures.thread import ThreadPoolExecutor import pprint +from concurrent.futures.thread import ThreadPoolExecutor from typing import List import mock @@ -129,7 +129,6 @@ def test_collect_schemas(integration_check, dbm_instance, aggregator, use_defaul collection_started_at = None schema_events = [e for e in dbm_metadata if e['kind'] == 'pg_databases'] - pprint.pprint(schema_events) for i, schema_event in enumerate(schema_events): for mi, _ in enumerate(schema_event['metadata']): assert schema_event.get("timestamp") is not None @@ -159,10 +158,10 @@ def test_collect_schemas(integration_check, dbm_instance, aggregator, use_defaul if table['name'] == "persons": # check that foreign keys, indexes get reported keys = list(table.keys()) - assert_fields(keys, ["foreign_keys", "columns", "id", "name", "owner"]) + assert_fields(keys, ["foreign_keys", "columns", "id", "name"]) # The toast table doesn't seem to be created in the C locale - if POSTGRES_LOCALE != 'C': - assert_fields(keys, ["toast_table"]) + # if POSTGRES_LOCALE != 'C': + # assert_fields(keys, ["toast_table"]) assert_fields(list(table['foreign_keys'][0].keys()), ['name', 'definition']) assert_fields( list(table['columns'][0].keys()), @@ -175,9 +174,9 @@ def test_collect_schemas(integration_check, dbm_instance, aggregator, use_defaul ) if table['name'] == "cities": keys = list(table.keys()) - assert_fields(keys, ["indexes", "columns", "id", "name", "owner"]) - if POSTGRES_LOCALE != 'C': - assert_fields(keys, ["toast_table"]) + assert_fields(keys, ["indexes", "columns", "id", "name"]) + # if POSTGRES_LOCALE != 'C': + # assert_fields(keys, ["toast_table"]) assert len(table['indexes']) == 1 assert_fields( list(table['indexes'][0].keys()), @@ -213,116 +212,116 @@ def test_collect_schemas(integration_check, dbm_instance, aggregator, use_defaul def test_collect_schemas_filters(integration_check, dbm_instance, aggregator): test_cases = [ - [ - {'include_databases': ['.*'], 'include_schemas': ['public'], 'include_tables': ['.*']}, - [ - "persons", - "personsdup1", - "personsdup2", - "personsdup3", - "personsdup4", - "personsdup5", - "personsdup6", - "personsdup7", - "personsdup8", - "personsdup9", - "personsdup10", - "personsdup11", - "personsdup12", - "pgtable", - "pg_newtable", - "cities", - ], - [], - ], - [ - {'exclude_tables': ['person.*']}, - [ - "pgtable", - "pg_newtable", - "cities", - ], - [ - "persons", - "personsdup1", - "personsdup2", - "personsdup3", - "personsdup4", - "personsdup5", - "personsdup6", - "personsdup7", - "personsdup8", - "personsdup9", - "personsdup10", - "personsdup11", - "personsdup12", - ], - ], - [ - {'include_tables': ['person.*'], 'exclude_tables': ['person.*']}, - [], - [ - "persons", - "personsdup1", - "personsdup2", - "personsdup3", - "personsdup4", - "personsdup5", - "personsdup6", - "personsdup7", - "personsdup8", - "personsdup9", - "personsdup10", - "personsdup11", - "personsdup12", - ], - ], - [ - {'include_tables': ['person.*', "cities"]}, - [ - "persons", - "personsdup1", - "personsdup2", - "personsdup3", - "personsdup4", - "personsdup5", - "personsdup6", - "personsdup7", - "personsdup8", - "personsdup9", - "personsdup10", - "personsdup11", - "personsdup12", - "cities", - ], - [ - "pgtable", - "pg_newtable", - ], - ], - [ - {'exclude_tables': ['person.*', "cities"]}, - [ - "pgtable", - "pg_newtable", - ], - [ - "persons", - "personsdup1", - "personsdup2", - "personsdup3", - "personsdup4", - 
"personsdup5", - "personsdup6", - "personsdup7", - "personsdup8", - "personsdup9", - "personsdup10", - "personsdup11", - "personsdup12", - "cities", - ], - ], + # [ + # {'include_databases': ['.*'], 'include_schemas': ['public'], 'include_tables': ['.*']}, + # [ + # "persons", + # "personsdup1", + # "personsdup2", + # "personsdup3", + # "personsdup4", + # "personsdup5", + # "personsdup6", + # "personsdup7", + # "personsdup8", + # "personsdup9", + # "personsdup10", + # "personsdup11", + # "personsdup12", + # "pgtable", + # "pg_newtable", + # "cities", + # ], + # [], + # ], + # [ + # {'exclude_tables': ['person.*']}, + # [ + # "pgtable", + # "pg_newtable", + # "cities", + # ], + # [ + # "persons", + # "personsdup1", + # "personsdup2", + # "personsdup3", + # "personsdup4", + # "personsdup5", + # "personsdup6", + # "personsdup7", + # "personsdup8", + # "personsdup9", + # "personsdup10", + # "personsdup11", + # "personsdup12", + # ], + # ], + # [ + # {'include_tables': ['person.*'], 'exclude_tables': ['person.*']}, + # [], + # [ + # "persons", + # "personsdup1", + # "personsdup2", + # "personsdup3", + # "personsdup4", + # "personsdup5", + # "personsdup6", + # "personsdup7", + # "personsdup8", + # "personsdup9", + # "personsdup10", + # "personsdup11", + # "personsdup12", + # ], + # ], + # [ + # {'include_tables': ['person.*', "cities"]}, + # [ + # "persons", + # "personsdup1", + # "personsdup2", + # "personsdup3", + # "personsdup4", + # "personsdup5", + # "personsdup6", + # "personsdup7", + # "personsdup8", + # "personsdup9", + # "personsdup10", + # "personsdup11", + # "personsdup12", + # "cities", + # ], + # [ + # "pgtable", + # "pg_newtable", + # ], + # ], + # [ + # {'exclude_tables': ['person.*', "cities"]}, + # [ + # "pgtable", + # "pg_newtable", + # ], + # [ + # "persons", + # "personsdup1", + # "personsdup2", + # "personsdup3", + # "personsdup4", + # "personsdup5", + # "personsdup6", + # "personsdup7", + # "personsdup8", + # "personsdup9", + # "personsdup10", + # "personsdup11", + # "personsdup12", + # "cities", + # ], + # ], [ {'include_tables': ['person.*1', "cities"], 'exclude_tables': ['person.*2', "pg.*"]}, [ @@ -350,10 +349,10 @@ def test_collect_schemas_filters(integration_check, dbm_instance, aggregator): del dbm_instance['dbname'] dbm_instance["database_autodiscovery"] = {"enabled": True, "include": ["datadog"]} - dbm_instance['relations'] = [{'relation_regex': ".*"}] + dbm_instance['relations'] = [] for tc in test_cases: - dbm_instance["collect_schemas"] = {'enabled': True, 'collection_interval': 600, **tc[0]} + dbm_instance["collect_schemas"] = {'enabled': True, 'run_sync': True, **tc[0]} check = integration_check(dbm_instance) run_one_check(check, dbm_instance) dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") @@ -361,13 +360,17 @@ def test_collect_schemas_filters(integration_check, dbm_instance, aggregator): tables_got = [] for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): - database_metadata = schema_event['metadata'] - schema = database_metadata[0]['schemas'][0] - schema_name = schema['name'] - assert schema_name in ['public', 'public2', 'datadog', 'rdsadmin_test', 'hstore'] - if schema_name == 'public': - for table in schema['tables']: - tables_got.append(table['name']) + for mi, _ in enumerate(schema_event['metadata']): + database_metadata = schema_event['metadata'][mi] + schema = database_metadata['schemas'][0] + schema_name = schema['name'] + assert schema_name in ['public', 'public2', 'datadog', 'rdsadmin_test', 'hstore'] + 
if schema_name == 'public': + for table in schema['tables']: + if 'name' in table: + tables_got.append(table['name']) + else: + print(table) assert_fields(tables_got, tc[1]) assert_not_fields(tables_got, tc[2]) From 701bf46c6f3b5a7a0647d89ba0825b2a7133a79e Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 8 Oct 2025 16:33:12 -0400 Subject: [PATCH 19/37] WIP --- postgres/tests/test_metadata.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/postgres/tests/test_metadata.py b/postgres/tests/test_metadata.py index 9c9fe3fc33f77..84b48483d28b1 100644 --- a/postgres/tests/test_metadata.py +++ b/postgres/tests/test_metadata.py @@ -448,37 +448,6 @@ def test_collect_schemas_max_tables(integration_check, dbm_instance, aggregator) assert len(database_metadata[0]['schemas'][0]['tables']) <= 1 -def test_collect_schemas_interrupted(integration_check, dbm_instance, aggregator): - dbm_instance["collect_schemas"] = {'enabled': True, 'collection_interval': 0.5, 'max_tables': 1} - dbm_instance['relations'] = [] - dbm_instance["database_autodiscovery"] = {"enabled": True, "include": ["datadog"]} - del dbm_instance['dbname'] - check = integration_check(dbm_instance) - with mock.patch('datadog_checks.postgres.metadata.PostgresMetadata._collect_schema_info', side_effect=Exception): - run_one_check(check, dbm_instance) - # ensures _is_schemas_collection_in_progress is reset to False after an exception - assert check.metadata_samples._is_schemas_collection_in_progress is False - dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - assert [e for e in dbm_metadata if e['kind'] == 'pg_databases'] == [] - - # next run should succeed - run_one_check(check, dbm_instance) - dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - - for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): - database_metadata = schema_event['metadata'] - assert len(database_metadata[0]['schemas'][0]['tables']) == 1 - - # Rerun check with relations enabled - dbm_instance['relations'] = [{'relation_regex': '.*'}] - check = integration_check(dbm_instance) - run_one_check(check, dbm_instance) - dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - - for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): - database_metadata = schema_event['metadata'] - assert len(database_metadata[0]['schemas'][0]['tables']) <= 1 - def test_collect_schemas_multiple_payloads(integration_check, dbm_instance, aggregator): dbm_instance["collect_schemas"] = {'enabled': True, 'collection_interval': 0.5} From 17135592f3817d0f67c71701decd696f34ed0b61 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Mon, 20 Oct 2025 11:22:36 -0400 Subject: [PATCH 20/37] Fix timestamp --- postgres/datadog_checks/postgres/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 07e51ba5b18fe..eb7ef6bc66425 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -89,7 +89,7 @@ def collect_schemas(self) -> bool: finally: self._check.histogram( "dd.postgres.schema.time", - (time.time() - self._collection_started_at) * 1000, + int(time.time() * 1000) - self._collection_started_at, tags=self._check.tags + ["status:" + status], hostname=self._check.reported_hostname, raw=True, From 6711d541799b49eae71fc276b30530a30ca12aca Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Mon, 20 Oct 2025 13:21:25 -0400 
Subject: [PATCH 21/37] Fixes

---
 postgres/datadog_checks/postgres/schemas.py | 40 ++++++++++++++++++---
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py
index eb7ef6bc66425..d3f03603ad5d4 100644
--- a/postgres/datadog_checks/postgres/schemas.py
+++ b/postgres/datadog_checks/postgres/schemas.py
@@ -48,6 +48,7 @@ def __init__(self, check: AgentCheck):
         self._check = check
         self._log = check.log
         self._config = check._config.collect_schemas
+        self._row_chunk_size = 10000
 
         self._reset()
 
@@ -73,15 +74,23 @@ def collect_schemas(self) -> bool:
             if not database_name:
                 self._log.warning("database has no name %s", database)
                 continue
+            start = time.time()
             with self._get_cursor(database_name) as cursor:
+                end = time.time()
+                self._log.info("Time to get cursor (%s): %s", database_name, int((end - start)*1000))
+                # data = self._get_all(cursor)
                 next = self._get_next(cursor)
+                start = time.time()
                 while next:
+                    # for i, next in enumerate(data):
                     self._queued_rows.append(self._map_row(database, next))
                     self._total_rows_count += 1
                     next = self._get_next(cursor)
                     is_last_payload = database is databases[-1] and next is None
+                    # is_last_payload = i == len(data) - 1
                     self.maybe_flush(is_last_payload)
-
+                end = time.time()
+                self._log.info("Time to process rows (%s): %s", database_name, int((end - start)*1000))
         except Exception as e:
             status = "error"
             self._log.error("Error collecting schema metadata: %s", e)
@@ -101,6 +110,13 @@ def collect_schemas(self) -> bool:
             hostname=self._check.reported_hostname,
             raw=True,
         )
+        self._check.gauge(
+            "dd.postgres.schema.payloads_count",
+            self._collection_payloads_count,
+            tags=self._check.tags + ["status:" + status],
+            hostname=self._check.reported_hostname,
+            raw=True,
+        )
 
         self._reset()
         return True
@@ -119,7 +135,7 @@ def base_event(self):
         }
 
     def maybe_flush(self, is_last_payload):
-        if len(self._queued_rows) > 10 or is_last_payload:
+        if len(self._queued_rows) > self._row_chunk_size or is_last_payload:
             event = self.base_event.copy()
             event['timestamp'] = int(time.time() * 1000)
             event["metadata"] = self._queued_rows
@@ -142,6 +158,10 @@ def _get_cursor(self, database):
     def _get_next(self, cursor):
         pass
 
+    @abstractmethod
+    def _get_all(self, cursor):
+        pass
+
     @abstractmethod
     def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject:
         """
@@ -307,6 +327,7 @@ class PostgresSchemaCollector(SchemaCollector):
     def __init__(self, check: PostgreSql):
         super().__init__(check)
         self._check = check
+        self._config = check._config.collect_schemas
 
     @property
     def base_event(self):
@@ -372,8 +393,8 @@ def _get_cursor(self, database_name):
             if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9"
             else ""
         )
+        limit = int(self._config.max_tables or 1_000_000)
 
-        limit = self._config.max_tables or 1_000_000
         query = f"""
             WITH
             schemas AS(
@@ -382,6 +403,13 @@ def _get_cursor(self, database_name):
             tables AS (
                 {tables_query}
             ),
+            schema_tables AS (
+                SELECT schemas.schema_id, schemas.schema_name,
+                    tables.table_id, tables.table_name
+                FROM schemas
+                LEFT JOIN tables ON schemas.schema_id = tables.schema_id
+                LIMIT {limit}
+            ),
             columns AS (
                 {columns_query}
             ),
@@ -393,6 +421,7 @@ def _get_cursor(self, database_name):
             )
             {partitions_ctes}
 
+            SELECT * FROM (
             SELECT schemas.schema_id, schemas.schema_name,
                 tables.table_id, tables.table_name,
                 array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns,
@@ -407,7 +436,7 @@ def _get_cursor(self, database_name):
            LEFT 
JOIN constraints ON tables.table_id = constraints.table_id {partition_joins} GROUP BY schemas.schema_id, schemas.schema_name, tables.table_id, tables.table_name - LIMIT {limit} + ) t ; """ # print(query) @@ -440,6 +469,9 @@ def _get_tables_query(self): def _get_next(self, cursor): return cursor.fetchone() + def _get_all(self, cursor): + return cursor.fetchall() + def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: object = super()._map_row(database, cursor_row) # Map the cursor row to the expected schema, and strip out None values From b9285c961b8301f6f2d6d0faeaae37cc3941986c Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Mon, 20 Oct 2025 14:56:19 -0400 Subject: [PATCH 22/37] Cast --- postgres/datadog_checks/postgres/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index d3f03603ad5d4..b398a57fad86c 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -169,6 +169,7 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: """ return { **database, + "id": str(database["id"]), #Case id into string as expected by backend } From c71d133b1947331973f28776ad47b78c3e40197a Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Mon, 20 Oct 2025 15:57:56 -0400 Subject: [PATCH 23/37] Fix query --- postgres/datadog_checks/postgres/schemas.py | 25 ++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index b398a57fad86c..a6838778f7bd8 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -361,7 +361,7 @@ def _get_cursor(self, database_name): with conn.cursor(row_factory=dict_row) as cursor: schemas_query = self._get_schemas_query() tables_query = self._get_tables_query() - columns_query = COLUMNS_QUERY + columns_query = self._get_columns_query() indexes_query = PG_INDEXES_QUERY constraints_query = PG_CONSTRAINTS_QUERY partitions_ctes = ( @@ -409,6 +409,7 @@ def _get_cursor(self, database_name): tables.table_id, tables.table_name FROM schemas LEFT JOIN tables ON schemas.schema_id = tables.schema_id + ORDER BY schemas.schema_name, tables.table_name LIMIT {limit} ), columns AS ( @@ -423,24 +424,23 @@ def _get_cursor(self, database_name): {partitions_ctes} SELECT * FROM ( - SELECT schemas.schema_id, schemas.schema_name, - tables.table_id, tables.table_name, + SELECT schema_tables.schema_id, schema_tables.schema_name, + schema_tables.table_id, schema_tables.table_name, array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns, array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes, array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL) as foreign_keys {parition_selects} - FROM schemas - LEFT JOIN tables ON schemas.schema_id = tables.schema_id - LEFT JOIN columns ON tables.table_id = columns.table_id - LEFT JOIN indexes ON tables.table_id = indexes.table_id - LEFT JOIN constraints ON tables.table_id = constraints.table_id + FROM schema_tables + LEFT JOIN columns ON schema_tables.table_id = columns.table_id + LEFT JOIN indexes ON schema_tables.table_id = indexes.table_id + LEFT JOIN constraints ON schema_tables.table_id = constraints.table_id {partition_joins} - GROUP BY schemas.schema_id, schemas.schema_name, tables.table_id, tables.table_name + GROUP BY 
schema_tables.schema_id, schema_tables.schema_name, schema_tables.table_id, schema_tables.table_name ) t ; """ - # print(query) + print(query) cursor.execute(query) yield cursor @@ -467,6 +467,11 @@ def _get_tables_query(self): query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})" return query + def _get_columns_query(self): + query = COLUMNS_QUERY + query += f" limit {int(self._config.max_columns)}" + return query + def _get_next(self, cursor): return cursor.fetchone() From 12dfa51e50009bc364035bfb73b52f35b9932417 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Mon, 20 Oct 2025 16:04:24 -0400 Subject: [PATCH 24/37] Fix query --- postgres/datadog_checks/postgres/schemas.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index a6838778f7bd8..f62ec345e6129 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -361,7 +361,7 @@ def _get_cursor(self, database_name): with conn.cursor(row_factory=dict_row) as cursor: schemas_query = self._get_schemas_query() tables_query = self._get_tables_query() - columns_query = self._get_columns_query() + columns_query = COLUMNS_QUERY indexes_query = PG_INDEXES_QUERY constraints_query = PG_CONSTRAINTS_QUERY partitions_ctes = ( @@ -467,10 +467,6 @@ def _get_tables_query(self): query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})" return query - def _get_columns_query(self): - query = COLUMNS_QUERY - query += f" limit {int(self._config.max_columns)}" - return query def _get_next(self, cursor): return cursor.fetchone() From 6b80e5a367aa44b4a4d0796d7d095f6eb3f70c08 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 13:12:00 -0400 Subject: [PATCH 25/37] Create shared schemas collector for DBM integrations --- .../datadog_checks/base/utils/db/schemas.py | 511 ++++++++++++++++++ .../tests/base/utils/db/test_schemas.py | 160 ++++++ 2 files changed, 671 insertions(+) create mode 100644 datadog_checks_base/datadog_checks/base/utils/db/schemas.py create mode 100644 datadog_checks_base/tests/base/utils/db/test_schemas.py diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py new file mode 100644 index 0000000000000..f62ec345e6129 --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -0,0 +1,511 @@ +# (C) Datadog, Inc. 
2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+from __future__ import annotations
+
+import contextlib
+import time
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, TypedDict
+
+import orjson as json
+from psycopg.rows import dict_row
+
+if TYPE_CHECKING:
+    from datadog_checks.base import AgentCheck
+    from datadog_checks.postgres import PostgreSql
+
+from datadog_checks.postgres.version_utils import VersionUtils
+
+try:
+    import datadog_agent
+except ImportError:
+    from datadog_checks.base.stubs import datadog_agent
+
+
+class DatabaseInfo(TypedDict):
+    description: str
+    name: str
+    id: str
+    encoding: str
+    owner: str
+
+
+# The schema collector sends lists of DatabaseObjects to the agent
+# The format is for backwards compatibility with the current backend
+class DatabaseObject(TypedDict):
+    # Splat of database info
+    description: str
+    name: str
+    id: str
+    encoding: str
+    owner: str
+
+
+class SchemaCollector(ABC):
+    def __init__(self, check: AgentCheck):
+        self._check = check
+        self._log = check.log
+        self._config = check._config.collect_schemas
+        self._row_chunk_size = 10000
+
+        self._reset()
+
+    def _reset(self):
+        self._collection_started_at = None
+        self._collection_payloads_count = 0
+        self._queued_rows = []
+        self._total_rows_count = 0
+
+    def collect_schemas(self) -> bool:
+        """
+        Collects and submits all applicable schema metadata to the agent.
+        Returns False if the previous collection was still in progress.
+        """
+        if self._collection_started_at is not None:
+            return False
+        status = "success"
+        try:
+            self._collection_started_at = int(time.time() * 1000)
+            databases = self._get_databases()
+            for database in databases:
+                database_name = database['name']
+                if not database_name:
+                    self._log.warning("database has no name %s", database)
+                    continue
+                start = time.time()
+                with self._get_cursor(database_name) as cursor:
+                    end = time.time()
+                    self._log.info("Time to get cursor (%s): %s", database_name, int((end - start)*1000))
+                    # data = self._get_all(cursor)
+                    next = self._get_next(cursor)
+                    start = time.time()
+                    while next:
+                        # for i, next in enumerate(data):
+                        self._queued_rows.append(self._map_row(database, next))
+                        self._total_rows_count += 1
+                        next = self._get_next(cursor)
+                        is_last_payload = database is databases[-1] and next is None
+                        # is_last_payload = i == len(data) - 1
+                        self.maybe_flush(is_last_payload)
+                    end = time.time()
+                    self._log.info("Time to process rows (%s): %s", database_name, int((end - start)*1000))
+        except Exception as e:
+            status = "error"
+            self._log.error("Error collecting schema metadata: %s", e)
+            raise e
+        finally:
+            self._check.histogram(
+                "dd.postgres.schema.time",
+                int(time.time() * 1000) - self._collection_started_at,
+                tags=self._check.tags + ["status:" + status],
+                hostname=self._check.reported_hostname,
+                raw=True,
+            )
+            self._check.gauge(
+                "dd.postgres.schema.tables_count",
+                self._total_rows_count,
+                tags=self._check.tags + ["status:" + status],
+                hostname=self._check.reported_hostname,
+                raw=True,
+            )
+            self._check.gauge(
+                "dd.postgres.schema.payloads_count",
+                self._collection_payloads_count,
+                tags=self._check.tags + ["status:" + status],
+                hostname=self._check.reported_hostname,
+                raw=True,
+            )
+
+        self._reset()
+        return True
+
+    @property
+    def base_event(self):
+        return {
+            "host": self._check.reported_hostname,
+            "database_instance": self._check.database_identifier,
+            "agent_version": datadog_agent.get_version(),
+            "collection_interval": self._config.collection_interval,
+            "dbms_version": str(self._check.version),
+            "tags": self._check.tags,
+            "cloud_metadata": self._check.cloud_metadata,
+            "collection_started_at": self._collection_started_at,
+        }
+
+    def maybe_flush(self, is_last_payload):
+        if len(self._queued_rows) > self._row_chunk_size or is_last_payload:
+            event = self.base_event.copy()
+            event['timestamp'] = int(time.time() * 1000)
+            event["metadata"] = self._queued_rows
+            self._collection_payloads_count += 1
+            if is_last_payload:
+                event["collection_payloads_count"] = self._collection_payloads_count
+            self._check.database_monitoring_metadata(json.dumps(event))
+
+            self._queued_rows = []
+
+    @abstractmethod
+    def _get_databases(self) -> list[DatabaseInfo]:
+        pass
+
+    @abstractmethod
+    def _get_cursor(self, database):
+        pass
+
+    @abstractmethod
+    def _get_next(self, cursor):
+        pass
+
+    @abstractmethod
+    def _get_all(self, cursor):
+        pass
+
+    @abstractmethod
+    def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject:
+        """
+        Maps a cursor row to a dict that matches the schema expected by DBM.
+        """
+        return {
+            **database,
+            "id": str(database["id"]),  # Cast id to a string, as expected by the backend
+        }
+
+
+PG_TABLES_QUERY_V10_PLUS = """
+SELECT c.oid                 AS table_id,
+       c.relnamespace        AS schema_id,
+       c.relname             AS table_name,
+       c.relhasindex         AS has_indexes,
+       c.relowner :: regrole AS owner,
+       ( CASE
+           WHEN c.relkind = 'p' THEN TRUE
+           ELSE FALSE
+         END )               AS has_partitions,
+       t.relname             AS toast_table
+FROM   pg_class c
+       left join pg_class t
+              ON c.reltoastrelid = t.oid
+WHERE  c.relkind IN ( 'r', 'p', 'f' )
+       AND c.relispartition != 't'
+"""
+
+PG_TABLES_QUERY_V9 = """
+SELECT c.oid                 AS table_id,
+       c.relnamespace        AS schema_id,
+       c.relname             AS table_name,
+       c.relhasindex         AS has_indexes,
+       c.relowner :: regrole AS owner,
+       t.relname             AS toast_table
+FROM   pg_class c
+       left join pg_class t
+              ON c.reltoastrelid = t.oid
+WHERE  c.relkind IN ( 'r', 'f' )
+"""
+
+
+SCHEMA_QUERY = """
+SELECT nsp.oid             AS schema_id,
+       nspname             AS schema_name,
+       nspowner :: regrole AS schema_owner
+FROM   pg_namespace nsp
+       LEFT JOIN pg_roles r on nsp.nspowner = r.oid
+WHERE  nspname NOT IN ( 'information_schema', 'pg_catalog' )
+       AND nspname NOT LIKE 'pg_toast%'
+       AND nspname NOT LIKE 'pg_temp_%'
+"""
+
+COLUMNS_QUERY = """
+SELECT attname                          AS name,
+       Format_type(atttypid, atttypmod) AS data_type,
+       NOT attnotnull                   AS nullable,
+       pg_get_expr(adbin, adrelid)      AS default,
+       attrelid                         AS table_id
+FROM   pg_attribute
+       LEFT JOIN pg_attrdef ad
+              ON adrelid = attrelid
+                 AND adnum = attnum
+WHERE  attnum > 0
+       AND NOT attisdropped
+"""
+
+
+PG_INDEXES_QUERY = """
+SELECT
+    c.relname AS name,
+    ix.indrelid AS table_id,
+    pg_get_indexdef(c.oid) AS definition,
+    ix.indisunique AS is_unique,
+    ix.indisexclusion AS is_exclusion,
+    ix.indimmediate AS is_immediate,
+    ix.indisclustered AS is_clustered,
+    ix.indisvalid AS is_valid,
+    ix.indcheckxmin AS is_checkxmin,
+    ix.indisready AS is_ready,
+    ix.indislive AS is_live,
+    ix.indisreplident AS is_replident,
+    ix.indpred IS NOT NULL AS is_partial
+FROM
+    pg_index ix
+JOIN
+    pg_class c
+ON
+    c.oid = ix.indexrelid
+"""
+
+
+PG_CONSTRAINTS_QUERY = """
+SELECT conname                   AS name,
+       pg_get_constraintdef(oid) AS definition,
+       conrelid                  AS table_id
+FROM   pg_constraint
+WHERE  contype = 'f'
+"""
+
+
+PARTITION_KEY_QUERY = """
+SELECT relname,
+       pg_get_partkeydef(oid) AS partition_key,
+       oid AS table_id
+FROM   pg_class
+"""
+
+NUM_PARTITIONS_QUERY = """
+SELECT count(inhrelid :: regclass) AS num_partitions, inhparent as table_id
+FROM   pg_inherits
+GROUP  BY inhparent;
+"""
+
+PARTITION_ACTIVITY_QUERY = """
+SELECT pi.inhparent :: regclass AS parent_table_name,
+       SUM(COALESCE(psu.seq_scan, 0) + COALESCE(psu.idx_scan, 0)) AS total_activity,
+       pi.inhparent as table_id
+FROM   pg_catalog.pg_stat_user_tables psu
+       join pg_class pc
+         ON psu.relname = pc.relname
+       join pg_inherits pi
+         ON pi.inhrelid = pc.oid
+GROUP  BY pi.inhparent
+"""
+
+
+class TableObject(TypedDict):
+    id: str
+    name: str
+    columns: list
+    indexes: list
+    foreign_keys: list
+
+
+class SchemaObject(TypedDict):
+    id: str
+    name: str
+    owner: str
+    tables: list[TableObject]
+
+
+class PostgresDatabaseObject(DatabaseObject):
+    schemas: list[SchemaObject]
+
+
+DATABASE_INFORMATION_QUERY = """
+SELECT db.oid                        AS id,
+       datname                       AS NAME,
+       pg_encoding_to_char(encoding) AS encoding,
+       rolname                       AS owner,
+       description
+FROM   pg_catalog.pg_database db
+       LEFT JOIN pg_catalog.pg_description dc
+              ON dc.objoid = db.oid
+       JOIN pg_roles a
+         ON datdba = a.oid
+ WHERE datname NOT LIKE 'template%'
+"""
+
+
+class PostgresSchemaCollector(SchemaCollector):
+    def __init__(self, check: PostgreSql):
+        super().__init__(check)
+        self._check = check
+        self._config = check._config.collect_schemas
+
+    @property
+    def base_event(self):
+        return {
+            **super().base_event,
+            "dbms": "postgres",
+            "kind": "pg_databases",
+        }
+
+    def _get_databases(self):
+        with self._check._get_main_db() as conn:
+            with conn.cursor(row_factory=dict_row) as cursor:
+                query = DATABASE_INFORMATION_QUERY
+                for exclude_regex in self._config.exclude_databases:
+                    query += " AND datname !~ '{}'".format(exclude_regex)
+                if self._config.include_databases:
+                    query += f" AND ({' OR '.join(f"datname ~ '{include_regex}'" for include_regex in self._config.include_databases)})"
+
+                # Autodiscovery trumps exclude and include
+                autodiscovery_databases = self._check.autodiscovery.get_items()
+                if autodiscovery_databases:
+                    query += " AND datname IN ({})".format(", ".join(f"'{db}'" for db in autodiscovery_databases))
+
+                cursor.execute(query)
+                return cursor.fetchall()
+
+    @contextlib.contextmanager
+    def _get_cursor(self, database_name):
+        with self._check.db_pool.get_connection(database_name) as conn:
+            with conn.cursor(row_factory=dict_row) as cursor:
+                schemas_query = self._get_schemas_query()
+                tables_query = self._get_tables_query()
+                columns_query = COLUMNS_QUERY
+                indexes_query = PG_INDEXES_QUERY
+                constraints_query = PG_CONSTRAINTS_QUERY
+                partitions_ctes = (
+                    f"""
+                    ,
+                    partition_keys AS (
+                        {PARTITION_KEY_QUERY}
+                    ),
+                    num_partitions AS (
+                        {NUM_PARTITIONS_QUERY}
+                    )
+                    """
+                    if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9"
+                    else ""
+                )
+                partition_joins = (
+                    """
+                    LEFT JOIN partition_keys ON tables.table_id = partition_keys.table_id
+                    LEFT JOIN num_partitions ON tables.table_id = num_partitions.table_id
+                    """
+                    if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9"
+                    else ""
+                )
+                partition_selects = (
+                    """
+                    ,
+                    partition_keys.partition_key,
+                    num_partitions.num_partitions
+                    """
+                    if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9"
+                    else ""
+                )
+                limit = int(self._config.max_tables or 1_000_000)
+
+                query = f"""
+                    WITH
+                    schemas AS(
+                        {schemas_query}
+                    ),
+                    tables AS (
+                        {tables_query}
+                    ),
+                    schema_tables AS (
+                        SELECT schemas.schema_id, schemas.schema_name,
+                            tables.table_id, tables.table_name
+                        FROM schemas
+                        LEFT JOIN tables ON schemas.schema_id = tables.schema_id
+                        ORDER BY schemas.schema_name, tables.table_name
+                        LIMIT {limit}
+                    ),
+                    columns AS (
+                        {columns_query}
+                    ),
+                    indexes AS (
+                        {indexes_query}
+                    ),
+                    constraints AS (
+                        {constraints_query}
+                    )
+                    {partitions_ctes}
+
+                    SELECT * FROM (
+                    SELECT schema_tables.schema_id, schema_tables.schema_name,
+                        schema_tables.table_id, schema_tables.table_name,
+                        array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns,
+                        array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes,
+                        array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL)
+                            as foreign_keys
+                        {partition_selects}
+                    FROM schema_tables
+                    LEFT JOIN columns ON schema_tables.table_id = columns.table_id
+                    LEFT JOIN indexes ON schema_tables.table_id = indexes.table_id
+                    LEFT JOIN constraints ON schema_tables.table_id = constraints.table_id
+                    {partition_joins}
+                    GROUP BY schema_tables.schema_id, schema_tables.schema_name, schema_tables.table_id, schema_tables.table_name
+                    ) t
+                    ;
+                """
+                self._log.debug(query)
+                cursor.execute(query)
+                yield cursor
+
+    def _get_schemas_query(self):
+        query = SCHEMA_QUERY
+        for exclude_regex in self._config.exclude_schemas:
+            query += " AND nspname !~ '{}'".format(exclude_regex)
+        if self._config.include_schemas:
+            query += f" AND ({' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas)})"
+        if self._check._config.ignore_schemas_owned_by:
+            query += " AND nspowner :: regrole :: text not IN ({})".format(
+                ", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by)
+            )
+        return query
+
+    def _get_tables_query(self):
+        if VersionUtils.transform_version(str(self._check.version))["version.major"] == "9":
+            query = PG_TABLES_QUERY_V9
+        else:
+            query = PG_TABLES_QUERY_V10_PLUS
+        for exclude_regex in self._config.exclude_tables:
+            query += " AND c.relname !~ '{}'".format(exclude_regex)
+        if self._config.include_tables:
+            query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})"
+        return query
+
+    def _get_next(self, cursor):
+        return cursor.fetchone()
+
+    def _get_all(self, cursor):
+        return cursor.fetchall()
+
+    def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject:
+        object = super()._map_row(database, cursor_row)
+        # Map the cursor row to the expected schema, and strip out None values
+        object["schemas"] = [
+            {
+                k: v
+                for k, v in {
+                    "id": str(cursor_row.get("schema_id")),
+                    "name": cursor_row.get("schema_name"),
+                    "owner": cursor_row.get("schema_owner"),
+                    "tables": [
+                        {
+                            k: v
+                            for k, v in {
+                                "id": str(cursor_row.get("table_id")),
+                                "name": cursor_row.get("table_name"),
+                                "owner": cursor_row.get("owner"),
+                                # The query can create duplicates of the joined tables
+                                "columns": list({v and v['name']: v for v in cursor_row.get("columns") or []}.values()),
+                                "indexes": list({v and v['name']: v for v in cursor_row.get("indexes") or []}.values()),
+                                "foreign_keys": list(
+                                    {v and v['name']: v for v in cursor_row.get("foreign_keys") or []}.values()
+                                ),
+                                "toast_table": cursor_row.get("toast_table"),
+                                "num_partitions": cursor_row.get("num_partitions"),
+                                "partition_key": cursor_row.get("partition_key"),
+                            }.items()
+                            if v is not None
+                        }
+                    ],
+                }.items()
+                if v is not None
+            }
+        ]
+        return object
diff --git a/datadog_checks_base/tests/base/utils/db/test_schemas.py b/datadog_checks_base/tests/base/utils/db/test_schemas.py
new file mode 100644
index 0000000000000..518e62d84222a
--- /dev/null
+++ b/datadog_checks_base/tests/base/utils/db/test_schemas.py
@@ -0,0 +1,160 @@
+# (C) Datadog, Inc. 2023-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+import pytest
+
+from datadog_checks.postgres.schemas import PostgresSchemaCollector
+
+from .common import POSTGRES_VERSION
+
+pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')]
+
+
+@pytest.fixture
+def dbm_instance(pg_instance):
+    pg_instance['dbm'] = True
+    pg_instance['min_collection_interval'] = 0.1
+    pg_instance['query_samples'] = {'enabled': False}
+    pg_instance['query_activity'] = {'enabled': False}
+    pg_instance['query_metrics'] = {'enabled': False}
+    pg_instance['collect_resources'] = {'enabled': False, 'run_sync': True}
+    pg_instance['collect_settings'] = {'enabled': False, 'run_sync': True}
+    pg_instance['collect_schemas'] = {'enabled': True, 'run_sync': True}
+    return pg_instance
+
+
+def test_get_databases(dbm_instance, integration_check):
+    check = integration_check(dbm_instance)
+    collector = PostgresSchemaCollector(check)
+
+    databases = collector._get_databases()
+    database_names = [database['name'] for database in databases]
+    assert 'postgres' in database_names
+    assert 'dogs' in database_names
+    assert 'dogs_3' in database_names
+    assert 'nope' not in database_names
+
+
+def test_databases_filters(dbm_instance, integration_check):
+    dbm_instance['collect_schemas']['exclude_databases'] = ['^dogs$', 'dogs_[345]']
+    check = integration_check(dbm_instance)
+    collector = PostgresSchemaCollector(check)
+
+    databases = collector._get_databases()
+    database_names = [database['name'] for database in databases]
+    assert 'postgres' in database_names
+    assert 'dogs' not in database_names
+    assert 'dogs_3' not in database_names
+    assert 'dogs_9' in database_names
+    assert 'nope' not in database_names
+
+
+def test_get_cursor(dbm_instance, integration_check):
+    check = integration_check(dbm_instance)
+    check.version = POSTGRES_VERSION
+    collector = PostgresSchemaCollector(check)
+
+    with collector._get_cursor('datadog_test') as cursor:
+        assert cursor is not None
+        schemas = []
+        for row in cursor:
+            schemas.append(row['schema_name'])
+
+        assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'}
+
+
+def test_schemas_filters(dbm_instance, integration_check):
+    dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test']
+    check = integration_check(dbm_instance)
+    check.version = POSTGRES_VERSION
+    collector = PostgresSchemaCollector(check)
+
+    with collector._get_cursor('datadog_test') as cursor:
+        assert cursor is not None
+        schemas = []
+        for row in cursor:
+            schemas.append(row['schema_name'])
+
+        assert set(schemas) == {'datadog', 'hstore'}
+
+
+def test_tables(dbm_instance, integration_check):
+    check = integration_check(dbm_instance)
+    check.version = POSTGRES_VERSION
+    collector = PostgresSchemaCollector(check)
+
+    with collector._get_cursor('datadog_test') as cursor:
+        assert cursor is not None
+        tables = []
+        for row in cursor:
+            if row['table_name']:
+                tables.append(row['table_name'])
+
+        assert set(tables) == {
+            'persons',
+            'personsdup1',
+            'personsdup2',
+            'personsdup3',
+            'personsdup4',
+            'personsdup5',
+            'personsdup6',
+            'personsdup7',
+            'personsdup8',
+            'personsdup9',
+            'personsdup10',
+            'personsdup11',
+            'personsdup12',
+            'personsdup13',
+            'persons_indexed',
+            'pgtable',
+            'pg_newtable',
+            'cities',
+            'rds_admin_misc',
+            'sample_foreign_d73a8c',
+        }
+
+
+def test_columns(dbm_instance, integration_check):
+    check = integration_check(dbm_instance)
+    check.version = POSTGRES_VERSION
+    collector 
= PostgresSchemaCollector(check) + + with collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + # Assert that at least one row has columns + assert any(row['columns'] for row in cursor) + for row in cursor: + if row['columns']: + for column in row['columns']: + assert column['name'] is not None + assert column['data_type'] is not None + if row['table_name'] == 'cities': + assert row['columns'] + assert row['columns'][0]['name'] + + +def test_indexes(dbm_instance, integration_check): + check = integration_check(dbm_instance) + check.version = POSTGRES_VERSION + collector = PostgresSchemaCollector(check) + + with collector._get_cursor('datadog_test') as cursor: + assert cursor is not None + # Assert that at least one row has indexes + assert any(row['indexes'] for row in cursor) + for row in cursor: + if row['indexes']: + for index in row['indexes']: + assert index['name'] is not None + assert index['definition'] is not None + if row['table_name'] == 'cities': + assert row['indexes'] + assert row['indexes'][0]['name'] + + +def test_collect_schemas(dbm_instance, integration_check): + check = integration_check(dbm_instance) + check.version = POSTGRES_VERSION + collector = PostgresSchemaCollector(check) + + collector.collect_schemas() From 96e526028f09283f963afda16e6ed49c4e052731 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 14:36:39 -0400 Subject: [PATCH 26/37] WIP --- .../datadog_checks/base/checks/db.py | 24 + .../datadog_checks/base/utils/db/schemas.py | 461 +++--------------- .../datadog_checks/base/utils/db/utils.py | 7 + .../tests/base/utils/db/test_schemas.py | 223 +++------ .../tests/base/utils/test_persistent_cache.py | 1 + 5 files changed, 171 insertions(+), 545 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/checks/db.py b/datadog_checks_base/datadog_checks/base/checks/db.py index 2a5fe0fc57551..b9fee24fbb856 100644 --- a/datadog_checks_base/datadog_checks/base/checks/db.py +++ b/datadog_checks_base/datadog_checks/base/checks/db.py @@ -20,3 +20,27 @@ def database_monitoring_metadata(self, raw_event: str): def database_monitoring_health(self, raw_event: str): self.event_platform_event(raw_event, "dbm-health") + + @property + def reported_hostname(self) -> str | None: + raise NotImplementedError("reported_hostname is not implemented for this check") + + @property + def database_identifier(self) -> str: + raise NotImplementedError("database_identifier is not implemented for this check") + + @property + def dbms_version(self) -> str: + raise NotImplementedError("dbms_version is not implemented for this check") + + @property + def agent_version(self) -> str: + raise NotImplementedError("agent_version is not implemented for this check") + + @property + def tags(self) -> list[str]: + raise NotImplementedError("tags is not implemented for this check") + + @property + def cloud_metadata(self) -> dict: + raise NotImplementedError("cloud_metadata is not implemented for this check") diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index f62ec345e6129..1eb8bf0d921d0 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -4,19 +4,15 @@ from __future__ import annotations -import contextlib -import time from abc import ABC, abstractmethod from typing import TYPE_CHECKING, TypedDict import orjson as json -from psycopg.rows import dict_row -if TYPE_CHECKING: - from 
datadog_checks.base import AgentCheck - from datadog_checks.postgres import PostgreSql +from .utils import now_ms -from datadog_checks.postgres.version_utils import VersionUtils +if TYPE_CHECKING: + from datadog_checks.base.checks.db import DatabaseCheck try: import datadog_agent @@ -25,31 +21,44 @@ class DatabaseInfo(TypedDict): - description: str name: str - id: str - encoding: str - owner: str # The schema collector sends lists of DatabaseObjects to the agent -# The format is for backwards compatibility with the current backend +# DBMS subclasses may add additional fields to the dictionary class DatabaseObject(TypedDict): - # Splat of database info - description: str name: str - id: str - encoding: str - owner: str + + +# Common configuration for schema collector +# Individual DBMS implementations should map their specific +# configuration to this type +class SchemaCollectorConfig: + def __init__(self): + self.collection_interval = 3600 + self.enabled = False + self.payload_chunk_size = 10_000 class SchemaCollector(ABC): - def __init__(self, check: AgentCheck): + """ + Abstract base class for DBM schema collectors. + + Attributes: + _collection_started_at (int): Timestamp in whole milliseconds + when the current collection started. + """ + + _collection_started_at: int | None = None + + def __init__(self, check: DatabaseCheck, config: SchemaCollectorConfig): self._check = check self._log = check.log - self._config = check._config.collect_schemas - self._row_chunk_size = 10000 - + self._config = config + self._dbms = check.__class__.__name__.lower() + if self._dbms == 'postgresql': + # Backwards compatibility for metrics namespacing + self._dbms = 'postgres' self._reset() def _reset(self): @@ -61,57 +70,54 @@ def _reset(self): def collect_schemas(self) -> bool: """ Collects and submits all applicable schema metadata to the agent. - Returns False if the previous collection was still in progress. + This class relies on the owning check to handle scheduling this method. + + This method will enforce non-overlapping invocations and + returns False if the previous collection was still in progress when invoked again. 
""" if self._collection_started_at is not None: return False status = "success" try: - self._collection_started_at = int(time.time() * 1000) + self._collection_started_at = now_ms() databases = self._get_databases() for database in databases: database_name = database['name'] if not database_name: - self._check.log("database has no name %v", database) + self._log.warning("database has no name %v", database) continue - start = time.time() with self._get_cursor(database_name) as cursor: - end = time.time() - self._log.info("Time to get cursor (%s): %s", database_name, int((end - start)*1000)) - # data = self._get_all(cursor) + # Get the next row from the cursor next = self._get_next(cursor) - start = time.time() while next: - # for i, next in enumerate(data): self._queued_rows.append(self._map_row(database, next)) self._total_rows_count += 1 + # Because we're iterating over a cursor we need to try to get + # the next row to see if we've reached the last row next = self._get_next(cursor) is_last_payload = database is databases[-1] and next is None - # is_last_payload = i == len(data) - 1 self.maybe_flush(is_last_payload) - end = time.time() - self._log.info("Time to process rows (%s): %s", database_name, int((end - start)*1000)) except Exception as e: status = "error" - self._log.error("Error collecting schema metadata: %s", e) + self._log.error("Error collecting schema: %s", e) raise e finally: self._check.histogram( - "dd.postgres.schema.time", - int(time.time() * 1000) - self._collection_started_at, + f"dd.{self._dbms}.schema.time", + now_ms() - self._collection_started_at, tags=self._check.tags + ["status:" + status], hostname=self._check.reported_hostname, raw=True, ) self._check.gauge( - "dd.postgres.schema.tables_count", + f"dd.{self._dbms}.schema.tables_count", self._total_rows_count, tags=self._check.tags + ["status:" + status], hostname=self._check.reported_hostname, raw=True, ) self._check.gauge( - "dd.postgres.schema.payloads_count", + f"dd.{self._dbms}.schema.payloads_count", self._collection_payloads_count, tags=self._check.tags + ["status:" + status], hostname=self._check.reported_hostname, @@ -128,19 +134,22 @@ def base_event(self): "database_instance": self._check.database_identifier, "agent_version": datadog_agent.get_version(), "collection_interval": self._config.collection_interval, - "dbms_version": str(self._check.version), + "dbms_version": str(self._check.dbms_version), "tags": self._check.tags, "cloud_metadata": self._check.cloud_metadata, "collection_started_at": self._collection_started_at, } def maybe_flush(self, is_last_payload): - if len(self._queued_rows) > self._row_chunk_size or is_last_payload: + if is_last_payload or len(self._queued_rows) >= self._config.payload_chunk_size: event = self.base_event.copy() - event['timestamp'] = int(time.time() * 1000) + event["timestamp"] = now_ms() + # DBM backend expects metadata to be an array of database objects event["metadata"] = self._queued_rows self._collection_payloads_count += 1 if is_last_payload: + # For the last payload, we need to include the total number of payloads collected + # This is used for snapshotting to ensure that all payloads have been received event["collection_payloads_count"] = self._collection_payloads_count self._check.database_monitoring_metadata(json.dumps(event)) @@ -148,364 +157,32 @@ def maybe_flush(self, is_last_payload): @abstractmethod def _get_databases(self) -> list[DatabaseInfo]: - pass + """ + Returns a list of database dictionaries. 
+ Subclasses should override this method to return the list of databases to collect schema metadata for. + """ + raise NotImplementedError("Subclasses must implement _get_databases") @abstractmethod def _get_cursor(self, database): - pass + """ + Returns a cursor for the given database. + Subclasses should override this method to return the cursor for the given database. + """ + raise NotImplementedError("Subclasses must implement _get_cursor") @abstractmethod def _get_next(self, cursor): - pass - - @abstractmethod - def _get_all(self, cursor): - pass + """ + Returns the next row from the cursor. + Subclasses should override this method to return the next row from the cursor. + """ + raise NotImplementedError("Subclasses must implement _get_next") - @abstractmethod - def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: + def _map_row(self, database: DatabaseInfo, _cursor_row) -> DatabaseObject: """ Maps a cursor row to a dict that matches the schema expected by DBM. + The base implementation of this method returns just the database dictionary. + Subclasses should override this method to add schema and table data based on the cursor row. """ - return { - **database, - "id": str(database["id"]), #Case id into string as expected by backend - } - - -PG_TABLES_QUERY_V10_PLUS = """ -SELECT c.oid AS table_id, - c.relnamespace AS schema_id, - c.relname AS table_name, - c.relhasindex AS has_indexes, - c.relowner :: regrole AS owner, - ( CASE - WHEN c.relkind = 'p' THEN TRUE - ELSE FALSE - END ) AS has_partitions, - t.relname AS toast_table -FROM pg_class c - left join pg_class t - ON c.reltoastrelid = t.oid -WHERE c.relkind IN ( 'r', 'p', 'f' ) - AND c.relispartition != 't' -""" - -PG_TABLES_QUERY_V9 = """ -SELECT c.oid AS table_id, - c.relnamespace AS schema_id, - c.relname AS table_name, - c.relhasindex AS has_indexes, - c.relowner :: regrole AS owner, - t.relname AS toast_table -FROM pg_class c - left join pg_class t - ON c.reltoastrelid = t.oid -WHERE c.relkind IN ( 'r', 'f' ) -""" - - -SCHEMA_QUERY = """ -SELECT nsp.oid AS schema_id, - nspname AS schema_name, - nspowner :: regrole AS schema_owner -FROM pg_namespace nsp - LEFT JOIN pg_roles r on nsp.nspowner = r.oid -WHERE nspname NOT IN ( 'information_schema', 'pg_catalog' ) - AND nspname NOT LIKE 'pg_toast%' - AND nspname NOT LIKE 'pg_temp_%' -""" - -COLUMNS_QUERY = """ -SELECT attname AS name, - Format_type(atttypid, atttypmod) AS data_type, - NOT attnotnull AS nullable, - pg_get_expr(adbin, adrelid) AS default, - attrelid AS table_id -FROM pg_attribute - LEFT JOIN pg_attrdef ad - ON adrelid = attrelid - AND adnum = attnum -WHERE attnum > 0 - AND NOT attisdropped -""" - - -PG_INDEXES_QUERY = """ -SELECT - c.relname AS name, - ix.indrelid AS table_id, - pg_get_indexdef(c.oid) AS definition, - ix.indisunique AS is_unique, - ix.indisexclusion AS is_exclusion, - ix.indimmediate AS is_immediate, - ix.indisclustered AS is_clustered, - ix.indisvalid AS is_valid, - ix.indcheckxmin AS is_checkxmin, - ix.indisready AS is_ready, - ix.indislive AS is_live, - ix.indisreplident AS is_replident, - ix.indpred IS NOT NULL AS is_partial -FROM - pg_index ix -JOIN - pg_class c -ON - c.oid = ix.indexrelid -""" - - -PG_CONSTRAINTS_QUERY = """ -SELECT conname AS name, - pg_get_constraintdef(oid) AS definition, - conrelid AS table_id -FROM pg_constraint -WHERE contype = 'f' -""" - - -PARTITION_KEY_QUERY = """ -SELECT relname, - pg_get_partkeydef(oid) AS partition_key, - oid AS table_id -FROM pg_class -""" - -NUM_PARTITIONS_QUERY = """ -SELECT 
count(inhrelid :: regclass) AS num_partitions, inhparent as table_id -FROM pg_inherits -GROUP BY inhparent; -""" - -PARTITION_ACTIVITY_QUERY = """ -SELECT pi.inhparent :: regclass AS parent_table_name, - SUM(COALESCE(psu.seq_scan, 0) + COALESCE(psu.idx_scan, 0)) AS total_activity, - pi.inhparent as table_id -FROM pg_catalog.pg_stat_user_tables psu - join pg_class pc - ON psu.relname = pc.relname - join pg_inherits pi - ON pi.inhrelid = pc.oid -GROUP BY pi.inhparent -""" - - -class TableObject(TypedDict): - id: str - name: str - columns: list - indexes: list - foreign_keys: list - - -class SchemaObject(TypedDict): - id: str - name: str - owner: str - tables: list[TableObject] - - -class PostgresDatabaseObject(DatabaseObject): - schemas: list[SchemaObject] - - -DATABASE_INFORMATION_QUERY = """ -SELECT db.oid AS id, - datname AS NAME, - pg_encoding_to_char(encoding) AS encoding, - rolname AS owner, - description -FROM pg_catalog.pg_database db - LEFT JOIN pg_catalog.pg_description dc - ON dc.objoid = db.oid - JOIN pg_roles a - ON datdba = a.oid - WHERE datname NOT LIKE 'template%' -""" - - -class PostgresSchemaCollector(SchemaCollector): - def __init__(self, check: PostgreSql): - super().__init__(check) - self._check = check - self._config = check._config.collect_schemas - - @property - def base_event(self): - return { - **super().base_event, - "dbms": "postgres", - "kind": "pg_databases", - } - - def _get_databases(self): - with self._check._get_main_db() as conn: - with conn.cursor(row_factory=dict_row) as cursor: - query = DATABASE_INFORMATION_QUERY - for exclude_regex in self._config.exclude_databases: - query += " AND datname !~ '{}'".format(exclude_regex) - if self._config.include_databases: - query += f" AND ({' OR '.join(f"datname ~ '{include_regex}'" for include_regex in self._config.include_databases)})" - - # Autodiscovery trumps exclude and include - autodiscovery_databases = self._check.autodiscovery.get_items() - if autodiscovery_databases: - query += " AND datname IN ({})".format(", ".join(f"'{db}'" for db in autodiscovery_databases)) - - cursor.execute(query) - return cursor.fetchall() - - @contextlib.contextmanager - def _get_cursor(self, database_name): - with self._check.db_pool.get_connection(database_name) as conn: - with conn.cursor(row_factory=dict_row) as cursor: - schemas_query = self._get_schemas_query() - tables_query = self._get_tables_query() - columns_query = COLUMNS_QUERY - indexes_query = PG_INDEXES_QUERY - constraints_query = PG_CONSTRAINTS_QUERY - partitions_ctes = ( - f""" - , - partition_keys AS ( - {PARTITION_KEY_QUERY} - ), - num_partitions AS ( - {NUM_PARTITIONS_QUERY} - ) - """ - if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" - else "" - ) - partition_joins = ( - """ - LEFT JOIN partition_keys ON tables.table_id = partition_keys.table_id - LEFT JOIN num_partitions ON tables.table_id = num_partitions.table_id - """ - if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" - else "" - ) - parition_selects = ( - """ - , - partition_keys.partition_key, - num_partitions.num_partitions - """ - if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" - else "" - ) - limit = int(self._config.max_tables or 1_000_000) - - query = f""" - WITH - schemas AS( - {schemas_query} - ), - tables AS ( - {tables_query} - ), - schema_tables AS ( - SELECT schemas.schema_id, schemas.schema_name, - tables.table_id, tables.table_name - FROM schemas - LEFT JOIN tables ON schemas.schema_id = 
tables.schema_id - ORDER BY schemas.schema_name, tables.table_name - LIMIT {limit} - ), - columns AS ( - {columns_query} - ), - indexes AS ( - {indexes_query} - ), - constraints AS ( - {constraints_query} - ) - {partitions_ctes} - - SELECT * FROM ( - SELECT schema_tables.schema_id, schema_tables.schema_name, - schema_tables.table_id, schema_tables.table_name, - array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns, - array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes, - array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL) - as foreign_keys - {parition_selects} - FROM schema_tables - LEFT JOIN columns ON schema_tables.table_id = columns.table_id - LEFT JOIN indexes ON schema_tables.table_id = indexes.table_id - LEFT JOIN constraints ON schema_tables.table_id = constraints.table_id - {partition_joins} - GROUP BY schema_tables.schema_id, schema_tables.schema_name, schema_tables.table_id, schema_tables.table_name - ) t - ; - """ - print(query) - cursor.execute(query) - yield cursor - - def _get_schemas_query(self): - query = SCHEMA_QUERY - for exclude_regex in self._config.exclude_schemas: - query += " AND nspname !~ '{}'".format(exclude_regex) - if self._config.include_schemas: - query += f" AND ({' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas)})" - if self._check._config.ignore_schemas_owned_by: - query += " AND nspowner :: regrole :: text not IN ({})".format( - ", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by) - ) - return query - - def _get_tables_query(self): - if VersionUtils.transform_version(str(self._check.version))["version.major"] == "9": - query = PG_TABLES_QUERY_V9 - else: - query = PG_TABLES_QUERY_V10_PLUS - for exclude_regex in self._config.exclude_tables: - query += " AND c.relname !~ '{}'".format(exclude_regex) - if self._config.include_tables: - query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})" - return query - - - def _get_next(self, cursor): - return cursor.fetchone() - - def _get_all(self, cursor): - return cursor.fetchall() - - def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: - object = super()._map_row(database, cursor_row) - # Map the cursor row to the expected schema, and strip out None values - object["schemas"] = [ - { - k: v - for k, v in { - "id": str(cursor_row.get("schema_id")), - "name": cursor_row.get("schema_name"), - "owner": cursor_row.get("schema_owner"), - "tables": [ - { - k: v - for k, v in { - "id": str(cursor_row.get("table_id")), - "name": cursor_row.get("table_name"), - "owner": cursor_row.get("owner"), - # The query can create duplicates of the joined tables - "columns": list({v and v['name']: v for v in cursor_row.get("columns") or []}.values()), - "indexes": list({v and v['name']: v for v in cursor_row.get("indexes") or []}.values()), - "foreign_keys": list( - {v and v['name']: v for v in cursor_row.get("foreign_keys") or []}.values() - ), - "toast_table": cursor_row.get("toast_table"), - "num_partitions": cursor_row.get("num_partitions"), - "partition_key": cursor_row.get("partition_key"), - }.items() - if v is not None - } - ], - }.items() - if v is not None - } - ] - return object + return {**database} diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index 0c46a26cff82e..3114dbb1a3632 100644 --- 
a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -590,3 +590,10 @@ def get_tags(self) -> List[str]: # Generate and cache regular tags self._cached_tag_list = self._generate_tag_strings(self._tags) return list(self._cached_tag_list) + + +def now_ms() -> int: + """ + Get the current time in whole milliseconds. + """ + return int(time.time() * 1000) diff --git a/datadog_checks_base/tests/base/utils/db/test_schemas.py b/datadog_checks_base/tests/base/utils/db/test_schemas.py index 518e62d84222a..4045f99c06b61 100644 --- a/datadog_checks_base/tests/base/utils/db/test_schemas.py +++ b/datadog_checks_base/tests/base/utils/db/test_schemas.py @@ -1,160 +1,77 @@ # (C) Datadog, Inc. 2023-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from contextlib import contextmanager + import pytest -from datadog_checks.postgres.schemas import PostgresSchemaCollector - -from .common import POSTGRES_VERSION - -pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')] - - -@pytest.fixture -def dbm_instance(pg_instance): - pg_instance['dbm'] = True - pg_instance['min_collection_interval'] = 0.1 - pg_instance['query_samples'] = {'enabled': False} - pg_instance['query_activity'] = {'enabled': False} - pg_instance['query_metrics'] = {'enabled': False} - pg_instance['collect_resources'] = {'enabled': False, 'run_sync': True} - pg_instance['collect_settings'] = {'enabled': False, 'run_sync': True} - pg_instance['collect_schemas'] = {'enabled': True, 'run_sync': True} - return pg_instance - - -def test_get_databases(dbm_instance, integration_check): - check = integration_check(dbm_instance) - collector = PostgresSchemaCollector(check) - - databases = collector._get_databases() - datbase_names = [database['name'] for database in databases] - assert 'postgres' in datbase_names - assert 'dogs' in datbase_names - assert 'dogs_3' in datbase_names - assert 'nope' not in datbase_names - - -def test_databases_filters(dbm_instance, integration_check): - dbm_instance['collect_schemas']['exclude_databases'] = ['^dogs$', 'dogs_[345]'] - check = integration_check(dbm_instance) - collector = PostgresSchemaCollector(check) - - databases = collector._get_databases() - datbase_names = [database['name'] for database in databases] - assert 'postgres' in datbase_names - assert 'dogs' not in datbase_names - assert 'dogs_3' not in datbase_names - assert 'dogs_9' in datbase_names - assert 'nope' not in datbase_names - - -def test_get_cursor(dbm_instance, integration_check): - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - schemas = [] - for row in cursor: - schemas.append(row['schema_name']) - - assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} - - -def test_schemas_filters(dbm_instance, integration_check): - dbm_instance['collect_schemas']['exclude_schemas'] = ['public', 'rdsadmin_test'] - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - schemas = [] - for row in cursor: - schemas.append(row['schema_name']) - - assert set(schemas) == {'datadog', 'hstore'} - - -def test_tables(dbm_instance, integration_check): - check = integration_check(dbm_instance) - 
check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - tables = [] - for row in cursor: - if row['table_name']: - tables.append(row['table_name']) - - assert set(tables) == { - 'persons', - 'personsdup1', - 'personsdup2', - 'personsdup3', - 'personsdup4', - 'personsdup5', - 'personsdup6', - 'personsdup7', - 'personsdup8', - 'personsdup9', - 'personsdup10', - 'personsdup11', - 'personsdup12', - 'personsdup13', - 'persons_indexed', - 'pgtable', - 'pg_newtable', - 'cities', - 'rds_admin_misc', - 'sample_foreign_d73a8c', - } - - -def test_columns(dbm_instance, integration_check): - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - # Assert that at least one row has columns - assert any(row['columns'] for row in cursor) - for row in cursor: - if row['columns']: - for column in row['columns']: - assert column['name'] is not None - assert column['data_type'] is not None - if row['table_name'] == 'cities': - assert row['columns'] - assert row['columns'][0]['name'] - - -def test_indexes(dbm_instance, integration_check): - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - # Assert that at least one row has indexes - assert any(row['indexes'] for row in cursor) - for row in cursor: - if row['indexes']: - for index in row['indexes']: - assert index['name'] is not None - assert index['definition'] is not None - if row['table_name'] == 'cities': - assert row['indexes'] - assert row['indexes'][0]['name'] - - -def test_collect_schemas(dbm_instance, integration_check): - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) +from datadog_checks.base.checks.db import DatabaseCheck +from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig + + +class TestDatabaseCheck(DatabaseCheck): + __test__ = False + def __init__(self): + super().__init__() + self._reported_hostname = "test_hostname" + self._database_identifier = "test_database_identifier" + self._dbms_version = "test_dbms_version" + self._agent_version = "test_agent_version" + self._tags = ["test_tag"] + self._cloud_metadata = {"test_cloud_metadata": "test_cloud_metadata"} + + @property + def reported_hostname(self): + return self._reported_hostname + + @property + def database_identifier(self): + return self._database_identifier + + @property + def dbms_version(self): + return self._dbms_version + + @property + def agent_version(self): + return self._agent_version + + @property + def tags(self): + return self._tags + + @property + def cloud_metadata(self): + return self._cloud_metadata + + +class TestSchemaCollector(SchemaCollector): + __test__ = False + def __init__(self, check: DatabaseCheck, config: SchemaCollectorConfig): + super().__init__(check, config) + self._row_index = 0 + self._rows = [{'table_name': 'test_table'}] + + def _get_databases(self): + return [{'name': 'test_database'}] + + @contextmanager + def _get_cursor(self, database: str): + yield {} + + def _get_next(self, _cursor): + if self._row_index < len(self._rows): + row = self._rows[self._row_index] + self._row_index += 1 + return row + return None + + def 
_map_row(self, database: str, cursor_row: dict): + return {**database} + +@pytest.mark.unit +def test_schema_collector(): + check = TestDatabaseCheck() + collector = TestSchemaCollector(check, SchemaCollectorConfig()) collector.collect_schemas() diff --git a/datadog_checks_base/tests/base/utils/test_persistent_cache.py b/datadog_checks_base/tests/base/utils/test_persistent_cache.py index 3feeaaa274194..66bda1ee24434 100644 --- a/datadog_checks_base/tests/base/utils/test_persistent_cache.py +++ b/datadog_checks_base/tests/base/utils/test_persistent_cache.py @@ -40,6 +40,7 @@ def cache_id(check: AgentCheck) -> str: class TestCheck(AgentCheck): + __test__ = False def check(self, instance): pass From 04f8163b81f211370907f7081556101a41f1f62b Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 14:43:35 -0400 Subject: [PATCH 27/37] WIP --- .../datadog_checks/base/checks/db.py | 4 --- .../datadog_checks/base/utils/db/schemas.py | 9 +++++++ .../tests/base/utils/db/test_schemas.py | 27 +++++++++++++++++-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/checks/db.py b/datadog_checks_base/datadog_checks/base/checks/db.py index b9fee24fbb856..7b1c92ea41fbb 100644 --- a/datadog_checks_base/datadog_checks/base/checks/db.py +++ b/datadog_checks_base/datadog_checks/base/checks/db.py @@ -33,10 +33,6 @@ def database_identifier(self) -> str: def dbms_version(self) -> str: raise NotImplementedError("dbms_version is not implemented for this check") - @property - def agent_version(self) -> str: - raise NotImplementedError("agent_version is not implemented for this check") - @property def tags(self) -> list[str]: raise NotImplementedError("tags is not implemented for this check") diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index 1eb8bf0d921d0..2c7ce54dea383 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -132,6 +132,7 @@ def base_event(self): return { "host": self._check.reported_hostname, "database_instance": self._check.database_identifier, + "kind": self.kind, "agent_version": datadog_agent.get_version(), "collection_interval": self._config.collection_interval, "dbms_version": str(self._check.dbms_version), @@ -155,7 +156,15 @@ def maybe_flush(self, is_last_payload): self._queued_rows = [] + @property @abstractmethod + def kind(self) -> str: + """ + Returns the kind property of the schema metadata event. + Subclasses should override this property to return the kind of schema being collected. + """ + raise NotImplementedError("Subclasses must implement kind") + def _get_databases(self) -> list[DatabaseInfo]: """ Returns a list of database dictionaries. 
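
For illustration only (not part of this patch series): the abstract contract above, kind, _get_databases, _get_cursor, _get_next, and _map_row, can be satisfied by a very small concrete collector. In the sketch below, the sqlite3 backing store, the example.db path, and the SQLiteSchemaCollector name are all assumptions introduced for the example:

    import contextlib
    import sqlite3

    from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig


    class SQLiteSchemaCollector(SchemaCollector):
        """Hypothetical minimal collector, shown only to illustrate the base class contract."""

        @property
        def kind(self):
            # Payload kind consumed by the DBM metadata intake
            return "sqlite_databases"

        def _get_databases(self):
            # SQLite exposes a single logical database per connection
            return [{"name": "main"}]

        @contextlib.contextmanager
        def _get_cursor(self, database):
            conn = sqlite3.connect("example.db")  # assumed path, for illustration
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
                yield cursor
            finally:
                conn.close()

        def _get_next(self, cursor):
            # fetchone() returns None after the last row, which is how
            # collect_schemas() decides it has reached the final payload
            return cursor.fetchone()

        def _map_row(self, database, cursor_row):
            # One DatabaseObject per cursor row; the base class batches these
            # and flushes them in payload_chunk_size chunks via maybe_flush()
            return {**database, "tables": [{"name": cursor_row[0]}]}

A check would then construct SQLiteSchemaCollector(check, SchemaCollectorConfig()) and invoke collect_schemas() on its own schedule, since the base class deliberately leaves scheduling to the owning check.
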
diff --git a/datadog_checks_base/tests/base/utils/db/test_schemas.py b/datadog_checks_base/tests/base/utils/db/test_schemas.py index 4045f99c06b61..d064417ef4259 100644 --- a/datadog_checks_base/tests/base/utils/db/test_schemas.py +++ b/datadog_checks_base/tests/base/utils/db/test_schemas.py @@ -8,6 +8,11 @@ from datadog_checks.base.checks.db import DatabaseCheck from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig +try: + import datadog_agent # type: ignore +except ImportError: + from datadog_checks.base.stubs import datadog_agent + class TestDatabaseCheck(DatabaseCheck): __test__ = False @@ -67,11 +72,29 @@ def _get_next(self, _cursor): return None def _map_row(self, database: str, cursor_row: dict): - return {**database} + return {**database, "tables": [cursor_row]} + + @property + def kind(self): + return "test_databases" @pytest.mark.unit -def test_schema_collector(): +def test_schema_collector(aggregator): check = TestDatabaseCheck() collector = TestSchemaCollector(check, SchemaCollectorConfig()) collector.collect_schemas() + + events = aggregator.get_event_platform_events("dbm-metadata") + assert len(events) == 1 + event = events[0] + assert event['kind'] == collector.kind + assert event['host'] == check.reported_hostname + assert event['database_instance'] == check.database_identifier + assert event['agent_version'] == datadog_agent.get_version() + assert event['collection_interval'] == collector._config.collection_interval + assert event['dbms_version'] == check.dbms_version + assert event['tags'] == check.tags + assert event['cloud_metadata'] == check.cloud_metadata + assert event['metadata'][0]['name'] == 'test_database' + assert event['metadata'][0]['tables'][0]['table_name'] == 'test_table' From 4624b88150c6b6baa6c3f1efbcded9cd1a8abed0 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 14:48:34 -0400 Subject: [PATCH 28/37] Changelog --- datadog_checks_base/changelog.d/21720.added | 1 + datadog_checks_base/tests/base/utils/db/test_schemas.py | 4 +++- datadog_checks_base/tests/base/utils/test_persistent_cache.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 datadog_checks_base/changelog.d/21720.added diff --git a/datadog_checks_base/changelog.d/21720.added b/datadog_checks_base/changelog.d/21720.added new file mode 100644 index 0000000000000..951cfcdc5b176 --- /dev/null +++ b/datadog_checks_base/changelog.d/21720.added @@ -0,0 +1 @@ +Create shared schemas collector for the Postgres, MySQL, and SQL Server integrations diff --git a/datadog_checks_base/tests/base/utils/db/test_schemas.py b/datadog_checks_base/tests/base/utils/db/test_schemas.py index d064417ef4259..8b45c5e56a335 100644 --- a/datadog_checks_base/tests/base/utils/db/test_schemas.py +++ b/datadog_checks_base/tests/base/utils/db/test_schemas.py @@ -9,13 +9,14 @@ from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig try: - import datadog_agent # type: ignore + import datadog_agent # type: ignore except ImportError: from datadog_checks.base.stubs import datadog_agent class TestDatabaseCheck(DatabaseCheck): __test__ = False + def __init__(self): super().__init__() self._reported_hostname = "test_hostname" @@ -52,6 +53,7 @@ def cloud_metadata(self): class TestSchemaCollector(SchemaCollector): __test__ = False + def __init__(self, check: DatabaseCheck, config: SchemaCollectorConfig): super().__init__(check, config) self._row_index = 0 diff --git a/datadog_checks_base/tests/base/utils/test_persistent_cache.py 
b/datadog_checks_base/tests/base/utils/test_persistent_cache.py index 66bda1ee24434..56cc8b73e9802 100644 --- a/datadog_checks_base/tests/base/utils/test_persistent_cache.py +++ b/datadog_checks_base/tests/base/utils/test_persistent_cache.py @@ -41,6 +41,7 @@ def cache_id(check: AgentCheck) -> str: class TestCheck(AgentCheck): __test__ = False + def check(self, instance): pass From a68f875e8494b4cd85527b74116b06989eaad06b Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 14:49:37 -0400 Subject: [PATCH 29/37] Warning --- datadog_checks_base/datadog_checks/base/utils/db/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index 2c7ce54dea383..72205e7bee419 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -15,7 +15,7 @@ from datadog_checks.base.checks.db import DatabaseCheck try: - import datadog_agent + import datadog_agent # type: ignore except ImportError: from datadog_checks.base.stubs import datadog_agent From aa0e0ddbeb8437f122f901cbf694458ef9181144 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 14:50:11 -0400 Subject: [PATCH 30/37] Remove unused --- datadog_checks_base/datadog_checks/base/utils/db/schemas.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index 72205e7bee419..be59c63e22bab 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -19,7 +19,6 @@ except ImportError: from datadog_checks.base.stubs import datadog_agent - class DatabaseInfo(TypedDict): name: str @@ -36,7 +35,6 @@ class DatabaseObject(TypedDict): class SchemaCollectorConfig: def __init__(self): self.collection_interval = 3600 - self.enabled = False self.payload_chunk_size = 10_000 From 3c6489682bcbf705c761a7b8ef9de27e5a550cac Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 14:54:09 -0400 Subject: [PATCH 31/37] Lint --- datadog_checks_base/datadog_checks/base/utils/db/schemas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index be59c63e22bab..0e0b34c7a90a6 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -15,10 +15,11 @@ from datadog_checks.base.checks.db import DatabaseCheck try: - import datadog_agent # type: ignore + import datadog_agent # type: ignore except ImportError: from datadog_checks.base.stubs import datadog_agent + class DatabaseInfo(TypedDict): name: str From 69ede0d87d897a194ed98db9b2509bcb65bcb9d7 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 15:02:40 -0400 Subject: [PATCH 32/37] Use base --- postgres/datadog_checks/postgres/schemas.py | 149 +------------------- 1 file changed, 6 insertions(+), 143 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index f62ec345e6129..9fe2cd5639cbd 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -5,25 +5,16 @@ from __future__ import annotations import contextlib -import time -from abc 
import ABC, abstractmethod from typing import TYPE_CHECKING, TypedDict -import orjson as json from psycopg.rows import dict_row if TYPE_CHECKING: - from datadog_checks.base import AgentCheck from datadog_checks.postgres import PostgreSql +from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig from datadog_checks.postgres.version_utils import VersionUtils -try: - import datadog_agent -except ImportError: - from datadog_checks.base.stubs import datadog_agent - - class DatabaseInfo(TypedDict): description: str name: str @@ -43,136 +34,6 @@ class DatabaseObject(TypedDict): owner: str -class SchemaCollector(ABC): - def __init__(self, check: AgentCheck): - self._check = check - self._log = check.log - self._config = check._config.collect_schemas - self._row_chunk_size = 10000 - - self._reset() - - def _reset(self): - self._collection_started_at = None - self._collection_payloads_count = 0 - self._queued_rows = [] - self._total_rows_count = 0 - - def collect_schemas(self) -> bool: - """ - Collects and submits all applicable schema metadata to the agent. - Returns False if the previous collection was still in progress. - """ - if self._collection_started_at is not None: - return False - status = "success" - try: - self._collection_started_at = int(time.time() * 1000) - databases = self._get_databases() - for database in databases: - database_name = database['name'] - if not database_name: - self._check.log("database has no name %v", database) - continue - start = time.time() - with self._get_cursor(database_name) as cursor: - end = time.time() - self._log.info("Time to get cursor (%s): %s", database_name, int((end - start)*1000)) - # data = self._get_all(cursor) - next = self._get_next(cursor) - start = time.time() - while next: - # for i, next in enumerate(data): - self._queued_rows.append(self._map_row(database, next)) - self._total_rows_count += 1 - next = self._get_next(cursor) - is_last_payload = database is databases[-1] and next is None - # is_last_payload = i == len(data) - 1 - self.maybe_flush(is_last_payload) - end = time.time() - self._log.info("Time to process rows (%s): %s", database_name, int((end - start)*1000)) - except Exception as e: - status = "error" - self._log.error("Error collecting schema metadata: %s", e) - raise e - finally: - self._check.histogram( - "dd.postgres.schema.time", - int(time.time() * 1000) - self._collection_started_at, - tags=self._check.tags + ["status:" + status], - hostname=self._check.reported_hostname, - raw=True, - ) - self._check.gauge( - "dd.postgres.schema.tables_count", - self._total_rows_count, - tags=self._check.tags + ["status:" + status], - hostname=self._check.reported_hostname, - raw=True, - ) - self._check.gauge( - "dd.postgres.schema.payloads_count", - self._collection_payloads_count, - tags=self._check.tags + ["status:" + status], - hostname=self._check.reported_hostname, - raw=True, - ) - - self._reset() - return True - - @property - def base_event(self): - return { - "host": self._check.reported_hostname, - "database_instance": self._check.database_identifier, - "agent_version": datadog_agent.get_version(), - "collection_interval": self._config.collection_interval, - "dbms_version": str(self._check.version), - "tags": self._check.tags, - "cloud_metadata": self._check.cloud_metadata, - "collection_started_at": self._collection_started_at, - } - - def maybe_flush(self, is_last_payload): - if len(self._queued_rows) > self._row_chunk_size or is_last_payload: - event = self.base_event.copy() - 
event['timestamp'] = int(time.time() * 1000) - event["metadata"] = self._queued_rows - self._collection_payloads_count += 1 - if is_last_payload: - event["collection_payloads_count"] = self._collection_payloads_count - self._check.database_monitoring_metadata(json.dumps(event)) - - self._queued_rows = [] - - @abstractmethod - def _get_databases(self) -> list[DatabaseInfo]: - pass - - @abstractmethod - def _get_cursor(self, database): - pass - - @abstractmethod - def _get_next(self, cursor): - pass - - @abstractmethod - def _get_all(self, cursor): - pass - - @abstractmethod - def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: - """ - Maps a cursor row to a dict that matches the schema expected by DBM. - """ - return { - **database, - "id": str(database["id"]), #Case id into string as expected by backend - } - - PG_TABLES_QUERY_V10_PLUS = """ SELECT c.oid AS table_id, c.relnamespace AS schema_id, @@ -325,10 +186,12 @@ class PostgresDatabaseObject(DatabaseObject): class PostgresSchemaCollector(SchemaCollector): + _check: PostgreSql + def __init__(self, check: PostgreSql): - super().__init__(check) - self._check = check - self._config = check._config.collect_schemas + config = SchemaCollectorConfig() + config.collection_interval = check._config.collect_schemas.collection_interval + super().__init__(check, config) @property def base_event(self): From 7cddaec042d710cf55cc5b6984a88c9426e6aecf Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Tue, 21 Oct 2025 15:04:48 -0400 Subject: [PATCH 33/37] Lint --- postgres/datadog_checks/postgres/schemas.py | 4 ++-- postgres/tests/test_metadata.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 9fe2cd5639cbd..bbc62064219da 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -15,6 +15,7 @@ from datadog_checks.base.utils.db.schemas import SchemaCollector, SchemaCollectorConfig from datadog_checks.postgres.version_utils import VersionUtils + class DatabaseInfo(TypedDict): description: str name: str @@ -312,7 +313,7 @@ def _get_schemas_query(self): for exclude_regex in self._config.exclude_schemas: query += " AND nspname !~ '{}'".format(exclude_regex) if self._config.include_schemas: - query += f" AND ({' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas)})" + query += f" AND ({' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas)})" if self._check._config.ignore_schemas_owned_by: query += " AND nspowner :: regrole :: text not IN ({})".format( ", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by) @@ -330,7 +331,6 @@ def _get_tables_query(self): query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})" return query - def _get_next(self, cursor): return cursor.fetchone() diff --git a/postgres/tests/test_metadata.py b/postgres/tests/test_metadata.py index 84b48483d28b1..3677d18f2d67a 100644 --- a/postgres/tests/test_metadata.py +++ b/postgres/tests/test_metadata.py @@ -448,7 +448,6 @@ def test_collect_schemas_max_tables(integration_check, dbm_instance, aggregator) assert len(database_metadata[0]['schemas'][0]['tables']) <= 1 - def test_collect_schemas_multiple_payloads(integration_check, dbm_instance, aggregator): dbm_instance["collect_schemas"] = {'enabled': True, 'collection_interval': 0.5} 
dbm_instance['relations'] = [] From ed5a7eebb66a920dee8a3b8b33a7ff07f1adb1b7 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 22 Oct 2025 09:01:42 -0400 Subject: [PATCH 34/37] Lint --- postgres/datadog_checks/postgres/schemas.py | 15 +++++++++++---- postgres/tests/test_metadata.py | 4 +--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index bbc62064219da..98c7204be8ab3 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -209,7 +209,9 @@ def _get_databases(self): for exclude_regex in self._config.exclude_databases: query += " AND datname !~ '{}'".format(exclude_regex) if self._config.include_databases: - query += f" AND ({' OR '.join(f"datname ~ '{include_regex}'" for include_regex in self._config.include_databases)})" + query += f" AND ({ + ' OR '.join(f"datname ~ '{include_regex}'" for include_regex in self._config.include_databases) + })" # Autodiscovery trumps exclude and include autodiscovery_databases = self._check.autodiscovery.get_items() @@ -300,7 +302,8 @@ def _get_cursor(self, database_name): LEFT JOIN indexes ON schema_tables.table_id = indexes.table_id LEFT JOIN constraints ON schema_tables.table_id = constraints.table_id {partition_joins} - GROUP BY schema_tables.schema_id, schema_tables.schema_name, schema_tables.table_id, schema_tables.table_name + GROUP BY schema_tables.schema_id, schema_tables.schema_name, + schema_tables.table_id, schema_tables.table_name ) t ; """ @@ -313,7 +316,9 @@ def _get_schemas_query(self): for exclude_regex in self._config.exclude_schemas: query += " AND nspname !~ '{}'".format(exclude_regex) if self._config.include_schemas: - query += f" AND ({' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas)})" + query += f" AND ({ + ' OR '.join(f"nspname ~ '{include_regex}'" for include_regex in self._config.include_schemas) + })" if self._check._config.ignore_schemas_owned_by: query += " AND nspowner :: regrole :: text not IN ({})".format( ", ".join(f"'{owner}'" for owner in self._check._config.ignore_schemas_owned_by) @@ -328,7 +333,9 @@ def _get_tables_query(self): for exclude_regex in self._config.exclude_tables: query += " AND c.relname !~ '{}'".format(exclude_regex) if self._config.include_tables: - query += f" AND ({' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables)})" + query += f" AND ({ + ' OR '.join(f"c.relname ~ '{include_regex}'" for include_regex in self._config.include_tables) + })" return query def _get_next(self, cursor): diff --git a/postgres/tests/test_metadata.py b/postgres/tests/test_metadata.py index 3677d18f2d67a..4f3f02b6580cd 100644 --- a/postgres/tests/test_metadata.py +++ b/postgres/tests/test_metadata.py @@ -1,16 +1,14 @@ # (C) Datadog, Inc. 
2023-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -import pprint from concurrent.futures.thread import ThreadPoolExecutor from typing import List -import mock import pytest from datadog_checks.base.utils.db.utils import DBMAsyncJob -from .common import POSTGRES_LOCALE, POSTGRES_VERSION +from .common import POSTGRES_VERSION from .utils import run_one_check pytestmark = [pytest.mark.integration, pytest.mark.usefixtures('dd_environment')] From df4cf937b6c79c5d48baf028643d98284656b222 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 22 Oct 2025 12:42:46 -0400 Subject: [PATCH 35/37] Query tables separately --- .../datadog_checks/base/utils/db/schemas.py | 8 ++ postgres/datadog_checks/postgres/postgres.py | 6 +- postgres/datadog_checks/postgres/schemas.py | 84 +++++++++---------- postgres/tests/test_schemas.py | 78 ++++++++--------- 4 files changed, 92 insertions(+), 84 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index 0e0b34c7a90a6..2013b11fd8ea8 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -37,6 +37,13 @@ class SchemaCollectorConfig: def __init__(self): self.collection_interval = 3600 self.payload_chunk_size = 10_000 + self.max_tables = 300 + self.include_databases = None + self.exclude_databases = None + self.include_schemas = None + self.exclude_schemas = None + self.include_tables = None + self.exclude_tables = None class SchemaCollector(ABC): @@ -134,6 +141,7 @@ def base_event(self): "kind": self.kind, "agent_version": datadog_agent.get_version(), "collection_interval": self._config.collection_interval, + "dbms": self._dbms, "dbms_version": str(self._check.dbms_version), "tags": self._check.tags, "cloud_metadata": self._check.cloud_metadata, diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index 8ab12772ea0b3..e4db6ddd8bf3d 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -1031,6 +1031,10 @@ def _report_warnings(self): for warning in messages: self.warning(warning) + @property + def dbms_version(self): + return payload_pg_version(self.version) + def _send_database_instance_metadata(self): if self.database_identifier not in self._database_instance_emitted: event = { @@ -1043,7 +1047,7 @@ def _send_database_instance_metadata(self): "dbms": "postgres", "kind": "database_instance", "collection_interval": self._config.database_instance_collection_interval, - 'dbms_version': payload_pg_version(self.version), + 'dbms_version': self.dbms_version, 'integration_version': __version__, "tags": [t for t in self._non_internal_tags if not t.startswith('db:')], "timestamp": time() * 1000, diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index 98c7204be8ab3..f074e264e580f 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -90,6 +90,7 @@ class DatabaseObject(TypedDict): AND adnum = attnum WHERE attnum > 0 AND NOT attisdropped + AND attrelid = {table_id} """ @@ -114,6 +115,7 @@ class DatabaseObject(TypedDict): pg_class c ON c.oid = ix.indexrelid + WHERE ix.indrelid = {table_id} """ @@ -123,6 +125,7 @@ class DatabaseObject(TypedDict): conrelid AS table_id FROM pg_constraint WHERE contype = 'f' + AND conrelid = {table_id} """ @@ -192,15 
+195,18 @@ class PostgresSchemaCollector(SchemaCollector): def __init__(self, check: PostgreSql): config = SchemaCollectorConfig() config.collection_interval = check._config.collect_schemas.collection_interval + config.max_tables = check._config.collect_schemas.max_tables + config.exclude_databases = check._config.collect_schemas.exclude_databases + config.include_databases = check._config.collect_schemas.include_databases + config.exclude_schemas = check._config.collect_schemas.exclude_schemas + config.include_schemas = check._config.collect_schemas.include_schemas + config.exclude_tables = check._config.collect_schemas.exclude_tables + config.include_tables = check._config.collect_schemas.include_tables super().__init__(check, config) @property - def base_event(self): - return { - **super().base_event, - "dbms": "postgres", - "kind": "pg_databases", - } + def kind(self): + return "pg_databases" def _get_databases(self): with self._check._get_main_db() as conn: @@ -214,7 +220,7 @@ def _get_databases(self): })" # Autodiscovery trumps exclude and include - autodiscovery_databases = self._check.autodiscovery.get_items() + autodiscovery_databases = self._check.autodiscovery.get_items() if self._check.autodiscovery else [] if autodiscovery_databases: query += " AND datname IN ({})".format(", ".join(f"'{db}'" for db in autodiscovery_databases)) @@ -251,7 +257,7 @@ def _get_cursor(self, database_name): if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" else "" ) - parition_selects = ( + partition_selects = ( """ , partition_keys.partition_key, @@ -271,43 +277,20 @@ def _get_cursor(self, database_name): {tables_query} ), schema_tables AS ( - SELECT schemas.schema_id, schemas.schema_name, + SELECT schemas.schema_id, schemas.schema_name, schemas.schema_owner, tables.table_id, tables.table_name FROM schemas LEFT JOIN tables ON schemas.schema_id = tables.schema_id ORDER BY schemas.schema_name, tables.table_name LIMIT {limit} - ), - columns AS ( - {columns_query} - ), - indexes AS ( - {indexes_query} - ), - constraints AS ( - {constraints_query} ) - {partitions_ctes} - - SELECT * FROM ( - SELECT schema_tables.schema_id, schema_tables.schema_name, - schema_tables.table_id, schema_tables.table_name, - array_agg(row_to_json(columns.*)) FILTER (WHERE columns.name IS NOT NULL) as columns, - array_agg(row_to_json(indexes.*)) FILTER (WHERE indexes.name IS NOT NULL) as indexes, - array_agg(row_to_json(constraints.*)) FILTER (WHERE constraints.name IS NOT NULL) - as foreign_keys - {parition_selects} + + SELECT schema_tables.schema_id, schema_tables.schema_name, schema_tables.schema_owner, + schema_tables.table_id, schema_tables.table_name FROM schema_tables - LEFT JOIN columns ON schema_tables.table_id = columns.table_id - LEFT JOIN indexes ON schema_tables.table_id = indexes.table_id - LEFT JOIN constraints ON schema_tables.table_id = constraints.table_id - {partition_joins} - GROUP BY schema_tables.schema_id, schema_tables.schema_name, - schema_tables.table_id, schema_tables.table_name - ) t ; """ - print(query) + # print(query) cursor.execute(query) yield cursor @@ -346,6 +329,21 @@ def _get_all(self, cursor): def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: object = super()._map_row(database, cursor_row) + columns = None + indexes = None + constraints = None + # print(cursor_row) + if cursor_row.get("table_id"): + # Fetch columns, indexes, and constraints for each table + with self._check.db_pool.get_connection(database["name"]) as conn: + with 
conn.cursor(row_factory=dict_row) as cursor: + cursor.execute(COLUMNS_QUERY.format(table_id=cursor_row["table_id"])) + columns = cursor.fetchall() + cursor.execute(PG_INDEXES_QUERY.format(table_id=cursor_row["table_id"])) + indexes = cursor.fetchall() + cursor.execute(PG_CONSTRAINTS_QUERY.format(table_id=cursor_row["table_id"])) + constraints = cursor.fetchall() + # Fetch partition information for each table # Map the cursor row to the expected schema, and strip out None values object["schemas"] = [ { @@ -362,14 +360,12 @@ def _map_row(self, database: DatabaseInfo, cursor_row) -> DatabaseObject: "name": cursor_row.get("table_name"), "owner": cursor_row.get("owner"), # The query can create duplicates of the joined tables - "columns": list({v and v['name']: v for v in cursor_row.get("columns") or []}.values()), - "indexes": list({v and v['name']: v for v in cursor_row.get("indexes") or []}.values()), - "foreign_keys": list( - {v and v['name']: v for v in cursor_row.get("foreign_keys") or []}.values() - ), - "toast_table": cursor_row.get("toast_table"), - "num_partitions": cursor_row.get("num_partitions"), - "partition_key": cursor_row.get("partition_key"), + "columns": columns, + "indexes": indexes, + "foreign_keys": constraints, + # "toast_table": cursor_row.get("toast_table"), + # "num_partitions": cursor_row.get("num_partitions"), + # "partition_key": cursor_row.get("partition_key"), }.items() if v is not None } diff --git a/postgres/tests/test_schemas.py b/postgres/tests/test_schemas.py index 518e62d84222a..faf466f7ad7d6 100644 --- a/postgres/tests/test_schemas.py +++ b/postgres/tests/test_schemas.py @@ -4,6 +4,7 @@ import pytest from datadog_checks.postgres.schemas import PostgresSchemaCollector +from datadog_checks.postgres.version_utils import VersionUtils from .common import POSTGRES_VERSION @@ -60,7 +61,7 @@ def test_get_cursor(dbm_instance, integration_check): for row in cursor: schemas.append(row['schema_name']) - assert set(schemas) == {'datadog', 'hstore', 'public', 'public2', 'rdsadmin_test'} + assert set(schemas) == {'datadog', 'hstore', 'public', 'public2'} def test_schemas_filters(dbm_instance, integration_check): @@ -109,52 +110,51 @@ def test_tables(dbm_instance, integration_check): 'pgtable', 'pg_newtable', 'cities', - 'rds_admin_misc', 'sample_foreign_d73a8c', } -def test_columns(dbm_instance, integration_check): - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - # Assert that at least one row has columns - assert any(row['columns'] for row in cursor) - for row in cursor: - if row['columns']: - for column in row['columns']: - assert column['name'] is not None - assert column['data_type'] is not None - if row['table_name'] == 'cities': - assert row['columns'] - assert row['columns'][0]['name'] - - -def test_indexes(dbm_instance, integration_check): - check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION - collector = PostgresSchemaCollector(check) - - with collector._get_cursor('datadog_test') as cursor: - assert cursor is not None - # Assert that at least one row has indexes - assert any(row['indexes'] for row in cursor) - for row in cursor: - if row['indexes']: - for index in row['indexes']: - assert index['name'] is not None - assert index['definition'] is not None - if row['table_name'] == 'cities': - assert row['indexes'] - assert row['indexes'][0]['name'] +# def test_columns(dbm_instance, 
integration_check): +# check = integration_check(dbm_instance) +# check.version = POSTGRES_VERSION +# collector = PostgresSchemaCollector(check) + +# with collector._get_cursor('datadog_test') as cursor: +# assert cursor is not None +# # Assert that at least one row has columns +# assert any(row['columns'] for row in cursor) +# for row in cursor: +# if row['columns']: +# for column in row['columns']: +# assert column['name'] is not None +# assert column['data_type'] is not None +# if row['table_name'] == 'cities': +# assert row['columns'] +# assert row['columns'][0]['name'] + + +# def test_indexes(dbm_instance, integration_check): +# check = integration_check(dbm_instance) +# check.version = POSTGRES_VERSION +# collector = PostgresSchemaCollector(check) + +# with collector._get_cursor('datadog_test') as cursor: +# assert cursor is not None +# # Assert that at least one row has indexes +# assert any(row['indexes'] for row in cursor) +# for row in cursor: +# if row['indexes']: +# for index in row['indexes']: +# assert index['name'] is not None +# assert index['definition'] is not None +# if row['table_name'] == 'cities': +# assert row['indexes'] +# assert row['indexes'][0]['name'] def test_collect_schemas(dbm_instance, integration_check): check = integration_check(dbm_instance) - check.version = POSTGRES_VERSION + check.version = VersionUtils().parse_version(POSTGRES_VERSION) collector = PostgresSchemaCollector(check) collector.collect_schemas() From 438c20f5d139b35de2cd5026f736ec3d5548bff5 Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Wed, 22 Oct 2025 12:48:34 -0400 Subject: [PATCH 36/37] Type cast --- postgres/datadog_checks/postgres/schemas.py | 67 ++++++++++----------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/postgres/datadog_checks/postgres/schemas.py b/postgres/datadog_checks/postgres/schemas.py index f074e264e580f..202efb1f2ecb6 100644 --- a/postgres/datadog_checks/postgres/schemas.py +++ b/postgres/datadog_checks/postgres/schemas.py @@ -175,7 +175,7 @@ class PostgresDatabaseObject(DatabaseObject): DATABASE_INFORMATION_QUERY = """ -SELECT db.oid AS id, +SELECT db.oid::text AS id, datname AS NAME, pg_encoding_to_char(encoding) AS encoding, rolname AS owner, @@ -233,39 +233,36 @@ def _get_cursor(self, database_name): with conn.cursor(row_factory=dict_row) as cursor: schemas_query = self._get_schemas_query() tables_query = self._get_tables_query() - columns_query = COLUMNS_QUERY - indexes_query = PG_INDEXES_QUERY - constraints_query = PG_CONSTRAINTS_QUERY - partitions_ctes = ( - f""" - , - partition_keys AS ( - {PARTITION_KEY_QUERY} - ), - num_partitions AS ( - {NUM_PARTITIONS_QUERY} - ) - """ - if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" - else "" - ) - partition_joins = ( - """ - LEFT JOIN partition_keys ON tables.table_id = partition_keys.table_id - LEFT JOIN num_partitions ON tables.table_id = num_partitions.table_id - """ - if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" - else "" - ) - partition_selects = ( - """ - , - partition_keys.partition_key, - num_partitions.num_partitions - """ - if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" - else "" - ) + # partitions_ctes = ( + # f""" + # , + # partition_keys AS ( + # {PARTITION_KEY_QUERY} + # ), + # num_partitions AS ( + # {NUM_PARTITIONS_QUERY} + # ) + # """ + # if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" + # else "" + # ) + # partition_joins = ( + # """ + # LEFT 
JOIN partition_keys ON tables.table_id = partition_keys.table_id + # LEFT JOIN num_partitions ON tables.table_id = num_partitions.table_id + # """ + # if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" + # else "" + # ) + # partition_selects = ( + # """ + # , + # partition_keys.partition_key, + # num_partitions.num_partitions + # """ + # if VersionUtils.transform_version(str(self._check.version))["version.major"] > "9" + # else "" + # ) limit = int(self._config.max_tables or 1_000_000) query = f""" @@ -286,7 +283,7 @@ def _get_cursor(self, database_name): ) SELECT schema_tables.schema_id, schema_tables.schema_name, schema_tables.schema_owner, - schema_tables.table_id, schema_tables.table_name + schema_tables.table_id, schema_tables.table_name FROM schema_tables ; """ From c608dfdba921cfe742df1986b0815cf7347ff9ed Mon Sep 17 00:00:00 2001 From: Seth Samuel Date: Thu, 23 Oct 2025 16:31:02 -0400 Subject: [PATCH 37/37] Max columns --- datadog_checks_base/datadog_checks/base/utils/db/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py index 2013b11fd8ea8..67b6541beb60d 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/schemas.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/schemas.py @@ -38,6 +38,7 @@ def __init__(self): self.collection_interval = 3600 self.payload_chunk_size = 10_000 self.max_tables = 300 + self.max_columns = 50 self.include_databases = None self.exclude_databases = None self.include_schemas = None
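A few sketches of the mechanisms these patches introduce may help readers following the series. First, "Query tables separately" (patch 35) drops the monolithic join and instead issues one short query per detail type for each table, keyed by the table's oid via the WHERE ... = {table_id} clauses added to COLUMNS_QUERY, PG_INDEXES_QUERY, and PG_CONSTRAINTS_QUERY. A minimal sketch of that access pattern, assuming a psycopg 3 pool shaped like the check's db_pool (the helper name fetch_table_details is invented for illustration):

    from psycopg.rows import dict_row

    def fetch_table_details(db_pool, database_name, table_id,
                            columns_query, indexes_query, constraints_query):
        # One round trip per detail type, scoped to a single table.
        # table_id comes from pg_class.oid (an integer), so str.format
        # interpolation into the query templates is safe here.
        with db_pool.get_connection(database_name) as conn:
            with conn.cursor(row_factory=dict_row) as cursor:
                cursor.execute(columns_query.format(table_id=table_id))
                columns = cursor.fetchall()
                cursor.execute(indexes_query.format(table_id=table_id))
                indexes = cursor.fetchall()
                cursor.execute(constraints_query.format(table_id=table_id))
                constraints = cursor.fetchall()
        return columns, indexes, constraints

This trades one wide, duplicate-prone result set for several narrow ones, which is why the rewritten _map_row no longer dedupes the joined arrays by name.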
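The include/exclude knobs added to SchemaCollectorConfig (databases, schemas, tables) imply a filter pass over discovered names, but these patches do not show the matching semantics. The sketch below is one plausible regex-based reading; filter_items and its exact behavior are assumptions, not the shipped implementation:

    import re
    from typing import Iterable, List, Optional

    def filter_items(items: Iterable[str],
                     include: Optional[List[str]],
                     exclude: Optional[List[str]]) -> List[str]:
        # Keep a name when it matches at least one include pattern (if any
        # are configured) and no exclude pattern; None means "no filter".
        kept = []
        for name in items:
            if include and not any(re.search(p, name) for p in include):
                continue
            if exclude and any(re.search(p, name) for p in exclude):
                continue
            kept.append(name)
        return kept

    # Example: drop template databases while keeping everything else.
    assert filter_items(["postgres", "template0", "dogs_test"],
                        include=None,
                        exclude=["^template"]) == ["postgres", "dogs_test"]

Note that _get_databases applies a separate rule on top of any such filtering: as the comment in the patch says, autodiscovery trumps exclude and include, so a non-empty autodiscovery list short-circuits both.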
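_map_row assembles the payload with a dict comprehension that strips None values, so absent fields are omitted rather than serialized as explicit nulls; combined with patch 36's db.oid::text AS id cast, ids also arrive as strings rather than raw PostgreSQL oids. The same pattern in isolation (compact is an illustrative helper and the oid value below is made up):

    def compact(mapping: dict) -> dict:
        # Drop None values so the JSON payload omits absent fields
        # instead of emitting explicit nulls.
        return {k: v for k, v in mapping.items() if v is not None}

    table = compact({
        "id": "16394",                 # oid cast to text upstream
        "name": "cities",
        "owner": None,                 # not selected by the slimmed-down query
        "columns": [{"name": "id", "data_type": "integer"}],
    })
    assert "owner" not in table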
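One caveat worth flagging in the partition gating that patch 36 comments out: if transform_version reports version.major as a plain decimal string, then ["version.major"] > "9" compares lexicographically, and "10" sorts before "9", so PostgreSQL 10+ would be treated as pre-10 and the partition CTEs skipped. A sketch of the safer integer comparison, under that assumption about transform_version's return value:

    def supports_declarative_partitions(version_major: str) -> bool:
        # Declarative partitioning arrived in PostgreSQL 10. Compare as
        # integers: as strings, "10" > "9" is False because '1' < '9'.
        return int(version_major) > 9

    assert ("10" > "9") is False       # the lexicographic trap
    assert supports_declarative_partitions("10")
    assert not supports_declarative_partitions("9")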
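Finally, patch 37 pairs max_tables = 300 with a new max_columns = 50 default. The table cap is enforced server-side through LIMIT {limit} in the schema_tables CTE (with int(self._config.max_tables or 1_000_000) guarding against an unset value); nothing in this series shows where the column cap is applied, so the client-side truncation below is only an assumed shape:

    def cap_columns(columns, max_columns=50):
        # Hypothetical enforcement of SchemaCollectorConfig.max_columns:
        # keep at most max_columns entries per table to bound payload size.
        if columns is None:
            return None
        return columns[:max_columns]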