O Postgres é “tão evoluído” e blá blá blá, mas não consegue ter uma rotina de criação automática de estatísticas. Parece que voltamos para antes dos anos 2000.
Bom, não vou explicar a importância da criação de estatísticas, isso você já deveria saber, se não sabe, pergunta para sua IA favorita.
A código abaixo vai ler a tabela pg_stat_statements e, a partir dela, eu tento fazer um monte de regex para separar das querys a parte do SARG para criar as estatísticas.
É preciso adicionar a extensão pg_stat_statements:
create extension pg_stat_statements;
Dependendo do seu workload logo em seguida você já vai ter acesso a alguma coisa, mas para ter dados melhores, é melhor deixar o tempo passar para acumular mais informações.
Depois de algum tempo, quando você rodar o script abaixo, deve ter um resultado mais interessante para criar as estatísticas:
WITH normalized AS (SELECT queryid
, query
-- Remove DO $ ... $ e normaliza espaços
, lower(regexp_replace(
regexp_replace(query, '^do \$\$|\\$\$;$', '', 'gi'),
'\s+', ' ', 'g'
)) AS norm_query
, total_exec_time
, calls
FROM pg_stat_statements
WHERE query ILIKE 'select%'
AND calls > 10
AND query ~* ' where ')
, tables AS (SELECT n.queryid
, n.query
, n.norm_query
, n.total_exec_time
, n.calls
-- Captura tabela e alias
, (regexp_match(n.norm_query,
'(?:from|join)\s+([a-z0-9_\.]+)(?:\s+as\s+|\s+)?([a-z0-9_]+)?'))[1] AS table_obj_full
, (regexp_match(n.norm_query,
'(?:from|join)\s+([a-z0-9_\.]+)(?:\s+as\s+|\s+)?([a-z0-9_]+)?'))[2] AS table_alias
, COALESCE(
(regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)(?:\s+as\s+|\s+)?([a-z0-9_]+)'))[2],
split_part((regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1], '.', 2)
) AS main_identifier_raw
, split_part(
lower(
regexp_replace(
(regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1],
'[^a-z0-9_\.]',
'',
'g'
)
),
'.',
CASE WHEN (regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1] LIKE '%.%' THEN 2 ELSE 1 END
) AS table_name
FROM normalized n
WHERE (regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1] IS NOT NULL)
, where_clauses AS (SELECT t.queryid
, t.table_name
, lower(t.main_identifier_raw) AS main_identifier
, (regexp_match(
t.norm_query,
'where\s+(.*?)(?:\sgroup by|\sorder by|\slimit|;|$)'
))[1] AS where_block
, t.total_exec_time
, t.calls
FROM tables t)
, where_columns AS (SELECT wc.table_name
, wc.main_identifier
, CASE
WHEN rm.m[1] LIKE '%.%' THEN lower(split_part(rm.m[1], '.', 1))
ELSE NULL END AS column_prefix
, rm.m[1] AS full_column_name
, regexp_replace(rm.m[1], '^[a-z0-9_]+\.', '', 'i') AS column_name
, wc.total_exec_time
FROM where_clauses wc
CROSS JOIN LATERAL regexp_matches(
wc.where_block,
'([a-z_][a-z0-9_\.]*)\s*(=|>|<|>=|<=|<>|in\b|like\b)',
'gi'
) AS rm(m))
, distinct_columns AS (SELECT table_name
, column_name
, SUM(total_exec_time) AS total_cost
FROM where_columns
WHERE column_name IS NOT NULL
AND (
column_prefix IS NULL
OR column_prefix = main_identifier
OR column_prefix = table_name
)
GROUP BY table_name, column_name)
, validated_columns AS (SELECT dc.table_name
, dc.column_name
, dc.total_cost
FROM distinct_columns dc
JOIN information_schema.columns isc
ON lower(isc.table_name) = lower(dc.table_name)
AND lower(isc.column_name) = lower(dc.column_name)
WHERE dc.table_name NOT LIKE 'pg_%')
, family AS (SELECT vc.table_name
, (SELECT array_agg(c)
FROM (SELECT column_name AS c
FROM validated_columns sub
WHERE sub.table_name = vc.table_name
ORDER BY total_cost DESC
LIMIT 8) sub2) AS columns
, COUNT(DISTINCT column_name) AS occurrences
, SUM(total_cost) AS total_table_cost
FROM validated_columns vc
GROUP BY vc.table_name
HAVING COUNT(DISTINCT column_name) > 1)
, correlation_estimate AS (SELECT f.*
, (array_length(f.columns, 1) ^ 1.3) * ln(total_table_cost + 5) AS score
FROM family f)
, existing_ext_stats AS (SELECT cls.relname AS table_name
, st.stxname AS stat_name
, array_agg(att.attname ORDER BY att.attname)::text[] AS stat_columns
FROM pg_statistic_ext st
JOIN pg_class cls ON cls.oid = st.stxrelid
JOIN pg_attribute att
ON att.attrelid = cls.oid
AND att.attnum = ANY (st.stxkeys)
GROUP BY cls.relname, st.stxname)
, dedup AS (SELECT ce.*
, 's_' || regexp_replace(ce.table_name, '[^a-z0-9_]', '', 'g') ||
'_' || substr(md5(array_to_string(ce.columns, ',')), 1, 8) AS stat_name
, EXISTS (SELECT 1
FROM existing_ext_stats es
WHERE es.table_name = ce.table_name
AND es.stat_columns = ce.columns::text[]) AS already_exists
FROM correlation_estimate ce)
SELECT table_name
, columns
, occurrences
, round(score::numeric, 2) AS score
, stat_name
, 'CREATE STATISTICS IF NOT EXISTS ' ||
quote_ident(stat_name) ||
' ON ' ||
array_to_string(
ARRAY(SELECT quote_ident(c) FROM unnest(columns) c), ', '
) ||
' FROM ' ||
quote_ident(table_name) ||
';' AS suggested_cmd
, 'Filtros mais custosos sobre (' ||
array_to_string(columns, ', ') ||
') em ' || table_name ||
'. Ocorrências: ' || occurrences ||
'. Custo total estimado: ' || total_table_cost || 'ms.' AS justification
FROM dedup
WHERE NOT already_exists
AND table_name NOT LIKE 'pg_%'
AND table_name NOT IN ('table_constraints', 'columns', 'tables')
AND array_length(columns, 1) > 1
ORDER BY score DESC, total_table_cost DESC, occurrences DESC;




















