Postgres – Criar estatísticas

O Postgres é “tão evoluído” e blá blá blá, mas não consegue ter uma rotina de criação automática de estatísticas. Parece que voltamos para antes dos anos 2000.

Bom, não vou explicar a importância da criação de estatísticas, isso você já deveria saber, se não sabe, pergunta para sua IA favorita.

A código abaixo vai ler a tabela pg_stat_statements e, a partir dela, eu tento fazer um monte de regex para separar das querys a parte do SARG para criar as estatísticas.

É preciso adicionar a extensão pg_stat_statements:

create extension pg_stat_statements;

Dependendo do seu workload logo em seguida você já vai ter acesso a alguma coisa, mas para ter dados melhores, é melhor deixar o tempo passar para acumular mais informações.

Depois de algum tempo, quando você rodar o script abaixo, deve ter um resultado mais interessante para criar as estatísticas:

WITH normalized AS (SELECT queryid
                         , query
                         -- Remove DO $ ... $ e normaliza espaços
                         , lower(regexp_replace(
            regexp_replace(query, '^do \$\$|\\$\$;$', '', 'gi'),
            '\s+', ' ', 'g'
                                 )) AS norm_query
                         , total_exec_time
                         , calls
                    FROM pg_stat_statements
                    WHERE query ILIKE 'select%'
                      AND calls > 10
                      AND query ~* ' where ')
   , tables AS (SELECT n.queryid
                     , n.query
                     , n.norm_query
                     , n.total_exec_time
                     , n.calls
                     -- Captura tabela e alias
                     , (regexp_match(n.norm_query,
                                     '(?:from|join)\s+([a-z0-9_\.]+)(?:\s+as\s+|\s+)?([a-z0-9_]+)?'))[1] AS table_obj_full
                     , (regexp_match(n.norm_query,
                                     '(?:from|join)\s+([a-z0-9_\.]+)(?:\s+as\s+|\s+)?([a-z0-9_]+)?'))[2] AS table_alias
                     , COALESCE(
            (regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)(?:\s+as\s+|\s+)?([a-z0-9_]+)'))[2],
            split_part((regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1], '.', 2)
                       )                                                                                 AS main_identifier_raw
                     , split_part(
            lower(
                    regexp_replace(
                            (regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1],
                            '[^a-z0-9_\.]',
                            '',
                            'g'
                    )
            ),
            '.',
            CASE WHEN (regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1] LIKE '%.%' THEN 2 ELSE 1 END
                       )                                                                                 AS table_name
                FROM normalized n
                WHERE (regexp_match(n.norm_query, '(?:from|join)\s+([a-z0-9_\.]+)'))[1] IS NOT NULL)
   , where_clauses AS (SELECT t.queryid
                            , t.table_name
                            , lower(t.main_identifier_raw) AS main_identifier
                            , (regexp_match(
            t.norm_query,
            'where\s+(.*?)(?:\sgroup by|\sorder by|\slimit|;|$)'
                               ))[1]                       AS where_block
                            , t.total_exec_time
                            , t.calls
                       FROM tables t)
   , where_columns AS (SELECT wc.table_name
                            , wc.main_identifier
                            , CASE
                                  WHEN rm.m[1] LIKE '%.%' THEN lower(split_part(rm.m[1], '.', 1))
                                  ELSE NULL END                                 AS column_prefix
                            , rm.m[1]                                           AS full_column_name
                            , regexp_replace(rm.m[1], '^[a-z0-9_]+\.', '', 'i') AS column_name
                            , wc.total_exec_time
                       FROM where_clauses wc
                                CROSS JOIN LATERAL regexp_matches(
                               wc.where_block,
                               '([a-z_][a-z0-9_\.]*)\s*(=|>|<|>=|<=|<>|in\b|like\b)',
                               'gi'
                                                   ) AS rm(m))
   , distinct_columns AS (SELECT table_name
                               , column_name
                               , SUM(total_exec_time) AS total_cost
                          FROM where_columns
                          WHERE column_name IS NOT NULL
                            AND (
                              column_prefix IS NULL
                                  OR column_prefix = main_identifier
                                  OR column_prefix = table_name
                              )
                          GROUP BY table_name, column_name)
   , validated_columns AS (SELECT dc.table_name
                                , dc.column_name
                                , dc.total_cost
                           FROM distinct_columns dc
                                    JOIN information_schema.columns isc
                                         ON lower(isc.table_name) = lower(dc.table_name)
                                             AND lower(isc.column_name) = lower(dc.column_name)
                           WHERE dc.table_name NOT LIKE 'pg_%')
   , family AS (SELECT vc.table_name
                     , (SELECT array_agg(c)
                        FROM (SELECT column_name AS c
                              FROM validated_columns sub
                              WHERE sub.table_name = vc.table_name
                              ORDER BY total_cost DESC
                              LIMIT 8) sub2)       AS columns
                     , COUNT(DISTINCT column_name) AS occurrences
                     , SUM(total_cost)             AS total_table_cost
                FROM validated_columns vc
                GROUP BY vc.table_name
                HAVING COUNT(DISTINCT column_name) > 1)
   , correlation_estimate AS (SELECT f.*
                                   , (array_length(f.columns, 1) ^ 1.3) * ln(total_table_cost + 5) AS score
                              FROM family f)
   , existing_ext_stats AS (SELECT cls.relname                                         AS table_name
                                 , st.stxname                                          AS stat_name
                                 , array_agg(att.attname ORDER BY att.attname)::text[] AS stat_columns
                            FROM pg_statistic_ext st
                                     JOIN pg_class cls ON cls.oid = st.stxrelid
                                     JOIN pg_attribute att
                                          ON att.attrelid = cls.oid
                                              AND att.attnum = ANY (st.stxkeys)
                            GROUP BY cls.relname, st.stxname)
   , dedup AS (SELECT ce.*
                    , 's_' || regexp_replace(ce.table_name, '[^a-z0-9_]', '', 'g') ||
                      '_' || substr(md5(array_to_string(ce.columns, ',')), 1, 8) AS stat_name
                    , EXISTS (SELECT 1
                              FROM existing_ext_stats es
                              WHERE es.table_name = ce.table_name
                                AND es.stat_columns = ce.columns::text[])        AS already_exists
               FROM correlation_estimate ce)

SELECT table_name
     , columns
     , occurrences
     , round(score::numeric, 2)                                AS score
     , stat_name
     , 'CREATE STATISTICS IF NOT EXISTS ' ||
       quote_ident(stat_name) ||
       ' ON ' ||
       array_to_string(
               ARRAY(SELECT quote_ident(c) FROM unnest(columns) c), ', '
       ) ||
       ' FROM ' ||
       quote_ident(table_name) ||
       ';'                                                     AS suggested_cmd
     , 'Filtros mais custosos sobre (' ||
       array_to_string(columns, ', ') ||
       ') em ' || table_name ||
       '. Ocorrências: ' || occurrences ||
       '. Custo total estimado: ' || total_table_cost || 'ms.' AS justification
FROM dedup
WHERE NOT already_exists
  AND table_name NOT LIKE 'pg_%'
  AND table_name NOT IN ('table_constraints', 'columns', 'tables')
  AND array_length(columns, 1) > 1
ORDER BY score DESC, total_table_cost DESC, occurrences DESC;