目录

pgsql-35 生产环境最佳实践

35 - 生产环境最佳实践

1. 📖 概述

本章总结PostgreSQL生产环境部署和运维的最佳实践,涵盖安全、性能、备份、监控等关键领域。

2. 🔐 安全最佳实践

2.1 1. 网络安全

# pg_hba.conf - 配置客户端认证

# 本地连接使用peer认证
local   all             postgres                                peer

# 本地TCP连接使用md5密码
host    all             all             127.0.0.1/32            md5
host    all             all             ::1/128                 md5

# 允许特定IP访问
host    mydb            myapp           10.0.1.0/24             md5

# 拒绝所有其他连接
host    all             all             0.0.0.0/0               reject

# 使用SSL连接(生产环境必须)
hostssl all             all             0.0.0.0/0               md5

2.2 2. 用户权限

-- 创建应用专用用户
CREATE USER myapp_user WITH PASSWORD 'strong_random_password_here';

-- 只授予必要权限
GRANT CONNECT ON DATABASE myapp TO myapp_user;
GRANT USAGE ON SCHEMA public TO myapp_user;
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO myapp_user;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO myapp_user;

-- 设置默认权限(新建表自动授权)
ALTER DEFAULT PRIVILEGES IN SCHEMA public
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO myapp_user;

-- 只读用户
CREATE USER readonly_user WITH PASSWORD 'readonly_password';
GRANT CONNECT ON DATABASE myapp TO readonly_user;
GRANT USAGE ON SCHEMA public TO readonly_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO readonly_user;

-- 撤销public schema的创建权限
REVOKE CREATE ON SCHEMA public FROM PUBLIC;

2.3 3. SSL配置

# 生成SSL证书
openssl req -new -x509 -days 365 -nodes -text \
    -out server.crt -keyout server.key -subj "/CN=dbhost.example.com"

chmod 600 server.key
chown postgres:postgres server.key server.crt

# postgresql.conf
ssl = on
ssl_cert_file = '/path/to/server.crt'
ssl_key_file = '/path/to/server.key'
ssl_min_protocol_version = 'TLSv1.2'

# 强制SSL连接
# pg_hba.conf
hostssl all all 0.0.0.0/0 md5

2.4 4. 密码策略

-- 设置密码过期
ALTER USER myapp_user VALID UNTIL '2026-12-31';

-- 强制修改密码
ALTER USER myapp_user PASSWORD 'new_password' VALID UNTIL '2026-06-30';

-- 禁用用户
ALTER USER old_user NOLOGIN;

-- 安装pgcrypto扩展(密码加密)
CREATE EXTENSION pgcrypto;

-- 密码哈希示例
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(50) UNIQUE NOT NULL,
    password_hash TEXT NOT NULL
);

-- 插入时加密
INSERT INTO users (username, password_hash)
VALUES ('alice', crypt('my_password', gen_salt('bf', 10)));

-- 验证密码
SELECT username FROM users
WHERE username = 'alice'
  AND password_hash = crypt('my_password', password_hash);

3. ⚡ 性能调优

3.1 1. 内存配置

# postgresql.conf

# shared_buffers: 25-40% of total RAM
shared_buffers = 4GB                    # 16GB内存系统

# effective_cache_size: 50-75% of total RAM
effective_cache_size = 12GB             # 告诉优化器可用缓存

# work_mem: 根据并发连接数调整
work_mem = 16MB                         # per query operation

# maintenance_work_mem: 用于VACUUM、CREATE INDEX等
maintenance_work_mem = 512MB

# wal_buffers: -1表示自动(shared_buffers的1/32)
wal_buffers = -1

3.2 2. 连接配置

# 最大连接数
max_connections = 200

# 保留超级用户连接
superuser_reserved_connections = 3

# 连接超时
statement_timeout = 60000               # 60秒
idle_in_transaction_session_timeout = 300000  # 5分钟

3.3 3. WAL配置

# WAL日志级别
wal_level = replica                     # 支持复制

# WAL写入模式
wal_sync_method = fdatasync            # Linux推荐

# 检查点配置
checkpoint_timeout = 15min
checkpoint_completion_target = 0.9
max_wal_size = 4GB
min_wal_size = 1GB

# WAL归档(用于PITR)
archive_mode = on
archive_command = 'cp %p /path/to/archive/%f'

3.4 4. 查询优化

# 启用统计信息收集
track_activities = on
track_counts = on
track_io_timing = on
track_functions = all

# 慢查询日志
log_min_duration_statement = 1000       # 记录超过1秒的查询
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_statement = 'ddl'                   # 记录DDL语句

# 自动VACUUM
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 1min

4. 💾 备份策略

4.1 1. 逻辑备份(pg_dump)

#!/bin/bash
# 每日备份脚本

DATE=$(date +%Y%m%d)
BACKUP_DIR="/backup/postgres"
DB_NAME="myapp"

# 全库备份
pg_dump -U postgres -d $DB_NAME -F c -f "$BACKUP_DIR/${DB_NAME}_${DATE}.dump"

# 压缩备份
pg_dump -U postgres -d $DB_NAME | gzip > "$BACKUP_DIR/${DB_NAME}_${DATE}.sql.gz"

# 仅备份schema
pg_dump -U postgres -d $DB_NAME --schema-only -f "$BACKUP_DIR/${DB_NAME}_schema_${DATE}.sql"

# 仅备份数据
pg_dump -U postgres -d $DB_NAME --data-only -f "$BACKUP_DIR/${DB_NAME}_data_${DATE}.sql"

# 删除7天前的备份
find $BACKUP_DIR -name "${DB_NAME}_*.dump" -mtime +7 -delete

# 上传到云存储
# aws s3 cp "$BACKUP_DIR/${DB_NAME}_${DATE}.dump" s3://my-bucket/postgres-backups/

4.2 2. 物理备份(PITR)

# 基础备份
pg_basebackup -U postgres -D /backup/base -F tar -z -P

# 配合WAL归档实现PITR
# postgresql.conf
archive_mode = on
archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'

# 恢复示例
# 1. 停止PostgreSQL
# 2. 删除data目录内容
# 3. 解压基础备份
# 4. 创建recovery.conf
restore_command = 'cp /archive/%f %p'
recovery_target_time = '2024-03-15 10:30:00'

# 5. 启动PostgreSQL

4.3 3. 连续归档

# 使用pgBackRest(推荐)
# 安装
sudo apt-get install pgbackrest

# 配置 /etc/pgbackrest.conf
[myapp]
pg1-path=/var/lib/postgresql/14/main
pg1-port=5432

[global]
repo1-path=/backup/pgbackrest
repo1-retention-full=7
repo1-retention-diff=14
start-fast=y

# 执行备份
pgbackrest --stanza=myapp backup --type=full
pgbackrest --stanza=myapp backup --type=diff
pgbackrest --stanza=myapp backup --type=incr

# 恢复
pgbackrest --stanza=myapp restore

5. 📊 监控告警

5.1 1. 性能监控SQL

-- 监控活动连接
SELECT
    pid,
    usename,
    application_name,
    client_addr,
    state,
    query,
    query_start,
    NOW() - query_start AS duration
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;

-- 监控数据库大小
SELECT
    datname,
    pg_size_pretty(pg_database_size(datname)) AS size,
    (SELECT count(*) FROM pg_stat_activity WHERE datname = d.datname) AS connections
FROM pg_database d
ORDER BY pg_database_size(datname) DESC;

-- 监控表膨胀
SELECT
    schemaname,
    tablename,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
    pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename) -
                   pg_relation_size(schemaname||'.'||tablename)) AS index_size,
    n_dead_tup,
    n_live_tup,
    ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_ratio
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC
LIMIT 20;

-- 监控长事务
SELECT
    pid,
    usename,
    application_name,
    client_addr,
    xact_start,
    NOW() - xact_start AS duration,
    state,
    query
FROM pg_stat_activity
WHERE xact_start IS NOT NULL
  AND NOW() - xact_start > INTERVAL '5 minutes'
ORDER BY xact_start;

-- 监控锁等待
SELECT
    blocked_locks.pid AS blocked_pid,
    blocked_activity.usename AS blocked_user,
    blocking_locks.pid AS blocking_pid,
    blocking_activity.usename AS blocking_user,
    blocked_activity.query AS blocked_statement,
    blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
    AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
    AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
    AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
    AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
    AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
    AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
    AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
    AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
    AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
    AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;

5.2 2. 使用监控工具

# Prometheus + postgres_exporter
docker run -d \
  -p 9187:9187 \
  -e DATA_SOURCE_NAME="postgresql://user:password@localhost:5432/postgres?sslmode=disable" \
  prometheuscommunity/postgres-exporter

# pgAdmin 4
# https://www.pgadmin.org/

# Grafana仪表板
# Dashboard ID: 9628 (PostgreSQL Database)

6. 🔄 高可用方案

6.1 1. 流复制

-- 主库配置
-- postgresql.conf
wal_level = replica
max_wal_senders = 10
max_replication_slots = 10
hot_standby = on

-- pg_hba.conf
host replication replicator 10.0.1.0/24 md5

-- 创建复制用户
CREATE USER replicator WITH REPLICATION PASSWORD 'replica_password';

-- 从库配置
-- 执行基础备份
pg_basebackup -h primary_host -D /var/lib/postgresql/14/main -U replicator -P -R

-- 启动从库(自动进入恢复模式)

6.2 2. 自动故障转移

# 使用Patroni + etcd/Consul
# docker-compose.yml
version: '3'
services:
  etcd:
    image: quay.io/coreos/etcd
    environment:
      ETCD_LISTEN_CLIENT_URLS: http://0.0.0.0:2379

  postgres1:
    image: patroni
    environment:
      PATRONI_NAME: postgres1
      PATRONI_RESTAPI_CONNECT_ADDRESS: postgres1:8008
      PATRONI_POSTGRESQL_CONNECT_ADDRESS: postgres1:5432
      PATRONI_ETCD_HOSTS: etcd:2379

  postgres2:
    image: patroni
    environment:
      PATRONI_NAME: postgres2
      # 类似配置

  haproxy:
    image: haproxy
    ports:
      - "5432:5432"
      - "5433:5433"

7. 🎯 运维检查清单

7.1 每日检查

  • 检查备份是否成功
  • 检查磁盘空间(data、WAL、archive)
  • 检查慢查询日志
  • 检查错误日志
  • 检查活动连接数
  • 检查复制延迟(如有主从)

7.2 每周检查

  • 分析表统计信息(ANALYZE)
  • 检查未使用的索引
  • 检查表膨胀情况
  • 回顾性能趋势
  • 测试备份恢复

7.3 每月检查

  • VACUUM FULL大表(如需要)
  • 审查用户权限
  • 更新数据库统计信息
  • 检查配置参数是否需要调整
  • 容量规划评估

7.4 季度检查

  • PostgreSQL版本升级计划
  • 安全审计
  • 灾难恢复演练
  • 性能基准测试

8. 🚨 故障处理

8.1 1. 连接数耗尽

-- 查看当前连接
SELECT count(*) FROM pg_stat_activity;

-- 查看每个数据库的连接数
SELECT datname, count(*) FROM pg_stat_activity GROUP BY datname;

-- 终止空闲连接
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
  AND state_change < NOW() - INTERVAL '10 minutes';

-- 临时解决
ALTER SYSTEM SET max_connections = 300;
SELECT pg_reload_conf();

-- 长期解决:使用连接池(PgBouncer)

8.2 2. 磁盘空间不足

# 检查磁盘使用
df -h

# 检查PostgreSQL数据目录
du -sh /var/lib/postgresql/14/main/*

# 清理WAL日志(谨慎!)
# SELECT pg_switch_wal();  -- 切换WAL
# 手动删除归档的WAL

# 清理旧的pg_log
find /var/log/postgresql -name "*.log" -mtime +7 -delete

# VACUUM FULL释放空间(锁表,慢)
VACUUM FULL;

8.3 3. 性能突然下降

-- 1. 检查慢查询
SELECT query, mean_exec_time, calls
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

-- 2. 检查锁等待
-- (使用上面的锁监控SQL)

-- 3. 检查是否需要VACUUM
SELECT
    schemaname,
    tablename,
    n_dead_tup,
    n_live_tup,
    last_autovacuum
FROM pg_stat_user_tables
WHERE n_dead_tup > 10000
ORDER BY n_dead_tup DESC;

-- 4. 重新分析统计信息
ANALYZE;

-- 5. 检查是否有长事务
-- (使用上面的长事务监控SQL)

9. 📝 配置文件模板

# postgresql.conf - 生产环境推荐配置(16GB内存服务器)

# CONNECTIONS
max_connections = 200
superuser_reserved_connections = 3

# MEMORY
shared_buffers = 4GB
effective_cache_size = 12GB
work_mem = 16MB
maintenance_work_mem = 512MB
wal_buffers = -1

# WAL
wal_level = replica
max_wal_senders = 10
max_replication_slots = 10
wal_keep_size = 1GB
checkpoint_timeout = 15min
checkpoint_completion_target = 0.9
max_wal_size = 4GB
min_wal_size = 1GB

# QUERY TUNING
random_page_cost = 1.1                  # SSD
effective_io_concurrency = 200          # SSD
default_statistics_target = 100

# LOGGING
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_rotation_age = 1d
log_rotation_size = 100MB
log_min_duration_statement = 1000       # 1秒
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_checkpoints = on
log_connections = on
log_disconnections = on
log_lock_waits = on

# AUTOVACUUM
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 1min

# STATISTICS
track_activities = on
track_counts = on
track_io_timing = on
track_functions = all

# LOCALE
lc_messages = 'en_US.UTF-8'
lc_monetary = 'en_US.UTF-8'
lc_numeric = 'en_US.UTF-8'
lc_time = 'en_US.UTF-8'
default_text_search_config = 'pg_catalog.english'

# TIMEZONE
timezone = 'UTC'

10. 🔗 有用的资源

10.1 官方文档

10.2 监控工具

10.3 备份工具

10.4 高可用方案

10.5 连接池

10.6 性能分析

  • pg_stat_statements: 内置扩展
  • pgBadger: 日志分析工具
  • explain.depesz.com: 执行计划可视化

11. 📚 总结

生产环境PostgreSQL的关键要点:

  1. 安全第一: SSL、最小权限原则、网络隔离
  2. 定期备份: 自动化备份、测试恢复、异地存储
  3. 性能监控: 实时监控、告警设置、定期优化
  4. 高可用性: 主从复制、自动故障转移
  5. 运维规范: 变更管理、容量规划、文档完善

恭喜你完成了PostgreSQL完整教程! 🎉

从基础到实战,你已经掌握了PostgreSQL的核心知识。继续实践,在项目中应用这些技能,你会成为PostgreSQL专家!