pgsql-35 生产环境最佳实践
目录
35 - 生产环境最佳实践
1. 📖 概述
本章总结PostgreSQL生产环境部署和运维的最佳实践,涵盖安全、性能、备份、监控等关键领域。
2. 🔐 安全最佳实践
2.1 1. 网络安全
# pg_hba.conf - 配置客户端认证
# 本地连接使用peer认证
local all postgres peer
# 本地TCP连接使用md5密码
host all all 127.0.0.1/32 md5
host all all ::1/128 md5
# 允许特定IP访问
host mydb myapp 10.0.1.0/24 md5
# 拒绝所有其他连接
host all all 0.0.0.0/0 reject
# 使用SSL连接(生产环境必须)
hostssl all all 0.0.0.0/0 md5
2.2 2. 用户权限
-- 创建应用专用用户
CREATE USER myapp_user WITH PASSWORD 'strong_random_password_here';
-- 只授予必要权限
GRANT CONNECT ON DATABASE myapp TO myapp_user;
GRANT USAGE ON SCHEMA public TO myapp_user;
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO myapp_user;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO myapp_user;
-- 设置默认权限(新建表自动授权)
ALTER DEFAULT PRIVILEGES IN SCHEMA public
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO myapp_user;
-- 只读用户
CREATE USER readonly_user WITH PASSWORD 'readonly_password';
GRANT CONNECT ON DATABASE myapp TO readonly_user;
GRANT USAGE ON SCHEMA public TO readonly_user;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO readonly_user;
-- 撤销public schema的创建权限
REVOKE CREATE ON SCHEMA public FROM PUBLIC;
2.3 3. SSL配置
# 生成SSL证书
openssl req -new -x509 -days 365 -nodes -text \
-out server.crt -keyout server.key -subj "/CN=dbhost.example.com"
chmod 600 server.key
chown postgres:postgres server.key server.crt
# postgresql.conf
ssl = on
ssl_cert_file = '/path/to/server.crt'
ssl_key_file = '/path/to/server.key'
ssl_min_protocol_version = 'TLSv1.2'
# 强制SSL连接
# pg_hba.conf
hostssl all all 0.0.0.0/0 md5
2.4 4. 密码策略
-- 设置密码过期
ALTER USER myapp_user VALID UNTIL '2026-12-31';
-- 强制修改密码
ALTER USER myapp_user PASSWORD 'new_password' VALID UNTIL '2026-06-30';
-- 禁用用户
ALTER USER old_user NOLOGIN;
-- 安装pgcrypto扩展(密码加密)
CREATE EXTENSION pgcrypto;
-- 密码哈希示例
CREATE TABLE users (
id SERIAL PRIMARY KEY,
username VARCHAR(50) UNIQUE NOT NULL,
password_hash TEXT NOT NULL
);
-- 插入时加密
INSERT INTO users (username, password_hash)
VALUES ('alice', crypt('my_password', gen_salt('bf', 10)));
-- 验证密码
SELECT username FROM users
WHERE username = 'alice'
AND password_hash = crypt('my_password', password_hash);
3. ⚡ 性能调优
3.1 1. 内存配置
# postgresql.conf
# shared_buffers: 25-40% of total RAM
shared_buffers = 4GB # 16GB内存系统
# effective_cache_size: 50-75% of total RAM
effective_cache_size = 12GB # 告诉优化器可用缓存
# work_mem: 根据并发连接数调整
work_mem = 16MB # per query operation
# maintenance_work_mem: 用于VACUUM、CREATE INDEX等
maintenance_work_mem = 512MB
# wal_buffers: -1表示自动(shared_buffers的1/32)
wal_buffers = -1
3.2 2. 连接配置
# 最大连接数
max_connections = 200
# 保留超级用户连接
superuser_reserved_connections = 3
# 连接超时
statement_timeout = 60000 # 60秒
idle_in_transaction_session_timeout = 300000 # 5分钟
3.3 3. WAL配置
# WAL日志级别
wal_level = replica # 支持复制
# WAL写入模式
wal_sync_method = fdatasync # Linux推荐
# 检查点配置
checkpoint_timeout = 15min
checkpoint_completion_target = 0.9
max_wal_size = 4GB
min_wal_size = 1GB
# WAL归档(用于PITR)
archive_mode = on
archive_command = 'cp %p /path/to/archive/%f'
3.4 4. 查询优化
# 启用统计信息收集
track_activities = on
track_counts = on
track_io_timing = on
track_functions = all
# 慢查询日志
log_min_duration_statement = 1000 # 记录超过1秒的查询
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_statement = 'ddl' # 记录DDL语句
# 自动VACUUM
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 1min
4. 💾 备份策略
4.1 1. 逻辑备份(pg_dump)
#!/bin/bash
# 每日备份脚本
DATE=$(date +%Y%m%d)
BACKUP_DIR="/backup/postgres"
DB_NAME="myapp"
# 全库备份
pg_dump -U postgres -d $DB_NAME -F c -f "$BACKUP_DIR/${DB_NAME}_${DATE}.dump"
# 压缩备份
pg_dump -U postgres -d $DB_NAME | gzip > "$BACKUP_DIR/${DB_NAME}_${DATE}.sql.gz"
# 仅备份schema
pg_dump -U postgres -d $DB_NAME --schema-only -f "$BACKUP_DIR/${DB_NAME}_schema_${DATE}.sql"
# 仅备份数据
pg_dump -U postgres -d $DB_NAME --data-only -f "$BACKUP_DIR/${DB_NAME}_data_${DATE}.sql"
# 删除7天前的备份
find $BACKUP_DIR -name "${DB_NAME}_*.dump" -mtime +7 -delete
# 上传到云存储
# aws s3 cp "$BACKUP_DIR/${DB_NAME}_${DATE}.dump" s3://my-bucket/postgres-backups/
4.2 2. 物理备份(PITR)
# 基础备份
pg_basebackup -U postgres -D /backup/base -F tar -z -P
# 配合WAL归档实现PITR
# postgresql.conf
archive_mode = on
archive_command = 'test ! -f /archive/%f && cp %p /archive/%f'
# 恢复示例
# 1. 停止PostgreSQL
# 2. 删除data目录内容
# 3. 解压基础备份
# 4. 创建recovery.conf
restore_command = 'cp /archive/%f %p'
recovery_target_time = '2024-03-15 10:30:00'
# 5. 启动PostgreSQL
4.3 3. 连续归档
# 使用pgBackRest(推荐)
# 安装
sudo apt-get install pgbackrest
# 配置 /etc/pgbackrest.conf
[myapp]
pg1-path=/var/lib/postgresql/14/main
pg1-port=5432
[global]
repo1-path=/backup/pgbackrest
repo1-retention-full=7
repo1-retention-diff=14
start-fast=y
# 执行备份
pgbackrest --stanza=myapp backup --type=full
pgbackrest --stanza=myapp backup --type=diff
pgbackrest --stanza=myapp backup --type=incr
# 恢复
pgbackrest --stanza=myapp restore
5. 📊 监控告警
5.1 1. 性能监控SQL
-- 监控活动连接
SELECT
pid,
usename,
application_name,
client_addr,
state,
query,
query_start,
NOW() - query_start AS duration
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;
-- 监控数据库大小
SELECT
datname,
pg_size_pretty(pg_database_size(datname)) AS size,
(SELECT count(*) FROM pg_stat_activity WHERE datname = d.datname) AS connections
FROM pg_database d
ORDER BY pg_database_size(datname) DESC;
-- 监控表膨胀
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename) -
pg_relation_size(schemaname||'.'||tablename)) AS index_size,
n_dead_tup,
n_live_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_ratio
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC
LIMIT 20;
-- 监控长事务
SELECT
pid,
usename,
application_name,
client_addr,
xact_start,
NOW() - xact_start AS duration,
state,
query
FROM pg_stat_activity
WHERE xact_start IS NOT NULL
AND NOW() - xact_start > INTERVAL '5 minutes'
ORDER BY xact_start;
-- 监控锁等待
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_statement,
blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;
5.2 2. 使用监控工具
# Prometheus + postgres_exporter
docker run -d \
-p 9187:9187 \
-e DATA_SOURCE_NAME="postgresql://user:password@localhost:5432/postgres?sslmode=disable" \
prometheuscommunity/postgres-exporter
# pgAdmin 4
# https://www.pgadmin.org/
# Grafana仪表板
# Dashboard ID: 9628 (PostgreSQL Database)
6. 🔄 高可用方案
6.1 1. 流复制
-- 主库配置
-- postgresql.conf
wal_level = replica
max_wal_senders = 10
max_replication_slots = 10
hot_standby = on
-- pg_hba.conf
host replication replicator 10.0.1.0/24 md5
-- 创建复制用户
CREATE USER replicator WITH REPLICATION PASSWORD 'replica_password';
-- 从库配置
-- 执行基础备份
pg_basebackup -h primary_host -D /var/lib/postgresql/14/main -U replicator -P -R
-- 启动从库(自动进入恢复模式)
6.2 2. 自动故障转移
# 使用Patroni + etcd/Consul
# docker-compose.yml
version: '3'
services:
etcd:
image: quay.io/coreos/etcd
environment:
ETCD_LISTEN_CLIENT_URLS: http://0.0.0.0:2379
postgres1:
image: patroni
environment:
PATRONI_NAME: postgres1
PATRONI_RESTAPI_CONNECT_ADDRESS: postgres1:8008
PATRONI_POSTGRESQL_CONNECT_ADDRESS: postgres1:5432
PATRONI_ETCD_HOSTS: etcd:2379
postgres2:
image: patroni
environment:
PATRONI_NAME: postgres2
# 类似配置
haproxy:
image: haproxy
ports:
- "5432:5432"
- "5433:5433"
7. 🎯 运维检查清单
7.1 每日检查
- 检查备份是否成功
- 检查磁盘空间(data、WAL、archive)
- 检查慢查询日志
- 检查错误日志
- 检查活动连接数
- 检查复制延迟(如有主从)
7.2 每周检查
- 分析表统计信息(ANALYZE)
- 检查未使用的索引
- 检查表膨胀情况
- 回顾性能趋势
- 测试备份恢复
7.3 每月检查
- VACUUM FULL大表(如需要)
- 审查用户权限
- 更新数据库统计信息
- 检查配置参数是否需要调整
- 容量规划评估
7.4 季度检查
- PostgreSQL版本升级计划
- 安全审计
- 灾难恢复演练
- 性能基准测试
8. 🚨 故障处理
8.1 1. 连接数耗尽
-- 查看当前连接
SELECT count(*) FROM pg_stat_activity;
-- 查看每个数据库的连接数
SELECT datname, count(*) FROM pg_stat_activity GROUP BY datname;
-- 终止空闲连接
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
AND state_change < NOW() - INTERVAL '10 minutes';
-- 临时解决
ALTER SYSTEM SET max_connections = 300;
SELECT pg_reload_conf();
-- 长期解决:使用连接池(PgBouncer)
8.2 2. 磁盘空间不足
# 检查磁盘使用
df -h
# 检查PostgreSQL数据目录
du -sh /var/lib/postgresql/14/main/*
# 清理WAL日志(谨慎!)
# SELECT pg_switch_wal(); -- 切换WAL
# 手动删除归档的WAL
# 清理旧的pg_log
find /var/log/postgresql -name "*.log" -mtime +7 -delete
# VACUUM FULL释放空间(锁表,慢)
VACUUM FULL;
8.3 3. 性能突然下降
-- 1. 检查慢查询
SELECT query, mean_exec_time, calls
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;
-- 2. 检查锁等待
-- (使用上面的锁监控SQL)
-- 3. 检查是否需要VACUUM
SELECT
schemaname,
tablename,
n_dead_tup,
n_live_tup,
last_autovacuum
FROM pg_stat_user_tables
WHERE n_dead_tup > 10000
ORDER BY n_dead_tup DESC;
-- 4. 重新分析统计信息
ANALYZE;
-- 5. 检查是否有长事务
-- (使用上面的长事务监控SQL)
9. 📝 配置文件模板
# postgresql.conf - 生产环境推荐配置(16GB内存服务器)
# CONNECTIONS
max_connections = 200
superuser_reserved_connections = 3
# MEMORY
shared_buffers = 4GB
effective_cache_size = 12GB
work_mem = 16MB
maintenance_work_mem = 512MB
wal_buffers = -1
# WAL
wal_level = replica
max_wal_senders = 10
max_replication_slots = 10
wal_keep_size = 1GB
checkpoint_timeout = 15min
checkpoint_completion_target = 0.9
max_wal_size = 4GB
min_wal_size = 1GB
# QUERY TUNING
random_page_cost = 1.1 # SSD
effective_io_concurrency = 200 # SSD
default_statistics_target = 100
# LOGGING
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_rotation_age = 1d
log_rotation_size = 100MB
log_min_duration_statement = 1000 # 1秒
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_checkpoints = on
log_connections = on
log_disconnections = on
log_lock_waits = on
# AUTOVACUUM
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 1min
# STATISTICS
track_activities = on
track_counts = on
track_io_timing = on
track_functions = all
# LOCALE
lc_messages = 'en_US.UTF-8'
lc_monetary = 'en_US.UTF-8'
lc_numeric = 'en_US.UTF-8'
lc_time = 'en_US.UTF-8'
default_text_search_config = 'pg_catalog.english'
# TIMEZONE
timezone = 'UTC'
10. 🔗 有用的资源
10.1 官方文档
- PostgreSQL官方文档: https://www.postgresql.org/docs/
- PostgreSQL Wiki: https://wiki.postgresql.org/
10.2 监控工具
- pgAdmin: https://www.pgadmin.org/
- pg_stat_statements: 内置扩展
- Prometheus + postgres_exporter
- Grafana仪表板
10.3 备份工具
- pg_dump/pg_restore: 内置
- pgBackRest: https://pgbackrest.org/
- Barman: https://www.pgbarman.org/
10.4 高可用方案
- Patroni: https://github.com/patroni/patroni
- Repmgr: https://repmgr.org/
- Stolon: https://github.com/sorintlab/stolon
10.5 连接池
- PgBouncer: https://www.pgbouncer.org/
- Pgpool-II: https://www.pgpool.net/
10.6 性能分析
- pg_stat_statements: 内置扩展
- pgBadger: 日志分析工具
- explain.depesz.com: 执行计划可视化
11. 📚 总结
生产环境PostgreSQL的关键要点:
- 安全第一: SSL、最小权限原则、网络隔离
- 定期备份: 自动化备份、测试恢复、异地存储
- 性能监控: 实时监控、告警设置、定期优化
- 高可用性: 主从复制、自动故障转移
- 运维规范: 变更管理、容量规划、文档完善
恭喜你完成了PostgreSQL完整教程! 🎉
从基础到实战,你已经掌握了PostgreSQL的核心知识。继续实践,在项目中应用这些技能,你会成为PostgreSQL专家!
xingliuhua