pgsql-08 聚合函数与分组
目录
08 - 聚合函数与分组
1. 📖 概述
聚合函数对一组值进行计算并返回单个值。本章详细介绍PostgreSQL的聚合函数、分组查询和HAVING子句。
2. 🔢 基本聚合函数
2.1 COUNT计数
-- 统计所有行
SELECT COUNT(*) FROM users;
-- 统计非NULL值
SELECT COUNT(phone) FROM users;
-- 统计唯一值
SELECT COUNT(DISTINCT city) FROM users;
SELECT COUNT(DISTINCT country) FROM users;
-- 实战: 多维度计数
SELECT
COUNT(*) AS total_users,
COUNT(phone) AS users_with_phone,
COUNT(DISTINCT city) AS unique_cities,
COUNT(*) FILTER (WHERE is_active = TRUE) AS active_users,
COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 days') AS new_users
FROM users;
2.2 SUM求和
-- 基本求和
SELECT SUM(total_amount) FROM orders;
-- 条件求和
SELECT
SUM(total_amount) AS total_revenue,
SUM(total_amount) FILTER (WHERE status = 'completed') AS completed_revenue,
SUM(total_amount) FILTER (WHERE status = 'cancelled') AS cancelled_amount
FROM orders;
-- 注意NULL值
SELECT SUM(commission) FROM sales; -- NULL值被忽略
SELECT SUM(COALESCE(commission, 0)) FROM sales; -- 将NULL转为0
2.3 AVG平均值
-- 基本平均值
SELECT AVG(price) FROM products;
-- 四舍五入
SELECT ROUND(AVG(price), 2) FROM products;
-- 按条件平均
SELECT
AVG(price) AS avg_all,
AVG(price) FILTER (WHERE stock > 0) AS avg_in_stock,
AVG(price) FILTER (WHERE category = 'Electronics') AS avg_electronics
FROM products;
-- 加权平均
SELECT
SUM(price * quantity) / SUM(quantity) AS weighted_avg_price
FROM order_items;
2.4 MAX和MIN
-- 最大值和最小值
SELECT
MAX(price) AS max_price,
MIN(price) AS min_price,
MAX(price) - MIN(price) AS price_range
FROM products;
-- 最新和最早日期
SELECT
MIN(created_at) AS first_order,
MAX(created_at) AS last_order,
AGE(MAX(created_at), MIN(created_at)) AS business_duration
FROM orders;
-- 字符串MAX/MIN (按字母顺序)
SELECT
MIN(username) AS first_alphabetically,
MAX(username) AS last_alphabetically
FROM users;
3. 📊 GROUP BY分组
3.1 单列分组
-- 按分类统计产品
SELECT
category,
COUNT(*) AS product_count,
AVG(price) AS avg_price,
MIN(price) AS min_price,
MAX(price) AS max_price
FROM products
GROUP BY category
ORDER BY product_count DESC;
-- 按状态统计订单
SELECT
status,
COUNT(*) AS order_count,
SUM(total_amount) AS total_revenue,
AVG(total_amount) AS avg_order_value
FROM orders
GROUP BY status;
3.2 多列分组
-- 按分类和品牌分组
SELECT
category,
brand,
COUNT(*) AS product_count,
AVG(price) AS avg_price
FROM products
GROUP BY category, brand
ORDER BY category, brand;
-- 按年月分组
SELECT
EXTRACT(YEAR FROM created_at) AS year,
EXTRACT(MONTH FROM created_at) AS month,
COUNT(*) AS order_count,
SUM(total_amount) AS revenue
FROM orders
GROUP BY
EXTRACT(YEAR FROM created_at),
EXTRACT(MONTH FROM created_at)
ORDER BY year, month;
-- 使用DATE_TRUNC (推荐)
SELECT
DATE_TRUNC('month', created_at) AS month,
COUNT(*) AS order_count,
SUM(total_amount) AS revenue
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month;
3.3 表达式分组
-- 按价格区间分组
SELECT
CASE
WHEN price < 20 THEN '0-20'
WHEN price < 50 THEN '20-50'
WHEN price < 100 THEN '50-100'
ELSE '100+'
END AS price_range,
COUNT(*) AS product_count,
AVG(price) AS avg_price
FROM products
GROUP BY
CASE
WHEN price < 20 THEN '0-20'
WHEN price < 50 THEN '20-50'
WHEN price < 100 THEN '50-100'
ELSE '100+'
END
ORDER BY price_range;
-- 使用WIDTH_BUCKET (更优雅)
SELECT
WIDTH_BUCKET(price, 0, 200, 10) AS price_bucket,
MIN(price) AS min_price,
MAX(price) AS max_price,
COUNT(*) AS count
FROM products
GROUP BY WIDTH_BUCKET(price, 0, 200, 10)
ORDER BY price_bucket;
4. 🎯 HAVING子句
4.1 基本HAVING
-- 找出产品数量超过10的分类
SELECT
category,
COUNT(*) AS product_count
FROM products
GROUP BY category
HAVING COUNT(*) > 10
ORDER BY product_count DESC;
-- 找出平均订单金额超过100的用户
SELECT
user_id,
COUNT(*) AS order_count,
AVG(total_amount) AS avg_order_value
FROM orders
GROUP BY user_id
HAVING AVG(total_amount) > 100
ORDER BY avg_order_value DESC;
4.2 WHERE vs HAVING
-- WHERE: 分组前过滤
-- HAVING: 分组后过滤
-- 错误: 不能在WHERE中使用聚合函数
-- SELECT category, COUNT(*)
-- FROM products
-- WHERE COUNT(*) > 10 -- ❌ 错误!
-- GROUP BY category;
-- 正确: 使用HAVING
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING COUNT(*) > 10; -- ✅ 正确
-- 组合使用
SELECT
category,
COUNT(*) AS product_count,
AVG(price) AS avg_price
FROM products
WHERE stock > 0 -- 先过滤: 只看有库存的产品
GROUP BY category
HAVING COUNT(*) > 5 -- 再过滤: 只看产品数>5的分类
ORDER BY avg_price DESC;
4.3 复杂HAVING条件
-- 多个条件
SELECT
category,
COUNT(*) AS product_count,
AVG(price) AS avg_price,
SUM(stock) AS total_stock
FROM products
GROUP BY category
HAVING COUNT(*) >= 10
AND AVG(price) > 50
AND SUM(stock) > 100
ORDER BY avg_price DESC;
-- 使用子查询
SELECT
category,
AVG(price) AS avg_price
FROM products
GROUP BY category
HAVING AVG(price) > (
SELECT AVG(price) FROM products
);
5. 🔧 高级聚合函数
5.1 STRING_AGG字符串聚合
-- 将多行合并为一行
SELECT
category,
STRING_AGG(name, ', ' ORDER BY price DESC) AS products
FROM products
GROUP BY category;
-- 带DISTINCT
SELECT
order_id,
STRING_AGG(DISTINCT product_name, ' | ') AS products
FROM order_items
GROUP BY order_id;
-- MySQL对比: GROUP_CONCAT
-- SELECT category, GROUP_CONCAT(name ORDER BY price DESC) FROM products GROUP BY category;
5.2 ARRAY_AGG数组聚合
-- 聚合为数组 (PostgreSQL独有)
SELECT
category,
ARRAY_AGG(name ORDER BY price DESC) AS product_names,
ARRAY_AGG(price ORDER BY price DESC) AS prices
FROM products
GROUP BY category;
-- 过滤NULL值
SELECT
category,
ARRAY_AGG(tag) FILTER (WHERE tag IS NOT NULL) AS tags
FROM products
GROUP BY category;
5.3 JSON聚合
-- JSON_AGG: 聚合为JSON数组
SELECT
category,
JSON_AGG(
JSON_BUILD_OBJECT(
'name', name,
'price', price,
'stock', stock
) ORDER BY price DESC
) AS products
FROM products
GROUP BY category;
-- JSONB_AGG (推荐)
SELECT
user_id,
JSONB_AGG(
JSONB_BUILD_OBJECT(
'order_id', id,
'amount', total_amount,
'date', created_at
)
) AS orders
FROM orders
GROUP BY user_id;
-- JSONB_OBJECT_AGG: 聚合为JSON对象
SELECT
category,
JSONB_OBJECT_AGG(name, price) AS price_list
FROM products
GROUP BY category;
5.4 统计聚合函数
-- 标准差和方差
SELECT
category,
AVG(price) AS avg_price,
STDDEV(price) AS std_dev,
VARIANCE(price) AS variance
FROM products
GROUP BY category;
-- 百分位数
SELECT
category,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY price) AS median,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY price) AS q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY price) AS q3
FROM products
GROUP BY category;
-- 众数 (最常出现的值)
SELECT
category,
MODE() WITHIN GROUP (ORDER BY brand) AS most_common_brand
FROM products
GROUP BY category;
6. 📈 分组集合
6.1 GROUPING SETS
-- 一次查询多种分组
SELECT
category,
brand,
COUNT(*) AS count,
AVG(price) AS avg_price
FROM products
GROUP BY GROUPING SETS (
(category, brand), -- 按category和brand
(category), -- 只按category
(brand), -- 只按brand
() -- 总计
)
ORDER BY category NULLS LAST, brand NULLS LAST;
-- 识别分组级别
SELECT
category,
brand,
COUNT(*) AS count,
GROUPING(category) AS is_category_total,
GROUPING(brand) AS is_brand_total
FROM products
GROUP BY GROUPING SETS ((category, brand), (category), (brand), ())
ORDER BY category NULLS LAST, brand NULLS LAST;
6.2 ROLLUP
-- 分层汇总
SELECT
category,
brand,
COUNT(*) AS count,
SUM(price * stock) AS inventory_value
FROM products
GROUP BY ROLLUP (category, brand)
ORDER BY category NULLS LAST, brand NULLS LAST;
-- 等价于:
-- GROUP BY GROUPING SETS (
-- (category, brand),
-- (category),
-- ()
-- )
-- 时间分层汇总
SELECT
EXTRACT(YEAR FROM created_at) AS year,
EXTRACT(MONTH FROM created_at) AS month,
SUM(total_amount) AS revenue
FROM orders
GROUP BY ROLLUP (
EXTRACT(YEAR FROM created_at),
EXTRACT(MONTH FROM created_at)
)
ORDER BY year NULLS LAST, month NULLS LAST;
6.3 CUBE
-- 所有维度组合
SELECT
category,
brand,
is_active,
COUNT(*) AS count
FROM products
GROUP BY CUBE (category, brand, is_active)
ORDER BY category NULLS LAST, brand NULLS LAST, is_active NULLS LAST;
-- 结果包含:
-- (category, brand, is_active)
-- (category, brand), (category, is_active), (brand, is_active)
-- (category), (brand), (is_active)
-- ()
7. 💡 实战示例
7.1 销售报表
-- 每日销售统计
SELECT
DATE(created_at) AS sale_date,
COUNT(DISTINCT id) AS order_count,
COUNT(DISTINCT user_id) AS unique_customers,
SUM(total_amount) AS daily_revenue,
AVG(total_amount) AS avg_order_value,
MIN(total_amount) AS min_order,
MAX(total_amount) AS max_order
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY sale_date DESC;
7.2 用户分层
-- 按购买次数和金额分层
WITH user_stats AS (
SELECT
user_id,
COUNT(*) AS order_count,
SUM(total_amount) AS lifetime_value
FROM orders
GROUP BY user_id
)
SELECT
CASE
WHEN order_count = 0 THEN 'Inactive'
WHEN order_count = 1 THEN 'One-time'
WHEN order_count <= 5 THEN 'Regular'
WHEN order_count <= 20 THEN 'Loyal'
ELSE 'VIP'
END AS customer_tier,
COUNT(*) AS customer_count,
AVG(lifetime_value) AS avg_ltv,
SUM(lifetime_value) AS total_revenue
FROM user_stats
GROUP BY
CASE
WHEN order_count = 0 THEN 'Inactive'
WHEN order_count = 1 THEN 'One-time'
WHEN order_count <= 5 THEN 'Regular'
WHEN order_count <= 20 THEN 'Loyal'
ELSE 'VIP'
END
ORDER BY total_revenue DESC;
7.3 库存预警
-- 低库存产品统计
SELECT
category,
COUNT(*) FILTER (WHERE stock = 0) AS out_of_stock,
COUNT(*) FILTER (WHERE stock > 0 AND stock < 10) AS low_stock,
COUNT(*) FILTER (WHERE stock >= 10 AND stock < 50) AS medium_stock,
COUNT(*) FILTER (WHERE stock >= 50) AS well_stocked,
COUNT(*) AS total_products,
ROUND(100.0 * COUNT(*) FILTER (WHERE stock = 0) / COUNT(*), 2) AS out_of_stock_pct
FROM products
GROUP BY category
HAVING COUNT(*) FILTER (WHERE stock = 0) > 0
ORDER BY out_of_stock_pct DESC;
7.4 同比环比分析
-- 月度销售同比环比
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', created_at) AS month,
SUM(total_amount) AS revenue
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
)
SELECT
month,
revenue,
LAG(revenue, 1) OVER (ORDER BY month) AS prev_month_revenue,
LAG(revenue, 12) OVER (ORDER BY month) AS same_month_last_year,
revenue - LAG(revenue, 1) OVER (ORDER BY month) AS mom_diff,
ROUND(100.0 * (revenue - LAG(revenue, 1) OVER (ORDER BY month)) /
NULLIF(LAG(revenue, 1) OVER (ORDER BY month), 0), 2) AS mom_growth_pct,
ROUND(100.0 * (revenue - LAG(revenue, 12) OVER (ORDER BY month)) /
NULLIF(LAG(revenue, 12) OVER (ORDER BY month), 0), 2) AS yoy_growth_pct
FROM monthly_sales
ORDER BY month DESC;
8. 🎯 性能优化
8.1 使用索引
-- 确保GROUP BY的列有索引
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_created ON orders(created_at);
CREATE INDEX idx_products_category ON products(category);
-- 复合索引
CREATE INDEX idx_orders_user_created ON orders(user_id, created_at);
8.2 避免不必要的分组
-- 不好: 不必要的分组
SELECT
user_id,
COUNT(*)
FROM orders
GROUP BY user_id;
-- 如果只需要总数, 不应该分组
-- 好: 直接计数
SELECT COUNT(*) FROM orders;
8.3 使用物化视图
-- 对于复杂的聚合查询, 使用物化视图
CREATE MATERIALIZED VIEW daily_sales_summary AS
SELECT
DATE(created_at) AS sale_date,
COUNT(*) AS order_count,
SUM(total_amount) AS revenue,
AVG(total_amount) AS avg_order_value
FROM orders
GROUP BY DATE(created_at);
-- 创建索引
CREATE INDEX idx_daily_sales_date ON daily_sales_summary(sale_date);
-- 定期刷新
REFRESH MATERIALIZED VIEW daily_sales_summary;
-- 查询快速
SELECT * FROM daily_sales_summary WHERE sale_date >= '2024-01-01';
9. 🆚 PostgreSQL vs MySQL
| 功能 | PostgreSQL | MySQL |
|---|---|---|
| FILTER子句 | 支持 | 不支持 |
| STRING_AGG | 支持 | GROUP_CONCAT |
| ARRAY_AGG | 支持 | 不支持 |
| JSON_AGG | 支持 | 5.7+支持 |
| GROUPING SETS | 支持 | 不支持 |
| ROLLUP | 支持 | 支持 |
| CUBE | 支持 | 不支持 |
| 统计函数 | STDDEV, VARIANCE等 | 支持 |
| 百分位数 | PERCENTILE_CONT等 | 不支持 |
10. 📚 下一步
关键要点:
- GROUP BY必须包含SELECT中非聚合列
- HAVING用于过滤分组后的结果
- FILTER子句优雅地实现条件聚合
- GROUPING SETS避免多次UNION
- 为分组列创建索引提高性能
xingliuhua