目录

pgsql-08 聚合函数与分组

08 - 聚合函数与分组

1. 📖 概述

聚合函数对一组值进行计算并返回单个值。本章详细介绍PostgreSQL的聚合函数、分组查询和HAVING子句。

2. 🔢 基本聚合函数

2.1 COUNT计数

-- 统计所有行
SELECT COUNT(*) FROM users;

-- 统计非NULL值
SELECT COUNT(phone) FROM users;

-- 统计唯一值
SELECT COUNT(DISTINCT city) FROM users;
SELECT COUNT(DISTINCT country) FROM users;

-- 实战: 多维度计数
SELECT
    COUNT(*) AS total_users,
    COUNT(phone) AS users_with_phone,
    COUNT(DISTINCT city) AS unique_cities,
    COUNT(*) FILTER (WHERE is_active = TRUE) AS active_users,
    COUNT(*) FILTER (WHERE created_at >= NOW() - INTERVAL '30 days') AS new_users
FROM users;

2.2 SUM求和

-- 基本求和
SELECT SUM(total_amount) FROM orders;

-- 条件求和
SELECT
    SUM(total_amount) AS total_revenue,
    SUM(total_amount) FILTER (WHERE status = 'completed') AS completed_revenue,
    SUM(total_amount) FILTER (WHERE status = 'cancelled') AS cancelled_amount
FROM orders;

-- 注意NULL值
SELECT SUM(commission) FROM sales;  -- NULL值被忽略
SELECT SUM(COALESCE(commission, 0)) FROM sales;  -- 将NULL转为0

2.3 AVG平均值

-- 基本平均值
SELECT AVG(price) FROM products;

-- 四舍五入
SELECT ROUND(AVG(price), 2) FROM products;

-- 按条件平均
SELECT
    AVG(price) AS avg_all,
    AVG(price) FILTER (WHERE stock > 0) AS avg_in_stock,
    AVG(price) FILTER (WHERE category = 'Electronics') AS avg_electronics
FROM products;

-- 加权平均
SELECT
    SUM(price * quantity) / SUM(quantity) AS weighted_avg_price
FROM order_items;

2.4 MAX和MIN

-- 最大值和最小值
SELECT
    MAX(price) AS max_price,
    MIN(price) AS min_price,
    MAX(price) - MIN(price) AS price_range
FROM products;

-- 最新和最早日期
SELECT
    MIN(created_at) AS first_order,
    MAX(created_at) AS last_order,
    AGE(MAX(created_at), MIN(created_at)) AS business_duration
FROM orders;

-- 字符串MAX/MIN (按字母顺序)
SELECT
    MIN(username) AS first_alphabetically,
    MAX(username) AS last_alphabetically
FROM users;

3. 📊 GROUP BY分组

3.1 单列分组

-- 按分类统计产品
SELECT
    category,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price,
    MIN(price) AS min_price,
    MAX(price) AS max_price
FROM products
GROUP BY category
ORDER BY product_count DESC;

-- 按状态统计订单
SELECT
    status,
    COUNT(*) AS order_count,
    SUM(total_amount) AS total_revenue,
    AVG(total_amount) AS avg_order_value
FROM orders
GROUP BY status;

3.2 多列分组

-- 按分类和品牌分组
SELECT
    category,
    brand,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price
FROM products
GROUP BY category, brand
ORDER BY category, brand;

-- 按年月分组
SELECT
    EXTRACT(YEAR FROM created_at) AS year,
    EXTRACT(MONTH FROM created_at) AS month,
    COUNT(*) AS order_count,
    SUM(total_amount) AS revenue
FROM orders
GROUP BY
    EXTRACT(YEAR FROM created_at),
    EXTRACT(MONTH FROM created_at)
ORDER BY year, month;

-- 使用DATE_TRUNC (推荐)
SELECT
    DATE_TRUNC('month', created_at) AS month,
    COUNT(*) AS order_count,
    SUM(total_amount) AS revenue
FROM orders
GROUP BY DATE_TRUNC('month', created_at)
ORDER BY month;

3.3 表达式分组

-- 按价格区间分组
SELECT
    CASE
        WHEN price < 20 THEN '0-20'
        WHEN price < 50 THEN '20-50'
        WHEN price < 100 THEN '50-100'
        ELSE '100+'
    END AS price_range,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price
FROM products
GROUP BY
    CASE
        WHEN price < 20 THEN '0-20'
        WHEN price < 50 THEN '20-50'
        WHEN price < 100 THEN '50-100'
        ELSE '100+'
    END
ORDER BY price_range;

-- 使用WIDTH_BUCKET (更优雅)
SELECT
    WIDTH_BUCKET(price, 0, 200, 10) AS price_bucket,
    MIN(price) AS min_price,
    MAX(price) AS max_price,
    COUNT(*) AS count
FROM products
GROUP BY WIDTH_BUCKET(price, 0, 200, 10)
ORDER BY price_bucket;

4. 🎯 HAVING子句

4.1 基本HAVING

-- 找出产品数量超过10的分类
SELECT
    category,
    COUNT(*) AS product_count
FROM products
GROUP BY category
HAVING COUNT(*) > 10
ORDER BY product_count DESC;

-- 找出平均订单金额超过100的用户
SELECT
    user_id,
    COUNT(*) AS order_count,
    AVG(total_amount) AS avg_order_value
FROM orders
GROUP BY user_id
HAVING AVG(total_amount) > 100
ORDER BY avg_order_value DESC;

4.2 WHERE vs HAVING

-- WHERE: 分组前过滤
-- HAVING: 分组后过滤

-- 错误: 不能在WHERE中使用聚合函数
-- SELECT category, COUNT(*)
-- FROM products
-- WHERE COUNT(*) > 10  -- ❌ 错误!
-- GROUP BY category;

-- 正确: 使用HAVING
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING COUNT(*) > 10;  -- ✅ 正确

-- 组合使用
SELECT
    category,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price
FROM products
WHERE stock > 0  -- 先过滤: 只看有库存的产品
GROUP BY category
HAVING COUNT(*) > 5  -- 再过滤: 只看产品数>5的分类
ORDER BY avg_price DESC;

4.3 复杂HAVING条件

-- 多个条件
SELECT
    category,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price,
    SUM(stock) AS total_stock
FROM products
GROUP BY category
HAVING COUNT(*) >= 10
   AND AVG(price) > 50
   AND SUM(stock) > 100
ORDER BY avg_price DESC;

-- 使用子查询
SELECT
    category,
    AVG(price) AS avg_price
FROM products
GROUP BY category
HAVING AVG(price) > (
    SELECT AVG(price) FROM products
);

5. 🔧 高级聚合函数

5.1 STRING_AGG字符串聚合

-- 将多行合并为一行
SELECT
    category,
    STRING_AGG(name, ', ' ORDER BY price DESC) AS products
FROM products
GROUP BY category;

-- 带DISTINCT
SELECT
    order_id,
    STRING_AGG(DISTINCT product_name, ' | ') AS products
FROM order_items
GROUP BY order_id;

-- MySQL对比: GROUP_CONCAT
-- SELECT category, GROUP_CONCAT(name ORDER BY price DESC) FROM products GROUP BY category;

5.2 ARRAY_AGG数组聚合

-- 聚合为数组 (PostgreSQL独有)
SELECT
    category,
    ARRAY_AGG(name ORDER BY price DESC) AS product_names,
    ARRAY_AGG(price ORDER BY price DESC) AS prices
FROM products
GROUP BY category;

-- 过滤NULL值
SELECT
    category,
    ARRAY_AGG(tag) FILTER (WHERE tag IS NOT NULL) AS tags
FROM products
GROUP BY category;

5.3 JSON聚合

-- JSON_AGG: 聚合为JSON数组
SELECT
    category,
    JSON_AGG(
        JSON_BUILD_OBJECT(
            'name', name,
            'price', price,
            'stock', stock
        ) ORDER BY price DESC
    ) AS products
FROM products
GROUP BY category;

-- JSONB_AGG (推荐)
SELECT
    user_id,
    JSONB_AGG(
        JSONB_BUILD_OBJECT(
            'order_id', id,
            'amount', total_amount,
            'date', created_at
        )
    ) AS orders
FROM orders
GROUP BY user_id;

-- JSONB_OBJECT_AGG: 聚合为JSON对象
SELECT
    category,
    JSONB_OBJECT_AGG(name, price) AS price_list
FROM products
GROUP BY category;

5.4 统计聚合函数

-- 标准差和方差
SELECT
    category,
    AVG(price) AS avg_price,
    STDDEV(price) AS std_dev,
    VARIANCE(price) AS variance
FROM products
GROUP BY category;

-- 百分位数
SELECT
    category,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY price) AS median,
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY price) AS q1,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY price) AS q3
FROM products
GROUP BY category;

-- 众数 (最常出现的值)
SELECT
    category,
    MODE() WITHIN GROUP (ORDER BY brand) AS most_common_brand
FROM products
GROUP BY category;

6. 📈 分组集合

6.1 GROUPING SETS

-- 一次查询多种分组
SELECT
    category,
    brand,
    COUNT(*) AS count,
    AVG(price) AS avg_price
FROM products
GROUP BY GROUPING SETS (
    (category, brand),  -- 按category和brand
    (category),         -- 只按category
    (brand),            -- 只按brand
    ()                  -- 总计
)
ORDER BY category NULLS LAST, brand NULLS LAST;

-- 识别分组级别
SELECT
    category,
    brand,
    COUNT(*) AS count,
    GROUPING(category) AS is_category_total,
    GROUPING(brand) AS is_brand_total
FROM products
GROUP BY GROUPING SETS ((category, brand), (category), (brand), ())
ORDER BY category NULLS LAST, brand NULLS LAST;

6.2 ROLLUP

-- 分层汇总
SELECT
    category,
    brand,
    COUNT(*) AS count,
    SUM(price * stock) AS inventory_value
FROM products
GROUP BY ROLLUP (category, brand)
ORDER BY category NULLS LAST, brand NULLS LAST;

-- 等价于:
-- GROUP BY GROUPING SETS (
--     (category, brand),
--     (category),
--     ()
-- )

-- 时间分层汇总
SELECT
    EXTRACT(YEAR FROM created_at) AS year,
    EXTRACT(MONTH FROM created_at) AS month,
    SUM(total_amount) AS revenue
FROM orders
GROUP BY ROLLUP (
    EXTRACT(YEAR FROM created_at),
    EXTRACT(MONTH FROM created_at)
)
ORDER BY year NULLS LAST, month NULLS LAST;

6.3 CUBE

-- 所有维度组合
SELECT
    category,
    brand,
    is_active,
    COUNT(*) AS count
FROM products
GROUP BY CUBE (category, brand, is_active)
ORDER BY category NULLS LAST, brand NULLS LAST, is_active NULLS LAST;

-- 结果包含:
-- (category, brand, is_active)
-- (category, brand), (category, is_active), (brand, is_active)
-- (category), (brand), (is_active)
-- ()

7. 💡 实战示例

7.1 销售报表

-- 每日销售统计
SELECT
    DATE(created_at) AS sale_date,
    COUNT(DISTINCT id) AS order_count,
    COUNT(DISTINCT user_id) AS unique_customers,
    SUM(total_amount) AS daily_revenue,
    AVG(total_amount) AS avg_order_value,
    MIN(total_amount) AS min_order,
    MAX(total_amount) AS max_order
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY sale_date DESC;

7.2 用户分层

-- 按购买次数和金额分层
WITH user_stats AS (
    SELECT
        user_id,
        COUNT(*) AS order_count,
        SUM(total_amount) AS lifetime_value
    FROM orders
    GROUP BY user_id
)
SELECT
    CASE
        WHEN order_count = 0 THEN 'Inactive'
        WHEN order_count = 1 THEN 'One-time'
        WHEN order_count <= 5 THEN 'Regular'
        WHEN order_count <= 20 THEN 'Loyal'
        ELSE 'VIP'
    END AS customer_tier,
    COUNT(*) AS customer_count,
    AVG(lifetime_value) AS avg_ltv,
    SUM(lifetime_value) AS total_revenue
FROM user_stats
GROUP BY
    CASE
        WHEN order_count = 0 THEN 'Inactive'
        WHEN order_count = 1 THEN 'One-time'
        WHEN order_count <= 5 THEN 'Regular'
        WHEN order_count <= 20 THEN 'Loyal'
        ELSE 'VIP'
    END
ORDER BY total_revenue DESC;

7.3 库存预警

-- 低库存产品统计
SELECT
    category,
    COUNT(*) FILTER (WHERE stock = 0) AS out_of_stock,
    COUNT(*) FILTER (WHERE stock > 0 AND stock < 10) AS low_stock,
    COUNT(*) FILTER (WHERE stock >= 10 AND stock < 50) AS medium_stock,
    COUNT(*) FILTER (WHERE stock >= 50) AS well_stocked,
    COUNT(*) AS total_products,
    ROUND(100.0 * COUNT(*) FILTER (WHERE stock = 0) / COUNT(*), 2) AS out_of_stock_pct
FROM products
GROUP BY category
HAVING COUNT(*) FILTER (WHERE stock = 0) > 0
ORDER BY out_of_stock_pct DESC;

7.4 同比环比分析

-- 月度销售同比环比
WITH monthly_sales AS (
    SELECT
        DATE_TRUNC('month', created_at) AS month,
        SUM(total_amount) AS revenue
    FROM orders
    GROUP BY DATE_TRUNC('month', created_at)
)
SELECT
    month,
    revenue,
    LAG(revenue, 1) OVER (ORDER BY month) AS prev_month_revenue,
    LAG(revenue, 12) OVER (ORDER BY month) AS same_month_last_year,
    revenue - LAG(revenue, 1) OVER (ORDER BY month) AS mom_diff,
    ROUND(100.0 * (revenue - LAG(revenue, 1) OVER (ORDER BY month)) /
          NULLIF(LAG(revenue, 1) OVER (ORDER BY month), 0), 2) AS mom_growth_pct,
    ROUND(100.0 * (revenue - LAG(revenue, 12) OVER (ORDER BY month)) /
          NULLIF(LAG(revenue, 12) OVER (ORDER BY month), 0), 2) AS yoy_growth_pct
FROM monthly_sales
ORDER BY month DESC;

8. 🎯 性能优化

8.1 使用索引

-- 确保GROUP BY的列有索引
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_created ON orders(created_at);
CREATE INDEX idx_products_category ON products(category);

-- 复合索引
CREATE INDEX idx_orders_user_created ON orders(user_id, created_at);

8.2 避免不必要的分组

-- 不好: 不必要的分组
SELECT
    user_id,
    COUNT(*)
FROM orders
GROUP BY user_id;
-- 如果只需要总数, 不应该分组

-- 好: 直接计数
SELECT COUNT(*) FROM orders;

8.3 使用物化视图

-- 对于复杂的聚合查询, 使用物化视图
CREATE MATERIALIZED VIEW daily_sales_summary AS
SELECT
    DATE(created_at) AS sale_date,
    COUNT(*) AS order_count,
    SUM(total_amount) AS revenue,
    AVG(total_amount) AS avg_order_value
FROM orders
GROUP BY DATE(created_at);

-- 创建索引
CREATE INDEX idx_daily_sales_date ON daily_sales_summary(sale_date);

-- 定期刷新
REFRESH MATERIALIZED VIEW daily_sales_summary;

-- 查询快速
SELECT * FROM daily_sales_summary WHERE sale_date >= '2024-01-01';

9. 🆚 PostgreSQL vs MySQL

功能 PostgreSQL MySQL
FILTER子句 支持 不支持
STRING_AGG 支持 GROUP_CONCAT
ARRAY_AGG 支持 不支持
JSON_AGG 支持 5.7+支持
GROUPING SETS 支持 不支持
ROLLUP 支持 支持
CUBE 支持 不支持
统计函数 STDDEV, VARIANCE等 支持
百分位数 PERCENTILE_CONT等 不支持

10. 📚 下一步


关键要点:

  • GROUP BY必须包含SELECT中非聚合列
  • HAVING用于过滤分组后的结果
  • FILTER子句优雅地实现条件聚合
  • GROUPING SETS避免多次UNION
  • 为分组列创建索引提高性能