目录

pgsql-15 窗口函数应用

15 - 窗口函数应用

1. 📖 概述

窗口函数(Window Functions)是PostgreSQL的强大特性,允许在结果集的分区上执行计算,而不需要GROUP BY,非常适合排名、移动平均、累计求和等场景。

2. 🎯 窗口函数基础

2.1 基本语法

function_name([expression]) OVER (
    [PARTITION BY partition_expression]
    [ORDER BY sort_expression [ASC | DESC]]
    [frame_clause]
)

2.2 简单示例

-- 创建测试表
CREATE TABLE sales (
    id SERIAL PRIMARY KEY,
    salesperson VARCHAR(50),
    region VARCHAR(50),
    amount NUMERIC(10, 2),
    sale_date DATE
);

INSERT INTO sales (salesperson, region, amount, sale_date) VALUES
    ('Alice', 'North', 1000, '2024-01-15'),
    ('Bob', 'North', 1500, '2024-01-20'),
    ('Charlie', 'South', 2000, '2024-01-18'),
    ('Alice', 'North', 1200, '2024-02-10'),
    ('Bob', 'North', 1800, '2024-02-15'),
    ('Charlie', 'South', 2200, '2024-02-20'),
    ('David', 'East', 1600, '2024-01-25'),
    ('David', 'East', 1900, '2024-02-28');

-- 基本窗口函数: 显示每条记录和总计
SELECT
    salesperson,
    region,
    amount,
    SUM(amount) OVER () AS total_sales,        -- 所有销售总额
    amount / SUM(amount) OVER () AS percentage -- 占比
FROM sales;

3. 🔢 排名函数

3.1 ROW_NUMBER()

为每一行分配唯一的序号。

-- 全局排名
SELECT
    salesperson,
    amount,
    ROW_NUMBER() OVER (ORDER BY amount DESC) AS rank
FROM sales;

-- 按区域排名
SELECT
    salesperson,
    region,
    amount,
    ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) AS rank_in_region
FROM sales;

3.2 RANK() 和 DENSE_RANK()

SELECT
    salesperson,
    amount,
    ROW_NUMBER() OVER (ORDER BY amount DESC) AS row_num,
    RANK() OVER (ORDER BY amount DESC) AS rank,
    DENSE_RANK() OVER (ORDER BY amount DESC) AS dense_rank
FROM sales;

-- 示例输出:
-- amount | row_num | rank | dense_rank
-- 2200   | 1       | 1    | 1
-- 2000   | 2       | 2    | 2
-- 1900   | 3       | 3    | 3
-- 1800   | 4       | 4    | 4
-- 1600   | 5       | 5    | 5
-- 1500   | 6       | 6    | 6
-- 1200   | 7       | 7    | 7
-- 1000   | 8       | 8    | 8

-- 相同金额的情况:
-- amount | row_num | rank | dense_rank
-- 2000   | 1       | 1    | 1
-- 2000   | 2       | 1    | 1  (RANK相同)
-- 1500   | 3       | 3    | 2  (RANK跳过2, DENSE_RANK不跳)
-- 1500   | 4       | 3    | 2
-- 1000   | 5       | 5    | 3

3.3 NTILE()

将结果集分成N个桶。

-- 分成4个组
SELECT
    salesperson,
    amount,
    NTILE(4) OVER (ORDER BY amount) AS quartile
FROM sales;

-- 应用: 客户分层
SELECT
    username,
    lifetime_value,
    CASE NTILE(4) OVER (ORDER BY lifetime_value DESC)
        WHEN 1 THEN 'VIP'
        WHEN 2 THEN 'High Value'
        WHEN 3 THEN 'Medium Value'
        ELSE 'Low Value'
    END AS customer_tier
FROM (
    SELECT
        u.username,
        COALESCE(SUM(o.total_amount), 0) AS lifetime_value
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id
    GROUP BY u.id, u.username
) AS customer_values;

4. 📊 聚合窗口函数

4.1 SUM(), AVG(), COUNT()

-- 累计求和
SELECT
    sale_date,
    salesperson,
    amount,
    SUM(amount) OVER (ORDER BY sale_date) AS running_total,
    AVG(amount) OVER (ORDER BY sale_date) AS running_avg,
    COUNT(*) OVER (ORDER BY sale_date) AS running_count
FROM sales
ORDER BY sale_date;

-- 按区域累计
SELECT
    sale_date,
    region,
    salesperson,
    amount,
    SUM(amount) OVER (
        PARTITION BY region
        ORDER BY sale_date
    ) AS regional_running_total
FROM sales
ORDER BY region, sale_date;

4.2 MIN(), MAX()

SELECT
    salesperson,
    region,
    amount,
    MIN(amount) OVER (PARTITION BY region) AS region_min,
    MAX(amount) OVER (PARTITION BY region) AS region_max,
    amount - MIN(amount) OVER (PARTITION BY region) AS diff_from_min,
    MAX(amount) OVER (PARTITION BY region) - amount AS diff_from_max
FROM sales;

5. 🔄 值函数

5.1 LAG() 和 LEAD()

访问前一行或后一行的数据。

-- LAG: 前一行
-- LEAD: 后一行

SELECT
    sale_date,
    salesperson,
    amount,
    LAG(amount) OVER (ORDER BY sale_date) AS prev_amount,
    LEAD(amount) OVER (ORDER BY sale_date) AS next_amount,
    amount - LAG(amount) OVER (ORDER BY sale_date) AS diff_from_prev,
    LEAD(amount) OVER (ORDER BY sale_date) - amount AS diff_to_next
FROM sales
ORDER BY sale_date;

-- 按销售员分组
SELECT
    sale_date,
    salesperson,
    amount,
    LAG(amount, 1) OVER (
        PARTITION BY salesperson
        ORDER BY sale_date
    ) AS prev_sale,
    amount - LAG(amount, 1, 0) OVER (
        PARTITION BY salesperson
        ORDER BY sale_date
    ) AS growth
FROM sales
ORDER BY salesperson, sale_date;

-- 参数说明:
-- LAG(expression, offset, default)
-- offset: 偏移量(默认1)
-- default: 没有前一行时的默认值

5.2 FIRST_VALUE() 和 LAST_VALUE()

-- FIRST_VALUE: 分区第一个值
-- LAST_VALUE: 分区最后一个值

SELECT
    sale_date,
    region,
    amount,
    FIRST_VALUE(amount) OVER (
        PARTITION BY region
        ORDER BY sale_date
    ) AS first_sale_in_region,
    LAST_VALUE(amount) OVER (
        PARTITION BY region
        ORDER BY sale_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_sale_in_region
FROM sales;

-- 注意: LAST_VALUE需要指定frame,否则只会取到当前行

5.3 NTH_VALUE()

-- 获取第N个值
SELECT
    sale_date,
    region,
    amount,
    NTH_VALUE(amount, 2) OVER (
        PARTITION BY region
        ORDER BY sale_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS second_sale_in_region
FROM sales;

6. 📐 窗口帧(Frame Clause)

6.1 Frame类型

-- ROWS: 物理行
-- RANGE: 逻辑范围

-- Frame边界:
-- UNBOUNDED PRECEDING: 分区开始
-- N PRECEDING: 当前行之前N行
-- CURRENT ROW: 当前行
-- N FOLLOWING: 当前行之后N行
-- UNBOUNDED FOLLOWING: 分区结束

6.2 移动平均

-- 3日移动平均
SELECT
    sale_date,
    amount,
    AVG(amount) OVER (
        ORDER BY sale_date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS moving_avg_3day
FROM sales
ORDER BY sale_date;

-- 中心移动平均(前后各1天)
SELECT
    sale_date,
    amount,
    AVG(amount) OVER (
        ORDER BY sale_date
        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS centered_moving_avg
FROM sales
ORDER BY sale_date;

6.3 累计窗口

-- 累计求和(默认frame)
SELECT
    sale_date,
    amount,
    SUM(amount) OVER (
        ORDER BY sale_date
        -- 默认: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative_sum
FROM sales;

-- 完整分区聚合
SELECT
    sale_date,
    amount,
    SUM(amount) OVER (
        ORDER BY sale_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS total_sum
FROM sales;

7. 🎨 实战应用

7.1 1. 排名查询

-- 每个分类中价格最高的3个产品
WITH ranked_products AS (
    SELECT
        category,
        name,
        price,
        ROW_NUMBER() OVER (
            PARTITION BY category
            ORDER BY price DESC
        ) AS rank
    FROM products
)
SELECT category, name, price
FROM ranked_products
WHERE rank <= 3
ORDER BY category, rank;

-- 去重(保留最新记录)
WITH numbered_records AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY user_id, product_id
            ORDER BY created_at DESC
        ) AS rn
    FROM user_actions
)
SELECT * FROM numbered_records WHERE rn = 1;

7.2 2. 同比环比分析

-- 月度销售同比环比
WITH monthly_sales AS (
    SELECT
        DATE_TRUNC('month', sale_date) AS month,
        SUM(amount) AS total_sales
    FROM sales
    GROUP BY DATE_TRUNC('month', sale_date)
)
SELECT
    month,
    total_sales,
    LAG(total_sales, 1) OVER (ORDER BY month) AS prev_month,
    LAG(total_sales, 12) OVER (ORDER BY month) AS same_month_last_year,
    total_sales - LAG(total_sales, 1) OVER (ORDER BY month) AS mom_diff,
    ROUND(
        100.0 * (total_sales - LAG(total_sales, 1) OVER (ORDER BY month)) /
        NULLIF(LAG(total_sales, 1) OVER (ORDER BY month), 0),
        2
    ) AS mom_growth_pct,
    ROUND(
        100.0 * (total_sales - LAG(total_sales, 12) OVER (ORDER BY month)) /
        NULLIF(LAG(total_sales, 12) OVER (ORDER BY month), 0),
        2
    ) AS yoy_growth_pct
FROM monthly_sales
ORDER BY month;

7.3 3. 用户留存分析

-- 用户首次和最后访问
SELECT
    user_id,
    visit_date,
    FIRST_VALUE(visit_date) OVER (
        PARTITION BY user_id
        ORDER BY visit_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS first_visit,
    LAST_VALUE(visit_date) OVER (
        PARTITION BY user_id
        ORDER BY visit_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_visit,
    visit_date - FIRST_VALUE(visit_date) OVER (
        PARTITION BY user_id
        ORDER BY visit_date
    ) AS days_since_first_visit
FROM user_visits;

-- 用户活跃天数
SELECT
    user_id,
    COUNT(*) AS total_visits,
    MIN(visit_date) AS first_visit,
    MAX(visit_date) AS last_visit,
    COUNT(DISTINCT visit_date) AS active_days
FROM user_visits
GROUP BY user_id;

7.4 4. 分位数和百分位

-- 计算销售额分位数
SELECT
    salesperson,
    amount,
    NTILE(100) OVER (ORDER BY amount) AS percentile,
    PERCENT_RANK() OVER (ORDER BY amount) AS percent_rank,
    CUME_DIST() OVER (ORDER BY amount) AS cumulative_distribution
FROM sales;

-- 找出前20%的销售
WITH percentiles AS (
    SELECT
        *,
        PERCENT_RANK() OVER (ORDER BY amount DESC) AS pct_rank
    FROM sales
)
SELECT * FROM percentiles WHERE pct_rank <= 0.2;

7.5 5. 连续事件检测

-- 查找连续3天都有销售的销售员
WITH daily_sales AS (
    SELECT
        salesperson,
        sale_date,
        SUM(amount) AS daily_amount
    FROM sales
    GROUP BY salesperson, sale_date
),
consecutive_days AS (
    SELECT
        salesperson,
        sale_date,
        sale_date - (ROW_NUMBER() OVER (
            PARTITION BY salesperson
            ORDER BY sale_date
        ))::INTEGER AS group_id
    FROM daily_sales
)
SELECT
    salesperson,
    MIN(sale_date) AS streak_start,
    MAX(sale_date) AS streak_end,
    COUNT(*) AS consecutive_days
FROM consecutive_days
GROUP BY salesperson, group_id
HAVING COUNT(*) >= 3
ORDER BY salesperson, streak_start;

7.6 6. 库存预警

-- 产品库存变化趋势
CREATE TABLE inventory_log (
    id SERIAL PRIMARY KEY,
    product_id INTEGER,
    quantity_change INTEGER,
    current_stock INTEGER,
    log_date TIMESTAMPTZ DEFAULT NOW()
);

-- 分析库存趋势
SELECT
    product_id,
    log_date,
    current_stock,
    AVG(current_stock) OVER (
        PARTITION BY product_id
        ORDER BY log_date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS avg_stock_7days,
    current_stock - AVG(current_stock) OVER (
        PARTITION BY product_id
        ORDER BY log_date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS deviation_from_avg,
    CASE
        WHEN current_stock < 0.5 * AVG(current_stock) OVER (
            PARTITION BY product_id
            ORDER BY log_date
            ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
        ) THEN 'CRITICAL_LOW'
        WHEN current_stock < AVG(current_stock) OVER (
            PARTITION BY product_id
            ORDER BY log_date
            ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
        ) THEN 'LOW'
        ELSE 'NORMAL'
    END AS stock_status
FROM inventory_log
ORDER BY product_id, log_date DESC;

8. 🆚 窗口函数 vs GROUP BY

特性 窗口函数 GROUP BY
行数 保持原行数 减少到分组数
访问单行数据 可以 不可以
多个聚合 可以不同分区 所有列相同分组
排序 ORDER BY in OVER ORDER BY在外层
性能 单次扫描 可能需要排序
-- 窗口函数: 保留所有行
SELECT
    salesperson,
    amount,
    AVG(amount) OVER (PARTITION BY region) AS region_avg
FROM sales;

-- GROUP BY: 聚合后只有分组行
SELECT
    region,
    AVG(amount) AS region_avg
FROM sales
GROUP BY region;

-- 组合使用
SELECT
    region,
    AVG(amount) AS avg_amount,
    COUNT(*) AS sale_count,
    MAX(amount) - MIN(amount) AS range
FROM sales
GROUP BY region;

9. 🎯 最佳实践

9.1 1. 合理使用PARTITION BY

-- 好: 按需分区
SELECT
    product_id,
    sale_date,
    quantity,
    SUM(quantity) OVER (
        PARTITION BY product_id
        ORDER BY sale_date
    ) AS cumulative_sales
FROM sales;

-- 不好: 不必要的全局窗口
SELECT
    *,
    COUNT(*) OVER () AS total_count  -- 如果只需要总数,用子查询更好
FROM sales;

9.2 2. 明确指定Frame

-- 推荐: 明确frame定义
SELECT
    sale_date,
    amount,
    AVG(amount) OVER (
        ORDER BY sale_date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS moving_avg
FROM sales;

-- 避免: 依赖默认frame(可能不是你想要的)
SELECT
    sale_date,
    amount,
    AVG(amount) OVER (ORDER BY sale_date) AS avg_amount
FROM sales;

9.3 3. 使用CTE提高可读性

-- 好: 使用CTE
WITH ranked_sales AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY region
            ORDER BY amount DESC
        ) AS rank
    FROM sales
)
SELECT * FROM ranked_sales WHERE rank <= 5;

-- 不好: 嵌套复杂
SELECT * FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) AS rank
    FROM sales
) AS ranked
WHERE rank <= 5;

10. 📝 练习题

  1. 查询每个产品类别中价格排名前3的产品
  2. 计算每个用户最近3次购买的平均金额
  3. 找出销售额环比增长超过20%的月份
  4. 计算每个产品的7日移动平均销量
  5. 找出连续5天活跃的用户

10.1 参考答案

-- 1. 每个类别价格前3的产品
WITH ranked AS (
    SELECT
        category,
        name,
        price,
        RANK() OVER (PARTITION BY category ORDER BY price DESC) AS rank
    FROM products
)
SELECT * FROM ranked WHERE rank <= 3;

-- 2. 用户最近3次购买平均金额
WITH recent_orders AS (
    SELECT
        user_id,
        total_amount,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) AS rn
    FROM orders
)
SELECT
    user_id,
    AVG(total_amount) AS avg_last_3_orders
FROM recent_orders
WHERE rn <= 3
GROUP BY user_id;

-- 3. 环比增长超过20%的月份
WITH monthly AS (
    SELECT
        DATE_TRUNC('month', sale_date) AS month,
        SUM(amount) AS revenue
    FROM sales
    GROUP BY DATE_TRUNC('month', sale_date)
)
SELECT
    month,
    revenue,
    LAG(revenue) OVER (ORDER BY month) AS prev_month_revenue,
    ROUND(
        100.0 * (revenue - LAG(revenue) OVER (ORDER BY month)) /
        LAG(revenue) OVER (ORDER BY month),
        2
    ) AS growth_pct
FROM monthly
WHERE LAG(revenue) OVER (ORDER BY month) IS NOT NULL
  AND revenue > LAG(revenue) OVER (ORDER BY month) * 1.2;

-- 4. 产品7日移动平均销量
SELECT
    product_id,
    sale_date,
    quantity,
    AVG(quantity) OVER (
        PARTITION BY product_id
        ORDER BY sale_date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS moving_avg_7days
FROM product_sales
ORDER BY product_id, sale_date;

-- 5. 连续5天活跃的用户
WITH daily_active AS (
    SELECT DISTINCT
        user_id,
        DATE(activity_time) AS activity_date
    FROM user_activities
),
streaks AS (
    SELECT
        user_id,
        activity_date,
        activity_date - (ROW_NUMBER() OVER (
            PARTITION BY user_id
            ORDER BY activity_date
        ))::INTEGER AS streak_group
    FROM daily_active
)
SELECT
    user_id,
    MIN(activity_date) AS streak_start,
    MAX(activity_date) AS streak_end,
    COUNT(*) AS consecutive_days
FROM streaks
GROUP BY user_id, streak_group
HAVING COUNT(*) >= 5;

11. 📚 下一步


关键要点:

  • 窗口函数不减少行数,GROUP BY会聚合
  • LAG/LEAD用于同比环比分析
  • ROW_NUMBER用于排名和去重
  • Frame子句控制聚合范围
  • 移动平均需要ROWS BETWEEN