pgsql-15 窗口函数应用
目录
15 - 窗口函数应用
1. 📖 概述
窗口函数(Window Functions)是PostgreSQL的强大特性,允许在结果集的分区上执行计算,而不需要GROUP BY,非常适合排名、移动平均、累计求和等场景。
2. 🎯 窗口函数基础
2.1 基本语法
function_name([expression]) OVER (
[PARTITION BY partition_expression]
[ORDER BY sort_expression [ASC | DESC]]
[frame_clause]
)
2.2 简单示例
-- 创建测试表
CREATE TABLE sales (
id SERIAL PRIMARY KEY,
salesperson VARCHAR(50),
region VARCHAR(50),
amount NUMERIC(10, 2),
sale_date DATE
);
INSERT INTO sales (salesperson, region, amount, sale_date) VALUES
('Alice', 'North', 1000, '2024-01-15'),
('Bob', 'North', 1500, '2024-01-20'),
('Charlie', 'South', 2000, '2024-01-18'),
('Alice', 'North', 1200, '2024-02-10'),
('Bob', 'North', 1800, '2024-02-15'),
('Charlie', 'South', 2200, '2024-02-20'),
('David', 'East', 1600, '2024-01-25'),
('David', 'East', 1900, '2024-02-28');
-- 基本窗口函数: 显示每条记录和总计
SELECT
salesperson,
region,
amount,
SUM(amount) OVER () AS total_sales, -- 所有销售总额
amount / SUM(amount) OVER () AS percentage -- 占比
FROM sales;
3. 🔢 排名函数
3.1 ROW_NUMBER()
为每一行分配唯一的序号。
-- 全局排名
SELECT
salesperson,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) AS rank
FROM sales;
-- 按区域排名
SELECT
salesperson,
region,
amount,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) AS rank_in_region
FROM sales;
3.2 RANK() 和 DENSE_RANK()
SELECT
salesperson,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) AS row_num,
RANK() OVER (ORDER BY amount DESC) AS rank,
DENSE_RANK() OVER (ORDER BY amount DESC) AS dense_rank
FROM sales;
-- 示例输出:
-- amount | row_num | rank | dense_rank
-- 2200 | 1 | 1 | 1
-- 2000 | 2 | 2 | 2
-- 1900 | 3 | 3 | 3
-- 1800 | 4 | 4 | 4
-- 1600 | 5 | 5 | 5
-- 1500 | 6 | 6 | 6
-- 1200 | 7 | 7 | 7
-- 1000 | 8 | 8 | 8
-- 相同金额的情况:
-- amount | row_num | rank | dense_rank
-- 2000 | 1 | 1 | 1
-- 2000 | 2 | 1 | 1 (RANK相同)
-- 1500 | 3 | 3 | 2 (RANK跳过2, DENSE_RANK不跳)
-- 1500 | 4 | 3 | 2
-- 1000 | 5 | 5 | 3
3.3 NTILE()
将结果集分成N个桶。
-- 分成4个组
SELECT
salesperson,
amount,
NTILE(4) OVER (ORDER BY amount) AS quartile
FROM sales;
-- 应用: 客户分层
SELECT
username,
lifetime_value,
CASE NTILE(4) OVER (ORDER BY lifetime_value DESC)
WHEN 1 THEN 'VIP'
WHEN 2 THEN 'High Value'
WHEN 3 THEN 'Medium Value'
ELSE 'Low Value'
END AS customer_tier
FROM (
SELECT
u.username,
COALESCE(SUM(o.total_amount), 0) AS lifetime_value
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.username
) AS customer_values;
4. 📊 聚合窗口函数
4.1 SUM(), AVG(), COUNT()
-- 累计求和
SELECT
sale_date,
salesperson,
amount,
SUM(amount) OVER (ORDER BY sale_date) AS running_total,
AVG(amount) OVER (ORDER BY sale_date) AS running_avg,
COUNT(*) OVER (ORDER BY sale_date) AS running_count
FROM sales
ORDER BY sale_date;
-- 按区域累计
SELECT
sale_date,
region,
salesperson,
amount,
SUM(amount) OVER (
PARTITION BY region
ORDER BY sale_date
) AS regional_running_total
FROM sales
ORDER BY region, sale_date;
4.2 MIN(), MAX()
SELECT
salesperson,
region,
amount,
MIN(amount) OVER (PARTITION BY region) AS region_min,
MAX(amount) OVER (PARTITION BY region) AS region_max,
amount - MIN(amount) OVER (PARTITION BY region) AS diff_from_min,
MAX(amount) OVER (PARTITION BY region) - amount AS diff_from_max
FROM sales;
5. 🔄 值函数
5.1 LAG() 和 LEAD()
访问前一行或后一行的数据。
-- LAG: 前一行
-- LEAD: 后一行
SELECT
sale_date,
salesperson,
amount,
LAG(amount) OVER (ORDER BY sale_date) AS prev_amount,
LEAD(amount) OVER (ORDER BY sale_date) AS next_amount,
amount - LAG(amount) OVER (ORDER BY sale_date) AS diff_from_prev,
LEAD(amount) OVER (ORDER BY sale_date) - amount AS diff_to_next
FROM sales
ORDER BY sale_date;
-- 按销售员分组
SELECT
sale_date,
salesperson,
amount,
LAG(amount, 1) OVER (
PARTITION BY salesperson
ORDER BY sale_date
) AS prev_sale,
amount - LAG(amount, 1, 0) OVER (
PARTITION BY salesperson
ORDER BY sale_date
) AS growth
FROM sales
ORDER BY salesperson, sale_date;
-- 参数说明:
-- LAG(expression, offset, default)
-- offset: 偏移量(默认1)
-- default: 没有前一行时的默认值
5.2 FIRST_VALUE() 和 LAST_VALUE()
-- FIRST_VALUE: 分区第一个值
-- LAST_VALUE: 分区最后一个值
SELECT
sale_date,
region,
amount,
FIRST_VALUE(amount) OVER (
PARTITION BY region
ORDER BY sale_date
) AS first_sale_in_region,
LAST_VALUE(amount) OVER (
PARTITION BY region
ORDER BY sale_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_sale_in_region
FROM sales;
-- 注意: LAST_VALUE需要指定frame,否则只会取到当前行
5.3 NTH_VALUE()
-- 获取第N个值
SELECT
sale_date,
region,
amount,
NTH_VALUE(amount, 2) OVER (
PARTITION BY region
ORDER BY sale_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS second_sale_in_region
FROM sales;
6. 📐 窗口帧(Frame Clause)
6.1 Frame类型
-- ROWS: 物理行
-- RANGE: 逻辑范围
-- Frame边界:
-- UNBOUNDED PRECEDING: 分区开始
-- N PRECEDING: 当前行之前N行
-- CURRENT ROW: 当前行
-- N FOLLOWING: 当前行之后N行
-- UNBOUNDED FOLLOWING: 分区结束
6.2 移动平均
-- 3日移动平均
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS moving_avg_3day
FROM sales
ORDER BY sale_date;
-- 中心移动平均(前后各1天)
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
) AS centered_moving_avg
FROM sales
ORDER BY sale_date;
6.3 累计窗口
-- 累计求和(默认frame)
SELECT
sale_date,
amount,
SUM(amount) OVER (
ORDER BY sale_date
-- 默认: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS cumulative_sum
FROM sales;
-- 完整分区聚合
SELECT
sale_date,
amount,
SUM(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS total_sum
FROM sales;
7. 🎨 实战应用
7.1 1. 排名查询
-- 每个分类中价格最高的3个产品
WITH ranked_products AS (
SELECT
category,
name,
price,
ROW_NUMBER() OVER (
PARTITION BY category
ORDER BY price DESC
) AS rank
FROM products
)
SELECT category, name, price
FROM ranked_products
WHERE rank <= 3
ORDER BY category, rank;
-- 去重(保留最新记录)
WITH numbered_records AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY user_id, product_id
ORDER BY created_at DESC
) AS rn
FROM user_actions
)
SELECT * FROM numbered_records WHERE rn = 1;
7.2 2. 同比环比分析
-- 月度销售同比环比
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', sale_date) AS month,
SUM(amount) AS total_sales
FROM sales
GROUP BY DATE_TRUNC('month', sale_date)
)
SELECT
month,
total_sales,
LAG(total_sales, 1) OVER (ORDER BY month) AS prev_month,
LAG(total_sales, 12) OVER (ORDER BY month) AS same_month_last_year,
total_sales - LAG(total_sales, 1) OVER (ORDER BY month) AS mom_diff,
ROUND(
100.0 * (total_sales - LAG(total_sales, 1) OVER (ORDER BY month)) /
NULLIF(LAG(total_sales, 1) OVER (ORDER BY month), 0),
2
) AS mom_growth_pct,
ROUND(
100.0 * (total_sales - LAG(total_sales, 12) OVER (ORDER BY month)) /
NULLIF(LAG(total_sales, 12) OVER (ORDER BY month), 0),
2
) AS yoy_growth_pct
FROM monthly_sales
ORDER BY month;
7.3 3. 用户留存分析
-- 用户首次和最后访问
SELECT
user_id,
visit_date,
FIRST_VALUE(visit_date) OVER (
PARTITION BY user_id
ORDER BY visit_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS first_visit,
LAST_VALUE(visit_date) OVER (
PARTITION BY user_id
ORDER BY visit_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_visit,
visit_date - FIRST_VALUE(visit_date) OVER (
PARTITION BY user_id
ORDER BY visit_date
) AS days_since_first_visit
FROM user_visits;
-- 用户活跃天数
SELECT
user_id,
COUNT(*) AS total_visits,
MIN(visit_date) AS first_visit,
MAX(visit_date) AS last_visit,
COUNT(DISTINCT visit_date) AS active_days
FROM user_visits
GROUP BY user_id;
7.4 4. 分位数和百分位
-- 计算销售额分位数
SELECT
salesperson,
amount,
NTILE(100) OVER (ORDER BY amount) AS percentile,
PERCENT_RANK() OVER (ORDER BY amount) AS percent_rank,
CUME_DIST() OVER (ORDER BY amount) AS cumulative_distribution
FROM sales;
-- 找出前20%的销售
WITH percentiles AS (
SELECT
*,
PERCENT_RANK() OVER (ORDER BY amount DESC) AS pct_rank
FROM sales
)
SELECT * FROM percentiles WHERE pct_rank <= 0.2;
7.5 5. 连续事件检测
-- 查找连续3天都有销售的销售员
WITH daily_sales AS (
SELECT
salesperson,
sale_date,
SUM(amount) AS daily_amount
FROM sales
GROUP BY salesperson, sale_date
),
consecutive_days AS (
SELECT
salesperson,
sale_date,
sale_date - (ROW_NUMBER() OVER (
PARTITION BY salesperson
ORDER BY sale_date
))::INTEGER AS group_id
FROM daily_sales
)
SELECT
salesperson,
MIN(sale_date) AS streak_start,
MAX(sale_date) AS streak_end,
COUNT(*) AS consecutive_days
FROM consecutive_days
GROUP BY salesperson, group_id
HAVING COUNT(*) >= 3
ORDER BY salesperson, streak_start;
7.6 6. 库存预警
-- 产品库存变化趋势
CREATE TABLE inventory_log (
id SERIAL PRIMARY KEY,
product_id INTEGER,
quantity_change INTEGER,
current_stock INTEGER,
log_date TIMESTAMPTZ DEFAULT NOW()
);
-- 分析库存趋势
SELECT
product_id,
log_date,
current_stock,
AVG(current_stock) OVER (
PARTITION BY product_id
ORDER BY log_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS avg_stock_7days,
current_stock - AVG(current_stock) OVER (
PARTITION BY product_id
ORDER BY log_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS deviation_from_avg,
CASE
WHEN current_stock < 0.5 * AVG(current_stock) OVER (
PARTITION BY product_id
ORDER BY log_date
ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
) THEN 'CRITICAL_LOW'
WHEN current_stock < AVG(current_stock) OVER (
PARTITION BY product_id
ORDER BY log_date
ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
) THEN 'LOW'
ELSE 'NORMAL'
END AS stock_status
FROM inventory_log
ORDER BY product_id, log_date DESC;
8. 🆚 窗口函数 vs GROUP BY
| 特性 | 窗口函数 | GROUP BY |
|---|---|---|
| 行数 | 保持原行数 | 减少到分组数 |
| 访问单行数据 | 可以 | 不可以 |
| 多个聚合 | 可以不同分区 | 所有列相同分组 |
| 排序 | ORDER BY in OVER | ORDER BY在外层 |
| 性能 | 单次扫描 | 可能需要排序 |
-- 窗口函数: 保留所有行
SELECT
salesperson,
amount,
AVG(amount) OVER (PARTITION BY region) AS region_avg
FROM sales;
-- GROUP BY: 聚合后只有分组行
SELECT
region,
AVG(amount) AS region_avg
FROM sales
GROUP BY region;
-- 组合使用
SELECT
region,
AVG(amount) AS avg_amount,
COUNT(*) AS sale_count,
MAX(amount) - MIN(amount) AS range
FROM sales
GROUP BY region;
9. 🎯 最佳实践
9.1 1. 合理使用PARTITION BY
-- 好: 按需分区
SELECT
product_id,
sale_date,
quantity,
SUM(quantity) OVER (
PARTITION BY product_id
ORDER BY sale_date
) AS cumulative_sales
FROM sales;
-- 不好: 不必要的全局窗口
SELECT
*,
COUNT(*) OVER () AS total_count -- 如果只需要总数,用子查询更好
FROM sales;
9.2 2. 明确指定Frame
-- 推荐: 明确frame定义
SELECT
sale_date,
amount,
AVG(amount) OVER (
ORDER BY sale_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS moving_avg
FROM sales;
-- 避免: 依赖默认frame(可能不是你想要的)
SELECT
sale_date,
amount,
AVG(amount) OVER (ORDER BY sale_date) AS avg_amount
FROM sales;
9.3 3. 使用CTE提高可读性
-- 好: 使用CTE
WITH ranked_sales AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY region
ORDER BY amount DESC
) AS rank
FROM sales
)
SELECT * FROM ranked_sales WHERE rank <= 5;
-- 不好: 嵌套复杂
SELECT * FROM (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) AS rank
FROM sales
) AS ranked
WHERE rank <= 5;
10. 📝 练习题
- 查询每个产品类别中价格排名前3的产品
- 计算每个用户最近3次购买的平均金额
- 找出销售额环比增长超过20%的月份
- 计算每个产品的7日移动平均销量
- 找出连续5天活跃的用户
10.1 参考答案
-- 1. 每个类别价格前3的产品
WITH ranked AS (
SELECT
category,
name,
price,
RANK() OVER (PARTITION BY category ORDER BY price DESC) AS rank
FROM products
)
SELECT * FROM ranked WHERE rank <= 3;
-- 2. 用户最近3次购买平均金额
WITH recent_orders AS (
SELECT
user_id,
total_amount,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) AS rn
FROM orders
)
SELECT
user_id,
AVG(total_amount) AS avg_last_3_orders
FROM recent_orders
WHERE rn <= 3
GROUP BY user_id;
-- 3. 环比增长超过20%的月份
WITH monthly AS (
SELECT
DATE_TRUNC('month', sale_date) AS month,
SUM(amount) AS revenue
FROM sales
GROUP BY DATE_TRUNC('month', sale_date)
)
SELECT
month,
revenue,
LAG(revenue) OVER (ORDER BY month) AS prev_month_revenue,
ROUND(
100.0 * (revenue - LAG(revenue) OVER (ORDER BY month)) /
LAG(revenue) OVER (ORDER BY month),
2
) AS growth_pct
FROM monthly
WHERE LAG(revenue) OVER (ORDER BY month) IS NOT NULL
AND revenue > LAG(revenue) OVER (ORDER BY month) * 1.2;
-- 4. 产品7日移动平均销量
SELECT
product_id,
sale_date,
quantity,
AVG(quantity) OVER (
PARTITION BY product_id
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7days
FROM product_sales
ORDER BY product_id, sale_date;
-- 5. 连续5天活跃的用户
WITH daily_active AS (
SELECT DISTINCT
user_id,
DATE(activity_time) AS activity_date
FROM user_activities
),
streaks AS (
SELECT
user_id,
activity_date,
activity_date - (ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY activity_date
))::INTEGER AS streak_group
FROM daily_active
)
SELECT
user_id,
MIN(activity_date) AS streak_start,
MAX(activity_date) AS streak_end,
COUNT(*) AS consecutive_days
FROM streaks
GROUP BY user_id, streak_group
HAVING COUNT(*) >= 5;
11. 📚 下一步
关键要点:
- 窗口函数不减少行数,GROUP BY会聚合
- LAG/LEAD用于同比环比分析
- ROW_NUMBER用于排名和去重
- Frame子句控制聚合范围
- 移动平均需要ROWS BETWEEN
xingliuhua