HIVE函数使用案例之----窗口函数
案例1:连续登陆用户


方案一:表中的数据自连接,构建笛卡尔积



这种方案弊端非常明显,如果查询连续3天,4天,一个月…没办法实现
方案二:使用窗口函数来实现





案例2:级联累加求和









案例3:分组TopN





完整sql
--------------------------------hive 窗口函数应用实例-------------------------------------------------------
--1、连续登陆用户
--建表
create table tb_login(
userid string,
logintime string
) row format delimited fields terminated by '\t';
load data local inpath '/root/hivedata/login.log' into table tb_login;
select *
from tb_login;
--自连接过滤实现
--a.构建笛卡尔积
select
a.userid as a_userid,
a.logintime as a_logintime,
b.userid as b_userid,
b.logintime as b_logintime
from tb_login a,tb_login b;
--上述查询结果保存为临时表
create table tb_login_tmp as
select
a.userid as a_userid,
a.logintime as a_logintime,
b.userid as b_userid,
b.logintime as b_logintime
from tb_login a,tb_login b;
--过滤数据:用户id相同并且登陆日期相差1
select
a_userid,a_logintime,b_userid,b_logintime
from tb_login_tmp
where a_userid = b_userid
and cast(substr(a_logintime,9,2) as int) - 1 = cast(substr(b_logintime,9,2) as int);
--统计连续两天登陆用户
select
distinct a_userid
from tb_login_tmp
where a_userid = b_userid
and cast(substr(a_logintime,9,2) as int) - 1 = cast(substr(b_logintime,9,2) as int);
----窗口函数实现
--连续登陆2天
select
userid,
logintime,
--本次登陆日期的第二天
date_add(logintime,1) as nextday,
--按照用户id分区,按照登陆日期排序,取下一次登陆时间,取不到就为0
lead(logintime,1,0) over (partition by userid order by logintime) as nextlogin
from tb_login;
--实现
with t1 as (
select
userid,
logintime,
--本次登陆日期的第二天
date_add(logintime,1) as nextday,
--按照用户id分区,按照登陆日期排序,取下一次登陆时间,取不到就为0
lead(logintime,1,0) over (partition by userid order by logintime) as nextlogin
from tb_login )
select distinct userid from t1 where nextday = nextlogin;
--连续3天登陆
select
userid,
logintime,
--本次登陆日期的第三天
date_add(logintime,2) as nextday,
--按照用户id分区,按照登陆日期排序,取下下一次登陆时间,取不到就为0
lead(logintime,2,0) over (partition by userid order by logintime) as nextlogin
from tb_login;
--实现
with t1 as (
select
userid,
logintime,
--本次登陆日期的第三天
date_add(logintime,2) as nextday,
--按照用户id分区,按照登陆日期排序,取下下一次登陆时间,取不到就为0
lead(logintime,2,0) over (partition by userid order by logintime) as nextlogin
from tb_login )
select distinct userid from t1 where nextday = nextlogin;
--连续N天
select
userid,
logintime,
--本次登陆日期的第N天
date_add(logintime,N-1) as nextday,
--按照用户id分区,按照登陆日期排序,取下下一次登陆时间,取不到就为0
lead(logintime,N-1,0) over (partition by userid order by logintime) as nextlogin
from tb_login;
--2、级联累加求和
--建表加载数据
create table tb_money(
userid string,
mth string,
money int
) row format delimited fields terminated by '\t';
load data local inpath '/root/hivedata/money.tsv' into table tb_money;
select * from tb_money;
-- 统计得到每个用户每个月的消费总金额
create table tb_money_mtn as
select
userid,
mth,
sum(money) as m_money
from tb_money
group by userid,mth;
select * from tb_money_mtn;
--方案一:自连接分组聚合
-- 基于每个用户每个月的消费总金额进行自连接
select
a.*,b.*
from tb_money_mtn a join tb_money_mtn b on a.userid = b.userid;
-- 将每个月之前月份的数据过滤出来
select
a.*,b.*
from tb_money_mtn a join tb_money_mtn b on a.userid = b.userid
where b.mth <= a.mth;
-- 同一个用户 同一个月的数据分到同一组 再根据用户、月份排序
select
a.userid,
a.mth,
max(a.m_money) as current_mth_money, --当月花费
sum(b.m_money) as accumulate_money --累积花费
from tb_money_mtn a join tb_money_mtn b on a.userid = b.userid
where b.mth <= a.mth
group by a.userid,a.mth
order by a.userid,a.mth;
--方案二:窗口函数实现
-- 统计每个用户每个月消费金额及累计总金额
select
userid,
mth,
m_money,
sum(m_money) over (partition by userid order by mth rows between 1 preceding and 2 following) as t_money
from tb_money_mtn;
--3、分组TopN问题
--建表加载数据
create table tb_emp(
empno string,
ename string,
job string,
managerid string,
hiredate string,
salary double,
bonus double,
deptno string
) row format delimited fields terminated by '\t';
load data local inpath '/root/hivedata/emp.txt' into table tb_emp;
select * from tb_emp;
-- 基于row_number实现,按照部门分区,每个部门内部按照薪水降序排序
select
empno,
ename,
salary,
deptno,
row_number() over (partition by deptno order by salary desc) as rn
from tb_emp;
-- 过滤每个部门的薪资最高的前两名
with t1 as (
select
empno,
ename,
salary,
deptno,
row_number() over (partition by deptno order by salary desc) as rn
from tb_emp )
select * from t1 where rn < 3;
1862

被折叠的 条评论
为什么被折叠?



