SQL案例分析:日活、连续登录三天用户统计和蚂蚁森林
# 当天新用户
# Count today's new users: guids in hm2.daily_helper that never appear in
# hm2.history_helper, with the result redirected to resultPath.
# NOTE(review): the opening `cmd = '` was missing from the snippet; it is
# restored here so that `cmd` exists for the execHive call below.
# An anti-join replaces NOT IN, which returns no rows at all as soon as the
# history subquery yields a NULL guid. NULL guids in the daily table stay
# excluded, as they were under NOT IN.
cmd = 'hive -e \'select count(1) from hm2.daily_helper d \
left join (select guid from hm2.history_helper group by guid) h \
on d.guid = h.guid \
where h.guid is null and d.guid is not null;\' > %s' % (resultPath)
(status, result) = execHive(cmd)
肥皂水# 次⽇活跃留存
hive -e \'select count(1) from\你莫走歌原唱
(select guid from hm2.helper where dt = "%s" group by guid) yes\
inner join\
(select guid from hm2.helper where dt = "%s" group by guid) today\
where yes.guid = today.guid;\'
1、用SQL统计连续登录三天及以上的用户(案例分析)
这个问题可以扩展到很多相似的问题:连续⼏个⽉充值会员、连续天数有商品卖出、连续打滴滴、连续逾期。
测试数据:用户ID、登录日期
uid,dt
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-04
guid01,2018-03-05
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
如果同一用户同一天有多条带时间的登录记录(如下),先按用户ID和日期 group by 去重,每天只保留一条:
guid02,2018-03-03 00:01:01
guid02,2018-03-03 12:01:01
⽬标表格:
+---------+--------+-------------+-------------+--+
| uid | times | start_date | end_date |
+---------+--------+-------------+-------------+--+
| guid01 | 4 | 2018-03-04 | 2018-03-07 |
| guid02 | 3 | 2018-03-01 | 2018-03-03 |
+---------+--------+-------------+-------------+--+
思路:
写SQL:1. 按用户分组、按日期排序并打行号;2. 用日期减去行号,差值相同的记录即为连续登录的日期。
整体的答案:
-- Streaks of three or more consecutive login days, one output row per streak.
-- Columns follow the target table: uid, times, start_date, end_date.
-- Method: within a user, consecutive days minus consecutive row numbers all
-- give the same anchor date, so grouping by that anchor isolates each streak.
SELECT
    uid,
    count(1)       AS times,
    min(login_day) AS start_date,
    max(login_day) AS end_date
FROM (
    SELECT
        uid,
        login_day,
        date_sub(login_day, rn) AS dis
    FROM (
        SELECT
            uid,
            login_day,
            row_number() OVER (PARTITION BY uid ORDER BY login_day) AS rn
        FROM (
            -- Keep one row per user per calendar day; several logins on the
            -- same day would otherwise shift the row numbers and split streaks.
            SELECT uid, to_date(dt) AS login_day
            FROM continuous
            GROUP BY uid, to_date(dt)
        ) d
    ) t1
) t2
GROUP BY uid, dis
HAVING count(1) >= 3;
sql问题:连续活跃n天⽤户的获取;数据倾斜的sql如何优化;数据量⼤的sql如何优化?
使用 row_number 排序,然后用日期减去行号,按相减的结果分组;结果相同的记录就是连续的。
-- Method 1: row_number() over (...).
-- Users who logged in on at least three consecutive days.
-- For consecutive days the row number rises by one while the distance to
-- CURRENT_DATE falls by one, so their sum is constant within a streak.
-- NOTE(review): the three-argument datediff(day, start, end) and date() are
-- kept from the original; they are not Hive syntax -- confirm the target engine.
SELECT DISTINCT user_name
FROM (
    SELECT
        user_name,
        login_day,
        row_number() OVER (PARTITION BY user_name ORDER BY login_day)
            + datediff(day, login_day, CURRENT_DATE) AS diff
    FROM (
        -- One row per user per day, so repeated logins on the same day
        -- do not break the row-number sequence.
        SELECT DISTINCT user_name, date(logindate) AS login_day
        FROM user_login_table
    ) d
) t
GROUP BY user_name, diff
HAVING count(1) >= 3;
-- One row per streak of three or more consecutive login days:
-- the user, the streak length, and its first and last day.
-- The original selected the bare login_date, which is neither grouped nor
-- aggregated; it is replaced by the streak's min/max dates.
-- NOTE(review): assumes user_login_table holds at most one row per user per
-- day -- confirm, or de-duplicate first.
SELECT
    user_name,
    count(1)        AS login_num,
    min(login_date) AS start_date,
    max(login_date) AS end_date
FROM (
    SELECT
        user_name,
        login_date,
        row_number() OVER (PARTITION BY user_name ORDER BY login_date) AS rn
    FROM user_login_table
) t
GROUP BY user_name, date_sub(login_date, rn)
HAVING count(1) >= 3;
2、连续三周活跃⽤户:在当前⽇期之前三周的周活表中,此⽤户都存在
-- Users active in each of the three full weeks before the week that contains
-- the statistics date '2022-02-23'.
-- next_day(d, 'MO') is the first Monday after d, so subtracting 7*k days
-- steps back whole weeks from that Monday.
-- The original ANDed three BETWEEN ranges on the same column; a row cannot lie
-- in three different weeks, so it matched nothing. Here the filter is one
-- half-open three-week window and the query counts the distinct weeks seen.
-- NOTE(review): assumes login_date is a 'yyyy-MM-dd' date or string -- confirm.
SELECT
    '2022-02-23',
    concat(date_sub(next_day('2022-02-23', 'MO'), 7*4), '-', date_sub(next_day('2022-02-23', 'MO'), 7*1)),
    mid_id,
    t1.active_weeks AS `count`
FROM (
    SELECT
        mid_id,
        -- every day of a Monday-to-Sunday week maps to the same following Monday
        count(DISTINCT next_day(login_date, 'MO')) AS active_weeks
    FROM DWS_DV_DETAIL_WK
    WHERE login_date >= date_sub(next_day('2022-02-23', 'MO'), 7*4)
      AND login_date <  date_sub(next_day('2022-02-23', 'MO'), 7*1)
    GROUP BY mid_id
    HAVING count(DISTINCT next_day(login_date, 'MO')) >= 3
) t1;
1、蚂蚁森林植物申领统计
问题:假设2017年1⽉1⽇开始记录低碳数据(user_low_carbon),假设2017年10⽉1⽇之前满⾜申领条件的⽤户都申领了⼀颗p004-胡杨,剩余的能量全部⽤来领取“p002-沙柳” 。
统计在10⽉1⽇累计申领“p002-沙柳” 排名前10的⽤户信息;以及他⽐后⼀名多领了⼏颗沙柳(加分选项,可不做)。
得到的统计结果如下表样式:
user_id plant_count less_count(⽐后⼀名多领了⼏颗沙柳)
u_101 1000 100
u_088 900 400
u_103 500 …
-- Draft ranking: total low_carbon per user, the user's rank, and the lead over
-- the next-ranked user.
-- The window ORDER BY repeats sum(low_carbon) because a select-list alias
-- (plant_count) cannot be referenced inside a window of the same SELECT.
-- lead(..., 1, 0) returns 0 for the last user, so that row's less_count equals
-- its own total.
-- NOTE(review): the source read from plant_carbon, which the other queries use
-- as the plant table (plant_id, low_carbon); user_low_carbon is assumed to be
-- the intended table -- confirm. plant_count here is raw accumulated carbon,
-- not yet a number of plants.
SELECT
    user_id,
    sum(low_carbon) AS plant_count,
    rank() OVER (ORDER BY sum(low_carbon) DESC) AS rk,
    sum(low_carbon) - lead(sum(low_carbon), 1, 0) OVER (ORDER BY sum(low_carbon) DESC) AS less_count
FROM user_low_carbon
GROUP BY user_id;
-----------
-- Top 10 users by number of p002 plants claimed, and how many more each
-- claimed than the next-ranked user.
-- Each user's carbon total for the period is reduced by the cost of one p004
-- plant; the remainder is divided by the cost of one p002 plant.
-- ORDER BY is required before LIMIT: without it the ten rows are arbitrary.
-- The window runs before LIMIT, so the 10th row is still compared with the 11th.
SELECT
    user_id,
    plant_count,
    plant_count - lead(plant_count, 1) OVER (ORDER BY plant_count DESC) AS less_count
FROM (
    SELECT
        user_id,
        floor((sum_low_carbon - hy_p004) / sl_p002) AS plant_count
    FROM (
        -- carbon accumulated per user between 2017-1-1 and 2017-10-1, inclusive
        SELECT user_id, sum(low_carbon) AS sum_low_carbon
        FROM user_low_carbon
        WHERE datediff(regexp_replace(data_dt, '/', '-'), '2017-1-1') >= 0
          AND datediff(regexp_replace(data_dt, '/', '-'), '2017-10-1') <= 0
        GROUP BY user_id
    ) t1
    -- single-row lookups, cross-joined onto every user row
    -- carbon needed for one p004 plant
    CROSS JOIN (SELECT low_carbon AS hy_p004 FROM plant_carbon WHERE plant_id = 'p004') t2
    -- carbon needed for one p002 plant
    CROSS JOIN (SELECT low_carbon AS sl_p002 FROM plant_carbon WHERE plant_id = 'p002') t3
) t4
ORDER BY plant_count DESC
LIMIT 10;
2、蚂蚁森林低碳⽤户排名分析
问题:查询user_low_carbon表中每⽇流⽔记录,条件为:⽤户在2017年,连续三天(或以上)的天数⾥,每天减少碳排放
(low_carbon)都超过100g的⽤户低碳流⽔。
需要查询返回满足以上条件的user_low_carbon表中的记录流水。例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天,每天的碳排放量之和都超过100g:
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx15 u_002 2017/1/5 101
参考:https://www.cnblogs.com/yxym2016/p/13254601.html
-- Users who, in 2017, had three or more consecutive days whose daily
-- low_carbon total exceeded 100.
-- DISTINCT is needed because a user with several qualifying streaks would
-- otherwise be returned once per streak.
SELECT DISTINCT user_id
FROM (
    SELECT
        user_id,
        data_dt,
        -- consecutive days minus consecutive ranks share one anchor date
        date_sub(data_dt, rk) AS data_sub_rk
    FROM (
        SELECT
            user_id,
            data_dt,
            -- dates are already unique per user here, so rank() has no ties
            rank() OVER (PARTITION BY user_id ORDER BY data_dt) AS rk
        FROM (
            -- qualifying days: one row per user and day, normalised to yyyy-MM-dd
            -- NOTE(review): relies on date_format accepting unpadded strings
            -- such as '2017-1-2' -- confirm on the target Hive version.
            SELECT
                user_id,
                date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') AS data_dt
            FROM user_low_carbon
            WHERE substring(data_dt, 1, 4) = '2017'
            GROUP BY user_id, data_dt
            HAVING sum(low_carbon) > 100
        ) t1
    ) t2
) t3
GROUP BY user_id, data_sub_rk
HAVING count(*) >= 3;
------
-- Raw user_low_carbon records for days inside a streak of three or more
-- consecutive days (in 2017) whose daily low_carbon total exceeded 100.
-- Fixes against the source:
--   * the year filter is '2017', as the task states (the source had '2020');
--   * row numbers are ordered by the parsed date, because the raw
--     'yyyy/M/d' string sorts '2017/1/10' before '2017/1/2';
--   * the join condition is in ON instead of a WHERE after a bare JOIN.
-- NOTE(review): relies on to_date accepting unpadded strings such as
-- '2017-1-2' -- confirm on the target Hive version.
SELECT
    t5.user_id,
    t5.data_dt,
    t5.low_carbon
FROM user_low_carbon t5
INNER JOIN (
    SELECT user_id, data_dt
    FROM (
        SELECT
            user_id,
            data_dt,
            -- streak length, attached to every day of the streak
            count(*) OVER (PARTITION BY user_id, date_diff) AS date_diff_count
        FROM (
            SELECT
                user_id,
                data_dt,
                -- consecutive days minus consecutive row numbers share one anchor
                date_sub(day_dt, rn) AS date_diff
            FROM (
                SELECT
                    user_id,
                    data_dt,
                    day_dt,
                    row_number() OVER (PARTITION BY user_id ORDER BY day_dt) AS rn
                FROM (
                    -- qualifying days: one row per user and day
                    SELECT
                        user_id,
                        data_dt,
                        to_date(regexp_replace(data_dt, '/', '-')) AS day_dt
                    FROM user_low_carbon
                    WHERE substring(data_dt, 1, 4) = '2017'
                    GROUP BY user_id, data_dt
                    HAVING sum(low_carbon) > 100
                ) t0
            ) t1
        ) t2
    ) t3
    WHERE date_diff_count >= 3
) t4
    ON t4.user_id = t5.user_id
   AND t4.data_dt = t5.data_dt
ORDER BY t5.user_id, to_date(regexp_replace(t5.data_dt, '/', '-'));
-
--------
题2:考试成绩排名
学校的期末考试结束后,考试成绩被存储在⼀张表中TableA中,包含信息:sn(学号), sourse(课程),score(成绩)。班主任想做如下统计:
1、统计每个学⽣的总分,排名,当前排名与下⼀名次总分差(如:第n名总分-第n+1名总分,即为分差);
2、统计英语(sourse=ENG)挂科(score<60),但是总分排名进⼊前20%的学⽣名单以及他们的总分成绩;
3、统计至少两门科目满分(score=100)的同学中,总分排名第二的学生的学号
现在有⼀张 sc 表 记录了学号 学科号 和 每科的成绩 求
1 每个⼈的总分 以及排名
2 在上⼀题的基础上,输出每个⼈⽐他后来的⼈⾼多少分
-- Total score and overall rank for each student.
-- rank() leaves gaps after ties: two students tied at 1 are followed by 3.
SELECT
    SId,
    sum(score) AS sum_score,
    rank() OVER (ORDER BY sum(score) DESC) AS rk
FROM train.sc
GROUP BY SId;
-- Total score, rank, and margin over the next-ranked student.
-- The lead() window is ordered by sum(score), the same ordering as the rank.
-- The source ordered it by the raw column score, which is not available after
-- GROUP BY and is not the ranking order.
-- lead(..., 1, 0) returns 0 for the last student, so that row's score_diff
-- equals its own total.
SELECT
    SId,
    sum(score) AS sum_score,
    rank() OVER (ORDER BY sum(score) DESC) AS rk,
    sum(score) - lead(sum(score), 1, 0) OVER (ORDER BY sum(score) DESC) AS score_diff
FROM train.sc
GROUP BY SId;
-- Students who failed English (sourse = 'ENG', score < 60) but whose total
-- score ranks in the top 20% of all students, with their total score.
-- Original author's note: a column alias is in scope from GROUP BY onward.
-- NOTE(review): that is dialect-dependent -- confirm for the target engine.
-- The cut-off is 20% of the student count; the source hard-coded rank <= 35.
SELECT
    t1.SId,
    t2.sum_score
FROM (
    SELECT SId
    FROM train.sc
    WHERE sourse = 'ENG'
      AND score < 60
) t1
INNER JOIN (
    SELECT
        SId,
        sum(score) AS sum_score,
        rank() OVER (ORDER BY sum(score) DESC) AS seqnum,
        -- evaluated after GROUP BY, so this is the number of students
        count(*) OVER () AS student_cnt
    FROM train.sc
    GROUP BY SId
) t2
    ON t1.SId = t2.SId
WHERE t2.seqnum <= 0.2 * t2.student_cnt
;
题3:计算90分位。
有10000个⽤户,每个⽤户有user_id,和不同的交易量trd_cnt。
求:按照交易量从大到小排序,选出人数最少的、且交易量合计能占总体交易量90%的用户id(这些用户交易量占整体交易量的90%)。
提示:percentile 用于整数列;percentile_approx 用于小数(浮点)列。
-- Running sum, average, count, max and min of 成绩, accumulated in 学号 order.
-- All five aggregates share one named window. No frame is given, so the
-- default frame applies and rows tied on 学号 receive the same values.
SELECT
    *,
    sum(成绩)   OVER w AS current_sum,
    avg(成绩)   OVER w AS current_avg,
    count(成绩) OVER w AS current_count,
    max(成绩)   OVER w AS current_max,
    min(成绩)   OVER w AS current_min
FROM 班级表
WINDOW w AS (ORDER BY 学号);
-- Smallest and largest Premium, plus the approximate 10th, 50th, 95th and
-- 100th percentiles returned together as a single array column.
SELECT
    MIN(Premium),
    PERCENTILE_APPROX(Premium, ARRAY(0.1, 0.5, 0.95, 1)) AS percentile,
    MAX(Premium)
FROM CH_ZZT.POLICY_AMOUNT_H;
-- Approximate 10th, 50th and 95th percentiles of Premium, one per output row.
-- The percentile array is computed once in the subquery and then unpacked
-- with LATERAL VIEW explode.
SELECT pct.percentile
FROM (
    SELECT PERCENTILE_APPROX(Premium, ARRAY(0.1, 0.5, 0.95)) AS pct_values
    FROM CH_ZZT.POLICY_AMOUNT_H
) agg
LATERAL VIEW EXPLODE(agg.pct_values) pct AS percentile;
4、社保、⼯作经历
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论