大数据常见代码问题

本文最后更新于 2024年1月3日 凌晨

大数据常见代码问题

Hadoop

WordCount

Mapper

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// WordCountMapper.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
private final static LongWritable one = new LongWritable(1);
private Text word = new Text();

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// 将每行文本拆分成单词
String[] words = value.toString().split("\\s+");

// 遍历每个单词,并发送到Reducer
for (String w : words) {
word.set(w);
context.write(word, one);
}
}
}

Reducer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// WordCountReducer.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
private LongWritable result = new LongWritable();

@Override
public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
// 计算每个单词的总次数
long sum = 0;
for (LongWritable value : values) {
sum += value.get();
}
result.set(sum);

// 输出单词及其总次数
context.write(key, result);
}
}

Driver

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
// WordCountDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "WordCount");

job.setJarByClass(WordCountDriver.class);
job.setMapperClass(WordCountMapper.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);

job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);

FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Spark

WordCount

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SparkWordCount {
def main(args: Array[String]): Unit = {
// 配置 Spark 应用程序
val conf = new SparkConf().setAppName("Spark WordCount").setMaster("local[*]")
val sc = new SparkContext(conf)

// 读取输入文本文件
val inputPath = "path/to/input.txt"
val textFile = sc.textFile(inputPath)

// 使用 flatMap 将每行文本拆分成单词
val words = textFile.flatMap(line => line.split("\\s+"))

// 使用 map 将每个单词映射为 (单词, 1) 键值对
val wordCounts = words.map(word => (word, 1))

// 使用 reduceByKey 对相同单词的计数值进行累加
val result = wordCounts.reduceByKey(_ + _)

// 输出结果
result.foreach(println)

// 停止 Spark 应用程序
sc.stop()
}
}

Hive

题目 1

题目:找出所有科目成绩都大于某一学科平均成绩的学生

格式

1
2
3
4
5
score (
uid STRING,
subject_id STRING,
score INT
);

示例

1
2
3
4
5
6
7
8
9
1001   01 90
1001   02 90
1001   03 90
1002   01 85
1002   02 85
1002   03 70
1003   01 70
1003   02 70
1003   03 85

答案

  1. 平均成绩
1
2
3
4
5
6
select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from
score;t1
  1. 根据是否大于平均成绩记录 flag,大于则记为 0 否则记为 1
1
2
3
4
5
select
uid,
if(score>avg_score,0,1) flag
from
t1;t2
  1. 根据学生 id 进行分组统计 flag 的和,和为 0 则是所有学科都大于平均成绩
1
2
3
4
5
6
7
8
select
uid
from
t2
group by
uid
having
sum(flag)=0;
  1. 最终 SQL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
select
uid
from
(
select
uid,
if(score > avg_score, 0, 1) flag
from
(
select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from
score
) t1
) t2
group by
uid
having
sum(flag) = 0;

题目 2

题目: 统计出每个用户的每月累积访问次数

格式

1
2
3
4
5
create table action
(userId string,
visitDate string,
visitCount int)
row format delimited fields terminated by "\t";

示例

1
2
3
4
5
6
7
8
u01     2017/1/21       5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
u02 2017/1/23 6
u01 2017/2/22 4

答案

  1. 修改数据格式
1
2
3
4
5
6
select
userId,
date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
visitCount
from
action;t1
  1. 计算每人单月访问量
1
2
3
4
5
6
7
8
select
userId,
mn,
sum(visitCount) mn_count
from
t1
group by
userId,mn;t2
  1. 按月累计访问量
1
2
3
4
5
6
select
userId,
mn,
mn_count,
sum(mn_count) over(partition by userId order by mn)
from t2;
  1. 最终 SQL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
select
userId,
mn,
mn_count,
sum(mn_count) over(partition by userId order by mn)
from
( select
userId,
mn,
sum(visitCount) mn_count
from
(select
userId,
date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
visitCount
from
action)t1
group by userId,mn)t2;

题目 3

题目:有 50 W 个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为 Visit,访客的用户 id 为 user_id,被访问的店铺名称为 shop,请统计:

  1. 每个店铺的 UV(访客数)
  2. 每个店铺访问次数 top 3 的访客信息。输出店铺名称、访客 id、访问次数

格式

1
create table visit(user_id string,shop string) row format delimited fields terminated by '\t';

示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
u1	a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 b
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a

答案

  1. 每个店铺的 UV(访客数)
1
select shop,count(distinct user_id) from visit group by shop;
  1. 查询每个店铺被每个用户访问次数
1
2
3
select shop,user_id,count(*) ct
from visit
group by shop,user_id;t1
  1. 计算每个店铺被用户访问次数排名
1
2
select shop,user_id,ct,rank() over(partition by shop order by ct) rk
from t1;t2
  1. 取每个店铺排名前 3 的
1
2
3
select shop,user_id,ct
from t2
where rk<=3;
  1. 每个店铺访问次数 top 3 的访客信息。输出店铺名称、访客 id、访问次数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
select 
shop,
user_id,
ct
from
(select
shop,
user_id,
ct,
rank() over(partition by shop order by ct) rk
from
(select
shop,
user_id,
count(*) ct
from visit
group by
shop,
user_id)t1
)t2
where rk<=3;

题目 4

题目:已知一个表 STG. ORDER,有如下字段: Date,Order_id,User_id,amount。请给出 sql 进行统计: 数据样例: 2017-01-01,10029028,1000003251,33.57。

  1. 给出 2017 年每个月的订单数、用户数、总成交金额。
  2. 给出 2017 年 11 月的新客数 (指在 11 月才有第一笔订单)
    格式
1
create table order_tab(dt string,order_id string,user_id string,amount decimal(10,2)) row format delimited fields terminated by '\t';

示例

1
2017-01-01,10029028,1000003251,33.57

答案

  1. 给出 2017 年每个月的订单数、用户数、总成交金额
1
2
3
4
5
6
7
8
9
10
11
select
date_format(dt,'yyyy-MM'),
count(order_id),
count(distinct user_id),
sum(amount)
from
order_tab
where
date_format(dt,'yyyy')='2017'
group by
date_format(dt,'yyyy-MM');
  1. 给出 2017 年 11 月的新客数 (指在 11 月才有第一笔订单)
1
2
3
4
5
6
7
8
select
count(user_id)
from
order_tab
group by
user_id
having
date_format(min(dt),'yyyy-MM')='2017-11';

题目 5

题目:有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)日期用户年龄

格式

1
create table user_age(dt string,user_id string,age int)row format delimited fields terminated by ',';

示例

1
2
3
4
5
6
7
8
9
10
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19

答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
select 
sum(user_total_count),
sum(user_total_avg_age),
sum(twice_count),
sum(twice_count_avg_age)
from
(select
0 user_total_count,
0 user_total_avg_age,
count(*) twice_count,
cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt,rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id
)t1
)t2
group by
user_id,flag
having
count(*)>=2)t3
group by
user_id
)t4

union all

select
count(*) user_total_count,
cast((sum(age)/count(*)) as decimal(10,1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
)t5)t6;

题目 6

题目:请用 sql 写出所有用户中在今年 10 月份第一次购买商品的金额,表 ordertable 字段(购买用户:userid,金额:money,购买时间:paymenttime (格式:2017-10-01),订单 id:orderid)

格式

1
2
3
4
5
6
create table ordertable(
userid string,
money int,
paymenttime string,
orderid string)
row format delimited fields terminated by '\t';

示例

1
表ordertable字段(购买用户:userid,金额:money,购买时间:paymenttime(格式:2017-10-01),订单id:orderid)

答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
select
t1.userid,
t1.paymenttime,
od.money
from
(select
userid,
min(paymenttime) paymenttime
from
ordertable
where
date_format(paymenttime,'yyyy-MM')='2017-10'
group by
userid)t1
join
ordertable od
on
t1.userid=od.userid
and
t1.paymenttime=od.paymenttime;

题目 7

题目:求 11 月 9 号下午 14 点(14-15 点),访问 api/user/login 接口的 top 10 的 ip 地址

格式

1
2
3
4
5
create table ip(
time string,
interface string,
ip string)
row format delimited fields terminated by '\t';

示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
2016-11-09 14:22:05	/api/user/login	110.23.5.33
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 14:59:40 /api/user/login 200.6.5.166
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
2016-11-09 14:22:05 /api/user/login 110.23.5.35
2016-11-09 14:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
2016-11-09 14:59:40 /api/user/login 200.6.5.166
2016-11-09 14:59:40 /api/user/login 200.6.5.166

答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
select
ip,
interface,
count(*) ct
from
ip
where
date_format(time,'yyyy-MM-dd HH')>='2016-11-09 14'
and
date_format(time,'yyyy-MM-dd HH')<='2016-11-09 15'
and
interface='/api/user/login'
group by
ip,interface
order by
ct desc
limit 10;t1

题目 8

题目:查询各自区组的 money 排名前十的账号(分组取前 10)

格式

1
2
3
4
CREATE TABLE `account`
( `dist_id` int11DEFAULT NULL COMMENT '区组id',
`account` varchar100DEFAULT NULL COMMENT '账号',
`gold` int11DEFAULT 0 COMMENT '金币');

答案

1
SELECT * FROM ( SELECT dist_id, account, gold, dense_rank() OVER (PARTITION BY dist_id ORDER BY gold DESC) as rk FROM account ) t1 WHERE rk <= 10;

题目 9

题目

  1. 有三张表分别为会员表(member)销售表(sale)退货表(regoods)
    1. 会员表有字段 memberid(会员 id,主键)credits(积分);
    2. 销售表有字段 memberid(会员 id,外键)购买金额(MNAccount);
    3. 退货表中有字段 memberid(会员 id,外键)退货金额(RMNAccount)。
  2. 业务说明
    1. 销售表中的销售记录可以是会员购买,也可以是非会员购买。(即销售表中的 memberid 可以为空);
    2. 销售表中的一个会员可以有多条购买记录;
    3. 退货表中的退货记录可以是会员,也可是非会员;
    4. 一个会员可以有一条或多条退货记录。

查询需求:分组查出销售表中所有会员购买金额,同时分组查出退货表中所有会员的退货金额,把会员 id 相同的购买金额-退款金额得到的结果更新到表会员表中对应会员的积分字段(credits)

格式

1
2
3
4
5
create table member(memberid string,credits double) row format delimited fields terminated by '\t';

create table sale(memberid string,MNAccount double) row format delimited fields terminated by '\t';

create table regoods(memberid string,RMNAccount double) row format delimited fields terminated by '\t';

示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
sale
1001 50.3
1002 56.5
1003 235
1001 23.6
1005 56.2
25.6
33.5

regoods
1001 20.1
1002 23.6
1001 10.1
23.5
10.2
1005 0.8

答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
insert into table member
select
t1.memberid,
MNAccount-RMNAccount
from
(select
memberid,
sum(MNAccount) MNAccount
from
sale
where
memberid!=''
group by
memberid
)t1
join
(select
memberid,
sum(RMNAccount) RMNAccount
from
regoods
where
memberid!=''
group by
memberid
)t2
on
t1.memberid=t2.memberid;


大数据常见代码问题
https://hexo.leelurker.com/posts/49351
作者
LeeLurker
发布于
2024年1月5日
许可协议