大数据常见代码问题

本文最后更新于 2024年1月3日凌晨

大数据常见代码问题

Hadoop

WordCount

Mapper

// WordCountMapper.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private final static LongWritable one = new LongWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 将每行文本拆分成单词
        String[] words = value.toString().split("\\s+");

        // 遍历每个单词，并发送到Reducer
        for (String w : words) {
            word.set(w);
            context.write(word, one);
        }
    }
}

Reducer

// WordCountReducer.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    private LongWritable result = new LongWritable();

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // 计算每个单词的总次数
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        result.set(sum);

        // 输出单词及其总次数
        context.write(key, result);
    }
}

Driver

// WordCountDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "WordCount");

        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Spark

WordCount

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SparkWordCount {
  def main(args: Array[String]): Unit = {
    // 配置 Spark 应用程序
    val conf = new SparkConf().setAppName("Spark WordCount").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // 读取输入文本文件
    val inputPath = "path/to/input.txt"
    val textFile = sc.textFile(inputPath)

    // 使用 flatMap 将每行文本拆分成单词
    val words = textFile.flatMap(line => line.split("\\s+"))

    // 使用 map 将每个单词映射为 (单词, 1) 键值对
    val wordCounts = words.map(word => (word, 1))

    // 使用 reduceByKey 对相同单词的计数值进行累加
    val result = wordCounts.reduceByKey(_ + _)

    // 输出结果
    result.foreach(println)

    // 停止 Spark 应用程序
    sc.stop()
  }
}

Hive

题目 1

题目：找出所有科目成绩都大于某一学科平均成绩的学生

格式：

score (
    uid STRING,
    subject_id STRING,
    score INT
);

示例：

1001   01 90
1001   02 90
1001   03 90
1002   01 85
1002   02 85
1002   03 70
1003   01 70
1003   02 70
1003   03 85

答案：

平均成绩

select
    uid,
    score,
    avg(score) over(partition by subject_id) avg_score
from
    score;t1

根据是否大于平均成绩记录 flag，大于则记为 0 否则记为 1

select
    uid,
    if(score>avg_score,0,1) flag
from
    t1;t2

根据学生 id 进行分组统计 flag 的和，和为 0 则是所有学科都大于平均成绩

select
    uid
from
    t2
group by
    uid
having
    sum(flag)=0;

最终 SQL

select
    uid
from
    (
        select
            uid,
            if(score > avg_score, 0, 1) flag
        from
            (
                select
                    uid,
                    score,
                    avg(score) over(partition by subject_id) avg_score
                from
                    score
            ) t1
    ) t2
group by
    uid
having
    sum(flag) = 0;

题目 2

题目：统计出每个用户的每月累积访问次数

格式：

create table action
(userId string,
visitDate string,
visitCount int) 
row format delimited fields terminated by "\t";

示例：

u01     2017/1/21       5
u02     2017/1/23       6
u03     2017/1/22       8
u04     2017/1/20       3
u01     2017/1/23       6
u01     2017/2/21       8
u02     2017/1/23       6
u01     2017/2/22       4

答案：

修改数据格式

select
     userId,
     date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
     visitCount
from
     action;t1

计算每人单月访问量

select
    userId,
    mn,
    sum(visitCount) mn_count
from
    t1
group by 
userId,mn;t2

按月累计访问量

select
    userId,
    mn,
    mn_count,
    sum(mn_count) over(partition by userId order by mn)
from t2;

最终 SQL

select
    userId,
    mn,
    mn_count,
    sum(mn_count) over(partition by userId order by mn)
from 
(   select
        userId,
        mn,
        sum(visitCount) mn_count
    from
         (select
             userId,
             date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
             visitCount
         from
             action)t1
group by userId,mn)t2;

题目 3

题目：有 50 W 个京东店铺，每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志，访问日志存储的表名为 Visit，访客的用户 id 为 user_id，被访问的店铺名称为 shop，请统计：

每个店铺的 UV（访客数）
每个店铺访问次数 top 3 的访客信息。输出店铺名称、访客 id、访问次数

格式：

1	`create table visit(user_id string,shop string) row format delimited fields terminated by '\t';`

示例：

u1	a
u2	b
u1	b
u1	a
u3	c
u4	b
u1	a
u2	c
u5	b
u4	b
u6	c
u2	c
u1	b
u2	a
u2	a
u3	a
u5	a
u5	a
u5	a

答案：

每个店铺的 UV（访客数）

1	`select shop,count(distinct user_id) from visit group by shop;`

查询每个店铺被每个用户访问次数

1
2
3

select shop,user_id,count(*) ct
from visit
group by shop,user_id;t1

计算每个店铺被用户访问次数排名

1 2	`select shop,user_id,ct,rank() over(partition by shop order by ct) rk from t1;t2`

取每个店铺排名前 3 的

1
2
3

select shop,user_id,ct
from t2
where rk<=3;

每个店铺访问次数 top 3 的访客信息。输出店铺名称、访客 id、访问次数

select 
shop,
user_id,
ct
from
(select 
shop,
user_id,
ct,
rank() over(partition by shop order by ct) rk
from 
(select 
shop,
user_id,
count(*) ct
from visit
group by 
shop,
user_id)t1
)t2
where rk<=3;

题目 4

题目：已知一个表 STG. ORDER，有如下字段: Date，Order_id，User_id，amount。请给出 sql 进行统计: 数据样例: 2017-01-01,10029028,1000003251,33.57。

给出 2017 年每个月的订单数、用户数、总成交金额。
给出 2017 年 11 月的新客数 (指在 11 月才有第一笔订单)
格式：

1	`create table order_tab(dt string,order_id string,user_id string,amount decimal(10,2)) row format delimited fields terminated by '\t';`

示例：

1	`2017-01-01,10029028,1000003251,33.57`

答案：

给出 2017 年每个月的订单数、用户数、总成交金额

select
   date_format(dt,'yyyy-MM'),
   count(order_id),
   count(distinct user_id),
   sum(amount)
from
   order_tab
where
   date_format(dt,'yyyy')='2017'
group by
   date_format(dt,'yyyy-MM');

给出 2017 年 11 月的新客数 (指在 11 月才有第一笔订单)

select
   count(user_id)
from
   order_tab
group by
   user_id
having
   date_format(min(dt),'yyyy-MM')='2017-11';

题目 5

题目：有日志如下，请写出代码求得所有用户和活跃用户的总数及平均年龄。（活跃用户指连续两天都有访问记录的用户）日期用户年龄

格式：

1	`create table user_age(dt string,user_id string,age int)row format delimited fields terminated by ',';`

示例：

2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19

答案：

select 
    sum(user_total_count),
    sum(user_total_avg_age),
    sum(twice_count),
    sum(twice_count_avg_age)
from 
(select
   0 user_total_count,
   0 user_total_avg_age,
   count(*) twice_count,
   cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age
from 
(
   select
   user_id,
   min(age) age
from
   (select
   user_id,
   min(age) age
from
   (
    select
   user_id,
   age,
   date_sub(dt,rk) flag
from
   (
    select
       dt,
       user_id,
       min(age) age,
       rank() over(partition by user_id order by dt) rk
    from
       user_age
    group by
       dt,user_id
    )t1
    )t2
group by
   user_id,flag
having
   count(*)>=2)t3
group by
   user_id 
)t4

union all

select
   count(*) user_total_count,
   cast((sum(age)/count(*)) as decimal(10,1)),
   0 twice_count,
   0 twice_count_avg_age
from 
   (
      select
         user_id,
         min(age) age 
      from 
         user_age 
      group by 
         user_id
   )t5)t6;

题目 6

题目：请用 sql 写出所有用户中在今年 10 月份第一次购买商品的金额，表 ordertable 字段（购买用户：userid，金额：money，购买时间：paymenttime (格式：2017-10-01)，订单 id：orderid）

格式：

create table ordertable(
    userid string,
    money int,
    paymenttime string,
    orderid string)
row format delimited fields terminated by '\t';

示例：

1	`表ordertable字段（购买用户：userid，金额：money，购买时间：paymenttime(格式：2017-10-01)，订单id：orderid）`

答案：

select
   t1.userid,
   t1.paymenttime,
   od.money
from
   (select
   userid,
   min(paymenttime) paymenttime
from
   ordertable
where
   date_format(paymenttime,'yyyy-MM')='2017-10'
group by
   userid)t1
join
   ordertable od
on
   t1.userid=od.userid
   and
   t1.paymenttime=od.paymenttime;

题目 7

题目：求 11 月 9 号下午 14 点（14-15 点），访问 api/user/login 接口的 top 10 的 ip 地址

格式：

create table ip(
    time string,
    interface string,
    ip string)
row format delimited fields terminated by '\t';

示例：

2016-11-09 14:22:05	/api/user/login	110.23.5.33
2016-11-09 11:23:10	/api/user/detail	57.3.2.16
2016-11-09 14:59:40	/api/user/login	200.6.5.166
2016-11-09 14:22:05	/api/user/login	110.23.5.34
2016-11-09 14:22:05	/api/user/login	110.23.5.34
2016-11-09 14:22:05	/api/user/login	110.23.5.34
2016-11-09 11:23:10	/api/user/detail	57.3.2.16
2016-11-09 23:59:40	/api/user/login	200.6.5.166
2016-11-09 14:22:05	/api/user/login	110.23.5.34
2016-11-09 11:23:10	/api/user/detail	57.3.2.16
2016-11-09 23:59:40	/api/user/login	200.6.5.166
2016-11-09 14:22:05	/api/user/login	110.23.5.35
2016-11-09 14:23:10	/api/user/detail	57.3.2.16
2016-11-09 23:59:40	/api/user/login	200.6.5.166
2016-11-09 14:59:40	/api/user/login	200.6.5.166
2016-11-09 14:59:40	/api/user/login	200.6.5.166

答案：

select
    ip,
    interface,
    count(*) ct
from
    ip
where
    date_format(time,'yyyy-MM-dd HH')>='2016-11-09 14'
    and 
    date_format(time,'yyyy-MM-dd HH')<='2016-11-09 15'
    and
    interface='/api/user/login'
group by
ip,interface
order by
ct desc
limit 10;t1

题目 8

题目：查询各自区组的 money 排名前十的账号（分组取前 10）

格式：

CREATE TABLE `account`
(   `dist_id` int（11）DEFAULT NULL COMMENT '区组id',
    `account` varchar（100）DEFAULT NULL COMMENT '账号',
    `gold` int（11）DEFAULT 0 COMMENT '金币'）;

答案：

1	`SELECT * FROM ( SELECT dist_id, account, gold, dense_rank() OVER (PARTITION BY dist_id ORDER BY gold DESC) as rk FROM account ) t1 WHERE rk <= 10;`

题目 9

题目：

有三张表分别为会员表（member）销售表（sale）退货表（regoods）
1. 会员表有字段 memberid（会员 id，主键）credits（积分）；
2. 销售表有字段 memberid（会员 id，外键）购买金额（MNAccount）；
3. 退货表中有字段 memberid（会员 id，外键）退货金额（RMNAccount）。
业务说明
1. 销售表中的销售记录可以是会员购买，也可以是非会员购买。（即销售表中的 memberid 可以为空）；
2. 销售表中的一个会员可以有多条购买记录；
3. 退货表中的退货记录可以是会员，也可是非会员；
4. 一个会员可以有一条或多条退货记录。

查询需求：分组查出销售表中所有会员购买金额，同时分组查出退货表中所有会员的退货金额，把会员 id 相同的购买金额-退款金额得到的结果更新到表会员表中对应会员的积分字段（credits）

格式：

create table member(memberid string,credits double) row format delimited fields terminated by '\t';

create table sale(memberid string,MNAccount double) row format delimited fields terminated by '\t';

create table regoods(memberid string,RMNAccount double) row format delimited fields terminated by '\t';

示例：

sale
1001    50.3
1002    56.5
1003    235
1001    23.6
1005    56.2
        25.6
        33.5

regoods
1001    20.1
1002    23.6
1001    10.1
        23.5
        10.2
1005    0.8

答案：

insert into table member
select
    t1.memberid,
    MNAccount-RMNAccount
from
    (select
        memberid,
        sum(MNAccount) MNAccount
    from
        sale
    where
        memberid!=''
    group by
        memberid
    )t1
join
    (select
        memberid,
        sum(RMNAccount) RMNAccount
    from
        regoods
    where
        memberid!=''
    group by
        memberid
    )t2
on
    t1.memberid=t2.memberid;

技术 > 大数据

#技术 #面试 #大数据 #Hadoop #Hive

大数据常见代码问题

https://hexo.leelurker.com/posts/49351

作者

LeeLurker

发布于

2024年1月5日

许可协议

数据仓库上一篇

Cloudflare Pages + Hexo 搭建个人博客下一篇