本文最后更新于 2024年1月3日 凌晨
大数据常见代码问题 Hadoop WordCount Mapper
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 import java.io.IOException;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;public class WordCountMapper extends Mapper <LongWritable, Text, Text, LongWritable> { private final static LongWritable one = new LongWritable (1 ); private Text word = new Text (); @Override public void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] words = value.toString().split("\\s+" ); for (String w : words) { word.set(w); context.write(word, one); } } }
Reducer
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 import java.io.IOException;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Reducer;public class WordCountReducer extends Reducer <Text, LongWritable, Text, LongWritable> { private LongWritable result = new LongWritable (); @Override public void reduce (Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { long sum = 0 ; for (LongWritable value : values) { sum += value.get(); } result.set(sum); context.write(key, result); } }
Driver
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 import org.apache.hadoop.fs.Path;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class WordCountDriver { public static void main (String[] args) throws Exception { Configuration conf = new Configuration (); Job job = Job.getInstance(conf, "WordCount" ); job.setJarByClass(WordCountDriver.class); job.setMapperClass(WordCountMapper.class); job.setCombinerClass(WordCountReducer.class); job.setReducerClass(WordCountReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); FileInputFormat.addInputPath(job, new Path (args[0 ])); FileOutputFormat.setOutputPath(job, new Path (args[1 ])); System.exit(job.waitForCompletion(true ) ? 0 : 1 ); } }
Spark WordCount 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 import org.apache.spark.SparkConf import org.apache.spark.SparkContext object SparkWordCount { def main (args: Array [String ]): Unit = { val conf = new SparkConf ().setAppName("Spark WordCount" ).setMaster("local[*]" ) val sc = new SparkContext (conf) val inputPath = "path/to/input.txt" val textFile = sc.textFile(inputPath) val words = textFile.flatMap(line => line.split("\\s+" )) val wordCounts = words.map(word => (word, 1 )) val result = wordCounts.reduceByKey(_ + _) result.foreach(println) sc.stop() } }
Hive 题目 1 题目 :找出所有科目成绩都大于某一学科平均成绩的学生
格式 :
1 2 3 4 5 score ( uid STRING, subject_id STRING, score INT );
示例 :
1 2 3 4 5 6 7 8 9 1001 01 90 1001 02 90 1001 03 90 1002 01 85 1002 02 85 1002 03 70 1003 01 70 1003 02 70 1003 03 85
答案 :
平均成绩
1 2 3 4 5 6 select uid, score, avg (score) over (partition by subject_id) avg_scorefrom score;t1
根据是否大于平均成绩记录 flag,大于则记为 0 否则记为 1
1 2 3 4 5 select uid, if(score> avg_score,0 ,1 ) flagfrom t1;t2
根据学生 id 进行分组统计 flag 的和,和为 0 则是所有学科都大于平均成绩
1 2 3 4 5 6 7 8 select uidfrom t2group by uidhaving sum (flag)= 0 ;
最终 SQL
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 select uidfrom ( select uid, if(score > avg_score, 0 , 1 ) flag from ( select uid, score, avg (score) over (partition by subject_id) avg_score from score ) t1 ) t2group by uidhaving sum (flag) = 0 ;
题目 2 题目 : 统计出每个用户的每月累积访问次数
格式 :
1 2 3 4 5 create table action (userId string, visitDate string, visitCount int ) row format delimited fields terminated by "\t";
示例 :
1 2 3 4 5 6 7 8 u01 2017 /1 /21 5 u02 2017 /1 /23 6 u03 2017 /1 /22 8 u04 2017 /1 /20 3 u01 2017 /1 /23 6 u01 2017 /2 /21 8 u02 2017 /1 /23 6 u01 2017 /2 /22 4
答案 :
修改数据格式
1 2 3 4 5 6 select userId, date_format(regexp_replace(visitDate,'/' ,'-' ),'yyyy-MM' ) mn, visitCountfrom action;t1
计算每人单月访问量
1 2 3 4 5 6 7 8 select userId, mn, sum (visitCount) mn_countfrom t1group by userId,mn;t2
按月累计访问量
1 2 3 4 5 6 select userId, mn, mn_count, sum (mn_count) over (partition by userId order by mn)from t2;
最终 SQL
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 select userId, mn, mn_count, sum (mn_count) over (partition by userId order by mn)from ( select userId, mn, sum (visitCount) mn_count from (select userId, date_format(regexp_replace(visitDate,'/' ,'-' ),'yyyy-MM' ) mn, visitCount from action)t1group by userId,mn)t2;
题目 3 题目 :有 50 W 个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为 Visit,访客的用户 id 为 user_id,被访问的店铺名称为 shop,请统计:
每个店铺的 UV(访客数)
每个店铺访问次数 top 3 的访客信息。输出店铺名称、访客 id、访问次数
格式 :
1 create table visit(user_id string ,shop string ) row format delimited fields terminated by '\t' ;
示例 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 u1 a u2 b u1 b u1 a u3 c u4 b u1 a u2 c u5 b u4 b u6 c u2 c u1 b u2 a u2 a u3 a u5 a u5 a u5 a
答案 :
每个店铺的 UV(访客数)
1 select shop,count (distinct user_id) from visit group by shop;
查询每个店铺被每个用户访问次数
1 2 3 select shop,user_id,count (* ) ctfrom visitgroup by shop,user_id;t1
计算每个店铺被用户访问次数排名
1 2 select shop,user_id,ct,rank () over (partition by shop order by ct) rkfrom t1;t2
取每个店铺排名前 3 的
1 2 3 select shop,user_id,ctfrom t2where rk<=3 ;
每个店铺访问次数 top 3 的访客信息。输出店铺名称、访客 id、访问次数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 select shop, user_id, ctfrom (select shop, user_id, ct,rank () over (partition by shop order by ct) rkfrom (select shop, user_id,count (* ) ctfrom visitgroup by shop, user_id)t1 )t2where rk<= 3 ;
题目 4 题目 :已知一个表 STG. ORDER,有如下字段: Date,Order_id,User_id,amount。请给出 sql 进行统计: 数据样例: 2017-01-01,10029028,1000003251,33.57。
给出 2017 年每个月的订单数、用户数、总成交金额。
给出 2017 年 11 月的新客数 (指在 11 月才有第一笔订单)格式 :
1 create table order_tab(dt string ,order_id string ,user_id string ,amount decimal(10 ,2 )) row format delimited fields terminated by '\t' ;
示例 :
1 2017 -01 -01 ,10029028 ,1000003251 ,33 .57
答案 :
给出 2017 年每个月的订单数、用户数、总成交金额
1 2 3 4 5 6 7 8 9 10 11 select date_format(dt,'yyyy-MM' ), count (order_id), count (distinct user_id), sum (amount)from order_tabwhere date_format(dt,'yyyy' )= '2017' group by date_format(dt,'yyyy-MM' );
给出 2017 年 11 月的新客数 (指在 11 月才有第一笔订单)
1 2 3 4 5 6 7 8 select count (user_id)from order_tabgroup by user_idhaving date_format(min (dt),'yyyy-MM' )= '2017-11' ;
题目 5 题目 :有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)日期用户年龄
格式 :
1 create table user_age(dt string,user_id string,age int )row format delimited fields terminated by ',' ;
示例 :
1 2 3 4 5 6 7 8 9 10 2019 -02 -11 ,test_1,23 2019 -02 -11 ,test_2,19 2019 -02 -11 ,test_3,39 2019 -02 -11 ,test_1,23 2019 -02 -11 ,test_3,39 2019 -02 -11 ,test_1,23 2019 -02 -12 ,test_2,19 2019 -02 -13 ,test_1,23 2019 -02 -15 ,test_2,19 2019 -02 -16 ,test_2,19
答案 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 select sum (user_total_count), sum (user_total_avg_age), sum (twice_count), sum (twice_count_avg_age)from (select 0 user_total_count, 0 user_total_avg_age, count (* ) twice_count, cast (sum (age)/ count (* ) as decimal (10 ,2 )) twice_count_avg_agefrom ( select user_id, min (age) agefrom (select user_id, min (age) agefrom ( select user_id, age, date_sub(dt,rk) flagfrom ( select dt, user_id, min (age) age, rank () over (partition by user_id order by dt) rk from user_age group by dt,user_id )t1 )t2group by user_id,flaghaving count (* )>= 2 )t3group by user_id )t4union all select count (* ) user_total_count, cast ((sum (age)/ count (* )) as decimal (10 ,1 )), 0 twice_count, 0 twice_count_avg_agefrom ( select user_id, min (age) age from user_age group by user_id )t5)t6;
题目 6 题目 :请用 sql 写出所有用户中在今年 10 月份第一次购买商品的金额,表 ordertable 字段(购买用户:userid,金额:money,购买时间:paymenttime (格式:2017-10-01),订单 id:orderid)
格式 :
1 2 3 4 5 6 create table ordertable( userid string, money int , paymenttime string, orderid string)row format delimited fields terminated by '\t' ;
示例 :
1 表ordertable字段(购买用户:userid,金额:money,购买时间:paymenttime (格式:2017 -10 -01 ),订单id:orderid)
答案 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 select t1.userid, t1.paymenttime, od.moneyfrom (select userid, min (paymenttime) paymenttimefrom ordertablewhere date_format(paymenttime,'yyyy-MM' )= '2017-10' group by userid)t1join ordertable odon t1.userid= od.userid and t1.paymenttime= od.paymenttime;
题目 7 题目 :求 11 月 9 号下午 14 点(14-15 点),访问 api/user/login 接口的 top 10 的 ip 地址
格式 :
1 2 3 4 5 create table ip( time string , interface string , ip string ) row format delimited fields terminated by '\t' ;
示例 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 2016 -11 -09 14 :22 :05 /api/user/login 110.23.5.33 2016 -11 -09 11 :23 :10 /api/user/detail 57.3.2.16 2016 -11 -09 14 :59 :40 /api/user/login 200.6.5.166 2016 -11 -09 14 :22 :05 /api/user/login 110.23.5.34 2016 -11 -09 14 :22 :05 /api/user/login 110.23.5.34 2016 -11 -09 14 :22 :05 /api/user/login 110.23.5.34 2016 -11 -09 11 :23 :10 /api/user/detail 57.3.2.16 2016 -11 -09 23 :59 :40 /api/user/login 200.6.5.166 2016 -11 -09 14 :22 :05 /api/user/login 110.23.5.34 2016 -11 -09 11 :23 :10 /api/user/detail 57.3.2.16 2016 -11 -09 23 :59 :40 /api/user/login 200.6.5.166 2016 -11 -09 14 :22 :05 /api/user/login 110.23.5.35 2016 -11 -09 14 :23 :10 /api/user/detail 57.3.2.16 2016 -11 -09 23 :59 :40 /api/user/login 200.6.5.166 2016 -11 -09 14 :59 :40 /api/user/login 200.6.5.166 2016 -11 -09 14 :59 :40 /api/user/login 200.6.5.166
答案 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 select ip, interface, count (* ) ctfrom ipwhere date_format(time ,'yyyy-MM-dd HH' )>= '2016-11-09 14' and date_format(time ,'yyyy-MM-dd HH' )<= '2016-11-09 15' and interface= '/api/user/login' group by ip,interfaceorder by ct desc limit 10 ;t1
题目 8 题目 :查询各自区组的 money 排名前十的账号(分组取前 10)
格式 :
1 2 3 4 CREATE TABLE `account` ( `dist_id` int (11 )DEFAULT NULL COMMENT '区组id' , `account` varchar (100 )DEFAULT NULL COMMENT '账号' , `gold` int (11 )DEFAULT 0 COMMENT '金币' );
答案 :
1 SELECT * FROM ( SELECT dist_id, account, gold, dense_rank () OVER (PARTITION BY dist_id ORDER BY gold DESC ) as rk FROM account ) t1 WHERE rk <= 10 ;
题目 9 题目 :
有三张表分别为会员表(member)销售表(sale)退货表(regoods)
会员表有字段 memberid(会员 id,主键)credits(积分);
销售表有字段 memberid(会员 id,外键)购买金额(MNAccount);
退货表中有字段 memberid(会员 id,外键)退货金额(RMNAccount)。
业务说明
销售表中的销售记录可以是会员购买,也可以是非会员购买。(即销售表中的 memberid 可以为空);
销售表中的一个会员可以有多条购买记录;
退货表中的退货记录可以是会员,也可是非会员;
一个会员可以有一条或多条退货记录。
查询需求:分组查出销售表中所有会员购买金额,同时分组查出退货表中所有会员的退货金额,把会员 id 相同的购买金额-退款金额得到的结果更新到表会员表中对应会员的积分字段(credits)
格式 :
1 2 3 4 5 create table member(memberid string,credits double ) row format delimited fields terminated by '\t' ;create table sale(memberid string,MNAccount double ) row format delimited fields terminated by '\t' ;create table regoods(memberid string,RMNAccount double ) row format delimited fields terminated by '\t' ;
示例 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 sale 1001 50 .3 1002 56 .5 1003 235 1001 23 .6 1005 56 .2 25 .6 33 .5 regoods 1001 20 .1 1002 23 .6 1001 10 .1 23 .5 10 .2 1005 0 .8
答案 :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 insert into table member select t1.memberid, MNAccount- RMNAccountfrom (select memberid, sum (MNAccount) MNAccount from sale where memberid!= '' group by memberid )t1join (select memberid, sum (RMNAccount) RMNAccount from regoods where memberid!= '' group by memberid )t2on t1.memberid= t2.memberid;