数据清洗

网友投稿 280 2022-11-24

数据清洗

一、需求 去掉日志,小于11个 二、代码 1、Mapper package com.wt.etl; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class ETLMapper extends Mapper { Text k = new Text(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { super.map(key, value, context); String line = value.toString(); boolean result = parseLog(line, context); if (!result){ return; } k.set(line); } private boolean parseLog(String line, Context context) { String[] fields = line.split("\\s"); if (fields.length < 11){ // 系统计数器 context.getCounter("map", "true").increment(1); return true; }else { context.getCounter("map", "false").increment(1); return false; } } } 2、Driver package com.wt.etl; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; public class ETLDriver { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { // 输入输出路径需要根据自己电脑上实际的输入输出路径设置 args = new String[] { "E:\\a\\input2", "E:\\a\\output2" }; // 1 获取job信息 Configuration conf = new Configuration(); Job job = Job.getInstance(conf); // 2 加载jar包 job.setJarByClass(ETLDriver.class); // 3 关联map job.setMapperClass(ETLMapper.class); // 4 设置最终输出类型 job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); // 设置reducetask个数为0 job.setNumReduceTasks(0); // 5 设置输入和输出路径 FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); // 6 提交 boolean wait = job.waitForCompletion(true); System.exit(wait? 0:1); } }

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:一种高速PCI数据采集卡设计方案
下一篇:Map Join案例
相关文章

 发表评论

暂时没有评论,来抢沙发吧~