1. Prepare the input word file in advance
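For example, a small input file and its upload to HDFS might look like the following sketch (the file contents are only illustrative; the path /wc/input/test.txt matches the run command used later):
echo -e "hello hadoop\nhello mapreduce\nhello world" > test.txt
hdfs dfs -mkdir -p /wc/input
hdfs dfs -put test.txt /wc/input/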

2. WordCount requirements analysis
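The requirement: read a text file of space-separated words and, for each distinct word, output the word together with the number of times it appears. Assuming the illustrative input above, the expected result would look like:
hadoop	1
hello	3
mapreduce	1
world	1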

3. Create a new project and import the pom dependencies (pom.xml)
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>3.2.1</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.13</version>
    </dependency>
</dependencies>
4. Create the log configuration file (log4j.properties)
# Console appender configuration
log4j.appender.Console=org.apache.log4j.ConsoleAppender
log4j.appender.Console.layout=org.apache.log4j.PatternLayout
log4j.appender.Console.layout.ConversionPattern=%d [%t] %p [%c] - %m%n
# Root logger level and output destination
log4j.rootLogger=debug,Console
5. Create the Mapper class (WeMapper.java)
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused output objects: the current word and a constant count of 1
    Text out_key = new Text();
    IntWritable vul = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each input line on spaces and emit (word, 1) for every word
        String lines = value.toString();
        String[] words = lines.split(" ");
        for (String word : words) {
            out_key.set(word);
            context.write(out_key, vul);
        }
    }
}
6. Create the Reducer class (WeReduce.java)
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WeReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable vul = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the counts for each word and emit (word, total)
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        vul.set(count);
        context.write(key, vul);
    }
}
7. Create the Driver class (WeDriver.java)
(The driver configures and submits the job; input is read from and output written to HDFS.)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WeDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node-1:8020");

        Job job = Job.getInstance(conf);
        job.setJarByClass(WeDriver.class);
        job.setMapperClass(WeMapper.class);
        job.setReducerClass(WeReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        Path input = new Path(args[0]);
        Path output = new Path(args[1]);
        // Delete the output directory if it already exists, otherwise the job fails.
        // Use the same conf so the path resolves against HDFS rather than the local file system.
        FileSystem fileSystem = output.getFileSystem(conf);
        if (fileSystem.exists(output)) {
            fileSystem.delete(output, true);
        }
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);

        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : -1);
    }
}
8. Run the code (and package it as a jar)

Rename the jar to word.jar and place it in the virtual machine's temp directory
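A minimal sketch of this step, assuming a Maven build (the name of the jar produced under target/ depends on your artifactId and version, and root@node-1:/tmp/ is only a hypothetical copy target; substitute your VM's actual temp directory):
mvn clean package
cp target/*.jar word.jar
scp word.jar root@node-1:/tmp/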

Run the jar on Hadoop
Copy the driver's reference (its fully qualified class name, com.xdd.mapreduce.WeDriver), which is passed to hadoop jar as the main class

Run the jar from the temp directory on Linux (specifying the input and output paths)
hadoop jar word.jar com.xdd.mapreduce.WeDriver /wc/input/test.txt /wc/output/wordhadoop
If the output shows "successful", the job has completed successfully

Check the job status on Hadoop (the word.jar job has run successfully)
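One way to confirm this from the command line, assuming the cluster runs on YARN, is to list finished applications:
yarn application -list -appStates FINISHED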

Check the output on HDFS (the corresponding output files have been generated)
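For example, using the output path from the run command above:
hdfs dfs -ls /wc/output/wordhadoop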

View the result file (the word count succeeded)
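With the default output format and a single reducer, the results land in a part-r-00000 file, which can be viewed with:
hdfs dfs -cat /wc/output/wordhadoop/part-r-00000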
