MapReduce之WordCount单词计数（下）

xiaoxiao2021-02-28 23

一代码 Wordcount.java import java.io.IOException; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; public class WordCount { public static class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> { private final IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); StringTokenizer token = new StringTokenizer(line); while (token.hasMoreTokens()) { word.set(token.nextToken()); context.write(word, one); } } } public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } context.write(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = new Job(conf); job.setJarByClass(WordCount.class); job.setJobName("wordcount"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(WordCountMap.class); job.setReducerClass(WordCountReduce.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); } } 二构建运行 1、编译 [root@localhost word_count]# ll total 4 drwxr-xr-x. 2 root root 101 Aug 20 14:27 word_count_class -rwxr-xr-x. 1 root root 2132 Aug 20 14:22 WordCount.java [root@localhost word_count]# javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d word_count_class/ WordCount.java [root@localhost word_count]# cd word_count_class/ [root@localhost word_count_class]# ls WordCount.class WordCount$WordCountMap.class WordCount$WordCountReduce.class 2、打包 [root@localhost word_count_class]# jar -cvf wordcount.jar *.class added manifest adding: WordCount.class(in = 1539) (out= 772)(deflated 49%) adding: WordCount$WordCountMap.class(in = 1829) (out= 767)(deflated 58%) adding: WordCount$WordCountReduce.class(in = 1645) (out= 687)(deflated 58%) [root@localhost word_count_class]# ls WordCount.class wordcount.jar WordCount$WordCountMap.class WordCount$WordCountReduce.class 3、准备输入文件file1和输入文件file2 [root@localhost input]# ls file1 file2 file1的内容： hello world hello hadoop hadoop file system hadoop java api hello java file2的内容： new file hadoop file hadoop new world hadoop free home hadoop free school 4、将输入文件提交HDFS [root@localhost word_count]# hadoop fs -mkdir input_wordcount Warning: $HADOOP_HOME is deprecated. [root@localhost word_count]# hadoop fs -put input/* input_wordcount/ Warning: $HADOOP_HOME is deprecated. [root@localhost word_count]# hadoop fs -ls Warning: $HADOOP_HOME is deprecated. Found 2 items drwxr-xr-x - root supergroup 0 2017-08-20 12:44 /user/root/input drwxr-xr-x - root supergroup 0 2017-08-20 14:41 /user/root/input_wordcount [root@localhost word_count]# hadoop fs -ls input_wordcount Warning: $HADOOP_HOME is deprecated. Found 2 items -rw-r--r-- 3 root supergroup 71 2017-08-20 14:41 /user/root/input_wordcount/file1 -rw-r--r-- 3 root supergroup 74 2017-08-20 14:41 /user/root/input_wordcount/file2 [root@localhost word_count]# hadoop fs -cat input_wordcount/file1 Warning: $HADOOP_HOME is deprecated. hello world hello hadoop hadoop file system hadoop java api hello java 5、任务提交 [root@localhost word_count]# hadoop jar word_count_class/wordcount.jar WordCount input_wordcount output_wordcount Warning: $HADOOP_HOME is deprecated. 17/08/20 14:50:30 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same. 17/08/20 14:50:31 INFO input.FileInputFormat: Total input paths to process : 2 17/08/20 14:50:31 INFO util.NativeCodeLoader: Loaded the native-hadoop library 17/08/20 14:50:31 WARN snappy.LoadSnappy: Snappy native library not loaded 17/08/20 14:50:33 INFO mapred.JobClient: Running job: job_201708201140_0001 17/08/20 14:50:34 INFO mapred.JobClient: map 0% reduce 0% 17/08/20 14:51:20 INFO mapred.JobClient: map 100% reduce 0% 17/08/20 14:51:45 INFO mapred.JobClient: map 100% reduce 100% 17/08/20 14:51:51 INFO mapred.JobClient: Job complete: job_201708201140_0001 17/08/20 14:51:52 INFO mapred.JobClient: Counters: 29 17/08/20 14:51:52 INFO mapred.JobClient: Job Counters 17/08/20 14:51:52 INFO mapred.JobClient: Launched reduce tasks=1 17/08/20 14:51:52 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=81389 17/08/20 14:51:52 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 17/08/20 14:51:52 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 17/08/20 14:51:52 INFO mapred.JobClient: Launched map tasks=2 17/08/20 14:51:52 INFO mapred.JobClient: Data-local map tasks=2 17/08/20 14:51:52 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=24253 17/08/20 14:51:52 INFO mapred.JobClient: File Output Format Counters 17/08/20 14:51:52 INFO mapred.JobClient: Bytes Written=83 17/08/20 14:51:52 INFO mapred.JobClient: FileSystemCounters 17/08/20 14:51:52 INFO mapred.JobClient: FILE_BYTES_READ=301 17/08/20 14:51:52 INFO mapred.JobClient: HDFS_BYTES_READ=381 17/08/20 14:51:52 INFO mapred.JobClient: FILE_BYTES_WRITTEN=156847 17/08/20 14:51:52 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=83 17/08/20 14:51:52 INFO mapred.JobClient: File Input Format Counters 17/08/20 14:51:52 INFO mapred.JobClient: Bytes Read=145 17/08/20 14:51:52 INFO mapred.JobClient: Map-Reduce Framework 17/08/20 14:51:52 INFO mapred.JobClient: Map output materialized bytes=307 17/08/20 14:51:52 INFO mapred.JobClient: Map input records=10 17/08/20 14:51:52 INFO mapred.JobClient: Reduce shuffle bytes=307 17/08/20 14:51:52 INFO mapred.JobClient: Spilled Records=50 17/08/20 14:51:52 INFO mapred.JobClient: Map output bytes=245 17/08/20 14:51:52 INFO mapred.JobClient: Total committed heap usage (bytes)=246751232 17/08/20 14:51:52 INFO mapred.JobClient: CPU time spent (ms)=5290 17/08/20 14:51:52 INFO mapred.JobClient: Combine input records=0 17/08/20 14:51:52 INFO mapred.JobClient: SPLIT_RAW_BYTES=236 17/08/20 14:51:52 INFO mapred.JobClient: Reduce input records=25 17/08/20 14:51:52 INFO mapred.JobClient: Reduce input groups=11 17/08/20 14:51:52 INFO mapred.JobClient: Combine output records=0 17/08/20 14:51:52 INFO mapred.JobClient: Physical memory (bytes) snapshot=382996480 17/08/20 14:51:52 INFO mapred.JobClient: Reduce output records=11 17/08/20 14:51:52 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2590666752 17/08/20 14:51:52 INFO mapred.JobClient: Map output records=25 6、查看结果 [root@localhost word_count]# hadoop fs -ls output_wordcount Warning: $HADOOP_HOME is deprecated. Found 3 items -rw-r--r-- 3 root supergroup 0 2017-08-20 14:51 /user/root/output_wordcount/_SUCCESS drwxr-xr-x - root supergroup 0 2017-08-20 14:50 /user/root/output_wordcount/_logs -rw-r--r-- 3 root supergroup 83 2017-08-20 14:51 /user/root/output_wordcount/part-r-00000 [root@localhost word_count]# hadoop fs -cat output_wordcount/part-r-00000 Warning: $HADOOP_HOME is deprecated. api 1 file 3 free 2 hadoop 7 hello 3 home 1 java 2 new 2 school 1 system 1 world 2

转载请注明原文地址: https://www.6miu.com/read-2150136.html

技术

最新回复(0)