You can run the program as
bin/hadoop jar WordCount.jar com.xyz.WordCount input output-wordcount
where
(1) com.xyz is the package of the WordCount class, so com.xyz.WordCount is its fully qualified name
(2) WordCount.jar is the name of the jar file containing the compiled classes
(3) input is the HDFS directory that contains the input files
(4) output-wordcount is the HDFS directory where the results will be written
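If you have not yet built WordCount.jar, a typical build sequence looks like the following. This is only a sketch: it assumes the three source files sit in the current directory and that your Hadoop release supports the classpath command; on older releases, point -classpath at the hadoop-core-*.jar shipped with your installation instead. Also note that if the classes really live in the com.xyz package, each source file needs a matching "package com.xyz;" declaration.
mkdir classes
javac -classpath $(bin/hadoop classpath) -d classes WordMapper.java SumReducer.java WordCount.java
jar -cvf WordCount.jar -C classes .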
You can check the result as follows. First list the output directory:
bin/hadoop dfs -ls output-wordcount
Then check the content of the output file (part-r-00000):
bin/hadoop dfs -cat output-wordcount/part-r-00000 | less
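Each line of part-r-00000 holds one word, a tab, and its total count, sorted by word. For example (illustrative values only):
be      2
not     1
or      1
to      2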
WordMapper.java
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<Object, Text, Text, IntWritable> {

    private Text word = new Text();
    private final static IntWritable one = new IntWritable(1);

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Break the line into words for processing
        StringTokenizer wordList = new StringTokenizer(value.toString());
        while (wordList.hasMoreTokens()) {
            word.set(wordList.nextToken());
            // Emit a (word, 1) pair for every token
            context.write(word, one);
        }
    }
}
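For each input line, the mapper splits the text on whitespace with StringTokenizer and emits a (word, 1) pair per token; the line "to be or not to be", for instance, yields (to,1), (be,1), (or,1), (not,1), (to,1), (be,1). The Text and IntWritable instances are declared as fields and reused across calls to map(), which avoids allocating new objects for every token.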
SumReducer.java
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable totalWordCount = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the partial counts emitted for this word
        int wordCount = 0;
        Iterator<IntWritable> it = values.iterator();
        while (it.hasNext()) {
            wordCount += it.next().get();
        }
        totalWordCount.set(wordCount);
        context.write(key, totalWordCount);
    }
}
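Since the reduce function is a plain sum, which is associative and commutative, SumReducer can also serve as a combiner, pre-aggregating counts on the map side to cut shuffle traffic. This is optional and not part of the listing above; to try it, add one line to the driver:
job.setCombinerClass(SumReducer.class);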
WordCount.java (Driver)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("usage: [input] [output]");
            System.exit(-1);
        }

        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(WordCount.class);

        // Types of the final output key/value pairs
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(WordMapper.class);
        job.setReducerClass(SumReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and wait for it to finish before exiting
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
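Note that FileOutputFormat will fail the job if the output directory already exists, so remove it before re-running:
bin/hadoop dfs -rmr output-wordcount
(on newer Hadoop releases the equivalent is bin/hadoop fs -rm -r output-wordcount).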