资源简介
在Hadoop集群中,用MapReduce分布式计算TFIDF
代码片段和文件信息
package eb.cloud.mapreduce.MR.guoruonan;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
public class Tfidf {
public static class Mapper0 extends Mapper {
String filename;
public void map(LongWritable key Text value Context context)
throws IOException InterruptedException {
FileSplit split = (FileSplit) context.getInputSplit();
filename = split.getPath().getName();
String newString = value.toString().toLowerCase();
String results[] = newString.split(“[^a-zA-Z]“);
int flag = 0;
for (String val : results) {
if (val.equals(““))
continue;
context.write(new Text(filename) new Text(val));
}
}
}
public static class Reducer0 extends Reducer {
public void reduce(Text key Iterable values Context context)
throws IOException InterruptedException {
ArrayList array = new ArrayList();
for (Text t : values) {
array.add(t.toString());
}
for (String str : array) {
context.write(new Text(key.toString() + “ “+str) new Text(““
+ array.size()));
//
}
}
}
public static class Mapper1 extends Mapper {
public void map(LongWritable key Text value Context context)
throws IOException InterruptedException {
String line = value.toString();
int index = line.indexOf(“ “);
context.write(new Text(line.substring(0 index))
new Text(line.substring(index + 1)));
}
}
public static class Reducer1 extends Reducer {
public void reduce(Text key Iterable values Context context)
throws IOException InterruptedException {
ArrayList array = new ArrayList();
int ciNum = 1;
for (Text val : values) {
array.add(val.toString());
ciNum =
- 上一篇:java汉诺塔动画实现
- 下一篇:jsr173_1.0_api.jar
相关资源
- hadoop实战源代码Java
- TF-IDF计算程序
- hadoop-2.6.0-hadoop.dll-winutils.exe
- 高职组云计算与大数据题库
- Hadoop-2.8.5全面资料
- IT18掌大数据课程包含配套资料
- wordcount.jar
- 完整的java聚类算法,包括界面
- Hadoop-client-2.7.4.jar
- hadoop2.7.3 hadoop.dll
- hadoop2.7.1对应的hadoop.dllwinutils.exe等。
- Spark大数据中文分词统计Java工程源码
- hadoop-auth-2.2.0.jar
- hadoop-common-2.2.0-bin-master
- hive函数大全中文版
- hadoop.dll以及winutils.exe
- hadoop-lzo-0.4.13.jar
- 山东大学大数据实验三:Hadoop实现P
- hadoop-lzo-0.4.20-SNAPSHOT.jar 包
- hadoop-lzo-0.4.20-SNAPSHOT.jar
- 基于mapreduce的pagerank实现DEMO地址
- TFIDF算法mapreduce实现
- hadoop-lzo-0.4.20.jar
- Hive 入门级编程全案例详解
- hadoop-2.7.2-common.jar
- hadoop各版本--hadoop.dll以及winutils.exe,
- 2016年大数据全套视频徐培成
- hadoop-common-2.7.4-bin 包含hadoop.dll、win
- hadoop-core-1.2.1.jar
- 大数据hadoop winutils.exe
评论
共有 条评论