资源简介
对应于前面的MahoutCanopy.jar文件的源代码,可以参考来看;前面的那个是工具,这个是源码
代码片段和文件信息
package mahout.fansy.canopy;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyConfigKeys;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* 改编原mahout代码,使输入数据可以直接为文本
* 主要改编的是Mapper的key和value的格式
* @author Administrator
*
*/
public class CanopyDriver extends AbstractJob{
/**
* @param args
*/
private static final Logger log = LoggerFactory.getLogger(CanopyDriver.class);
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration() new CanopyDriver() args);
}
@Override
public int run(String[] arg0) throws Exception {
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.distanceMeasureOption().create());
addOption(DefaultOptionCreator.t1Option().create());
addOption(DefaultOptionCreator.t2Option().create());
addOption(DefaultOptionCreator.t3Option().create());
addOption(DefaultOptionCreator.t4Option().create());
addOption(DefaultOptionCreator.clusterFilterOption().create());
addOption(DefaultOptionCreator.overwriteOption().create());
addOption(DefaultOptionCreator.clusteringOption().create());
addOption(DefaultOptionCreator.methodOption().create());
addOption(DefaultOptionCreator.outlierThresholdOption().create());
if (parseArguments(arg0) == null) {
return -1;
}
Path input = getInputPath();
Path output = getOutputPath();
Configuration conf = getConf();
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.delete(conf output);
}
String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
double t3 = t1;
if (hasOption(DefaultOptionCreator.T3_OPTION)) {
t3 = Double.parseDouble(getOption(DefaultOptionCreator.T3_OPTION));
}
double t4 = t2;
if (hasOption(DefaultOptionCreator.T4_OPTION)) {
t4 = Double.parseDouble(getOption(DefaultOptionCreator.T4_OPTION));
}
int clusterFilter = 0;
if (hasOption(Defaul
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 6693 2013-07-23 16:48 CanopyDriver.java
文件 2445 2013-07-23 17:21 CanopyMapper.java
文件 2196 2013-07-23 17:21 CanopyReducer.java
评论
共有 0 条评论