List of usage examples for org.apache.mahout.common.HadoopUtil.cacheFiles
public static void cacheFiles(Path fileToCache, Configuration conf)
From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java
License:Apache License
private boolean runMapReduce(Map<String, List<String>> parsedArgs) throws IOException, InterruptedException, ClassNotFoundException { Path model = new Path(getOption("model")); HadoopUtil.cacheFiles(model, getConf()); //the output key is the expected value, the output value are the scores for all the labels Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, BayesTestMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class); //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels")); //boolean complementary = parsedArgs.containsKey("testComplementary"); //always result to false as key in hash map is "--testComplementary" boolean complementary = hasOption("testComplementary"); //or complementary = parsedArgs.containsKey("--testComplementary"); testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary)); return testJob.waitForCompletion(true); }
From source file:com.netease.news.classifier.naivebayes.TestNaiveBayesDriver.java
License:Apache License
private boolean runMapReduce(Map<String, List<String>> parsedArgs) throws IOException, InterruptedException, ClassNotFoundException { Path model = new Path(getOption("model")); HadoopUtil.cacheFiles(model, getConf()); //the output key is the expected value, the output value are the scores for all the labels Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, BayesTestMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class); //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels")); boolean complementary = parsedArgs.containsKey("testComplementary"); testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary)); return testJob.waitForCompletion(true); }
From source file:com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from w w w .j a v a 2 s. c om addOutputOption(); addOption(LABELS, "l", "comma-separated list of labels to include in training", false); addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, "")); addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f)); addOption( buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false))); addOption(LABEL_INDEX, "li", "The path to store the label index in", false); addOption(DefaultOptionCreator.overwriteOption().create()); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), getOutputPath()); HadoopUtil.delete(getConf(), getTempPath()); } Path labPath; String labPathStr = getOption(LABEL_INDEX); if (labPathStr != null) { labPath = new Path(labPathStr); } else { labPath = getTempPath(LABEL_INDEX); } long labelSize = createLabelIndex(labPath); float alphaI = Float.parseFloat(getOption(ALPHA_I)); boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY)); HadoopUtil.setSerializations(getConf()); HadoopUtil.cacheFiles(labPath, getConf()); //add up all the vectors with the same labels, while mapping the labels into our index Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS), SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class); indexInstances.setCombinerClass(VectorSumReducer.class); boolean succeeded = indexInstances.waitForCompletion(true); if (!succeeded) { return -1; } //sum up all the weights from the previous step, per label and per feature Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS), 
SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class); weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize)); weightSummer.setCombinerClass(VectorSumReducer.class); succeeded = weightSummer.waitForCompletion(true); if (!succeeded) { return -1; } //put the per label and per feature vectors into the cache HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf()); //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors -- // TODO: add reference here to the part of the Rennie paper that discusses this Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS), SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class); thetaSummer.setCombinerClass(VectorSumReducer.class); thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI); thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary); /* TODO(robinanil): Enable this when thetanormalization works. succeeded = thetaSummer.waitForCompletion(true); if (!succeeded) { return -1; }*/ //validate our model and then write it out to the official output getConf().setFloat(ThetaMapper.ALPHA_I, alphaI); NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf()); naiveBayesModel.validate(); naiveBayesModel.serialize(getOutputPath(), getConf()); return 0; }
From source file:mlbench.bayes.train.IndexInstances.java
License:Apache License
/**
 * Entry point of the index-instances step.
 *
 * <p>After initialising MPI_D, the process takes one of two roles depending on
 * which bipartite communicator is non-null:
 * <ul>
 *   <li><b>O communicator</b>: rank 0 creates the label index; every O process
 *       then caches it, waits on a barrier, reads its input splits and sends
 *       {@code (labelIndex, instance)} pairs for every record whose label is
 *       present in the index.</li>
 *   <li><b>A communicator</b>: receives the pairs, sums the vectors of
 *       consecutive pairs that share a key, and writes one summed vector per
 *       run of equal keys to a sequence file under the output directory.</li>
 * </ul>
 *
 * @param args command-line arguments, forwarded to {@code parseArgs} and {@code MPI_D.Init}
 * @throws MPI_D_Exception propagated from the MPI_D calls
 * @throws IOException     propagated from HDFS / sequence-file access
 * @throws MPIException    propagated from the MPI calls
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);

    if (MPI_D.COMM_BIPARTITE_O != null) {
        // O communicator
        int oRank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        rank = oRank; // keep the class-level rank in sync, as before
        if (oRank == 0) {
            System.out.println(IndexInstances.class.getSimpleName() + " O start.");
            createLabelIndex(labPath);
        }
        HadoopUtil.cacheFiles(labPath, config);
        // The index is read only after the barrier, i.e. after rank 0 has
        // returned from createLabelIndex.
        MPI_D.COMM_BIPARTITE_O.Barrier();
        OpenObjectIntHashMap<String> labelIndex = BayesUtils.readIndexFromCache(config);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, oRank);
        for (FileSplit fsplit : inputs) {
            SequenceFileRecordReader<Text, VectorWritable> kvrr =
                    new SequenceFileRecordReader<>(config, fsplit);
            try {
                Text labelText = kvrr.createKey();
                VectorWritable instance = kvrr.createValue();
                while (kvrr.next(labelText, instance)) {
                    // The label is the second component of the key when split by SLASH.
                    String label = SLASH.split(labelText.toString())[1];
                    if (labelIndex.containsKey(label)) {
                        MPI_D.Send(new IntWritable(labelIndex.get(label)), instance);
                    }
                }
            } finally {
                // Fix: the reader was previously never closed (one leak per split).
                kvrr.close();
            }
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        // A communicator
        int aRank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDir);
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString());
        ((JobConf) config).setOutputKeyClass(IntWritable.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<IntWritable, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);
        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<IntWritable, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(aRank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }

        try {
            IntWritable key = null; // key of the run currently being summed
            Vector vector = null;   // running sum for that key
            Object[] vals = MPI_D.Recv();
            while (vals != null) {
                IntWritable newKey = (IntWritable) vals[0];
                VectorWritable newPoint = (VectorWritable) vals[1];
                // A change of key ends the current run: flush its sum.
                if (key != null && !key.equals(newKey)) {
                    outrw.write(key, new VectorWritable(vector));
                    vector = null;
                }
                if (vector == null) {
                    vector = newPoint.get();
                } else {
                    vector.assign(newPoint.get(), Functions.PLUS);
                }
                // NOTE(review): this keeps references to the received objects; it
                // assumes MPI_D.Recv() hands out fresh instances each call -- confirm.
                key = newKey;
                vals = MPI_D.Recv();
            }
            // Flush the last run (skipped when nothing was received).
            if (key != null) {
                outrw.write(key, new VectorWritable(vector));
            }
        } finally {
            // Fix: close the writer even when the receive/write loop throws.
            outrw.close(null);
        }
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }
    }
    MPI_D.Finalize();
}