List of usage examples for org.apache.mahout.common.HadoopUtil cacheFiles
public static void cacheFiles(Path fileToCache, Configuration conf)
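Before the per-project examples below, here is a minimal sketch of the usual pattern: the driver registers a path with Hadoop's distributed cache via cacheFiles, and task-side code later resolves it. The class name, the model path, and the HadoopUtil.getCachedFiles call are illustrative assumptions rather than part of the examples on this page; verify them against the Mahout version you build with.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.common.HadoopUtil;

    public class CacheFilesSketch {                      // hypothetical class name
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path model = new Path("/tmp/naive-bayes-model"); // hypothetical path

        // Driver side: register the path with the distributed cache.
        HadoopUtil.cacheFiles(model, conf);

        // Task side (e.g. in a Mapper's setup()): recover the cached path(s).
        // getCachedFiles(Configuration) is assumed to be present in the Mahout build used here.
        Path[] cached = HadoopUtil.getCachedFiles(conf);
        System.out.println("first cached path: " + cached[0]);
      }
    }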
From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java
License:Apache License
    private boolean runMapReduce(Map<String, List<String>> parsedArgs)
        throws IOException, InterruptedException, ClassNotFoundException {
      Path model = new Path(getOption("model"));
      HadoopUtil.cacheFiles(model, getConf());
      // the output key is the expected value, the output values are the scores for all the labels
      Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
          BayesTestMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
      //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
      //boolean complementary = parsedArgs.containsKey("testComplementary");
      //  always evaluates to false, because the key in the map is "--testComplementary"
      boolean complementary = hasOption("testComplementary");
      // alternatively: complementary = parsedArgs.containsKey("--testComplementary");
      testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
      return testJob.waitForCompletion(true);
    }
From source file:com.netease.news.classifier.naivebayes.TestNaiveBayesDriver.java
License:Apache License
    private boolean runMapReduce(Map<String, List<String>> parsedArgs)
        throws IOException, InterruptedException, ClassNotFoundException {
      Path model = new Path(getOption("model"));
      HadoopUtil.cacheFiles(model, getConf());
      //the output key is the expected value, the output value are the scores for all the labels
      Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
          BayesTestMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
      //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
      boolean complementary = parsedArgs.containsKey("testComplementary");
      testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
      return testJob.waitForCompletion(true);
    }
From source file:com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java
License:Apache License
    @Override
    public int run(String[] args) throws Exception {
      addInputOption();
      addOutputOption();
      addOption(LABELS, "l", "comma-separated list of labels to include in training", false);
      addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
      addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
      addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
      addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
      addOption(DefaultOptionCreator.overwriteOption().create());
      Map<String, List<String>> parsedArgs = parseArguments(args);
      if (parsedArgs == null) {
        return -1;
      }
      if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
      }
      Path labPath;
      String labPathStr = getOption(LABEL_INDEX);
      if (labPathStr != null) {
        labPath = new Path(labPathStr);
      } else {
        labPath = getTempPath(LABEL_INDEX);
      }
      long labelSize = createLabelIndex(labPath);
      float alphaI = Float.parseFloat(getOption(ALPHA_I));
      boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY));

      HadoopUtil.setSerializations(getConf());
      HadoopUtil.cacheFiles(labPath, getConf());

      //add up all the vectors with the same labels, while mapping the labels into our index
      Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
          SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
          VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
      indexInstances.setCombinerClass(VectorSumReducer.class);
      boolean succeeded = indexInstances.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }

      //sum up all the weights from the previous step, per label and per feature
      Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
          SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
          VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
      weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
      weightSummer.setCombinerClass(VectorSumReducer.class);
      succeeded = weightSummer.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }

      //put the per label and per feature vectors into the cache
      HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

      //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
      // TODO: add reference here to the part of the Rennie paper that discusses this
      Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
          SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
          VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
      thetaSummer.setCombinerClass(VectorSumReducer.class);
      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
      thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
      /* TODO(robinanil): Enable this when thetanormalization works.
      succeeded = thetaSummer.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }*/

      //validate our model and then write it out to the official output
      getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
      NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
      naiveBayesModel.validate();
      naiveBayesModel.serialize(getOutputPath(), getConf());
      return 0;
    }
From source file:mlbench.bayes.train.IndexInstances.java
License:Apache License
@SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);/*from w w w .j a va 2 s . co m*/ HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); if (rank == 0) { System.out.println(IndexInstances.class.getSimpleName() + " O start."); createLabelIndex(labPath); } HadoopUtil.cacheFiles(labPath, config); MPI_D.COMM_BIPARTITE_O.Barrier(); OpenObjectIntHashMap<String> labelIndex = BayesUtils.readIndexFromCache(config); if (MPI_D.COMM_BIPARTITE_O != null) { // O communicator int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); Text labelText = kvrr.createKey(); VectorWritable instance = kvrr.createValue(); while (kvrr.next(labelText, instance)) { String label = SLASH.split(labelText.toString())[1]; if (labelIndex.containsKey(label)) { MPI_D.Send(new IntWritable(labelIndex.get(label)), instance); } } } } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDir); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(IntWritable.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<IntWritable, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<IntWritable, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } IntWritable key = null, newKey = null; VectorWritable point = null, newPoint = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (IntWritable) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { } else if (!key.equals(newKey)) { outrw.write(key, new VectorWritable(vector)); vector = null; } if (vector == null) { vector = newPoint.get(); } else { vector.assign(newPoint.get(), Functions.PLUS); } key = newKey; point = newPoint; vals = MPI_D.Recv(); } if (newKey != null && newPoint != null) { outrw.write(key, new VectorWritable(vector)); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } } MPI_D.Finalize(); }