Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
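
The map output key and value classes default to the job's final output classes set via setOutputKeyClass and setOutputValueClass, so setMapOutputKeyClass only needs to be called when the two differ; calling it after the job has been submitted raises the declared IllegalStateException. Below is a minimal sketch of a driver where this matters (LineLengthMapper, LineLengthReducer, and the argument paths are illustrative, not taken from the sources that follow): the mapper emits IntWritable keys while the job's final key class is Text, so the map output key class must be declared explicitly.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LineLengthDriver {

    // Maps each input line to (byte length of the line, line text).
    public static class LineLengthMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        private final IntWritable length = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            length.set(value.getLength());
            context.write(length, value);
        }
    }

    // Counts how many lines share each length and emits (description, count).
    public static class LineLengthReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (Text ignored : values) {
                count++;
            }
            context.write(new Text("lines of length " + key.get()), new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "Group lines by length.");
        job.setJarByClass(LineLengthDriver.class);

        job.setMapperClass(LineLengthMapper.class);
        job.setReducerClass(LineLengthReducer.class);

        // The map output types differ from the final output types, so both
        // pairs must be declared. Omitting setMapOutputKeyClass here would
        // make the framework assume Text (the final output key class) and
        // fail at runtime with a type mismatch.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}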

Usage

From source file:edu.indiana.d2i.htrc.io.ParallelDataCopyJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.valueOf(args[4]);

    logger.info("ParallelDataCopyJob ");
    logger.info(" - input: " + inputPath); // id list
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);

    Job job = new Job(getConf(), "Copy data from HTRC data storage parallely.");
    job.setJarByClass(ParallelDataCopyJob.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("ParallelDataCopyJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromHDFSRawText.java

License:Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from plain text in HDFS.");
    job.setJarByClass(SparseVectorsFromHDFSRawText.class);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and lucene
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromLucene.java

License:Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Lucene.");
    job.setJarByClass(SparseVectorsFromLucene.class);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and lucene
    //       job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set("htrc.lucene.index.path", indexLoc);

    job.setInputFormatClass(LuceneIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromSolr.java

License:Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Solr.");
    job.setJarByClass(SparseVectorsFromSolr.class);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and solr
    //       job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set(HTRCConstants.SOLR_MAIN_URL, solrEPR);

    job.setInputFormatClass(SolrIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SparseVectorsToMemcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(idListDir));

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java

License:Apache License

private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SVFromHDFS2Memcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(vecDir));

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

/**
 * Run the job using supplied arguments
 * 
 * @param conf
 *            the job configuration
 * @param input
 *            the directory pathname for input points
 * @param clustersOut
 *            the directory pathname for output clusters
 * @param measureClass
 *            the classname of the DistanceMeasure
 * @param convergenceDelta
 *            the convergence delta value
 * 
 * @return true if the clusters converged in this iteration
 */
private static boolean runIteration(Configuration conf, Path input, Path clustersOut, String measureClass,
        String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);

    Job job = new Job(conf, "KMeans Driver running runIteration ");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ClusterObservations.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Cluster.class);

    //      job.setInputFormatClass(SequenceFileInputFormat.class);
    //      job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(MemIDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapperClass(MemKMeansMapper.class);
    job.setCombinerClass(KMeansCombiner.class); // ??
    job.setReducerClass(MemKMeansReducer.class);

    FileInputFormat.addInputPath(job, input); // input is id list
    FileOutputFormat.setOutputPath(job, clustersOut);

    job.setJarByClass(MemCachedKMeansDriver.class);
    HadoopUtil.delete(conf, clustersOut);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("K-Means Iteration failed processing ");
    }

    return isConverged(conf);
}

From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
    }

    String input = args[0];
    String output = args[1];
    int maxCluster = Integer.valueOf(args[2]);

    logger.info("StreamingKmeansDriver ");
    logger.info(" - input: " + input);
    logger.info(" - output: " + output);
    logger.info(" - maxCluster: " + maxCluster);

    // set job
    Job job = new Job(getConf(), "Streaming KMeans");
    job.setJarByClass(StreamingKMeansDriver.class);
    StreamingKMeansConfigHelper(job.getConfiguration(), input, maxCluster);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamingKMeansCluster.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setMapperClass(StreamingKMeansMapper.class);
    job.setReducerClass(StreamingKMeansReducer.class);

    job.waitForCompletion(true);

    return 0;
}