Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
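
The map output key and value classes default to the job's final output classes set via setOutputKeyClass and setOutputValueClass, so setMapOutputKeyClass only needs to be called when the two differ; calling it after the job has been submitted raises the declared IllegalStateException. Below is a minimal sketch of a driver where this matters (LineLengthMapper, LineLengthReducer, and the argument paths are illustrative, not taken from the sources that follow): the mapper emits IntWritable keys while the job's final key class is Text, so the map output key class must be declared explicitly.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LineLengthDriver {

    // Maps each input line to (byte length of the line, line text).
    public static class LineLengthMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        private final IntWritable length = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            length.set(value.getLength());
            context.write(length, value);
        }
    }

    // Counts how many lines share each length and emits (description, count).
    public static class LineLengthReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (Text ignored : values) {
                count++;
            }
            context.write(new Text("lines of length " + key.get()), new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "Group lines by length.");
        job.setJarByClass(LineLengthDriver.class);

        job.setMapperClass(LineLengthMapper.class);
        job.setReducerClass(LineLengthReducer.class);

        // The map output types differ from the final output types, so both
        // pairs must be declared. Omitting setMapOutputKeyClass here would
        // make the framework assume Text (the final output key class) and
        // fail at runtime with a type mismatch.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}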

Usage

From source file:edu.indiana.d2i.htrc.io.ParallelDataCopyJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.valueOf(args[4]);

    logger.info("ParallelDataCopyJob ");
    logger.info(" - input: " + inputPath); // id list
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);

    Job job = new Job(getConf(), "Copy data from HTRC data storage parallely.");
    job.setJarByClass(ParallelDataCopyJob.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("ParallelDataCopyJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromHDFSRawText.java

License:Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from plain text in HDFS.");
    job.setJarByClass(SparseVectorsFromHDFSRawText.class);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and lucene
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromLucene.java

License:Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Lucene.");
    job.setJarByClass(SparseVectorsFromLucene.class);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and lucene
    //       job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set("htrc.lucene.index.path", indexLoc);

    job.setInputFormatClass(LuceneIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromSolr.java

License:Apache License

private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Solr.");
    job.setJarByClass(SparseVectorsFromSolr.class);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // dictionary and solr
    //       job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set(HTRCConstants.SOLR_MAIN_URL, solrEPR);

    job.setInputFormatClass(SolrIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SparseVectorsToMemcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(idListDir));

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java

License:Apache License

private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SVFromHDFS2Memcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(vecDir));

    job.waitForCompletion(true);
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

/**
 * Run the job using supplied arguments
 * 
 * @param conf
 *            the job configuration
 * @param input
 *            the directory pathname for input points
 * @param clustersOut
 *            the directory pathname for output clusters
 * @param measureClass
 *            the classname of the DistanceMeasure
 * @param convergenceDelta
 *            the convergence delta value
 * 
 * @return true if the clusters converged in this iteration
 */
private static boolean runIteration(Configuration conf, Path input, Path clustersOut, String measureClass,
        String convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);

    Job job = new Job(conf, "KMeans Driver running runIteration ");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ClusterObservations.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Cluster.class);

    //      job.setInputFormatClass(SequenceFileInputFormat.class);
    //      job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(MemIDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapperClass(MemKMeansMapper.class);
    job.setCombinerClass(KMeansCombiner.class); // ??
    job.setReducerClass(MemKMeansReducer.class);

    FileInputFormat.addInputPath(job, input); // input is id list
    FileOutputFormat.setOutputPath(job, clustersOut);

    job.setJarByClass(MemCachedKMeansDriver.class);
    HadoopUtil.delete(conf, clustersOut);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("K-Means Iteration failed processing ");
    }

    return isConverged(conf);
}

From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
    }

    String input = args[0];
    String output = args[1];
    int maxCluster = Integer.valueOf(args[2]);

    logger.info("StreamingKmeansDriver ");
    logger.info(" - input: " + input);
    logger.info(" - output: " + output);
    logger.info(" - maxCluster: " + maxCluster);

    // set job
    Job job = new Job(getConf(), "Streaming KMeans");
    job.setJarByClass(StreamingKMeansDriver.class);
    StreamingKMeansConfigHelper(job.getConfiguration(), input, maxCluster);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamingKMeansCluster.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setMapperClass(StreamingKMeansMapper.class);
    job.setReducerClass(StreamingKMeansReducer.class);

    job.waitForCompletion(true);

    return 0;
}