Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
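
This call is only needed when the map output key type differs from the job's final output key type; if it is not set, Hadoop falls back to the class passed to setOutputKeyClass. Below is a minimal sketch (not taken from the sources that follow; ExampleDriver, ExampleMapper and ExampleReducer are hypothetical, and Text, IntWritable, DoubleWritable come from org.apache.hadoop.io) showing the typical pairing with setMapOutputValueClass:

    Job job = Job.getInstance(new Configuration(), "example");
    job.setJarByClass(ExampleDriver.class);    // hypothetical driver class
    job.setMapperClass(ExampleMapper.class);   // assumed to emit Text / IntWritable
    job.setReducerClass(ExampleReducer.class); // assumed to emit Text / DoubleWritable

    // intermediate (map output) types
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // final (reduce output) types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);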

Usage

From source file: diamondmapreduce.DiamondMapReduce.java

License: Apache License

int launchHamondAWS(String[] arguments) throws Exception {

    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make local Hamond dir
    awshamondsidefunctions.MakeHamondDir.make();

    //copy DIAMOND, query, reference from S3 to master local
    awshamondsidefunctions.CopyFromS3.copyFromS3(diamond, query, dataBase);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB("/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(dataBase).getName());

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, "/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(query).getName(),
            "/mnt/Hamond/" + new Path(dataBase).getName(), userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase);
    conf.set(OUTPUT, outPut);
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);
    job.setReducerClass(AWSDiamondReducer.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file: distributed.hadoop.MapReduceJobConfig.java

License: Open Source License

/**
 * Apply the settings encapsulated in this config and return a Job object
 * ready for execution.
 * 
 * @param jobName the name of the job
 * @param conf the Configuration object that will be wrapped in the Job
 * @param env environment variables
 * @return a configured Job object
 * @throws IOException if a problem occurs
 * @throws ClassNotFoundException if various classes are not found
 */
public Job configureForHadoop(String jobName, Configuration conf, Environment env)
        throws IOException, ClassNotFoundException {

    String jobTrackerPort = getJobTrackerPort();
    if (DistributedJobConfig.isEmpty(jobTrackerPort)) {
        jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN
                : AbstractHadoopJobConfig.DEFAULT_PORT;
    }
    String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort;
    if (DistributedJobConfig.isEmpty(jobTracker)) {
        System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ")
                + "set - running locally...");
    } else {
        jobTracker = environmentSubstitute(jobTracker, env);
        if (AbstractHadoopJobConfig.isHadoop2()) {
            conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker);
            conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS,
                    environmentSubstitute(getJobTrackerHost(), env) + ":8030");
        } else {
            conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker);
        }
    }
    System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ")
            + jobTracker);

    if (AbstractHadoopJobConfig.isHadoop2()) {
        // a few other properties needed to run against Yarn
        conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");
        conf.set("mapreduce.framework.name", "yarn");
    }

    if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
        conf.set(AbstractHadoopJobConfig.isHadoop2() ? HADOOP2_MAPRED_MAX_SPLIT_SIZE
                : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize());
    }

    // Do any user supplied properties here before creating the Job
    for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    m_hdfsConfig.configureForHadoop(conf, env);
    Job job = new Job(conf, jobName);

    String numMappers = getNumberOfMaps();
    if (!DistributedJobConfig.isEmpty(numMappers)) {
        numMappers = environmentSubstitute(numMappers, env);
        ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers));
    }

    // The number of map tasks that will be run simultaneously by a task tracker
    String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum();
    if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) {
        ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks);
    }

    String numReducers = getNumberOfReducers();
    if (!DistributedJobConfig.isEmpty(numReducers)) {
        numReducers = environmentSubstitute(numReducers, env);
        job.setNumReduceTasks(Integer.parseInt(numReducers));

        if (Integer.parseInt(numReducers) == 0) {
            System.err.println("Warning - no reducer class set. Configuring for a map only job");
        }
    } else {
        job.setNumReduceTasks(1);
    }
    String mapperClass = getMapperClass();
    if (DistributedJobConfig.isEmpty(mapperClass)) {
        throw new IOException("No mapper class specified!");
    }
    mapperClass = environmentSubstitute(mapperClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass);

    job.setMapperClass(mc);

    String reducerClass = getReducerClass();
    if (DistributedJobConfig.isEmpty(reducerClass) && Integer.parseInt(numReducers) > 0) {
        throw new IOException("No reducer class specified!");
    } else if (job.getNumReduceTasks() > 0) {
        reducerClass = environmentSubstitute(reducerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> rc = (Class<? extends Reducer>) Class.forName(reducerClass);

        job.setReducerClass(rc);
    }

    String combinerClass = getCombinerClass();
    if (!DistributedJobConfig.isEmpty(combinerClass)) {
        combinerClass = environmentSubstitute(combinerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass);

        job.setCombinerClass(cc);
    }

    String inputFormatClass = getInputFormatClass();
    if (DistributedJobConfig.isEmpty(inputFormatClass)) {
        throw new IOException("No input format class specified");
    }
    inputFormatClass = environmentSubstitute(inputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

    job.setInputFormatClass(ifc);

    String outputFormatClass = getOutputFormatClass();
    if (DistributedJobConfig.isEmpty(outputFormatClass)) {
        throw new IOException("No output format class specified");
    }
    outputFormatClass = environmentSubstitute(outputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends OutputFormat> ofc = (Class<? extends OutputFormat>) Class.forName(outputFormatClass);
    job.setOutputFormatClass(ofc);

    String mapOutputKeyClass = getMapOutputKeyClass();
    if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) {
        throw new IOException("No map output key class defined");
    }
    mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env);
    Class mokc = Class.forName(mapOutputKeyClass);
    job.setMapOutputKeyClass(mokc);

    String mapOutputValueClass = getMapOutputValueClass();
    if (DistributedJobConfig.isEmpty(mapOutputValueClass)) {
        throw new IOException("No map output value class defined");
    }
    mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env);
    Class movc = Class.forName(mapOutputValueClass);
    job.setMapOutputValueClass(movc);

    String outputKeyClass = getOutputKeyClass();
    if (DistributedJobConfig.isEmpty(outputKeyClass)) {
        throw new IOException("No output key class defined");
    }
    outputKeyClass = environmentSubstitute(outputKeyClass, env);
    Class okc = Class.forName(outputKeyClass);
    job.setOutputKeyClass(okc);

    String outputValueClass = getOutputValueClass();
    if (DistributedJobConfig.isEmpty(outputValueClass)) {
        throw new IOException("No output value class defined");
    }
    outputValueClass = environmentSubstitute(outputValueClass, env);
    Class ovc = Class.forName(outputValueClass);
    job.setOutputValueClass(ovc);

    String inputPaths = getInputPaths();
    // don't complain if there aren't any as inputs such as HBASE
    // require other properties to be set
    if (!DistributedJobConfig.isEmpty(inputPaths)) {
        inputPaths = environmentSubstitute(inputPaths, env);
        FileInputFormat.setInputPaths(job, inputPaths);
    }

    String outputPath = getOutputPath();
    if (DistributedJobConfig.isEmpty(outputPath)) {
        throw new IOException("No output path specified");
    }
    outputPath = environmentSubstitute(outputPath, env);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}

From source file: dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java

License: Apache License

public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    Job job = new Job(configuration, "ARC Header Extractor");
    job.setJarByClass(ARCHeaderExtractorMR.class);

    job.setMapperClass(ARCHeaderExtractorMapper.class);
    job.setCombinerClass(ARCHeaderExtractorReducer.class);
    job.setReducerClass(ARCHeaderExtractorReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    int n = args.length;
    if (n == 0 || n > 2) {
        System.err.println(
                "Not enough arguments. input dir and output dir mandatory. Only " + n + " were supplied.");
        System.exit(0);
    }

    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file: edu.berkeley.chukwa_xtrace.XtrExtract.java

License: Apache License

@Override
public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());

    extractor.setMapperClass(MapClass.class);

    extractor.setReducerClass(Reduce.class);
    extractor.setJobName("x-trace reconstructor");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(Text.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);

    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));
    System.out.println("looks OK.  Submitting.");
    extractor.submit();
    //    extractor.waitForCompletion(false);
    return 0;

}

From source file: edu.berkeley.chukwa_xtrace.XtrIndex.java

License: Apache License

@Override
public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());
    extractor.setMapperClass(MapClass.class);
    //no reduce, just identity

    extractor.setJobName("x-trace indexer");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(TextArrayWritable.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);

    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));
    System.out.println("looks OK.  Submitting.");
    extractor.submit();
    //    extractor.waitForCompletion(false);
    return 0;

}

From source file: edu.bigdata.training.serialization.UserHistory.java

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf, "postHistory");
    job.setJarByClass(UserHistory.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    MultipleInputs.addInputPath(job, new Path("input/posts/user_info.txt"), TextInputFormat.class,
            UserCityMapper.class);
    MultipleInputs.addInputPath(job, new Path("input/posts/user_posts.txt"), TextInputFormat.class,
            UserPostsMapper.class);

    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, UserPostSummary.getClassSchema());
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);

    Path outPath = new Path("output/user/posts");
    FileOutputFormat.setOutputPath(job, outPath);
    job.setReducerClass(UserPostHistory.class);
    //outPath.getFileSystem(job.getConfiguration()).delete(outPath, true);

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file: edu.columbia.hs2807.Sentiment.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "sentiment");

    job.setJarByClass(Sentiment.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
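    // note: the map output value type (LongArrayWritable) differs from the
    // final output value type (DoubleWritable), so the intermediate types
    // must be declared explicitly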
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LongArrayWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: edu.gslis.ts.hadoop.ThriftBulkLoader.java

License: Apache License

public int run(String[] args) throws Exception {
    String tableName = args[0];
    String inputPath = args[1];
    String outputPath = args[2];
    Path topicsFile = new Path(args[3]);
    Path vocabFile = new Path(args[4]);
    Path dateBinFile = new Path(args[5]);

    Configuration config = getConf();
    config.set("hbase.table.name", tableName);
    HBaseConfiguration.addHbaseResources(config);

    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
    job.setJobName("Bulk Loading HBase Table::" + tableName);
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapperClass(ThriftFilterMapper.class);

    Path output = new Path(outputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapOutputValueClass(Put.class);

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
    job.addCacheFile(dateBinFile.toUri());

    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().setClass("mapred.map.output.compression.codec",
            org.apache.hadoop.io.compress.SnappyCodec.class,
            org.apache.hadoop.io.compress.CompressionCodec.class);
    job.getConfiguration().set("hfile.compression", Compression.Algorithm.SNAPPY.getName());

    //RegionLocator regionLocator = conn.getRegionLocator(tableName);
    //HFileOutputFormat2.configureIncrementalLoad(job, new HTable(config,tableName));

    Connection con = ConnectionFactory.createConnection(config);
    TableName htableName = TableName.valueOf(tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, con.getTable(htableName),
            con.getRegionLocator(htableName));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        // Couldn't find a better way to do this. The LoadIncrementalHFiles
        // seems to want 777 permissions on the output directory.
        try {
            Runtime rt = Runtime.getRuntime();
            rt.exec("hadoop fs -chmod -R 777 " + output);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /*
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
        HTable htable = new HTable(config, tableName);
        loader.doBulkLoad(new Path(outputPath), htable);
        */

    } else {
        throw new IOException("error with job");
    }

    return 0;

    // - 

    /*
    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
            
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);  
    job.setMapOutputValueClass(Put.class);  
    job.setInputFormatClass(ThriftFileInputFormat.class);
            
    //HFileOutputFormat2.configureIncrementalLoad(job, htable);
            
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));        
            
    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
            
    job.setMapperClass(ThriftFilterMapper.class);
            
    boolean b = job.waitForCompletion(true);
    if (!b) {
    throw new IOException("error with job");
    }
            
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
    loader.doBulkLoad(new Path(outputPath), htable);
            
    return 0;        
    */
}

From source file: edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();

    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}

From source file: edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    //      FileSystem fs = FileSystem.get(getConf());
    //      Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    //      BufferedWriter writer = new BufferedWriter(
    //            new OutputStreamWriter(fs.create(dictionaryPath, true)));
    //      BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    //      String line = null;
    //      while ((line = reader.readLine()) != null) {
    //         writer.write(line + "\n");
    //      }
    //      writer.close();

    // 
    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage parallely.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    //      Path dictionaryPath = new Path(dictionaryFile);
    //      DistributedCache.setCacheFiles(new URI[] {dictionaryPath.toUri()}, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}