Example usage for org.apache.hadoop.mapred JobConf setOutputKeyClass

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#setOutputKeyClass from open-source projects.

Prototype

public void setOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the job output data.
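
A minimal sketch may help orient before the project examples below. It is not taken from any of those projects; the class name and command-line paths are illustrative assumptions.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class OutputKeyClassSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(OutputKeyClassSketch.class);
        conf.setJobName("output-key-class-sketch");

        // KeyValueTextInputFormat feeds the mapper Text keys and Text values,
        // so the identity mapper and reducer emit Text/Text pairs.
        conf.setInputFormat(KeyValueTextInputFormat.class);
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        // Declare the key/value classes of the job's final output. Unless
        // setMapOutputKeyClass/setMapOutputValueClass are called, the same
        // classes also apply to the intermediate map output.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));  // input dir
        FileOutputFormat.setOutputPath(conf, new Path(args[1])); // output dir

        JobClient.runJob(conf); // blocks until the job completes
    }
}

If the reducer emits a key class other than the one declared here, the job fails at runtime with a type-mismatch error rather than at compile time, which is why the examples below declare these classes explicitly.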

Usage

From source file: io.aos.t4f.hadoop.mapred.WordCountTest.java

License: Apache License

private JobConf createJobConf() {
    JobConf conf = mrCluster.createJobConf();
    conf.setJobName("wordcount test");

    conf.setMapperClass(WordCountMapper.class);
    conf.setReducerClass(WordCountReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
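    // Final (reducer) output types; the map output types are declared separately above.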
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    return conf;
}

From source file: io.dstream.tez.TezDAGBuilder.java

License: Apache License

/**
 * Builds a JobConf, based on the Tez configuration, that declares the given
 * output key/value classes.
 */
private JobConf buildJobConf(Class<? extends Writable> keyClass, Class<? extends Writable> valueClass) {
    JobConf jobConf = new JobConf(this.tezClient.getTezConfiguration());
    jobConf.setOutputKeyClass(keyClass);
    jobConf.setOutputValueClass(valueClass);
    return jobConf;
}

From source file: io.fluo.stress.trie.Generate.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        log.error("Usage: " + this.getClass().getSimpleName()
                + " <numMappers> <numbersPerMapper> <max> <output dir>");
        System.exit(-1);
    }

    int numMappers = Integer.parseInt(args[0]);
    int numPerMapper = Integer.parseInt(args[1]);
    long max = Long.parseLong(args[2]);
    Path out = new Path(args[3]);

    Preconditions.checkArgument(numMappers > 0, "numMappers <= 0");
    Preconditions.checkArgument(numPerMapper > 0, "numPerMapper <= 0");
    Preconditions.checkArgument(max > 0, "max <= 0");

    JobConf job = new JobConf(getConf());

    job.setJobName(this.getClass().getName());

    job.setJarByClass(Generate.class);

    job.setInt(TRIE_GEN_NUM_PER_MAPPER_PROP, numPerMapper);
    job.setInt(TRIE_GEN_NUM_MAPPERS_PROP, numMappers);
    job.setLong(TRIE_GEN_MAX_PROP, max);

    job.setInputFormat(RandomLongInputFormat.class);

    job.setNumReduceTasks(0);

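    // With zero reducers this is a map-only job, so these classes describe the
    // mapper's output, written directly by the SequenceFileOutputFormat below.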
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, out);

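    // runJob blocks until the job completes, so waitForCompletion is then a no-op.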
    RunningJob runningJob = JobClient.runJob(job);
    runningJob.waitForCompletion();
    return runningJob.isSuccessful() ? 0 : -1;
}

From source file: io.fluo.stress.trie.NumberIngest.java

License: Apache License

public static void main(String[] args) throws IOException, ConfigurationException {

    // Parse arguments
    if (args.length != 4) {
        log.error("Usage: NumberIngest <numMappers> <numbersPerMapper> <nodeSize> <fluoProps>");
        System.exit(-1);
    }
    int numMappers = Integer.parseInt(args[0]);
    int numPerMapper = Integer.parseInt(args[1]);
    int nodeSize = Integer.parseInt(args[2]);
    String fluoPropsPath = args[3];

    String hadoopPrefix = System.getenv("HADOOP_PREFIX");
    if (hadoopPrefix == null) {
        hadoopPrefix = System.getenv("HADOOP_HOME");
        if (hadoopPrefix == null) {
            log.error("HADOOP_PREFIX or HADOOP_HOME needs to be set!");
            System.exit(-1);
        }
    }

    // create test name
    String testId = String.format("test-%d", (new Date().getTime() / 1000));
    String testDir = "/trie-stress/" + testId;

    setupHdfs(hadoopPrefix, testDir, numMappers, numPerMapper);

    JobConf ingestConf = new JobConf(NumberIngest.class);
    ingestConf.setJobName("NumberIngest");

    FluoConfiguration config = new FluoConfiguration(new File(fluoPropsPath));

    loadConfig(ingestConf, ConfigurationConverter.getProperties(config));
    ingestConf.setInt(TRIE_NODE_SIZE_PROP, nodeSize);

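    // No explicit map output classes are set, so these types describe both the
    // intermediate map output and the final job output.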
    ingestConf.setOutputKeyClass(LongWritable.class);
    ingestConf.setOutputValueClass(IntWritable.class);
    ingestConf.setMapperClass(NumberIngest.IngestMapper.class);
    ingestConf.setReducerClass(NumberIngest.UniqueReducer.class);

    FileInputFormat.setInputPaths(ingestConf, new Path(testDir + "/input/"));
    FileOutputFormat.setOutputPath(ingestConf, new Path(testDir + "/unique/"));

    RunningJob ingestJob = JobClient.runJob(ingestConf);
    ingestJob.waitForCompletion();
    if (ingestJob.isSuccessful()) {

        JobConf countConf = new JobConf(NumberIngest.class);
        countConf.setJobName("NumberCount");

        countConf.setOutputKeyClass(Text.class);
        countConf.setOutputValueClass(LongWritable.class);
        countConf.setMapperClass(NumberIngest.CountMapper.class);
        countConf.setReducerClass(NumberIngest.CountReducer.class);

        FileInputFormat.setInputPaths(countConf, new Path(testDir + "/unique/"));
        FileOutputFormat.setOutputPath(countConf, new Path(testDir + "/output/"));

        RunningJob countJob = JobClient.runJob(countConf);
        countJob.waitForCompletion();
        if (countJob.isSuccessful()) {
            log.info("Ingest and count jobs were successful");
            log.info("Output can be viewed @ " + testDir);
            System.exit(0);
        } else {
            log.error("Count job failed for " + testId);
        }
    } else {
        log.error("Ingest job failed.  Skipping count job for " + testId);
    }

    System.exit(-1);
}

From source file: io.hops.erasure_coding.MapReduceEncoder.java

License: Apache License

/**
 * Creates a new job conf based on the configuration passed in.
 *
 * @param conf the configuration to inherit from
 * @return the configured JobConf
 */
private static JobConf createJobConf(Configuration conf) {
    JobConf jobconf = new JobConf(conf, MapReduceEncoder.class);
    jobName = NAME + " " + dateForm.format(new Date(BaseEncodingManager.now()));
    jobconf.setUser(BaseEncodingManager.JOBUSER);
    jobconf.setJobName(jobName);
    jobconf.setMapSpeculativeExecution(false);
    RaidUtils.parseAndSetOptions(jobconf, SCHEDULER_OPTION_LABEL);

    jobconf.setJarByClass(MapReduceEncoder.class);
    jobconf.setInputFormat(DistRaidInputFormat.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(Text.class);

    jobconf.setMapperClass(DistRaidMapper.class);
    jobconf.setNumReduceTasks(0);
    return jobconf;
}

From source file: it.isislab.sof.core.engine.hadoop.sshclient.utils.simulation.executor.SOF.java

License: Apache License

public static void main(String[] args) {

    /*
     * Sample invocation:
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output /home/michele/Scrivania/aids/output.xml false pepp ciao
     */

    /*         try {//Runtime.getRuntime().exec("rm -r /home/lizard87/Desktop/mason_test/output");
        Runtime.getRuntime().exec("rm -r /home/michele/Scrivania/aids/output");
       } catch (IOException e) {e.printStackTrace();}*/

    if (args.length < 9 || args.length == 11 || args.length == 12 || args.length >= 15) {

        System.out.println("Usage:");
        System.out.println("java -jar SCUD.jar " + "<simulation_name> " + "<simulation_path_home> "
                + "<simulation_type[mason |netlogo |generic]>" + "<simulation_generic_interpreter_path>"
                + "<simultion_program_path> " + "<simulation_mapper_input_path> "
                + "<simulation_mapper_output_path> " + "<simulation_output_domain_xmlfile> "
                + "<simulation_input_path> " + "<<simulation_rating_path>>" + "<oneshot[one|loop]> "
                + "<author_name> " + "<simulation_description> " + "<path_interpreter_evaluate_file> "
                + "<evaluate_file_path>");
        System.exit(-1);
    }

    Configuration conf = null;
    JobConf job = null;

    String AUTHOR = null;/*author name*/
    String SIMULATION_NAME = null;/*simulation name*/
    String SIMULATION_HOME = null;/*path simulation*/
    String SIM_TYPE = null;/*mason, netlogo, generic*/
    String SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = null;
    String SIM_EXECUTABLE_SIMULATION_PROGRAM = null; /*executable program *.jar | *.nlogo*/
    String SIM_EXECUTION_INPUT_DATA_MAPPER = null;/*input.data path */
    String SIM_EXECUTION_OUTPUT_MAPPER = null;/*output loop(i) path*/
    String SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = null;/*path of domain file */
    String SIM_EXECUTION_INPUT_XML = null;/*execution input path*/
    boolean ISLOOP = false;/*false[one] | true[loop]*/
    //String DESCRIPTION=null;/*simulations' description*/
    String INTERPRETER_REMOTE_PATH_EVALUATION = null;/*remote program bin path for executing EvalFoo*/
    String EXECUTABLE_RATING_FILE = null;/*path of rating file*/
    String SIM_RATING_PATH = null;

    // aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output /home/michele/Scrivania/aids/domain.xml /home/michele/Scrivania/aids/input loop pepp ciao /usr/bin/python /home/michele/Scrivania/aids/evaluate.py 

    if (args.length == 13) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        SIM_EXECUTION_INPUT_XML = args[7];
        SIM_RATING_PATH = args[8];
        ISLOOP = Boolean.parseBoolean(args[9]);
        AUTHOR = args[10];
        //DESCRIPTION=args[11];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[11];
        EXECUTABLE_RATING_FILE = args[12];
        //   System.out.println(DESCRIPTION);
        //System.out.println(INTERPRETER_REMOTE_PATH_EVALUATION);

    }

    else if (args.length == 9) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        ISLOOP = Boolean.parseBoolean(args[7]);
        AUTHOR = args[8];
        //DESCRIPTION=args[9];
    }

    else if (args.length == 14) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        SIM_EXECUTION_INPUT_XML = args[8];
        SIM_RATING_PATH = args[9];
        ISLOOP = Boolean.parseBoolean(args[10]);
        AUTHOR = args[11];
        //   DESCRIPTION=args[12];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[12];
        EXECUTABLE_RATING_FILE = args[13];

    }

    else if (args.length == 10) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        ISLOOP = Boolean.parseBoolean(args[8]);
        AUTHOR = args[9];
        //   DESCRIPTION=args[10];
    }

    if (!(SIM_TYPE.equalsIgnoreCase("mason") || SIM_TYPE.equalsIgnoreCase("netlogo")
            || SIM_TYPE.equalsIgnoreCase("generic"))) {
        System.exit(-2);
    }

    conf = new Configuration();
    job = new JobConf(conf, SOF.class);
    job.setJobName(SIMULATION_NAME/*SIMULATION NAME*/);
    job.set("simulation.home", SIMULATION_HOME);
    job.set("simulation.name", SIMULATION_NAME);
    job.set("simulation.type", SIM_TYPE);

    if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.set("simulation.interpreter.genericsim", SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH);
    }

    job.set("simulation.program.simulation", SIM_EXECUTABLE_SIMULATION_PROGRAM);
    job.set("simulation.executable.input", SIM_EXECUTION_INPUT_DATA_MAPPER);
    job.set("simulation.executable.output", SIM_EXECUTION_OUTPUT_MAPPER);
    job.setBoolean("simulation.executable.mode", ISLOOP);
    //job.set("simulation.executable.mode", ISLOOP);
    job.set("simulation.executable.author", AUTHOR);
    //job.set("simulation.executable.description", DESCRIPTION);
    job.set("simulation.description.output.domain", SIM_DESCRIPTION_OUTPUT_XML_DOMAIN);

    /**
     * GENERATES THE .TMP FILE
     * COMMENT OUT THE LINE
     * LOCAL TEST ONLY
     */
    //XmlToText.convertXmlFileToFileText(conf,"/home/lizard87/Desktop/mason_test/input.xml");
    //XmlToText.convertXmlFileToFileText(conf,"/home/lizard87/Desktop/input.xml");
    //XmlToText.convertXmlFileToFileText(conf,"/home/lizard87/Desktop/aids/input.xml");

    if (ISLOOP) {
        job.set("simulation.description.input", SIM_EXECUTION_INPUT_XML);
        job.set("simulation.program.rating", EXECUTABLE_RATING_FILE);
        //job.set("simulation.interpreter.selection", INTERPRETER_REMOTE_PATH_SELECTION);
        job.set("simulation.interpreter.rating", INTERPRETER_REMOTE_PATH_EVALUATION);
        job.set("simulation.executable.loop.rating", SIM_RATING_PATH);
    }

    FileInputFormat.addInputPath(job, new Path(SIM_EXECUTION_INPUT_DATA_MAPPER)/*DIRECTORY INPUT*/);
    FileOutputFormat.setOutputPath(job, new Path(SIM_EXECUTION_OUTPUT_MAPPER));

    if (SIM_TYPE.equalsIgnoreCase("mason")) {
        job.setMapperClass(SOFMapperMason.class);
        job.setReducerClass(SOFReducerMason.class);

    } else if (SIM_TYPE.equalsIgnoreCase("netlogo")) {

        job.setMapperClass(SOFMapperNetLogo.class);
        job.setReducerClass(SOFReducerNetLogo.class);
    } else if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.setMapperClass(SOFMapperGeneric.class);
        job.setReducerClass(SOFReducerGeneric.class);
    }

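    // Whichever simulation type was selected, the mappers and reducers emit Text keys and values.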
    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    JobClient jobc;

    try {
        jobc = new JobClient(job);
        System.out.println(jobc + " " + job);
        // runJob blocks until the job completes and throws IOException on failure
        RunningJob runjob = JobClient.runJob(job);
        System.exit(runjob.isSuccessful() ? 0 : -1);
    } catch (IOException e) {

        e.printStackTrace();
    }

}

From source file: Iterator.SpeciesIterDriver2.java

@SuppressWarnings("deprecation")
public static void main(String[] args) {

    int iterationCount = 0;

    while (iterationCount <= 20) {

        System.out.println("Running Iteration - " + iterationCount);
        JobClient client = new JobClient();
        JobConf conf = new JobConf(SpeciesIterDriver2.class);
        conf.setJobName("Species Iter - " + iterationCount);

        // This property is set to generate 5 reducer tasks
        conf.setNumReduceTasks(5);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        //output-iterator-0 contains the input data
        FileInputFormat.setInputPaths(conf, new Path("output-iterator-" + iterationCount));
        iterationCount++;
        FileOutputFormat.setOutputPath(conf, new Path("output-iterator-" + iterationCount));

        conf.setMapperClass(SpeciesIterMapper2.class);
        conf.setReducerClass(SpeciesIterReducer2.class);
        conf.setCombinerClass(SpeciesIterReducer2.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

From source file: ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java

License: Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");

    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write((s + "\n").getBytes());
    }
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
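    // The key/value classes must still be declared even though NullOutputFormat
    // discards all records; this job is run only for its counters.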
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    long totalNumTerms = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS")
            .getCounter();

    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);

    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    conf.setLong("Ivory.IndexNumberOfTerms", totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute some # docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);

        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();

        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);

        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLenght: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);

    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}

From source file: ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java

License: Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    //      sLogger.setLevel(Level.DEBUG);

    sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors");

    JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String vocabFile = getConf().get("Ivory.FinalVocab");
    DistributedCache.addCacheFile(new URI(vocabFile), conf);

    Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs"));
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return -1;
    }
    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
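    // Map-only job: the final output types must match the map output types declared above.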
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();

    RunningJob rj = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    Counters counters = rj.getCounters();

    long numOfDocs = (long) counters.findCounter(Docs.Total).getCounter();

    return (int) numOfDocs;
}

From source file: ivory.core.preprocess.BuildWeightedIntDocVectors.java

License: Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.setLevel(Level.WARN);

    sLogger.info("PowerTool: GetWeightedIntDocVectors");

    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String dfByIntFilePath = env.getDfByIntData();
    String cfByIntFilePath = env.getCfByIntData();

    /* add df table to cache */
    if (!fs.exists(new Path(dfByIntFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf);

    /* add cf table to cache */
    if (!fs.exists(new Path(cfByIntFilePath))) {
        throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return 0;
    }

    //fs.delete(weightedVectorsPath, true);

    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);

    long startTime = System.currentTimeMillis();

    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}