Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass)

Source Link

Document

Set the key class for the map output data.

Usage

From source file:io.fluo.stress.trie.Unique.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    if (args.length < 1) {
        log.error("Usage: " + this.getClass().getSimpleName() + "<input dir>{ <input dir>}");
        System.exit(-1);//from  w w  w.  j a  v  a  2 s .  c om
    }

    JobConf job = new JobConf(getConf());

    job.setJobName(Unique.class.getName());
    job.setJarByClass(Unique.class);

    job.setInputFormat(SequenceFileInputFormat.class);
    for (String arg : args) {
        SequenceFileInputFormat.addInputPath(job, new Path(arg));
    }

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(UniqueReducer.class);

    job.setOutputFormat(NullOutputFormat.class);

    RunningJob runningJob = JobClient.runJob(job);
    runningJob.waitForCompletion();
    numUnique = (int) runningJob.getCounters().getCounter(Stats.UNIQUE);

    log.debug("numUnique : " + numUnique);

    return runningJob.isSuccessful() ? 0 : -1;

}

From source file:ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java

License:Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    //      sLogger.setLevel(Level.DEBUG);

    sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors");

    JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String vocabFile = getConf().get("Ivory.FinalVocab");
    DistributedCache.addCacheFile(new URI(vocabFile), conf);

    Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs"));
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return -1;
    }//from w  ww  .ja  v  a2s .  c  o  m
    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();

    RunningJob rj = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    Counters counters = rj.getCounters();

    long numOfDocs = (long) counters.findCounter(Docs.Total).getCounter();

    return (int) numOfDocs;
}

From source file:ivory.core.preprocess.BuildWeightedIntDocVectors.java

License:Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.setLevel(Level.WARN);

    sLogger.info("PowerTool: GetWeightedIntDocVectors");

    // create a new JobConf, inheriting from the configuration of this
    // PowerTool//from w  w w.java  2 s .  c  om
    JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String dfByIntFilePath = env.getDfByIntData();
    String cfByIntFilePath = env.getCfByIntData();

    /* add df table to cache */
    if (!fs.exists(new Path(dfByIntFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf);

    /* add cf table to cache */
    if (!fs.exists(new Path(cfByIntFilePath))) {
        throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return 0;
    }

    //fs.delete(weightedVectirsPath, true);

    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);

    long startTime = System.currentTimeMillis();

    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java

License:Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.info("PowerTool: GetWeightedTermDocVectors");

    JobConf conf = new JobConf(BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    String termsFilePath = env.getIndexTermsData();
    String termsIdsFilePath = env.getIndexTermIdsData();
    String termIdMappingFilePath = env.getIndexTermIdMappingData();
    String dfByTermFilePath = env.getDfByTermData();

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        //fs.delete(weightedVectorsPath, true);
        sLogger.info("Output path already exists!");
        return 0;
    }/*from   w  w w  . j a va  2  s .co m*/

    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath))
            || !fs.exists(new Path(termIdMappingFilePath))) {
        throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/"
                + termIdMappingFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf);

    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);
    conf.setJobName("GetWeightedTermDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE));
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    if (getConf().get("Ivory.ShortDocLengths") != null) {
        conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths"));
    }
    conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel"));

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(HMapSFW.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapSFW.class);

    sLogger.info("Running job: " + conf.getJobName());

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.index.BuildIntPostingsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }//  w  w  w  . j a v  a 2  s  .  co  m
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }/*from www.  j  av a  2 s .c  om*/

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:ivory.preprocess.BuildIntDocVectors.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool/* ww w. ja va  2s .  com*/
    JobConf conf = new JobConf(getConf(), BuildIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = env.readCollectionName();

    sLogger.info("PowerTool: BuildIntDocVectors");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info("This is new!");
    String termsFile = env.getIndexTermsData();
    String termIDsFile = env.getIndexTermIdsData();
    String idToTermFile = env.getIndexTermIdMappingData();

    Path termsFilePath = new Path(termsFile);
    Path termIDsFilePath = new Path(termIDsFile);

    if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)) {
        sLogger.error("Error, terms files don't exist!");
        return 0;
    }

    Path outputPath = new Path(env.getIntDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("IntDocVectors already exist: skipping!");
        return 0;
    }

    DistributedCache.addCacheFile(new URI(termsFile), conf);
    DistributedCache.addCacheFile(new URI(termIDsFile), conf);
    DistributedCache.addCacheFile(new URI(idToTermFile), conf);

    conf.setJobName("BuildIntDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, env.getTermDocVectorsDirectory());
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(LazyIntDocVector.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(LazyIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.preprocess.BuildIntDocVectorsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    String intDocVectorsPath;/*from  w  ww. j  a  va  2  s .  c o  m*/
    String forwardIndexPath;
    if (buildWeighted) {
        intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
        forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
        intDocVectorsPath = env.getIntDocVectorsDirectory();
        forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.preprocess.BuildTermDocVectors.java

License:Apache License

@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool/* w  w w  . j a  v  a 2  s .co  m*/
    JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = conf.get("Ivory.CollectionName");
    String collectionPath = conf.get("Ivory.CollectionPath");
    String inputFormat = conf.get("Ivory.InputFormat");
    String tokenizer = conf.get("Ivory.Tokenizer");
    String mappingClass = conf.get("Ivory.DocnoMappingClass");

    sLogger.info("PowerTool: BuildTermDocVectors");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - CollectionPath: " + collectionPath);
    sLogger.info(" - InputputFormat: " + inputFormat);
    sLogger.info(" - Tokenizer: " + tokenizer);
    sLogger.info(" - DocnoMappingClass: " + mappingClass);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + 0);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        sLogger.error("Error, docno mapping data file " + mappingFile + "doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    conf.setJobName("BuildTermDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    if (collectionPath.indexOf(",") == -1) {
        FileInputFormat.setInputPaths(conf, new Path(collectionPath));
        sLogger.info("Adding input path " + collectionPath);
    } else {
        String[] paths = collectionPath.split(",");
        for (String p : paths) {
            FileInputFormat.addInputPath(conf, new Path(p));
            sLogger.info("Adding input path " + p);
        }
    }

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("TermDocVectors already exist: Skipping!");
    } else {
        env.writeCollectionName(collectionName);
        env.writeCollectionPath(collectionPath);
        env.writeInputFormat(inputFormat);
        env.writeDocnoMappingClass(mappingClass);
        env.writeTokenizerClass(tokenizer);

        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.task.timeout", 60000000);

        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat));
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

        conf.setMapOutputKeyClass(IntWritable.class);
        conf.setMapOutputValueClass(LazyTermDocVector.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(LazyTermDocVector.class);

        conf.setMapperClass(MyMapper.class);

        long startTime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        Counters counters = job.getCounters();

        // write out number of postings
        int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter();
        env.writeCollectionDocumentCount(collectionDocCount);
    }

    if (fs.exists(env.getDoclengthsData())) {
        sLogger.info("DocLength data exists: Skipping!");
        return 0;
    }

    int collectionDocCount = env.readCollectionDocumentCount();
    long startTime = System.currentTimeMillis();
    writeDoclengthsData(collectionDocCount);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file:ivory.preprocess.BuildTermDocVectorsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) {
        sLogger.info("Error: TermDocVectors don't exist!");
        return 0;
    }//from w w w . j a va 2 s . c o m

    if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}