Example usage for org.apache.hadoop.mapred JobConf setInt

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setInt.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.
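
As a quick round-trip illustration, a value stored with setInt can be read back with getInt, whose second argument is the default returned when the key is absent. A minimal sketch (the property key "my.example.threshold" is made up for illustration):

import org.apache.hadoop.mapred.JobConf;

public class SetIntSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store an int under an application-defined key.
        conf.setInt("my.example.threshold", 42);
        // Read it back; 0 is the fallback if the key were missing.
        int threshold = conf.getInt("my.example.threshold", 0); // 42
        // Unset keys fall back to the supplied default.
        int missing = conf.getInt("some.unset.key", -1); // -1
        System.out.println(threshold + " " + missing);
    }
}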

Usage

From source file: io.fluo.stress.trie.NumberIngest.java

License: Apache License

public static void main(String[] args) throws IOException, ConfigurationException {

    // Parse arguments
    if (args.length != 4) {
        log.error("Usage: NumberIngest <numMappers> <numbersPerMapper> <nodeSize> <fluoProps>");
        System.exit(-1);
    }
    int numMappers = Integer.parseInt(args[0]);
    int numPerMapper = Integer.parseInt(args[1]);
    int nodeSize = Integer.parseInt(args[2]);
    String fluoPropsPath = args[3];

    String hadoopPrefix = System.getenv("HADOOP_PREFIX");
    if (hadoopPrefix == null) {
        hadoopPrefix = System.getenv("HADOOP_HOME");
        if (hadoopPrefix == null) {
            log.error("HADOOP_PREFIX or HADOOP_HOME needs to be set!");
            System.exit(-1);
        }
    }

    // create test name
    String testId = String.format("test-%d", (new Date().getTime() / 1000));
    String testDir = "/trie-stress/" + testId;

    setupHdfs(hadoopPrefix, testDir, numMappers, numPerMapper);

    JobConf ingestConf = new JobConf(NumberIngest.class);
    ingestConf.setJobName("NumberIngest");

    FluoConfiguration config = new FluoConfiguration(new File(fluoPropsPath));

    loadConfig(ingestConf, ConfigurationConverter.getProperties(config));
    ingestConf.setInt(TRIE_NODE_SIZE_PROP, nodeSize);

    ingestConf.setOutputKeyClass(LongWritable.class);
    ingestConf.setOutputValueClass(IntWritable.class);
    ingestConf.setMapperClass(NumberIngest.IngestMapper.class);
    ingestConf.setReducerClass(NumberIngest.UniqueReducer.class);

    FileInputFormat.setInputPaths(ingestConf, new Path(testDir + "/input/"));
    FileOutputFormat.setOutputPath(ingestConf, new Path(testDir + "/unique/"));

    RunningJob ingestJob = JobClient.runJob(ingestConf);
    ingestJob.waitForCompletion();
    if (ingestJob.isSuccessful()) {

        JobConf countConf = new JobConf(NumberIngest.class);
        countConf.setJobName("NumberCount");

        countConf.setOutputKeyClass(Text.class);
        countConf.setOutputValueClass(LongWritable.class);
        countConf.setMapperClass(NumberIngest.CountMapper.class);
        countConf.setReducerClass(NumberIngest.CountReducer.class);

        FileInputFormat.setInputPaths(countConf, new Path(testDir + "/unique/"));
        FileOutputFormat.setOutputPath(countConf, new Path(testDir + "/output/"));

        RunningJob countJob = JobClient.runJob(countConf);
        countJob.waitForCompletion();
        if (countJob.isSuccessful()) {
            log.info("Ingest and count jobs were successful");
            log.info("Output can be viewed @ " + testDir);
            System.exit(0);
        } else {
            log.error("Count job failed for " + testId);
        }
    } else {
        log.error("Ingest job failed.  Skipping count job for " + testId);
    }

    System.exit(-1);
}
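
On the mapper side, the value stored above with ingestConf.setInt(TRIE_NODE_SIZE_PROP, nodeSize) would typically be read back from the JobConf in configure(). A hedged sketch of what such a mapper could look like under the old mapred API; the field name, the default of 8, and the map signature are assumptions for illustration, not the project's actual IngestMapper:

public static class IngestMapperSketch extends MapReduceBase
        implements Mapper<LongWritable, Text, LongWritable, IntWritable> {

    private int nodeSize;

    @Override
    public void configure(JobConf job) {
        // Recover the value that the driver stored with setInt();
        // the default of 8 is an assumption for illustration.
        nodeSize = job.getInt(TRIE_NODE_SIZE_PROP, 8);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<LongWritable, IntWritable> output,
            Reporter reporter) throws IOException {
        // ... actual ingest logic elided; it would partition numbers using nodeSize ...
    }
}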

From source file: ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java

License: Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    //      sLogger.setLevel(Level.DEBUG);

    sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors");

    JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String vocabFile = getConf().get("Ivory.FinalVocab");
    DistributedCache.addCacheFile(new URI(vocabFile), conf);

    Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs"));
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return -1;
    }
    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();

    RunningJob rj = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    Counters counters = rj.getCounters();

    long numOfDocs = (long) counters.findCounter(Docs.Total).getCounter();

    return (int) numOfDocs;
}

From source file: ivory.core.preprocess.BuildWeightedIntDocVectors.java

License: Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.setLevel(Level.WARN);

    sLogger.info("PowerTool: GetWeightedIntDocVectors");

    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String dfByIntFilePath = env.getDfByIntData();
    String cfByIntFilePath = env.getCfByIntData();

    /* add df table to cache */
    if (!fs.exists(new Path(dfByIntFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf);

    /* add cf table to cache */
    if (!fs.exists(new Path(cfByIntFilePath))) {
        throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return 0;
    }

    //fs.delete(weightedVectorsPath, true);

    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);

    long startTime = System.currentTimeMillis();

    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
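
Several of these Ivory tools ship side data (df/cf tables, doc lengths) to tasks via DistributedCache.addCacheFile. On the task side, the cached files are typically located in configure(); a minimal sketch of that retrieval pattern (the class name and the key/value types are placeholders, not the project's actual MyMapper):

public static class CacheAwareMapper extends MapReduceBase
        implements Mapper<IntWritable, Text, IntWritable, Text> {

    @Override
    public void configure(JobConf job) {
        try {
            // Local paths of everything registered with DistributedCache.addCacheFile().
            Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
            // ... open the df/cf/doc-length tables from these local paths ...
        } catch (IOException e) {
            throw new RuntimeException("Error locating cached side data", e);
        }
    }

    @Override
    public void map(IntWritable key, Text value, OutputCollector<IntWritable, Text> output,
            Reporter reporter) throws IOException {
        // ... weighting logic elided ...
    }
}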

From source file: ivory.core.preprocess.BuildWeightedTermDocVectors.java

License: Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.info("PowerTool: GetWeightedTermDocVectors");

    JobConf conf = new JobConf(BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    String termsFilePath = env.getIndexTermsData();
    String termsIdsFilePath = env.getIndexTermIdsData();
    String termIdMappingFilePath = env.getIndexTermIdMappingData();
    String dfByTermFilePath = env.getDfByTermData();

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        //fs.delete(weightedVectorsPath, true);
        sLogger.info("Output path already exists!");
        return 0;
    }

    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath))
            || !fs.exists(new Path(termIdMappingFilePath))) {
        throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/"
                + termIdMappingFilePath + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf);

    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);
    conf.setJobName("GetWeightedTermDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE));
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    if (getConf().get("Ivory.ShortDocLengths") != null) {
        conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths"));
    }
    conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel"));

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(HMapSFW.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapSFW.class);

    sLogger.info("Running job: " + conf.getJobName());

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: ivory.index.BuildIntPostingsForwardIndex.java

License: Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file: ivory.index.BuildIPInvertedIndexDocSorted.java

License: Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file: ivory.preprocess.BuildTermDocVectors.java

License: Apache License

@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = conf.get("Ivory.CollectionName");
    String collectionPath = conf.get("Ivory.CollectionPath");
    String inputFormat = conf.get("Ivory.InputFormat");
    String tokenizer = conf.get("Ivory.Tokenizer");
    String mappingClass = conf.get("Ivory.DocnoMappingClass");

    sLogger.info("PowerTool: BuildTermDocVectors");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - CollectionPath: " + collectionPath);
    sLogger.info(" - InputputFormat: " + inputFormat);
    sLogger.info(" - Tokenizer: " + tokenizer);
    sLogger.info(" - DocnoMappingClass: " + mappingClass);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + 0);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        sLogger.error("Error, docno mapping data file " + mappingFile + "doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    conf.setJobName("BuildTermDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    if (collectionPath.indexOf(",") == -1) {
        FileInputFormat.setInputPaths(conf, new Path(collectionPath));
        sLogger.info("Adding input path " + collectionPath);
    } else {
        String[] paths = collectionPath.split(",");
        for (String p : paths) {
            FileInputFormat.addInputPath(conf, new Path(p));
            sLogger.info("Adding input path " + p);
        }
    }

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("TermDocVectors already exist: Skipping!");
    } else {
        env.writeCollectionName(collectionName);
        env.writeCollectionPath(collectionPath);
        env.writeInputFormat(inputFormat);
        env.writeDocnoMappingClass(mappingClass);
        env.writeTokenizerClass(tokenizer);

        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.task.timeout", 60000000);

        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat));
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

        conf.setMapOutputKeyClass(IntWritable.class);
        conf.setMapOutputValueClass(LazyTermDocVector.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(LazyTermDocVector.class);

        conf.setMapperClass(MyMapper.class);

        long startTime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        Counters counters = job.getCounters();

        // write out number of postings
        int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter();
        env.writeCollectionDocumentCount(collectionDocCount);
    }

    if (fs.exists(env.getDoclengthsData())) {
        sLogger.info("DocLength data exists: Skipping!");
        return 0;
    }

    int collectionDocCount = env.readCollectionDocumentCount();
    long startTime = System.currentTimeMillis();
    writeDoclengthsData(collectionDocCount);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file: ivory.preprocess.BuildTermDocVectors.java

License: Apache License

private void writeDoclengthsData(int collectionDocCount) throws IOException {
    JobConf conf = new JobConf(getConf(), GetTermCount.class);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");
    int docnoOffset = conf.getInt("Ivory.DocnoOffset", 0);

    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path dlFile = env.getDoclengthsData();
    Path inputPath = env.getDoclengthsDirectory();

    sLogger.info("Writing doc length data to " + dlFile + "...");

    conf.setJobName("DocLengthTable:" + collectionName);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCount);
    conf.set("InputPath", inputPath.toString());
    conf.set("DocLengthDataFile", dlFile.toString());
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(NullInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(DocLengthDataWriterMapper.class);

    RunningJob job = JobClient.runJob(conf);

    env.writeDocnoOffset(docnoOffset);
    Counters counters = job.getCounters();

    long collectionSumOfDocLengths = (long) counters.findCounter(DocLengths.SumOfDocLengths).getCounter();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);
}

From source file: ivory.preprocess.BuildTermIdMap.java

License: Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildTermIdMap.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = 1;
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);

    sLogger.info("PowerTool: BuildTermIdMap");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    if (!fs.exists(new Path(indexPath))) {
        sLogger.error("index path doesn't existing: skipping!");
        return 0;
    }

    Path termsFilePath = new Path(env.getIndexTermsData());
    Path termIDsFilePath = new Path(env.getIndexTermIdsData());
    Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());
    Path dfByIntFilePath = new Path(env.getDfByIntData());
    Path cfByIntFilePath = new Path(env.getCfByIntData());

    if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath)
            || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath) || fs.exists(dfByIntFilePath)
            || fs.exists(cfByIntFilePath)) {
        sLogger.info("term and term id data exist: skipping!");
        return 0;
    }

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    conf.setJobName("BuildTermIdMap:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionTermCount", (int) env.readCollectionTermCount());
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(conf, tmpPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
}

From source file: ivory.ptc.AnchorTextInvertedIndex.java

License: Apache License

@Override
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);
    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for (String param : params) {
        DistributedCache.addCacheFile(new URI(param), conf);
    }

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath, true); // delete(Path) is deprecated; this matches its recursive behavior
    JobClient.runJob(conf);
    return 0;
}