Example usage for org.apache.hadoop.mapred JobConf setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) 

Source Link

Document

Set the value class for the map output data.

Usage

From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java

License:Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.info("PowerTool: GetWeightedTermDocVectors");

    JobConf conf = new JobConf(BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    String termsFilePath = env.getIndexTermsData();
    String termsIdsFilePath = env.getIndexTermIdsData();
    String termIdMappingFilePath = env.getIndexTermIdMappingData();
    String dfByTermFilePath = env.getDfByTermData();

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        //fs.delete(weightedVectorsPath, true);
        sLogger.info("Output path already exists!");
        return 0;
    }//from www  . j  a  v a  2  s.c  o m

    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath))
            || !fs.exists(new Path(termIdMappingFilePath))) {
        throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/"
                + termIdMappingFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf);

    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);
    conf.setJobName("GetWeightedTermDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE));
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
    if (getConf().get("Ivory.ShortDocLengths") != null) {
        conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths"));
    }
    conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel"));

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(HMapSFW.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapSFW.class);

    sLogger.info("Running job: " + conf.getJobName());

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.index.BuildIntPostingsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }// ww w .  j  av a  2s .com
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }//from  w  w  w .  j a va  2  s.c o  m

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:ivory.preprocess.BuildIntDocVectors.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool/*from   w ww.j  a  va 2 s .  c o  m*/
    JobConf conf = new JobConf(getConf(), BuildIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = env.readCollectionName();

    sLogger.info("PowerTool: BuildIntDocVectors");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info("This is new!");
    String termsFile = env.getIndexTermsData();
    String termIDsFile = env.getIndexTermIdsData();
    String idToTermFile = env.getIndexTermIdMappingData();

    Path termsFilePath = new Path(termsFile);
    Path termIDsFilePath = new Path(termIDsFile);

    if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)) {
        sLogger.error("Error, terms files don't exist!");
        return 0;
    }

    Path outputPath = new Path(env.getIntDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("IntDocVectors already exist: skipping!");
        return 0;
    }

    DistributedCache.addCacheFile(new URI(termsFile), conf);
    DistributedCache.addCacheFile(new URI(termIDsFile), conf);
    DistributedCache.addCacheFile(new URI(idToTermFile), conf);

    conf.setJobName("BuildIntDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, env.getTermDocVectorsDirectory());
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(LazyIntDocVector.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(LazyIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.preprocess.BuildIntDocVectorsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    String intDocVectorsPath;//from w w  w  . j  a  v  a 2 s . co m
    String forwardIndexPath;
    if (buildWeighted) {
        intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
        forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
        intDocVectorsPath = env.getIntDocVectorsDirectory();
        forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.preprocess.BuildTermDocVectors.java

License:Apache License

@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool// ww  w . jav a 2 s . c o m
    JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = conf.get("Ivory.CollectionName");
    String collectionPath = conf.get("Ivory.CollectionPath");
    String inputFormat = conf.get("Ivory.InputFormat");
    String tokenizer = conf.get("Ivory.Tokenizer");
    String mappingClass = conf.get("Ivory.DocnoMappingClass");

    sLogger.info("PowerTool: BuildTermDocVectors");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - CollectionPath: " + collectionPath);
    sLogger.info(" - InputputFormat: " + inputFormat);
    sLogger.info(" - Tokenizer: " + tokenizer);
    sLogger.info(" - DocnoMappingClass: " + mappingClass);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + 0);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        sLogger.error("Error, docno mapping data file " + mappingFile + "doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    conf.setJobName("BuildTermDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    if (collectionPath.indexOf(",") == -1) {
        FileInputFormat.setInputPaths(conf, new Path(collectionPath));
        sLogger.info("Adding input path " + collectionPath);
    } else {
        String[] paths = collectionPath.split(",");
        for (String p : paths) {
            FileInputFormat.addInputPath(conf, new Path(p));
            sLogger.info("Adding input path " + p);
        }
    }

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("TermDocVectors already exist: Skipping!");
    } else {
        env.writeCollectionName(collectionName);
        env.writeCollectionPath(collectionPath);
        env.writeInputFormat(inputFormat);
        env.writeDocnoMappingClass(mappingClass);
        env.writeTokenizerClass(tokenizer);

        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.task.timeout", 60000000);

        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat));
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

        conf.setMapOutputKeyClass(IntWritable.class);
        conf.setMapOutputValueClass(LazyTermDocVector.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(LazyTermDocVector.class);

        conf.setMapperClass(MyMapper.class);

        long startTime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        Counters counters = job.getCounters();

        // write out number of postings
        int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter();
        env.writeCollectionDocumentCount(collectionDocCount);
    }

    if (fs.exists(env.getDoclengthsData())) {
        sLogger.info("DocLength data exists: Skipping!");
        return 0;
    }

    int collectionDocCount = env.readCollectionDocumentCount();
    long startTime = System.currentTimeMillis();
    writeDoclengthsData(collectionDocCount);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file:ivory.preprocess.BuildTermDocVectorsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) {
        sLogger.info("Error: TermDocVectors don't exist!");
        return 0;
    }//  w  w  w.j a v a  2 s.  c  om

    if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.preprocess.BuildTermIdMap.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool//w  ww. ja  va 2  s  .c o  m
    JobConf conf = new JobConf(getConf(), BuildTermIdMap.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = 1;
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);

    sLogger.info("PowerTool: BuildTermIdMap");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    if (!fs.exists(new Path(indexPath))) {
        sLogger.error("index path doesn't existing: skipping!");
        return 0;
    }

    Path termsFilePath = new Path(env.getIndexTermsData());
    Path termIDsFilePath = new Path(env.getIndexTermIdsData());
    Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());
    Path dfByIntFilePath = new Path(env.getDfByIntData());
    Path cfByIntFilePath = new Path(env.getCfByIntData());

    if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath)
            || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath) || fs.exists(dfByIntFilePath)
            || fs.exists(cfByIntFilePath)) {
        sLogger.info("term and term id data exist: skipping!");
        return 0;
    }

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    conf.setJobName("BuildTermIdMap:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionTermCount", (int) env.readCollectionTermCount());
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(conf, tmpPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
}

From source file:ivory.preprocess.GetTermCount.java

License:Apache License

public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool// ww  w  .j a  va2  s  .c o m
    JobConf conf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        sLogger.info("index path doesn't existing: skipping!");
        return 0;
    }

    sLogger.info("PowerTool: GetTermCount");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
    sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        sLogger.error("TermDfCf directory exist: skipping!");
        return 0;
    }

    conf.setJobName("GetTermCount:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);

    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // write out number of postings
    int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
    env.writeCollectionTermCount(collectionTermCount);
    // NOTE: this value is not the same as number of postings, because
    // postings for non-English terms are discarded, or as result of df cut

    long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
    env.writeCollectionLength(collectionLength);
    return 0;
}

From source file:ivory.ptc.AnchorTextInvertedIndex.java

License:Apache License

@Override
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);
    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for (String param : params) {
        DistributedCache.addCacheFile(new URI(param), conf);
    }/* w  w  w  .  j av  a  2 s. c  om*/

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath);
    JobClient.runJob(conf);
    return 0;
}