Example usage for org.apache.hadoop.mapred JobConf getInt

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf.getInt.

Prototype

public int getInt(String name, int defaultValue)

Source Link

Document

Gets the value of the name property as an int; defaultValue is returned when the property is not set.

Usage

From source file:ivory.preprocess.BuildTermDocVectors.java

License:Apache License

/**
 * Builds the term document vectors for a collection and then writes the
 * document length data.
 *
 * <p>Settings are read from the {@code Ivory.*} configuration properties
 * (index path, collection name/path, input format, tokenizer, docno mapping
 * class and number of map tasks). The term-doc-vector job is map-only and is
 * skipped when its output directory already exists; the document length step
 * is skipped when the doc length data already exists.
 *
 * @return 0 on every path, including the early exit when the docno mapping
 *         file is missing
 * @throws Exception if configuring or running either job fails
 */
@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = conf.get("Ivory.CollectionName");
    String collectionPath = conf.get("Ivory.CollectionPath");
    String inputFormat = conf.get("Ivory.InputFormat");
    String tokenizer = conf.get("Ivory.Tokenizer");
    String mappingClass = conf.get("Ivory.DocnoMappingClass");

    sLogger.info("PowerTool: BuildTermDocVectors");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - CollectionPath: " + collectionPath);
    sLogger.info(" - InputFormat: " + inputFormat);
    sLogger.info(" - Tokenizer: " + tokenizer);
    sLogger.info(" - DocnoMappingClass: " + mappingClass);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + 0);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    // The docno mapping is shipped to the tasks through the distributed
    // cache, so there is nothing to do without it.
    if (!fs.exists(mappingFile)) {
        sLogger.error("Error, docno mapping data file " + mappingFile + " doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    conf.setJobName("BuildTermDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    // The collection path may be a single path or a comma-separated list.
    if (collectionPath.indexOf(",") == -1) {
        FileInputFormat.setInputPaths(conf, new Path(collectionPath));
        sLogger.info("Adding input path " + collectionPath);
    } else {
        String[] paths = collectionPath.split(",");
        for (String p : paths) {
            FileInputFormat.addInputPath(conf, new Path(p));
            sLogger.info("Adding input path " + p);
        }
    }

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("TermDocVectors already exist: Skipping!");
    } else {
        // Record the collection metadata in the index before running the job.
        env.writeCollectionName(collectionName);
        env.writeCollectionPath(collectionPath);
        env.writeInputFormat(inputFormat);
        env.writeDocnoMappingClass(mappingClass);
        env.writeTokenizerClass(tokenizer);

        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.task.timeout", 60000000);

        FileOutputFormat.setOutputPath(conf, outputPath);

        // The input format is named in the configuration, so it has to be
        // loaded reflectively; this is the unchecked cast being suppressed.
        conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat));
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

        conf.setMapOutputKeyClass(IntWritable.class);
        conf.setMapOutputValueClass(LazyTermDocVector.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(LazyTermDocVector.class);

        conf.setMapperClass(MyMapper.class);

        long startTime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        Counters counters = job.getCounters();

        // write out the number of documents in the collection
        int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter();
        env.writeCollectionDocumentCount(collectionDocCount);
    }

    if (fs.exists(env.getDoclengthsData())) {
        sLogger.info("DocLength data exists: Skipping!");
        return 0;
    }

    // Read the count back from the index so this step also works when the
    // term-doc-vector job above was skipped.
    int collectionDocCount = env.readCollectionDocumentCount();
    long startTime = System.currentTimeMillis();
    writeDoclengthsData(collectionDocCount);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file:ivory.preprocess.BuildTermDocVectors.java

License:Apache License

/**
 * Runs the single-mapper job that writes the document length table, then
 * records the docno offset and the average document length in the index.
 *
 * @param collectionDocCount number of documents in the collection; passed to
 *        the job and used as the divisor for the average document length
 * @throws IOException if the file system cannot be obtained or the job fails
 */
private void writeDoclengthsData(int collectionDocCount) throws IOException {
    JobConf jobConf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fileSystem = FileSystem.get(jobConf);

    // Settings taken from the tool configuration.
    final String indexRoot = jobConf.get("Ivory.IndexPath");
    final String collection = jobConf.get("Ivory.CollectionName");
    final int offset = jobConf.getInt("Ivory.DocnoOffset", 0);

    RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);
    final Path docLengthsFile = environment.getDoclengthsData();
    final Path docLengthsDir = environment.getDoclengthsDirectory();

    sLogger.info("Writing doc length data to " + docLengthsFile + "...");

    // Job identity and the parameters handed to the mapper.
    jobConf.setJobName("DocLengthTable:" + collection);
    jobConf.setInt("Ivory.CollectionDocumentCount", collectionDocCount);
    jobConf.set("InputPath", docLengthsDir.toString());
    jobConf.set("DocLengthDataFile", docLengthsFile.toString());
    jobConf.set("mapred.child.java.opts", "-Xmx4096m");

    // One map task, no reducers, no speculative duplicates.
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    jobConf.setSpeculativeExecution(false);

    // Null input/output formats: the mapper works from the settings above.
    jobConf.setInputFormat(NullInputFormat.class);
    jobConf.setOutputFormat(NullOutputFormat.class);
    jobConf.setMapperClass(DocLengthDataWriterMapper.class);

    RunningJob finishedJob = JobClient.runJob(jobConf);

    environment.writeDocnoOffset(offset);

    long totalDocLength = finishedJob.getCounters()
            .findCounter(DocLengths.SumOfDocLengths)
            .getCounter();
    float averageDocLength = (float) totalDocLength / collectionDocCount;
    environment.writeCollectionAverageDocumentLength(averageDocLength);
}

From source file:ivory.preprocess.BuildTermDocVectorsForwardIndex.java

License:Apache License

/**
 * Builds the forward index over the term document vectors.
 *
 * <p>Nothing is run when the term document vectors are missing or when the
 * forward index already exists.
 *
 * @return 0 on every path
 * @throws Exception if configuring or running the job fails
 */
public int runTool() throws Exception {
    JobConf jobConf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fileSystem = FileSystem.get(jobConf);

    final String indexRoot = jobConf.get("Ivory.IndexPath");
    RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    final int numMappers = jobConf.getInt("Ivory.NumMapTasks", 0);
    final String collection = environment.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexRoot);
    sLogger.info(" - CollectionName: " + collection);
    sLogger.info(" - NumMapTasks: " + numMappers);

    // Precondition: the vectors to index must already be on disk.
    boolean vectorsPresent = fileSystem.exists(new Path(environment.getTermDocVectorsDirectory()));
    if (!vectorsPresent) {
        sLogger.info("Error: TermDocVectors don't exist!");
        return 0;
    }

    // Do not rebuild an index that is already there.
    boolean indexPresent = fileSystem.exists(new Path(environment.getTermDocVectorsForwardIndex()));
    if (indexPresent) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    jobConf.setJobName("BuildTermDocVectorsForwardIndex:" + collection);

    // Input: the sequence files holding the term document vectors.
    Path vectorsDir = new Path(environment.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(jobConf, vectorsDir);

    jobConf.setNumMapTasks(numMappers);
    jobConf.setNumReduceTasks(1);

    jobConf.set("mapred.child.java.opts", "-Xmx2048m");

    // Sequence-file input, (IntWritable, Text) map output, null output format.
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setMapOutputKeyClass(IntWritable.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputFormat(NullOutputFormat.class);

    jobConf.setMapRunnerClass(MyMapRunner.class);
    jobConf.setReducerClass(MyReducer.class);

    JobClient.runJob(jobConf);

    return 0;
}

From source file:ivory.preprocess.BuildTermIdMap.java

License:Apache License

/**
 * Runs the job that builds the term-to-id mapping data for the index.
 *
 * <p>The job is skipped when the index path does not exist or when any of the
 * term / term-id / df / cf data files is already present. The job's own output
 * goes to the environment's temp directory, which is deleted before and after
 * the run.
 *
 * @return 0 on every path
 * @throws Exception if configuring or running the job fails
 */
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildTermIdMap.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = 1;
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);

    sLogger.info("PowerTool: BuildTermIdMap");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    if (!fs.exists(new Path(indexPath))) {
        sLogger.error("index path doesn't existing: skipping!");
        return 0;
    }

    Path termsFilePath = new Path(env.getIndexTermsData());
    Path termIDsFilePath = new Path(env.getIndexTermIdsData());
    Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());
    Path dfByIntFilePath = new Path(env.getDfByIntData());
    Path cfByIntFilePath = new Path(env.getCfByIntData());

    // Any one of the output files being present counts as "already built".
    if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath)
            || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath) || fs.exists(dfByIntFilePath)
            || fs.exists(cfByIntFilePath)) {
        sLogger.info("term and term id data exist: skipping!");
        return 0;
    }

    // Clear leftovers from a previous run; the job writes its output here.
    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    conf.setJobName("BuildTermIdMap:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionTermCount", (int) env.readCollectionTermCount());
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(conf, tmpPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    // The RunningJob handle is not needed afterwards, so it is not kept.
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
}

From source file:ivory.preprocess.GetTermCount.java

License:Apache License

/**
 * Runs the job that computes per-term counts from the term document vectors,
 * then stores the collection term count and collection length in the index.
 *
 * <p>The job is skipped when the index path does not exist or when the
 * TermDfCf output directory is already present.
 *
 * @return 0 on every path
 * @throws Exception if configuring or running the job fails
 */
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Check that the index exists BEFORE reading anything from it; the
    // reads below (e.g. readCollectionName) presumably depend on index
    // files being present.
    if (!fs.exists(new Path(indexPath))) {
        sLogger.info("index path doesn't existing: skipping!");
        return 0;
    }

    int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    sLogger.info("PowerTool: GetTermCount");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
    sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        sLogger.error("TermDfCf directory exist: skipping!");
        return 0;
    }

    conf.setJobName("GetTermCount:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);

    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // write out the value of the Terms counter as the collection term count
    int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
    env.writeCollectionTermCount(collectionTermCount);
    // NOTE: this value is not the same as number of postings, because
    // postings for non-English terms are discarded, or as result of df cut

    long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
    env.writeCollectionLength(collectionLength);
    return 0;
}

From source file:ivory.ptc.AnchorTextInvertedIndex.java

License:Apache License

/**
 * Configures and runs the job that builds the anchor text inverted index.
 *
 * <p>Input and output paths, task counts and the weighting scheme parameters
 * come from the {@code Ivory.*} configuration properties. Each weighting
 * scheme parameter is added to the distributed cache as a file URI. Any
 * existing output directory is deleted before the job starts.
 *
 * @return always 0
 * @throws Exception if configuring or running the job fails
 */
@Override
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);
    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    // Every parameter is treated as a URI and shipped via the distributed cache.
    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for (String param : params) {
        DistributedCache.addCacheFile(new URI(param), conf);
    }

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Remove any previous output. The explicit recursive flag replaces the
    // deprecated single-argument delete(Path), which deleted recursively.
    fs.delete(outputPath, true);
    JobClient.runJob(conf);
    return 0;
}

From source file:lennard.PiInputFormat.java

License:Apache License

/**
 * Initializes {@code N} from the job configuration.
 *
 * @param conf job configuration; {@code mapred.line.input.format.linespermap}
 *        is read from it, defaulting to 1 when unset
 */
public void configure(JobConf conf) {
    final int linesPerMap = conf.getInt("mapred.line.input.format.linespermap", 1);
    N = linesPerMap;
}

From source file:net.iponweb.hadoop.streaming.avro.AvroAsJsonOutputFormat.java

License:Apache License

/**
 * Applies the job's Avro output settings to a data file writer: compression
 * codec (only when output compression is enabled), sync interval, and any
 * metadata entries stored in the job under the Avro text/binary prefixes.
 *
 * @param writer the writer to configure
 * @param job    job configuration the settings are read from
 * @throws UnsupportedEncodingException if ISO-8859-1 is not supported when
 *         decoding binary metadata values
 */
static <K> void configureDataFileWriter(DataFileWriter<K> writer, JobConf job)
        throws UnsupportedEncodingException {

    // Codec: deflate at the configured level, or whatever codec is named.
    if (FileOutputFormat.getCompressOutput(job)) {
        int deflateLevel = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
                org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String requestedCodec = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory codec;
        if (requestedCodec.equals(DEFLATE_CODEC)) {
            codec = CodecFactory.deflateCodec(deflateLevel);
        } else {
            codec = CodecFactory.fromString(requestedCodec);
        }
        writer.setCodec(codec);
    }

    int syncInterval = job.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
            DEFAULT_SYNC_INTERVAL);
    writer.setSyncInterval(syncInterval);

    // Copy metadata from the job; the prefix is stripped to form the meta key.
    for (Map.Entry<String, String> entry : job) {
        String key = entry.getKey();
        if (key.startsWith(AvroJob.TEXT_PREFIX)) {
            String metaKey = key.substring(AvroJob.TEXT_PREFIX.length());
            writer.setMeta(metaKey, entry.getValue());
        }
        if (key.startsWith(AvroJob.BINARY_PREFIX)) {
            String metaKey = key.substring(AvroJob.BINARY_PREFIX.length());
            // Binary values are stored URL-encoded; decode back to raw bytes.
            byte[] rawValue = URLDecoder.decode(entry.getValue(), "ISO-8859-1").getBytes("ISO-8859-1");
            writer.setMeta(metaKey, rawValue);
        }
    }
}

From source file:net.peacesoft.nutch.crawl.ReLinkDb.java

License:Apache License

/**
 * Reads the link database settings from the job configuration.
 *
 * <p>URL filters and URL normalizers are only created when the corresponding
 * flag is set in the job; both flags default to {@code false}.
 *
 * @param job job configuration to read from
 */
public void configure(JobConf job) {
    maxAnchorLength = job.getInt("db.max.anchor.length", 100);
    ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);

    // Optional URL filtering.
    final boolean filteringEnabled = job.getBoolean(LinkDbFilter.URL_FILTERING, false);
    if (filteringEnabled) {
        urlFilters = new URLFilters(job);
    }

    // Optional URL normalization, in the link database scope.
    final boolean normalizingEnabled = job.getBoolean(LinkDbFilter.URL_NORMALIZING, false);
    if (normalizingEnabled) {
        urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
}

From source file:net.peacesoft.nutch.crawl.ReSolrWriter.java

License:Apache License

/**
 * Initializes this writer from the given Solr server and job configuration.
 *
 * <p>Reads the commit size (default 1000), the Solr field mapping, the delete
 * flag (default {@code false}) and the optional extra request parameters,
 * which are given as a single {@code key=value&key=value} string.
 *
 * @param server Solr server to write to
 * @param job    job configuration the settings are read from
 * @throws IOException declared by the original signature
 */
void init(SolrServer server, JobConf job) throws IOException {
    solr = server;
    commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
    solrMapping = SolrMappingReader.getInstance(job);
    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
    // parse optional params
    params = new ModifiableSolrParams();
    String paramString = job.get(SolrConstants.PARAMS);
    if (paramString != null) {
        for (String pair : paramString.split("&")) {
            // Split on the FIRST '=' only, so a value that itself contains
            // '=' (e.g. "fq=field=value") is kept whole instead of truncated.
            String[] kv = pair.split("=", 2);
            // Entries without a value ("key" or "key=") are skipped.
            if (kv.length < 2 || kv[1].isEmpty()) {
                continue;
            }
            params.add(kv[0], kv[1]);
        }
    }
}