Example usage for org.apache.hadoop.mapred JobConf getInt

List of usage examples for org.apache.hadoop.mapred JobConf getInt

Introduction

On this page you can find example usage of org.apache.hadoop.mapred JobConf getInt.

Prototype

public int getInt(String name, int defaultValue) 

Document

Get the value of the name property as an int. If no such property exists, the provided default value is returned.

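For orientation, here is a minimal, self-contained sketch of getInt in isolation, before the real-world usages below; the property names and values are hypothetical and are not taken from any of the listed source files.

import org.apache.hadoop.mapred.JobConf;

public class GetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Hypothetical property, set explicitly for the demonstration.
        conf.setInt("example.io.buffer.size", 4096);

        // Property is present: the stored value is returned.
        int bufferSize = conf.getInt("example.io.buffer.size", 1024); // 4096

        // Property is absent: the supplied default is returned instead.
        int maxRetries = conf.getInt("example.max.retries", 3); // 3

        System.out.println("bufferSize=" + bufferSize + ", maxRetries=" + maxRetries);
    }
}
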
Usage

From source file: org.apache.tez.mapreduce.hadoop.TestDeprecatedKeys.java

License: Apache License

@Test(timeout = 5000)
/**
 * Set of keys that can be overridden at Tez runtime
 */
public void verifyTezOverridenKeys() {
    JobConf jobConf = new JobConf();
    jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 2000);
    jobConf.setInt(MRJobConfig.IO_SORT_MB, 100);
    jobConf.setInt(MRJobConfig.COUNTERS_MAX_KEY, 100);

    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 1000);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 200);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, true);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, 20);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT, 0.2f);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 20);
    jobConf.setInt(Constants.TEZ_RUNTIME_TASK_MEMORY, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_FAILURES_LIMIT, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_NOTIFY_READERROR, true);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, true);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 10.0f);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 10.0f);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 10.0f);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, true);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 10.0f);
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, "DefaultSorter");
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_GROUP_COMPARATOR_CLASS, "groupComparator");
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_SECONDARY_COMPARATOR_CLASS, "SecondaryComparator");

    jobConf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, true);

    MRHelpers.translateMRConfToTez(jobConf);

    assertEquals(1000, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 0));
    assertEquals(200, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 100));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, false));
    assertEquals(20, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, 0));
    assertEquals(20, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 0));
    assertEquals(10, jobConf.getInt(Constants.TEZ_RUNTIME_TASK_MEMORY, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_FAILURES_LIMIT, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_NOTIFY_READERROR, false));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, false));
    assertEquals(10.0f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0.0f), 0.0f);
    assertEquals(10.0f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 0.0f), 0.0f);
    assertEquals(10.0f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 0.0f),
            0.0f);
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, false));
    assertEquals(10.0f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 0.0f), 0.0f);
    assertEquals("DefaultSorter", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, ""));
    assertEquals("groupComparator",
            jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_GROUP_COMPARATOR_CLASS, ""));
    assertEquals("SecondaryComparator",
            jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_SECONDARY_COMPARATOR_CLASS, ""));
    assertEquals("DefaultSorter", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, ""));
    assertTrue(jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, false));

    assertNull(jobConf.get(MRConfig.MAPRED_IFILE_READAHEAD));
    assertNull(jobConf.get(MRConfig.MAPRED_IFILE_READAHEAD_BYTES));
    assertNull(jobConf.get(MRJobConfig.RECORDS_BEFORE_PROGRESS));
    assertNull(jobConf.get(MRJobConfig.IO_SORT_FACTOR));
    assertNull(jobConf.get(MRJobConfig.IO_SORT_MB));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_READ_TIMEOUT));
    assertNull(jobConf.get(MRJobConfig.INDEX_CACHE_MEMORY_LIMIT));
    assertNull(jobConf.get(MRJobConfig.MAP_COMBINE_MIN_SPILLS));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_PARALLEL_COPIES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_FETCH_FAILURES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_NOTIFY_READERROR));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_CONNECT_TIMEOUT));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_READ_TIMEOUT));
    assertNull(jobConf.get(MRConfig.SHUFFLE_SSL_ENABLED_KEY));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMTOMEM_ENABLED));
    assertNull(jobConf.get(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT));
    assertNull(jobConf.get(MRJobConfig.GROUP_COMPARATOR_CLASS));
    assertNull(jobConf.get(MRJobConfig.GROUP_COMPARATOR_CLASS));
    assertNull(jobConf.get("map.sort.class"));
}

From source file: org.apache.tez.mapreduce.task.MRRuntimeTask.java

License: Apache License

private static void configureMRTask(JobConf job, MRTask task) throws IOException, InterruptedException {

    Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
    job.setCredentials(credentials);
    // TODO Can this be avoided altogether? Have the MRTezOutputCommitter use
    // the Tez parameter.
    // TODO This could be fetched from the env if YARN is setting it for all
    // Containers.
    // Set it in conf, so that it can be used by the OutputCommitter.
    job.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, job.getInt(TezJobConfig.APPLICATION_ATTEMPT_ID, -1));

    job.setClass(MRConfig.TASK_LOCAL_OUTPUT_CLASS, YarnOutputFiles.class, MapOutputFile.class); // MR

    Token<JobTokenIdentifier> jobToken = TokenCache.getJobToken(credentials);
    if (jobToken != null) {
        // Will MR ever run without a job token?
        SecretKey sk = JobTokenSecretManager.createSecretKey(jobToken.getPassword());
        task.setJobTokenSecret(sk);
    } else {
        LOG.warn("No job token set");
    }

    job.set(MRJobConfig.JOB_LOCAL_DIR, job.get(TezJobConfig.JOB_LOCAL_DIR));
    job.set(MRConfig.LOCAL_DIR, job.get(TezJobConfig.LOCAL_DIRS));
    if (job.get(TezJobConfig.DAG_CREDENTIALS_BINARY) != null) {
        job.set(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY, job.get(TezJobConfig.DAG_CREDENTIALS_BINARY));
    }

    // setup the child's attempt directories
    // Do the task-type specific localization
    task.localizeConfiguration(job);

    // Set up the DistributedCache related configs
    setupDistributedCacheConfig(job);

    task.setConf(job);
}

From source file: org.archive.access.nutch.jobs.ImportArcs.java

License: LGPL

public void configure(final JobConf job) {
    setConf(job);
    this.indexAll = job.getBoolean("wax.index.all", false);

    this.contentLimit = job.getInt("http.content.limit", 1024 * 100);
    final int pdfMultiplicand = job.getInt("wax.pdf.size.multiplicand", 10);
    this.pdfContentLimit = (this.contentLimit == -1) ? this.contentLimit : pdfMultiplicand * this.contentLimit;
    this.mimeTypes = MimeTypes.get(job.get("mime.types.file"));
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);

    // Get the rsync protocol handler into the mix.
    System.setProperty("java.protocol.handler.pkgs", "org.archive.net");

    // Format numbers output by parse rate logging.
    this.numberFormatter.setMaximumFractionDigits(2);
    this.numberFormatter.setMinimumFractionDigits(2);
    this.parseThreshold = job.getInt("wax.parse.rate.threshold", -1);

    this.indexRedirects = job.getBoolean("wax.index.redirects", false);

    this.sha1 = job.getBoolean("wax.digest.sha1", false);

    this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);
    this.filters = new URLFilters(job);

    this.parseUtil = new ParseUtil(job);

    this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY);

    // Get ARCName by reading first record in ARC?  Otherwise, we parse
    // the name of the file we've been passed to find an ARC name.
    this.arcNameFromFirstRecord = job.getBoolean("wax.arcname.from.first.record", true);

    this.collectionType = job.get(Global.COLLECTION_TYPE);
    this.timeoutIndexingDocument = job.getInt(Global.TIMEOUT_INDEXING_DOCUMENT, -1);

    LOG.info("ImportArcs collectionType: " + collectionType);
}

From source file: org.archive.jbs.lucene.LuceneOutputFormat.java

License: Apache License

public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    // Open Lucene index in ${temp}
    this.fs = FileSystem.get(job);
    this.job = job;
    this.perm = new Path(FileOutputFormat.getOutputPath(job), name);
    this.temp = job.getLocalPath("index/_" + (new Random().nextInt()));

    this.fs.delete(perm, true); // delete old, if any

    indexer = new IndexWriter(new NIOFSDirectory(new File(fs.startLocalOutput(perm, temp).toString())),
            new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);

    indexer.setMergeFactor(job.getInt("jbs.lucene.mergeFactor", 100));
    indexer.setMaxMergeDocs(job.getInt("jbs.lucene.maxMergeDocs", Integer.MAX_VALUE));
    indexer.setRAMBufferSizeMB(job.getInt("jbs.lucene.maxRAMBufferSize", 512));
    indexer.setTermIndexInterval(
            job.getInt("jbs.lucene.termIndexInterval", IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL));
    indexer.setMaxFieldLength(job.getInt("jbs.lucene.max.tokens", Integer.MAX_VALUE));
    indexer.setUseCompoundFile(false);
    indexer.setSimilarity(new WebSimilarity());

    LuceneDocumentWriter docWriter = buildDocumentWriter(job, indexer);

    return new LuceneRecordWriter(docWriter);
}

From source file: org.archive.jbs.lucene.LuceneOutputFormat.java

License: Apache License

/**
 * Factory method which constructs the LuceneDocumentWriter.  Much
 * of the configuration can be controlled via the Hadoop JobConf.
 */
protected LuceneDocumentWriter buildDocumentWriter(JobConf job, IndexWriter indexer) throws IOException {
    CustomAnalyzer analyzer = new CustomAnalyzer(
            job.getBoolean("jbs.lucene.analyzer.custom.omitNonAlpha", true), new HashSet<String>(
                    Arrays.asList(job.get("jbs.lucene.analyzer.stopWords", "").trim().split("\\s+"))));

    LuceneDocumentWriter writer = new LuceneDocumentWriter(indexer, analyzer);

    IDNHelper idnHelper = buildIDNHelper(job);
    TypeNormalizer normalizer = buildTypeNormalizer(job);
    TypeFilter typeFilter = buildTypeFilter(job, normalizer);

    writer.setFilter("reqFields", new RequiredFieldsFilter());
    writer.setFilter("type", typeFilter);
    writer.setFilter("robots", new RobotsFilter());
    writer.setFilter("http", new HTTPStatusCodeFilter(job.get("jbs.httpStatusCodeFilter")));

    int textMaxLength = job.getInt("jbs.lucene.text.maxlength", TextHandler.MAX_LENGTH);

    Map<String, FieldHandler> handlers = new HashMap<String, FieldHandler>();
    handlers.put("url", new SimpleFieldHandler("url", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("digest", new SimpleFieldHandler("digest", Field.Store.YES, Field.Index.NO));
    handlers.put("title", new SimpleFieldHandler("title", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("keywords", new SimpleFieldHandler("keywords", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("description", new SimpleFieldHandler("description", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("length", new SimpleFieldHandler("length", Field.Store.YES, Field.Index.NO));
    handlers.put("collection",
            new SimpleFieldHandler("collection", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    handlers.put("code", new SimpleFieldHandler("code", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    handlers.put("content", new TextHandler("content", textMaxLength));
    handlers.put("boiled", new TextHandler("boiled", textMaxLength));
    handlers.put("date", new DateHandler());
    handlers.put("site", new SiteHandler(idnHelper));
    handlers.put("type", new TypeHandler(normalizer));
    handlers.put("boost", new BoostHandler());

    writer.setHandlers(handlers);

    return writer;
}

From source file: org.archive.jbs.lucene.NutchWAXOutputFormat.java

License: Apache License

protected LuceneDocumentWriter buildDocumentWriter(JobConf job, IndexWriter indexer) throws IOException {
    // This configuration property must be set to an actual file,
    // otherwise the Nutch CommonGrams class (which is invoked by the
    // NutchDocumentAnalyzer) explodes.  This empty "common-terms.utf8"
    // file is bundled into the JBs .jar file.
    job.set("analysis.common.terms.file", "common-terms.utf8");

    Analyzer analyzer = new org.apache.nutch.analysis.NutchDocumentAnalyzer(job);

    LuceneDocumentWriter writer = new LuceneDocumentWriter(indexer, analyzer);

    TypeNormalizer normalizer = buildTypeNormalizer(job);
    TypeFilter typeFilter = buildTypeFilter(job, normalizer);

    writer.setFilter("reqFields", new RequiredFieldsFilter());
    writer.setFilter("type", typeFilter);
    writer.setFilter("robots", new RobotsFilter());
    writer.setFilter("http", new HTTPStatusCodeFilter(job.get("jbs.httpStatusCodeFilter")));

    int textMaxLength = job.getInt("jbs.lucene.text.maxlength", TextHandler.MAX_LENGTH);

    Map<String, FieldHandler> handlers = new HashMap<String, FieldHandler>();
    handlers.put("url", new SimpleFieldHandler("url", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("digest", new SimpleFieldHandler("digest", Field.Store.YES, Field.Index.NO));
    handlers.put("title", new SimpleFieldHandler("title", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("keywords", new SimpleFieldHandler("keywords", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("description", new SimpleFieldHandler("description", Field.Store.YES, Field.Index.ANALYZED));
    handlers.put("length", new SimpleFieldHandler("length", Field.Store.YES, Field.Index.NO));
    handlers.put("code", new SimpleFieldHandler("code", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    handlers.put("content", new TextHandler("content", textMaxLength));
    handlers.put("boiled", new TextHandler("boiled", textMaxLength));
    handlers.put("date", new DateHandler());
    handlers.put("site", new NutchWAXSiteHandler());
    handlers.put("type", new TypeHandler(normalizer));
    handlers.put("boost", new BoostHandler());

    String collection = job.get("jbs.lucene.collection", null);
    if (collection == null)
        handlers.put("collection",
                new SimpleFieldHandler("collection", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    else
        handlers.put("collection", new FixedValueFieldHandler("collection", collection, Field.Store.YES,
                Field.Index.NOT_ANALYZED_NO_NORMS));

    writer.setHandlers(handlers);

    return writer;
}

From source file: org.archive.jbs.solr.SolrOutputFormat.java

License: Apache License

public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    String serverUrl = job.get("jbs.solr.url", "http://localhost:8983/solr");
    int docBufSize = job.getInt("jbs.solr.bufSize", 10);

    SolrDocumentWriter solrDocWriter = new SolrDocumentWriter(new URL(serverUrl), docBufSize);

    // FIXME: Temporary collection hack
    solrDocWriter.collectionHack = job.get("jbs.solr.collectionHack", null);

    TypeNormalizer normalizer = new TypeNormalizer();
    Map<String, String> aliases = normalizer.parseAliases(job.get("jbs.typeNormalizer.aliases", ""));

    if (job.getBoolean("jbs.typeNormalizer.useDefaults", true)) {
        Map<String, String> defaults = normalizer.getDefaultAliases();
        defaults.putAll(aliases);

        aliases = defaults;
    }
    normalizer.setAliases(aliases);

    TypeFilter typeFilter = new TypeFilter();
    Set<String> allowedTypes = typeFilter.parse(job.get("jbs.typeFilter.allowed", ""));

    if (job.getBoolean("jbs.typeFilter.useDefaults", true)) {
        Set<String> defaults = typeFilter.getDefaultAllowed();
        defaults.addAll(allowedTypes);

        allowedTypes = defaults;
    }
    typeFilter.setAllowed(allowedTypes);
    typeFilter.setTypeNormalizer(normalizer);

    solrDocWriter.setFilter("reqFields", new RequiredFieldsFilter());
    solrDocWriter.setFilter("type", typeFilter);
    solrDocWriter.setFilter("robots", new RobotsFilter());

    solrDocWriter.setTypeNormalizer(normalizer);
    solrDocWriter.setIDNHelper(buildIDNHelper(job));

    return new SolrRecordWriter(solrDocWriter);
}

From source file: org.commoncrawl.hadoop.io.ARCInputFormat.java

License: Open Source License

/**
 * @inheritDoc
 */
public void configure(JobConf job) {

    blockSize = job.getInt(P_IO_BLOCK_SIZE, 32 * 1024);
    int bufferSize = job.getInt(P_IO_BUFFER_SIZE, 10 * 1024 * 1024);
    int queueSize = Math.max(1, bufferSize / blockSize);
    int timeout = job.getInt(P_IO_TIMEOUT, 60 * 1000);

    ArcFileReader.setBlockSize(blockSize);
    ArcFileReader.setBufferQueueSize(queueSize);
    ArcFileReader.setIOTimeoutValue(timeout);

    LOG.info("Block Size: " + blockSize);
    LOG.info("Queue Size: " + queueSize);
    LOG.info("IO Timeout: " + timeout);

    Class archiveSourceClass = job.getClass(P_ARC_SOURCE, JetS3tARCSource.class, ARCSource.class);
    arcSource = (ARCSource) ReflectionUtils.newInstance(archiveSourceClass, job);
}

From source file: org.commoncrawl.hadoop.io.ARCSplitCalculator.java

License: Open Source License

/**
 * @inheritDoc
 */
public final void configure(JobConf job) {
    filesPerSplit = job.getInt(P_FILES_PER_SPLIT, 1);
    bytesPerSplit = job.get(P_MB_PER_SPLIT) == null ? Long.MAX_VALUE
            : Long.parseLong(job.get(P_MB_PER_SPLIT)) * 1024 * 1024;
    configureImpl(job);
}

From source file: org.commoncrawl.hadoop.io.deprecated.JetS3tARCSource.java

License: Apache License

/**
 * @inheritDoc
 */
@Override
protected void configureImpl(JobConf job) {
    try {

        // Pull credentials from the configuration
        String awsAccessKeyId = getProperty(job, P_AWS_ACCESS_KEY_ID);
        String awsSecretAccessKey = getProperty(job, P_AWS_SECRET_ACCESS_KEY);
        String bucketName = getProperty(job, P_BUCKET_NAME);

        // Instantiate JetS3t classes
        AWSCredentials awsCredentials = new AWSCredentials(awsAccessKeyId, awsSecretAccessKey);
        service = new RestS3Service(awsCredentials);
        // enable requester pays feature flag
        //service.setRequesterPaysEnabled(true);
        bucket = new S3Bucket(bucketName);

        maxTries = job.getInt(P_MAX_TRIES, 4);

    } catch (S3ServiceException e) {
        throw new RuntimeException(e);
    }
}