Example usage for org.apache.hadoop.mapred JobConf getBoolean

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf.getBoolean.

Prototype

public boolean getBoolean(String name, boolean defaultValue) 

Document

Get the value of the name property as a boolean. If no such property is set, or if the stored value is not a valid boolean, the supplied defaultValue is returned.
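
For orientation, here is a minimal, self-contained sketch of how the method is typically called. The property names (example.feature.enabled, example.unset.flag) are hypothetical and used only for illustration.

import org.apache.hadoop.mapred.JobConf;

public class GetBooleanExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Hypothetical property, set here only for the demonstration.
        job.setBoolean("example.feature.enabled", true);

        // Returns the stored value when the property is set.
        boolean enabled = job.getBoolean("example.feature.enabled", false); // true

        // Returns the supplied default when the property is absent or not a valid boolean.
        boolean fallback = job.getBoolean("example.unset.flag", true); // true

        System.out.println("enabled=" + enabled + ", fallback=" + fallback);
    }
}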

Usage

From source file:org.archive.jbs.solr.SolrOutputFormat.java

License:Apache License

public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    String serverUrl = job.get("jbs.solr.url", "http://localhost:8983/solr");
    int docBufSize = job.getInt("jbs.solr.bufSize", 10);

    SolrDocumentWriter solrDocWriter = new SolrDocumentWriter(new URL(serverUrl), docBufSize);

    // FIXME: Temporary collection hack
    solrDocWriter.collectionHack = job.get("jbs.solr.collectionHack", null);

    TypeNormalizer normalizer = new TypeNormalizer();
    Map<String, String> aliases = normalizer.parseAliases(job.get("jbs.typeNormalizer.aliases", ""));

    if (job.getBoolean("jbs.typeNormalizer.useDefaults", true)) {
        Map<String, String> defaults = normalizer.getDefaultAliases();
        defaults.putAll(aliases);

        aliases = defaults;
    }
    normalizer.setAliases(aliases);

    TypeFilter typeFilter = new TypeFilter();
    Set<String> allowedTypes = typeFilter.parse(job.get("jbs.typeFilter.allowed", ""));

    if (job.getBoolean("jbs.typeFilter.useDefaults", true)) {
        Set<String> defaults = typeFilter.getDefaultAllowed();
        defaults.addAll(allowedTypes);

        allowedTypes = defaults;
    }
    typeFilter.setAllowed(allowedTypes);
    typeFilter.setTypeNormalizer(normalizer);

    solrDocWriter.setFilter("reqFields", new RequiredFieldsFilter());
    solrDocWriter.setFilter("type", typeFilter);
    solrDocWriter.setFilter("robots", new RobotsFilter());

    solrDocWriter.setTypeNormalizer(normalizer);
    solrDocWriter.setIDNHelper(buildIDNHelper(job));

    return new SolrRecordWriter(solrDocWriter);
}

From source file:org.archive.jbs.solr.SolrOutputFormat.java

License:Apache License

/**
 * Build an IDNHelper object using configuration information in the JobConf.
 */
protected IDNHelper buildIDNHelper(JobConf job) throws IOException {
    IDNHelper helper = new IDNHelper();

    if (job.getBoolean("jbs.idnHelper.useDefaults", true)) {
        InputStream is = this.getClass().getClassLoader().getResourceAsStream("effective_tld_names.dat");

        if (is == null) {
            throw new RuntimeException("Cannot load default tld rules: effective_tld_names.dat");
        }

        Reader reader = new InputStreamReader(is, "utf-8");

        helper.addRules(reader);
    }

    String moreRules = job.get("jbs.idnHelper.moreRules", "");

    if (moreRules.length() > 0) {
        helper.addRules(new StringReader(moreRules));
    }

    return helper;
}

From source file:org.archive.nutchwax.PageRankDb.java

License:Apache License

public void configure(JobConf job) {
    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
        urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
        urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
}

From source file:org.cloudata.core.parallel.hadoop.TableRowRecordReader.java

License:Apache License

public TableRowRecordReader(JobConf jobConf, CloudataConf conf, TableSplit tableSplit, Reporter reporter)
        throws IOException {
    this.jobConf = jobConf;
    this.reporter = reporter;
    this.skipError = jobConf.getBoolean(AbstractTabletInputFormat.SKIP_ERROR, false);

    RowFilter rowFilter = tableSplit.getRowFilter();
    InputTableInfo inputTableInfo = tableSplit.getInputTableInfo();

    this.startRowKey = rowFilter.getStartRowKey();
    this.endRowKey = rowFilter.getEndRowKey();

    LOG.info("TableSplit: " + inputTableInfo.getTableName() + ":" + startRowKey + " ~ " + endRowKey);
    if (reporter != null) {
        reporter.setStatus(inputTableInfo.getTableName() + ":" + startRowKey + " ~ " + endRowKey);
    }

    CTable ctable = CTable.openTable(conf, inputTableInfo.getTableName());
    try {
        this.scanner = ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
    } catch (IOException e) {
        if (this.scanner != null) {
            this.scanner.close();
        }
        end = true;
        LOG.error("Error while scanner open:startRowKey=" + startRowKey + ", endRowKey=" + endRowKey, e);
        if (skipError) {
            reporter.setStatus("Error while scanner open:startRowKey=" + startRowKey + ", endRowKey="
                    + endRowKey + "," + e.getMessage());
            reporter.incrCounter("cloudata", "error", 1);
            return;
        } else {
            throw e;
        }
    }

    // If this tablet isn't the job's first tablet, skip the first row.
    if (!startRowKey.equals(tableSplit.getJobStartRowKey())) {
        try {
            Row row = scanner.nextRow();
            if (row == null) {
                end = true;
                return;
            }
        } catch (IOException e) {
            end = true;
            LOG.error("Error while scanner.nextRow():startRowKey=" + startRowKey + ", endRowKey=" + endRowKey,
                    e);
            if (skipError) {
                reporter.setStatus("Error while scanner.nextRow():startRowKey=" + startRowKey + ", endRowKey="
                        + endRowKey + "," + e.getMessage());
                reporter.incrCounter("cloudata", "error", 1);
                return;
            } else {
                throw e;
            }
        }
    }
}

From source file:org.cloudata.core.parallel.hadoop.TableScanCellReader.java

License:Apache License

public TableScanCellReader(JobConf jobConf, CloudataConf conf, TableSplit tableSplit, Reporter reporter)
        throws IOException {
    this.jobConf = jobConf;
    this.reporter = reporter;
    this.skipError = jobConf.getBoolean(AbstractTabletInputFormat.SKIP_ERROR, false);

    RowFilter rowFilter = tableSplit.getRowFilter();

    InputTableInfo inputTableInfo = tableSplit.getInputTableInfo();
    Row.Key startRowKey = rowFilter.getStartRowKey();

    reporter.setStatus(inputTableInfo.getTableName() + ":" + rowFilter.getStartRowKey() + " ~ "
            + rowFilter.getEndRowKey());

    CTable ctable = CTable.openTable(conf, inputTableInfo.getTableName());
    long startTime = System.currentTimeMillis();
    try {
        scanner = ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
    } catch (IOException e) {
        if (this.scanner != null) {
            this.scanner.close();
        }
        end = true;
        LOG.error("Error while scanner open:startRowKey=" + startRowKey + "," + e.getMessage());
        if (skipError) {
            reporter.setStatus(
                    "Error while scanner open:startRowKey=" + startRowKey + ", endRowKey=" + e.getMessage());
            reporter.incrCounter("cloudata", "error", 1);
            return;
        } else {
            throw e;
        }
    }

    try {
        // Read the first cell; if this split doesn't start at the table's first row, skip every cell of that first row below.
        scanCell = scanner.next();
        if (scanCell == null) {
            end = true;
            scanner.close();
            scanner = null;
            return;
        }
        if (!startRowKey.equals(Row.Key.MIN_KEY)) {
            Row.Key firstRow = scanCell.getRowKey();

            while (firstRow.equals(scanCell.getRowKey())) {
                scanCell = scanner.next();
                if (scanCell == null) {
                    end = true;
                    scanner.close();
                    scanner = null;
                    break;
                }
            }
        }

        totalScanTime += (System.currentTimeMillis() - startTime);
    } catch (IOException e) {
        if (scanner != null) {
            scanner.close();
            scanner = null;
        }
        LOG.error("Error while skip first row:startRowKey=" + startRowKey + "," + e.getMessage());
        if (skipError) {
            reporter.setStatus(
                    "Error while skip first row:startRowKey=" + startRowKey + ", endRowKey=" + e.getMessage());
            reporter.incrCounter("cloudata", "error", 1);
            return;
        } else {
            throw e;
        }
    }
}

From source file:org.cloudata.util.matrix.MatrixMutiplyMap.java

License:Apache License

public void configure(JobConf job) {
    CloudataConf conf = new CloudataConf();
    boolean sparse = job.getBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, false);
    String targetTableName = job.get(MatrixInputFormat.MATRIX_TARGET_TABLE);
    String targetColumnName = job.get(MatrixInputFormat.MATRIX_TARGET_COLUMN);

    try {
        if (sparse) {
            targetMatrix = new SparseMatrix(conf, targetTableName, targetColumnName);
        } else {
            targetMatrix = new Matrix(conf, targetTableName, targetColumnName);
        }
    } catch (IOException e) {
        err = e;
    }
}

From source file:org.cloudata.util.matrix.MatrixMutiplyReduce.java

License:Apache License

public void configure(JobConf job) {
    CloudataConf conf = new CloudataConf();
    boolean sparse = job.getBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, false);
    String resultTableName = job.get(MatrixInputFormat.MATRIX_RESULT_TABLE);
    String resultColumnName = job.get(MatrixInputFormat.MATRIX_RESULT_COLUMN);

    try {
        if (sparse) {
            resultMatrix = new SparseMatrix(conf, resultTableName, resultColumnName);
        } else {
            resultMatrix = new Matrix(conf, resultTableName, resultColumnName);
        }
        resultMatrix.initUploader();
    } catch (IOException e) {
        err = e;
    }
}

From source file:org.cloudata.util.upload.UploadMap.java

License:Apache License

@Override
public void configure(JobConf job) {
    try {
        String tableName = job.get(AbstractTabletInputFormat.OUTPUT_TABLE);
        CloudataConf nconf = new CloudataConf(job);
        ctable = CTable.openTable(nconf, tableName);
        if (ctable == null) {
            throw new IOException("No table:" + tableName);
        }
        delim = job.get("uploadJob.delim", "\t");
        columns = job.get("uploadJob.columns").split(",");

        String[] fieldNumStr = job.get("uploadJob.fieldNums").split(",");
        fieldNums = new int[fieldNumStr.length];
        for (int i = 0; i < fieldNumStr.length; i++) {
            fieldNums[i] = Integer.parseInt(fieldNumStr[i]);
        }

        keyValuePair = job.getBoolean("uploadJob.keyValuePair", false);
    } catch (Exception e) {
        err = new IOException(e.getMessage());
        err.initCause(e);
    }
}

From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.NonSuperSubdomainCollectorStep.java

License:Open Source License

@Override
public void configure(JobConf job) {

    if (job.getBoolean("mapred.task.is.map", false)) {
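        // "mapred.task.is.map" is set by the MapReduce framework; it is true only while running inside a map task.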
        Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));

        try {
            superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw new RuntimeException(e);
        }

        subDomainFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    }
}

From source file:org.dkpro.bigdata.hadoop.UIMAMapReduceBase.java

License:Open Source License

@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.inputName = job.get("mapred.input.dir");
        this.taskId = job.get("mapred.task.id");
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);

        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);

        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        if (job.getBoolean("dkpro.output.onedirpertask", true)) {
            this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        } else {
            this.working_dir = new Path("uima_output");
        }
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }

        }
        Map<String, String> variableValues = new HashMap<String, String>();
        variableValues.put("\\$dir", this.results_dir.toString());
        variableValues.put("\\$input", this.inputName);
        variableValues.put("\\$taskid", this.taskId);
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
        if (cacheFiles != null) {
            for (Path cacheFile : cacheFiles) {
                variableValues.put("^\\$cache/" + cacheFile.getName(), cacheFile.toUri().getPath());
            }
        }
        for (final Entry<String, URL> resource : this.resourceURIs.entrySet()) {
            variableValues.put("\\$" + resource.getKey(), resource.getValue().toString());
        }
        AnalysisEngineUtil.replaceVariables(engineDescription, variableValues);
        this.engine = createEngine(engineDescription);

    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }

}