Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce JobContext getConfiguration.

Prototype

public Configuration getConfiguration();

Source Link

Document

Return the configuration for the job.

Usage

From source file:kogiri.common.hadoop.io.format.fasta.FastaReadInputFormat.java

License:Open Source License

/**
 * Decides whether the given input file may be divided into several splits.
 *
 * <p>Splitting is refused when {@code FastaReadInputFormat.isSplitable} reports it as
 * disabled for the job configuration, or when {@link CompressionCodecFactory} finds a
 * compression codec for the file.
 *
 * @param context  job context supplying the configuration
 * @param filename path of the candidate input file
 * @return {@code true} only if splitting is enabled and no codec matches the file
 */
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splittingEnabled = FastaReadInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splittingEnabled);
    if (splittingEnabled) {
        // A non-null codec means the file is compressed; such files are reported as not splitable.
        CompressionCodecFactory codecs = new CompressionCodecFactory(context.getConfiguration());
        return codecs.getCodec(filename) == null;
    }
    return false;
}

From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java

License:Open Source License

/**
 * Stores the given input-format settings in the configuration of a job.
 *
 * @param job               job whose configuration receives the settings
 * @param inputFormatConfig settings to save
 * @throws IOException declared by this method's signature; presumably raised by
 *         {@code saveTo} when the settings cannot be written (not visible here)
 */
public static void setInputFormatConfig(
        JobContext job,
        KmerMatchInputFormatConfig inputFormatConfig) throws IOException {
    inputFormatConfig.saveTo(job.getConfiguration());
}

From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java

License:Open Source License

/**
 * Computes the input splits for a k-mer match job.
 *
 * <p>The k-mer histogram belonging to every input index file is loaded and merged into a
 * single frequency table. The merged records are sorted and handed to a
 * {@code KmerRangePartitioner}; one {@code KmerMatchInputSplit} is created for each
 * partition it returns, and every split references all input index files. The number of
 * input files is also stored in the job configuration under {@code NUM_INPUT_FILES}.
 *
 * @param job job context providing the configuration and the input paths
 * @return one split per k-mer range partition
 * @throws IOException if the inputs cannot be listed, or if the histogram file that
 *         corresponds to an index file does not exist (the message names the missing path)
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    KmerMatchInputFormatConfig inputFormatConfig = KmerMatchInputFormatConfig
            .createInstance(job.getConfiguration());

    // Collect the path of every input index file returned by listStatus.
    List<FileStatus> files = listStatus(job);
    Path[] kmerIndexFilePath = new Path[files.size()];
    int next = 0;
    for (FileStatus file : files) {
        kmerIndexFilePath[next++] = file.getPath();
    }

    LOG.info("# of Split input file : " + kmerIndexFilePath.length);
    for (Path indexFile : kmerIndexFilePath) {
        LOG.info("> " + indexFile.toString());
    }

    // Load the histogram that belongs to each index file.
    List<KmerHistogram> histograms = new ArrayList<KmerHistogram>();
    for (Path indexFile : kmerIndexFilePath) {
        String fastaFileName = KmerIndexHelper.getFastaFileName(indexFile);

        Path histogramPath = new Path(inputFormatConfig.getKmerHistogramPath(),
                KmerHistogramHelper.makeKmerHistogramFileName(fastaFileName));
        FileSystem fs = histogramPath.getFileSystem(job.getConfiguration());
        if (!fs.exists(histogramPath)) {
            // Name the missing file so the failure can be diagnosed from the message alone.
            throw new IOException("k-mer histogram is not found in given paths : " + histogramPath);
        }
        histograms.add(KmerHistogram.createInstance(fs, histogramPath));
    }

    // Merge all histograms: frequencies of identical k-mers are summed.
    Hashtable<String, KmerHistogramRecord> histogramRecords = new Hashtable<String, KmerHistogramRecord>();
    long kmerCounts = 0;
    for (KmerHistogram histogram : histograms) {
        Collection<KmerHistogramRecord> records = histogram.getSortedRecord();
        kmerCounts += histogram.getTotalKmerCount();

        for (KmerHistogramRecord rec : records) {
            KmerHistogramRecord merged = histogramRecords.get(rec.getKmer());
            if (merged == null) {
                // First occurrence: the record object itself is stored and later
                // accumulates the frequencies of matching records.
                histogramRecords.put(rec.getKmer(), rec);
            } else {
                merged.increaseFrequency(rec.getFrequency());
            }
        }
    }

    List<KmerHistogramRecord> histogramRecordsArr = new ArrayList<KmerHistogramRecord>(
            histogramRecords.values());
    Collections.sort(histogramRecordsArr, new KmerHistogramRecordComparator());

    KmerRangePartitioner partitioner = new KmerRangePartitioner(inputFormatConfig.getKmerSize(),
            inputFormatConfig.getPartitionNum());
    KmerRangePartition[] partitions = partitioner
            .getHistogramPartitions(histogramRecordsArr.toArray(new KmerHistogramRecord[0]), kmerCounts);

    // generate splits: one per partition, each covering every index file
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (KmerRangePartition partition : partitions) {
        splits.add(new KmerMatchInputSplit(kmerIndexFilePath, partition));
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java

License:Open Source License

/**
 * Resolves the job's input paths to the file statuses that will be processed.
 *
 * <p>Each input path is checked against the job's own path filter (if one is set) combined
 * with a {@code KmerIndexIndexPathFilter}. Accepted paths are looked up directly with
 * {@code FileSystem.getFileStatus}; this method performs no glob expansion or directory
 * recursion itself.
 *
 * @param job job context providing input paths, credentials and configuration
 * @return statuses of all accepted input paths, in input order
 * @throws IOException if no input path is configured or a file-system call fails
 */
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] inputPaths = getInputPaths(job);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputPaths, job.getConfiguration());

    // Combine the user-provided filter (if any) with the k-mer index filter.
    List<PathFilter> filters = new ArrayList<PathFilter>();
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filters.add(userFilter);
    }
    filters.add(new KmerIndexIndexPathFilter());
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = new ArrayList<FileStatus>();
    for (Path inputPath : inputPaths) {
        if (!inputFilter.accept(inputPath)) {
            continue;
        }
        FileSystem fs = inputPath.getFileSystem(job.getConfiguration());
        result.add(fs.getFileStatus(inputPath));
    }

    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java

License:Open Source License

/**
 * Stores the given input-format settings in the configuration of a job.
 *
 * @param job               job whose configuration receives the settings
 * @param inputFormatConfig settings to save
 * @throws IOException declared by this method's signature; presumably raised by
 *         {@code saveTo} when the settings cannot be written (not visible here)
 */
public static void setInputFormatConfig(
        JobContext job,
        KmerIndexInputFormatConfig inputFormatConfig) throws IOException {
    inputFormatConfig.saveTo(job.getConfiguration());
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java

License:Open Source License

/**
 * Computes the input splits for a job that reads k-mer index files.
 *
 * <p>The input index files are grouped by {@code KmerIndexHelper.groupKmerIndices} and one
 * {@code KmerIndexSplit} is created per group. The number of input files is also stored
 * in the job configuration under {@code NUM_INPUT_FILES}.
 *
 * @param job job context providing the configuration and the input paths
 * @return one split per index group
 * @throws IOException if the inputs cannot be listed
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<FileStatus> files = listStatus(job);

    // Gather the path of every input file into an array.
    Path[] indexFilePaths = new Path[files.size()];
    int next = 0;
    for (FileStatus file : files) {
        indexFilePaths[next++] = file.getPath();
    }

    LOG.info("# of Split input file : " + indexFilePaths.length);
    for (Path indexFile : indexFilePaths) {
        LOG.info("> " + indexFile.toString());
    }

    Path[][] groups = KmerIndexHelper.groupKmerIndices(indexFilePaths);
    LOG.info("Input index groups : " + groups.length);

    // generate splits: one per index group
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int groupNo = 0; groupNo < groups.length; groupNo++) {
        Path[] group = groups[groupNo];
        LOG.info("Input index group " + groupNo + " : " + group.length);
        for (Path member : group) {
            LOG.info("> " + member.toString());
        }
        splits.add(new KmerIndexSplit(group, job.getConfiguration()));
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java

License:Open Source License

/**
 * Resolves the job's input paths to the file statuses that will be processed.
 *
 * <p>Each input path is checked against the job's own path filter (if one is set) combined
 * with a {@code KmerIndexPartPathFilter}. Accepted paths are looked up directly with
 * {@code FileSystem.getFileStatus}; this method performs no glob expansion or directory
 * recursion itself.
 *
 * @param job job context providing input paths, credentials and configuration
 * @return statuses of all accepted input paths, in input order
 * @throws IOException if no input path is configured or a file-system call fails
 */
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] inputPaths = getInputPaths(job);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputPaths, job.getConfiguration());

    // Combine the user-provided filter (if any) with the k-mer index part filter.
    List<PathFilter> filters = new ArrayList<PathFilter>();
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filters.add(userFilter);
    }
    filters.add(new KmerIndexPartPathFilter());
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = new ArrayList<FileStatus>();
    for (Path inputPath : inputPaths) {
        if (!inputFilter.accept(inputPath)) {
            continue;
        }
        FileSystem fs = inputPath.getFileSystem(job.getConfiguration());
        result.add(fs.getFileStatus(inputPath));
    }

    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:libra.common.hadoop.io.format.fasta.FastaKmerInputFormat.java

License:Apache License

/**
 * Decides whether the given input file may be divided into several splits.
 *
 * <p>Splitting is refused when {@code FastaKmerInputFormat.isSplitable} reports it as
 * disabled for the job configuration, or when {@link CompressionCodecFactory} finds a
 * compression codec for the file.
 *
 * @param context  job context supplying the configuration
 * @param filename path of the candidate input file
 * @return {@code true} only if splitting is enabled and no codec matches the file
 */
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splittingEnabled = FastaKmerInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splittingEnabled);
    if (splittingEnabled) {
        // A non-null codec means the file is compressed; such files are reported as not splitable.
        CompressionCodecFactory codecs = new CompressionCodecFactory(context.getConfiguration());
        return codecs.getCodec(filename) == null;
    }
    return false;
}

From source file:ml.shifu.guagua.mapreduce.example.nn.NNInputFormat.java

License:Apache License

/**
 * Wraps the file splits computed by the superclass into {@code GuaguaInputSplit}s.
 *
 * <p>Every split whose path passes {@code isNotPigOrHadoopMetaFile} becomes one split
 * constructed with the flag {@code false}; this is repeated {@code NN_TEST_SCALE} times
 * (default 1). One additional split constructed with the flag {@code true} and no file
 * split is appended last; the log message refers to it as the master. The number of
 * remaining splits is written to the configuration as {@code GUAGUA_WORKER_NUMBER}.
 *
 * @param job job context providing the configuration and the input paths
 * @return the wrapped splits followed by the extra master split
 * @throws IOException if the superclass fails to compute the underlying splits
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);

    // Read the scale factor once instead of querying the configuration on every iteration.
    int testScale = job.getConfiguration().getInt(NNConstants.NN_TEST_SCALE, 1);

    List<InputSplit> newSplits = new ArrayList<InputSplit>();
    for (int i = 0; i < testScale; i++) {
        for (InputSplit inputSplit : splits) {
            FileSplit fileSplit = (FileSplit) inputSplit;
            if (isNotPigOrHadoopMetaFile(fileSplit.getPath())) {
                newSplits.add(new GuaguaInputSplit(false, new FileSplit[] { fileSplit }));
            }
        }
    }

    // The split flagged true carries no file split of its own.
    newSplits.add(new GuaguaInputSplit(true, (FileSplit) null));

    int mapperSize = newSplits.size();
    LOG.info("inputs size including master: {}", mapperSize);
    LOG.debug("input splits including master: {}", newSplits);

    // Everything except the single extra split counts towards the worker number.
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, Integer.toString(mapperSize - 1));
    return newSplits;
}

From source file:ml.shifu.guagua.mapreduce.example.nn.NNInputFormat.java

License:Apache License

/**
 * Decides whether the given input file may be divided into several splits.
 *
 * <p>Files whose name ends with {@code GuaguaMapReduceConstants.BZ2} are always reported
 * as splitable. Any other file is splitable only when {@link CompressionCodecFactory}
 * finds no compression codec for it.
 *
 * @param context job context supplying the configuration
 * @param file    path of the candidate input file
 * @return {@code true} if the file may be split
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    // bzip2 can be split; other compression can not (lzo might be added to the
    // split list later). The codec lookup only runs for non-bzip2 names.
    return file.getName().endsWith(GuaguaMapReduceConstants.BZ2)
            || new CompressionCodecFactory(context.getConfiguration()).getCodec(file) == null;
}