Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

This page collects example usages of the getConfiguration() method of org.apache.hadoop.mapreduce.JobContext.

Prototype

public Configuration getConfiguration();

Source Link

Document

Return the configuration for the job.

Usage

From source file:kogiri.common.hadoop.io.format.fasta.FastaReadInputFormat.java

License:Open Source License

/**
 * Decides whether the given input file may be divided into several splits.
 *
 * <p>Splitting is refused when {@code FastaReadInputFormat.isSplitable(Configuration)}
 * reports {@code false} for the job configuration, or when
 * {@code CompressionCodecFactory} resolves a codec for the file.
 *
 * @param context  job context supplying the configuration
 * @param filename path of the input file under consideration
 * @return {@code true} only if splitting is enabled and no codec is found for the file
 */
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    final boolean splittingEnabled = FastaReadInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splittingEnabled);
    if (!splittingEnabled) {
        return false;
    }

    // A file for which a compression codec is found is never split.
    return new CompressionCodecFactory(context.getConfiguration()).getCodec(filename) == null;
}

From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java

License:Open Source License

/**
 * Stores the given input-format settings in the configuration of the job.
 *
 * @param job               job context whose configuration receives the settings
 * @param inputFormatConfig settings to write through its {@code saveTo} method
 * @throws IOException declared by the signature; presumably propagated from
 *                     {@code saveTo} (confirm against KmerMatchInputFormatConfig)
 */
public static void setInputFormatConfig(JobContext job, KmerMatchInputFormatConfig inputFormatConfig)
        throws IOException {
    inputFormatConfig.saveTo(job.getConfiguration());
}

From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java

License:Open Source License

/**
 * Builds the input splits for a k-mer match job.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>collect the paths of all files returned by {@code listStatus(job)};</li>
 *   <li>load one k-mer histogram per index file from the configured histogram path,
 *       failing if any histogram file does not exist;</li>
 *   <li>merge the histogram records by k-mer, adding the frequency of a repeated
 *       k-mer onto the record seen first;</li>
 *   <li>sort the merged records with {@code KmerHistogramRecordComparator} and let
 *       {@code KmerRangePartitioner} derive the partitions;</li>
 *   <li>emit one {@code KmerMatchInputSplit} per partition, each carrying every
 *       index file path.</li>
 * </ol>
 * The number of input files is also written to the job configuration under
 * {@code NUM_INPUT_FILES}.
 *
 * @param job job context supplying the configuration and input paths
 * @return one split per histogram partition
 * @throws IOException if a k-mer histogram file is missing, or if a called
 *                     file-system or helper operation fails
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    KmerMatchInputFormatConfig config = KmerMatchInputFormatConfig.createInstance(job.getConfiguration());

    // Gather the path of every accepted k-mer index file.
    List<FileStatus> files = listStatus(job);
    List<Path> indexPaths = new ArrayList<Path>();
    for (FileStatus status : files) {
        indexPaths.add(status.getPath());
    }

    LOG.info("# of Split input file : " + indexPaths.size());
    for (Path indexPath : indexPaths) {
        LOG.info("> " + indexPath.toString());
    }

    Path[] indexPathArray = indexPaths.toArray(new Path[0]);

    // Load the histogram that belongs to each index file.
    List<KmerHistogram> histograms = new ArrayList<KmerHistogram>();
    for (Path indexPath : indexPathArray) {
        String fastaFileName = KmerIndexHelper.getFastaFileName(indexPath);

        Path histogramPath = new Path(config.getKmerHistogramPath(),
                KmerHistogramHelper.makeKmerHistogramFileName(fastaFileName));
        FileSystem fs = histogramPath.getFileSystem(job.getConfiguration());
        if (!fs.exists(histogramPath)) {
            throw new IOException("k-mer histogram is not found in given paths");
        }
        histograms.add(KmerHistogram.createInstance(fs, histogramPath));
    }

    // Merge all histograms into one record per k-mer.
    Hashtable<String, KmerHistogramRecord> mergedRecords = new Hashtable<String, KmerHistogramRecord>();
    long totalKmerCount = 0;
    for (KmerHistogram histogram : histograms) {
        Collection<KmerHistogramRecord> records = histogram.getSortedRecord();
        totalKmerCount += histogram.getTotalKmerCount();

        for (KmerHistogramRecord record : records) {
            KmerHistogramRecord existing = mergedRecords.get(record.getKmer());
            if (existing != null) {
                existing.increaseFrequency(record.getFrequency());
            } else {
                mergedRecords.put(record.getKmer(), record);
            }
        }
    }

    List<KmerHistogramRecord> sortedRecords = new ArrayList<KmerHistogramRecord>(mergedRecords.values());
    Collections.sort(sortedRecords, new KmerHistogramRecordComparator());

    // Derive the partitions from the merged, sorted histogram.
    KmerRangePartitioner partitioner = new KmerRangePartitioner(config.getKmerSize(),
            config.getPartitionNum());
    KmerHistogramRecord[] recordArray = sortedRecords.toArray(new KmerHistogramRecord[0]);
    KmerRangePartition[] partitions = partitioner.getHistogramPartitions(recordArray, totalKmerCount);

    // Every split carries all index files together with its own partition.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (KmerRangePartition partition : partitions) {
        splits.add(new KmerMatchInputSplit(indexPathArray, partition));
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java

License:Open Source License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }/*from ww w. j av  a  2s .  co m*/

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    filters.add(new KmerIndexIndexPathFilter());
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        if (inputFilter.accept(p)) {
            FileSystem fs = p.getFileSystem(job.getConfiguration());
            FileStatus status = fs.getFileStatus(p);
            result.add(status);
        }
    }

    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java

License:Open Source License

/**
 * Stores the given input-format settings in the configuration of the job.
 *
 * @param job               job context whose configuration receives the settings
 * @param inputFormatConfig settings to write through its {@code saveTo} method
 * @throws IOException declared by the signature; presumably propagated from
 *                     {@code saveTo} (confirm against KmerIndexInputFormatConfig)
 */
public static void setInputFormatConfig(JobContext job, KmerIndexInputFormatConfig inputFormatConfig)
        throws IOException {
    inputFormatConfig.saveTo(job.getConfiguration());
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java

License:Open Source License

/**
 * Builds the input splits for a k-mer index job.
 *
 * <p>The paths of all files returned by {@code listStatus(job)} are grouped with
 * {@code KmerIndexHelper.groupKmerIndices}; each resulting group becomes one
 * {@code KmerIndexSplit}. The number of input files is also written to the job
 * configuration under {@code NUM_INPUT_FILES}.
 *
 * @param job job context supplying the configuration and input paths
 * @return one split per index group
 * @throws IOException if listing the input files fails
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Gather the path of every accepted index file.
    List<FileStatus> files = listStatus(job);
    List<Path> partPaths = new ArrayList<Path>();
    for (FileStatus status : files) {
        partPaths.add(status.getPath());
    }

    LOG.info("# of Split input file : " + partPaths.size());
    for (Path partPath : partPaths) {
        LOG.info("> " + partPath.toString());
    }

    Path[][] groups = KmerIndexHelper.groupKmerIndices(partPaths.toArray(new Path[0]));
    LOG.info("Input index groups : " + groups.length);

    // One split per group of index files.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int groupIndex = 0;
    for (Path[] group : groups) {
        LOG.info("Input index group " + groupIndex + " : " + group.length);
        for (Path member : group) {
            LOG.info("> " + member.toString());
        }
        splits.add(new KmerIndexSplit(group, job.getConfiguration()));
        groupIndex++;
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java

License:Open Source License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }//from w  w w.  ja v a  2 s  .c o  m

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    filters.add(new KmerIndexPartPathFilter());
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        if (inputFilter.accept(p)) {
            FileSystem fs = p.getFileSystem(job.getConfiguration());
            FileStatus status = fs.getFileStatus(p);
            result.add(status);
        }
    }

    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:libra.common.hadoop.io.format.fasta.FastaKmerInputFormat.java

License:Apache License

@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splitable = FastaKmerInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splitable);
    if (!splitable) {
        return false;
    }/*  ww  w. ja va2  s . co  m*/

    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    if (codec != null) {
        return false;
    }

    return true;
}

From source file:ml.shifu.guagua.mapreduce.example.nn.NNInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> newSplits = new ArrayList<InputSplit>();
    for (int i = 0; i < job.getConfiguration().getInt(NNConstants.NN_TEST_SCALE, 1); i++) {
        for (InputSplit inputSplit : splits) {
            if (isNotPigOrHadoopMetaFile(((FileSplit) inputSplit).getPath())) {
                newSplits.add(new GuaguaInputSplit(false, new FileSplit[] { (FileSplit) inputSplit }));
            }/*from ww  w .  ja  va2  s  . c  om*/
        }
    }
    newSplits.add(new GuaguaInputSplit(true, (FileSplit) null));
    int mapperSize = newSplits.size();
    LOG.info("inputs size including master: {}", mapperSize);
    LOG.debug("input splits inclduing: {}", newSplits);
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, (mapperSize - 1) + "");
    return newSplits;
}

From source file:ml.shifu.guagua.mapreduce.example.nn.NNInputFormat.java

License:Apache License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    // bzip2 can be split.
    if (file.getName().endsWith(GuaguaMapReduceConstants.BZ2)) {
        return true;
    }//from   w w  w .  ja  v a2  s  .c  om
    // other compression can not be split, maybe for lzo I should add it to split list.
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
}