List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
/**
 * Returns the {@code Configuration} object held by this job context.
 *
 * @return the configuration of the job
 */
public Configuration getConfiguration();
From source file:kogiri.common.hadoop.io.format.fasta.FastaReadInputFormat.java
License:Open Source License
/**
 * Decides whether the given input file may be divided into several splits.
 *
 * <p>The file is reported as splitable only when the static
 * {@code FastaReadInputFormat.isSplitable} check on the job configuration
 * returns {@code true} and no compression codec is resolved for the file.
 *
 * @param context  job context supplying the configuration
 * @param filename path of the input file under consideration
 * @return {@code true} if the file may be split, {@code false} otherwise
 */
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splitable = FastaReadInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splitable);

    if (splitable) {
        // A file for which a codec is found is treated as compressed and is never split.
        CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
        return codec == null;
    }
    return false;
}
From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java
License:Open Source License
/**
 * Stores the given k-mer match input-format settings in the job's configuration.
 *
 * @param job               job context whose configuration receives the settings
 * @param inputFormatConfig settings to write via {@code saveTo}
 * @throws IOException declared by the signature; presumably raised when saving fails
 */
public static void setInputFormatConfig(JobContext job, KmerMatchInputFormatConfig inputFormatConfig)
        throws IOException {
    inputFormatConfig.saveTo(job.getConfiguration());
}
From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java
License:Open Source License
public List<InputSplit> getSplits(JobContext job) throws IOException { KmerMatchInputFormatConfig inputFormatConfig = KmerMatchInputFormatConfig .createInstance(job.getConfiguration()); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); List<Path> kmerIndexFiles = new ArrayList<Path>(); for (FileStatus file : files) { Path path = file.getPath(); kmerIndexFiles.add(path);/* w w w.j a va 2s . co m*/ } LOG.info("# of Split input file : " + kmerIndexFiles.size()); for (int i = 0; i < kmerIndexFiles.size(); i++) { LOG.info("> " + kmerIndexFiles.get(i).toString()); } Path[] kmerIndexFilePath = kmerIndexFiles.toArray(new Path[0]); // histogram List<KmerHistogram> histograms = new ArrayList<KmerHistogram>(); for (int i = 0; i < kmerIndexFiles.size(); i++) { String fastaFileName = KmerIndexHelper.getFastaFileName(kmerIndexFilePath[i]); Path histogramPath = new Path(inputFormatConfig.getKmerHistogramPath(), KmerHistogramHelper.makeKmerHistogramFileName(fastaFileName)); FileSystem fs = histogramPath.getFileSystem(job.getConfiguration()); if (fs.exists(histogramPath)) { KmerHistogram histogram = KmerHistogram.createInstance(fs, histogramPath); histograms.add(histogram); } else { throw new IOException("k-mer histogram is not found in given paths"); } } // merge histogram Hashtable<String, KmerHistogramRecord> histogramRecords = new Hashtable<String, KmerHistogramRecord>(); long kmerCounts = 0; for (int i = 0; i < histograms.size(); i++) { Collection<KmerHistogramRecord> records = histograms.get(i).getSortedRecord(); kmerCounts += histograms.get(i).getTotalKmerCount(); for (KmerHistogramRecord rec : records) { KmerHistogramRecord ext_rec = histogramRecords.get(rec.getKmer()); if (ext_rec == null) { histogramRecords.put(rec.getKmer(), rec); } else { ext_rec.increaseFrequency(rec.getFrequency()); } } } List<KmerHistogramRecord> histogramRecordsArr = new ArrayList<KmerHistogramRecord>(); 
histogramRecordsArr.addAll(histogramRecords.values()); Collections.sort(histogramRecordsArr, new KmerHistogramRecordComparator()); KmerRangePartitioner partitioner = new KmerRangePartitioner(inputFormatConfig.getKmerSize(), inputFormatConfig.getPartitionNum()); KmerRangePartition[] partitions = partitioner .getHistogramPartitions(histogramRecordsArr.toArray(new KmerHistogramRecord[0]), kmerCounts); for (KmerRangePartition partition : partitions) { splits.add(new KmerMatchInputSplit(kmerIndexFilePath, partition)); } // Save the number of input files in the job-conf job.getConfiguration().setLong(NUM_INPUT_FILES, files.size()); LOG.debug("Total # of splits: " + splits.size()); return splits; }
From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java
License:Open Source License
@Override protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }/*from ww w. j av a 2s . co m*/ // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } filters.add(new KmerIndexIndexPathFilter()); PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; if (inputFilter.accept(p)) { FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus status = fs.getFileStatus(p); result.add(status); } } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java
License:Open Source License
/**
 * Stores the given k-mer index input-format settings in the job's configuration.
 *
 * @param job               job context whose configuration receives the settings
 * @param inputFormatConfig settings to write via {@code saveTo}
 * @throws IOException declared by the signature; presumably raised when saving fails
 */
public static void setInputFormatConfig(JobContext job, KmerIndexInputFormatConfig inputFormatConfig)
        throws IOException {
    inputFormatConfig.saveTo(job.getConfiguration());
}
From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java
License:Open Source License
public List<InputSplit> getSplits(JobContext job) throws IOException { // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); List<Path> indexFiles = new ArrayList<Path>(); for (FileStatus file : files) { Path path = file.getPath(); indexFiles.add(path);/*from ww w. j a v a2s . c o m*/ } LOG.info("# of Split input file : " + indexFiles.size()); for (int i = 0; i < indexFiles.size(); i++) { LOG.info("> " + indexFiles.get(i).toString()); } Path[] indexFilePaths = indexFiles.toArray(new Path[0]); Path[][] groups = KmerIndexHelper.groupKmerIndices(indexFilePaths); LOG.info("Input index groups : " + groups.length); for (int i = 0; i < groups.length; i++) { Path[] group = groups[i]; LOG.info("Input index group " + i + " : " + group.length); for (int j = 0; j < group.length; j++) { LOG.info("> " + group[j].toString()); } splits.add(new KmerIndexSplit(group, job.getConfiguration())); } // Save the number of input files in the job-conf job.getConfiguration().setLong(NUM_INPUT_FILES, files.size()); LOG.debug("Total # of splits: " + splits.size()); return splits; }
From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java
License:Open Source License
@Override protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }//from w w w. ja v a 2 s .c o m // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } filters.add(new KmerIndexPartPathFilter()); PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; if (inputFilter.accept(p)) { FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus status = fs.getFileStatus(p); result.add(status); } } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:libra.common.hadoop.io.format.fasta.FastaKmerInputFormat.java
License:Apache License
/**
 * Decides whether the given input file may be divided into several splits.
 *
 * <p>Splitting is refused when the static {@code FastaKmerInputFormat.isSplitable}
 * check on the job configuration returns {@code false}, or when a compression
 * codec is resolved for the file.
 *
 * @param context  job context supplying the configuration
 * @param filename path of the input file under consideration
 * @return {@code true} if the file may be split, {@code false} otherwise
 */
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splitable = FastaKmerInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splitable);

    if (splitable) {
        // A file for which a codec is found is treated as compressed and is never split.
        CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
        return codec == null;
    }
    return false;
}
From source file:ml.shifu.guagua.mapreduce.example.nn.NNInputFormat.java
License:Apache License
/**
 * Wraps the splits produced by the parent input format into
 * {@code GuaguaInputSplit}s and appends one extra split.
 *
 * <p>Each parent split whose path passes {@code isNotPigOrHadoopMetaFile} is
 * wrapped once per test-scale iteration; the scale is read from
 * {@code NNConstants.NN_TEST_SCALE} and defaults to 1. One additional split,
 * constructed with {@code true} and no file split, is appended last
 * (presumably the master split, judging by the log message). The worker count,
 * i.e. the number of splits minus that final one, is stored in the job
 * configuration under {@code GuaguaConstants.GUAGUA_WORKER_NUMBER}.
 *
 * @param job job context supplying the configuration and input paths
 * @return the wrapped worker splits followed by the extra split
 * @throws IOException if the parent input format fails to compute splits
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> newSplits = new ArrayList<InputSplit>();

    // The scale does not change while splits are generated, so read it once.
    int testScale = job.getConfiguration().getInt(NNConstants.NN_TEST_SCALE, 1);
    for (int i = 0; i < testScale; i++) {
        for (InputSplit inputSplit : splits) {
            FileSplit fileSplit = (FileSplit) inputSplit;
            if (isNotPigOrHadoopMetaFile(fileSplit.getPath())) {
                newSplits.add(new GuaguaInputSplit(false, new FileSplit[] { fileSplit }));
            }
        }
    }

    newSplits.add(new GuaguaInputSplit(true, (FileSplit) null));

    int mapperSize = newSplits.size();
    LOG.info("inputs size including master: {}", mapperSize);
    LOG.debug("input splits inclduing: {}", newSplits);

    // Exclude the final extra split when reporting the number of workers.
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, (mapperSize - 1) + "");
    return newSplits;
}
From source file:ml.shifu.guagua.mapreduce.example.nn.NNInputFormat.java
License:Apache License
@Override protected boolean isSplitable(JobContext context, Path file) { // bzip2 can be split. if (file.getName().endsWith(GuaguaMapReduceConstants.BZ2)) { return true; }//from w w w . ja v a2 s .c om // other compression can not be split, maybe for lzo I should add it to split list. CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file); return codec == null; }