List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
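getConfiguration() returns the job's Configuration, and custom InputFormats typically read their tuning knobs (filters, split sizes, recursion flags) from it when listing or splitting input, as every example below does. A minimal sketch of that pattern, assuming hypothetical property names "example.splittable" and "example.log.splits" that are not taken from any of the sources below:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Illustrative InputFormat; the property names are hypothetical and used only for this sketch.
public class ExampleInputFormat extends TextInputFormat {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // The JobContext exposes the job's Configuration to InputFormat callbacks.
        Configuration conf = context.getConfiguration();
        if (!conf.getBoolean("example.splittable", true)) {
            return false;
        }
        return super.isSplitable(context, file);
    }

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        List<InputSplit> splits = super.getSplits(job);
        if (job.getConfiguration().getBoolean("example.log.splits", false)) {
            System.out.println("Computed " + splits.size() + " splits");
        }
        return splits;
    }
}

The examples that follow use the same Configuration object to decide splittability, filter input files, and combine or resize splits.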
From source file: edu.rutgers.ess.crs.utility.CSVInputFormat.java
License: Apache License

protected boolean isSplitable(final JobContext context, final Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    try {
        Configuration jobConf = job.getConfiguration();
        // The block filter associated with this job
        BlockFilter blockFilter = null;
        if (jobConf.get(InputQueryRange) != null) {
            // This job requires a range query
            blockFilter = new RangeFilter(OperationsParams.getShape(jobConf, InputQueryRange));
        }
        // Retrieve the BlockFilter set by the developers in the JobConf
        Class<? extends BlockFilter> blockFilterClass =
            jobConf.getClass(SpatialSite.FilterClass, null, BlockFilter.class);
        if (blockFilterClass != null) {
            BlockFilter userBlockFilter = blockFilterClass.newInstance();
            blockFilter = blockFilter == null ? userBlockFilter
                : new CombineBlockFilter(blockFilter, userBlockFilter);
        }
        if (blockFilter == null) {
            // No block filter specified by user
            LOG.info("No block filter specified");
            return super.listStatus(job);
        }
        // Get all blocks the user wants to process
        blockFilter.configure(jobConf);
        // Filter files based on user specified filter function
        List<FileStatus> result = new ArrayList<FileStatus>();
        Path[] inputDirs = getInputPaths(job);
        for (Path dir : inputDirs) {
            FileSystem fs = dir.getFileSystem(jobConf);
            listStatus(fs, dir, result, blockFilter);
        }
        LOG.info("Spatial filter function matched with " + result.size() + " cells");
        return result;
    } catch (InstantiationException e) {
        LOG.warn(e);
        return super.listStatus(job);
    } catch (IllegalAccessException e) {
        LOG.warn(e);
        return super.listStatus(job);
    }
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Create compressionCodecs to be used by isSplitable method
        if (compressionCodecs == null)
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // HDF files are not splittable
        if (file.getName().toLowerCase().endsWith(".hdf"))
            return false;
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null && !(codec instanceof SplittableCompressionCodec))
            return false;
        // To avoid opening the file and checking the first 8-bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file less than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer to not split it
    }
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration jobConf = job.getConfiguration();
    if (jobConf.getInt(CombineSplits, 1) > 1) {
        long t1 = System.currentTimeMillis();
        int combine = jobConf.getInt(CombineSplits, 1);
        /*
         * Combine splits to reduce the number of map tasks. Currently, this is done
         * using a greedy algorithm that combines splits based on how many hosts
         * they share.
         * TODO: Use a graph clustering algorithm where each vertex represents a
         * split, and each edge is weighted with the number of shared hosts between
         * the two splits.
         */
        Vector<Vector<FileSplit>> openSplits = new Vector<Vector<FileSplit>>();
        int maxNumberOfSplits = (int) Math.ceil((float) splits.size() / combine);
        List<InputSplit> combinedSplits = new Vector<InputSplit>();
        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            int maxSimilarity = -1; // Best similarity found so far
            int bestFit = -1;       // Index of a random open split with max similarity
            int numMatches = 0;     // Number of splits with max similarity
            for (int i = 0; i < openSplits.size(); i++) {
                Vector<FileSplit> splitList = openSplits.elementAt(i);
                int similarity = 0;
                for (FileSplit otherSplit : splitList) {
                    for (String host1 : fsplit.getLocations())
                        for (String host2 : otherSplit.getLocations())
                            if (host1.equals(host2))
                                similarity++;
                }
                if (similarity > maxSimilarity) {
                    maxSimilarity = similarity;
                    bestFit = i;
                    numMatches = 1;
                } else if (similarity == maxSimilarity) {
                    numMatches++;
                    // Replace the current best fit with probability 1/numMatches (reservoir sampling)
                    double random = Math.random();
                    if (random < (double) 1 / numMatches) {
                        // Replace the element in the reservoir
                        bestFit = i;
                    }
                }
            }
            if (maxSimilarity > 0 || (openSplits.size() + combinedSplits.size()) >= maxNumberOfSplits) {
                // Good fit || cannot create more open splits,
                // add it to an existing open split.
                Vector<FileSplit> bestList = openSplits.elementAt(bestFit);
                bestList.add(fsplit);
                if (bestList.size() > combine) {
                    // Reached threshold for this list. Add it to combined splits
                    combinedSplits.add(FileSplitUtil.combineFileSplits(bestList, 0, bestList.size()));
                    // Remove it from open splits
                    openSplits.remove(bestFit);
                }
            } else {
                // Bad fit && can add a new split
                // Create a new open split just for this one
                Vector<FileSplit> newOpenSplit = new Vector<FileSplit>();
                newOpenSplit.add(fsplit);
                openSplits.addElement(newOpenSplit);
            }
        }
        // Add all remaining open splits to the list of combined splits
        for (Vector<FileSplit> openSplit : openSplits) {
            combinedSplits.add(FileSplitUtil.combineFileSplits(openSplit, 0, openSplit.size()));
        }
        String msg = String.format("Combined %d splits into %d combined splits", splits.size(),
            combinedSplits.size());
        splits.clear();
        splits.addAll(combinedSplits);
        long t2 = System.currentTimeMillis();
        LOG.info(msg + " in " + ((t2 - t1) / 1000.0) + " seconds");
    }
    return splits;
}
From source file: eu.scape_project.pt.mapred.input.ControlFileInputFormat.java
License: Apache License

/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see NLineInputFormat#getSplits(JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }
    return splits;
}
From source file: format.OverlapInputFormat.java
License: BSD License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);
    List<FileStatus> results = new ArrayList<FileStatus>();
    // Configuration conf = HadoopUtils.getConfiguration(job);
    Configuration conf = job.getConfiguration();
    boolean recursive = conf.getBoolean("mapred.input.dir.recursive", false);
    Iterator<FileStatus> it = files.iterator();
    while (it.hasNext()) {
        FileStatus fileStatus = it.next();
        FileSystem fs = fileStatus.getPath().getFileSystem(conf);
        addInputPath(results, fs, fileStatus, recursive);
    }
    LOG.debug("Total pcap input paths to process: " + results.size());
    return results;
}
From source file: format.OverlapInputFormat.java
License: BSD License

/******
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    Path prevFile = null;
    FourMcBlockIndex prevIndex = null;
    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FourMcBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMcBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }
        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }
        if (index.isEmpty()) {
            // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }
        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();
        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());
        if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length="
                + (fourMcEnd - fourMcStart) + "]");
        }
    }
    return result;
}
******/

@Override
public List<InputSplit> getSplits(JobContext context) {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileSystem fs = null;
    Path file = OverlapInputFormat.getInputPaths(context)[0];
    Configuration conf = context.getConfiguration();
    long blocksize = Long.parseLong(conf.get("dfs.blocksize"));
    // long overlap = Long.parseLong(conf.get("pcap.defaultsize"));
    long overlap = 16;
    FSDataInputStream in = null;
    try {
        fs = FileSystem.get(context.getConfiguration());
        in = fs.open(file);
        long pos = 0;
        while (in.available() > 0) {
            FileSplit split = new FileSplit(file, pos, blocksize + overlap, new String[] {});
            splits.add(split);
            pos += blocksize;
            in.skip(blocksize + overlap);
        }
    } catch (IOException e) {
        LOG.error(e.getLocalizedMessage());
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (Exception e) {
            }
        }
        if (fs != null) {
            try {
                fs.close();
            } catch (Exception e) {
            }
        }
    }
    return splits;
}
From source file: fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat.java
License: LGPL

@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (null == codec) {
        return true;
    }
    return codec instanceof SplittableCompressionCodec;
}
From source file: gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext cx) throws IOException {
    Job modifiedJob = Job.getInstance(cx.getConfiguration());
    setSplitSize(modifiedJob);
    FileInputFormat.setInputDirRecursive(modifiedJob, true);
    return cleanSplits(super.getSplits(modifiedJob));
}
From source file: gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java
License: Apache License

private void setSplitSize(JobContext cx) {
    super.setMaxSplitSize(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
        DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
    super.setMinSplitSizeNode(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
        DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
}