Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce JobContext.getConfiguration.

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
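
Before the examples, here is a minimal sketch of the typical call site: an InputFormat (or a Mapper's setup) reads job settings through JobContext.getConfiguration(). The class name and the property key example.min.splittable.size below are made up for illustration and do not come from any of the projects listed on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ThresholdAwareInputFormat extends TextInputFormat {

    // Hypothetical configuration key, used only for this sketch.
    public static final String MIN_SPLITTABLE_SIZE = "example.min.splittable.size";

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // getConfiguration() hands the job's Configuration to the InputFormat.
        Configuration conf = context.getConfiguration();
        long minSize = conf.getLong(MIN_SPLITTABLE_SIZE, 128L * 1024 * 1024);
        try {
            long len = file.getFileSystem(conf).getFileStatus(file).getLen();
            // Split only files at or above the configured threshold,
            // and defer to TextInputFormat's compression-codec check otherwise.
            return len >= minSize && super.isSplitable(context, file);
        } catch (IOException e) {
            return false; // safer not to split when the size cannot be determined
        }
    }
}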

Usage

From source file:edu.rutgers.ess.crs.utility.CSVInputFormat.java

License:Apache License

protected boolean isSplitable(final JobContext context, final Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java

License:Open Source License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    try {
        Configuration jobConf = job.getConfiguration();
        // The block filter associated with this job
        BlockFilter blockFilter = null;
        if (jobConf.get(InputQueryRange) != null) {
            // This job requires a range query
            blockFilter = new RangeFilter(OperationsParams.getShape(jobConf, InputQueryRange));
        }
        // Retrieve the BlockFilter set by the developers in the JobConf
        Class<? extends BlockFilter> blockFilterClass = jobConf.getClass(SpatialSite.FilterClass, null,
                BlockFilter.class);
        if (blockFilterClass != null) {
            BlockFilter userBlockFilter = blockFilterClass.newInstance();
            blockFilter = blockFilter == null ? userBlockFilter
                    : new CombineBlockFilter(blockFilter, userBlockFilter);
        }
        if (blockFilter == null) {
            // No block filter specified by user
            LOG.info("No block filter specified");
            return super.listStatus(job);
        }
        // Get all blocks the user wants to process
        blockFilter.configure(jobConf);

        // Filter files based on user specified filter function
        List<FileStatus> result = new ArrayList<FileStatus>();
        Path[] inputDirs = getInputPaths(job);

        for (Path dir : inputDirs) {
            FileSystem fs = dir.getFileSystem(jobConf);
            listStatus(fs, dir, result, blockFilter);
        }

        LOG.info("Spatial filter function matched with " + result.size() + " cells");

        return result;
    } catch (InstantiationException e) {
        LOG.warn(e);
        return super.listStatus(job);
    } catch (IllegalAccessException e) {
        LOG.warn(e);
        return super.listStatus(job);
    }
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java

License:Open Source License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Create compressionCodecs to be used by isSplitable method
        if (compressionCodecs == null)
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // HDF files are not splittable
        if (file.getName().toLowerCase().endsWith(".hdf"))
            return false;
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null && !(codec instanceof SplittableCompressionCodec))
            return false;

        // To avoid opening the file and checking the first 8-bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file less than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer to not split it
    }
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java

License:Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration jobConf = job.getConfiguration();
    if (jobConf.getInt(CombineSplits, 1) > 1) {
        long t1 = System.currentTimeMillis();
        int combine = jobConf.getInt(CombineSplits, 1);
        /*
         * Combine splits to reduce number of map tasks. Currently, this is done
         * using a greedy algorithm that combines splits based on how many hosts
         * they share.
         * TODO: Use a graph clustering algorithm where each vertex represents a
         * split, and each edge is weighted with number of shared hosts between
         * the two splits
         */
        Vector<Vector<FileSplit>> openSplits = new Vector<Vector<FileSplit>>();
        int maxNumberOfSplits = (int) Math.ceil((float) splits.size() / combine);
        List<InputSplit> combinedSplits = new Vector<InputSplit>();
        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            int maxSimilarity = -1; // Best similarity found so far
            int bestFit = -1; // Index of a random open split with max similarity
            int numMatches = 0; // Number of splits with max similarity
            for (int i = 0; i < openSplits.size(); i++) {
                Vector<FileSplit> splitList = openSplits.elementAt(i);
                int similarity = 0;
                for (FileSplit otherSplit : splitList) {
                    for (String host1 : fsplit.getLocations())
                        for (String host2 : otherSplit.getLocations())
                            if (host1.equals(host2))
                                similarity++;
                }
                if (similarity > maxSimilarity) {
                    maxSimilarity = similarity;
                    bestFit = i;
                    numMatches = 1;
                } else if (similarity == maxSimilarity) {
                    numMatches++;
                    // Replace with probability (1/numMatches) for a reservoir sample
                    double random = Math.random();
                    if (random < (double) 1 / numMatches) {
                        // Replace the element in the reservoir
                        bestFit = i;
                    }
                }
            }
            if (maxSimilarity > 0 || (openSplits.size() + combinedSplits.size()) >= maxNumberOfSplits) {
                // Good fit || cannot create more open splits,
                // add it to an existing open split.
                Vector<FileSplit> bestList = openSplits.elementAt(bestFit);
                bestList.add(fsplit);
                if (bestList.size() > combine) {
                    // Reached threshold for this list. Add it to combined splits
                    combinedSplits.add(FileSplitUtil.combineFileSplits(bestList, 0, bestList.size()));
                    // Remove it from open splits
                    openSplits.remove(bestFit);
                }
            } else {
                // Bad fit && can add a new split
                // Create a new open split just for this one
                Vector<FileSplit> newOpenSplit = new Vector<FileSplit>();
                newOpenSplit.add(fsplit);
                openSplits.addElement(newOpenSplit);
            }
        }

        // Add all remaining open splits to the list of combined splits
        for (Vector<FileSplit> openSplit : openSplits) {
            combinedSplits.add(FileSplitUtil.combineFileSplits(openSplit, 0, openSplit.size()));
        }

        String msg = String.format("Combined %d splits into %d combined splits", splits.size(),
                combinedSplits.size());
        splits.clear();
        splits.addAll(combinedSplits);
        long t2 = System.currentTimeMillis();
        LOG.info(msg + " in " + ((t2 - t1) / 1000.0) + " seconds");
    }
    return splits;
}
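
The combine factor above is read from the job's Configuration, so a driver enables it through the same getConfiguration() call before submission. The snippet below is a minimal sketch, not part of the quoted source; it assumes CombineSplits is a publicly visible constant on SpatialInputFormat3, as the quoted code suggests.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3;

public class CombineSplitsDriver {
    public static void main(String[] args) throws Exception {
        // Job implements JobContext, so this is the same getConfiguration()
        // that SpatialInputFormat3.getSplits() reads the combine factor from.
        Job job = Job.getInstance(new Configuration(), "combine-splits-example");
        job.getConfiguration().setInt(SpatialInputFormat3.CombineSplits, 8);
        job.setInputFormatClass(SpatialInputFormat3.class);
        // ... remaining input paths, mapper, and output setup omitted
    }
}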

From source file:eu.scape_project.pt.mapred.input.ControlFileInputFormat.java

License:Apache License

/** 
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 * 
 * @see NLineInputFormat#getSplits(JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }
    return splits;
}

From source file:format.OverlapInputFormat.java

License:BSD License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);
    List<FileStatus> results = new ArrayList<FileStatus>();
    //        Configuration conf = HadoopUtils.getConfiguration(job);
    Configuration conf = job.getConfiguration();
    boolean recursive = conf.getBoolean("mapred.input.dir.recursive", false);
    Iterator<FileStatus> it = files.iterator();
    while (it.hasNext()) {
        FileStatus fileStatus = it.next();
        FileSystem fs = fileStatus.getPath().getFileSystem(conf);
        addInputPath(results, fs, fileStatus, recursive);
    }

    LOG.debug("Total pcap input paths to process: " + results.size());
    return results;
}

From source file:format.OverlapInputFormat.java

License:BSD License

/******
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        Configuration conf = HadoopUtils.getConfiguration(job);

        List<InputSplit> defaultSplits = super.getSplits(job);
        List<InputSplit> result = new ArrayList<InputSplit>();

        Path prevFile = null;
        FourMcBlockIndex prevIndex = null;

        for (InputSplit genericSplit : defaultSplits) {
            // Load the index.
            FileSplit fileSplit = (FileSplit) genericSplit;
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(conf);

            FourMcBlockIndex index;
            if (file.equals(prevFile)) {
                index = prevIndex;
            } else {
                index = FourMcBlockIndex.readIndex(fs, file);
                prevFile = file;
                prevIndex = index;
            }

            if (index == null) {
                throw new IOException("BlockIndex unreadable for " + file);
            }

            if (index.isEmpty()) { // leave the default split for empty block index
                result.add(fileSplit);
                continue;
            }

            long start = fileSplit.getStart();
            long end = start + fileSplit.getLength();

            long fourMcStart = index.alignSliceStartToIndex(start, end);
            long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

            if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
                result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
                LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length=" + (fourMcEnd - fourMcStart) + "]");
            }
        }

        return result;
    }
 ******/

@Override
public List<InputSplit> getSplits(JobContext context) {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileSystem fs = null;
    Path file = OverlapInputFormat.getInputPaths(context)[0];
    Configuration conf = context.getConfiguration();
    long blocksize = Long.parseLong(conf.get("dfs.blocksize"));
    //        long overlap = Long.parseLong(conf.get("pcap.defaultsize"));
    long overlap = 16;
    FSDataInputStream in = null;
    try {
        fs = FileSystem.get(context.getConfiguration());
        in = fs.open(file);
        long pos = 0;
        while (in.available() > 0) {
            FileSplit split = new FileSplit(file, pos, blocksize + overlap, new String[] {});
            splits.add(split);
            pos += blocksize;
            in.skip(blocksize + overlap);
        }
    } catch (IOException e) {
        LOG.error(e.getLocalizedMessage());
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (Exception e) {
            }
        }
        if (fs != null) {
            try {
                fs.close();
            } catch (Exception e) {
            }
        }
    }
    return splits;
}

From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat.java

License:LGPL

@Override
protected boolean isSplitable(JobContext context, Path file) {

    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);

    if (null == codec) {
        return true;
    }

    return codec instanceof SplittableCompressionCodec;
}

From source file:gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext cx) throws IOException {
    Job modifiedJob = Job.getInstance(cx.getConfiguration());
    setSplitSize(modifiedJob);
    FileInputFormat.setInputDirRecursive(modifiedJob, true);
    return cleanSplits(super.getSplits(modifiedJob));
}

From source file:gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java

License:Apache License

private void setSplitSize(JobContext cx) {
    super.setMaxSplitSize(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
            DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
    super.setMinSplitSizeNode(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
            DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
}