Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
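
Before the real-world examples below, here is a minimal sketch of the common pattern they all share: an InputFormat (or any code holding a JobContext) obtains the job's Configuration via getConfiguration() and reads settings from it. The class name ExampleConfigurationInputFormat and the property key "example.max.mappers" are hypothetical, used only for illustration.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical InputFormat, used only to illustrate reading job settings
// through JobContext.getConfiguration().
public class ExampleConfigurationInputFormat extends InputFormat<Void, Void> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // The JobContext exposes the job's Configuration; read a custom
        // property ("example.max.mappers" is an illustrative key) with a default.
        Configuration conf = context.getConfiguration();
        int maxMappers = conf.getInt("example.max.mappers", 1);

        // A real implementation would build one split per unit of work,
        // capped at maxMappers; this sketch simply returns an empty list.
        List<InputSplit> splits = new ArrayList<InputSplit>();
        return splits;
    }

    @Override
    public RecordReader<Void, Void> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("Illustration only; no records are read.");
    }
}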

Usage

From source file:gobblin.runtime.mapreduce.GobblinOutputCommitter.java

License:Apache License

@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
    LOG.info("Aborting Job: " + jobContext.getJobID() + " with state: " + state);

    Configuration conf = jobContext.getConfiguration();

    URI fsUri = URI.create(conf.get(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI));
    FileSystem fs = FileSystem.get(fsUri, conf);

    Path mrJobDir = new Path(conf.get(ConfigurationKeys.MR_JOB_ROOT_DIR_KEY),
            conf.get(ConfigurationKeys.JOB_NAME_KEY));
    Path jobInputDir = new Path(mrJobDir, MRJobLauncher.INPUT_DIR_NAME);

    if (!fs.exists(jobInputDir) || !fs.isDirectory(jobInputDir)) {
        LOG.warn(String.format("%s either does not exist or is not a directory. No data to cleanup.",
                jobInputDir));
        return;
    }

    // Iterate through all files in the jobInputDir, each file should correspond to a serialized wu or mwu
    try {
        for (FileStatus status : fs.listStatus(jobInputDir, new WorkUnitFilter())) {

            Closer workUnitFileCloser = Closer.create();

            // If the file ends with ".wu" de-serialize it into a WorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.WORK_UNIT_FILE_EXTENSION)) {
                WorkUnit wu = WorkUnit.createEmpty();
                try {
                    wu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
            }

            // If the file ends with ".mwu" de-serialize it into a MultiWorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.MULTI_WORK_UNIT_FILE_EXTENSION)) {
                MultiWorkUnit mwu = MultiWorkUnit.createEmpty();
                try {
                    mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                for (WorkUnit wu : mwu.getWorkUnits()) {
                    JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
                }
            }
        }
    } finally {
        try {
            cleanUpWorkingDirectory(mrJobDir, fs);
        } finally {
            super.abortJob(jobContext, state);
        }
    }
}

From source file:gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    Path[] inputPaths = FileInputFormat.getInputPaths(context);
    if (inputPaths == null || inputPaths.length == 0) {
        throw new IOException("No input found!");
    }

    List<String> allPaths = Lists.newArrayList();

    for (Path path : inputPaths) {
        // path is a single work unit / multi work unit
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FileStatus[] inputs = fs.listStatus(path);

        if (inputs == null) {
            throw new IOException(String.format("Path %s does not exist.", path));
        }
        log.info(String.format("Found %d input files at %s: %s", inputs.length, path, Arrays.toString(inputs)));
        for (FileStatus input : inputs) {
            allPaths.add(input.getPath().toString());
        }
    }

    int maxMappers = getMaxMapper(context.getConfiguration());
    int numTasksPerMapper = allPaths.size() % maxMappers == 0 ? allPaths.size() / maxMappers
            : allPaths.size() / maxMappers + 1;

    List<InputSplit> splits = Lists.newArrayList();
    Iterator<String> pathsIt = allPaths.iterator();
    while (pathsIt.hasNext()) {
        Iterator<String> limitedIterator = Iterators.limit(pathsIt, numTasksPerMapper);
        splits.add(new GobblinSplit(Lists.newArrayList(limitedIterator)));
    }

    return splits;
}

From source file:gov.llnl.ontology.text.hbase.GzipTarInputFormat.java

License:Open Source License

/**
 * Returns a {@link List} of {@link FileSplit}s.  Each {@link FileSplit}
 * will be a gzipped tarball of xml documents.  Each tarred file should
 * contain a single document.
 */
public List<InputSplit> getSplits(JobContext context) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();

    // Get the list of zipped files to be processed and add each zipped file
    // as an InputSplit.
    FileSystem fs = FileSystem.get(context.getConfiguration());
    for (Path file : getInputPaths(context)) {
        // Check that the list of files exists.  Throw an exception if it
        // does not.
        if (fs.isDirectory(file) || !fs.exists(file))
            throw new IOException("File does not exist: " + file);

        // Read the contents of the file list and add each line as a
        // FileSplit.
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file)));
        for (String line = null; (line = br.readLine()) != null;)
            splits.add(new FileSplit(new Path(line), 0, Integer.MAX_VALUE, null));
    }
    return splits;
}

From source file:gr.ntua.h2rdf.inputFormat.FileTableInputFormat.java

License:Open Source License

public List<InputSplit> getSplits(JobContext context) throws IOException {

    List<InputSplit> splits = super.getSplits(context);
    List<InputSplit> spl = textFormat.getSplits(context);
    splits.addAll(spl);
    String p = context.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        context.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        context.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;

}

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/**
 * Get the minimum split size
 * @param job the job
 * @return the minimum number of bytes that can be in a split
 */
public static long getMinSplitSize(JobContext job) {
    return job.getConfiguration().getLong("mapred.min.split.size", 1L);
}

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/**
 * Get the maximum split size.
 * @param context the job to look at.
 * @return the maximum number of bytes a split can include
 */
public static long getMaxSplitSize(JobContext context) {
    return context.getConfiguration().getLong("mapred.max.split.size", Long.MAX_VALUE);
}
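
The two helpers above simply read the legacy split-size keys from the job's Configuration. A minimal driver-side sketch, assuming those same key names and a Hadoop 2.x-style Job API, shows how the values they return would be set before submitting the job; the class name and job name are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver class; only the configuration keys come from the code above.
public class SplitSizeDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Legacy key names matching the getters above; newer Hadoop releases
        // also recognize mapreduce.input.fileinputformat.split.minsize/maxsize.
        conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);   // 64 MB
        conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);  // 256 MB

        Job job = Job.getInstance(conf, "split-size-demo"); // placeholder job name
        // ... set input/output formats, paths, mapper and reducer here ...

        // With these settings, getMinSplitSize(job) would return 67108864
        // and getMaxSplitSize(job) would return 268435456.
    }
}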

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/** 
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));

    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}

From source file:gr.ntua.h2rdf.inputFormat.TableInputFormatBase.java

License:Open Source License

/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context  The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
 *   org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {

    String p = context.getConfiguration().get("mapred.fairscheduler.pool");
    max_tasks = Integer.parseInt(p.substring(p.indexOf("l") + 1));

    Iterator<Scan> scanIterator = scanList.iterator();
    Iterator<String> tableIterator = tableList.iterator();
    Iterator<String> varsIterator = varList.iterator();
    Iterator<String> fnameIterator = fnameList.iterator();

    try {
        SUBCLASS = (new H2RDFNode(Node.createURI("http://www.w3.org/2000/01/rdf-schema#subClassOf")))
                .getHashValue();
    } catch (NotSupportedDatatypeException e) {
        throw new IOException("Not supported datatype");
    }
    System.out.println("calculating splitnumber");
    conf = context.getConfiguration();
    Configuration HBconf = HBaseConfiguration.create();
    Scan scan = null;
    splits = new ArrayList<InputSplit>();
    while (scanIterator.hasNext()) {
        System.out.println("New Input BGP");
        scan = scanIterator.next();
        String tname = tableIterator.next();
        table = new HTable(HBconf, tname);
        keys = table.getStartEndKeys();
        String vars = varsIterator.next();
        String fname = fnameIterator.next();

        splitSubclass(scan, tname, vars, fname);
    }
    if (splits.size() <= max_tasks)
        context.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        context.getConfiguration().setInt("mapred.reduce.tasks", max_tasks);
    return splits;
}

From source file:gr.ntua.h2rdf.inputFormat2.MultiTableInputFormatBase.java

License:Open Source License

/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (scans.isEmpty()) {
        throw new IOException("No scans were provided.");
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (Scan scan : scans) {
        byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
        if (tableName == null)
            throw new IOException("A scan object did not have a table name");
        HTable table = new HTable(context.getConfiguration(), tableName);
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region for table : " + Bytes.toString(tableName));
        }
        int count = 0;

        byte[] startRow = scan.getStartRow();
        byte[] stopRow = scan.getStopRow();

        for (int i = 0; i < keys.getFirst().length; i++) {
            if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                continue;
            }
            String regionLocation = table.getRegionLocation(keys.getFirst()[i], false).getHostname();

            // determine if the given start and stop keys fall into the range
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0
                    || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
                    && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                        ? keys.getFirst()[i]
                        : startRow;
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                        && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                InputSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation);
                splits.add(split);
                if (LOG.isDebugEnabled())
                    LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
            }
        }
        table.close();
    }
    return splits;
}