List of usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
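The examples that follow call getConfiguration() inside InputFormat and OutputCommitter callbacks to read job-level settings. As a minimal sketch of the common pattern (the class name ExampleInputFormat and the "example.max.splits" key are illustrative, not taken from any of the source files below):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Illustrative subclass; the class name and the "example.max.splits" key are
// hypothetical and do not appear in the source files below.
public class ExampleInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // JobContext.getConfiguration() exposes the job's Configuration to
        // InputFormat, OutputFormat and OutputCommitter callbacks.
        Configuration conf = context.getConfiguration();
        int maxSplits = conf.getInt("example.max.splits", Integer.MAX_VALUE);

        // Compute the default splits, then cap them at the configured maximum.
        List<InputSplit> splits = super.getSplits(context);
        return splits.size() <= maxSplits ? splits : splits.subList(0, maxSplits);
    }
}

The Configuration returned here is the one the driver populated before submission, so any property set on the job is visible inside these callbacks.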
From source file: gobblin.runtime.mapreduce.GobblinOutputCommitter.java
License: Apache License

@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
    LOG.info("Aborting Job: " + jobContext.getJobID() + " with state: " + state);

    Configuration conf = jobContext.getConfiguration();

    URI fsUri = URI.create(conf.get(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI));
    FileSystem fs = FileSystem.get(fsUri, conf);

    Path mrJobDir = new Path(conf.get(ConfigurationKeys.MR_JOB_ROOT_DIR_KEY),
            conf.get(ConfigurationKeys.JOB_NAME_KEY));
    Path jobInputDir = new Path(mrJobDir, MRJobLauncher.INPUT_DIR_NAME);

    if (!fs.exists(jobInputDir) || !fs.isDirectory(jobInputDir)) {
        LOG.warn(String.format("%s either does not exist or is not a directory. No data to cleanup.",
                jobInputDir));
        return;
    }

    // Iterate through all files in the jobInputDir, each file should correspond to a serialized wu or mwu
    try {
        for (FileStatus status : fs.listStatus(jobInputDir, new WorkUnitFilter())) {
            Closer workUnitFileCloser = Closer.create();

            // If the file ends with ".wu" de-serialize it into a WorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.WORK_UNIT_FILE_EXTENSION)) {
                WorkUnit wu = WorkUnit.createEmpty();
                try {
                    wu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
            }

            // If the file ends with ".mwu" de-serialize it into a MultiWorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.MULTI_WORK_UNIT_FILE_EXTENSION)) {
                MultiWorkUnit mwu = MultiWorkUnit.createEmpty();
                try {
                    mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                for (WorkUnit wu : mwu.getWorkUnits()) {
                    JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
                }
            }
        }
    } finally {
        try {
            cleanUpWorkingDirectory(mrJobDir, fs);
        } finally {
            super.abortJob(jobContext, state);
        }
    }
}
From source file: gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Path[] inputPaths = FileInputFormat.getInputPaths(context);
    if (inputPaths == null || inputPaths.length == 0) {
        throw new IOException("No input found!");
    }

    List<String> allPaths = Lists.newArrayList();

    for (Path path : inputPaths) {
        // path is a single work unit / multi work unit
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FileStatus[] inputs = fs.listStatus(path);

        if (inputs == null) {
            throw new IOException(String.format("Path %s does not exist.", path));
        }
        log.info(String.format("Found %d input files at %s: %s", inputs.length, path, Arrays.toString(inputs)));
        for (FileStatus input : inputs) {
            allPaths.add(input.getPath().toString());
        }
    }

    int maxMappers = getMaxMapper(context.getConfiguration());
    int numTasksPerMapper = allPaths.size() % maxMappers == 0 ? allPaths.size() / maxMappers
            : allPaths.size() / maxMappers + 1;

    List<InputSplit> splits = Lists.newArrayList();
    Iterator<String> pathsIt = allPaths.iterator();
    while (pathsIt.hasNext()) {
        Iterator<String> limitedIterator = Iterators.limit(pathsIt, numTasksPerMapper);
        splits.add(new GobblinSplit(Lists.newArrayList(limitedIterator)));
    }

    return splits;
}
From source file: gov.llnl.ontology.text.hbase.GzipTarInputFormat.java
License: Open Source License

/**
 * Returns a {@link List} of {@link FileSplit}s. Each {@link FileSplit}
 * will be a gzipped tarball of xml documents. Each tarred file should
 * contain a single document.
 */
public List<InputSplit> getSplits(JobContext context) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();

    // Get the list of zipped files to be processed and add each zipped file
    // as an InputSplit.
    FileSystem fs = FileSystem.get(context.getConfiguration());
    for (Path file : getInputPaths(context)) {
        // Check that the list of files exists. Throw an exception if it
        // does not.
        if (fs.isDirectory(file) || !fs.exists(file))
            throw new IOException("File does not exist: " + file);

        // Read the contents of the file list and add each line as a
        // FileSplit.
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file)));
        for (String line = null; (line = br.readLine()) != null;)
            splits.add(new FileSplit(new Path(line), 0, Integer.MAX_VALUE, null));
    }
    return splits;
}
From source file: gr.ntua.h2rdf.inputFormat.FileTableInputFormat.java
License: Open Source License

public List<InputSplit> getSplits(JobContext context) throws IOException {
    List<InputSplit> splits = super.getSplits(context);
    List<InputSplit> spl = textFormat.getSplits(context);
    splits.addAll(spl);

    String p = context.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        context.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        context.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
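This format, and several of the h2rdf formats below, derive a reducer cap by parsing the digits that follow the first letter "l" in the fair-scheduler pool name read from the job's Configuration. A minimal driver-side sketch of how that property might be populated (the pool name "pool8" is a hypothetical example, not taken from the project):

// Hypothetical driver snippet: with a pool named "pool8",
// p.substring(p.indexOf("l") + 1) in getSplits() parses to 8,
// so at most 8 reduce tasks are requested.
Configuration conf = new Configuration();
conf.set("mapred.fairscheduler.pool", "pool8");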
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Get the minimum split size.
 * @param job the job
 * @return the minimum number of bytes that can be in a split
 */
public static long getMinSplitSize(JobContext job) {
    return job.getConfiguration().getLong("mapred.min.split.size", 1L);
}
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Get the maximum split size.
 * @param context the job to look at.
 * @return the maximum number of bytes a split can include
 */
public static long getMaxSplitSize(JobContext context) {
    return context.getConfiguration().getLong("mapred.max.split.size", Long.MAX_VALUE);
}
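Both getters read their values straight off the job's Configuration, so a driver can steer the split computation before submission. A minimal sketch, assuming the same property names (the 64 MB and 256 MB sizes are arbitrary examples, and Job.getInstance is the Hadoop 2+ API):

Configuration conf = new Configuration();
// Values read back by getMinSplitSize()/getMaxSplitSize() above; the sizes are arbitrary.
conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);   // at least 64 MB per split
conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);  // at most 256 MB per split
Job job = Job.getInstance(conf, "example-job");             // job name is illustrative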
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
From source file: gr.ntua.h2rdf.inputFormat.TableInputFormatBase.java
License: Open Source License

/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    String p = context.getConfiguration().get("mapred.fairscheduler.pool");
    max_tasks = Integer.parseInt(p.substring(p.indexOf("l") + 1));

    Iterator<Scan> scanIterator = scanList.iterator();
    Iterator<String> tableIterator = tableList.iterator();
    Iterator<String> varsIterator = varList.iterator();
    Iterator<String> fnameIterator = fnameList.iterator();
    try {
        SUBCLASS = (new H2RDFNode(Node.createURI("http://www.w3.org/2000/01/rdf-schema#subClassOf")))
                .getHashValue();
    } catch (NotSupportedDatatypeException e) {
        throw new IOException("Not supported datatype");
    }
    System.out.println("calculating splitnumber");
    conf = context.getConfiguration();
    Configuration HBconf = HBaseConfiguration.create();
    Scan scan = null;
    splits = new ArrayList<InputSplit>();
    while (scanIterator.hasNext()) {
        System.out.println("New Input BGP");
        scan = scanIterator.next();
        String tname = tableIterator.next();
        table = new HTable(HBconf, tname);
        keys = table.getStartEndKeys();
        String vars = varsIterator.next();
        String fname = fnameIterator.next();
        splitSubclass(scan, tname, vars, fname);
    }

    if (splits.size() <= max_tasks)
        context.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        context.getConfiguration().setInt("mapred.reduce.tasks", max_tasks);
    return splits;
}
From source file: gr.ntua.h2rdf.inputFormat2.MultiTableInputFormatBase.java
License: Open Source License

/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (scans.isEmpty()) {
        throw new IOException("No scans were provided.");
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (Scan scan : scans) {
        byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
        if (tableName == null)
            throw new IOException("A scan object did not have a table name");

        HTable table = new HTable(context.getConfiguration(), tableName);
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region for table : " + Bytes.toString(tableName));
        }
        int count = 0;

        byte[] startRow = scan.getStartRow();
        byte[] stopRow = scan.getStopRow();

        for (int i = 0; i < keys.getFirst().length; i++) {
            if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                continue;
            }
            String regionLocation = table.getRegionLocation(keys.getFirst()[i], false).getHostname();

            // determine if the given start and stop keys fall into the range
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0
                    || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
                    && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                        ? keys.getFirst()[i] : startRow;
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                        && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                InputSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation);
                splits.add(split);
                if (LOG.isDebugEnabled())
                    LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
            }
        }
        table.close();
    }
    return splits;
}
/** * Calculates the splits that will serve as input for the map tasks. The * number of splits matches the number of regions in a table. * * @param context The current job context. * @return The list of input splits./*from ww w .jav a 2s. co m*/ * @throws IOException When creating the list of splits fails. * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext) */ @Override public List<InputSplit> getSplits(JobContext context) throws IOException { if (scans.isEmpty()) { throw new IOException("No scans were provided."); } List<InputSplit> splits = new ArrayList<InputSplit>(); for (Scan scan : scans) { byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME); if (tableName == null) throw new IOException("A scan object did not have a table name"); HTable table = new HTable(context.getConfiguration(), tableName); Pair<byte[][], byte[][]> keys = table.getStartEndKeys(); if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) { throw new IOException("Expecting at least one region for table : " + Bytes.toString(tableName)); } int count = 0; byte[] startRow = scan.getStartRow(); byte[] stopRow = scan.getStopRow(); for (int i = 0; i < keys.getFirst().length; i++) { if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) { continue; } String regionLocation = table.getRegionLocation(keys.getFirst()[i], false).getHostname(); // determine if the given start and stop keys fall into the range if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) { byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow; byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow; InputSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation); splits.add(split); if (LOG.isDebugEnabled()) LOG.debug("getSplits: split -> " + (count++) + " -> " + split); } } table.close(); } return splits; }