Example usage for org.apache.hadoop.mapreduce.lib.input CombineFileSplit CombineFileSplit

Introduction

This page collects example usages of the org.apache.hadoop.mapreduce.lib.input.CombineFileSplit constructor.

Prototype

public CombineFileSplit(Path[] files, long[] start, long[] lengths, String[] locations) 
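
As a minimal sketch (the file names, sizes, and host names here are hypothetical, not taken from any of the sources below), the constructor takes parallel arrays: files[i], start[i], and lengths[i] describe the i-th file chunk, and locations lists the preferred hosts for the combined split (it may be null, as the first usage below shows).

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineFileSplitExample {
    public static void main(String[] args) {
        // Hypothetical paths and sizes, for illustration only
        Path[] files = { new Path("hdfs:///data/a.log"), new Path("hdfs:///data/b.log") };
        long[] start = { 0L, 512L };                // byte offset at which each chunk begins
        long[] lengths = { 1024L, 2048L };          // number of bytes to read from each chunk
        String[] locations = { "host1", "host2" };  // preferred hosts; may be null

        CombineFileSplit split = new CombineFileSplit(files, start, lengths, locations);
        System.out.println(split.getNumPaths());    // 2
        System.out.println(split.getLength());      // 3072, the total length across all chunks
    }
}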

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.HDFSSplitIterator.java

License: Apache License

public HDFSSplitIterator(FileSystem fs, Path[] paths, long[] offsets, long[] lengths, long startTime,
        long endTime) throws IOException {
    this.fs = fs;
    this.split = new CombineFileSplit(paths, offsets, lengths, null);
    while (currentHopIndex < split.getNumPaths() && !fs.exists(split.getPath(currentHopIndex))) {
        logger.warn(LocalizedMessage.create(LocalizedStrings.HOPLOG_CLEANED_UP_BY_JANITOR,
                split.getPath(currentHopIndex)));
        currentHopIndex++;
    }
    if (currentHopIndex == split.getNumPaths()) {
        this.hoplog = null;
        iterator = null;
    } else {
        this.hoplog = getHoplog(fs, split.getPath(currentHopIndex));
        iterator = hoplog.getReader().scan(split.getOffset(currentHopIndex), split.getLength(currentHopIndex));
    }
    this.startTime = startTime;
    this.endTime = endTime;
}
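
The iterator above walks the chunks of the split with CombineFileSplit's per-index accessors. A minimal sketch of that access pattern (the dumpChunks helper is made up for illustration):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class SplitChunkWalker {
    // Print each chunk of a combined split: backing file, start offset, and byte count
    static void dumpChunks(CombineFileSplit split) {
        for (int i = 0; i < split.getNumPaths(); i++) {
            Path path = split.getPath(i);     // file backing the i-th chunk
            long offset = split.getOffset(i); // starting byte offset within that file
            long length = split.getLength(i); // number of bytes in the chunk
            System.out.println(path + " @ " + offset + " (+" + length + " bytes)");
        }
    }
}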

From source file: com.hp.hpit.cs.MyCombineFileInputFormat.java

License: Apache License

/**
 * Create a single split from the list of blocks specified in validBlocks
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }

    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}

From source file: com.ikanow.aleph2.analytics.hadoop.assets.BeFileInputFormat.java

License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // combine everything into one mega split, capped at 5x the requested
                // number of records, to strike a balance between limiting the data
                // and making sure that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(t);
        throw new IOException(t);
    }
}

From source file: com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java

License: Apache License

/**
 * Create a single split from the list of blocks specified in validBlocks
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}

From source file: com.ikanow.aleph2.analytics.r.assets.BeFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // combine everything into one mega split, capped at 5x the requested
                // number of records, to strike a balance between limiting the data
                // and making sure that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));

        return Collections.emptyList();
    }
}

From source file: com.ikanow.aleph2.analytics.spark.assets.BeFileInputFormat_Pure.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(HadoopBatchEnrichmentUtils.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // combine everything into one mega split, capped at 5x the requested
                // number of records, to strike a balance between limiting the data
                // and making sure that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));

        return Collections.emptyList();
    }
}

From source file: com.twitter.hraven.mapreduce.CombineFileInputFormat.java

License: Apache License

/**
 * Create a single split from the list of blocks specified in validBlocks
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, List<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }

    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}

From source file: gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java

License: Apache License

/**
 * Set the number of locations in the split to SPLIT_MAX_NUM_LOCATIONS if it is larger than
 * SPLIT_MAX_NUM_LOCATIONS (MAPREDUCE-5186).
 */
private static List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
    if (VersionInfo.getVersion().compareTo("2.3.0") >= 0) {
        // This issue was fixed in 2.3.0, if newer version, no need to clean up splits
        return splits;
    }

    List<InputSplit> cleanedSplits = Lists.newArrayList();

    for (int i = 0; i < splits.size(); i++) {
        CombineFileSplit oldSplit = (CombineFileSplit) splits.get(i);
        String[] locations = oldSplit.getLocations();

        Preconditions.checkNotNull(locations, "CombineFileSplit.getLocations() returned null");

        if (locations.length > SPLIT_MAX_NUM_LOCATIONS) {
            locations = Arrays.copyOf(locations, SPLIT_MAX_NUM_LOCATIONS);
        }

        cleanedSplits.add(new CombineFileSplit(oldSplit.getPaths(), oldSplit.getStartOffsets(),
                oldSplit.getLengths(), locations));
    }
    return cleanedSplits;
}