Example usage for org.apache.hadoop.mapreduce.lib.input CombineFileSplit CombineFileSplit

Introduction

This page collects example usages of the org.apache.hadoop.mapreduce.lib.input.CombineFileSplit constructor.

Prototype

public CombineFileSplit(Path[] files, long[] start, long[] lengths, String[] locations) 
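
As a minimal sketch (the file names, sizes, and host names here are hypothetical, not taken from any of the sources below), the constructor takes parallel arrays: files[i], start[i], and lengths[i] describe the i-th file chunk, and locations lists the preferred hosts for the combined split (it may be null, as the first usage below shows).

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineFileSplitExample {
    public static void main(String[] args) {
        // Hypothetical paths and sizes, for illustration only
        Path[] files = { new Path("hdfs:///data/a.log"), new Path("hdfs:///data/b.log") };
        long[] start = { 0L, 512L };                // byte offset at which each chunk begins
        long[] lengths = { 1024L, 2048L };          // number of bytes to read from each chunk
        String[] locations = { "host1", "host2" };  // preferred hosts; may be null

        CombineFileSplit split = new CombineFileSplit(files, start, lengths, locations);
        System.out.println(split.getNumPaths());    // 2
        System.out.println(split.getLength());      // 3072, the total length across all chunks
    }
}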

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.HDFSSplitIterator.java

License: Apache License

public HDFSSplitIterator(FileSystem fs, Path[] paths, long[] offsets, long[] lengths, long startTime,
        long endTime) throws IOException {
    this.fs = fs;
    this.split = new CombineFileSplit(paths, offsets, lengths, null);
    while (currentHopIndex < split.getNumPaths() && !fs.exists(split.getPath(currentHopIndex))) {
        logger.warn(LocalizedMessage.create(LocalizedStrings.HOPLOG_CLEANED_UP_BY_JANITOR,
                split.getPath(currentHopIndex)));
        currentHopIndex++;
    }
    if (currentHopIndex == split.getNumPaths()) {
        this.hoplog = null;
        iterator = null;
    } else {
        this.hoplog = getHoplog(fs, split.getPath(currentHopIndex));
        iterator = hoplog.getReader().scan(split.getOffset(currentHopIndex), split.getLength(currentHopIndex));
    }
    this.startTime = startTime;
    this.endTime = endTime;
}
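
The iterator above walks the chunks of the split with CombineFileSplit's per-index accessors. A minimal sketch of that access pattern (the dumpChunks helper is made up for illustration):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class SplitChunkWalker {
    // Print each chunk of a combined split: backing file, start offset, and byte count
    static void dumpChunks(CombineFileSplit split) {
        for (int i = 0; i < split.getNumPaths(); i++) {
            Path path = split.getPath(i);     // file backing the i-th chunk
            long offset = split.getOffset(i); // starting byte offset within that file
            long length = split.getLength(i); // number of bytes in the chunk
            System.out.println(path + " @ " + offset + " (+" + length + " bytes)");
        }
    }
}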

From source file: com.hp.hpit.cs.MyCombineFileInputFormat.java

License: Apache License

/**
 * Create a single split from the list of blocks specified in validBlocks
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }

    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}

From source file: com.ikanow.aleph2.analytics.hadoop.assets.BeFileInputFormat.java

License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // combine everything into one mega split, capped at 5x the requested
                // number of records, to strike a balance between limiting the data
                // and making sure that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(t);
        throw new IOException(t);
    }
}

From source file: com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java

License: Apache License

/**
 * Create a single split from the list of blocks specified in validBlocks
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}

From source file: com.ikanow.aleph2.analytics.r.assets.BeFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // combine everything into one mega split, capped at 5x the requested
                // number of records, to strike a balance between limiting the data
                // and making sure that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));

        return Collections.emptyList();
    }
}

From source file: com.ikanow.aleph2.analytics.spark.assets.BeFileInputFormat_Pure.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(HadoopBatchEnrichmentUtils.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // combine everything into one mega split, capped at 5x the requested
                // number of records, to strike a balance between limiting the data
                // and making sure that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));

        return Collections.emptyList();
    }
}

From source file: com.twitter.hraven.mapreduce.CombineFileInputFormat.java

License: Apache License

/**
 * Create a single split from the list of blocks specified in validBlocks
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, List<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }

    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}

From source file: gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java

License: Apache License

/**
 * Set the number of locations in the split to SPLIT_MAX_NUM_LOCATIONS if it is larger than
 * SPLIT_MAX_NUM_LOCATIONS (MAPREDUCE-5186).
 */
private static List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
    if (VersionInfo.getVersion().compareTo("2.3.0") >= 0) {
        // This issue was fixed in 2.3.0, if newer version, no need to clean up splits
        return splits;
    }

    List<InputSplit> cleanedSplits = Lists.newArrayList();

    for (int i = 0; i < splits.size(); i++) {
        CombineFileSplit oldSplit = (CombineFileSplit) splits.get(i);
        String[] locations = oldSplit.getLocations();

        Preconditions.checkNotNull(locations, "CombineFileSplit.getLocations() returned null");

        if (locations.length > SPLIT_MAX_NUM_LOCATIONS) {
            locations = Arrays.copyOf(locations, SPLIT_MAX_NUM_LOCATIONS);
        }

        cleanedSplits.add(new CombineFileSplit(oldSplit.getPaths(), oldSplit.getStartOffsets(),
                oldSplit.getLengths(), locations));
    }
    return cleanedSplits;
}