Example usage of the org.apache.hadoop.mapreduce.lib.input.FileSplit constructor

List of usage examples for the org.apache.hadoop.mapreduce.lib.input.FileSplit constructor

Introduction

On this page you can find usage examples for the org.apache.hadoop.mapreduce.lib.input.FileSplit constructor.

Prototype

public FileSplit(Path file, long start, long length, String[] hosts) 

Source Link

Document

Constructs a split with host information

Usage

From source file:be.uantwerpen.adrem.eclat.util.SplitByKTextInputFormatTest.java

License:Apache License

/**
 * Requests a single split for the non-empty fixture file and expects exactly
 * one split of length 17 starting at offset 0.
 */
@Test
public void splits_Non_Empty_File_One_Split() throws IOException {
    File input = createTmpFile("in_Splits_Non_Empty_File_One_Split", non_Empty);
    String inputPath = input.getAbsolutePath();
    Configuration conf = createConfiguration();

    // Mocked status: only the path and the "is directory" flag are stubbed.
    FileStatus fileStatus = EasyMock.createMock(FileStatus.class);
    EasyMock.expect(fileStatus.getPath()).andReturn(new Path(inputPath));
    EasyMock.expect(fileStatus.isDir()).andReturn(false);
    EasyMock.replay(fileStatus);

    List<FileSplit> actual = SplitByKTextInputFormat.getSplitsForFile(fileStatus, conf, 1);

    String[] noHosts = new String[] {};
    List<FileSplit> expected = newArrayListWithCapacity(1);
    expected.add(new FileSplit(new Path(inputPath), 0, 17, noHosts));

    checkSplits(expected, actual);
}

From source file:be.uantwerpen.adrem.eclat.util.SplitByKTextInputFormatTest.java

License:Apache License

/**
 * Requests two splits for the non-empty fixture file and expects the ranges
 * (offset 0, length 12) and (offset 12, length 5).
 */
@Test
public void splits_Non_Empty_File_Ok_Splits() throws IOException {
    File input = createTmpFile("in_Splits_Non_Empty_File_Ok_Splits", non_Empty);
    String inputPath = input.getAbsolutePath();
    Configuration conf = createConfiguration();

    // Mocked status: only the path and the "is directory" flag are stubbed.
    FileStatus fileStatus = EasyMock.createMock(FileStatus.class);
    EasyMock.expect(fileStatus.getPath()).andReturn(new Path(inputPath));
    EasyMock.expect(fileStatus.isDir()).andReturn(false);
    EasyMock.replay(fileStatus);

    List<FileSplit> actual = SplitByKTextInputFormat.getSplitsForFile(fileStatus, conf, 2);

    List<FileSplit> expected = newArrayListWithCapacity(2);
    expected.add(new FileSplit(new Path(inputPath), 0, 12, new String[] {}));
    expected.add(new FileSplit(new Path(inputPath), 12, 5, new String[] {}));

    checkSplits(expected, actual);
}

From source file:be.uantwerpen.adrem.eclat.util.SplitByKTextInputFormatTest.java

License:Apache License

/**
 * Requests ten splits for the non-empty fixture file and expects only three
 * back: (offset 0, length 5), (offset 5, length 7) and (offset 12, length 5).
 */
@Test
public void splits_Non_Empty_File_More_Splits_Than_Lines() throws IOException {
    File input = createTmpFile("in_Splits_Non_Empty_File_More_Splits_Than_Lines", non_Empty);
    String inputPath = input.getAbsolutePath();
    Configuration conf = createConfiguration();

    // Mocked status: only the path and the "is directory" flag are stubbed.
    FileStatus fileStatus = EasyMock.createMock(FileStatus.class);
    EasyMock.expect(fileStatus.getPath()).andReturn(new Path(inputPath));
    EasyMock.expect(fileStatus.isDir()).andReturn(false);
    EasyMock.replay(fileStatus);

    List<FileSplit> actual = SplitByKTextInputFormat.getSplitsForFile(fileStatus, conf, 10);

    List<FileSplit> expected = newArrayListWithCapacity(3);
    expected.add(new FileSplit(new Path(inputPath), 0, 5, new String[] {}));
    expected.add(new FileSplit(new Path(inputPath), 5, 7, new String[] {}));
    expected.add(new FileSplit(new Path(inputPath), 12, 5, new String[] {}));

    checkSplits(expected, actual);
}

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Creates the {@link FileSplit} for a byte range of a file, pulling the range
 * back by one byte.
 * <p>
 * A range that starts at offset 0 keeps its start and is shortened by one
 * byte; any other range keeps its length and starts one byte earlier. The
 * split is created with an empty host list.
 *
 * @param fileName
 *          name of the file to which the split corresponds
 * @param begin
 *          begin of the range
 * @param length
 *          length of the range
 * @return file split object for the adjusted range
 */
protected static FileSplit createFileSplit(Path fileName, long begin, long length) {
    final String[] noHosts = new String[] {};
    if (begin == 0) {
        return new FileSplit(fileName, begin, length - 1, noHosts);
    }
    return new FileSplit(fileName, begin - 1, length, noHosts);
}

From source file:bsc.spark.examples.terasort.ehiggs.TeraScheduler.java

License:Apache License

/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. It will move placed splits to front and unplacable splits
 * to the end./*from ww  w.j  a v  a 2  s . c  o  m*/
 * @return a new list of FileSplits that are modified to have the
 *    best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
    solve();
    FileSplit[] result = new FileSplit[realSplits.length];
    int left = 0;
    int right = realSplits.length - 1;
    for (int i = 0; i < splits.length; ++i) {
        if (splits[i].isAssigned) {
            // copy the split and fix up the locations
            String[] newLocations = { splits[i].locations.get(0).hostname };
            realSplits[i] = new FileSplit(realSplits[i].getPath(), realSplits[i].getStart(),
                    realSplits[i].getLength(), newLocations);
            result[left++] = realSplits[i];
        } else {
            result[right--] = realSplits[i];
        }
    }
    List<InputSplit> ret = new ArrayList<InputSplit>();
    for (FileSplit fs : result) {
        ret.add(fs);
    }
    return ret;
}

From source file:bucket_sort.NLineInputFormat.java

License:Apache License

/**
 * Cuts a single file into {@link FileSplit}s of {@code numLinesPerSplit}
 * lines each; a final split holds whatever lines are left over.
 * <p>
 * All splits are created with an empty host list.
 *
 * @param status           status of the file to split; must not be a directory
 * @param conf             configuration used to resolve the file system and
 *                         to configure the line reader
 * @param numLinesPerSplit number of lines that go into each split
 * @return the splits in file order; empty if no line could be read
 * @throws IOException if {@code status} denotes a directory or reading fails
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    FSDataInputStream in = null;
    LineReader lr = null;
    try {
        in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split
                // by one character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            // Leftover lines form the last split.
            if (begin == 0) {
                // The whole file is a single short split: nothing precedes it.
                splits.add(new FileSplit(fileName, begin, length, new String[] {}));
            } else {
                // Start one character early, exactly like the full splits
                // above. A split that starts right on a line boundary at a
                // non-zero offset would have its first line thrown away by
                // LineRecordReader (see MAPREDUCE-4782).
                splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
            }
        }
    } finally {
        if (lr != null) {
            // LineReader.close() is expected to close the wrapped stream too.
            lr.close();
        } else if (in != null) {
            // The reader was never created, so the stream must be closed here.
            in.close();
        }
    }
    return splits;
}

From source file:com.alexholmes.hadooputils.combine.common.mapreduce.CommonCombineFileRecordReader.java

License:Apache License

/**
 * Advances to the next file inside {@link #split} and opens a fresh
 * {@link #reader} for it.
 *
 * @return {@code true} if a reader for the next split was created and
 *         initialized, {@code false} if there are no more splits
 * @throws java.io.IOException  if we hit io errors
 * @throws InterruptedException if we get interrupted
 */
public boolean nextReader() throws IOException, InterruptedException {
    // finish with the reader of the split we are leaving
    close();

    currentSplit++;
    if (currentSplit >= split.getPaths().length) {
        // every path has been consumed
        return false;
    }

    // Use the location at the same index as the split, when there is one;
    // otherwise the file split is created without host information.
    String[] allLocations = split.getLocations();
    String[] hosts;
    if (allLocations == null || currentSplit >= allLocations.length) {
        hosts = null;
    } else {
        hosts = new String[] { allLocations[currentSplit] };
    }

    FileSplit fileSplit = new FileSplit(split.getPath(currentSplit), split.getOffset(currentSplit),
            split.getLength(currentSplit), hosts);

    reader = engineerer.createRecordReader();
    reader.initialize(fileSplit, context);
    return true;
}

From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormat.java

License:Apache License

/**
 * Builds the split for the byte range {@code [start, end)} of {@code path},
 * using the owner node names that {@code blockMap} reports for that range
 * as the split's hosts.
 *
 * @param blockMap block map queried for the range
 * @param path     file the split belongs to
 * @param start    start offset of the range
 * @param end      end offset of the range; the split length is {@code end - start}
 * @return the file split for the range
 */
private static FileSplit getSplit(BlockMap blockMap, Path path, long start, long end) {
    List<String> ownerNames = blockMap.get(start, end).getOwnerNodeNames();
    String[] hosts = ownerNames.toArray(new String[ownerNames.size()]);
    return new FileSplit(path, start, end - start, hosts);
}

From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormatTest.java

License:Apache License

/**
 * Simple case for record readers./*from   w w  w .  j  a  v  a  2  s.co m*/
 * @throws Exception if failed
 */
@Test
public void reader_simple() throws Exception {
    Configuration conf = new ConfigurationProvider().newInstance();
    FileStatus stat = write(conf, 1);
    try (RecordReader<NullWritable, Text> reader = TemporaryInputFormat.createRecordReader()) {
        reader.initialize(new FileSplit(stat.getPath(), 0, stat.getLen(), null),
                JobCompatibility.newTaskAttemptContext(conf, id()));

        assertThat(reader.nextKeyValue(), is(true));
        assertThat(reader.getCurrentValue(), is(new Text("Hello, world!")));

        assertThat(reader.nextKeyValue(), is(false));
        assertThat((double) reader.getProgress(), closeTo(1.0, 0.01));
    }
}

From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 * <p>
 * Empty files produce one zero-length split without hosts, non-splitable
 * files one split covering the whole file, and splitable files a sequence
 * of splits of the configured split size followed by a remainder split.
 *
 * @param job
 *        The current BSPJob job
 * @return input splits
 * @throws IOException if an underlying file-system call fails
 */
@Override
public List<InputSplit> getSplits(BSPJob job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConf());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

        if (length == 0) {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
            continue;
        }
        if (!isSplitable(job, path)) {
            // the whole file becomes one split, placed on the first block's hosts
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            continue;
        }

        // Split size = (configured split size, or the block size when none is
        // configured) multiplied by the split factor; a factor of 1 leaves the
        // base size unchanged.
        long blockSize = file.getBlockSize();
        int splitFactor = job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
        long configuredSize = job.getSplitSize();
        long baseSize = (configuredSize == 0L) ? blockSize : configuredSize;
        long splitSize = baseSize * splitFactor;
        LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB");

        long remaining = length;
        // keep cutting full-size splits while the rest exceeds SPLIT_SLOP split sizes
        while (((double) remaining) / splitSize > SPLIT_SLOP) {
            long offset = length - remaining;
            int blkIndex = getBlockIndex(blkLocations, offset);
            splits.add(new FileSplit(path, offset, splitSize, blkLocations[blkIndex].getHosts()));
            remaining -= splitSize;
        }
        if (remaining != 0) {
            // the tail goes into one last split on the hosts of the final block
            splits.add(new FileSplit(path, length - remaining, remaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    }
    LOG.info("[Split Number] " + splits.size());
    return splits;
}