Example usage for the org.apache.hadoop.mapreduce.lib.input.FileSplit constructor FileSplit(Path, long, long, String[])

Introduction

On this page you can find example usages of the FileSplit(Path, long, long, String[]) constructor of org.apache.hadoop.mapreduce.lib.input.FileSplit.

Prototype

public FileSplit(Path file, long start, long length, String[] hosts)

Source Link

Document

Constructs a split with host information

Usage

From source file:be.uantwerpen.adrem.eclat.util.SplitByKTextInputFormatTest.java

License:Apache License

/**
 * Requesting a single split for the non-empty fixture file must yield exactly
 * one split that starts at offset 0 and has length 17.
 */
@Test
public void splits_Non_Empty_File_One_Split() throws IOException {
    File input = createTmpFile("in_Splits_Non_Empty_File_One_Split", non_Empty);
    Configuration configuration = createConfiguration();
    String location = input.getAbsolutePath();

    // The mocked status only needs to expose the path and report "not a directory".
    FileStatus fileStatus = EasyMock.createMock(FileStatus.class);
    EasyMock.expect(fileStatus.getPath()).andReturn(new Path(location));
    EasyMock.expect(fileStatus.isDir()).andReturn(false);
    EasyMock.replay(fileStatus);

    List<FileSplit> actual = SplitByKTextInputFormat.getSplitsForFile(fileStatus, configuration, 1);

    final String[] noHosts = new String[] {};
    List<FileSplit> wanted = newArrayListWithCapacity(1);
    wanted.add(new FileSplit(new Path(location), 0, 17, noHosts));

    checkSplits(wanted, actual);
}

From source file:be.uantwerpen.adrem.eclat.util.SplitByKTextInputFormatTest.java

License:Apache License

/**
 * Requesting two splits for the non-empty fixture file must yield two
 * contiguous splits: offset 0 with length 12, then offset 12 with length 5.
 */
@Test
public void splits_Non_Empty_File_Ok_Splits() throws IOException {
    File input = createTmpFile("in_Splits_Non_Empty_File_Ok_Splits", non_Empty);
    Configuration configuration = createConfiguration();
    String location = input.getAbsolutePath();

    // The mocked status only needs to expose the path and report "not a directory".
    FileStatus fileStatus = EasyMock.createMock(FileStatus.class);
    EasyMock.expect(fileStatus.getPath()).andReturn(new Path(location));
    EasyMock.expect(fileStatus.isDir()).andReturn(false);
    EasyMock.replay(fileStatus);

    List<FileSplit> actual = SplitByKTextInputFormat.getSplitsForFile(fileStatus, configuration, 2);

    List<FileSplit> wanted = newArrayListWithCapacity(2);
    wanted.add(new FileSplit(new Path(location), 0, 12, new String[] {}));
    wanted.add(new FileSplit(new Path(location), 12, 5, new String[] {}));

    checkSplits(wanted, actual);
}

From source file:be.uantwerpen.adrem.eclat.util.SplitByKTextInputFormatTest.java

License:Apache License

/**
 * Requesting ten splits for the non-empty fixture file must yield only three
 * splits: (offset 0, length 5), (offset 5, length 7) and (offset 12, length 5).
 */
@Test
public void splits_Non_Empty_File_More_Splits_Than_Lines() throws IOException {
    File input = createTmpFile("in_Splits_Non_Empty_File_More_Splits_Than_Lines", non_Empty);
    Configuration configuration = createConfiguration();
    String location = input.getAbsolutePath();

    // The mocked status only needs to expose the path and report "not a directory".
    FileStatus fileStatus = EasyMock.createMock(FileStatus.class);
    EasyMock.expect(fileStatus.getPath()).andReturn(new Path(location));
    EasyMock.expect(fileStatus.isDir()).andReturn(false);
    EasyMock.replay(fileStatus);

    List<FileSplit> actual = SplitByKTextInputFormat.getSplitsForFile(fileStatus, configuration, 10);

    List<FileSplit> wanted = newArrayListWithCapacity(3);
    wanted.add(new FileSplit(new Path(location), 0, 5, new String[] {}));
    wanted.add(new FileSplit(new Path(location), 5, 7, new String[] {}));
    wanted.add(new FileSplit(new Path(location), 12, 5, new String[] {}));

    checkSplits(wanted, actual);
}

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Creates a new file split whose range is shifted back by one byte relative
 * to the requested {@code [begin, begin + length)} range.
 * <p>
 * A split that begins at offset 0 keeps its start and is shortened by one
 * byte; any other split keeps its length and starts one byte earlier. In both
 * cases the split ends one byte before {@code begin + length}. The split
 * carries no host information (an empty host array).
 *
 * @param fileName
 *          name of the file to which the split corresponds
 * @param begin
 *          requested begin offset of the split
 * @param length
 *          requested length of the split
 * @return the file split object
 */
protected static FileSplit createFileSplit(Path fileName, long begin, long length) {
    final String[] noHosts = new String[] {};
    if (begin == 0) {
        // Nothing precedes the first split, so only its end can move back.
        return new FileSplit(fileName, begin, length - 1, noHosts);
    }
    return new FileSplit(fileName, begin - 1, length, noHosts);
}

From source file:bsc.spark.examples.terasort.ehiggs.TeraScheduler.java

License:Apache License

/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. It will move placed splits to front and unplacable splits
 * to the end./*from ww  w.j  a v  a 2  s . c  o  m*/
 * @return a new list of FileSplits that are modified to have the
 *    best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
    solve();
    FileSplit[] result = new FileSplit[realSplits.length];
    int left = 0;
    int right = realSplits.length - 1;
    for (int i = 0; i < splits.length; ++i) {
        if (splits[i].isAssigned) {
            // copy the split and fix up the locations
            String[] newLocations = { splits[i].locations.get(0).hostname };
            realSplits[i] = new FileSplit(realSplits[i].getPath(), realSplits[i].getStart(),
                    realSplits[i].getLength(), newLocations);
            result[left++] = realSplits[i];
        } else {
            result[right--] = realSplits[i];
        }
    }
    List<InputSplit> ret = new ArrayList<InputSplit>();
    for (FileSplit fs : result) {
        ret.add(fs);
    }
    return ret;
}

From source file:bucket_sort.NLineInputFormat.java

License:Apache License

/**
 * Splits a single file into {@link FileSplit}s of {@code numLinesPerSplit}
 * lines each; the last split holds whatever lines remain.
 * <p>
 * The file is read once, line by line, to find the byte offsets of the
 * split boundaries. All splits are created without host information.
 *
 * @param status
 *          status of the file to split; must not denote a directory
 * @param conf
 *          configuration used to resolve the file system and the line reader
 * @param numLinesPerSplit
 *          number of lines each split should contain
 * @return the splits covering the file, in file order
 * @throws IOException
 *           if {@code status} denotes a directory or the file cannot be read
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split
                // by one character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            // The trailing partial split must follow the same convention as the
            // full splits above: a split that does not start the file begins one
            // byte early (on the previous line's terminator). LineRecordReader
            // discards the first "line" of any split with a non-zero start, so
            // starting exactly at `begin` would silently drop the first real
            // line of the last split (see MAPREDUCE-4782).
            long start = (begin == 0) ? begin : begin - 1;
            splits.add(new FileSplit(fileName, start, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}

From source file:com.alexholmes.hadooputils.combine.common.mapreduce.CommonCombineFileRecordReader.java

License:Apache License

/**
 * Advances to the next split inside {@link #split}. The current reader is
 * closed first; when no further split exists the method returns without
 * creating a new reader.
 *
 * @return true if we successfully moved on to the next split
 * @throws java.io.IOException  if we hit io errors
 * @throws InterruptedException if we get interrupted
 */
public boolean nextReader() throws IOException, InterruptedException {
    // close the current reader and set it to null
    close();

    currentSplit++;

    boolean exhausted = currentSplit >= split.getPaths().length;
    if (exhausted) {
        // hit the end of the line
        return false;
    }

    FileSplit next = new FileSplit(split.getPath(currentSplit), split.getOffset(currentSplit),
            split.getLength(currentSplit), hostsForCurrentSplit());

    reader = engineerer.createRecordReader();
    reader.initialize(next, context);
    return true;
}

/**
 * Picks the location entry at the current split index, if there is one.
 *
 * @return a one-element array holding that location, or {@code null} when the
 *         combined split has no locations or fewer locations than the index
 * @throws IOException if the locations cannot be obtained
 */
private String[] hostsForCurrentSplit() throws IOException {
    if (split.getLocations() == null || split.getLocations().length <= currentSplit) {
        return null;
    }
    return new String[] { split.getLocations()[currentSplit] };
}

From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormat.java

License:Apache License

/**
 * Builds the split for the byte range {@code [start, end)} of a file, using
 * the owner node names of the matching block-map fragment as its hosts.
 *
 * @param blockMap block map queried for the fragment covering the range
 * @param path file the split belongs to
 * @param start start offset of the split
 * @param end end offset of the split; the split length is {@code end - start}
 * @return the created split
 */
private static FileSplit getSplit(BlockMap blockMap, Path path, long start, long end) {
    List<String> ownerNames = blockMap.get(start, end).getOwnerNodeNames();
    String[] hosts = ownerNames.toArray(new String[ownerNames.size()]);
    return new FileSplit(path, start, end - start, hosts);
}

From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormatTest.java

License:Apache License

/**
 * Simple case for record readers./*from   w w  w .  j  a  v  a  2  s.co m*/
 * @throws Exception if failed
 */
@Test
public void reader_simple() throws Exception {
    Configuration conf = new ConfigurationProvider().newInstance();
    FileStatus stat = write(conf, 1);
    try (RecordReader<NullWritable, Text> reader = TemporaryInputFormat.createRecordReader()) {
        reader.initialize(new FileSplit(stat.getPath(), 0, stat.getLen(), null),
                JobCompatibility.newTaskAttemptContext(conf, id()));

        assertThat(reader.nextKeyValue(), is(true));
        assertThat(reader.getCurrentValue(), is(new Text("Hello, world!")));

        assertThat(reader.nextKeyValue(), is(false));
        assertThat((double) reader.getProgress(), closeTo(1.0, 0.01));
    }
}

From source file:com.chinamobile.bcbsp.io.BSPFileInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 * <p>
 * Zero-length files produce one empty split without hosts, non-splitable
 * files produce one split covering the whole file, and splitable files are
 * cut into pieces of the effective split size. The effective split size is
 * the job's split size (or the file's block size when the job's split size
 * is 0) multiplied by the job's split factor.
 *
 * @param job
 *        The current BSPJob job
 * @return input splits
 * @throws IOException
 *         if file system access fails
 */
@Override
public List<InputSplit> getSplits(BSPJob job) throws IOException {
    List<InputSplit> result = new ArrayList<InputSplit>();
    for (FileStatus status : listStatus(job)) {
        Path path = status.getPath();
        FileSystem fs = path.getFileSystem(job.getConf());
        long length = status.getLen();
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, length);

        if (length == 0) {
            // Create empty hosts array for zero length files
            result.add(new FileSplit(path, 0, length, new String[0]));
            continue;
        }
        if (!isSplitable(job, path)) {
            result.add(new FileSplit(path, 0, length, blocks[0].getHosts()));
            continue;
        }

        long blockSize = status.getBlockSize();
        int factor = job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
        // A factor of 1 leaves the base size unchanged, so one multiplication
        // covers both the "factor == 1" and the "factor != 1" configuration.
        long splitSize = (job.getSplitSize() == 0L) ? blockSize * factor : job.getSplitSize() * factor;
        LOG.info("[Split Size] " + (splitSize / (1024 * 1024)) + " MB");

        // Emit full-size splits while the remainder exceeds the slop threshold.
        long offset = 0;
        while (((double) (length - offset)) / splitSize > SPLIT_SLOP) {
            int blkIndex = getBlockIndex(blocks, offset);
            result.add(new FileSplit(path, offset, splitSize, blocks[blkIndex].getHosts()));
            offset += splitSize;
        }

        // Whatever is left becomes the final split, placed on the last block's hosts.
        long tail = length - offset;
        if (tail != 0) {
            result.add(new FileSplit(path, offset, tail, blocks[blocks.length - 1].getHosts()));
        }
    }
    LOG.info("[Split Number] " + result.size());
    return result;
}