List of usage examples for the org.apache.hadoop.mapred FileSplit constructor:
public FileSplit(Path file, long start, long length, String[] hosts)
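For orientation, here is a minimal sketch of constructing a FileSplit by hand, assuming an existing file on HDFS (the path is hypothetical); the hosts array is taken from the file's block locations so the scheduler can place the task near its data:

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path file = new Path("/data/input.txt"); // hypothetical input file
FileStatus status = fs.getFileStatus(file);

// One split covering the first block, located on that block's hosts.
BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
FileSplit split = new FileSplit(file, 0, blocks[0].getLength(), blocks[0].getHosts());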
From source file: PageInputFormat.java
License: Apache License

protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    return new FileSplit(file, start, length, hosts);
}
From source file: RunText.java
License: Apache License

public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        String[] valid = { "-p", "path", "-d", "delimiter", "v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }
    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();
    long len = fs.getFileStatus(path).getLen() / numThreads;
    List<Thread> threads = Lists.newArrayList();
    for (int i = 0; i < numThreads; i++) {
        FileSplit split = new FileSplit(path, i * len, len, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }
    runningThreads = new AtomicInteger(numThreads);
    for (Thread t : threads) {
        t.start();
    }
    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }
    for (Thread t : threads) {
        t.join();
    }
    fs.close();
}
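One caveat in the snippet above: getLen() / numThreads rounds down, so up to numThreads - 1 trailing bytes of the file are covered by no split. A minimal fix, sketched under the same variable names, is to stretch only the last split to the end of the file:

// Sketch: the last split absorbs the remainder left by integer division.
long fileLen = fs.getFileStatus(path).getLen();
long len = fileLen / numThreads;
for (int i = 0; i < numThreads; i++) {
    long splitLen = (i == numThreads - 1) ? fileLen - i * len : len;
    FileSplit split = new FileSplit(path, i * len, splitLen, new String[] { "" });
    threads.add(new Thread(new RunText(split, format)));
}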
From source file: StreamWikiDumpInputFormat.java
License: Apache License

public FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    return new FileSplit(file, start, length, hosts);
}
From source file: ca.sparkera.adapters.mapred.MainframeVBInputFormat.java
License: Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 */
@Override
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn;
        InputStream inputStream;
        fileIn = fs.open(path);
        inputStream = fileIn;
        filePosition = fileIn;
        long offset = 0;
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long bytesRemaining = length;
            long splitSize = 0;
            while (offset < length) {
                splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream);
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
                offset = length - bytesRemaining;
            }
            if (bytesRemaining != 0) {
                throw new IOException("Partial record(length = " + bytesRemaining
                        + ") found at the end of file " + path);
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
        if (inputStream != null) {
            inputStream.close();
            inputStream = null;
        }
    }
    java.util.Date date = new java.util.Date();
    System.out.println((new Timestamp(date.getTime())) + ",\t Split = 100% Total Splits - " + (++splitCount)
            + "\t Total Records in VB file - " + totalRecords);
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
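Note that the inputStream argument to computeSplitSize is not part of the stock FileInputFormat signature; presumably the VB-aware override reads the stream so that each split ends on a complete-record boundary, which is why a nonzero bytesRemaining after the loop is reported as a partial record. A hypothetical illustration of that idea, assuming IBM-style 4-byte record descriptor words (RDWs) whose first two bytes hold the record length including the RDW itself (not the actual implementation):

// Hypothetical: trim a tentative split size back to the last whole-record boundary.
static long alignToRecordBoundary(FSDataInputStream in, long start, long tentativeSize)
        throws IOException {
    long pos = 0;
    while (true) {
        in.seek(start + pos);
        int recLen = in.readUnsignedShort(); // RDW bytes 0-1: record length, RDW included
        if (recLen <= 0 || pos + recLen > tentativeSize) {
            break;
        }
        pos += recLen; // advance to the next record's RDW
    }
    return pos; // covers complete records only
}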
From source file: cascading.tap.hadoop.io.CombineFileRecordReaderWrapper.java
License: Open Source License

public CombineFileRecordReaderWrapper(CombineFileSplit split, Configuration conf, Reporter reporter,
        Integer idx) throws Exception {
    FileSplit fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx), split.getLength(idx),
            split.getLocations());

    Class<?> clz = conf.getClass(INDIVIDUAL_INPUT_FORMAT, null);
    FileInputFormat<K, V> inputFormat = (FileInputFormat<K, V>) clz.newInstance();

    if (inputFormat instanceof Configurable)
        ((Configurable) inputFormat).setConf(conf);

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
From source file: com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java
License: Apache License

@SuppressWarnings("deprecation")
public void testProjection() throws Exception {
    JobConf job = new JobConf();
    Integer defaultRank = new Integer(-1);
    String jsonSchema = "{\"type\":\"record\"," + "\"name\":\"org.apache.avro.mapred.Pair\","
            + "\"fields\": [ " + "{\"name\":\"rank\", \"type\":\"int\", \"default\": -1},"
            + "{\"name\":\"value\", \"type\":\"long\"}" + "]}";
    Schema readerSchema = Schema.parse(jsonSchema);
    AvroJob.setInputSchema(job, readerSchema);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path inputPath = new Path(dir + "/out" + "/part-00000" + AvroOutputFormat.EXT);
    FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath);
    FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job);

    AvroRecordReader<Pair<Integer, Long>> recordReader =
            new AvroRecordReader<Pair<Integer, Long>>(job, fileSplit);
    AvroWrapper<Pair<Integer, Long>> inputPair = new AvroWrapper<Pair<Integer, Long>>(null);
    NullWritable ignore = NullWritable.get();

    long sumOfCounts = 0;
    long numOfCounts = 0;
    while (recordReader.next(inputPair, ignore)) {
        Assert.assertEquals((Integer) inputPair.datum().get(0), defaultRank);
        sumOfCounts += (Long) inputPair.datum().get(1);
        numOfCounts++;
    }
    Assert.assertEquals(numOfCounts, WordCountUtil.COUNTS.size());

    long actualSumOfCounts = 0;
    for (Long count : WordCountUtil.COUNTS.values()) {
        actualSumOfCounts += count;
    }
    Assert.assertEquals(sumOfCounts, actualSumOfCounts);
}
From source file: com.alexholmes.hadooputils.combine.common.mapred.CommonCombineRecordReader.java
License: Apache License

/**
 * Moves on to the next split inside {@link #split}. The {@link #reader} will be {@code null}
 * once we have exhausted all the splits.
 *
 * @return true if we successfully moved on to the next split
 * @throws java.io.IOException if we hit io errors
 */
public boolean nextReader() throws IOException {
    // close the current reader and set it to null
    close();

    currentSplit++;

    if (currentSplit >= split.getPaths().length) {
        // hit the end of the line
        return false;
    }

    FileSplit fileSplit = new FileSplit(split.getPath(currentSplit), split.getOffset(currentSplit),
            split.getLength(currentSplit),
            split.getLocations() == null || split.getLocations().length - 1 < currentSplit ? null
                    : new String[] { split.getLocations()[currentSplit] });

    reader = engineerer.createRecordReader(conf, fileSplit);
    return true;
}
From source file: com.aliyun.fs.utils.OssInputUtils.java
License: Apache License

public FileSplit[] getSplits(String file, int numSplits) throws IOException {
    Path path = new Path(file);
    this.fs = FileSystem.get(path.toUri(), conf);
    fs.initialize(path.toUri(), conf);

    FileStatus[] files = fs.listStatus(path);
    long totalSize = 0;
    for (FileStatus file1 : files) {
        if (file1.isDirectory()) {
            throw new IOException("Not a file: " + file1.getPath());
        }
        totalSize += file1.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(
            conf.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), 1);

    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file2 : files) {
        Path fp = file2.getPath();
        long length = file2.getLen();
        if (length != 0) {
            long splitSize = Math.max(minSize, goalSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                FileSplit split = new FileSplit(fp, length - bytesRemaining, splitSize, new String[0]);
                splits.add(split);
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                FileSplit split = new FileSplit(fp, length - bytesRemaining, bytesRemaining, new String[0]);
                splits.add(split);
            }
        }
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
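The splits returned above carry empty host arrays, which is reasonable for an object store like OSS where there is no block locality to exploit. A hedged sketch of consuming them, assuming the files hold text and the OSS FileSystem is configured in conf:

// Sketch: read each split line by line with the mapred LineRecordReader.
for (FileSplit split : splits) {
    LineRecordReader reader = new LineRecordReader(conf, split);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
        // process one line of the split
    }
    reader.close();
}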
From source file: com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java
License: Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.warn("test go go go");

    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);

    long totalSize = 0; // compute total size
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
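For reference, computeSplitSize here follows the stock org.apache.hadoop.mapred.FileInputFormat logic: the split size is the block size, clamped from above by the per-file goal size and from below by the configured minimum:

protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize));
}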
From source file: com.conductor.s3.S3InputFormatUtils.java
License: Apache License

/**
 * Converts the {@link org.apache.hadoop.fs.FileStatus}s to
 * {@link org.apache.hadoop.mapred.InputSplit}s (MRV1 API).
 * <p>
 * This is taken directly from {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}, less any
 * file system operations that do not make sense when using {@code S3}.
 *
 * @param files
 *            the files to convert
 * @param minSize
 *            the minimum size of the splits
 * @param maxSize
 *            the maximum size of the splits
 * @return the splits.
 */
static List<InputSplit> convertToInputSplitsMRV1(final Iterable<FileStatus> files, final long minSize,
        final long maxSize) {
    final List<InputSplit> splits = Lists.newArrayList();
    for (final FileStatus file : files) {
        // check for valid data for this input format
        checkArgument(!file.isDirectory(), "Cannot pass directories to this method!");
        final String path = file.getPath().toString();
        checkArgument(path.startsWith("s3:") || path.startsWith("s3n:"), "Expected S3 input");

        // create splits out of file
        final long length = file.getLen();
        if (length > 0) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, splitSize, S3_SPLIT_HOST));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, bytesRemaining,
                        S3_SPLIT_HOST));
            }
        }
    }
    return splits;
}
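A note on the SPLIT_SLOP guard used in several of the loops above: in Hadoop's FileInputFormat it is 1.1, so a trailing chunk up to 10% larger than splitSize is emitted as one split rather than spawning a tiny remainder. For example, with splitSize = 128 MB and 140 MB remaining, 140 / 128 ≈ 1.09 ≤ 1.1, so the loop exits and a single 140 MB split is created.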