Example usage for org.apache.hadoop.mapred FileSplit FileSplit

Introduction

On this page you can find example usage for the org.apache.hadoop.mapred FileSplit constructor FileSplit(Path file, long start, long length, String[] hosts).

Prototype

public FileSplit(Path file, long start, long length, String[] hosts) 

Document

Constructs a split with host information
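
As a quick orientation before the examples below, here is a minimal, self-contained sketch of calling this constructor directly. The path, offsets, and host names are hypothetical placeholders, not taken from any of the source files listed under Usage.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class FileSplitExample {
    public static void main(String[] args) {
        // Hypothetical file and block layout; substitute values from your own cluster.
        Path file = new Path("/data/input/part-00000");
        long start = 0L;                                // byte offset where the split begins
        long length = 128L * 1024 * 1024;               // split length in bytes (one 128 MB block here)
        String[] hosts = { "datanode1", "datanode2" };  // nodes that hold the underlying block

        FileSplit split = new FileSplit(file, start, length, hosts);
        System.out.println(split);                      // prints path:start+length
    }
}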

Usage

From source file:PageInputFormat.java

License:Apache License

protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    return new FileSplit(file, start, length, hosts);
}

From source file:RunText.java

License:Apache License

public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        String[] valid = { "-p", "path", "-d", "delimiter", "-v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }
    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();
    long len = fs.getFileStatus(path).getLen() / numThreads;

    List<Thread> threads = Lists.newArrayList();

    for (int i = 0; i < numThreads; i++) {
        FileSplit split = new FileSplit(path, i * len, len, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }

    runningThreads = new AtomicInteger(numThreads);

    for (Thread t : threads) {
        t.start();
    }

    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }

    for (Thread t : threads) {
        t.join();
    }

    fs.close();
}

From source file:StreamWikiDumpInputFormat.java

License:Apache License

public FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    return new FileSplit(file, start, length, hosts);
}

From source file:ca.sparkera.adapters.mapred.MainframeVBInputFormat.java

License:Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too
 * big.
 */
@Override
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn;
        InputStream inputStream;
        fileIn = fs.open(path);
        inputStream = fileIn;
        filePosition = fileIn;
        long offset = 0;
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();

            long bytesRemaining = length;
            long splitSize = 0;
            while (offset < length) {
                splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream);

                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));

                bytesRemaining -= splitSize;
                offset = length - bytesRemaining;
            }

            if (bytesRemaining != 0) {
                throw new IOException(
                        "Partial record(length = " + bytesRemaining + ") found at the end of file " + path);
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
        if (inputStream != null) {
            inputStream.close();
            inputStream = null;
        }
    }
    java.util.Date date = new java.util.Date();
    System.out.println((new Timestamp(date.getTime())) + ",\t Split = 100%  Total Splits - " + (++splitCount)
            + "\t Total Records in VB file - " + totalRecords);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:cascading.tap.hadoop.io.CombineFileRecordReaderWrapper.java

License:Open Source License

public CombineFileRecordReaderWrapper(CombineFileSplit split, Configuration conf, Reporter reporter,
        Integer idx) throws Exception {
    FileSplit fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx), split.getLength(idx),
            split.getLocations());

    Class<?> clz = conf.getClass(INDIVIDUAL_INPUT_FORMAT, null);
    FileInputFormat<K, V> inputFormat = (FileInputFormat<K, V>) clz.newInstance();

    if (inputFormat instanceof Configurable)
        ((Configurable) inputFormat).setConf(conf);

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}

From source file:com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java

License:Apache License

@SuppressWarnings("deprecation")
public void testProjection() throws Exception {
    JobConf job = new JobConf();

    Integer defaultRank = new Integer(-1);

    String jsonSchema = "{\"type\":\"record\"," + "\"name\":\"org.apache.avro.mapred.Pair\"," + "\"fields\": [ "
            + "{\"name\":\"rank\", \"type\":\"int\", \"default\": -1},"
            + "{\"name\":\"value\", \"type\":\"long\"}" + "]}";

    Schema readerSchema = Schema.parse(jsonSchema);

    AvroJob.setInputSchema(job, readerSchema);

    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path inputPath = new Path(dir + "/out" + "/part-00000" + AvroOutputFormat.EXT);
    FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath);
    FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job);

    AvroRecordReader<Pair<Integer, Long>> recordReader = new AvroRecordReader<Pair<Integer, Long>>(job,
            fileSplit);

    AvroWrapper<Pair<Integer, Long>> inputPair = new AvroWrapper<Pair<Integer, Long>>(null);
    NullWritable ignore = NullWritable.get();

    long sumOfCounts = 0;
    long numOfCounts = 0;
    while (recordReader.next(inputPair, ignore)) {
        Assert.assertEquals((Integer) inputPair.datum().get(0), defaultRank);
        sumOfCounts += (Long) inputPair.datum().get(1);
        numOfCounts++;
    }

    Assert.assertEquals(numOfCounts, WordCountUtil.COUNTS.size());

    long actualSumOfCounts = 0;
    for (Long count : WordCountUtil.COUNTS.values()) {
        actualSumOfCounts += count;
    }

    Assert.assertEquals(sumOfCounts, actualSumOfCounts);
}

From source file:com.alexholmes.hadooputils.combine.common.mapred.CommonCombineRecordReader.java

License:Apache License

/**
 * Moves on to the next split inside {@link #split}. The {@link #reader} will be {@code null}
 * once we have exhausted all the splits.
 *
 * @return true if we successfully moved on to the next split
 * @throws java.io.IOException if we hit io errors
 */
public boolean nextReader() throws IOException {
    // close the current reader and set it to null
    close();

    currentSplit++;

    if (currentSplit >= split.getPaths().length) {
        // hit the end of the line
        return false;
    }

    FileSplit fileSplit = new FileSplit(split.getPath(currentSplit), split.getOffset(currentSplit),
            split.getLength(currentSplit),
            split.getLocations() == null || split.getLocations().length - 1 < currentSplit ? null
                    : new String[] { split.getLocations()[currentSplit] });

    reader = engineerer.createRecordReader(conf, fileSplit);
    return true;
}

From source file:com.aliyun.fs.utils.OssInputUtils.java

License:Apache License

public FileSplit[] getSplits(String file, int numSplits) throws IOException {
    Path path = new Path(file);
    this.fs = FileSystem.get(path.toUri(), conf);
    fs.initialize(path.toUri(), conf);

    FileStatus[] files = fs.listStatus(path);
    long totalSize = 0;
    for (FileStatus file1 : files) {
        if (file1.isDirectory()) {
            throw new IOException("Not a file: " + file1.getPath());
        }
        totalSize += file1.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math
            .max(conf.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), 1);

    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file2 : files) {
        Path fp = file2.getPath();
        long length = file2.getLen();
        if (length != 0) {
            long splitSize = Math.max(minSize, goalSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                FileSplit split = new FileSplit(fp, length - bytesRemaining, splitSize, new String[0]);
                splits.add(split);
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                FileSplit split = new FileSplit(fp, length - bytesRemaining, bytesRemaining, new String[0]);
                splits.add(split);
            }
        }
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java

License:Apache License

/** Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big. */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.warn("test go go go");

    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            //Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}

From source file:com.conductor.s3.S3InputFormatUtils.java

License:Apache License

/**
 * Converts the {@link org.apache.hadoop.fs.FileStatus}s to {@link org.apache.hadoop.mapred.InputSplit}s (MRV1 API).
 * <p>
 * This is taken directly from {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}, less any file system
 * operations that do not make sense when using {@code S3}.
 * 
 * @param files
 *            the files to convert
 * @param minSize
 *            the minimum size of the splits
 * @param maxSize
 *            the maximum size of the splits
 * @return the splits.
 */
static List<InputSplit> convertToInputSplitsMRV1(final Iterable<FileStatus> files, final long minSize,
        final long maxSize) {
    final List<InputSplit> splits = Lists.newArrayList();
    for (final FileStatus file : files) {
        // check for valid data for this input format
        checkArgument(!file.isDirectory(), "Cannot pass directories to this method!");
        final String path = file.getPath().toString();
        checkArgument(path.startsWith("s3:") || path.startsWith("s3n:"), "Expected S3 input");

        // create splits out of file
        final long length = file.getLen();
        if (length > 0) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, splitSize, S3_SPLIT_HOST));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(
                        new FileSplit(file.getPath(), length - bytesRemaining, bytesRemaining, S3_SPLIT_HOST));
            }
        }
    }
    return splits;
}