Example usage for org.apache.hadoop.mapreduce.lib.input NLineInputFormat NLineInputFormat

Introduction

On this page you can find example usages of the NLineInputFormat() constructor of the class org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.

Prototype

NLineInputFormat

Source Link

Usage

From source file:org.mrgeo.hdfs.vector.DelimitedVectorInputFormat.java

License:Apache License

/**
 * Computes the input splits for the job.
 *
 * <p>When the boolean configuration value {@code USE_NLINE_FORMAT} is set, the splits
 * come from {@link NLineInputFormat} and the last split of every file is corrected
 * (see the comment in the body). Otherwise the splits come from {@link TextInputFormat}
 * unchanged.
 *
 * @param context the job context whose configuration selects the split strategy
 * @return the splits to process; when NLine splitting is used, the returned list may
 *         contain replacement {@link FileSplit} instances for the last split of a file
 * @throws IOException if the delegate input format or a split's location lookup fails
 * @throws InterruptedException if the delegate input format is interrupted
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
    if (!useNLineFormat) {
        return new TextInputFormat().getSplits(context);
    }

    List<InputSplit> splits = new NLineInputFormat().getSplits(context);
    // This is a workaround to what appears to be a bug in how NLineInputFormat
    // computes its splits. When there are multiple splits in a file, it seems
    // the start position in the last split is off by one, leaving a gap between
    // the end of the prior split and the start of the last one. The last split of
    // EVERY file in the list is checked, including a file that has only two splits
    // and a file that is followed by a single trailing split from another file.
    for (int index = 1; index < splits.size(); index++) {
        FileSplit currSplit = (FileSplit) splits.get(index);
        FileSplit priorSplit = (FileSplit) splits.get(index - 1);
        if (!currSplit.getPath().equals(priorSplit.getPath())) {
            // First (possibly only) split of its file: there is no prior split to align with.
            continue;
        }
        // A split is the last one of its file when it ends the list or when the
        // next split belongs to a different file.
        boolean lastSplitOfFile = index == splits.size() - 1
                || !((FileSplit) splits.get(index + 1)).getPath().equals(currSplit.getPath());
        if (!lastSplitOfFile) {
            continue;
        }
        long priorEnd = priorSplit.getStart() + priorSplit.getLength();
        if (priorEnd < currSplit.getStart()) {
            // Move the start of the last split back to where the prior split ends and
            // grow its length by one so that it still ends at the same position.
            FileSplit replacement = new FileSplit(currSplit.getPath(), priorEnd,
                    currSplit.getLength() + 1, currSplit.getLocations());
            log.info("Replacing split: " + currSplit.toString());
            log.info("  With split: " + replacement.toString());
            splits.set(index, replacement);
        }
    }
    return splits;
}

From source file:org.mrgeo.hdfs.vector.HdfsVectorInputFormat.java

License:Apache License

/**
 * Computes the input splits for the job.
 *
 * <p>When the boolean configuration value {@code USE_NLINE_FORMAT} is set, the splits
 * come from {@link NLineInputFormat} and the last split of every file is corrected
 * (see the comment in the body). Otherwise the splits come from {@link TextInputFormat}
 * unchanged.
 *
 * @param context the job context whose configuration selects the split strategy
 * @return the splits to process; when NLine splitting is used, the returned list may
 *         contain replacement {@link FileSplit} instances for the last split of a file
 * @throws IOException if the delegate input format or a split's location lookup fails
 * @throws InterruptedException if the delegate input format is interrupted
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
    if (!useNLineFormat) {
        return new TextInputFormat().getSplits(context);
    }

    List<InputSplit> splits = new NLineInputFormat().getSplits(context);
    // This is a workaround to what appears to be a bug in how NLineInputFormat
    // computes its splits. When there are multiple splits in a file, it seems
    // the start position in the last split is off by one, leaving a gap between
    // the end of the prior split and the start of the last one. The last split of
    // EVERY file in the list is checked, including a file that has only two splits
    // and a file that is followed by a single trailing split from another file.
    for (int index = 1; index < splits.size(); index++) {
        FileSplit currSplit = (FileSplit) splits.get(index);
        FileSplit priorSplit = (FileSplit) splits.get(index - 1);
        if (!currSplit.getPath().equals(priorSplit.getPath())) {
            // First (possibly only) split of its file: there is no prior split to align with.
            continue;
        }
        // A split is the last one of its file when it ends the list or when the
        // next split belongs to a different file.
        boolean lastSplitOfFile = index == splits.size() - 1
                || !((FileSplit) splits.get(index + 1)).getPath().equals(currSplit.getPath());
        if (!lastSplitOfFile) {
            continue;
        }
        long priorEnd = priorSplit.getStart() + priorSplit.getLength();
        if (priorEnd < currSplit.getStart()) {
            // Move the start of the last split back to where the prior split ends and
            // grow its length by one so that it still ends at the same position.
            FileSplit replacement = new FileSplit(currSplit.getPath(), priorEnd,
                    currSplit.getLength() + 1, currSplit.getLocations());
            log.info("Replacing split: " + currSplit.toString());
            log.info("  With split: " + replacement.toString());
            splits.set(index, replacement);
        }
    }
    return splits;
}