Example usage for org.apache.hadoop.fs FileSystem open

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#open, drawn from open-source projects.

Prototype

public FSDataInputStream open(Path f) throws IOException

Document

Opens an FSDataInputStream at the indicated Path.
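
Before the project examples below, a minimal, self-contained sketch of the call itself may help: open returns a seekable FSDataInputStream that can be wrapped in ordinary java.io readers. The path used here is a placeholder, not taken from any of the projects below.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        // open() returns a seekable FSDataInputStream
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}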

Usage

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = inputFile.getFileSystem(conf);
    // getFileStatus() throws FileNotFoundException rather than returning null,
    // so existence is probed with exists()
    if (!fs.exists(inputFile)) {
        throw new IOException(inputFile + " does not exist");
    }
    if (fs.getFileStatus(inputFile).isDirectory()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
        }
        log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(),
                testSplitSize, testRandomSelectionPct);
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
            log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize,
                    testSplitPct);
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}", inputFile.getName(),
                    testSplitStart, splitLocation);
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if (lineCount - testSplitSize < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    inputFile, testSplitSize, lineCount - testSplitSize);
        }
    }
    int trainCount = 0;
    int testCount = 0;
    if (!useSequence) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
        Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
        Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

        try {

            String line;
            int pos = 0;
            while ((line = reader.readLine()) != null) {
                pos++;

                Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                writer.write(line);
                writer.write('\n');
            }

        } finally {
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    } else {
        SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>(
                inputFile, false, fs.getConf());
        SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        try {

            int pos = 0;
            while (iterator.hasNext()) {
                pos++;
                SequenceFile.Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                Pair<Writable, Writable> pair = iterator.next();
                writer.append(pair.getFirst(), pair.getSecond());
            }

        } finally {
            Closeables.close(iterator, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }
    log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount,
            trainCount, testCount, testSplitStart);

    // notify the callback, if one was registered, that this split completed
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}
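
For orientation, a hedged sketch of how this splitter might be driven. The setter names are assumptions based on the Mahout SplitInput API that this class appears to mirror; the paths are placeholders.

import org.apache.hadoop.fs.Path;

import com.netease.news.utils.SplitInput;

public class SplitDriver {
    public static void main(String[] args) throws Exception {
        // Hypothetical driver; setter names are assumed from the Mahout
        // SplitInput API that this class appears to mirror.
        SplitInput splitter = new SplitInput();
        splitter.setTestSplitPct(20);                                 // hold out 20% of lines
        splitter.setTrainingOutputDirectory(new Path("/data/train")); // placeholder paths
        splitter.setTestOutputDirectory(new Path("/data/test"));
        splitter.splitFile(new Path("/data/corpus.txt"));
    }
}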

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Count the lines in the specified file, as delimited by {@code BufferedReader.readLine()}.
 *
 * @param fs        the {@link FileSystem} on which the input file resides
 * @param inputFile the file whose lines will be counted
 * @param charset   the charset of the file to read
 * @return the number of lines in the input file
 * @throws IOException if there is a problem opening or reading the file
 */
public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
    int lineCount = 0;
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    try {
        while (reader.readLine() != null) {
            lineCount++;
        }
    } finally {
        Closeables.close(reader, true);
    }

    return lineCount;
}
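
On Java 7 and later the same count can be written with try-with-resources, which replaces the explicit Closeables handling; a minimal sketch of that variant:

public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
    int lineCount = 0;
    // try-with-resources closes the reader (and the underlying stream)
    // even if readLine() throws
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(fs.open(inputFile), charset))) {
        while (reader.readLine() != null) {
            lineCount++;
        }
    }
    return lineCount;
}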

From source file:com.netflix.aegisthus.input.AegisthusInputFormat.java

License:Apache License

/**
 * The main thing that addSSTableSplit handles is splitting SSTables using
 * their index when one is available. The general algorithm is that if the
 * file is larger than the block size plus some fuzz factor, it is split at
 * offsets read from the accompanying index file.
 */
public void addSSTableSplit(List<InputSplit> splits, JobContext job, FileStatus file) throws IOException {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);

        long bytesRemaining = length;

        Iterator<Long> scanner = null;
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (!fs.exists(compressionPath)) {
            // Only initialize if we are going to have more than a single
            // split
            if (fuzzySplit < length) {
                Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
                if (!fs.exists(indexPath)) {
                    fuzzySplit = length;
                } else {
                    FSDataInputStream fileIn = fs.open(indexPath);
                    scanner = new OffsetScanner(new DataInputStream(new BufferedInputStream(fileIn)),
                            indexPath.getName());
                }
            }
            long splitStart = 0;
            while (splitStart + fuzzySplit < length && scanner.hasNext()) {
                long splitSize = 0;
                // The scanner returns an offset from the start of the file.
                while (splitSize < maxSplitSize && scanner.hasNext()) {
                    splitSize = scanner.next() - splitStart;
                }
                int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
                LOG.info("split path: " + path.getName() + ":" + splitStart + ":" + splitSize);
                splits.add(new AegSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts(),
                        convertors));
                bytesRemaining -= splitSize;
                splitStart += splitSize;
            }
        }

        if (bytesRemaining != 0) {
            LOG.info("end path: " + path.getName() + ":" + (length - bytesRemaining) + ":" + bytesRemaining);
            splits.add(new AegSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts(), convertors, fs.exists(compressionPath),
                    compressionPath));
        }
    } else {
        LOG.info("skipping zero length file: " + path.toString());
    }
}

From source file:com.netflix.aegisthus.input.AegSplit.java

License:Apache License

public InputStream getInput(Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(path);
    InputStream dis = new DataInputStream(new BufferedInputStream(fileIn));
    if (compressed) {
        FSDataInputStream cmIn = fs.open(compressedPath);
        compressionMetadata = new CompressionMetadata(new BufferedInputStream(cmIn), end - start);
        dis = new CompressionInputStream(dis, compressionMetadata);
        end = compressionMetadata.getDataLength();
    }
    return dis;
}
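
Since open returns a seekable stream, a consumer of a split like the ones above typically positions the stream at the split's start offset before reading. A minimal sketch, with hypothetical start and end offsets standing in for the split's fields:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SplitReader {
    /**
     * Read the byte range [start, end) of a file, the way a consumer of a
     * split such as AegSplit would. The offsets are hypothetical parameters.
     */
    public static void readRange(Configuration conf, Path path, long start, long end)
            throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(start); // FSDataInputStream is Seekable
            long remaining = end - start;
            byte[] buffer = new byte[8192];
            while (remaining > 0) {
                int read = in.read(buffer, 0, (int) Math.min(buffer.length, remaining));
                if (read < 0) {
                    break; // unexpected EOF before the end of the range
                }
                // ... process buffer[0..read) ...
                remaining -= read;
            }
        }
    }
}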