Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

This page collects usage examples for org.apache.hadoop.io.compress.CompressionCodec#createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException;

Document

Create a CompressionInputStream that will read from the given InputStream with the given Decompressor.
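
Before diving into the examples, here is a minimal, self-contained sketch of the typical pattern: borrow a Decompressor from the CodecPool, wrap the raw stream with createInputStream, and return the decompressor when done. The file path and codec resolution below are placeholder assumptions, not taken from any example on this page.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CreateInputStreamSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/data.gz"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the codec from the file extension (e.g. .gz -> GzipCodec)
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        Decompressor decompressor = null;
        InputStream in = fs.open(path);
        try {
            if (codec != null) {
                // Borrow a pooled decompressor and wrap the raw stream with it
                decompressor = CodecPool.getDecompressor(codec);
                in = codec.createInputStream(in, decompressor);
            }
            // ... read decompressed bytes from 'in' ...
        } finally {
            in.close();
            if (decompressor != null) {
                // Always return the decompressor so the pool can reuse it
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}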

Usage

From source file:com.ricemap.spateDB.core.SpatialSite.java

License:Apache License

public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);

    // Check if file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        decompressor = CodecPool.getDecompressor(codec);
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature); // note: read() may return fewer bytes than requested
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}
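
One caveat in the example above: InputStream.read(byte[]) may return fewer bytes than the buffer length, particularly on a decompressing stream. A defensive variant (a sketch, not part of the original source) uses Hadoop's IOUtils.readFully to guarantee the whole signature is read:

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.io.IOUtils;

public final class SignatureUtil {
    /** Reads exactly 'length' bytes or throws EOFException. */
    public static byte[] readSignature(InputStream in, int length) throws IOException {
        byte[] signature = new byte[length];
        // readFully loops over read() until the buffer is full
        IOUtils.readFully(in, signature, 0, signature.length);
        return signature;
    }
}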

From source file:com.rw.legion.input.LegionRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    /*
     * fileBroken tracks whether there's been an IOException while reading
     * this file. If there has, the record reader will simply stop reading
     * records for this particular file, rather than blowing up the whole
     * job.
     */
    fileBroken = false;
    currentLine = new Text();
    currentLineNumber = 0;

    FileSplit split = (FileSplit) genericSplit;

    if (split.getLength() == 0) {
        fileBroken = true;
    }

    // Load the Legion Objective.
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    legionObjective = ObjectiveDeserializer.deserialize(job.get("legion_objective"));

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // Open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    // Grab the file name to include with the data.
    fileName = file.toString();

    // Does the Legion Objective specify an input codec to use?
    if (legionObjective.getCodecOverride() != null) {
        isCompressedInput = true;
        CompressionCodec codec = new CompressionCodecFactory(job)
                .getCodecByClassName(legionObjective.getCodecOverride());
        decompressor = CodecPool.getDecompressor(codec);
        in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
        filePosition = fileIn;
    } else {
        CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
        if (null != codec) {
            isCompressedInput = true;
            decompressor = CodecPool.getDecompressor(codec);

            if (codec instanceof SplittableCompressionCodec) {
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
                start = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn;
            } else {
                in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
                filePosition = fileIn;
            }
        } else {
            fileIn.seek(start);
            in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    }

    /*
     * If this is not the first split, we always throw away first record
     * because we always (except the last split) read one extra line in
     * next() method.
     */
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }

    this.pos = start;
}
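
The override branch above resolves a codec by class name instead of by file extension. A minimal sketch of that lookup, with a placeholder codec class name:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecByNameSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // Any codec registered via io.compression.codecs can be looked up this way
        CompressionCodec codec = factory.getCodecByClassName(
                "org.apache.hadoop.io.compress.GzipCodec");
        System.out.println(codec == null ? "codec not found" : codec.getClass().getName());
    }
}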

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/**
 * Reads a list of file paths, one per line, from the input split.
 * The code in this method is redistributed from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;

    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;

    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip the BOM (Byte Order Mark).
            // Text only supports UTF-8, so we only need to check for the UTF-8
            // BOM (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found a UTF-8 BOM; strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and 
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        // Note: this log line is left over from LineRecordReader's long-line
        // handling; as written, it fires for every line that is read.
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
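
The BOM handling above is easy to reuse on its own. Here is a standalone sketch (class and method names are illustrative) that strips a leading UTF-8 BOM from a Text line in place:

import org.apache.hadoop.io.Text;

public final class BomUtil {
    /** Strips a leading UTF-8 BOM (0xEF,0xBB,0xBF) from the line, if present. */
    public static void stripUtf8Bom(Text line) {
        byte[] bytes = line.getBytes(); // backing array, valid up to getLength()
        int length = line.getLength();
        if (length >= 3 && bytes[0] == (byte) 0xEF
                && bytes[1] == (byte) 0xBB && bytes[2] == (byte) 0xBF) {
            if (length > 3) {
                // Copy first; calling set() with the same backing array is unsafe
                byte[] copy = line.copyBytes();
                line.set(copy, 3, length - 3);
            } else {
                line.clear();
            }
        }
    }
}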

From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java

License:Open Source License

/**
 * Checks whether a file is indexed using an R-tree or not. This allows
 * an operation to use the R-tree to speedup the processing if it exists.
 * This function opens the specified file and reads the first eight bytes
 * which include the R-tree signature. If the signatures matches with the
 * R-tree signature, true is returned. Otherwise, false is returned.
 * If the parameter is a path to a directory, only the first data file in that
 * directory is tested.
 * @param fs
 * @param path
 * @return
 * @throws IOException
 */
public static boolean isRTree(FileSystem fs, Path path) throws IOException {
    if (FileUtil.getExtensionWithoutCompression(path).equals("rtree"))
        return true;

    FileStatus file = fs.getFileStatus(path);
    Path fileToCheck;
    if (file.isDir()) {
        // Check any cell (e.g., first cell)
        GlobalIndex<Partition> gIndex = getGlobalIndex(fs, path);
        if (gIndex == null)
            return false;
        fileToCheck = new Path(path, gIndex.iterator().next().filename);
    } else {
        fileToCheck = file.getPath();
    }
    InputStream fileIn = fs.open(fileToCheck);

    // Check if file is compressed
    CompressionCodec codec = compressionCodecs.getCodec(fileToCheck);
    Decompressor decompressor = null;
    if (codec != null) {
        synchronized (compressionCodecs) {
            // CodecPool is not thread-safe
            decompressor = CodecPool.getDecompressor(codec);
        }
        fileIn = codec.createInputStream(fileIn, decompressor);
    }
    byte[] signature = new byte[RTreeFileMarkerB.length];
    fileIn.read(signature); // note: read() may return fewer bytes than requested
    fileIn.close();
    if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
    }
    return Arrays.equals(signature, SpatialSite.RTreeFileMarkerB);
}

From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java

License:Open Source License

/**
 * Sample a specific number of lines from a given file
 * @param fs
 * @param file
 * @param count
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());

        // Open the file and read the sample
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        int sampledLines = 0;

        if (codec != null) {
            // Special handling for compressed files, as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);

            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
            } else {
                // Non-splittable input, need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
            }
        } else {
            long pos = 0; // Current position in file

            // Generate random offsets and keep them sorted for IO efficiency
            Random rand = new Random(seed);
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample the generated numbers
            Text line = new Text2();
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until end of line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }

        return sampledLines;
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java

License:Open Source License

/**
 * Sample text lines from the given split with the given sampling ratio
 * @param fs
 * @param file
 * @param ratio
 * @param seed
 * @param output
 * @return
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
        ResultCollector<Text> output) throws IOException {

    InputStream in = null;
    Decompressor decompressor = null;
    int sampledLines;
    Text line = new Text2();

    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());

        if (codec != null) {
            // Special handling for compressed files, as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);

            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                // Skip first line if needed
                if (file.getStart() > 0)
                    start += readUntilEOL(cIn, line);

                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            } else {
                // Non-splittable input, need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                // No need to skip first line because we actually read the file from
                // the beginning
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            }
        } else {
            // Not a compressed file. Apply a more efficient, though approximate,
            // solution
            // Open the file and read the sample
            long pos = 0; // Current position in file
            if (file.getStart() > 0) {
                pos += in.skip(file.getStart());
                pos += readUntilEOL(in, line);
            }

            // Initialize the random variable which is used for sampling
            Random rand = new Random(seed);
            sampledLines = 0;

            // Read the first 10 lines to estimate the average record size
            long end = file.getStart() + file.getLength();
            for (int i = 0; i < 10 && pos < end; i++) {
                line.clear();
                pos += readUntilEOL(in, line);
                if (rand.nextFloat() < ratio) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }

            int averageLineSize = (int) ((pos - file.getStart()) / 10);
            int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample the generated numbers
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip until end of line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }

    return sampledLines;
}

From source file:format.OverlapLengthRecordReader.java

License:Apache License

public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    long partialRecordLength = start % recordLength;
    long numBytesToSkip = 0;

    /* This check is not necessary here, since we read one entire split. */
    /*
    if (partialRecordLength != 0) {
      numBytesToSkip = recordLength - partialRecordLength;
    }
    */

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        numRecordsRemainingInSplit = Long.MAX_VALUE;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        long splitSize = end - start - numBytesToSkip;
        /* This remains to be verified, since we assume recordLength == splitSize */
        //      numRecordsRemainingInSplit = (splitSize + recordLength - 1)/recordLength;
        numRecordsRemainingInSplit = 1;
        if (numRecordsRemainingInSplit < 0) {
            numRecordsRemainingInSplit = 0;
        }
        LOG.info("Expecting " + numRecordsRemainingInSplit + " records each with a length of " + recordLength
                + " bytes in the split with an effective size of " + splitSize + " bytes");
    }
    if (numBytesToSkip != 0) {
        start += inputStream.skip(numBytesToSkip);
    }
    this.pos = start;
}
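
As the log message above notes, the decompressed size of a non-splittable compressed split is unknown up front. Record readers in this situation typically report progress from the position in the underlying raw file instead. A hedged sketch of the usual helper, with field names mirroring the example above:

import java.io.IOException;
import org.apache.hadoop.fs.Seekable;

public class CompressedProgressSketch {
    private boolean isCompressedInput; // set when a codec wrapped the stream
    private Seekable filePosition;     // the codec stream, or the raw FSDataInputStream
    private long pos;                  // logical position for uncompressed input

    /** Progress position: the raw-file offset when the input is compressed. */
    private long getFilePosition() throws IOException {
        if (isCompressedInput && filePosition != null) {
            return filePosition.getPos();
        }
        return pos;
    }
}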

From source file:fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitFastqLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:hivemall.utils.hadoop.HadoopUtils.java

License:Open Source License

public static BufferedReader getBufferedReader(File file, MapredContext context) throws IOException {
    URI fileuri = file.toURI();
    Path path = new Path(fileuri);

    Configuration conf = context.getJobConf();
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(path);

    if (codec == null) {
        return new BufferedReader(new FileReader(file));
    } else {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        FileInputStream fis = new FileInputStream(file);
        CompressionInputStream cis = codec.createInputStream(fis, decompressor);
        BufferedReader br = new BufferedReaderExt(new InputStreamReader(cis), decompressor);
        return br;
    }
}
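
BufferedReaderExt is specific to the Hivemall project; judging from this method, its role is to return the pooled Decompressor when the reader is closed. A hedged sketch of what such a wrapper might look like:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledBufferedReader extends BufferedReader {
    private final Decompressor decompressor;

    public PooledBufferedReader(Reader in, Decompressor decompressor) {
        super(in);
        this.decompressor = decompressor;
    }

    @Override
    public void close() throws IOException {
        try {
            super.close();
        } finally {
            if (decompressor != null) {
                // Tie the decompressor's lifetime to the reader's
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}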

From source file:mapred.io.CustomRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}