List of usage examples for org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec
public CompressionCodec getCodec(Path file)
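A minimal sketch of the typical pattern first (the path name is illustrative): CompressionCodecFactory resolves a codec from the file-name suffix, e.g. .gz maps to GzipCodec, and getCodec returns null when no registered suffix matches, meaning the file can be read as-is.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class GetCodecExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/data/input.txt.gz"); // illustrative path
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the codec from the file suffix (.gz -> GzipCodec)
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        // A null codec means no registered suffix matched: read the file as-is
        try (InputStream in = (codec == null)
                ? fs.open(path)
                : codec.createInputStream(fs.open(path))) {
            // ... consume the (decompressed) stream ...
        }
    }
}

Every example below follows this null check, differing mainly in how the compressed input is wired into split handling, sampling, or buffered reading.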
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads file paths, one per line, into a list.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fis, decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip the BOM (Byte Order Mark). Text only supports UTF-8, so we
            // only need to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the
            // start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found a UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }
        paths.add(nextLine.toString());
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file:edu.uci.ics.hyracks.imru.file.HDFSUtils.java
License:Apache License
/**
 * Opens a file in HDFS for reading, performing automatic
 * decompression as necessary.
 *
 * @param dfs The HDFS file system object.
 * @param conf The HDFS configuration.
 * @param path The path to the file.
 * @return An InputStream for reading the file.
 * @throws IOException
 */
public static InputStream open(FileSystem dfs, Configuration conf, Path path) throws IOException {
    FSDataInputStream fin = dfs.open(path);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(path);
    if (codec != null) {
        return codec.createInputStream(fin);
    } else {
        return fin;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Samples a specific number of lines from a given file split.
 * @param file the split to sample from
 * @param conf the job configuration
 * @param count the number of lines to sample
 * @param seed the random seed
 * @param output collector for the sampled lines
 * @return the number of sampled lines
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());

        // Open the file and read the sample
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());
        int sampledLines = 0;

        if (codec != null) {
            // Special handling for compressed files, as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec: we can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
            } else {
                // Non-splittable input: we need to start from the beginning
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
            }
        } else {
            long pos = 0; // Current position in the file
            // Generate random offsets and keep them sorted for I/O efficiency
            Random rand = new Random(seed);
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample at the generated offsets
            Text line = new Text2();
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip to the end of the current (likely partial) line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
        return sampledLines;
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License:Open Source License
/**
 * Samples text lines from the given split with the given sampling ratio.
 * @param file the split to sample from
 * @param conf the job configuration
 * @param ratio the fraction of lines to sample
 * @param seed the random seed
 * @param output collector for the sampled lines
 * @return the number of sampled lines
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
        ResultCollector<Text> output) throws IOException {
    InputStream in = null;
    Decompressor decompressor = null;
    int sampledLines;
    Text line = new Text2();
    try {
        CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
        FileSystem fs = file.getPath().getFileSystem(conf);
        in = fs.open(file.getPath());

        if (codec != null) {
            // Special handling for compressed files, as we cannot compute the
            // actual size of the underlying data
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                // A splittable compression codec: we can seek to the desired input position
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        in, decompressor, file.getStart(), file.getStart() + file.getLength(),
                        SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = cIn;
                // Adjust the start and end based on the compressed data
                long start = cIn.getAdjustedStart();
                long end = cIn.getAdjustedEnd();
                // Skip the first (likely partial) line if needed
                if (file.getStart() > 0)
                    start += readUntilEOL(cIn, line);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            } else {
                // Non-splittable input: we read the file from the beginning,
                // so there is no partial first line to skip
                in = codec.createInputStream(in, decompressor);
                sampledLines = sampleStreamByRatio(in, ratio, seed, output);
            }
        } else {
            // Not a compressed file. Apply a more efficient, though
            // approximate, offset-based solution.
            long pos = 0; // Current position in the file
            if (file.getStart() > 0) {
                pos += in.skip(file.getStart());
                pos += readUntilEOL(in, line);
            }
            // Initialize the random variable used for sampling
            Random rand = new Random(seed);
            sampledLines = 0;

            // Read the first 10 lines to estimate the average record size
            long end = file.getStart() + file.getLength();
            for (int i = 0; i < 10 && pos < end; i++) {
                line.clear();
                pos += readUntilEOL(in, line);
                if (rand.nextFloat() < ratio) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
            int averageLineSize = (int) ((pos - file.getStart()) / 10);
            int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
            long[] sampleOffsets = new long[count];
            for (int i = 0; i < count; i++)
                sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
            Arrays.sort(sampleOffsets);

            // Sample at the generated offsets
            for (int i = 0; i < count; i++) {
                pos += in.skip(sampleOffsets[i] - pos);
                // Skip to the end of the current (likely partial) line
                line.clear();
                pos += readUntilEOL(in, line);
                // Read the next full line
                line.clear();
                if ((pos += readUntilEOL(in, line)) > 1) {
                    sampledLines++;
                    if (output != null)
                        output.collect(line);
                }
            }
        }
    } finally {
        if (in != null)
            in.close();
        if (decompressor != null)
            CodecPool.returnDecompressor(decompressor);
    }
    return sampledLines;
}
From source file:format.OverlapRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    Configuration job = context.getConfiguration();
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // Open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // Creates the input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);
        // Read and ignore the first (partial) line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file:fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils.java
License:LGPL
/**
 * Copies bytes from an InputStream to a path, compressing the output with
 * the codec inferred from the destination path.
 * @param is the InputStream to read from
 * @param destPath destination path
 * @param conf Configuration object
 * @return true if the copy succeeded
 * @throws IOException in case of an I/O problem
 */
public static boolean copyAndCompressInputStreamToPath(final InputStream is, final Path destPath,
        final Configuration conf) throws IOException {

    if (is == null) {
        throw new NullPointerException("The input stream is null");
    }
    if (destPath == null) {
        throw new NullPointerException("The destination path is null");
    }
    if (conf == null) {
        throw new NullPointerException("The configuration object is null");
    }

    final FileSystem fs = FileSystem.get(destPath.toUri(), conf);
    final CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = factory.getCodec(destPath);
    if (codec == null) {
        throw new IOException("No codec found for: " + destPath);
    }

    final OutputStream os = codec.createOutputStream(fs.create(destPath));
    FileUtils.copy(is, os);

    return true;
}
From source file:gobblin.source.extractor.hadoop.HadoopFsHelper.java
License:Apache License
/**
 * Returns an {@link InputStream} for the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}
From source file:hdfsIO.fileInteractions.java
public List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // Ignore marker files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream;
        // Check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        stream.close();
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
From source file:hivemall.utils.hadoop.HadoopUtils.java
License:Open Source License
public static BufferedReader getBufferedReader(File file, MapredContext context) throws IOException {
    URI fileuri = file.toURI();
    Path path = new Path(fileuri);

    Configuration conf = context.getJobConf();
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(path);

    if (codec == null) {
        // No codec matched: read the file as plain text
        return new BufferedReader(new FileReader(file));
    } else {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        FileInputStream fis = new FileInputStream(file);
        CompressionInputStream cis = codec.createInputStream(fis, decompressor);
        // Pass the decompressor along so it can be returned to the pool later
        BufferedReader br = new BufferedReaderExt(new InputStreamReader(cis), decompressor);
        return br;
    }
}
From source file:io.aos.hdfs.FileDecompressor.java
License:Apache License
public static void main(String... args) throws Exception {
    String uri = args[0];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    // Derive the output name by stripping the codec's suffix (e.g. file.gz -> file)
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}