Usage examples for org.apache.hadoop.io.compress.CompressionCodec.createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
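The examples below share one pattern: a CompressionCodecFactory inspects the file name, getCodec returns a matching CompressionCodec (or null for an uncompressed file), and createInputStream wraps the raw stream for transparent decompression. Here is a minimal self-contained sketch of that pattern; the input path /tmp/example.gz is a placeholder, not taken from any example below.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CreateInputStreamSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.gz"); // hypothetical input path

        // Pick a codec based on the file extension; returns null if none matches.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        // Wrap the raw stream for decompression only when a codec was found;
        // otherwise read the file as-is.
        InputStream in = (codec != null)
                ? codec.createInputStream(fs.open(path))
                : fs.open(path);

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            System.out.println(reader.readLine());
        }
    }
}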
From source file:org.apache.tajo.engine.query.TestInsertQuery.java
License:Apache License
@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
    ResultSet res = executeString("insert overwrite into location "
            + "'/tajo-data/testInsertOverwritePathWithNonFromQuery' "
            + "USING csv WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "select 1::INT4, 2.1::FLOAT4, 'test'");
    res.close();

    FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
    Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
    assertTrue(fs.exists(path));
    assertEquals(1, fs.listStatus(path).length);

    CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
    FileStatus file = fs.listStatus(path)[0];
    CompressionCodec codec = factory.getCodec(file.getPath());
    assertTrue(codec instanceof DeflateCodec);

    BufferedReader reader = new BufferedReader(
            new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))));
    try {
        String line = reader.readLine();
        assertNotNull(line);
        String[] tokens = line.split("\\|");
        assertEquals(3, tokens.length);
        assertEquals("1", tokens[0]);
        assertEquals("2.1", tokens[1]);
        assertEquals("test", tokens[2]);
    } finally {
        reader.close();
    }
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java
License:Apache License
public static void decompressFile(final FileSystem fs, final String inFile, final String outFile,
        boolean deletePrevious) throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);
    if (deletePrevious)
        fs.delete(new Path(inFile), true);
}
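For context, a brief usage sketch of this helper; the HDFS paths are hypothetical. Note that the helper assumes getCodec finds a codec for the input path: if none matches, codec is null and the createInputStream call would throw a NullPointerException.

// Decompress a gzipped file on HDFS, keeping the original.
FileSystem fs = FileSystem.get(new Configuration());
HDFSTools.decompressFile(fs, "/data/logs/events.log.gz", "/data/logs/events.log", false);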
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    // If our codec is splittable, we can (tentatively) say that
    // we too are splittable.
    //
    // If we get a BGZFEnhancedCodec, the codec might not actually
    // be splittable. However, if we get a non-splittable gz file,
    // several things happen:
    //
    // 1. The input format will detect this, and will not split the
    //    file.
    // 2. The BGZFEnhancedCodec will check the underlying data type
    //    (BGZF vs GZIP) at input stream creation time, and will
    //    apply the appropriate codec.
    //
    // If we get an unsplittable codec, really all that we do differently
    // is skip the positioning check, since we know that we're at the
    // start of the file and can get to reading immediately.
    isSplittable = (codec instanceof SplittableCompressionCodec);

    if (codec == null) {
        // no codec; uncompressed file
        int bytesToSkip = positionAtFirstRecord(fileIn, null);
        inputStream = fileIn;
        inputStream.skip(bytesToSkip);
        lineReader = new LineReader(inputStream);
    } else if (isSplittable) {
        // file is compressed, but uses a splittable codec
        isCompressed = true;
        int bytesToSkip = positionAtFirstRecord(fileIn, codec);

        // apparent fun finding: if you don't seek back to 0,
        // SplittableCompressionCodec.createInputStream will seek in the stream
        // to a start position, and funny things happen...
        fileIn.seek(0);
        inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn,
                codec.createDecompressor(), start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
        inputStream.skip(bytesToSkip);
        lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
    } else {
        // unsplittable compressed file:
        // expect a single split, first record at offset 0
        isCompressed = true;
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
        lineReader = new LineReader(inputStream);
    }
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.adaptor.AdaptorRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println(split.toString());
    Configuration job = context.getConfiguration();
    System.err.println(split.getPath().toString());
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.fastq.FastqBasicReader.java
License:Open Source License
public FastqBasicReader(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(split.getPath());

    String multiSampleList = job.get("multiSampleList");
    // note: the original compared with `multiSampleList != ""`, which is a
    // reference comparison in Java; isEmpty() is the intended check
    if (multiSampleList != null && !multiSampleList.isEmpty()) {
        FastqMultipleSample samplelist = new FastqMultipleSample(multiSampleList, false);
        FastqSample slist = samplelist.getID(split.getPath().toString());
        if (slist != null) {
            sampleID = String.valueOf(slist.getId());
        } else {
            sampleID = "+";
        }
    }

    start = split.getStart();
    end = split.getStart() + split.getLength();

    // open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    getFirstFastqLine();
    this.pos = start;
}
From source file:org.deepak.joins.CustomRecordReader.java
License:Apache License
public CustomRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    curSplit = split;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:org.gestore.hadoop.LongRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    // set for uncompressed mid-file splits; this reader does not itself
    // perform the first-line skip
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            // the original passed the same arguments in both branches,
            // dropping the custom delimiter; pass it through here
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }
    this.pos = start;
    lastLine = new Text();
}
From source file:org.godhuli.rhipe.RXLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine((new RHText()).getText(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
        // linecounter ++ ;
    }
    this.pos = start;
}
From source file:org.hipi.tools.downloader.Downloader.java
License:Open Source License
public int run(String[] args) throws Exception {
    // try to parse command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
    } catch (ParseException exp) {
        usage();
    }
    if (line == null) {
        usage();
    }

    String[] leftArgs = line.getArgs();
    if (leftArgs.length != 2) {
        usage();
    }

    String inputDir = leftArgs[0];
    String outputHib = leftArgs[1];

    boolean yfcc100m = line.hasOption("yfcc100m");
    int numDownloadNodes = (yfcc100m ? 1
            : ((line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1)));
    if (numDownloadNodes < 1) {
        System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]");
        System.exit(1);
    }

    boolean overwrite = line.hasOption("force");

    System.out.println("Source directory: " + inputDir);
    System.out.println("Output HIB: " + outputHib);
    System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false"));
    System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false"));
    System.out.println("Number of download nodes: " + numDownloadNodes);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Remove existing HIB if overwrite is specified and HIB exists
    if (!overwrite) {
        if (fs.exists(new Path(outputHib))) {
            System.err.println(
                    "HIB [" + outputHib + "] already exists. Use the \"--force\" argument to overwrite.");
            System.exit(1);
        }
    } else { // overwrite
        if (fs.exists(new Path(outputHib))) {
            System.out.println("Found that output HIB already exists, deleting.");
        }
    }

    fs.delete(new Path(outputHib), true);
    fs.delete(new Path(outputHib + ".dat"), true);
    fs.delete(new Path(outputHib + "_output"), true);

    // Scan source directory for list of input files
    FileStatus[] inputFiles = fs.listStatus(new Path(inputDir));
    if (inputFiles == null || inputFiles.length == 0) {
        System.err.println("Failed to find any files in source directory: " + inputDir);
        System.exit(1);
    }

    // Validate list of input files
    ArrayList<Path> sourceFiles = new ArrayList<Path>();
    for (FileStatus file : inputFiles) {
        Path path = file.getPath();

        if (yfcc100m) {
            String[] tokens = path.getName().split("-");
            if (tokens == null || tokens.length == 0) {
                System.out.println(" Skipping source file (does not follow YFCC100M file name convention): "
                        + file.getPath());
                continue;
            }
        }

        try {
            // If it exists, get the relevant compression codec
            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(path);

            FSDataInputStream fis = fs.open(path);

            // If the codec was found, use it to create a decompressed input stream.
            // Otherwise, assume the input stream is already decompressed.
            BufferedReader reader = null;
            if (codec != null) {
                reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis)));
            } else {
                reader = new BufferedReader(new InputStreamReader(fis));
            }

            String fileLine = reader.readLine();
            String[] lineFields = (yfcc100m ? fileLine.split("\t") : fileLine.split("\\s+"));

            if (yfcc100m) {
                if (lineFields.length != 23) {
                    System.out.println(" Skipping source file (does not follow YFCC100M source file format): "
                            + file.getPath());
                } else {
                    System.out.println(" Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            } else {
                if (lineFields.length != 1) {
                    System.out.println(
                            " Skipping source file (contains multiple fields per line where only one is expected): "
                                    + file.getPath());
                    if (lineFields.length == 23) {
                        System.out.println(" Did you mean to use \"--yfcc100m\"?");
                    }
                } else {
                    System.out.println(" Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            }

            fis.close();
            reader = null;
        } catch (Exception e) {
            System.err.println("Skipping source file (unable to open and parse first line): " + file.getPath());
            continue;
        }
    }

    if (sourceFiles.size() == 0) {
        System.err.println("Failed to find any valid files in source directory: " + inputDir);
        System.exit(1);
    }

    // Construct path to directory containing outputHib
    String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1);

    // Attach job parameters to the global Configuration object
    conf.setInt("downloader.nodes", numDownloadNodes);
    conf.setStrings("downloader.outfile", outputHib);
    conf.setStrings("downloader.outpath", outputPath);
    conf.setBoolean("downloader.yfcc100m", yfcc100m);

    Job job = Job.getInstance(conf, "hibDownload");
    job.setJarByClass(Downloader.class);
    job.setMapperClass(DownloaderMapper.class);
    job.setReducerClass(DownloaderReducer.class);
    job.setInputFormatClass(DownloaderInputFormat.class);
    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output"));

    Path[] inputPaths = new Path[sourceFiles.size()];
    inputPaths = sourceFiles.toArray(inputPaths);
    DownloaderInputFormat.setInputPaths(job, inputPaths);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:org.huahinframework.core.lib.input.SimpleRecordReader.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    // skip first line and re-establish "start"
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    this.fileName = file.getName();
    this.fileLength = fs.getFileStatus(file).getLen();
    this.conf = context.getConfiguration();
    this.pos = start;
    this.separator = conf.get(SimpleJob.SEPARATOR, StringUtil.COMMA);
    this.regex = conf.getBoolean(SimpleJob.SEPARATOR_REGEX, false);
    init();
}