Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

This page collects example usages of org.apache.hadoop.io.compress CompressionCodec.createInputStream from open source projects.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
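
Example

A minimal, self-contained sketch of the pattern the examples below share: resolve a codec for a path with CompressionCodecFactory, then wrap the raw file stream with createInputStream. It is not taken from any of the source files below; the command-line path argument and default Configuration are illustrative placeholders.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(args[0]); // e.g. a .gz or .deflate file

        // The factory picks a codec by file extension; null means no known codec matched.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        // Wrap the raw stream in a decompressing stream only when a codec matched;
        // otherwise read the file as-is.
        InputStream in = (codec != null) ? codec.createInputStream(fs.open(path)) : fs.open(path);

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            System.out.println(reader.readLine()); // print the first decoded line
        }
    }
}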

Usage

From source file: org.apache.tajo.engine.query.TestInsertQuery.java

License: Apache License

@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
    ResultSet res = executeString("insert overwrite into location "
            + "'/tajo-data/testInsertOverwritePathWithNonFromQuery' "
            + "USING csv WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "select 1::INT4, 2.1::FLOAT4, 'test'");

    res.close();
    FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
    Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
    assertTrue(fs.exists(path));
    assertEquals(1, fs.listStatus(path).length);

    CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
    FileStatus file = fs.listStatus(path)[0];
    CompressionCodec codec = factory.getCodec(file.getPath());
    assertTrue(codec instanceof DeflateCodec);

    BufferedReader reader = new BufferedReader(
            new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))));

    try {
        String line = reader.readLine();
        assertNotNull(line);

        String[] tokens = line.split("\\|");

        assertEquals(3, tokens.length);
        assertEquals("1", tokens[0]);
        assertEquals("2.1", tokens[1]);
        assertEquals("test", tokens[2]);
    } finally {
        reader.close();
    }
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java

License: Apache License

public static void decompressFile(final FileSystem fs, final String inFile, final String outFile,
        boolean deletePrevious) throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);

    if (deletePrevious)
        fs.delete(new Path(inFile), true);

}

From source file: org.bdgenomics.adam.io.FastqRecordReader.java

License: Apache License

/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *   to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);

    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    // if our codec is splittable, we can (tentatively) say that
    // we too are splittable.
    //
    // if we get a BGZF-enhanced codec, the codec might not actually
    // be splittable. however, if we get a non-splittable gz file,
    // several things happen:
    //
    // 1. the input format will detect this, and will not split the
    //    file
    // 2. the BGZF-enhanced codec will check the underlying data type
    //    (BGZF vs GZIP) at input stream creation time, and will
    //    apply the appropriate codec.
    //
    // if we get an unsplittable codec, really all that we do differently
    // is skip the positioning check, since we know that we're at the
    // start of the file and can get to reading immediately
    isSplittable = (codec instanceof SplittableCompressionCodec);

    if (codec == null) {
        // no codec.  Uncompressed file.
        int bytesToSkip = positionAtFirstRecord(fileIn, null);
        inputStream = fileIn;
        inputStream.skip(bytesToSkip);
        lineReader = new LineReader(inputStream);
    } else if (isSplittable) {
        // file is compressed, but uses a splittable codec
        isCompressed = true;
        int bytesToSkip = positionAtFirstRecord(fileIn, codec);

        // note: if we don't seek back to 0,
        // SplittableCompressionCodec.createInputStream will itself seek the
        // stream to a start position, and odd things happen.
        fileIn.seek(0);
        inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(),
                start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

        inputStream.skip(bytesToSkip);
        lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
    } else {
        // unsplittable compressed file
        // expect a single split, first record at offset 0
        isCompressed = true;
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
        lineReader = new LineReader(inputStream);
    }
}

From source file: org.bgi.flexlab.gaea.data.mapreduce.input.adaptor.AdaptorRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println(split.toString());
    Configuration job = context.getConfiguration();
    System.err.println(split.getPath().toString());
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: org.bgi.flexlab.gaea.data.mapreduce.input.fastq.FastqBasicReader.java

License: Open Source License

public FastqBasicReader(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(split.getPath());

    String multiSampleList = job.get("multiSampleList");
    if (multiSampleList != null && !multiSampleList.isEmpty()) {
        FastqMultipleSample samplelist = new FastqMultipleSample(multiSampleList, false);
        FastqSample slist = samplelist.getID(split.getPath().toString());
        if (slist != null) {
            sampleID = String.valueOf(slist.getId());
        } else {
            sampleID = "+";
        }
    }

    start = split.getStart();
    end = split.getStart() + split.getLength();

    // open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }

    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    getFirstFastqLine();
    this.pos = start;
}

From source file: org.deepak.joins.CustomRecordReader.java

License: Apache License

public CustomRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    curSplit = split;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: org.gestore.hadoop.LongRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }
    this.pos = start;
    lastLine = new Text();
}

From source file: org.godhuli.rhipe.RXLineRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine((new RHText()).getText(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
        // linecounter ++ ;
    }
    this.pos = start;
}

From source file: org.hipi.tools.downloader.Downloader.java

License: Open Source License

public int run(String[] args) throws Exception {

    // try to parse command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
    } catch (ParseException exp) {
        usage();
    }
    if (line == null) {
        usage();
    }

    String[] leftArgs = line.getArgs();

    if (leftArgs.length != 2) {
        usage();
    }

    String inputDir = leftArgs[0];
    String outputHib = leftArgs[1];

    boolean yfcc100m = line.hasOption("yfcc100m");
    int numDownloadNodes = (yfcc100m ? 1
            : ((line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1)));
    if (numDownloadNodes < 1) {
        System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]");
        System.exit(1);
    }

    boolean overwrite = line.hasOption("force");

    System.out.println("Source directory: " + inputDir);
    System.out.println("Output HIB: " + outputHib);
    System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false"));
    System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false"));
    System.out.println("Number of download nodes: " + numDownloadNodes);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Remove existing HIB if overwrite is specified and HIB exists
    if (!overwrite) {
        if (fs.exists(new Path(outputHib))) {
            System.err.println(
                    "HIB [" + outputHib + "] already exists. Use the \"--force\" argument to overwrite.");
            System.exit(1);
        }
    } else { // overwrite
        if (fs.exists(new Path(outputHib))) {
            System.out.println("Found that output HIB already exists, deleting.");
        }
    }

    fs.delete(new Path(outputHib), true);
    fs.delete(new Path(outputHib + ".dat"), true);
    fs.delete(new Path(outputHib + "_output"), true);

    // Scan source directory for list of input files
    FileStatus[] inputFiles = fs.listStatus(new Path(inputDir));
    if (inputFiles == null || inputFiles.length == 0) {
        System.err.println("Failed to find any files in source directory: " + inputDir);
        System.exit(1);
    }

    // Validate list of input files
    ArrayList<Path> sourceFiles = new ArrayList<Path>();
    for (FileStatus file : inputFiles) {

        Path path = file.getPath();

        if (yfcc100m) {
            String[] tokens = path.getName().split("-");
            if (tokens == null || tokens.length == 0) {
                System.out.println("  Skipping source file (does not follow YFCC100M file name convention): "
                        + file.getPath());
                continue;
            }
        }

        try {
            // If it exists, get the relevant compression codec
            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(path);

            FSDataInputStream fis = fs.open(path);

            // If a codec was found, use it to create a decompressed input stream.
            // Otherwise, assume the input stream is already uncompressed.
            BufferedReader reader = null;
            if (codec != null) {
                reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis)));
            } else {
                reader = new BufferedReader(new InputStreamReader(fis));
            }

            String fileLine = reader.readLine();
            String[] lineFields = (yfcc100m ? fileLine.split("\t") : fileLine.split("\\s+"));

            if (yfcc100m) {
                if (lineFields.length != 23) {
                    System.out.println("  Skipping source file (does not follow YFCC100M source file format): "
                            + file.getPath());
                } else {
                    System.out.println("  Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            } else {
                if (lineFields.length != 1) {
                    System.out.println(
                            "  Skipping source file (contains multiple fields per line where only one is expected): "
                                    + file.getPath());
                    if (lineFields.length == 23) {
                        System.out.println("  Did you mean to use \"--yfcc100m\"?");
                    }
                } else {
                    System.out.println("  Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            }
            reader.close();
        } catch (Exception e) {
            System.err.println("Skipping source file (unable to open and parse first line: " + file.getPath());
            continue;
        }

    }

    if (sourceFiles.size() == 0) {
        System.err.println("Failed to find any valid files in source directory: " + inputDir);
        System.exit(1);
    }

    // Construct path to directory containing outputHib
    String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1);

    // Attaching job parameters to global Configuration object
    conf.setInt("downloader.nodes", numDownloadNodes);
    conf.setStrings("downloader.outfile", outputHib);
    conf.setStrings("downloader.outpath", outputPath);
    conf.setBoolean("downloader.yfcc100m", yfcc100m);

    Job job = Job.getInstance(conf, "hibDownload");
    job.setJarByClass(Downloader.class);
    job.setMapperClass(DownloaderMapper.class);
    job.setReducerClass(DownloaderReducer.class);
    job.setInputFormatClass(DownloaderInputFormat.class);
    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output"));

    Path[] inputPaths = new Path[sourceFiles.size()];
    inputPaths = sourceFiles.toArray(inputPaths);
    DownloaderInputFormat.setInputPaths(job, inputPaths);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: org.huahinframework.core.lib.input.SimpleRecordReader.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }

    // skip first line and re-establish "start".
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    this.fileName = file.getName();
    this.fileLength = fs.getFileStatus(file).getLen();
    this.conf = context.getConfiguration();
    this.pos = start;
    this.separator = conf.get(SimpleJob.SEPARATOR, StringUtil.COMMA);
    this.regex = conf.getBoolean(SimpleJob.SEPARATOR_REGEX, false);

    init();
}