Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usages of org.apache.hadoop.io.compress.CompressionCodec.createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
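
Before the full examples, here is a minimal sketch (not taken from the sources below; the input path is a hypothetical command-line argument) of the typical pattern: let CompressionCodecFactory infer a codec from the file extension and, if one is found, wrap the raw FSDataInputStream with createInputStream; otherwise read the raw stream directly.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class ReadPossiblyCompressedFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. a ".gz" file in HDFS
        FileSystem fs = path.getFileSystem(conf);

        // Infer the codec from the file extension; null means the file is not compressed.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        InputStream in = (codec != null) ? codec.createInputStream(fs.open(path)) : fs.open(path);
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}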

Usage

From source file:gov.jgi.meta.hadoop.input.FastaBlockRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    LOG.info("initializing FastaBlockRecordReader");

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastaBlockLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastaBlockLineReader(fileIn, job);
    }
    this.pos = start;
}

From source file:gov.jgi.meta.hadoop.input.FastaRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastaLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastaLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:gov.jgi.meta.hadoop.input.FastqBlockRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqBlockLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastqBlockLineReader(fileIn, job);
    }
    this.pos = start;
}

From source file:gov.jgi.meta.hadoop.input.FastqRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastqLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:gr.ntua.h2rdf.inputFormat.MyLineRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    MyFileSplit split = (MyFileSplit) (MyInputSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:gr.ntua.h2rdf.loadTriples.TranslateAndImport.java

License:Apache License

public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //compute sample partitions
    FileSystem fs;
    Configuration conf = new Configuration();
    int collected = 0, chunks = 0;
    try {
        fs = FileSystem.get(conf);
        Path sampleDir = new Path("sample");
        FileStatus[] samples = fs.listStatus(sampleDir);
        TreeSet<String> set = new TreeSet<String>();
        for (FileStatus sample : samples) {
            FSDataInputStream in = fs.open(sample.getPath());
            CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(GzipCodec.class, conf);
            CompressionInputStream in1 = codec.createInputStream(in);
            NxParser nxp = new NxParser(in1);
            Iterator<Node[]> it = nxp.iterator();
            while (it.hasNext()) {
                Node[] tr = it.next();
                //System.out.println(tr[0].toN3());
                set.add(tr[0].toN3());
                set.add(tr[1].toN3());
                set.add(tr[2].toN3());
            }
            in1.close();
            in.close();
        }

        IndexTranslator translator = new IndexTranslator(TABLE_NAME + "_Index");
        HashMap<String, Long> index = translator.translate(set);
        set.clear();
        TreeSet<ImmutableBytesWritable> set1 = new TreeSet<ImmutableBytesWritable>(
                new ImmutableBytesWritable.Comparator());

        for (FileStatus sample : samples) {
            FSDataInputStream in = fs.open(sample.getPath());
            CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(GzipCodec.class, conf);
            CompressionInputStream in1 = codec.createInputStream(in);
            NxParser nxp = new NxParser(in1);
            Iterator<Node[]> it = nxp.iterator();
            while (it.hasNext()) {
                Node[] tr = it.next();
                ByteTriple btr = new ByteTriple(index.get(tr[0].toN3()), index.get(tr[1].toN3()),
                        index.get(tr[2].toN3()));
                set1.add(new ImmutableBytesWritable(btr.getSPOByte()));
                set1.add(new ImmutableBytesWritable(btr.getSOPByte()));
                set1.add(new ImmutableBytesWritable(btr.getOPSByte()));
                set1.add(new ImmutableBytesWritable(btr.getOSPByte()));
                set1.add(new ImmutableBytesWritable(btr.getPOSByte()));
                set1.add(new ImmutableBytesWritable(btr.getPSOByte()));
            }
            in1.close();
            in.close();
        }
        index.clear();

        Path p = new Path("hexastorePartition");
        if (fs.exists(p)) {
            fs.delete(p, true);
        }
        SequenceFile.Writer partitionWriter = SequenceFile.createWriter(fs, conf, p,
                ImmutableBytesWritable.class, NullWritable.class);

        double chunkSize = bucketSampledTriples * DistinctIds.samplingRate;
        System.out.println("chunkSize: " + chunkSize);
        Iterator<ImmutableBytesWritable> it = set1.iterator();
        while (it.hasNext()) {
            ImmutableBytesWritable key = it.next();
            if (collected > chunkSize) {
                partitionWriter.append(key, NullWritable.get());
                //System.out.println(Bytes.toStringBinary(key.get()));
                collected = 0;
                chunks++;
            } else {
                collected++;
            }
        }
        System.out.println("chunks: " + chunks);
        partitionWriter.close();

    } catch (IOException e) {
        e.printStackTrace();
    }

    Job job = new Job();
    job = new Job(conf, "Import Hexastore");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    Path out = new Path("out");
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    FileOutputFormat.setOutputPath(job, out);

    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("hexastorePartition"));
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    StringBuilder compressionConfigValue = new StringBuilder();
    compressionConfigValue.append(URLEncoder.encode("I", "UTF-8"));
    compressionConfigValue.append('=');
    compressionConfigValue.append(URLEncoder.encode(Algorithm.SNAPPY.getName(), "UTF-8"));
    compressionConfigValue.append('&');
    compressionConfigValue.append(URLEncoder.encode("S", "UTF-8"));
    compressionConfigValue.append('=');
    compressionConfigValue.append(URLEncoder.encode(Algorithm.SNAPPY.getName(), "UTF-8"));
    compressionConfigValue.append('&');
    compressionConfigValue.append(URLEncoder.encode("T", "UTF-8"));
    compressionConfigValue.append('=');
    compressionConfigValue.append(URLEncoder.encode(Algorithm.SNAPPY.getName(), "UTF-8"));
    job.getConfiguration().set("hbase.hfileoutputformat.families.compression",
            compressionConfigValue.toString());
    //job.getConfiguration().setInt("hbase.mapreduce.hfileoutputformat.blocksize",262144);
    //job.getConfiguration().setInt("hbase.mapreduce.hfileoutputformat.blocksize",16384);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setJarByClass(TranslateAndImport.class);
    job.setMapperClass(Map.class);
    //job.setReducerClass(HexaStoreHistogramsReduce.class);
    job.setReducerClass(HexaStoreReduce.class);

    job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
    job.getConfiguration().setInt("mapred.reduce.tasks", chunks + 1);
    //job.setCombinerClass(Combiner.class);
    job.setJobName("Translate Projections");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);

    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.getConfiguration().setInt("hbase.hregion.max.filesize", 268435456);
    //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
    job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);

    return job;

}

From source file:hdfsIO.fileInteractions.java

public List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}

From source file:InvertedIndex.NLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.job = job;
    this.context = context;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.path = file;
    this.length = split.getLength();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (0 == split.getLength() && job.getBoolean("mapred.ignore.badcompress", false)) {
            if (null != context && context instanceof TaskInputOutputContext) {
                ((TaskInputOutputContext) context).getCounter("Input Counter", "Gzip File length is zero")
                        .increment(1);
            }
            if (null != this.path) {
                LOG.warn("Skip 0-length Zip file: " + this.path.toString());
            }
            in = new NLineReader(fileIn, job);
        } else {
            try {
                in = new NLineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } catch (IOException e) {
                if (isIgnoreBadCompress(job, e)) {
                    in = new NLineReader(fileIn, job);
                    end = start;
                    LOG.warn("Skip Bad Compress File: " + this.path.toString());
                    LOG.warn("initialize line read error", e);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Skip Bad Zip File")
                            .increment(1);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Total Skip Bad Zip Length")
                            .increment(this.length);
                } else {
                    throw e;
                }
            }
        }
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new NLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:io.aos.hdfs.FileDecompressor.java

License:Apache License

public static void main(String... args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}

From source file:io.druid.indexer.Utils.java

License:Apache License

public static InputStream openInputStream(JobContext job, Path inputPath, final FileSystem fileSystem)
        throws IOException {
    if (!FileOutputFormat.getCompressOutput(job)) {
        return fileSystem.open(inputPath);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        inputPath = new Path(inputPath.toString() + codec.getDefaultExtension());

        return codec.createInputStream(fileSystem.open(inputPath));
    }
}