Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

This page collects example usages of org.apache.hadoop.io.compress.CompressionCodec#createInputStream, drawn from open source projects.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
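
Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share (the class name and the sample path are illustrative). It pairs createInputStream with CompressionCodecFactory, which infers the codec from the file extension and returns null for uncompressed files:

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class ReadPossiblyCompressed {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. /data/input.gz
        FileSystem fs = path.getFileSystem(conf);

        // Infers the codec from the extension (.gz, .bz2, ...); null means uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);

        InputStream raw = fs.open(path);
        // Wrap in a decompressing stream only when a codec was recognized.
        InputStream in = (codec != null) ? codec.createInputStream(raw) : raw;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}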

Usage

From source file: net.darkseraphim.webanalytics.hadoop.csv.CSVLineRecordReader.java

License: Apache License

public void configure(InputSplit genericSplit, JobConf conf) throws IOException {
    FileSplit split = (FileSplit) genericSplit;

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec != null) {
        is = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }

    this.pos = start;
    init(is, conf);
}
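
Note the end = Long.MAX_VALUE assignment above: a stream produced by the one-argument createInputStream cannot begin mid-file, so the reader that receives the first split must consume the entire file. For codecs that do support splitting (bzip2 being the stock example), Hadoop provides SplittableCompressionCodec. The following sketch, modeled on Hadoop's own LineRecordReader, shows how a record reader might branch on it; the helper class and method names are illustrative:

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

class SplitAwareOpen {
    /** Opens a decompressed view of one split, adjusting bounds when the codec is splittable. */
    static InputStream open(CompressionCodec codec, FSDataInputStream fileIn, long start, long end)
            throws IOException {
        if (codec instanceof SplittableCompressionCodec) {
            Decompressor decompressor = CodecPool.getDecompressor(codec);
            SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // The codec snaps the split to a compression-block boundary; the caller
            // should adopt cIn.getAdjustedStart() and cIn.getAdjustedEnd() as its bounds.
            return cIn;
        }
        // Non-splittable codec: decompress from byte 0 and let this reader take the whole file.
        return codec.createInputStream(fileIn);
    }
}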

From source file: newprotobuf.mapred.ProtobufRecordReader.java

License: Open Source License

public ProtobufRecordReader(Configuration conf, FileSplit split, Reporter reporter) throws IOException {

    this.conf = conf;

    start = split.getStart();
    pos = start;
    splitLength = split.getLength();
    end = start + splitLength;

    file = split.getPath();

    FileSystem fs = file.getFileSystem(conf);
    in = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null) {
        in = codec.createInputStream(in);
        end = Long.MAX_VALUE;
    }
    this.reporter = reporter;
    skipbad = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEPBBADFILESKIP);
    LOG.info("Skip bad is set to " + skipbad);
}

From source file: nyu.cs.webgraph.MRhelpers.LzoTabSeperatedLineRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = context.getConfiguration();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}

From source file: org.apache.ambari.view.filebrowser.FilePreviewService.java

License: Apache License

@GET
@Path("/file")
@Produces(MediaType.APPLICATION_JSON)
public Response previewFile(@QueryParam("path") String path, @QueryParam("start") int start,
        @QueryParam("end") int end) {

    try {
        HdfsApi api = getApi(context);
        FileStatus status = api.getFileStatus(path);

        CompressionCodec codec = compressionCodecFactory.getCodec(status.getPath());

        // check if we have a compression codec we need to use
        InputStream stream = (codec != null) ? codec.createInputStream(api.open(path)) : api.open(path);

        int length = end - start;
        byte[] bytes = new byte[length];
        // CompressionInputStream is not seekable for most codecs, so skip() is used
        // instead of ((Seekable) stream).seek(start)
        stream.skip(start);
        int readBytes = stream.read(bytes, 0, length);
        boolean isFileEnd = false;

        if (readBytes < length)
            isFileEnd = true;

        JSONObject response = new JSONObject();
        response.put("data", new String(bytes));
        response.put("readbytes", readBytes);
        response.put("isFileEnd", isFileEnd);

        return Response.ok(response).build();
    } catch (WebApplicationException ex) {
        throw ex;
    } catch (FileNotFoundException ex) {
        throw new NotFoundFormattedException(ex.getMessage(), ex);
    } catch (Exception ex) {
        throw new ServiceFormattedException(ex.getMessage(), ex);
    }
}

From source file: org.apache.apex.malhar.lib.io.fs.AbstractFileOutputOperatorTest.java

License: Apache License

@Test
public void testSnappyCompressionSimple() throws IOException {
    if (checkNativeSnappy()) {
        return;
    }

    File snappyFile = new File(testMeta.getDir(), "snappyTestFile.snappy");

    BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(snappyFile));
    Configuration conf = new Configuration();
    CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(SnappyCodec.class, conf);
    FilterStreamCodec.SnappyFilterStream filterStream = new FilterStreamCodec.SnappyFilterStream(
            codec.createOutputStream(os));

    int ONE_MB = 1024 * 1024;

    String testStr = "TestSnap-16bytes";
    for (int i = 0; i < ONE_MB; i++) { // writes 16 MB in total (1M iterations x 16 bytes)
        filterStream.write(testStr.getBytes());
    }
    filterStream.flush();
    filterStream.close();

    CompressionInputStream is = codec.createInputStream(new FileInputStream(snappyFile));

    byte[] recovered = new byte[testStr.length()];
    int bytesRead = is.read(recovered);
    is.close();
    assertEquals(testStr, new String(recovered));
}

From source file: org.apache.apex.malhar.lib.io.fs.AbstractFileOutputOperatorTest.java

License: Apache License

private void checkSnappyFile(File file, List<Long> offsets, int startVal, int totalWindows, int totalRecords)
        throws IOException {
    FileInputStream fis;
    InputStream gss = null;
    Configuration conf = new Configuration();
    CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(SnappyCodec.class, conf);
    CompressionInputStream snappyIs = null;

    BufferedReader br = null;

    int numWindows = 0;
    try {
        fis = new FileInputStream(file);
        gss = fis;

        long startOffset = 0;
        for (long offset : offsets) {
            // Skip initial case in case file is not yet created
            if (offset == 0) {
                continue;
            }
            long limit = offset - startOffset;
            LimitInputStream lis = new LimitInputStream(gss, limit);

            snappyIs = codec.createInputStream(lis);
            br = new BufferedReader(new InputStreamReader(snappyIs));
            String eline = "" + (startVal + numWindows * 2);
            int count = 0;
            String line;
            while ((line = br.readLine()) != null) {
                Assert.assertEquals("File line", eline, line);
                ++count;
                if ((count % totalRecords) == 0) {
                    ++numWindows;
                    eline = "" + (startVal + numWindows * 2);
                }
            }
            startOffset = offset;
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (br != null) {
            br.close();
        } else {
            if (snappyIs != null) {
                snappyIs.close();
            } else if (gss != null) {
                gss.close();
            }
        }
    }
    Assert.assertEquals("Total", totalWindows, numWindows);
}

From source file: org.apache.drill.exec.store.dfs.DrillFileSystem.java

License: Apache License

public InputStream openPossiblyCompressedStream(Path path) throws IOException {
    CompressionCodec codec = codecFactory.getCodec(path); // infers from file ext.
    if (codec != null) {
        return codec.createInputStream(open(path));
    } else {
        return open(path);
    }
}

From source file: org.apache.druid.indexer.Utils.java

License: Apache License

public static InputStream openInputStream(JobContext job, Path inputPath, final FileSystem fileSystem)
        throws IOException {
    if (!FileOutputFormat.getCompressOutput(job)) {
        return fileSystem.open(inputPath);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        inputPath = new Path(inputPath + codec.getDefaultExtension());

        return codec.createInputStream(fileSystem.open(inputPath));
    }
}

From source file: org.apache.giraph.worker.BspServiceSource.java

License: Apache License

/**
 * Load saved partitions in multiple threads.
 * @param superstep superstep to load
 * @param partitions list of partitions to load
 */
private void loadCheckpointVertices(final long superstep, List<Integer> partitions) {
    int numThreads = Math.min(GiraphConstants.NUM_CHECKPOINT_IO_THREADS.get(getConfiguration()),
            partitions.size());

    final Queue<Integer> partitionIdQueue = new ConcurrentLinkedQueue<>(partitions);

    final CompressionCodec codec = new CompressionCodecFactory(getConfiguration())
            .getCodec(new Path(GiraphConstants.CHECKPOINT_COMPRESSION_CODEC.get(getConfiguration())));

    long t0 = System.currentTimeMillis();

    CallableFactory<Void> callableFactory = new CallableFactory<Void>() {
        @Override
        public Callable<Void> newCallable(int callableId) {
            return new Callable<Void>() {

                @Override
                public Void call() throws Exception {
                    while (!partitionIdQueue.isEmpty()) {
                        Integer partitionId = partitionIdQueue.poll();
                        if (partitionId == null) {
                            break;
                        }
                        Path path = getSavedCheckpoint(superstep,
                                "_" + partitionId + CheckpointingUtils.CHECKPOINT_VERTICES_POSTFIX);

                        FSDataInputStream compressedStream = getFs().open(path);

                        DataInputStream stream = codec == null ? compressedStream
                                : new DataInputStream(codec.createInputStream(compressedStream));

                        Partition<I, V, E> partition = getConfiguration().createPartition(partitionId,
                                getContext());

                        partition.readFields(stream);

                        getPartitionStore().addPartition(partition);

                        stream.close();
                    }
                    return null;
                }

            };
        }
    };

    ProgressableUtils.getResultsWithNCallables(callableFactory, numThreads, "load-vertices-%d", getContext());

    LOG.info("Loaded checkpoint in " + (System.currentTimeMillis() - t0) + " ms, using " + numThreads
            + " threads");
}

From source file: org.apache.hama.bsp.LineRecordReader.java

License: Apache License

public LineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("bsp.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
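
Several of the readers above allocate a fresh decompressor for every stream they open. Where many short-lived streams are opened, Hadoop's CodecPool can recycle decompressors via the two-argument overload createInputStream(InputStream, Decompressor). A minimal sketch, assuming the input file is compressed with a recognized codec (the class name and sample path are illustrative):

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledDecompression {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. /data/input.gz; assumed to be compressed
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);

        // Borrow a decompressor from the pool instead of constructing one per stream.
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(path), decompressor)) {
            byte[] buf = new byte[4096];
            for (int n; (n = in.read(buf)) != -1; ) {
                System.out.write(buf, 0, n);
            }
        } finally {
            CodecPool.returnDecompressor(decompressor); // return it for reuse
        }
    }
}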