Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
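
The typical pattern is to resolve a codec for a file with CompressionCodecFactory and, if a codec is found, wrap the raw stream with createInputStream so that reads return decompressed bytes. Below is a minimal, self-contained sketch of that pattern before the real-world examples; the class name CreateInputStreamExample, the use of the first command-line argument as the input path, and the UTF-8 charset are illustrative assumptions, not part of the Hadoop API.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CreateInputStreamExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical input path taken from the command line (local or HDFS).
        Path path = new Path(args[0]);

        Configuration conf = new Configuration();
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the codec from the file name (e.g. .gz, .bz2); may be null.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);

        InputStream in = fs.open(path);
        if (codec != null) {
            // Wrap the raw stream so reads return decompressed bytes.
            in = codec.createInputStream(in);
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}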

Usage

From source file:com.linkedin.cubert.io.rubix.RubixRecordReader.java

License:Open Source License

public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    RubixInputSplit<K, V> rsplit = (RubixInputSplit<K, V>) split;

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    switch (rsplit.getBlockSerializationType()) {
    case DEFAULT:
        valueDeserializer = serializationFactory.getDeserializer(rsplit.getValueClass());
        break;
    case COMPACT:
        BlockSchema schema = rsplit.getSchema();
        valueDeserializer = new CompactDeserializer<V>(schema);
        break;
    }

    key = rsplit.getKey();

    // store the blockid and partition key in the conf
    conf.setLong("MY_BLOCK_ID", rsplit.getBlockId());
    conf.setLong("MY_NUM_RECORDS", rsplit.getNumRecords());
    ByteArrayOutputStream tmpOut = new ByteArrayOutputStream();
    ((Tuple) key).write(new DataOutputStream(tmpOut));
    String keySerialized = SerializerUtils.serializeToString(tmpOut.toByteArray());
    conf.set("MY_PARTITION_KEY", keySerialized);

    Path path = rsplit.getFilename();
    offset = rsplit.getOffset();
    length = rsplit.getLength();

    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);

    in = new BlockInputStream(fsin, length);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    if (codec != null) {
        print.f("codec is not null and it is %s", codec.getClass().toString());
        in = codec.createInputStream(in);
    } else {
        print.f("codec is null");
    }

    valueDeserializer.open(in);
}

From source file:com.matthewrathbone.hadoop.MRTester.java

License:Apache License

public List<String> collectStrings(Path location) throws Exception {
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        String[] resulting = raw.split("\n");
        for (String str : resulting) {
            results.add(str);
        }
    }
    return results;
}

From source file:com.netflix.bdp.inviso.history.TraceService.java

License:Apache License

/**
 * Returns a json object representing the job history.
 *
 * @param jobId
 * @param path Use the given path as opposed to the history locator
 * @param summary Return just the top level details of the job
 * @param counters Include counters
 * @return Json string
 * @throws Exception
 */
@Path("load/{jobId}")
@GET
@Produces("application/json")
public String trace(@PathParam("jobId") final String jobId, @QueryParam("path") final String path,
        @QueryParam("summary") boolean summary, @QueryParam("counters") @DefaultValue("true") boolean counters)
        throws Exception {

    Pair<org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path> historyPath;

    if (path != null) {
        historyPath = new ImmutablePair<>(null, new org.apache.hadoop.fs.Path(path));
    } else {
        historyPath = historyLocator.locate(jobId);
    }

    if (historyPath == null) {
        throw new WebApplicationException(404);
    }

    TraceJobHistoryLoader loader = new TraceJobHistoryLoader(properties);

    FileSystem fs = FileSystem.get(historyPath.getRight().toUri(), config);
    CompressionCodec codec = new CompressionCodecFactory(config).getCodec(historyPath.getRight());

    FSDataInputStream fin = fs.open(historyPath.getRight());

    if (codec != null) {
        fin = new FSDataInputStream(new WrappedCompressionInputStream(codec.createInputStream(fin)));
    }

    JobHistoryParser parser = new JobHistoryParser(fin);
    parser.parse(loader);

    String[] ignore = { "counters" };

    ObjectMapper mapper = new ObjectMapper();
    SimpleModule module = new SimpleModule("MyModule", new Version(1, 0, 0, null));

    //Job
    JavaType jobMapType = MapLikeType.construct(Job.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(Job.class, MapSerializer.construct(ignore, jobMapType, false, null, null, null, null));

    //Task
    JavaType taskMapType = MapLikeType.construct(Task.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(Task.class,
            MapSerializer.construct(ignore, taskMapType, false, null, null, null, null));

    //Attempt
    JavaType attemptMapType = MapLikeType.construct(TaskAttempt.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(TaskAttempt.class,
            MapSerializer.construct(ignore, attemptMapType, false, null, null, null, null));

    if (!counters) {
        mapper.registerModule(module);
    }

    if (summary) {
        loader.getJob().clearTasks();
    }

    return mapper.writeValueAsString(loader.getJob());
}

From source file:com.netflix.suro.sink.localfile.TestTextFileWriter.java

License:Apache License

private int checkFileContentsWithGzip(String filePath, String message)
        throws IOException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream input = fs.open(new Path(filePath));
    CompressionCodec codec = FileWriterBase.createCodecInstance("org.apache.hadoop.io.compress.GzipCodec");

    BufferedReader br = new BufferedReader(new InputStreamReader(codec.createInputStream(input)));
    String line = null;
    int i = 0;
    while ((line = br.readLine()) != null) {
        assertEquals(line, message + i);
        ++i;
    }
    br.close();

    return i;
}

From source file:com.pinterest.secor.io.impl.DelimitedTextFileReaderWriter.java

License:Apache License

public DelimitedTextFileReaderWriter(LogFilePath path, CompressionCodec codec, FileReaderWriter.Type type)
        throws FileNotFoundException, IOException {

    Path fsPath = new Path(path.getLogFilePath());
    FileSystem fs = FileUtil.getFileSystem(path.getLogFilePath());
    if (type == FileReaderWriter.Type.Reader) {
        InputStream inputStream = fs.open(fsPath);
        this.mReader = (codec == null) ? new BufferedInputStream(inputStream)
                : new BufferedInputStream(codec.createInputStream(inputStream));
        this.mOffset = path.getOffset();
        this.mCountingStream = null;
        this.mWriter = null;
    } else if (type == FileReaderWriter.Type.Writer) {
        this.mCountingStream = new CountingOutputStream(fs.create(fsPath));
        this.mWriter = (codec == null) ? new BufferedOutputStream(this.mCountingStream)
                : new BufferedOutputStream(codec.createOutputStream(this.mCountingStream));
        this.mReader = null;
    } else {
        throw new IllegalArgumentException("Undefined File Type: " + type);
    }
}

From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License:Apache License

private void testTextFile(CompressionCodec compressionCodec) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString() + "/${YYYY()}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));
    FileSystem fs = FileSystem.get(uri, conf);
    Path file = new Path(getTestDir(), UUID.randomUUID().toString());
    long expires = System.currentTimeMillis() + 50000;
    RecordWriter writer = mgr.createWriter(fs, file, 50000);
    Assert.assertTrue(expires <= writer.getExpiresOn());
    Assert.assertTrue(writer.isTextFile());
    Assert.assertFalse(writer.isSeqFile());
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    writer.write(record);
    writer.close();
    InputStream is = fs.open(file);
    if (compressionCodec != null) {
        is = compressionCodec.createInputStream(is);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    Assert.assertEquals("a", reader.readLine());
    Assert.assertNull(reader.readLine());
    reader.close();
}

From source file:com.tgam.hadoop.mapred.EscapedLineRecordReader.java

License:Apache License

public EscapedLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.OmnitureDataFileRecordReader.java

License:Open Source License

public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {

    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:com.yahoo.glimmer.indexing.CompressionCodecHelper.java

License:Open Source License

public static InputStream wrapStream(Configuration conf, Path path, InputStream inputStream)
        throws IOException {
    CompressionCodec codec = getCompressionCodec(conf, path);
    if (codec != null) {
        return codec.createInputStream(inputStream);
    }
    return inputStream;
}

From source file:com.yahoo.glimmer.util.MergeSortTool.java

License:Open Source License

public static int mergeSort(FileSystem fs, List<Path> sourcePaths, Path outputPath,
        CompressionCodecFactory compressionCodecFactory) throws IOException {
    assert sourcePaths.size() > 0 : "No source paths given.";

    LOG.info("Sorted merge into " + outputPath.toString());
    OutputStream outputStream = fs.create(outputPath);

    CompressionCodec inputCompressionCodec = compressionCodecFactory.getCodec(sourcePaths.get(0));
    if (inputCompressionCodec != null) {
        LOG.info("Input compression codec " + inputCompressionCodec.getClass().getName());
    }

    CompressionCodec outputCompressionCodec = compressionCodecFactory.getCodec(outputPath);
    if (outputCompressionCodec != null) {
        LOG.info("Output compression codec " + outputCompressionCodec.getClass().getName());
        outputStream = outputCompressionCodec.createOutputStream(outputStream);
    }

    List<BufferedReader> readers = new ArrayList<BufferedReader>();
    OutputStreamWriter writer = new OutputStreamWriter(outputStream);

    for (Path partPath : sourcePaths) {
        LOG.info("\tAdding source " + partPath.toString());
        InputStream inputStream = fs.open(partPath);
        if (inputCompressionCodec != null) {
            inputStream = inputCompressionCodec.createInputStream(inputStream);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        readers.add(reader);
    }

    int count = ReadersWriterMergeSort.mergeSort(readers, writer);

    writer.close();
    for (BufferedReader reader : readers) {
        reader.close();
    }
    readers.clear();
    LOG.info("Processed " + count + " lines into " + outputPath.toString());
    return count;
}