Usage examples for org.apache.hadoop.io.compress.CompressionCodec.createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
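All of the examples below share one pattern: resolve a codec from the file's extension with CompressionCodecFactory, open the raw stream, and wrap it with createInputStream only when a codec was found. A minimal self-contained sketch of that pattern (the class name and the command-line path argument are illustrative, not taken from any of the sources below):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. hdfs:///data/part-00000.gz
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the codec from the file extension (.gz, .bz2, ...); null means uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        InputStream in = fs.open(path);
        if (codec != null) {
            in = codec.createInputStream(in);
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The null check matters: getCodec returns null for extensions it does not recognize, and most of the examples below repeat exactly this guard.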
From source file:com.linkedin.cubert.io.rubix.RubixRecordReader.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    RubixInputSplit<K, V> rsplit = (RubixInputSplit<K, V>) split;

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    switch (rsplit.getBlockSerializationType()) {
    case DEFAULT:
        valueDeserializer = serializationFactory.getDeserializer(rsplit.getValueClass());
        break;
    case COMPACT:
        BlockSchema schema = rsplit.getSchema();
        valueDeserializer = new CompactDeserializer<V>(schema);
        break;
    }

    key = rsplit.getKey();

    // store the block id and partition key in the conf
    conf.setLong("MY_BLOCK_ID", rsplit.getBlockId());
    conf.setLong("MY_NUM_RECORDS", rsplit.getNumRecords());
    ByteArrayOutputStream tmpOut = new ByteArrayOutputStream();
    ((Tuple) key).write(new DataOutputStream(tmpOut));
    String keySerialized = SerializerUtils.serializeToString(tmpOut.toByteArray());
    conf.set("MY_PARTITION_KEY", keySerialized);

    Path path = rsplit.getFilename();
    offset = rsplit.getOffset();
    length = rsplit.getLength();

    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);

    in = new BlockInputStream(fsin, length);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    if (codec != null) {
        print.f("codec is not null and it is %s", codec.getClass().toString());
        in = codec.createInputStream(in);
    } else {
        print.f("codec is null");
    }
    valueDeserializer.open(in);
}
From source file:com.matthewrathbone.hadoop.MRTester.java
License:Apache License
public List<String> collectStrings(Path location) throws Exception {
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // skip hidden files such as _SUCCESS and _logs
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }
        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream;
        // check if we have a compression codec
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }
        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        stream.close();
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
From source file:com.netflix.bdp.inviso.history.TraceService.java
License:Apache License
/**
 * Returns a json object representing the job history.
 *
 * @param jobId
 * @param path Use the given path as opposed to the history locator
 * @param summary Return just the top level details of the job
 * @param counters Include counters
 * @return Json string
 * @throws Exception
 */
@Path("load/{jobId}")
@GET
@Produces("application/json")
public String trace(@PathParam("jobId") final String jobId, @QueryParam("path") final String path,
        @QueryParam("summary") boolean summary,
        @QueryParam("counters") @DefaultValue("true") boolean counters) throws Exception {
    Pair<org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path> historyPath;

    if (path != null) {
        historyPath = new ImmutablePair<>(null, new org.apache.hadoop.fs.Path(path));
    } else {
        historyPath = historyLocator.locate(jobId);
    }

    if (historyPath == null) {
        throw new WebApplicationException(404);
    }

    TraceJobHistoryLoader loader = new TraceJobHistoryLoader(properties);

    FileSystem fs = FileSystem.get(historyPath.getRight().toUri(), config);
    CompressionCodec codec = new CompressionCodecFactory(config).getCodec(historyPath.getRight());

    FSDataInputStream fin = fs.open(historyPath.getRight());
    if (codec != null) {
        fin = new FSDataInputStream(new WrappedCompressionInputStream(codec.createInputStream(fin)));
    }

    JobHistoryParser parser = new JobHistoryParser(fin);
    parser.parse(loader);

    String[] ignore = { "counters" };

    ObjectMapper mapper = new ObjectMapper();
    SimpleModule module = new SimpleModule("MyModule", new Version(1, 0, 0, null));

    // Job
    JavaType jobMapType = MapLikeType.construct(Job.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(Job.class,
            MapSerializer.construct(ignore, jobMapType, false, null, null, null, null));

    // Task
    JavaType taskMapType = MapLikeType.construct(Task.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(Task.class,
            MapSerializer.construct(ignore, taskMapType, false, null, null, null, null));

    // Attempt
    JavaType attemptMapType = MapLikeType.construct(TaskAttempt.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(TaskAttempt.class,
            MapSerializer.construct(ignore, attemptMapType, false, null, null, null, null));

    if (!counters) {
        mapper.registerModule(module);
    }

    if (summary) {
        loader.getJob().clearTasks();
    }

    return mapper.writeValueAsString(loader.getJob());
}
From source file:com.netflix.suro.sink.localfile.TestTextFileWriter.java
License:Apache License
private int checkFileContentsWithGzip(String filePath, String message)
        throws IOException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream input = fs.open(new Path(filePath));
    CompressionCodec codec = FileWriterBase.createCodecInstance("org.apache.hadoop.io.compress.GzipCodec");
    BufferedReader br = new BufferedReader(new InputStreamReader(codec.createInputStream(input)));

    String line;
    int i = 0;
    while ((line = br.readLine()) != null) {
        assertEquals(line, message + i);
        ++i;
    }
    br.close();

    return i;
}
From source file:com.pinterest.secor.io.impl.DelimitedTextFileReaderWriter.java
License:Apache License
public DelimitedTextFileReaderWriter(LogFilePath path, CompressionCodec codec, FileReaderWriter.Type type)
        throws FileNotFoundException, IOException {
    Path fsPath = new Path(path.getLogFilePath());
    FileSystem fs = FileUtil.getFileSystem(path.getLogFilePath());
    if (type == FileReaderWriter.Type.Reader) {
        InputStream inputStream = fs.open(fsPath);
        this.mReader = (codec == null) ? new BufferedInputStream(inputStream)
                : new BufferedInputStream(codec.createInputStream(inputStream));
        this.mOffset = path.getOffset();
        this.mCountingStream = null;
        this.mWriter = null;
    } else if (type == FileReaderWriter.Type.Writer) {
        this.mCountingStream = new CountingOutputStream(fs.create(fsPath));
        this.mWriter = (codec == null) ? new BufferedOutputStream(this.mCountingStream)
                : new BufferedOutputStream(codec.createOutputStream(this.mCountingStream));
        this.mReader = null;
    } else {
        throw new IllegalArgumentException("Undefined File Type: " + type);
    }
}
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java
License:Apache License
private void testTextFile(CompressionCodec compressionCodec) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString() + "/${YYYY()}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    FileSystem fs = FileSystem.get(uri, conf);
    Path file = new Path(getTestDir(), UUID.randomUUID().toString());
    long expires = System.currentTimeMillis() + 50000;
    RecordWriter writer = mgr.createWriter(fs, file, 50000);
    Assert.assertTrue(expires <= writer.getExpiresOn());
    Assert.assertTrue(writer.isTextFile());
    Assert.assertFalse(writer.isSeqFile());

    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    writer.write(record);
    writer.close();

    InputStream is = fs.open(file);
    if (compressionCodec != null) {
        is = compressionCodec.createInputStream(is);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    Assert.assertEquals("a", reader.readLine());
    Assert.assertNull(reader.readLine());
    reader.close();
}
From source file:com.tgam.hadoop.mapred.EscapedLineRecordReader.java
License:Apache License
public EscapedLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:com.yahoo.glimmer.indexing.CompressionCodecHelper.java
License:Open Source License
public static InputStream wrapStream(Configuration conf, Path path, InputStream inputStream)
        throws IOException {
    CompressionCodec codec = getCompressionCodec(conf, path);
    if (codec != null) {
        return codec.createInputStream(inputStream);
    }
    return inputStream;
}
From source file:com.yahoo.glimmer.util.MergeSortTool.java
License:Open Source License
public static int mergeSort(FileSystem fs, List<Path> sourcePaths, Path outputPath,
        CompressionCodecFactory compressionCodecFactory) throws IOException {
    assert sourcePaths.size() > 0 : "No source paths given.";
    LOG.info("Sorted merge into " + outputPath.toString());
    OutputStream outputStream = fs.create(outputPath);

    CompressionCodec inputCompressionCodec = compressionCodecFactory.getCodec(sourcePaths.get(0));
    if (inputCompressionCodec != null) {
        LOG.info("Input compression codec " + inputCompressionCodec.getClass().getName());
    }

    CompressionCodec outputCompressionCodec = compressionCodecFactory.getCodec(outputPath);
    if (outputCompressionCodec != null) {
        LOG.info("Output compression codec " + outputCompressionCodec.getClass().getName());
        outputStream = outputCompressionCodec.createOutputStream(outputStream);
    }

    List<BufferedReader> readers = new ArrayList<BufferedReader>();
    OutputStreamWriter writer = new OutputStreamWriter(outputStream);

    for (Path partPath : sourcePaths) {
        LOG.info("\tAdding source " + partPath.toString());
        InputStream inputStream = fs.open(partPath);
        if (inputCompressionCodec != null) {
            inputStream = inputCompressionCodec.createInputStream(inputStream);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        readers.add(reader);
    }

    int count = ReadersWriterMergeSort.mergeSort(readers, writer);

    writer.close();
    for (BufferedReader reader : readers) {
        reader.close();
    }
    readers.clear();

    LOG.info("Processed " + count + " lines into " + outputPath.toString());
    return count;
}
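One variant none of the examples above use: createInputStream also has a two-argument overload, createInputStream(InputStream in, Decompressor decompressor), which lets callers reuse pooled Decompressor instances via org.apache.hadoop.io.compress.CodecPool instead of allocating a new one per stream. A minimal sketch, assuming the same codec-resolution pattern as above (the class name PooledCodecRead and method read are illustrative):

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledCodecRead {
    public static void read(Configuration conf, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            return; // uncompressed; read fs.open(path) directly
        }
        // Borrow a decompressor from the pool rather than allocating one per stream.
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(path), decompressor)) {
            byte[] buffer = new byte[8192];
            while (in.read(buffer) != -1) {
                // process decompressed bytes
            }
        } finally {
            // Always return the decompressor so the pool can hand it to the next caller.
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

This matters mostly in record readers and other hot paths that open many compressed streams, where per-stream Decompressor allocation is measurable.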