Usage examples for org.apache.hadoop.io.compress.CompressionCodec.createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
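All of the examples below share one pattern: resolve a codec from the file's extension with CompressionCodecFactory, open the raw stream, and wrap it with createInputStream only when a codec was found. A minimal self-contained sketch of that pattern (the class name and the command-line path argument are illustrative, not taken from any of the sources below):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. hdfs:///data/part-00000.gz
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the codec from the file extension (.gz, .bz2, ...); null means uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        InputStream in = fs.open(path);
        if (codec != null) {
            in = codec.createInputStream(in);
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The null check matters: getCodec returns null for extensions it does not recognize, and most of the examples below repeat exactly this guard.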
From source file:com.linkedin.cubert.io.rubix.RubixRecordReader.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    RubixInputSplit<K, V> rsplit = (RubixInputSplit<K, V>) split;

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    switch (rsplit.getBlockSerializationType()) {
    case DEFAULT:
        valueDeserializer = serializationFactory.getDeserializer(rsplit.getValueClass());
        break;
    case COMPACT:
        BlockSchema schema = rsplit.getSchema();
        valueDeserializer = new CompactDeserializer<V>(schema);
        break;
    }

    key = rsplit.getKey();

    // store the block id and partition key in the conf
    conf.setLong("MY_BLOCK_ID", rsplit.getBlockId());
    conf.setLong("MY_NUM_RECORDS", rsplit.getNumRecords());
    ByteArrayOutputStream tmpOut = new ByteArrayOutputStream();
    ((Tuple) key).write(new DataOutputStream(tmpOut));
    String keySerialized = SerializerUtils.serializeToString(tmpOut.toByteArray());
    conf.set("MY_PARTITION_KEY", keySerialized);

    Path path = rsplit.getFilename();
    offset = rsplit.getOffset();
    length = rsplit.getLength();

    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);

    in = new BlockInputStream(fsin, length);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    if (codec != null) {
        print.f("codec is not null and it is %s", codec.getClass().toString());
        in = codec.createInputStream(in);
    } else {
        print.f("codec is null");
    }
    valueDeserializer.open(in);
}
From source file:com.matthewrathbone.hadoop.MRTester.java
License:Apache License
public List<String> collectStrings(Path location) throws Exception {
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // skip hidden files such as _SUCCESS and _logs
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }
        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream;
        // check if we have a compression codec
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }
        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        stream.close();
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
From source file:com.netflix.bdp.inviso.history.TraceService.java
License:Apache License
/**
 * Returns a json object representing the job history.
 *
 * @param jobId
 * @param path Use the given path as opposed to the history locator
 * @param summary Return just the top level details of the job
 * @param counters Include counters
 * @return Json string
 * @throws Exception
 */
@Path("load/{jobId}")
@GET
@Produces("application/json")
public String trace(@PathParam("jobId") final String jobId, @QueryParam("path") final String path,
        @QueryParam("summary") boolean summary,
        @QueryParam("counters") @DefaultValue("true") boolean counters) throws Exception {
    Pair<org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path> historyPath;

    if (path != null) {
        historyPath = new ImmutablePair<>(null, new org.apache.hadoop.fs.Path(path));
    } else {
        historyPath = historyLocator.locate(jobId);
    }

    if (historyPath == null) {
        throw new WebApplicationException(404);
    }

    TraceJobHistoryLoader loader = new TraceJobHistoryLoader(properties);

    FileSystem fs = FileSystem.get(historyPath.getRight().toUri(), config);
    CompressionCodec codec = new CompressionCodecFactory(config).getCodec(historyPath.getRight());

    FSDataInputStream fin = fs.open(historyPath.getRight());
    if (codec != null) {
        fin = new FSDataInputStream(new WrappedCompressionInputStream(codec.createInputStream(fin)));
    }

    JobHistoryParser parser = new JobHistoryParser(fin);
    parser.parse(loader);

    String[] ignore = { "counters" };

    ObjectMapper mapper = new ObjectMapper();
    SimpleModule module = new SimpleModule("MyModule", new Version(1, 0, 0, null));

    // Job
    JavaType jobMapType = MapLikeType.construct(Job.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(Job.class,
            MapSerializer.construct(ignore, jobMapType, false, null, null, null, null));

    // Task
    JavaType taskMapType = MapLikeType.construct(Task.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(Task.class,
            MapSerializer.construct(ignore, taskMapType, false, null, null, null, null));

    // Attempt
    JavaType attemptMapType = MapLikeType.construct(TaskAttempt.class, SimpleType.construct(String.class),
            SimpleType.construct(Object.class));
    module.addSerializer(TaskAttempt.class,
            MapSerializer.construct(ignore, attemptMapType, false, null, null, null, null));

    if (!counters) {
        mapper.registerModule(module);
    }

    if (summary) {
        loader.getJob().clearTasks();
    }

    return mapper.writeValueAsString(loader.getJob());
}
From source file:com.netflix.suro.sink.localfile.TestTextFileWriter.java
License:Apache License
private int checkFileContentsWithGzip(String filePath, String message)
        throws IOException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream input = fs.open(new Path(filePath));
    CompressionCodec codec = FileWriterBase.createCodecInstance("org.apache.hadoop.io.compress.GzipCodec");
    BufferedReader br = new BufferedReader(new InputStreamReader(codec.createInputStream(input)));

    String line;
    int i = 0;
    while ((line = br.readLine()) != null) {
        assertEquals(line, message + i);
        ++i;
    }
    br.close();

    return i;
}
From source file:com.pinterest.secor.io.impl.DelimitedTextFileReaderWriter.java
License:Apache License
public DelimitedTextFileReaderWriter(LogFilePath path, CompressionCodec codec, FileReaderWriter.Type type)
        throws FileNotFoundException, IOException {
    Path fsPath = new Path(path.getLogFilePath());
    FileSystem fs = FileUtil.getFileSystem(path.getLogFilePath());
    if (type == FileReaderWriter.Type.Reader) {
        InputStream inputStream = fs.open(fsPath);
        this.mReader = (codec == null) ? new BufferedInputStream(inputStream)
                : new BufferedInputStream(codec.createInputStream(inputStream));
        this.mOffset = path.getOffset();
        this.mCountingStream = null;
        this.mWriter = null;
    } else if (type == FileReaderWriter.Type.Writer) {
        this.mCountingStream = new CountingOutputStream(fs.create(fsPath));
        this.mWriter = (codec == null) ? new BufferedOutputStream(this.mCountingStream)
                : new BufferedOutputStream(codec.createOutputStream(this.mCountingStream));
        this.mReader = null;
    } else {
        throw new IllegalArgumentException("Undefined File Type: " + type);
    }
}
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java
License:Apache License
private void testTextFile(CompressionCodec compressionCodec) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString() + "/${YYYY()}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = null;
    String keyEL = null;
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    FileSystem fs = FileSystem.get(uri, conf);
    Path file = new Path(getTestDir(), UUID.randomUUID().toString());
    long expires = System.currentTimeMillis() + 50000;
    RecordWriter writer = mgr.createWriter(fs, file, 50000);
    Assert.assertTrue(expires <= writer.getExpiresOn());
    Assert.assertTrue(writer.isTextFile());
    Assert.assertFalse(writer.isSeqFile());

    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    writer.write(record);
    writer.close();

    InputStream is = fs.open(file);
    if (compressionCodec != null) {
        is = compressionCodec.createInputStream(is);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    Assert.assertEquals("a", reader.readLine());
    Assert.assertNull(reader.readLine());
    reader.close();
}
From source file:com.tgam.hadoop.mapred.EscapedLineRecordReader.java
License:Apache License
public EscapedLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:com.yahoo.glimmer.indexing.CompressionCodecHelper.java
License:Open Source License
public static InputStream wrapStream(Configuration conf, Path path, InputStream inputStream)
        throws IOException {
    CompressionCodec codec = getCompressionCodec(conf, path);
    if (codec != null) {
        return codec.createInputStream(inputStream);
    }
    return inputStream;
}
From source file:com.yahoo.glimmer.util.MergeSortTool.java
License:Open Source License
public static int mergeSort(FileSystem fs, List<Path> sourcePaths, Path outputPath,
        CompressionCodecFactory compressionCodecFactory) throws IOException {
    assert sourcePaths.size() > 0 : "No source paths given.";
    LOG.info("Sorted merge into " + outputPath.toString());
    OutputStream outputStream = fs.create(outputPath);

    CompressionCodec inputCompressionCodec = compressionCodecFactory.getCodec(sourcePaths.get(0));
    if (inputCompressionCodec != null) {
        LOG.info("Input compression codec " + inputCompressionCodec.getClass().getName());
    }

    CompressionCodec outputCompressionCodec = compressionCodecFactory.getCodec(outputPath);
    if (outputCompressionCodec != null) {
        LOG.info("Output compression codec " + outputCompressionCodec.getClass().getName());
        outputStream = outputCompressionCodec.createOutputStream(outputStream);
    }

    List<BufferedReader> readers = new ArrayList<BufferedReader>();
    OutputStreamWriter writer = new OutputStreamWriter(outputStream);

    for (Path partPath : sourcePaths) {
        LOG.info("\tAdding source " + partPath.toString());
        InputStream inputStream = fs.open(partPath);
        if (inputCompressionCodec != null) {
            inputStream = inputCompressionCodec.createInputStream(inputStream);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        readers.add(reader);
    }

    int count = ReadersWriterMergeSort.mergeSort(readers, writer);

    writer.close();
    for (BufferedReader reader : readers) {
        reader.close();
    }
    readers.clear();

    LOG.info("Processed " + count + " lines into " + outputPath.toString());
    return count;
}
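One variant none of the examples above use: createInputStream also has a two-argument overload, createInputStream(InputStream in, Decompressor decompressor), which lets callers reuse pooled Decompressor instances via org.apache.hadoop.io.compress.CodecPool instead of allocating a new one per stream. A minimal sketch, assuming the same codec-resolution pattern as above (the class name PooledCodecRead and method read are illustrative):

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledCodecRead {
    public static void read(Configuration conf, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        if (codec == null) {
            return; // uncompressed; read fs.open(path) directly
        }
        // Borrow a decompressor from the pool rather than allocating one per stream.
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(path), decompressor)) {
            byte[] buffer = new byte[8192];
            while (in.read(buffer) != -1) {
                // process decompressed bytes
            }
        } finally {
            // Always return the decompressor so the pool can hand it to the next caller.
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

This matters mostly in record readers and other hot paths that open many compressed streams, where per-stream Decompressor allocation is measurable.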