List of usage examples for org.apache.hadoop.io.compress.CompressionCodec#createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
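All of the examples below follow the same basic pattern: obtain a codec for the file (usually from a CompressionCodecFactory, occasionally by constructing one directly) and, if a codec is found, wrap the raw input stream with createInputStream so that reads return decompressed bytes. A minimal, self-contained sketch of that pattern is shown here for orientation; the file path /tmp/sample.txt.gz and the fallback to the raw stream when no codec matches are illustrative assumptions, not taken from any of the quoted sources.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical input; a .gz, .bz2 or uncompressed path is handled the same way.
        Path file = new Path("/tmp/sample.txt.gz");
        FileSystem fs = file.getFileSystem(conf);

        // Resolve a codec from the file name; null means the file is not compressed.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(file);

        InputStream in = fs.open(file);
        if (codec != null) {
            // Wrap the raw stream so that read() returns decompressed bytes.
            in = codec.createInputStream(in);
        }
        try {
            byte[] buf = new byte[4096];
            int n;
            while ((n = in.read(buf)) != -1) {
                System.out.write(buf, 0, n);
            }
        } finally {
            in.close();
        }
    }
}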
From source file:org.apache.hawq.pxf.plugins.json.JsonRecordReader.java
License:Apache License
/**
 * Create new multi-line json object reader.
 *
 * @param conf
 *            Hadoop context
 * @param split
 *            HDFS split to start the reading from
 * @throws IOException IOException when reading the file
 */
public JsonRecordReader(JobConf conf, FileSplit split) throws IOException {
    this.jsonMemberName = conf.get(RECORD_MEMBER_IDENTIFIER);
    this.maxObjectLength = conf.getInt(RECORD_MAX_LENGTH, Integer.MAX_VALUE);

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        start = 0;
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    parser = new PartitionedJsonParser(is);
    this.pos = start;
}
From source file:org.apache.jena.grande.mapreduce.io.QuadRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    log.debug("initialize({}, {})", genericSplit, context);
    FileSplit split = (FileSplit) genericSplit;
    profile = Utils.createParserProfile(context, split.getPath()); // RIOT configuration
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:org.apache.jena.hadoop.rdf.io.input.readers.AbstractLineBasedNodeTupleReader.java
License:Apache License
@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Intermediate : RDFParser but need to make a Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd,
            IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);

    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and verify we got complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }

    // Skip first line and re-establish "start".
    // This is to do with how line reader reads lines and how
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:org.apache.kylin.job.hadoop.cardinality.HiveColumnCardinalityUpdateJob.java
License:Apache License
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
From source file:org.apache.lens.lib.query.TestAbstractFileFormatter.java
License:Apache License
/**
 * Read compressed file.
 *
 * @param finalPath the final path
 * @param conf      the conf
 * @param encoding  the encoding
 * @return the list
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected List<String> readCompressedFile(Path finalPath, Configuration conf, String encoding)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(finalPath);
    FileSystem fs = finalPath.getFileSystem(conf);
    return readFromStream(new InputStreamReader(codec.createInputStream(fs.open(finalPath)), encoding));
}
From source file:org.apache.mahout.classifier.bayes.WikipediaXmlSplitter.java
License:Apache License
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
            .withShortName("n").create();

    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n"
            + " <namespaces>\n" + " <namespace key=\"-2\">Media</namespace>\n"
            + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n"
            + " <namespace key=\"1\">Talk</namespace>\n" + " <namespace key=\"2\">User</namespace>\n"
            + " <namespace key=\"3\">User talk</namespace>\n" + " <namespace key=\"4\">Wikipedia</namespace>\n"
            + " <namespace key=\"5\">Wikipedia talk</namespace>\n" + " <namespace key=\"6\">Image</namespace>\n"
            + " <namespace key=\"7\">Image talk</namespace>\n" + " <namespace key=\"8\">MediaWiki</namespace>\n"
            + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + " <namespace key=\"10\">Template</namespace>\n"
            + " <namespace key=\"11\">Template talk</namespace>\n" + " <namespace key=\"12\">Help</namespace>\n"
            + " <namespace key=\"13\">Help talk</namespace>\n" + " <namespace key=\"14\">Category</namespace>\n"
            + " <namespace key=\"15\">Category talk</namespace>\n"
            + " <namespace key=\"100\">Portal</namespace>\n"
            + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n" + " </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);
    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int filenumber = 0;
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                filenumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                chunkWriter.write(content.toString(), 0, content.length());
                chunkWriter.close();
                if (filenumber >= numChunks) {
                    break;
                }
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}
From source file:org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java
License:Apache License
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
            .withShortName("n").create();

    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n"
            + " <namespaces>\n" + " <namespace key=\"-2\">Media</namespace>\n"
            + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n"
            + " <namespace key=\"1\">Talk</namespace>\n" + " <namespace key=\"2\">User</namespace>\n"
            + " <namespace key=\"3\">User talk</namespace>\n" + " <namespace key=\"4\">Wikipedia</namespace>\n"
            + " <namespace key=\"5\">Wikipedia talk</namespace>\n" + " <namespace key=\"6\">Image</namespace>\n"
            + " <namespace key=\"7\">Image talk</namespace>\n" + " <namespace key=\"8\">MediaWiki</namespace>\n"
            + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + " <namespace key=\"10\">Template</namespace>\n"
            + " <namespace key=\"11\">Template talk</namespace>\n" + " <namespace key=\"12\">Help</namespace>\n"
            + " <namespace key=\"13\">Help talk</namespace>\n" + " <namespace key=\"14\">Category</namespace>\n"
            + " <namespace key=\"15\">Category talk</namespace>\n"
            + " <namespace key=\"100\">Portal</namespace>\n"
            + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n" + " </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);

    // If the specified path for the input file is incorrect, return immediately
    if (!dumpFile.exists()) {
        log.error("Input file path {} doesn't exist", dumpFilePath);
        return;
    }

    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int fileNumber = 0;
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                fileNumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                try {
                    chunkWriter.write(content.toString(), 0, content.length());
                } finally {
                    Closeables.close(chunkWriter, false);
                }
                if (fileNumber >= numChunks) {
                    break;
                }
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}
From source file:org.apache.nifi.processors.hadoop.FetchHDFS.java
License:Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error(
                        "Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });
}
From source file:org.apache.nifi.processors.hadoop.GetHDFS.java
License:Apache License
protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session) {
    // process the batch of files
    InputStream stream = null;
    CompressionCodec codec = null;
    Configuration conf = getConfiguration();
    FileSystem hdfs = getFileSystem();
    final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
    final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
    int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
            : conf.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
    final Path rootDir = new Path(context.getProperty(DIRECTORY).evaluateAttributeExpressions().getValue());

    final CompressionType compressionType = CompressionType
            .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
    final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
    if (inferCompressionCodec || compressionType != CompressionType.NONE) {
        codec = getCompressionCodec(context, getConfiguration());
    }
    final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    for (final Path file : files) {
        try {
            if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
                continue; // if file is no longer there then move on
            }
            final String originalFilename = file.getName();
            final String relativePath = getPathDifference(rootDir, file);

            stream = getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));

            final String outputFilename;
            // Check if we should infer compression codec
            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(file);
            }
            // Check if compression codec is defined (inferred or otherwise)
            if (codec != null) {
                stream = codec.createInputStream(stream);
                outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
            } else {
                outputFilename = originalFilename;
            }

            FlowFile flowFile = session.create();

            final StopWatch stopWatch = new StopWatch(true);
            flowFile = session.importFrom(stream, flowFile);
            stopWatch.stop();
            final String dataRate = stopWatch.calculateDataRate(flowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);

            flowFile = session.putAttribute(flowFile, CoreAttributes.PATH.key(), relativePath);
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

            if (!keepSourceFiles && !getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
                getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...",
                        new Object[] { file });
                session.remove(flowFile);
                continue;
            }

            session.getProvenanceReporter().receive(flowFile, file.toString());
            session.transfer(flowFile, REL_SUCCESS);
            getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}",
                    new Object[] { flowFile, file, millis, dataRate });
            session.commit();
        } catch (final Throwable t) {
            getLogger().error("Error retrieving file {} from HDFS due to {}", new Object[] { file, t });
            session.rollback();
            context.yield();
        } finally {
            IOUtils.closeQuietly(stream);
            stream = null;
        }
    }
}
From source file:org.apache.pig.piggybank.test.storage.TestMultiStorageCompression.java
License:Apache License
private void verifyResults(String type, List<String> filesToDelete, String outputPath)
        throws IOException, FileNotFoundException {
    // Verify the output
    File outputDir = new File(outputPath);
    List<String> indexFolders = Arrays.asList(outputDir.list());

    // Assert whether all keys are present
    assertTrue(indexFolders.contains("f1." + type));
    assertTrue(indexFolders.contains("f2." + type));
    assertTrue(indexFolders.contains("f3." + type));
    assertTrue(indexFolders.contains("f4." + type));

    // Sort so that assertions are easy
    Collections.sort(indexFolders);

    for (int i = 0; i < indexFolders.size(); i++) {
        String indexFolder = indexFolders.get(i);
        if (indexFolder.startsWith("._SUCCESS") || indexFolder.startsWith("_SUCCESS"))
            continue;
        String topFolder = outputPath + File.separator + indexFolder;

        File indexFolderFile = new File(topFolder);
        filesToDelete.add(topFolder);
        String[] list = indexFolderFile.list();
        for (String outputFile : list) {

            String file = topFolder + File.separator + outputFile;
            filesToDelete.add(file);

            // Skip off any file starting with .
            if (outputFile.startsWith("."))
                continue;

            // Try to read the records using the codec
            CompressionCodec codec = null;

            // Use the codec according to the test case
            if (type.equals("bz2")) {
                codec = new BZip2Codec();
            } else if (type.equals("gz")) {
                codec = new GzipCodec();
            }
            if (codec instanceof Configurable) {
                ((Configurable) codec).setConf(new Configuration());
            }
            CompressionInputStream createInputStream = codec.createInputStream(new FileInputStream(file));
            int b;
            StringBuffer sb = new StringBuffer();
            while ((b = createInputStream.read()) != -1) {
                sb.append((char) b);
            }
            createInputStream.close();

            // Assert for the number of fields and keys.
            String[] fields = sb.toString().split("\\t");
            assertEquals(3, fields.length);
            String id = indexFolder.substring(1, 2);
            assertEquals("f" + id, fields[0]);
        }
    }
}