Example usage for org.apache.hadoop.io.compress.CompressionCodec.createInputStream

Introduction

This page collects example usages of org.apache.hadoop.io.compress.CompressionCodec#createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
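
Before the examples, here is a minimal sketch of the pattern most of them share: resolve a codec for the file with CompressionCodecFactory and, if one is found, wrap the raw stream with createInputStream so that reads return decompressed bytes. The path argument and the plain Configuration are placeholders for illustration, not taken from any example below.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecInputExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]); // e.g. a .gz or .bz2 file on HDFS (placeholder)

        FileSystem fs = file.getFileSystem(conf);
        // Pick a codec from the file extension (.gz, .bz2, ...); null means uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);

        InputStream in = fs.open(file);
        if (codec != null) {
            // Wrap the raw stream so reads return decompressed bytes.
            in = codec.createInputStream(in);
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

The record readers below follow the same pattern and additionally reset their start and end offsets when a codec is present, because a compressed stream cannot be split at arbitrary byte boundaries.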

Usage

From source file: org.apache.hawq.pxf.plugins.json.JsonRecordReader.java

License: Apache License

/**
 * Create new multi-line json object reader.
 *
 * @param conf
 *            Hadoop context
 * @param split
 *            HDFS split to start the reading from
 * @throws IOException IOException when reading the file
 */
public JsonRecordReader(JobConf conf, FileSplit split) throws IOException {

    this.jsonMemberName = conf.get(RECORD_MEMBER_IDENTIFIER);
    this.maxObjectLength = conf.getInt(RECORD_MAX_LENGTH, Integer.MAX_VALUE);

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        start = 0;
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    parser = new PartitionedJsonParser(is);
    this.pos = start;
}

From source file: org.apache.jena.grande.mapreduce.io.QuadRecordReader.java

License: Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    log.debug("initialize({}, {})", genericSplit, context);

    FileSplit split = (FileSplit) genericSplit;
    profile = Utils.createParserProfile(context, split.getPath()); // RIOT configuration
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: org.apache.jena.hadoop.rdf.io.input.readers.AbstractLineBasedNodeTupleReader.java

License: Apache License

@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Intermediate: uses RDFParser, but needs to produce an Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd,
            IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);

    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and verify we got complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }
    // Skip first line and re-establish "start".
    // This is to do with how line reader reads lines and how
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: org.apache.kylin.job.hadoop.cardinality.HiveColumnCardinalityUpdateJob.java

License: Apache License

private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}

From source file: org.apache.lens.lib.query.TestAbstractFileFormatter.java

License: Apache License

/**
 * Read compressed file.
 *
 * @param finalPath the final path
 * @param conf      the conf
 * @param encoding  the encoding
 * @return the list
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected List<String> readCompressedFile(Path finalPath, Configuration conf, String encoding)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(finalPath);
    FileSystem fs = finalPath.getFileSystem(conf);
    return readFromStream(new InputStreamReader(codec.createInputStream(fs.open(finalPath)), encoding));
}

From source file: org.apache.mahout.classifier.bayes.WikipediaXmlSplitter.java

License: Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);
    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int filenumber = 0;
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                filenumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));

                chunkWriter.write(content.toString(), 0, content.length());
                chunkWriter.close();
                if (filenumber >= numChunks) {
                    break;
                }
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}

From source file: org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java

License: Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);

    // If the specified path for the input file is incorrect, return immediately
    if (!dumpFile.exists()) {
        log.error("Input file path {} doesn't exist", dumpFilePath);
        return;
    }

    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int fileNumber = 0;
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                fileNumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                try {
                    chunkWriter.write(content.toString(), 0, content.length());
                } finally {
                    Closeables.close(chunkWriter, false);
                }
                if (fileNumber >= numChunks) {
                    break;
                }
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}

From source file: org.apache.nifi.processors.hadoop.FetchHDFS.java

License: Apache License

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error(
                        "Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });

}

From source file: org.apache.nifi.processors.hadoop.GetHDFS.java

License: Apache License

protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session) {
    // process the batch of files
    InputStream stream = null;
    CompressionCodec codec = null;
    Configuration conf = getConfiguration();
    FileSystem hdfs = getFileSystem();
    final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
    final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
    int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
            : conf.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
    final Path rootDir = new Path(context.getProperty(DIRECTORY).evaluateAttributeExpressions().getValue());

    final CompressionType compressionType = CompressionType
            .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
    final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
    if (inferCompressionCodec || compressionType != CompressionType.NONE) {
        codec = getCompressionCodec(context, getConfiguration());
    }
    final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    for (final Path file : files) {
        try {
            if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
                continue; // if file is no longer there then move on
            }
            final String originalFilename = file.getName();
            final String relativePath = getPathDifference(rootDir, file);

            stream = getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));

            final String outputFilename;
            // Check if we should infer compression codec
            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(file);
            }
            // Check if compression codec is defined (inferred or otherwise)
            if (codec != null) {
                stream = codec.createInputStream(stream);
                outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
            } else {
                outputFilename = originalFilename;
            }

            FlowFile flowFile = session.create();

            final StopWatch stopWatch = new StopWatch(true);
            flowFile = session.importFrom(stream, flowFile);
            stopWatch.stop();
            final String dataRate = stopWatch.calculateDataRate(flowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);

            flowFile = session.putAttribute(flowFile, CoreAttributes.PATH.key(), relativePath);
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

            if (!keepSourceFiles && !getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
                getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...",
                        new Object[] { file });
                session.remove(flowFile);
                continue;
            }

            session.getProvenanceReporter().receive(flowFile, file.toString());
            session.transfer(flowFile, REL_SUCCESS);
            getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}",
                    new Object[] { flowFile, file, millis, dataRate });
            session.commit();
        } catch (final Throwable t) {
            getLogger().error("Error retrieving file {} from HDFS due to {}", new Object[] { file, t });
            session.rollback();
            context.yield();
        } finally {
            IOUtils.closeQuietly(stream);
            stream = null;
        }
    }
}

From source file: org.apache.pig.piggybank.test.storage.TestMultiStorageCompression.java

License: Apache License

private void verifyResults(String type, List<String> filesToDelete, String outputPath)
        throws IOException, FileNotFoundException {
    // Verify the output
    File outputDir = new File(outputPath);
    List<String> indexFolders = Arrays.asList(outputDir.list());

    // Assert whether all keys are present
    assertTrue(indexFolders.contains("f1." + type));
    assertTrue(indexFolders.contains("f2." + type));
    assertTrue(indexFolders.contains("f3." + type));
    assertTrue(indexFolders.contains("f4." + type));

    // Sort so that assertions are easy
    Collections.sort(indexFolders);

    for (int i = 0; i < indexFolders.size(); i++) {

        String indexFolder = indexFolders.get(i);
        if (indexFolder.startsWith("._SUCCESS") || indexFolder.startsWith("_SUCCESS"))
            continue;
        String topFolder = outputPath + File.separator + indexFolder;
        File indexFolderFile = new File(topFolder);
        filesToDelete.add(topFolder);
        String[] list = indexFolderFile.list();
        for (String outputFile : list) {

            String file = topFolder + File.separator + outputFile;
            filesToDelete.add(file);

            // Skip off any file starting with .
            if (outputFile.startsWith("."))
                continue;

            // Try to read the records using the codec
            CompressionCodec codec = null;

            // Use the codec according to the test case
            if (type.equals("bz2")) {
                codec = new BZip2Codec();
            } else if (type.equals("gz")) {
                codec = new GzipCodec();
            }
            if (codec instanceof Configurable) {
                ((Configurable) codec).setConf(new Configuration());
            }

            CompressionInputStream createInputStream = codec.createInputStream(new FileInputStream(file));
            int b;
            StringBuffer sb = new StringBuffer();
            while ((b = createInputStream.read()) != -1) {
                sb.append((char) b);
            }
            createInputStream.close();

            // Assert for the number of fields and keys.
            String[] fields = sb.toString().split("\\t");
            assertEquals(3, fields.length);
            String id = indexFolder.substring(1, 2);
            assertEquals("f" + id, fields[0]);

        }

    }
}