Example usage for org.apache.commons.compress.compressors.CompressorStreamFactory.createCompressorInputStream

Introduction

On this page you can find example usage of org.apache.commons.compress.compressors.CompressorStreamFactory.createCompressorInputStream.

Prototype

public CompressorInputStream createCompressorInputStream(final InputStream in) throws CompressorException 

Document

Create a compressor input stream from an input stream, autodetecting the compressor type from the first few bytes of the stream.
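
Before the examples from real projects below, here is a minimal, self-contained sketch (not taken from those sources) of the autodetection call. The input stream must support mark/reset, so it is wrapped in a BufferedInputStream; the file name "data.gz" is only a placeholder and any format Commons Compress can autodetect (gzip, bzip2, xz, ...) works the same way.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class CompressorAutoDetectExample {
    public static void main(String[] args) throws IOException, CompressorException {
        // "data.gz" is a placeholder path for this sketch
        try (InputStream in = new BufferedInputStream(new FileInputStream("data.gz"));
             CompressorInputStream cis = new CompressorStreamFactory().createCompressorInputStream(in)) {
            byte[] buffer = new byte[8192];
            int n;
            while ((n = cis.read(buffer)) != -1) {
                System.out.write(buffer, 0, n); // consume the decompressed bytes
            }
        }
    }
}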

Usage

From source file: es.uvigo.ei.sing.gc.view.ZKUtils.java

public static Data loadDataFromMedia(Media media)
        throws ArchiveException, IOException, AbortException, Exception {
    final Reader reader;
    if (media.isBinary()) {
        // It seems that, in Windows, big text files are uploaded as binary.
        final String fileName = media.getName().toLowerCase();
        if (fileName.endsWith(".csv") || fileName.endsWith(".txt")) {
            reader = new InputStreamReader(media.getStreamData());
        } else {
            InputStream is;
            try {
                final CompressorStreamFactory factory = new CompressorStreamFactory();
                is = factory.createCompressorInputStream(new BufferedInputStream(media.getStreamData()));
            } catch (CompressorException ce) {
                final ArchiveStreamFactory factory = new ArchiveStreamFactory();
                is = factory.createArchiveInputStream(new BufferedInputStream(media.getStreamData()));

                if (((ArchiveInputStream) is).getNextEntry().isDirectory())
                    throw new IOException("Invalid archive file format");
            }

            reader = new InputStreamReader(is);
        }
    } else {
        reader = media.getReaderData();
    }

    return LoadClassificationData.loadData(reader, media.getName(), null, null, true, null);
}

From source file: mj.ocraptor.extraction.tika.parser.pkg.ZipContainerDetector.java

private static MediaType detectCompressorFormat(byte[] prefix, int length) {
    try {
        CompressorStreamFactory factory = new CompressorStreamFactory();
        CompressorInputStream cis = factory
                .createCompressorInputStream(new ByteArrayInputStream(prefix, 0, length));
        try {
            return CompressorParser.getMediaType(cis);
        } finally {
            IOUtils.closeQuietly(cis);
        }
    } catch (CompressorException e) {
        return MediaType.OCTET_STREAM;
    }
}

From source file: mj.ocraptor.extraction.tika.parser.pkg.CompressorParser.java

public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // At the end we want to close the compression stream to release
    // any associated resources, but the underlying document stream
    // should not be closed
    stream = new CloseShieldInputStream(stream);

    // Ensure that the stream supports the mark feature
    stream = new BufferedInputStream(stream);

    CompressorInputStream cis;
    try {
        CompressorStreamFactory factory = new CompressorStreamFactory();
        CompressorParserOptions options = context.get(CompressorParserOptions.class,
                new CompressorParserOptions() {
                    public boolean decompressConcatenated(Metadata metadata) {
                        return false;
                    }
                });
        factory.setDecompressConcatenated(options.decompressConcatenated(metadata));
        cis = factory.createCompressorInputStream(stream);
    } catch (CompressorException e) {
        throw new TikaException("Unable to uncompress document stream", e);
    }

    MediaType type = getMediaType(cis);
    if (!type.equals(MediaType.OCTET_STREAM)) {
        metadata.set(CONTENT_TYPE, type.toString());
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    try {
        Metadata entrydata = new Metadata();
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            if (name.endsWith(".tbz")) {
                name = name.substring(0, name.length() - 4) + ".tar";
            } else if (name.endsWith(".tbz2")) {
                name = name.substring(0, name.length() - 5) + ".tar";
            } else if (name.endsWith(".bz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".bz2")) {
                name = name.substring(0, name.length() - 4);
            } else if (name.endsWith(".xz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".pack")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.length() > 0) {
                name = GzipUtils.getUncompressedFilename(name);
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
        }

        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
                new ParsingEmbeddedDocumentExtractor(context));
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }

    xhtml.endDocument();
}

From source file: org.apache.jackrabbit.oak.benchmark.wikipedia.WikipediaImport.java

public int importWikipedia(Session session) throws Exception {
    long start = System.currentTimeMillis();
    int count = 0;
    int code = 0;

    if (doReport) {
        System.out.format("Importing %s...%n", dump);
    }

    String type = "nt:unstructured";
    if (session.getWorkspace().getNodeTypeManager().hasNodeType("oak:Unstructured")) {
        type = "oak:Unstructured";
    }
    Node wikipedia = session.getRootNode().addNode("wikipedia", type);

    int levels = 0;
    if (!flat) {
        // calculate the number of levels needed, based on the rough
        // estimate that the average XML size of a page is about 1kB
        for (long pages = dump.length() / 1024; pages > 256; pages /= 256) {
            levels++;
        }
    }

    String title = null;
    String text = null;
    XMLInputFactory factory = XMLInputFactory.newInstance();
    StreamSource source;
    if (dump.getName().endsWith(".xml")) {
        source = new StreamSource(dump);
    } else {
        CompressorStreamFactory csf = new CompressorStreamFactory();
        source = new StreamSource(
                csf.createCompressorInputStream(new BufferedInputStream(new FileInputStream(dump))));
    }
    haltImport = false;
    XMLStreamReader reader = factory.createXMLStreamReader(source);
    while (reader.hasNext() && !haltImport) {
        switch (reader.next()) {
        case XMLStreamConstants.START_ELEMENT:
            if ("title".equals(reader.getLocalName())) {
                title = reader.getElementText();
            } else if ("text".equals(reader.getLocalName())) {
                text = reader.getElementText();
            }
            break;
        case XMLStreamConstants.END_ELEMENT:
            if ("page".equals(reader.getLocalName())) {
                String name = Text.escapeIllegalJcrChars(title);
                Node parent = wikipedia;
                if (levels > 0) {
                    int n = name.length();
                    for (int i = 0; i < levels; i++) {
                        int hash = name.substring(min(i, n)).hashCode();
                        parent = JcrUtils.getOrAddNode(parent, String.format("%02x", hash & 0xff));
                    }
                }
                Node page = parent.addNode(name);
                page.setProperty("title", title);
                page.setProperty("text", text);
                code += title.hashCode();
                code += text.hashCode();
                count++;
                if (count % 1000 == 0) {
                    batchDone(session, start, count);
                }

                pageAdded(title, text);
            }
            break;
        }
    }

    session.save();

    if (doReport) {
        long millis = System.currentTimeMillis() - start;
        System.out.format("Imported %d pages in %d seconds (%.2fms/page)%n", count, millis / 1000,
                (double) millis / count);
    }

    return code;
}