List of usage examples for org.apache.commons.compress.compressors CompressorStreamFactory createCompressorInputStream
public CompressorInputStream createCompressorInputStream(final InputStream in) throws CompressorException
From source file:es.uvigo.ei.sing.gc.view.ZKUtils.java
public static Data loadDataFromMedia(Media media) throws ArchiveException, IOException, AbortException, Exception { final Reader reader; if (media.isBinary()) { // It seems that, in Windows, big text files are uploaded as binary. final String fileName = media.getName().toLowerCase(); if (fileName.endsWith(".csv") || fileName.endsWith(".txt")) { reader = new InputStreamReader(media.getStreamData()); } else {/*from w w w. j a v a 2s. c om*/ InputStream is; try { final CompressorStreamFactory factory = new CompressorStreamFactory(); is = factory.createCompressorInputStream(new BufferedInputStream(media.getStreamData())); } catch (CompressorException ce) { final ArchiveStreamFactory factory = new ArchiveStreamFactory(); is = factory.createArchiveInputStream(new BufferedInputStream(media.getStreamData())); if (((ArchiveInputStream) is).getNextEntry().isDirectory()) throw new IOException("Invalid archive file format"); } reader = new InputStreamReader(is); } } else { reader = media.getReaderData(); } return LoadClassificationData.loadData(reader, media.getName(), null, null, true, null); }
From source file:mj.ocraptor.extraction.tika.parser.pkg.ZipContainerDetector.java
/**
 * Sniffs the compression format of the first {@code length} bytes of
 * {@code prefix}, returning {@code application/octet-stream} when the
 * bytes are not a recognized compressed stream.
 */
private static MediaType detectCompressorFormat(byte[] prefix, int length) {
    try {
        CompressorInputStream compressed = new CompressorStreamFactory()
                .createCompressorInputStream(new ByteArrayInputStream(prefix, 0, length));
        try {
            return CompressorParser.getMediaType(compressed);
        } finally {
            // Best-effort close: detection must not fail on close errors.
            IOUtils.closeQuietly(compressed);
        }
    } catch (CompressorException notCompressed) {
        return MediaType.OCTET_STREAM;
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pkg.CompressorParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // At the end we want to close the compression stream to release // any associated resources, but the underlying document stream // should not be closed stream = new CloseShieldInputStream(stream); // Ensure that the stream supports the mark feature stream = new BufferedInputStream(stream); CompressorInputStream cis;/*from ww w. j av a 2 s . c o m*/ try { CompressorStreamFactory factory = new CompressorStreamFactory(); CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() { public boolean decompressConcatenated(Metadata metadata) { return false; } }); factory.setDecompressConcatenated(options.decompressConcatenated(metadata)); cis = factory.createCompressorInputStream(stream); } catch (CompressorException e) { throw new TikaException("Unable to uncompress document stream", e); } MediaType type = getMediaType(cis); if (!type.equals(MediaType.OCTET_STREAM)) { metadata.set(CONTENT_TYPE, type.toString()); } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { Metadata entrydata = new Metadata(); String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name != null) { if (name.endsWith(".tbz")) { name = name.substring(0, name.length() - 4) + ".tar"; } else if (name.endsWith(".tbz2")) { name = name.substring(0, name.length() - 5) + ".tar"; } else if (name.endsWith(".bz")) { name = name.substring(0, name.length() - 3); } else if (name.endsWith(".bz2")) { name = name.substring(0, name.length() - 4); } else if (name.endsWith(".xz")) { name = name.substring(0, name.length() - 3); } else if (name.endsWith(".pack")) { name = name.substring(0, name.length() - 5); } else if (name.length() > 0) { name = GzipUtils.getUncompressedFilename(name); } entrydata.set(Metadata.RESOURCE_NAME_KEY, name); } // Use the delegate parser to 
parse the compressed document EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context)); if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(cis, xhtml, entrydata, true); } } finally { cis.close(); } xhtml.endDocument(); }
From source file:org.apache.jackrabbit.oak.benchmark.wikipedia.WikipediaImport.java
public int importWikipedia(Session session) throws Exception { long start = System.currentTimeMillis(); int count = 0; int code = 0; if (doReport) { System.out.format("Importing %s...%n", dump); }// w ww . j a v a2 s .c om String type = "nt:unstructured"; if (session.getWorkspace().getNodeTypeManager().hasNodeType("oak:Unstructured")) { type = "oak:Unstructured"; } Node wikipedia = session.getRootNode().addNode("wikipedia", type); int levels = 0; if (!flat) { // calculate the number of levels needed, based on the rough // estimate that the average XML size of a page is about 1kB for (long pages = dump.length() / 1024; pages > 256; pages /= 256) { levels++; } } String title = null; String text = null; XMLInputFactory factory = XMLInputFactory.newInstance(); StreamSource source; if (dump.getName().endsWith(".xml")) { source = new StreamSource(dump); } else { CompressorStreamFactory csf = new CompressorStreamFactory(); source = new StreamSource( csf.createCompressorInputStream(new BufferedInputStream(new FileInputStream(dump)))); } haltImport = false; XMLStreamReader reader = factory.createXMLStreamReader(source); while (reader.hasNext() && !haltImport) { switch (reader.next()) { case XMLStreamConstants.START_ELEMENT: if ("title".equals(reader.getLocalName())) { title = reader.getElementText(); } else if ("text".equals(reader.getLocalName())) { text = reader.getElementText(); } break; case XMLStreamConstants.END_ELEMENT: if ("page".equals(reader.getLocalName())) { String name = Text.escapeIllegalJcrChars(title); Node parent = wikipedia; if (levels > 0) { int n = name.length(); for (int i = 0; i < levels; i++) { int hash = name.substring(min(i, n)).hashCode(); parent = JcrUtils.getOrAddNode(parent, String.format("%02x", hash & 0xff)); } } Node page = parent.addNode(name); page.setProperty("title", title); page.setProperty("text", text); code += title.hashCode(); code += text.hashCode(); count++; if (count % 1000 == 0) { batchDone(session, start, count); } 
pageAdded(title, text); } break; } } session.save(); if (doReport) { long millis = System.currentTimeMillis() - start; System.out.format("Imported %d pages in %d seconds (%.2fms/page)%n", count, millis / 1000, (double) millis / count); } return code; }