List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem createDocumentInputStream
public DocumentInputStream createDocumentInputStream(final String documentName) throws IOException
From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *///from w w w . j a v a2 s . c o m private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip return; } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); } byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:org.apache.tika.parser.wordperfect.QPWTextExtractor.java
License:Apache License
@SuppressWarnings("resource") public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException { POIFSFileSystem pfs = new POIFSFileSystem(input); DirectoryNode rootNode = pfs.getRoot(); if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) { throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME + "\". Found: " + rootNode.getEntryNames()); }//from w ww . j a v a2 s. c o m //TODO shall we validate and throw warning/error if the file does not //start with a BOF and ends with a EOF? xhtml.startElement("p"); try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) { Context ctx = new Context(in, xhtml, metadata); while (hasNext(in)) { ctx.type = in.readWPShort(); ctx.bodyLength = in.readWPShort(); Extractor extractor = EXTRACTORS.get(ctx.type); if (extractor != null) { extractor.extract(ctx); } else { // Use DEBUG to find out what we are ignoring // Extractor.DEBUG.extract(ctx); Extractor.IGNORE.extract(ctx); } } } xhtml.endElement("p"); }
From source file:org.jab.docsearch.converters.Excel.java
License:Open Source License
/** * @see ConverterInterface#parse()//w ww . j ava 2s. c o m */ @Override public void parse() throws ConverterException { if (filename == null) { log.error("parse() filename is null"); throw new ConverterException("Word::parse() filename is null"); } // get meta data FileInputStream fin = null; try { fin = new FileInputStream(filename); POIFSReader r = new POIFSReader(); MyPOIFSReaderListener mpfsrl = new MyPOIFSReaderListener(); r.registerListener(mpfsrl, "\005SummaryInformation"); r.read(fin); fin.close(); // get meta data documentTitle = mpfsrl.getTitle(); documentAuthor = mpfsrl.getAuthor(); documentKeywords = mpfsrl.getKeywords(); } catch (IOException ioe) { log.error("parse() failed at Excel file=" + filename, ioe); throw new ConverterException("Excel::parse() failed at Excel file=" + filename, ioe); } catch (Exception e) { log.error("parse() failed at Excel file=" + filename, e); throw new ConverterException("Excel::parse() failed", e); } finally { IOUtils.closeQuietly(fin); } if (log.isDebugEnabled()) { log.debug("parse() Excel file='" + filename + "'" + Layout.LINE_SEP + "title='" + documentTitle + "'" + Layout.LINE_SEP + "author='" + documentAuthor + "'" + Layout.LINE_SEP + "keywords='" + documentKeywords + "'"); } // get text DocumentInputStream din = null; ExcelListener el = new ExcelListener(); try { // proceed to write to file // create a new file input stream with the input file specified // at the command line fin = new FileInputStream(filename); POIFSFileSystem poifs = new POIFSFileSystem(fin); din = poifs.createDocumentInputStream("Workbook"); HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(el); HSSFEventFactory factory = new HSSFEventFactory(); factory.processEvents(req, din); fin.close(); // get text documentText = el.getText().toString(); } catch (IOException ioe) { log.error("parse() failed at Excel file=" + filename, ioe); throw new ConverterException("Excel::parse() failed at Excel file=" + filename, ioe); } catch 
(Exception e) { log.error("parse() failed", e); throw new ConverterException("Excel::parse() failed", e); } finally { IOUtils.closeQuietly(din); IOUtils.closeQuietly(fin); } }
From source file:org.jberet.support.io.ExcelEventItemReader.java
License:Open Source License
/**
 * Sets up the HSSF event pipeline for the workbook and starts a background
 * thread that pumps records through it.
 *
 * <p>Records are pushed into {@code queue} by {@code HSSFListenerImpl}
 * (presumably on the background thread, for the reader to drain — TODO
 * confirm against HSSFListenerImpl). The pump thread stops when the
 * listener signals completion via {@link ReadCompletedException}.
 *
 * @param startRowNumber the row to start reading from (unused here;
 *                       presumably honored by the listener — verify)
 * @throws Exception if the OLE2 container or JSON machinery cannot be initialized
 */
@Override
protected void initWorkbookAndSheet(final int startRowNumber) throws Exception {
    // queueCapacity == 0 means "unbounded in practice": fall back to the
    // maximum number of rows a worksheet can hold
    queue = new ArrayBlockingQueue<Object>(queueCapacity == 0 ? MAX_WORKSHEET_ROWS : queueCapacity);
    final POIFSFileSystem poifs = new POIFSFileSystem(inputStream);
    // get the Workbook (excel part) stream in a InputStream; kept as a field
    // because the background thread below reads from it
    documentInputStream = poifs.createDocumentInputStream("Workbook");
    final HSSFRequest req = new HSSFRequest();
    final MissingRecordAwareHSSFListener missingRecordAwareHSSFListener =
            new MissingRecordAwareHSSFListener(new HSSFListenerImpl(this));
    /*
     * Need to use English locale here because Jackson double parsing might
     * break in certain regions where ',' is used as decimal separator
     * instead of '.'.
     */
    formatListener = new FormatTrackingHSSFListener(missingRecordAwareHSSFListener, Locale.ENGLISH);
    req.addListenerForAllRecords(formatListener);
    final HSSFEventFactory factory = new HSSFEventFactory();
    if (objectMapper == null) {
        initJsonFactoryAndObjectMapper();
    }
    // Pump all workbook records on a separate thread; ReadCompletedException
    // is the listener's deliberate "stop reading" signal, not an error
    new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                factory.processEvents(req, documentInputStream);
            } catch (final ReadCompletedException e) {
                SupportLogger.LOGGER.tracef("Completed reading %s%n", resource);
            }
        }
    }).start();
}
From source file:org.jreserve.gui.poi.read.xls.XlsReader.java
License:Open Source License
/**
 * Opens the given file as an OLE2 container and positions a document
 * stream on the {@code DOCUMENT_NAME} entry. Both streams are stored in
 * fields ({@code fin}, {@code din}); their closing is presumably handled
 * elsewhere in this class — NOTE(review): the local POIFSFileSystem itself
 * is never closed here; confirm the reader's close path releases fin.
 *
 * @param file the xls file to open
 * @throws IOException if the file cannot be opened or is not a valid OLE2 container
 */
private void openFile(File file) throws IOException {
    fin = new FileInputStream(file);
    POIFSFileSystem pfs = new POIFSFileSystem(fin);
    din = pfs.createDocumentInputStream(DOCUMENT_NAME);
}
From source file:org.opencrx.kernel.text.ExcelToText.java
License:BSD License
/** * Extract text from XLS./*from w w w. j a v a 2s .c o m*/ * * @param document * @return * @throws ServiceException */ public Reader parse(InputStream document) throws ServiceException { try { this.text.setLength(0); POIFSFileSystem fs = new POIFSFileSystem(document); InputStream workbook = fs.createDocumentInputStream("Workbook"); HSSFRequest request = new HSSFRequest(); request.addListenerForAllRecords(this); HSSFEventFactory eventFactory = new HSSFEventFactory(); try { eventFactory.processEvents(request, workbook); } catch (Exception e) { throw new ServiceException(e); } catch (NoSuchMethodError e) { throw new ServiceException(BasicException.toExceptionStack(e)); } workbook.close(); return new StringReader(this.text.toString()); } catch (IOException e) { throw new ServiceException(e); } }
From source file:org.textmining.extraction.excel.ExcelTextExtractor.java
License:Open Source License
/**
 * Loads the complete "Workbook" record stream of an xls document into
 * {@code _recordStream} for later extraction.
 *
 * @param in stream positioned at the OLE2 container
 * @throws IOException if the container is invalid or the workbook entry
 *         cannot be read in full
 */
public ExcelTextExtractor(InputStream in) throws IOException {
    POIFSFileSystem poifs = new POIFSFileSystem(in);
    DocumentEntry headerProps = (DocumentEntry) poifs.getRoot().getEntry("Workbook");
    // try-with-resources: the original leaked din if the read threw
    try (DocumentInputStream din = poifs.createDocumentInputStream("Workbook")) {
        _recordStream = new byte[headerProps.getSize()];
        // FIX: read() may return fewer bytes than requested; readFully
        // guarantees the buffer is filled (or throws), so the record
        // stream cannot be silently truncated
        din.readFully(_recordStream);
    }
}
From source file:org.tonguetied.datatransfer.importing.ExcelImporter.java
License:Apache License
/** * This method initializes the parser enabling the parser to handle the * excel document./*from www. j a v a2 s. co m*/ * * @param input the byte code representation of the excel document * @throws ImportException if the input data fails to be parsed */ private void loadData(byte[] input) throws ImportException { ByteArrayInputStream bais = null; InputStream dis = null; try { bais = new ByteArrayInputStream(input); // create a new org.apache.poi.poifs.filesystem.Filesystem POIFSFileSystem poifs = new POIFSFileSystem(bais); // get the Workbook (excel part) stream in a InputStream dis = poifs.createDocumentInputStream("Workbook"); // construct out HSSFRequest object HSSFRequest req = new HSSFRequest(); // lazy listen for ALL records with the listener shown above req.addListenerForAllRecords(parser); // create our event factory HSSFEventFactory factory = new HSSFEventFactory(); // process our events based on the document input stream factory.processEvents(req, dis); } catch (IOException ioe) { throw new ImportException(ioe); } finally { // and our document input stream (don't want to leak these!) close(dis); // once all the events are processed close our file input stream close(bais); } }
From source file:org.tonguetied.datatransfer.importing.ExcelLanguageCentricParserTest.java
License:Apache License
/**
 * Test method for {@link org.tonguetied.datatransfer.importing.ExcelLanguageCentricParser#processRecord(org.apache.poi.hssf.record.Record)}.
 *
 * <p>Streams a known fixture workbook through the parser, then verifies the
 * languages and keywords it collected.
 */
@Test
public final void testProcessRecord() throws Exception {
    ExcelLanguageCentricParser parser = new ExcelLanguageCentricParser(keywordService);
    InputStream is = null;
    try {
        // open the fixture workbook from the test data directory
        File input = new File(TEST_DATA_DIR, "LanguageCentricImportData.xls");
        is = new BufferedInputStream(new FileInputStream(input));
        // create a new org.apache.poi.poifs.filesystem.Filesystem
        POIFSFileSystem poifs = new POIFSFileSystem(is);
        // get the Workbook (excel part) stream in a InputStream
        InputStream din = poifs.createDocumentInputStream("Workbook");
        // construct our HSSFRequest object and listen for ALL records
        HSSFRequest req = new HSSFRequest();
        req.addListenerForAllRecords(parser);
        // process our events based on the document input stream
        HSSFEventFactory factory = new HSSFEventFactory();
        factory.processEvents(req, din);
    } finally {
        // once all the events are processed close our file input stream
        if (is != null)
            is.close();
    }
    // the fixture declares exactly these four languages
    List<Language> languages = parser.getLanguages();
    assertEquals(4, languages.size());
    assertTrue(languages.contains(defaultLanguage));
    assertTrue(languages.contains(hebrew));
    assertTrue(languages.contains(simplifiedChinese));
    assertTrue(languages.contains(traditionalChinese));
    // spot-check two of the eight keywords against the expected fixtures
    Map<String, Keyword> keywords = parser.getKeywords();
    assertEquals(8, keywords.size());
    Keyword actual = keywords.get(keyword1.getKeyword());
    assessKeyword(keyword1, actual);
    actual = keywords.get(keyword2.getKeyword());
    assessKeyword(keyword2, actual);
}
From source file:org.tonguetied.datatransfer.importing.KeywordExcelParserTest.java
License:Apache License
/**
 * Test method for {@link org.tonguetied.datatransfer.importing.ExcelKeywordParser#processRecord(org.apache.poi.hssf.record.Record)}.
 * (The original Javadoc linked ExcelLanguageCentricParser — a copy-paste slip;
 * this test exercises the keyword parser.)
 *
 * <p>Streams a known fixture workbook through the parser, then verifies the
 * keywords it collected and the error codes it reported.
 */
@Test
public final void testProcessRecord() throws Exception {
    ExcelParser parser = new ExcelKeywordParser(keywordService);
    InputStream is = null;
    try {
        // open the fixture workbook from the test data directory
        File input = new File(TEST_DATA_DIR, "KeywordExcelParserTest.xls");
        is = new BufferedInputStream(new FileInputStream(input));
        // create a new org.apache.poi.poifs.filesystem.Filesystem
        POIFSFileSystem poifs = new POIFSFileSystem(is);
        // get the Workbook (excel part) stream in a InputStream
        InputStream din = poifs.createDocumentInputStream("Workbook");
        // construct our HSSFRequest object and listen for ALL records
        HSSFRequest req = new HSSFRequest();
        req.addListenerForAllRecords(parser);
        // process our events based on the document input stream
        HSSFEventFactory factory = new HSSFEventFactory();
        factory.processEvents(req, din);
    } finally {
        // once all the events are processed close our file input stream
        if (is != null)
            is.close();
    }
    // the fixture declares exactly seven keywords
    Map<String, Keyword> keywords = parser.getKeywords();
    assertEquals(7, keywords.size());
    Keyword actual = keywords.get(keyword1.getKeyword());
    assessKeyword(keyword1, actual);
    actual = keywords.get(keyword2.getKeyword());
    assessKeyword(keyword2, actual);
    actual = keywords.get(keyword3.getKeyword());
    assessKeyword(keyword3, actual);
    actual = keywords.get(keyword4.getKeyword());
    assessKeyword(keyword4, actual);
    // keyword4 has no translations in the fixture
    assertTrue(actual.getTranslations().isEmpty());
    actual = keywords.get(keyword5.getKeyword());
    assessKeyword(keyword5, actual);
    // the fixture contains exactly these six distinct import errors
    final List<ImportErrorCode> errorCodes = parser.getErrorCodes();
    assertEquals(6, errorCodes.size());
    assertTrue(errorCodes.contains(ImportErrorCode.unknownCountry));
    assertTrue(errorCodes.contains(ImportErrorCode.illegalCountry));
    assertTrue(errorCodes.contains(ImportErrorCode.unknownLanguage));
    assertTrue(errorCodes.contains(ImportErrorCode.illegalLanguage));
    assertTrue(errorCodes.contains(ImportErrorCode.unknownBundle));
    assertTrue(errorCodes.contains(ImportErrorCode.illegalTranslationState));
}