Example usage for org.apache.poi.poifs.filesystem DirectoryNode getEntry

List of usage examples for org.apache.poi.poifs.filesystem DirectoryNode getEntry

Introduction

On this page you can find usage examples for the getEntry method of org.apache.poi.poifs.filesystem.DirectoryNode.

Prototype


public Entry getEntry(final String name) throws FileNotFoundException 

Source Link

Document

Gets the specified Entry by name.

Usage

From source file:NewEmptyJUnitTest.java

/**
 * Checks that text and summary properties can be read from two different
 * embedded Word documents stored in the same container file.
 *
 * @throws Exception if the sample file cannot be opened or parsed
 */
public void testExtractFromEmbeded() throws Exception {
    POIFSFileSystem fs = new POIFSFileSystem(
            POIDataSamples.getSpreadSheetInstance().openResourceAsStream(filename3));

    DirectoryNode firstDir = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7");
    DirectoryNode secondDir = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2");

    // Both embedded directories must expose the WordDocument and 1Table entries
    assertNotNull(firstDir.getEntry("1Table"));
    assertNotNull(firstDir.getEntry("WordDocument"));

    assertNotNull(secondDir.getEntry("1Table"));
    assertNotNull(secondDir.getEntry("WordDocument"));

    // First embedded document
    HWPFDocument firstDoc = new HWPFDocument(firstDir, fs);
    WordExtractor firstExtractor = new WordExtractor(firstDoc);

    assertNotNull(firstExtractor.getText());
    assertTrue(firstExtractor.getText().length() > 20);
    assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
            firstExtractor.getText());
    assertEquals("Sample Doc 1", firstExtractor.getSummaryInformation().getTitle());
    assertEquals("Sample Test", firstExtractor.getSummaryInformation().getSubject());

    // Second embedded document
    HWPFDocument secondDoc = new HWPFDocument(secondDir, fs);
    WordExtractor secondExtractor = new WordExtractor(secondDoc);

    assertNotNull(secondExtractor.getText());
    assertTrue(secondExtractor.getText().length() > 20);
    assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
            secondExtractor.getText());
    assertEquals("Sample Doc 2", secondExtractor.getSummaryInformation().getTitle());
    assertEquals("Another Sample Test", secondExtractor.getSummaryInformation().getSubject());
}

From source file:com.argo.hwp.v5.HwpTextExtractorV5.java

License:Open Source License

/**
 * Reads the {@code FileHeader} stream of an HWP v5 document and decodes the
 * version and flag fields from it.
 *
 * @param fs the OLE2 file system to read the header from
 * @return the decoded header, or {@code null} when the {@code FileHeader}
 *         entry is not a document stream, when 256 bytes could not be read
 *         from it, or when it does not start with {@code HWP_V5_SIGNATURE}
 * @throws IOException if the {@code FileHeader} entry does not exist or the
 *                     stream cannot be read
 */
private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException {
    DirectoryNode root = fs.getRoot();

    // getEntry throws FileNotFoundException when the entry is absent
    Entry headerEntry = root.getEntry("FileHeader");
    if (!headerEntry.isDocumentEntry())
        return null;

    // The header is read as a fixed 256-byte block
    byte[] header = new byte[256];
    DocumentInputStream headerStream = new DocumentInputStream((DocumentEntry) headerEntry);
    try {
        int read = headerStream.read(header);
        if (read != 256
                || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length)))
            return null;
    } finally {
        headerStream.close();
    }

    FileHeader fileHeader = new FileHeader();

    // Version is the unsigned 32-bit value at offset 32, flags the one at offset 36
    fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(header, 32));
    long flags = LittleEndian.getUInt(header, 36);
    // Long.toBinaryString never emits spaces, so pad to 32 columns first;
    // only then does replacing ' ' with '0' produce a fixed-width bit string.
    log.debug("Flags={}", String.format("%32s", Long.toBinaryString(flags)).replace(' ', '0'));

    // Bit 0: compressed, bit 1: encrypted, bit 2: viewtext
    fileHeader.compressed = (flags & 0x01) == 0x01;
    fileHeader.encrypted = (flags & 0x02) == 0x02;
    fileHeader.viewtext = (flags & 0x04) == 0x04;

    return fileHeader;
}

From source file:com.argo.hwp.v5.HwpTextExtractorV5.java

License:Open Source License

/**
 * ? /*from   w  w  w . j a v a  2s .c  o  m*/
 * 
 * @param writer
 * @param source
 * 
 * @return
 * @throws IOException
 */
private static void extractText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException {
    DirectoryNode root = fs.getRoot();

    // BodyText ?
    Entry bodyText = root.getEntry("BodyText");
    if (bodyText == null || !bodyText.isDirectoryEntry())
        throw new IOException("Invalid BodyText");

    Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
    while (iterator.hasNext()) {
        Entry entry = iterator.next();
        if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) {
            log.debug("extract {}", entry.getName());

            InputStream input = new NDocumentInputStream((DocumentEntry) entry);
            if (header.compressed)
                input = new InflaterInputStream(input, new Inflater(true));

            HwpStreamReader sectionStream = new HwpStreamReader(input);

            try {
                extractText(sectionStream, writer);
            } finally {
                // ?  ? ?
                try {
                    input.close();
                } catch (IOException e) {
                    log.error("?   ??", e);
                }
            }
        } else {
            log.warn(" Entry '{}'({})", entry.getName(), entry);
        }
    }
}

From source file:com.ezdi.rtf.testRTFParser.RTFObjDataParser.java

License:Apache License

/**
 * Opens {@code is} as an OLE2 (POIFS) container and extracts the bytes of the
 * object embedded in it.
 * <ul>
 * <li>If the root has a {@code Package} entry, that stream is copied.</li>
 * <li>If the detected type is {@code OLE10_NATIVE}, the native data buffer is
 * returned; an invalid record leaves the result {@code null}.</li>
 * <li>If the detected type is {@code COMP_OBJ}, the {@code CONTENTS} stream
 * (or {@code Contents} as a fallback) is read in full.</li>
 * <li>Otherwise {@code is} is reset and copied whole, and a generated resource
 * name and the detected content type are stored in {@code metadata}.</li>
 * </ul>
 *
 * @param is                   stream positioned at the start of the OLE2 data
 * @param metadata             receives resource name and content type in the fallback case
 * @param unknownFilenameCount counter used to number generated file names
 * @return the extracted bytes, or {@code null} if the container has no root
 *         or the OLE10Native record is invalid
 * @throws IOException if the container or one of its streams cannot be read
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
        throws IOException {

    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

        DirectoryNode root = fs.getRoot();

        if (root == null) {
            return ret;
        }

        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();

            // Close both streams once the copy is done instead of leaving them open
            try (DocumentInputStream packageStream = new DocumentInputStream((DocumentEntry) ooxml);
                    TikaInputStream stream = TikaInputStream.get(packageStream)) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            // try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {

                // The contents stream name is tried in upper case first, then mixed case
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    contentsEntry = (DocumentEntry) root.getEntry("Contents");
                }

                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {

                ByteArrayOutputStream out = new ByteArrayOutputStream();
                // NOTE(review): reset() presumably relies on the caller passing a
                // stream that supports mark/reset - confirm against callers
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY,
                        "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}

From source file:Coop.argo.hwp.v5.HwpTextExtractorV5.java

License:Apache License

/**
 * Extracts text from every {@code Section*} document stream found directly
 * under the {@code BodyText} directory and passes it to {@code writer}.
 * Entries that are not matching document streams are logged and skipped.
 *
 * @param header decoded file header; when {@code header.compressed} is set
 *               each section is inflated as a raw (header-less) deflate stream
 * @param fs     the OLE2 file system of the document
 * @param writer destination handed to the per-section extraction
 * @throws IOException if {@code BodyText} is missing or is not a directory,
 *                     or if reading a section fails
 */
private static void extractBodyText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException {
    DirectoryNode root = fs.getRoot();

    Entry bodyText = root.getEntry("BodyText");
    if (bodyText == null || !bodyText.isDirectoryEntry())
        throw new IOException("Invalid BodyText");

    Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
    while (iterator.hasNext()) {
        Entry entry = iterator.next();
        if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) {
            log.debug("extract {}", entry.getName());

            InputStream input = new NDocumentInputStream((DocumentEntry) entry);
            // Tracked separately: InflaterInputStream.close() does not end an
            // Inflater that was supplied by the caller.
            Inflater inflater = null;
            try {
                if (header.compressed) {
                    inflater = new Inflater(true);
                    input = new InflaterInputStream(input, inflater);
                }

                HwpStreamReader sectionStream = new HwpStreamReader(input);

                extractText(sectionStream, writer);
            } finally {
                // A close failure is logged, not propagated
                try {
                    input.close();
                } catch (IOException e) {
                    log.error("?   ??", e);
                }
                if (inflater != null)
                    inflater.end();
            }
        } else {
            log.warn(" Entry '{}'({})", entry.getName(), entry);
        }
    }
}

From source file:Coop.argo.hwp.v5.HwpTextExtractorV5.java

License:Apache License

/**
 * Extracts text from every {@code Section*} document stream found directly
 * under the {@code ViewText} directory and passes it to {@code writer}.
 * For each stream a key is obtained with {@code readKey}, the stream is
 * wrapped by {@code createDecryptStream}, and it is then optionally inflated.
 * Entries that are not matching document streams are logged and skipped.
 *
 * @param header decoded file header; when {@code header.compressed} is set
 *               the decrypted data is inflated as a raw (header-less) deflate stream
 * @param fs     the OLE2 file system of the document
 * @param writer destination handed to the per-section extraction
 * @throws IOException if {@code ViewText} is missing or is not a directory,
 *                     if reading a section fails, or if the decrypting stream
 *                     cannot be set up (the crypto exception is the cause)
 */
private static void extractViewText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException {
    DirectoryNode root = fs.getRoot();

    Entry viewText = root.getEntry("ViewText");
    if (viewText == null || !viewText.isDirectoryEntry())
        throw new IOException("Invalid ViewText");

    Iterator<Entry> iterator = ((DirectoryEntry) viewText).getEntries();
    while (iterator.hasNext()) {
        Entry entry = iterator.next();
        if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) {
            log.debug("extract {}", entry.getName());

            InputStream input = new NDocumentInputStream((DocumentEntry) entry);
            // Tracked separately: InflaterInputStream.close() does not end an
            // Inflater that was supplied by the caller.
            Inflater inflater = null;
            try {
                // FIXME: the original note on this step is unreadable in this copy.
                // The key is read inside the try so the stream is closed if it fails.
                Key key = readKey(input);
                input = createDecryptStream(input, key);
                if (header.compressed) {
                    inflater = new Inflater(true);
                    input = new InflaterInputStream(input, inflater);
                }

                HwpStreamReader sectionStream = new HwpStreamReader(input);
                extractText(sectionStream, writer);
            } catch (InvalidKeyException e) {
                throw new IOException(e);
            } catch (NoSuchAlgorithmException e) {
                throw new IOException(e);
            } catch (NoSuchPaddingException e) {
                throw new IOException(e);
            } finally {
                // A close failure is logged, not propagated
                try {
                    input.close();
                } catch (IOException e) {
                    log.error("?   ??", e);
                }
                if (inflater != null)
                    inflater.end();
            }
        } else {
            log.warn(" Entry '{}'({})", entry.getName(), entry);
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.SummaryExtractor.java

License:Apache License

/**
 * Reads the entry named {@code entryName} under {@code root} as a property
 * set and, depending on its kind, passes it to {@code parse} as summary
 * information and/or document summary information. A missing entry or a
 * stream that is not a property set is skipped silently.
 *
 * @param root      directory to look the entry up in
 * @param entryName name of the property-set entry
 * @throws IOException   declared by the signature; exceptions raised inside the
 *                       try block are caught and logged by this method
 * @throws TikaException if the property set has an unexpected type or the
 *                       stream does not support mark
 */
private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException {
    try {
        DocumentEntry entry = (DocumentEntry) root.getEntry(entryName);
        DocumentInputStream stream = new DocumentInputStream(entry);
        try {
            PropertySet properties = new PropertySet(stream);
            if (properties.isSummaryInformation()) {
                parse(new SummaryInformation(properties));
            }
            if (properties.isDocumentSummaryInformation()) {
                parse(new DocumentSummaryInformation(properties));
            }
        } finally {
            // The stream used to be left open; release it once parsing is done
            stream.close();
        }
    } catch (FileNotFoundException e) {
        // entry does not exist, just skip it
    } catch (NoPropertySetStreamException e) {
        // no property stream, just skip it
    } catch (UnexpectedPropertySetTypeException e) {
        throw new TikaException("Unexpected HPSF document", e);
    } catch (MarkUnsupportedException e) {
        throw new TikaException("Invalid DocumentInputStream", e);
    } catch (Exception e) {
        LOGGER.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the content of the Word document stored under {@code root} to
 * {@code xhtml}. Output order: image text, headers, main paragraphs, textbox
 * text, footnotes, comments, endnotes, footers, pictures not yet output, and
 * finally embedded documents from the {@code ObjectPool} directory. If the
 * file is rejected as an old Word format, {@code parseWord6} handles it.
 *
 * @param root  directory holding the Word document streams
 * @param xhtml handler that receives the generated content
 * @throws IOException   if reading the document fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        // Pre-Word-97 file: delegate to the Word 6 code path and stop here
        parseWord6(root, xhtml);
        return;
    }

    org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(document);

    // mj
    extractImageText(xhtml, document);

    HeaderStories stories = new HeaderStories(document);

    // Pictures are expected to be in document order and may be placed
    // directly or referenced from an anchor
    PicturesTable picTable = document.getPicturesTable();
    PicturesSource picSource = new PicturesSource(document);

    // Headers first, when there are any
    Range[] headerRanges = new Range[] { stories.getFirstHeaderSubrange(), stories.getEvenHeaderSubrange(),
            stories.getOddHeaderSubrange() };
    handleHeaderFooter(headerRanges, "header", document, picSource, picTable, xhtml);

    // Main body: handleParagraph returns how many extra paragraphs it consumed
    Range mainRange = document.getRange();
    int index = 0;
    while (index < mainRange.numParagraphs()) {
        Paragraph para = mainRange.getParagraph(index);
        index += 1 + handleParagraph(para, 0, mainRange, document, FieldsDocumentPart.MAIN, picSource,
                picTable, xhtml);
    }

    // Remaining text parts, each emitted as plain paragraphs
    for (String text : poiExtractor.getMainTextboxText()) {
        xhtml.element("p", text);
    }

    for (String text : poiExtractor.getFootnoteText()) {
        xhtml.element("p", text);
    }

    for (String text : poiExtractor.getCommentsText()) {
        xhtml.element("p", text);
    }

    for (String text : poiExtractor.getEndnoteText()) {
        xhtml.element("p", text);
    }

    // Footers, when there are any
    Range[] footerRanges = new Range[] { stories.getFirstFooterSubrange(), stories.getEvenFooterSubrange(),
            stories.getOddFooterSubrange() };
    handleHeaderFooter(footerRanges, "footer", document, picSource, picTable, xhtml);

    // Emit every picture that has not been output so far
    Picture picture = picSource.nextUnclaimed();
    while (picture != null) {
        handlePictureCharacterRun(null, picture, picSource, xhtml);
        picture = picSource.nextUnclaimed();
    }

    // Embedded office documents live in "_"-prefixed directories of ObjectPool
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (!child.getName().startsWith("_") || !(child instanceof DirectoryEntry)) {
                continue;
            }
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        }
    } catch (FileNotFoundException ignored) {
        // No ObjectPool entry: nothing embedded to handle
    }
}

From source file:org.apache.tika.parser.microsoft.SummaryExtractor.java

License:Apache License

/**
 * Reads the entry named {@code entryName} under {@code root} as a property
 * set and, depending on its kind, passes it to {@code parse} as summary
 * information and/or document summary information. A missing entry or a
 * stream that is not a property set is skipped silently.
 *
 * @param root      directory to look the entry up in
 * @param entryName name of the property-set entry
 * @throws IOException   declared by the signature; exceptions raised inside the
 *                       try block are caught and logged by this method
 * @throws TikaException if the property set has an unexpected type or the
 *                       stream does not support mark
 */
private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException {
    try {
        DocumentEntry entry = (DocumentEntry) root.getEntry(entryName);
        DocumentInputStream stream = new DocumentInputStream(entry);
        try {
            PropertySet properties = new PropertySet(stream);
            if (properties.isSummaryInformation()) {
                parse(new SummaryInformation(properties));
            }
            if (properties.isDocumentSummaryInformation()) {
                parse(new DocumentSummaryInformation(properties));
            }
        } finally {
            // The stream used to be left open; release it once parsing is done
            stream.close();
        }
    } catch (FileNotFoundException e) {
        // entry does not exist, just skip it
    } catch (NoPropertySetStreamException e) {
        // no property stream, just skip it
    } catch (UnexpectedPropertySetTypeException e) {
        throw new TikaException("Unexpected HPSF document", e);
    } catch (MarkUnsupportedException e) {
        throw new TikaException("Invalid DocumentInputStream", e);
    } catch (Exception e) {
        logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
    }
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the content of the Word document stored under {@code root} to
 * {@code xhtml}. Output order: headers, main paragraphs, textbox text,
 * footnotes, comments, endnotes, footers, pictures not yet output, and
 * finally embedded documents from the {@code ObjectPool} directory. If the
 * file is rejected as an old Word format, {@code parseWord6} handles it.
 *
 * @param root  directory holding the Word document streams
 * @param xhtml handler that receives the generated content
 * @throws IOException   if reading the document fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        // Pre-Word-97 file: delegate to the Word 6 code path and stop here
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories stories = new HeaderStories(document);

    // Pictures are expected to be in document order and may be placed
    // directly or referenced from an anchor
    PicturesTable picTable = document.getPicturesTable();
    PicturesSource picSource = new PicturesSource(document);

    // Headers first, when there are any
    Range[] headerRanges = new Range[] { stories.getFirstHeaderSubrange(), stories.getEvenHeaderSubrange(),
            stories.getOddHeaderSubrange() };
    handleHeaderFooter(headerRanges, "header", document, picSource, picTable, xhtml);

    // Main body: handleParagraph returns how many extra paragraphs it consumed
    Range mainRange = document.getRange();
    ListManager lists = new ListManager(document);
    int index = 0;
    while (index < mainRange.numParagraphs()) {
        Paragraph para = mainRange.getParagraph(index);
        index += 1 + handleParagraph(para, 0, mainRange, document, FieldsDocumentPart.MAIN, picSource,
                picTable, lists, xhtml);
    }

    // Remaining text parts, each emitted as plain paragraphs
    for (String text : poiExtractor.getMainTextboxText()) {
        xhtml.element("p", text);
    }

    for (String text : poiExtractor.getFootnoteText()) {
        xhtml.element("p", text);
    }

    for (String text : poiExtractor.getCommentsText()) {
        xhtml.element("p", text);
    }

    for (String text : poiExtractor.getEndnoteText()) {
        xhtml.element("p", text);
    }

    // Footers, when there are any
    Range[] footerRanges = new Range[] { stories.getFirstFooterSubrange(), stories.getEvenFooterSubrange(),
            stories.getOddFooterSubrange() };
    handleHeaderFooter(footerRanges, "footer", document, picSource, picTable, xhtml);

    // Emit every picture that has not been output so far
    Picture picture = picSource.nextUnclaimed();
    while (picture != null) {
        handlePictureCharacterRun(null, picture, picSource, xhtml);
        picture = picSource.nextUnclaimed();
    }

    // Embedded office documents live in "_"-prefixed directories of ObjectPool
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (!child.getName().startsWith("_") || !(child instanceof DirectoryEntry)) {
                continue;
            }
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        }
    } catch (FileNotFoundException ignored) {
        // No ObjectPool entry: nothing embedded to handle
    }
}