List of usage examples for org.apache.poi.poifs.filesystem DirectoryNode getEntry
public Entry getEntry(final String name) throws FileNotFoundException
From source file:NewEmptyJUnitTest.java
/** * Test that we can get data from two different * embeded word documents//w w w. j a v a 2 s . c o m * @throws Exception */ public void testExtractFromEmbeded() throws Exception { POIFSFileSystem fs = new POIFSFileSystem( POIDataSamples.getSpreadSheetInstance().openResourceAsStream(filename3)); HWPFDocument doc; WordExtractor extractor3; DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7"); DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2"); // Should have WordDocument and 1Table assertNotNull(dirA.getEntry("1Table")); assertNotNull(dirA.getEntry("WordDocument")); assertNotNull(dirB.getEntry("1Table")); assertNotNull(dirB.getEntry("WordDocument")); // Check each in turn doc = new HWPFDocument(dirA, fs); extractor3 = new WordExtractor(doc); assertNotNull(extractor3.getText()); assertTrue(extractor3.getText().length() > 20); assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractor3.getText()); assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle()); assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject()); doc = new HWPFDocument(dirB, fs); extractor3 = new WordExtractor(doc); assertNotNull(extractor3.getText()); assertTrue(extractor3.getText().length() > 20); assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractor3.getText()); assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle()); assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject()); }
From source file:com.argo.hwp.v5.HwpTextExtractorV5.java
License:Open Source License
/** * HWP? FileHeader //ww w .j a v a 2 s. c o m * * @param fs * @return * @throws IOException */ private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException { DirectoryNode root = fs.getRoot(); // ??? p.18 // FileHeader Entry headerEntry = root.getEntry("FileHeader"); if (!headerEntry.isDocumentEntry()) return null; // ? byte[] header = new byte[256]; // FileHeader ? 256 DocumentInputStream headerStream = new DocumentInputStream((DocumentEntry) headerEntry); try { int read = headerStream.read(header); if (read != 256 || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length))) return null; } finally { headerStream.close(); } FileHeader fileHeader = new FileHeader(); // . debug fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(header, 32)); long flags = LittleEndian.getUInt(header, 36); log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0')); fileHeader.compressed = (flags & 0x01) == 0x01; fileHeader.encrypted = (flags & 0x02) == 0x02; fileHeader.viewtext = (flags & 0x04) == 0x04; return fileHeader; }
From source file:com.argo.hwp.v5.HwpTextExtractorV5.java
License:Open Source License
/** * ? /*from w w w . j a v a 2s .c o m*/ * * @param writer * @param source * * @return * @throws IOException */ private static void extractText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException { DirectoryNode root = fs.getRoot(); // BodyText ? Entry bodyText = root.getEntry("BodyText"); if (bodyText == null || !bodyText.isDirectoryEntry()) throw new IOException("Invalid BodyText"); Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries(); while (iterator.hasNext()) { Entry entry = iterator.next(); if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) { log.debug("extract {}", entry.getName()); InputStream input = new NDocumentInputStream((DocumentEntry) entry); if (header.compressed) input = new InflaterInputStream(input, new Inflater(true)); HwpStreamReader sectionStream = new HwpStreamReader(input); try { extractText(sectionStream, writer); } finally { // ? ? ? try { input.close(); } catch (IOException e) { log.error("? ??", e); } } } else { log.warn(" Entry '{}'({})", entry.getName(), entry); } } }
From source file:com.ezdi.rtf.testRTFParser.RTFObjDataParser.java
License:Apache License
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { byte[] ret = null; try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) { DirectoryNode root = fs.getRoot(); if (root == null) { return ret; }/* www . ja v a 2 s . c o m*/ if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) { ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } return ret; }
From source file:Coop.argo.hwp.v5.HwpTextExtractorV5.java
License:Apache License
/** * ? //from w ww . j av a 2 s .co m * * @param writer * @param source * * @return * @throws IOException */ private static void extractBodyText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException { DirectoryNode root = fs.getRoot(); // BodyText ? Entry bodyText = root.getEntry("BodyText"); if (bodyText == null || !bodyText.isDirectoryEntry()) throw new IOException("Invalid BodyText"); Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries(); while (iterator.hasNext()) { Entry entry = iterator.next(); if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) { log.debug("extract {}", entry.getName()); InputStream input = new NDocumentInputStream((DocumentEntry) entry); try { if (header.compressed) input = new InflaterInputStream(input, new Inflater(true)); HwpStreamReader sectionStream = new HwpStreamReader(input); extractText(sectionStream, writer); } finally { // ? ? ? try { input.close(); } catch (IOException e) { log.error("? ??", e); } } } else { log.warn(" Entry '{}'({})", entry.getName(), entry); } } }
From source file:Coop.argo.hwp.v5.HwpTextExtractorV5.java
License:Apache License
/** * ? //from ww w . j a va2 s. c om * * @param writer * @param source * * @return * @throws IOException */ private static void extractViewText(FileHeader header, NPOIFSFileSystem fs, Writer writer) throws IOException { DirectoryNode root = fs.getRoot(); // BodyText ? Entry bodyText = root.getEntry("ViewText"); if (bodyText == null || !bodyText.isDirectoryEntry()) throw new IOException("Invalid ViewText"); Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries(); while (iterator.hasNext()) { Entry entry = iterator.next(); if (entry.getName().startsWith("Section") && entry instanceof DocumentEntry) { log.debug("extract {}", entry.getName()); InputStream input = new NDocumentInputStream((DocumentEntry) entry); // FIXME ? Key key = readKey(input); try { input = createDecryptStream(input, key); if (header.compressed) input = new InflaterInputStream(input, new Inflater(true)); HwpStreamReader sectionStream = new HwpStreamReader(input); extractText(sectionStream, writer); } catch (InvalidKeyException e) { throw new IOException(e); } catch (NoSuchAlgorithmException e) { throw new IOException(e); } catch (NoSuchPaddingException e) { throw new IOException(e); } finally { // ? ? ? try { input.close(); } catch (IOException e) { log.error("? ??", e); } } } else { log.warn(" Entry '{}'({})", entry.getName(), entry); } } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.SummaryExtractor.java
License:Apache License
private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException { try {//from ww w.j a va 2 s . co m DocumentEntry entry = (DocumentEntry) root.getEntry(entryName); PropertySet properties = new PropertySet(new DocumentInputStream(entry)); if (properties.isSummaryInformation()) { parse(new SummaryInformation(properties)); } if (properties.isDocumentSummaryInformation()) { parse(new DocumentSummaryInformation(properties)); } } catch (FileNotFoundException e) { // entry does not exist, just skip it } catch (NoPropertySetStreamException e) { // no property stream, just skip it } catch (UnexpectedPropertySetTypeException e) { throw new TikaException("Unexpected HPSF document", e); } catch (MarkUnsupportedException e) { throw new TikaException("Invalid DocumentInputStream", e); } catch (Exception e) { LOGGER.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
/**
 * Parses an HWPF (Word 97-2003) document from the given POIFS directory and
 * emits its content as XHTML: image text, headers, main body, textboxes,
 * footnotes, comments, endnotes, footers, leftover pictures, and any
 * embedded Office documents found under "ObjectPool".
 * Word 6 files are detected via OldWordFileFormatException and delegated
 * to parseWord6.
 *
 * @param root  POIFS directory holding the Word document streams
 * @param xhtml content handler receiving the generated markup
 * @throws IOException   on stream read failure
 * @throws SAXException  from the content handler
 * @throws TikaException on parse failure
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        // Pre-Word-97 binary format: handled by the legacy extractor instead.
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(document);

    // mj
    extractImageText(xhtml, document);

    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(),
            headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        // handleParagraph may consume extra paragraphs (e.g. table rows);
        // it returns how many to skip.
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
    }

    // Do everything else
    for (String paragraph : wordExtractor.getMainTextboxText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(),
            headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null;) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
        p = pictures.nextUnclaimed();
    }

    // Handle any embeded office documents
    try {
        DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry entry : op) {
            if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            }
        }
    } catch (FileNotFoundException e) {
        // no ObjectPool storage — document has no embedded objects; skip
    }
}
From source file:org.apache.tika.parser.microsoft.SummaryExtractor.java
License:Apache License
private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException { try {/*from www. j a va 2 s. c om*/ DocumentEntry entry = (DocumentEntry) root.getEntry(entryName); PropertySet properties = new PropertySet(new DocumentInputStream(entry)); if (properties.isSummaryInformation()) { parse(new SummaryInformation(properties)); } if (properties.isDocumentSummaryInformation()) { parse(new DocumentSummaryInformation(properties)); } } catch (FileNotFoundException e) { // entry does not exist, just skip it } catch (NoPropertySetStreamException e) { // no property stream, just skip it } catch (UnexpectedPropertySetTypeException e) { throw new TikaException("Unexpected HPSF document", e); } catch (MarkUnsupportedException e) { throw new TikaException("Invalid DocumentInputStream", e); } catch (Exception e) { logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e); } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
/**
 * Parses an HWPF (Word 97-2003) document from the given POIFS directory and
 * emits its content as XHTML: headers, main body (with list numbering via
 * ListManager), textboxes, footnotes, comments, endnotes, footers, leftover
 * pictures, and any embedded Office documents found under "ObjectPool".
 * Word 6 files are detected via OldWordFileFormatException and delegated
 * to parseWord6.
 *
 * @param root  POIFS directory holding the Word document streams
 * @param xhtml content handler receiving the generated markup
 * @throws IOException   on stream read failure
 * @throws SAXException  from the content handler
 * @throws TikaException on parse failure
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        // Pre-Word-97 binary format: handled by the legacy extractor instead.
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(document);

    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(),
            headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    ListManager listManager = new ListManager(document);
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        // handleParagraph may consume extra paragraphs (e.g. table rows);
        // it returns how many to skip.
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable,
                listManager, xhtml);
    }

    // Do everything else
    for (String paragraph : wordExtractor.getMainTextboxText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(),
            headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null;) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
        p = pictures.nextUnclaimed();
    }

    // Handle any embeded office documents
    try {
        DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry entry : op) {
            if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            }
        }
    } catch (FileNotFoundException e) {
        // no ObjectPool storage — document has no embedded objects; skip
    }
}