List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem getRoot
public DirectoryNode getRoot()
From source file:org.elasticwarehouse.core.parsers.FileEmbeddedDocumentExtractor.java
License:Apache License
public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name == null) { name = "file" + count++; }//w w w . j a v a2s . co m DefaultDetector detector = new DefaultDetector(); MediaType contentType = detector.detect(inputStream, metadata); if (name.indexOf('.') == -1 && contentType != null) { try { name += config.getMimeRepository().forName(contentType.toString()).getExtension(); } catch (MimeTypeException e) { EWLogger.logerror(e); e.printStackTrace(); } } String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID); if (relID != null && !name.startsWith(relID)) { name = relID + "_" + name; } File outputFile = new File(extractDir, FilenameUtils.normalize(name)); File parent = outputFile.getParentFile(); if (!parent.exists()) { if (!parent.mkdirs()) { throw new IOException("unable to create directory \"" + parent + "\""); } } System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile); FileOutputStream os = null; try { os = new FileOutputStream(outputFile); if (inputStream instanceof TikaInputStream) { TikaInputStream tin = (TikaInputStream) inputStream; if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) { POIFSFileSystem fs = new POIFSFileSystem(); copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot()); fs.writeFilesystem(os); } else { IOUtils.copy(inputStream, os); } } else { IOUtils.copy(inputStream, os); } } catch (Exception e) { // // being a CLI program messages should go to the stderr too // String msg = String.format(Locale.ROOT, "Ignoring unexpected exception trying to save embedded file %s (%s)", name, e.getMessage()); EWLogger.logerror(e); System.err.println(msg); //logger.warn(msg, e); } finally { if (os != null) { os.close(); } } }
From source file:org.knime.ext.textprocessing.nodes.source.parser.word.WordDocumentParser.java
License:Open Source License
private Document parseInternal(final InputStream is) throws Exception { m_currentDoc = new DocumentBuilder(m_tokenizerName); m_currentDoc.setDocumentFile(new File(m_docPath)); m_currentDoc.setDocumentType(m_type); m_currentDoc.addDocumentCategory(m_category); m_currentDoc.addDocumentSource(m_source); POIFSFileSystem poifs = null; HWPFDocument hdoc = null;/*from w w w.j ava 2 s .com*/ XWPFDocument hdoc2 = null; WordExtractor extractor = null; try { // doc files if (m_docPath.endsWith(".doc")) { // copy content of input stream into byte array since content have to be red twice unfortunately. final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final byte[] buf = new byte[1024]; int i = 0; while ((i = is.read(buf)) >= 0) { baos.write(buf, 0, i); } final byte[] content = baos.toByteArray(); // open stream with copied content to read text InputStream copiedInput = new ByteArrayInputStream(content); hdoc = new HWPFDocument(copiedInput); extractor = new WordExtractor(hdoc); for (String p : extractor.getParagraphText()) { p = p.trim(); if (!onlyWhitepscaes(p)) { m_currentDoc.addParagraph(p); } } // open stream again with copied content to read meta info copiedInput = new ByteArrayInputStream(content); poifs = new POIFSFileSystem(copiedInput); final DirectoryEntry dir = poifs.getRoot(); final DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME); final PropertySet ps = new PropertySet(new DocumentInputStream(siEntry)); final SummaryInformation si = new SummaryInformation(ps); setAuthor(si.getAuthor()); setPublicationDate(si.getCreateDateTime()); // docx files } else if (m_docPath.endsWith(".docx") || m_docPath.endsWith(".docm")) { hdoc2 = new XWPFDocument(is); final List<XWPFParagraph> paragraphs = hdoc2.getParagraphs(); for (final XWPFParagraph paragraph : paragraphs) { final String text = paragraph.getText(); if (!onlyWhitepscaes(text)) { m_currentDoc.addParagraph(text); } } 
setAuthor(hdoc2.getProperties().getCoreProperties().getCreator()); setPublicationDate(hdoc2.getProperties().getCoreProperties().getCreated()); } m_currentDoc.createNewSection(SectionAnnotation.CHAPTER); // find title String title = null; if (m_filenameAsTitle) { title = m_docPath.trim(); } else { final List<Section> sections = m_currentDoc.getSections(); if (sections.size() > 0) { try { title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim(); } catch (IndexOutOfBoundsException e) { LOGGER.debug("Parsed word document " + m_docPath + " is empty."); title = ""; } } } if (!checkTitle(title)) { title = m_docPath.toString(); } m_currentDoc.addTitle(title); return m_currentDoc.createDocument(); } finally { is.close(); if (poifs != null) { poifs.close(); } if (hdoc != null) { hdoc.close(); } if (hdoc2 != null) { hdoc2.close(); } if (extractor != null) { extractor.close(); } } }
From source file:org.olat.search.service.document.file.WordDocument.java
License:Apache License
@Override protected String readContent(final VFSLeaf leaf) throws IOException, DocumentException { BufferedInputStream bis = null; final StringBuilder sb = new StringBuilder(); try {//from w ww.ja va2 s . c o m bis = new BufferedInputStream(leaf.getInputStream()); final POIFSFileSystem filesystem = new POIFSFileSystem(bis); final Iterator<?> entries = filesystem.getRoot().getEntries(); while (entries.hasNext()) { final Entry entry = (Entry) entries.next(); final String name = entry.getName(); if (!(entry instanceof DocumentEntry)) { // Skip directory entries } else if ("WordDocument".equals(name)) { collectWordDocument(filesystem, sb); } } return sb.toString(); } catch (final Exception e) { throw new DocumentException(e.getMessage()); } finally { if (bis != null) { bis.close(); } } }
From source file:org.opencrx.application.uses.com.auxilii.msgparser.MsgParser.java
License:Open Source License
/** * Parses a .msg file provided by an input stream. * //from w w w .jav a 2 s .co m * @param msgFileStream The .msg file as a InputStream. * @param closeStream Indicates whether the provided stream should * be closed after the message has been read. * @return A {@link Message} object representing the .msg file. * @throws IOException Thrown if the file could not be loaded or parsed. * @throws UnsupportedOperationException Thrown if the .msg file cannot * be parsed correctly. */ public Message parseMsg(InputStream msgFileStream, boolean closeStream) throws IOException, UnsupportedOperationException { // the .msg file, like a file system, contains directories // and documents within this directories // we now gain access to the root node // and recursively go through the complete 'filesystem'. Message msg = null; try { POIFSFileSystem fs = new POIFSFileSystem(msgFileStream); DirectoryEntry dir = fs.getRoot(); msg = new Message(); this.checkDirectoryEntry(dir, msg); } finally { if (closeStream) { try { msgFileStream.close(); } catch (Exception e) { // ignore } } } return msg; }
From source file:org.opf_labs.aqua.OfficeAnalyser.java
License:Apache License
public static void main(String[] args) throws Exception { //import org.apache.poi.poifs.dev.POIFSDump; //POIFSDump.main(args); SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument( SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1", "UTF-8", true); xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level SMOutputElement xmlroot = xmldoc.addElement("properties"); // Loop through arguments: for (int i = 0; i < args.length; i++) { SMOutputElement xd = xmlroot.addElement("document"); xd.addAttribute("href", args[i]); HWPFDocument doc = new HWPFDocument(new FileInputStream(args[i])); // SummaryInformation SMOutputElement sie = xd.addElement("SummaryInformation"); sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName()); sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion()); sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor()); sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount()); sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments()); sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime()); sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat()); sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords()); sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor()); sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount()); sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber()); sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount()); sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity()); sie.addElement("Subject").addCharacters("" + 
doc.getSummaryInformation().getSubject()); sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate()); sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle()); sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount()); sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime()); sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted()); sie.addElement("LastSaveDateTime") .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime()); sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail()); // TextTable SMOutputElement tte = xd.addElement("TextTable"); for (TextPiece tp : doc.getTextTable().getTextPieces()) { SMOutputElement tpe = tte.addElement("TextPiece"); tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode()); tpe.addCharacters(tp.getStringBuilder().toString()); }//from ww w . ja va 2s . 
co m // DocumentSummaryInformation SMOutputElement dsie = xd.addElement("DocumentSummaryInformation"); dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount()); dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount()); dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount()); dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount()); dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount()); dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount()); dsie.addElement("SectionCount") .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount()); dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount()); dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat()); dsie.addElement("PresentationFormat") .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat()); dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany()); dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory()); // Sections for (Object os : doc.getDocumentSummaryInformation().getSections()) { Section s = (Section) os; SMOutputElement se = dsie.addElement("Section"); se.addElement("FormatID").addCharacters("" + s.getFormatID()); se.addElement("CodePage").addCharacters("" + s.getCodepage()); se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount()); for (Property sp : s.getProperties()) { SMOutputElement pe = se.addElement("Property"); pe.addAttribute("class", sp.getValue().getClass().getCanonicalName()); pe.addCharacters(sp.getValue().toString()); } } SMOutputElement fte = xd.addElement("FontTable"); for (Ffn f : doc.getFontTable().getFontNames()) { 
SMOutputElement fe = fte.addElement("Font"); fe.addElement("MainFontName").addCharacters(f.getMainFontName()); try { fe.addElement("AltFontName").addCharacters(f.getAltFontName()); } catch (Exception e) { // Seems to fail, and no safe test found as yet. } fe.addElement("Size").addCharacters("" + f.getSize()); fe.addElement("Weight").addCharacters("" + f.getWeight()); } SMOutputElement pte = xd.addElement("PicturesTable"); for (Picture p : doc.getPicturesTable().getAllPictures()) { SMOutputElement pe = pte.addElement("Picture"); pe.addElement("MimeType").addCharacters(p.getMimeType()); pe.addElement("Width").addCharacters("" + p.getWidth()); pe.addElement("Height").addCharacters("" + p.getHeight()); pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor()); pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor()); pe.addElement("Content").addCharacters("" + p.getContent()); } //parseCompObj( new File(args[i]) ); // This //System.out.println("Dumping " + args[i]); FileInputStream is = new FileInputStream(args[i]); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); DirectoryEntry root = fs.getRoot(); //dump(root); xmldoc.closeRoot(); // important, flushes, closes output } }
From source file:org.paxle.parser.msoffice.impl.AMsOfficeParser.java
License:Open Source License
protected void extractMetadata(POIFSFileSystem fs, IParserDocument parserDoc) throws ParserException { DocumentInputStream docIn = null;/*from w w w. ja v a 2 s . c om*/ try { // read the summary info DirectoryEntry dir = fs.getRoot(); DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME); docIn = new DocumentInputStream(siEntry); // get properties PropertySet props = new PropertySet(docIn); docIn.close(); // extract info SummaryInformation summary = new SummaryInformation(props); // doc title String title = summary.getTitle(); if (title != null && title.length() > 0) { parserDoc.setTitle(title); this.logger.debug(String.format("Document title is: %s", title)); } // doc author String author = summary.getAuthor(); if (author != null && author.length() > 0) { parserDoc.setAuthor(author); this.logger.debug(String.format("Document author is: %s", author)); } // subject String subject = summary.getSubject(); if (subject != null && subject.length() > 0) { parserDoc.setSummary(subject); this.logger.debug(String.format("Document summary is: %s", subject)); } // doc keywords String keywords = summary.getKeywords(); if (keywords != null && keywords.length() > 0) { String[] keywordArray = keywords.split("[,;\\s]"); if (keywordArray != null && keywordArray.length > 0) { ArrayList<String> keywordsList = new ArrayList<String>(keywordArray.length); for (String keyword : keywordArray) { keyword = keyword.trim(); if (keyword.length() > 0) { keywordsList.add(keyword); } } parserDoc.setKeywords(keywordsList); this.logger.debug(String.format("Document keywords are: %s", keywordsList.toString())); } } // last modification date if (summary.getEditTime() > 0) { Date editTime = new Date(summary.getEditTime()); parserDoc.setLastChanged(editTime); this.logger.debug(String.format("Document last-changed-date is: %s", editTime.toString())); } else if (summary.getCreateDateTime() != null) { Date creationDate = summary.getCreateDateTime(); 
parserDoc.setLastChanged(creationDate); this.logger.debug(String.format("Document creation-date is: %s", creationDate.toString())); } else if (summary.getLastSaveDateTime() != null) { Date lastSaveDate = summary.getLastSaveDateTime(); parserDoc.setLastChanged(lastSaveDate); this.logger.debug(String.format("Document last-save-date is: %s", lastSaveDate.toString())); } } catch (Exception e) { String errorMsg = String.format("Unexpected '%s' while extracting metadata: %s", e.getClass().getName(), e.getMessage()); logger.error(errorMsg, e); throw new ParserException(errorMsg); } finally { if (docIn != null) try { docIn.close(); } catch (Exception e) { /* ignore this */} } }
From source file:org.textmining.extraction.excel.ExcelTextExtractor.java
License:Open Source License
/**
 * Creates an extractor by loading the complete {@code Workbook} stream of an Excel file
 * into memory.
 *
 * @param in stream of an OLE2 Excel file
 * @throws IOException if the file cannot be read, has no {@code Workbook} entry, or the
 *                     {@code Workbook} stream ends before its declared size is reached
 */
public ExcelTextExtractor(InputStream in) throws IOException {
    POIFSFileSystem poifs = new POIFSFileSystem(in);
    DocumentEntry headerProps = (DocumentEntry) poifs.getRoot().getEntry("Workbook");
    DocumentInputStream din = poifs.createDocumentInputStream("Workbook");
    try {
        _recordStream = new byte[headerProps.getSize()];
        // A single read() is allowed to return fewer bytes than requested, so keep reading
        // until the buffer is full.
        int offset = 0;
        while (offset < _recordStream.length) {
            int count = din.read(_recordStream, offset, _recordStream.length - offset);
            if (count < 0) {
                throw new IOException("Unexpected end of Workbook stream after " + offset + " of "
                        + _recordStream.length + " bytes");
            }
            offset += count;
        }
    } finally {
        din.close();
    }
}
From source file:poi.hpsf.examples.CopyCompare.java
License:Apache License
/**
 * <p>Runs the example program. The application expects one or two
 * arguments:</p>
 *
 * <ol>
 *
 * <li><p>The first argument is the disk file name of the POI filesystem to
 * copy.</p></li>
 *
 * <li><p>The second argument is optional. If it is given, it is the name of
 * a disk file the copy of the POI filesystem will be written to. If it is
 * not given, the copy will be written to a temporary file which will be
 * deleted at the end of the program.</p></li>
 *
 * </ol>
 *
 * @param args Command-line arguments.
 * @exception MarkUnsupportedException if a POI document stream does not
 * support the mark() operation.
 * @exception NoPropertySetStreamException if the application tries to
 * create a property set from a POI document stream that is not a property
 * set stream.
 * @exception java.io.IOException if any I/O exception occurs.
 * @exception java.io.UnsupportedEncodingException if a character encoding is not
 * supported.
 */
public static void main(final String[] args) throws NoPropertySetStreamException, MarkUnsupportedException,
        UnsupportedEncodingException, IOException {
    String originalFileName = null;
    String copyFileName = null;

    /* Check the command-line arguments. */
    if (args.length == 1) {
        originalFileName = args[0];
        File f = TempFile.createTempFile("CopyOfPOIFileSystem-", ".ole2");
        f.deleteOnExit();
        copyFileName = f.getAbsolutePath();
    } else if (args.length == 2) {
        originalFileName = args[0];
        copyFileName = args[1];
    } else {
        /* The leading space keeps the class name and the argument synopsis apart. */
        System.err.println("Usage: " + CopyCompare.class.getName() + " originPOIFS [copyPOIFS]");
        System.exit(1);
    }

    /* Read the origin POIFS using the eventing API. The real work is done
     * in the class CopyFile which is registered here as a POIFSReader. */
    final POIFSReader r = new POIFSReader();
    final CopyFile cf = new CopyFile(copyFileName);
    r.registerListener(cf);
    final FileInputStream originalIn = new FileInputStream(originalFileName);
    try {
        r.read(originalIn);
    } finally {
        originalIn.close();
    }

    /* Write the new POIFS to disk. */
    cf.close();

    /* Read all documents from the original POI file system and compare them
     * with the equivalent document from the copy. */
    final POIFSFileSystem opfs = new POIFSFileSystem(new FileInputStream(originalFileName));
    final POIFSFileSystem cpfs = new POIFSFileSystem(new FileInputStream(copyFileName));

    final DirectoryEntry oRoot = opfs.getRoot();
    final DirectoryEntry cRoot = cpfs.getRoot();
    final StringBuffer messages = new StringBuffer();
    if (equal(oRoot, cRoot, messages)) {
        System.out.println("Equal");
    } else {
        System.out.println("Not equal: " + messages.toString());
    }
}
From source file:poi.hpsf.examples.ModifyDocumentSummaryInformation.java
License:Apache License
/** * <p>Main method - see class description.</p> * * @param args The command-line parameters. * @throws java.io.IOException/*from w w w . j a va 2 s . com*/ * @throws MarkUnsupportedException * @throws NoPropertySetStreamException * @throws UnexpectedPropertySetTypeException * @throws WritingNotSupportedException */ public static void main(final String[] args) throws IOException, NoPropertySetStreamException, MarkUnsupportedException, UnexpectedPropertySetTypeException, WritingNotSupportedException { /* Read the name of the POI filesystem to modify from the command line. * For brevity to boundary check is performed on the command-line * arguments. */ File poiFilesystem = new File(args[0]); /* Open the POI filesystem. */ InputStream is = new FileInputStream(poiFilesystem); POIFSFileSystem poifs = new POIFSFileSystem(is); is.close(); /* Read the summary information. */ DirectoryEntry dir = poifs.getRoot(); SummaryInformation si; try { DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME); DocumentInputStream dis = new DocumentInputStream(siEntry); PropertySet ps = new PropertySet(dis); dis.close(); si = new SummaryInformation(ps); } catch (FileNotFoundException ex) { /* There is no summary information yet. We have to create a new * one. */ si = PropertySetFactory.newSummaryInformation(); } /* Change the author to "Rainer Klute". Any former author value will * be lost. If there has been no author yet, it will be created. */ si.setAuthor("Rainer Klute"); System.out.println("Author changed to " + si.getAuthor() + "."); /* Handling the document summary information is analogous to handling * the summary information. An additional feature, however, are the * custom properties. */ /* Read the document summary information. 
*/ DocumentSummaryInformation dsi; try { DocumentEntry dsiEntry = (DocumentEntry) dir.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME); DocumentInputStream dis = new DocumentInputStream(dsiEntry); PropertySet ps = new PropertySet(dis); dis.close(); dsi = new DocumentSummaryInformation(ps); } catch (FileNotFoundException ex) { /* There is no document summary information yet. We have to create a * new one. */ dsi = PropertySetFactory.newDocumentSummaryInformation(); } /* Change the category to "POI example". Any former category value will * be lost. If there has been no category yet, it will be created. */ dsi.setCategory("POI example"); System.out.println("Category changed to " + dsi.getCategory() + "."); /* Read the custom properties. If there are no custom properties yet, * the application has to create a new CustomProperties object. It will * serve as a container for custom properties. */ CustomProperties customProperties = dsi.getCustomProperties(); if (customProperties == null) customProperties = new CustomProperties(); /* Insert some custom properties into the container. */ customProperties.put("Key 1", "Value 1"); customProperties.put("Schl\u00fcssel 2", "Wert 2"); customProperties.put("Sample Number", new Integer(12345)); customProperties.put("Sample Boolean", Boolean.TRUE); customProperties.put("Sample Date", new Date()); /* Read a custom property. */ Object value = customProperties.get("Sample Number"); /* Write the custom properties back to the document summary * information. */ dsi.setCustomProperties(customProperties); /* Write the summary information and the document summary information * to the POI filesystem. */ si.write(dir, SummaryInformation.DEFAULT_STREAM_NAME); dsi.write(dir, DocumentSummaryInformation.DEFAULT_STREAM_NAME); /* Write the POI filesystem back to the original file. Please note that * in production code you should never write directly to the origin * file! In case of a writing error everything would be lost. 
*/ OutputStream out = new FileOutputStream(poiFilesystem); poifs.writeFilesystem(out); out.close(); }
From source file:ro.nextreports.engine.exporter.XlsExporter.java
License:Apache License
public static void createSummaryInformation(String filePath, String title) { if (filePath == null) { return;//from ww w .j a v a 2s. c o m } try { File poiFilesystem = new File(filePath); InputStream is = new FileInputStream(poiFilesystem); POIFSFileSystem poifs = new POIFSFileSystem(is); is.close(); DirectoryEntry dir = poifs.getRoot(); SummaryInformation si = PropertySetFactory.newSummaryInformation(); si.setTitle(title); si.setAuthor(ReleaseInfoAdapter.getCompany()); si.setApplicationName("NextReports " + ReleaseInfoAdapter.getVersionNumber()); si.setSubject("Created by NextReports Designer" + ReleaseInfoAdapter.getVersionNumber()); si.setCreateDateTime(new Date()); si.setKeywords(ReleaseInfoAdapter.getHome()); si.write(dir, SummaryInformation.DEFAULT_STREAM_NAME); OutputStream out = new FileOutputStream(poiFilesystem); poifs.writeFilesystem(out); out.close(); } catch (Exception ex) { ex.printStackTrace(); } }