List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem getRoot
public DirectoryNode getRoot()
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *//*from ww w . ja va 2 s.co m*/ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code // if (part.getSize() >= 0 && part.getSize() < 512*3) { // // Too small, skip // return; // } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:net.freeutils.tnef.msg.Msg.java
License:Open Source License
public static void main(String[] args) throws Exception { String filename = args[0];//from w w w. j a v a 2 s.c om String outputdir = args[1]; InputStream in = null; try { in = new FileInputStream(filename); POIFSFileSystem fs = new POIFSFileSystem(in); DirectoryEntry root = fs.getRoot(); //printDirectory(root, ""); Message message = processMessage(root); TNEF.extractContent(message, outputdir); } catch (IOException ioe) { ioe.printStackTrace(); } finally { if (in != null) in.close(); } }
From source file:net.sf.mpxj.mpp.MPPReader.java
License:Open Source License
/** * Alternative entry point allowing an MPP file to be read from * a user-supplied POI file stream. /* w w w . j a va 2 s . c om*/ * * @param fs POI file stream * @return ProjectFile instance * @throws MPXJException */ public ProjectFile read(POIFSFileSystem fs) throws MPXJException { try { ProjectFile projectFile = new ProjectFile(); ProjectConfig config = projectFile.getProjectConfig(); config.setAutoTaskID(false); config.setAutoTaskUniqueID(false); config.setAutoResourceID(false); config.setAutoResourceUniqueID(false); config.setAutoOutlineLevel(false); config.setAutoOutlineNumber(false); config.setAutoWBS(false); config.setAutoCalendarUniqueID(false); config.setAutoAssignmentUniqueID(false); projectFile.getEventManager().addProjectListeners(m_projectListeners); // // Open the file system and retrieve the root directory // DirectoryEntry root = fs.getRoot(); // // Retrieve the CompObj data, validate the file format and process // CompObj compObj = new CompObj(new DocumentInputStream((DocumentEntry) root.getEntry("\1CompObj"))); projectFile.getProjectProperties().setFullApplicationName(compObj.getApplicationName()); projectFile.getProjectProperties().setApplicationVersion(compObj.getApplicationVersion()); String format = compObj.getFileFormat(); Class<? extends MPPVariantReader> readerClass = FILE_CLASS_MAP.get(format); if (readerClass == null) { throw new MPXJException(MPXJException.INVALID_FILE + ": " + format); } MPPVariantReader reader = readerClass.newInstance(); reader.process(this, projectFile, root); // // Update the internal structure. We'll take this opportunity to // generate outline numbers for the tasks as they don't appear to // be present in the MPP file. // config.setAutoOutlineNumber(true); projectFile.updateStructure(); config.setAutoOutlineNumber(false); // // Perform post-processing to set the summary flag and clean // up any instances where a task has an empty splits list. 
// for (Task task : projectFile.getAllTasks()) { task.setSummary(task.getChildTasks().size() != 0); List<DateRange> splits = task.getSplits(); if (splits != null && splits.isEmpty()) { task.setSplits(null); } validationRelations(task); } // // Ensure that the unique ID counters are correct // config.updateUniqueCounters(); return (projectFile); } catch (IOException ex) { throw new MPXJException(MPXJException.READ_ERROR, ex); } catch (IllegalAccessException ex) { throw new MPXJException(MPXJException.READ_ERROR, ex); } catch (InstantiationException ex) { throw new MPXJException(MPXJException.READ_ERROR, ex); } }
From source file:net.sf.mpxj.sample.MppDump.java
License:Open Source License
/** * This method opens the input and output files and kicks * off the processing./*from w w w . j a v a 2s . com*/ * * @param input Name of the input file * @param output Name of the output file * @throws Exception Thrown on file read errors */ private static void process(String input, String output) throws Exception { FileInputStream is = new FileInputStream(input); PrintWriter pw = new PrintWriter(new FileWriter(output)); POIFSFileSystem fs = new POIFSFileSystem(is); dumpTree(pw, fs.getRoot(), "", true, true, null); is.close(); pw.flush(); pw.close(); }
From source file:net.sf.mpxj.utility.MppClean.java
License:Open Source License
/** * Process an MPP file to make it anonymous. * /*from www. j a v a2 s . c o m*/ * @param input input file name * @param output output file name * @throws Exception */ private void process(String input, String output) throws MPXJException, IOException { // // Extract the project data // MPPReader reader = new MPPReader(); m_project = reader.read(input); String varDataFileName; String projectDirName; switch (m_project.getMppFileType()) { case 8: { projectDirName = " 1"; varDataFileName = "FixDeferFix 0"; break; } case 9: { projectDirName = " 19"; varDataFileName = "Var2Data"; break; } case 12: { projectDirName = " 112"; varDataFileName = "Var2Data"; break; } default: { throw new IllegalArgumentException("Unsupported file type " + m_project.getMppFileType()); } } // // Load the raw file // FileInputStream is = new FileInputStream(input); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); // // Locate the root of the project file system // DirectoryEntry root = fs.getRoot(); m_projectDir = (DirectoryEntry) root.getEntry(projectDirName); // // Process Tasks // Map<String, String> replacements = new HashMap<String, String>(); for (Task task : m_project.getAllTasks()) { mapText(task.getName(), replacements); } processReplacements(((DirectoryEntry) m_projectDir.getEntry("TBkndTask")), varDataFileName, replacements, true); // // Process Resources // replacements.clear(); for (Resource resource : m_project.getAllResources()) { mapText(resource.getName(), replacements); mapText(resource.getInitials(), replacements); } processReplacements((DirectoryEntry) m_projectDir.getEntry("TBkndRsc"), varDataFileName, replacements, true); // // Process project header details // replacements.clear(); ProjectHeader header = m_project.getProjectHeader(); mapText(header.getProjectTitle(), replacements); processReplacements(m_projectDir, "Props", replacements, true); replacements.clear(); mapText(header.getProjectTitle(), replacements); mapText(header.getSubject(), replacements); 
mapText(header.getAuthor(), replacements); mapText(header.getKeywords(), replacements); mapText(header.getComments(), replacements); processReplacements(root, "\005SummaryInformation", replacements, false); replacements.clear(); mapText(header.getManager(), replacements); mapText(header.getCompany(), replacements); mapText(header.getCategory(), replacements); processReplacements(root, "\005DocumentSummaryInformation", replacements, false); // // Write the replacement raw file // FileOutputStream os = new FileOutputStream(output); fs.writeFilesystem(os); os.flush(); os.close(); }
From source file:net.sf.mpxj.utility.MppCleanUtility.java
License:Open Source License
/** * Process an MPP file to make it anonymous. * /* w ww. j a v a 2s. c o m*/ * @param input input file name * @param output output file name * @throws Exception */ private void process(String input, String output) throws MPXJException, IOException { // // Extract the project data // MPPReader reader = new MPPReader(); m_project = reader.read(input); String varDataFileName; String projectDirName; int mppFileType = NumberHelper.getInt(m_project.getProjectProperties().getMppFileType()); switch (mppFileType) { case 8: { projectDirName = " 1"; varDataFileName = "FixDeferFix 0"; break; } case 9: { projectDirName = " 19"; varDataFileName = "Var2Data"; break; } case 12: { projectDirName = " 112"; varDataFileName = "Var2Data"; break; } default: { throw new IllegalArgumentException("Unsupported file type " + mppFileType); } } // // Load the raw file // FileInputStream is = new FileInputStream(input); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); // // Locate the root of the project file system // DirectoryEntry root = fs.getRoot(); m_projectDir = (DirectoryEntry) root.getEntry(projectDirName); // // Process Tasks // Map<String, String> replacements = new HashMap<String, String>(); for (Task task : m_project.getAllTasks()) { mapText(task.getName(), replacements); } processReplacements(((DirectoryEntry) m_projectDir.getEntry("TBkndTask")), varDataFileName, replacements, true); // // Process Resources // replacements.clear(); for (Resource resource : m_project.getAllResources()) { mapText(resource.getName(), replacements); mapText(resource.getInitials(), replacements); } processReplacements((DirectoryEntry) m_projectDir.getEntry("TBkndRsc"), varDataFileName, replacements, true); // // Process project properties // replacements.clear(); ProjectProperties properties = m_project.getProjectProperties(); mapText(properties.getProjectTitle(), replacements); processReplacements(m_projectDir, "Props", replacements, true); replacements.clear(); 
mapText(properties.getProjectTitle(), replacements); mapText(properties.getSubject(), replacements); mapText(properties.getAuthor(), replacements); mapText(properties.getKeywords(), replacements); mapText(properties.getComments(), replacements); processReplacements(root, "\005SummaryInformation", replacements, false); replacements.clear(); mapText(properties.getManager(), replacements); mapText(properties.getCompany(), replacements); mapText(properties.getCategory(), replacements); processReplacements(root, "\005DocumentSummaryInformation", replacements, false); // // Write the replacement raw file // FileOutputStream os = new FileOutputStream(output); fs.writeFilesystem(os); os.flush(); os.close(); }
From source file:nz.govt.natlib.adapter.works.DocAdapter.java
License:Apache License
/**
 * Adapts a file by opening it as a POIFS file system and walking it from the root.
 *
 * <p>Fires the "MSWorks" start event, writes the file info, fires a
 * "Version"/"Works" event, hands the root directory to {@code readDirectory},
 * and finally fires the "MSWorks" end event. Any exception raised while opening
 * or reading the file is rethrown wrapped in a {@link RuntimeException}; in that
 * case the end event is not fired.
 *
 * @param file the file to adapt
 * @param ctx  parser context that receives the parse events
 * @throws IOException declared by the signature
 */
public void adapt(File file, ParserContext ctx) throws IOException {
    ctx.fireStartParseEvent("MSWorks");
    writeFileInfo(file, ctx);
    ctx.fireParseEvent("Version", "Works");

    FileInputStream input = null;
    try {
        input = new FileInputStream(file);
        POIFSFileSystem fileSystem = new POIFSFileSystem(input);
        DirectoryEntry rootEntry = fileSystem.getRoot();
        readDirectory(fileSystem, rootEntry, ctx);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        AdapterUtils.close(input);
    }

    ctx.fireEndParseEvent("MSWorks");
}
From source file:org.apache.nutch.parse.msword.WordExtractor.java
License:Apache License
/** * Gets the text from a Word document.//from ww w . j ava 2s. co m * * @param in The InputStream representing the Word file. */ protected String extractText(InputStream in) throws Exception { ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { throw new PasswordProtectedException("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } //Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; //get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { 
currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *//*from w ww. j a v a 2 s . co m*/ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip return; } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); } byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:org.apache.tika.parser.wordperfect.QPWTextExtractor.java
License:Apache License
@SuppressWarnings("resource") public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException { POIFSFileSystem pfs = new POIFSFileSystem(input); DirectoryNode rootNode = pfs.getRoot(); if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) { throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME + "\". Found: " + rootNode.getEntryNames()); }//from w ww . j a va 2 s .co m //TODO shall we validate and throw warning/error if the file does not //start with a BOF and ends with a EOF? xhtml.startElement("p"); try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) { Context ctx = new Context(in, xhtml, metadata); while (hasNext(in)) { ctx.type = in.readWPShort(); ctx.bodyLength = in.readWPShort(); Extractor extractor = EXTRACTORS.get(ctx.type); if (extractor != null) { extractor.extract(ctx); } else { // Use DEBUG to find out what we are ignoring // Extractor.DEBUG.extract(ctx); Extractor.IGNORE.extract(ctx); } } } xhtml.endElement("p"); }