Example usage for org.apache.poi.poifs.filesystem POIFSFileSystem getRoot

List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem getRoot

Introduction

In this page you can find the example usage for org.apache.poi.poifs.filesystem POIFSFileSystem getRoot.

Prototype

public DirectoryNode getRoot() 

Source Link

Document

Gets the root directory entry of the POIFS filesystem.

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java

License:Apache License

/**
 * Handles an embedded OLE object in the document.
 *
 * @param part the package part containing the embedded OLE2 data
 * @param handler the content handler receiving extracted content
 * @param rel the relationship id of the embedded part
 * @throws IOException on read errors
 * @throws SAXException on content handler errors
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code
    //        if (part.getSize() >= 0 && part.getSize() < 512*3) {
    //           // Too small, skip
    //           return;
    //        }

    // Open the POIFS (OLE2) structure and process
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    try {
        Metadata metadata = new Metadata();
        TikaInputStream stream = null;
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            // Guard against a missing label before setting the resource name
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }

            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            // Not a recognized OLE payload; fall back to generic embedded handling
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    }
}

From source file:net.freeutils.tnef.msg.Msg.java

License:Open Source License

/**
 * Command-line entry point: reads an OLE2 .msg file and extracts its
 * TNEF content into the given output directory.
 *
 * @param args args[0] = input file name, args[1] = output directory
 * @throws Exception on unrecoverable processing errors
 */
public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputDir = args[1];
    InputStream source = null;
    try {
        source = new FileInputStream(inputFile);
        // Walk the OLE2 directory tree starting at the filesystem root
        DirectoryEntry rootEntry = new POIFSFileSystem(source).getRoot();
        //printDirectory(rootEntry, "");
        TNEF.extractContent(processMessage(rootEntry), outputDir);
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        if (source != null) {
            source.close();
        }
    }
}

From source file:net.sf.mpxj.mpp.MPPReader.java

License:Open Source License

/**
 * Alternative entry point allowing an MPP file to be read from
 * a user-supplied POI file stream.
 *
 * @param fs POI file stream
 * @return ProjectFile instance
 * @throws MPXJException if the file format is not recognized or cannot be read
 */
public ProjectFile read(POIFSFileSystem fs) throws MPXJException {

    try {
        ProjectFile projectFile = new ProjectFile();
        ProjectConfig config = projectFile.getProjectConfig();

        // Disable all automatic ID/number generation while reading:
        // these values are taken directly from the MPP file.
        config.setAutoTaskID(false);
        config.setAutoTaskUniqueID(false);
        config.setAutoResourceID(false);
        config.setAutoResourceUniqueID(false);
        config.setAutoOutlineLevel(false);
        config.setAutoOutlineNumber(false);
        config.setAutoWBS(false);
        config.setAutoCalendarUniqueID(false);
        config.setAutoAssignmentUniqueID(false);

        projectFile.getEventManager().addProjectListeners(m_projectListeners);

        //
        // Open the file system and retrieve the root directory
        //
        DirectoryEntry root = fs.getRoot();

        //
        // Retrieve the CompObj data, validate the file format and process.
        // The "\1CompObj" stream identifies the creating application and
        // the file format variant.
        //
        CompObj compObj = new CompObj(new DocumentInputStream((DocumentEntry) root.getEntry("\1CompObj")));
        projectFile.getProjectProperties().setFullApplicationName(compObj.getApplicationName());
        projectFile.getProjectProperties().setApplicationVersion(compObj.getApplicationVersion());
        String format = compObj.getFileFormat();
        // Select the variant-specific reader implementation for this format
        Class<? extends MPPVariantReader> readerClass = FILE_CLASS_MAP.get(format);
        if (readerClass == null) {
            throw new MPXJException(MPXJException.INVALID_FILE + ": " + format);
        }
        MPPVariantReader reader = readerClass.newInstance();
        reader.process(this, projectFile, root);

        //
        // Update the internal structure. We'll take this opportunity to
        // generate outline numbers for the tasks as they don't appear to
        // be present in the MPP file. Auto outline numbering is enabled
        // only for the duration of updateStructure().
        //
        config.setAutoOutlineNumber(true);
        projectFile.updateStructure();
        config.setAutoOutlineNumber(false);

        //
        // Perform post-processing to set the summary flag and clean
        // up any instances where a task has an empty splits list.
        //
        for (Task task : projectFile.getAllTasks()) {
            task.setSummary(task.getChildTasks().size() != 0);
            List<DateRange> splits = task.getSplits();
            if (splits != null && splits.isEmpty()) {
                task.setSplits(null);
            }
            validationRelations(task);
        }

        //
        // Ensure that the unique ID counters are correct
        //
        config.updateUniqueCounters();

        return (projectFile);
    }

    catch (IOException ex) {
        throw new MPXJException(MPXJException.READ_ERROR, ex);
    }

    catch (IllegalAccessException ex) {
        throw new MPXJException(MPXJException.READ_ERROR, ex);
    }

    catch (InstantiationException ex) {
        throw new MPXJException(MPXJException.READ_ERROR, ex);
    }
}

From source file:net.sf.mpxj.sample.MppDump.java

License:Open Source License

/**
 * This method opens the input and output files and kicks
 * off the processing./*from   w w w  . j  a v a 2s .  com*/
 *
 * @param input Name of the input file
 * @param output Name of the output file
 * @throws Exception Thrown on file read errors
 */
private static void process(String input, String output) throws Exception {
    FileInputStream is = new FileInputStream(input);
    PrintWriter pw = new PrintWriter(new FileWriter(output));

    POIFSFileSystem fs = new POIFSFileSystem(is);
    dumpTree(pw, fs.getRoot(), "", true, true, null);

    is.close();
    pw.flush();
    pw.close();
}

From source file:net.sf.mpxj.utility.MppClean.java

License:Open Source License

/**
 * Process an MPP file to make it anonymous.
 * /*from  www.  j  a  v a2 s  .  c o m*/
 * @param input input file name
 * @param output output file name
 * @throws Exception
 */
private void process(String input, String output) throws MPXJException, IOException {
    //
    // Extract the project data
    //
    MPPReader reader = new MPPReader();
    m_project = reader.read(input);

    String varDataFileName;
    String projectDirName;

    switch (m_project.getMppFileType()) {
    case 8: {
        projectDirName = "   1";
        varDataFileName = "FixDeferFix   0";
        break;
    }

    case 9: {
        projectDirName = "   19";
        varDataFileName = "Var2Data";
        break;
    }

    case 12: {
        projectDirName = "   112";
        varDataFileName = "Var2Data";
        break;
    }

    default: {
        throw new IllegalArgumentException("Unsupported file type " + m_project.getMppFileType());
    }
    }

    //
    // Load the raw file
    //
    FileInputStream is = new FileInputStream(input);
    POIFSFileSystem fs = new POIFSFileSystem(is);
    is.close();

    //
    // Locate the root of the project file system
    //
    DirectoryEntry root = fs.getRoot();
    m_projectDir = (DirectoryEntry) root.getEntry(projectDirName);

    //
    // Process Tasks
    //
    Map<String, String> replacements = new HashMap<String, String>();
    for (Task task : m_project.getAllTasks()) {
        mapText(task.getName(), replacements);
    }
    processReplacements(((DirectoryEntry) m_projectDir.getEntry("TBkndTask")), varDataFileName, replacements,
            true);

    //
    // Process Resources
    //
    replacements.clear();
    for (Resource resource : m_project.getAllResources()) {
        mapText(resource.getName(), replacements);
        mapText(resource.getInitials(), replacements);
    }
    processReplacements((DirectoryEntry) m_projectDir.getEntry("TBkndRsc"), varDataFileName, replacements,
            true);

    //
    // Process project header details
    //
    replacements.clear();
    ProjectHeader header = m_project.getProjectHeader();
    mapText(header.getProjectTitle(), replacements);
    processReplacements(m_projectDir, "Props", replacements, true);

    replacements.clear();
    mapText(header.getProjectTitle(), replacements);
    mapText(header.getSubject(), replacements);
    mapText(header.getAuthor(), replacements);
    mapText(header.getKeywords(), replacements);
    mapText(header.getComments(), replacements);
    processReplacements(root, "\005SummaryInformation", replacements, false);

    replacements.clear();
    mapText(header.getManager(), replacements);
    mapText(header.getCompany(), replacements);
    mapText(header.getCategory(), replacements);
    processReplacements(root, "\005DocumentSummaryInformation", replacements, false);

    //
    // Write the replacement raw file
    //
    FileOutputStream os = new FileOutputStream(output);
    fs.writeFilesystem(os);
    os.flush();
    os.close();
}

From source file:net.sf.mpxj.utility.MppCleanUtility.java

License:Open Source License

/**
 * Process an MPP file to make it anonymous.
 * /*  w ww. j a v  a 2s. c  o m*/
 * @param input input file name
 * @param output output file name
 * @throws Exception
 */
private void process(String input, String output) throws MPXJException, IOException {
    //
    // Extract the project data
    //
    MPPReader reader = new MPPReader();
    m_project = reader.read(input);

    String varDataFileName;
    String projectDirName;
    int mppFileType = NumberHelper.getInt(m_project.getProjectProperties().getMppFileType());
    switch (mppFileType) {
    case 8: {
        projectDirName = "   1";
        varDataFileName = "FixDeferFix   0";
        break;
    }

    case 9: {
        projectDirName = "   19";
        varDataFileName = "Var2Data";
        break;
    }

    case 12: {
        projectDirName = "   112";
        varDataFileName = "Var2Data";
        break;
    }

    default: {
        throw new IllegalArgumentException("Unsupported file type " + mppFileType);
    }
    }

    //
    // Load the raw file
    //
    FileInputStream is = new FileInputStream(input);
    POIFSFileSystem fs = new POIFSFileSystem(is);
    is.close();

    //
    // Locate the root of the project file system
    //
    DirectoryEntry root = fs.getRoot();
    m_projectDir = (DirectoryEntry) root.getEntry(projectDirName);

    //
    // Process Tasks
    //
    Map<String, String> replacements = new HashMap<String, String>();
    for (Task task : m_project.getAllTasks()) {
        mapText(task.getName(), replacements);
    }
    processReplacements(((DirectoryEntry) m_projectDir.getEntry("TBkndTask")), varDataFileName, replacements,
            true);

    //
    // Process Resources
    //
    replacements.clear();
    for (Resource resource : m_project.getAllResources()) {
        mapText(resource.getName(), replacements);
        mapText(resource.getInitials(), replacements);
    }
    processReplacements((DirectoryEntry) m_projectDir.getEntry("TBkndRsc"), varDataFileName, replacements,
            true);

    //
    // Process project properties
    //
    replacements.clear();
    ProjectProperties properties = m_project.getProjectProperties();
    mapText(properties.getProjectTitle(), replacements);
    processReplacements(m_projectDir, "Props", replacements, true);

    replacements.clear();
    mapText(properties.getProjectTitle(), replacements);
    mapText(properties.getSubject(), replacements);
    mapText(properties.getAuthor(), replacements);
    mapText(properties.getKeywords(), replacements);
    mapText(properties.getComments(), replacements);
    processReplacements(root, "\005SummaryInformation", replacements, false);

    replacements.clear();
    mapText(properties.getManager(), replacements);
    mapText(properties.getCompany(), replacements);
    mapText(properties.getCategory(), replacements);
    processReplacements(root, "\005DocumentSummaryInformation", replacements, false);

    //
    // Write the replacement raw file
    //
    FileOutputStream os = new FileOutputStream(output);
    fs.writeFilesystem(os);
    os.flush();
    os.close();
}

From source file:nz.govt.natlib.adapter.works.DocAdapter.java

License:Apache License

/**
 * Adapts an MS Works document: emits file information and version events,
 * then walks the OLE2 directory tree rooted at the POIFS root entry.
 *
 * @param file the Works document to adapt
 * @param ctx the parser context receiving events
 * @throws IOException on read errors
 */
public void adapt(File file, ParserContext ctx) throws IOException {
    ctx.fireStartParseEvent("MSWorks");
    writeFileInfo(file, ctx);
    ctx.fireParseEvent("Version", "Works");
    FileInputStream fin = null;
    try {
        fin = new FileInputStream(file);
        POIFSFileSystem fs = new POIFSFileSystem(fin);
        // Recurse through the OLE2 directory structure from the root
        readDirectory(fs, fs.getRoot(), ctx);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        AdapterUtils.close(fin);
    }
    ctx.fireEndParseEvent("MSWorks");
}

From source file:org.apache.nutch.parse.msword.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document.
 *
 * @param in The InputStream representing the Word file.
 * @return the extracted document text
 * @throws Exception on read errors, or for fast-saved / password-protected
 *         documents which are not supported
 */
protected String extractText(InputStream in) throws Exception {

    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    // Flags word of the file information block (FIB); offsets follow the
    // Word binary format -- bit 0x4 = fast-saved, bit 0x100 = encrypted.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    // Character-run (CHP) bin table location and size within the table stream
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece containing the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies entirely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans multiple pieces: consume pieces until the run ends
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of text pieces; return what we have
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java

License:Apache License

/**
 * Handles an embedded OLE object in the document.
 *
 * @param part the package part holding the embedded OLE2 data
 * @param handler the content handler receiving any extracted content
 * @param rel the relationship id of the embedded part
 * @throws IOException on read errors
 * @throws SAXException on content handler errors
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
        // Too small, skip
        return;
    }

    // Open the POIFS (OLE2) structure and process
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode poifsRoot = fs.getRoot();
        POIFSDocumentType docType = POIFSDocumentType.detectType(poifsRoot);

        boolean looksLikeOle2 = poifsRoot.hasEntry("CONTENTS") && poifsRoot.hasEntry("\u0001Ole")
                && poifsRoot.hasEntry("\u0001CompObj") && poifsRoot.hasEntry("\u0003ObjInfo");

        if (looksLikeOle2) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            TikaInputStream stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == docType) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            String label = ole.getLabel();
            if (label != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, label);
            }

            TikaInputStream stream = null;
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            // Unrecognized OLE payload; fall back to generic embedded handling
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    }
}

From source file:org.apache.tika.parser.wordperfect.QPWTextExtractor.java

License:Apache License

/**
 * Extracts text from a QuattroPro (QPW) document stored in an OLE2
 * container, emitting the result as a single paragraph to the handler.
 *
 * @param input the raw document stream
 * @param xhtml the XHTML content handler receiving extracted text
 * @param metadata document metadata, available to individual extractors
 * @throws IOException on read errors
 * @throws SAXException on content handler errors
 * @throws TikaException if the file is not a recognized QuattroPro format
 */
@SuppressWarnings("resource")
public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
        throws IOException, SAXException, TikaException {

    POIFSFileSystem pfs = new POIFSFileSystem(input);
    DirectoryNode rootNode = pfs.getRoot();
    boolean hasDocumentEntry = rootNode != null && rootNode.hasEntry(OLE_DOCUMENT_NAME);
    if (!hasDocumentEntry) {
        throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \""
                + OLE_DOCUMENT_NAME + "\". Found: " + rootNode.getEntryNames());
    }

    //TODO shall we validate and throw warning/error if the file does not
    //start with a BOF and ends with a EOF?
    xhtml.startElement("p");
    try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
        Context ctx = new Context(in, xhtml, metadata);
        while (hasNext(in)) {
            // Each record is a type word followed by a body-length word
            ctx.type = in.readWPShort();
            ctx.bodyLength = in.readWPShort();
            Extractor recordExtractor = EXTRACTORS.get(ctx.type);
            if (recordExtractor == null) {
                // Use DEBUG to find out what we are ignoring
                //                    Extractor.DEBUG.extract(ctx);
                recordExtractor = Extractor.IGNORE;
            }
            recordExtractor.extract(ctx);
        }
    }
    xhtml.endElement("p");
}