org.apache.tika.parser.wordperfect.QPWTextExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.wordperfect.QPWTextExtractor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.wordperfect;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.QuattroPro;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * Extracts text from a Quattro Pro document according to QPW v9 File Format.
 * This format appears to be compatible with more recent versions too.
 * @author Pascal Essiembre
 */
class QPWTextExtractor {

    private static final Logger LOG = LogManager.getLogger(QPWTextExtractor.class);

    private static final String OLE_DOCUMENT_NAME = "NativeContent_MAIN";

    private enum Extractor {
        IGNORE {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.in.skipWPByte(ctx.bodyLength);
            }
        },
        BOF {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.metadata.set(QuattroPro.ID, ctx.in.readWPString(4));
                ctx.metadata.set(QuattroPro.VERSION, ctx.in.readWPShort());
                ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort());
                ctx.in.readWPShort(); // Last saved bits
                ctx.metadata.set(QuattroPro.LOWEST_VERSION, ctx.in.readWPShort());
                ctx.metadata.set(Office.PAGE_COUNT, ctx.in.readWPShort());
                ctx.in.skipWPByte(ctx.bodyLength - 14);
            }
        },
        USER {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.metadata.set(TikaCoreProperties.CREATOR, getQstrLabel(ctx.in));
                ctx.metadata.set(TikaCoreProperties.MODIFIER, getQstrLabel(ctx.in));
            }
        },
        EXT_LINK {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // index
                ctx.in.readWPShort(); // page first
                ctx.in.readWPShort(); // page last
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        STRING_TABLE {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                long entries = ctx.in.readWPLong();
                ctx.in.readWPLong(); // Total used
                ctx.in.readWPLong(); // Total saved
                for (int i = 0; i < entries; i++) {
                    ctx.xhtml.characters(getQstrLabel(ctx.in));
                    ctx.xhtml.characters(System.lineSeparator());
                }
            }
        },
        BOS {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // sheet #
                ctx.in.readWPShort(); // first col index
                ctx.in.readWPShort(); // last col index
                ctx.in.readWPLong(); // first row index
                ctx.in.readWPLong(); // last row index
                ctx.in.readWPShort(); // format
                ctx.in.readWPShort(); // flags
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        SHEET_HEADFOOT {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // flag
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        FORMULA_STRING_VALUE {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong(); // row
                ctx.xhtml.characters(getQstrLabel(ctx.in));
            }
        },
        CGENERICLABEL {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong(); // row
                ctx.in.readWPShort(); // format index
                ctx.xhtml.characters(getQstrLabel(ctx.in));
            }
        },
        CCOMMENT {
            @Override
            public void extract(Context ctx) throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong(); // row
                ctx.in.readWPLong(); // flag
                ctx.xhtml.characters(getQstrLabel(ctx.in)); // author name
                ctx.xhtml.characters(getQstrLabel(ctx.in)); // comment
            }
        },
        // Use to print out a chunk
        DEBUG {
            @Override
            public void extract(Context ctx) throws IOException {
                LOG.error("REC (" + Integer.toHexString(ctx.type) + "/" + ctx.bodyLength + "):"
                        + ctx.in.readWPString(ctx.bodyLength));
            }
        };
        public abstract void extract(Context ctx) throws IOException, SAXException;
    }

    // Holds extractors for each record types we are interested in.
    // All record types not defined here will be skipped.
    private static final Map<Integer, Extractor> EXTRACTORS = new HashMap<>();
    static {
        //--- Global Records ---
        EXTRACTORS.put(0x0001, Extractor.BOF); // Beginning of file
        EXTRACTORS.put(0x0005, Extractor.USER); // User

        //--- Notebook Records ---
        EXTRACTORS.put(0x0403, Extractor.EXT_LINK);// External link
        EXTRACTORS.put(0x0407, Extractor.STRING_TABLE); // String table

        //--- Sheet Records ---
        EXTRACTORS.put(0x0601, Extractor.BOS); // Beginning of sheet
        EXTRACTORS.put(0x0605, Extractor.SHEET_HEADFOOT); // Sheet header
        EXTRACTORS.put(0x0606, Extractor.SHEET_HEADFOOT); // Sheet footer

        //--- Cells ---
        EXTRACTORS.put(0x0c02, Extractor.FORMULA_STRING_VALUE);
        EXTRACTORS.put(0x0c72, Extractor.CGENERICLABEL);
        EXTRACTORS.put(0x0c80, Extractor.CCOMMENT);
    }

    class Context {
        private final WPInputStream in;
        private final XHTMLContentHandler xhtml;
        private final Metadata metadata;
        private int type;
        private int bodyLength;

        public Context(WPInputStream in, XHTMLContentHandler xhtml, Metadata metadata) {
            super();
            this.in = in;
            this.xhtml = xhtml;
            this.metadata = metadata;
        }
    }

    @SuppressWarnings("resource")
    public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
            throws IOException, SAXException, TikaException {

        POIFSFileSystem pfs = new POIFSFileSystem(input);
        DirectoryNode rootNode = pfs.getRoot();
        if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
            throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \""
                    + OLE_DOCUMENT_NAME + "\". Found: " + rootNode.getEntryNames());
        }

        //TODO shall we validate and throw warning/error if the file does not 
        //start with a BOF and ends with a EOF?
        xhtml.startElement("p");
        try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
            Context ctx = new Context(in, xhtml, metadata);
            while (hasNext(in)) {
                ctx.type = in.readWPShort();
                ctx.bodyLength = in.readWPShort();
                Extractor extractor = EXTRACTORS.get(ctx.type);
                if (extractor != null) {
                    extractor.extract(ctx);
                } else {
                    // Use DEBUG to find out what we are ignoring
                    //                    Extractor.DEBUG.extract(ctx);
                    Extractor.IGNORE.extract(ctx);
                }
            }
        }
        xhtml.endElement("p");
    }

    private boolean hasNext(InputStream in) throws IOException {
        try {
            in.mark(1);
            return in.read() != -1;
        } finally {
            in.reset();
        }
    }

    private static String getQstrLabel(WPInputStream in) throws IOException {
        // QSTR
        int count = in.readWPShort();
        in.readWPByte(); // string type
        char[] text = new char[count + 1];
        text[0] = in.readWPChar();

        // QSTRLABEL
        for (int i = 0; i < count; i++) {
            text[i + 1] = in.readWPChar();
        }
        return new String(text);
    }
}