Example usage of org.apache.poi.hpbf.extractor.PublisherTextExtractor.getText()

List of usage examples for org.apache.poi.hpbf.extractor.PublisherTextExtractor.getText()

Introduction

On this page you can find example usages of the getText() method of org.apache.poi.hpbf.extractor.PublisherTextExtractor.

Prototype

public String getText() 

Source Link

Usage

From source file:com.jaeksoft.searchlib.parser.PublisherParser.java

License:Open Source License

/**
 * Extracts summary metadata and body text from a Publisher document and
 * stores them in a newly obtained parser result item.
 *
 * @param streamLimiter supplies the input stream the document is read from
 * @param lang          not used by this implementation
 * @throws IOException declared by the overridden contract; propagated from the
 *                     underlying stream / extractor calls
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    PublisherTextExtractor publisherExtractor = null;
    try {
        publisherExtractor = new PublisherTextExtractor(streamLimiter.getNewInputStream());

        final SummaryInformation summary = publisherExtractor.getSummaryInformation();
        final ParserResultItem item = getNewParserResultItem();

        // Summary information may be absent; the metadata fields are then skipped.
        if (summary != null) {
            item.addField(ParserFieldEnum.title, summary.getTitle());
            item.addField(ParserFieldEnum.author, summary.getAuthor());
            item.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        // Runs of spaces in the extracted text are replaced by a single space before indexing.
        final String rawText = publisherExtractor.getText();
        item.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(rawText, " "));
        item.langDetection(10000, ParserFieldEnum.content);
    } finally {
        // Also reached with a null reference when the extractor could not be constructed.
        IOUtils.close(publisherExtractor);
    }
}

From source file:com.opensearchserver.extractor.parser.Publisher.java

License:Apache License

/**
 * Reads a Publisher document from the given stream, records its summary
 * information in {@code metas} and, when the document has text, adds that
 * text and its language detection result to a new parser document.
 *
 * @param inputStream the stream the Publisher document is read from
 * @param extension   not used by this implementation
 * @param mimeType    not used by this implementation
 * @throws Exception any failure raised while reading or extracting the document
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    PublisherTextExtractor publisherExtractor = null;
    try {
        publisherExtractor = new PublisherTextExtractor(inputStream);

        final SummaryInformation summary = publisherExtractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // NOTE(review): keywords are stored under CONTENT; a dedicated keywords
            // field may have been intended - confirm against the declared fields.
            metas.add(CONTENT, summary.getKeywords());
            metas.add(COMMENTS, summary.getComments());
        }

        // A parser document is only created when the extractor produced some text.
        final String bodyText = publisherExtractor.getText();
        if (!StringUtils.isEmpty(bodyText)) {
            final ParserDocument document = getNewParserDocument();
            document.add(CONTENT, bodyText);
            document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        // The extractor stays null when its constructor threw.
        if (publisherExtractor != null) {
            IOUtils.closeQuietly(publisherExtractor);
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.OfficeParser.java

License:Apache License

/**
 * Parses an OLE2 document rooted at {@code root}: summary metadata is
 * extracted first, then the detected document type selects the extractor
 * that writes the content to {@code xhtml}.
 *
 * @param root     root directory node of the OLE2 document
 * @param context  parse context; consulted for a {@code Locale} (workbooks) and
 *                 a {@code PasswordProvider} (encrypted documents)
 * @param metadata metadata object filled from the summaries and passed on to
 *                 the type-specific extractors
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   if reading the document fails, or if closing the
 *                       decrypted stream of an encrypted document fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if a delegate extractor fails
 * @throws EncryptedDocumentException if the password of an encrypted document
 *                       cannot be verified or decryption raises a
 *                       {@code GeneralSecurityException}
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);

    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);

    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }

    switch (type) {
    case SOLIDWORKS_PART:
    case SOLIDWORKS_ASSEMBLY:
    case SOLIDWORKS_DRAWING:
        // No content extraction is performed for SolidWorks files here
        break;
    case PUBLISHER:
        PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
        xhtml.element("p", publisherTextExtractor.getText());
        break;
    case WORDDOCUMENT:
        new WordExtractor(context, metadata).parse(root, xhtml);
        break;
    case POWERPOINT:
        new HSLFExtractor(context, metadata).parse(root, xhtml);
        break;
    case WORKBOOK:
    case XLR:
        Locale locale = context.get(Locale.class, Locale.getDefault());
        new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
        break;
    case PROJECT:
        // We currently can't do anything beyond the metadata
        break;
    case VISIO:
        VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
        for (String text : visioTextExtractor.getAllText()) {
            xhtml.element("p", text);
        }
        break;
    case OUTLOOK:
        OutlookExtractor extractor = new OutlookExtractor(root, context);
        extractor.parse(xhtml, metadata);
        break;
    case ENCRYPTED:
        EncryptionInfo info = new EncryptionInfo(root);
        Decryptor d = Decryptor.getInstance(info);

        try {
            // By default, use the default Office Password
            String password = Decryptor.DEFAULT_PASSWORD;

            // If they supplied a Password Provider, ask that for the password,
            // and use the provider given one if available (stick with default if
            // not)
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            if (passwordProvider != null) {
                String suppliedPassword = passwordProvider.getPassword(metadata);
                if (suppliedPassword != null) {
                    password = suppliedPassword;
                }
            }

            // Check if we've the right password or not
            if (!d.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream, and delegate the resulting OOXML
            // file to the regular OOXML parser for normal handling
            OOXMLParser parser = new OOXMLParser();

            // The decrypted stream is opened by this method, so it is closed
            // here as well once the delegate parser is done (or has failed).
            java.io.InputStream decrypted = d.getDataStream(root);
            try {
                parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                        metadata, context);
            } finally {
                decrypted.close();
            }
        } catch (GeneralSecurityException ex) {
            throw new EncryptedDocumentException(ex);
        }
        break;
    default:
        // For unsupported / unhandled types, just the metadata
        // is extracted, which happened above
        break;
    }
}

From source file:org.apache.tika.parser.microsoft.OfficeParser.java

License:Apache License

/**
 * Parses an OLE2 document rooted at {@code root}: summary metadata is
 * extracted first, then the detected document type selects the extractor
 * that writes the content to {@code xhtml}. Types without a dedicated
 * branch contribute metadata only.
 *
 * @param root     root directory node of the OLE2 document
 * @param context  parse context; consulted for a {@code Locale} (workbooks) and
 *                 a {@code PasswordProvider} (encrypted documents)
 * @param metadata metadata object filled from the summaries and passed on to
 *                 the extractors that accept it
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   if reading the document fails, or if closing the
 *                       decrypted stream of an encrypted document fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if a delegate extractor fails
 * @throws EncryptedDocumentException if the password of an encrypted document
 *                       cannot be verified or decryption raises a
 *                       {@code GeneralSecurityException}
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);

    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);

    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }

    switch (type) {
    case SOLIDWORKS_PART:
    case SOLIDWORKS_ASSEMBLY:
    case SOLIDWORKS_DRAWING:
        // No content extraction is performed for SolidWorks files here
        break;
    case PUBLISHER:
        PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
        xhtml.element("p", publisherTextExtractor.getText());
        break;
    case WORDDOCUMENT:
        new WordExtractor(context).parse(root, xhtml);
        break;
    case POWERPOINT:
        new HSLFExtractor(context).parse(root, xhtml);
        break;
    case WORKBOOK:
    case XLR:
        Locale locale = context.get(Locale.class, Locale.getDefault());
        new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
        break;
    case PROJECT:
        // We currently can't do anything beyond the metadata
        break;
    case VISIO:
        VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
        for (String text : visioTextExtractor.getAllText()) {
            xhtml.element("p", text);
        }
        break;
    case OUTLOOK:
        OutlookExtractor extractor = new OutlookExtractor(root, context);

        extractor.parse(xhtml, metadata);
        break;
    case ENCRYPTED:
        EncryptionInfo info = new EncryptionInfo(root);
        Decryptor d = Decryptor.getInstance(info);

        try {
            // By default, use the default Office Password
            String password = Decryptor.DEFAULT_PASSWORD;

            // If they supplied a Password Provider, ask that for the password,
            //  and use the provider given one if available (stick with default if not)
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            if (passwordProvider != null) {
                String suppliedPassword = passwordProvider.getPassword(metadata);
                if (suppliedPassword != null) {
                    password = suppliedPassword;
                }
            }

            // Check if we've the right password or not
            if (!d.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream, and delegate the resulting OOXML
            //  file to the regular OOXML parser for normal handling
            OOXMLParser parser = new OOXMLParser();

            // The decrypted stream is opened by this method, so it is closed
            //  here as well once the delegate parser is done (or has failed).
            java.io.InputStream decrypted = d.getDataStream(root);
            try {
                parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                        metadata, context);
            } finally {
                decrypted.close();
            }
        } catch (GeneralSecurityException ex) {
            throw new EncryptedDocumentException(ex);
        }
        // Explicit break instead of silently falling through into default
        break;
    default:
        // For unsupported / unhandled types, just the metadata
        //  is extracted, which happened above
        break;
    }
}