Example usage of the org.apache.poi.hdgf.extractor.VisioTextExtractor constructor

List of usage examples for the org.apache.poi.hdgf.extractor.VisioTextExtractor constructor

Introduction

On this page you can find example usages of the constructor of org.apache.poi.hdgf.extractor.VisioTextExtractor.

Prototype

public VisioTextExtractor(InputStream inp) throws IOException 

Source Link

Usage

From source file:com.jaeksoft.searchlib.parser.VisioParser.java

License:Open Source License

/**
 * Reads a Visio document and copies its summary metadata and text into a
 * newly obtained parser result item.
 *
 * @param streamLimiter supplies the input stream handed to the extractor
 * @param lang          not read by this method
 * @throws IOException propagated from the stream or extractor calls
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(streamLimiter.getNewInputStream());
        SummaryInformation summary = visio.getSummaryInformation();
        ParserResultItem item = getNewParserResultItem();

        // Summary information may be absent; metadata fields are only added when it exists.
        if (summary != null) {
            item.addField(ParserFieldEnum.title, summary.getTitle());
            item.addField(ParserFieldEnum.author, summary.getAuthor());
            item.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        String[] chunks = visio.getAllText();
        if (chunks != null) {
            for (String chunk : chunks) {
                item.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(chunk, " "));
            }
            item.langDetection(10000, ParserFieldEnum.content);
        }
    } finally {
        // Called even when the constructor failed and the reference is still null.
        IOUtils.close(visio);
    }
}

From source file:com.opensearchserver.extractor.parser.Visio.java

License:Apache License

/**
 * Extracts summary metadata into {@code metas} and the document text into a
 * new parser document.
 *
 * @param inputStream stream containing the Visio document
 * @param extension   not read by this method
 * @param mimeType    not read by this method
 * @throws Exception propagated from the extractor or the parser helpers
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(inputStream);
        final SummaryInformation summary = visio.getSummaryInformation();

        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // Keywords are stored under the CONTENT field.
            metas.add(CONTENT, summary.getKeywords());
            metas.add(COMMENTS, summary.getComments());
        }

        final String[] chunks = visio.getAllText();
        if (chunks != null) {
            // The parser document is only requested when there is text to store.
            final ParserDocument document = getNewParserDocument();
            for (final String chunk : chunks) {
                document.add(CONTENT, chunk);
            }
            document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        if (visio != null) {
            IOUtils.closeQuietly(visio);
        }
    }
}

From source file:com.opensearchserver.textextractor.parser.Visio.java

License:Open Source License

/**
 * Extracts summary metadata into {@code metas} and the document text into a
 * new parser document.
 *
 * @param inputStream stream containing the Visio document
 * @throws Exception propagated from the extractor or the parser helpers
 */
@Override
protected void parseContent(InputStream inputStream) throws Exception {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(inputStream);
        final SummaryInformation summary = visio.getSummaryInformation();

        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // Keywords are stored under the CONTENT field.
            metas.add(CONTENT, summary.getKeywords());
            metas.add(COMMENTS, summary.getComments());
        }

        final String[] chunks = visio.getAllText();
        if (chunks != null) {
            // The parser document is only requested when there is text to store.
            final ParserDocument document = getNewParserDocument();
            for (final String chunk : chunks) {
                document.add(CONTENT, chunk);
            }
            document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        if (visio != null) {
            IOUtils.closeQuietly(visio);
        }
    }
}

From source file:com.qwazr.library.poi.VisioParser.java

License:Apache License

/**
 * Extracts summary metadata and text from a Visio stream into the given
 * result builder.
 *
 * @param parameters    not read by this method
 * @param inputStream   stream containing the Visio document
 * @param extension     not read by this method
 * @param mimeType      not read by this method
 * @param resultBuilder receives the metadata fields and the new document
 * @throws Exception propagated from the extractor or the builder calls
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
        String extension, final String mimeType, final ParserResultBuilder resultBuilder) throws Exception {

    try (final VisioTextExtractor visio = new VisioTextExtractor(inputStream)) {

        final SummaryInformation summary = visio.getSummaryInformation();
        if (summary != null) {
            // The metas builder is only requested when summary information exists.
            final ParserFieldsBuilder metaFields = resultBuilder.metas();
            metaFields.add(TITLE, summary.getTitle());
            metaFields.add(AUTHOR, summary.getAuthor());
            metaFields.add(SUBJECT, summary.getSubject());
            metaFields.add(CREATION_DATE, summary.getCreateDateTime());
            metaFields.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // Keywords are stored under the CONTENT field.
            metaFields.add(CONTENT, summary.getKeywords());
            metaFields.add(COMMENTS, summary.getComments());
        }

        final String[] chunks = visio.getAllText();
        if (chunks != null) {
            final ParserFieldsBuilder document = resultBuilder.newDocument();
            for (final String chunk : chunks) {
                document.add(CONTENT, chunk);
            }
            document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.OfficeParser.java

License:Apache License

/**
 * Parses an OLE2 document rooted at {@code root}: summary metadata is read
 * first, then the detected document type selects the extractor that writes
 * the content to {@code xhtml}.
 *
 * @param root     directory node of the OLE2 document
 * @param context  parse context; consulted for a {@code PasswordProvider} and passed to the extractors
 * @param metadata receives the summary entries and the detected type
 * @param xhtml    handler that receives the extracted content
 * @throws IOException   declared; propagated from the extractors
 * @throws SAXException  declared; propagated from the content handler calls
 * @throws TikaException declared; includes {@code EncryptedDocumentException} raised in the
 *                       ENCRYPTED case (presumably a TikaException subtype - confirm)
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);

    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);

    // The type is only recorded in the metadata when detection succeeded.
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }

    // No default label: types not listed below produce no content beyond the metadata above.
    switch (type) {
    case SOLIDWORKS_PART:
        // new SolidworksExtractor(context).parse(root, xhtml);
        break;
    case SOLIDWORKS_ASSEMBLY:
        break;
    case SOLIDWORKS_DRAWING:
        break;
    case PUBLISHER:
        // The whole publisher text is emitted as a single paragraph.
        PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
        xhtml.element("p", publisherTextExtractor.getText());
        break;
    case WORDDOCUMENT:
        new WordExtractor(context, metadata).parse(root, xhtml);
        break;
    case POWERPOINT:
        new HSLFExtractor(context, metadata).parse(root, xhtml);
        break;
    case WORKBOOK:
    case XLR:
        // Locale comes from the context, falling back to the JVM default.
        Locale locale = context.get(Locale.class, Locale.getDefault());
        new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
        break;
    case PROJECT:
        // We currently can't do anything beyond the metadata
        break;
    case VISIO:
        // Each text entry returned by the extractor becomes its own paragraph.
        VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
        for (String text : visioTextExtractor.getAllText()) {
            xhtml.element("p", text);
        }
        break;
    case OUTLOOK:
        OutlookExtractor extractor = new OutlookExtractor(root, context);
        extractor.parse(xhtml, metadata);
        break;
    case ENCRYPTED:
        EncryptionInfo info = new EncryptionInfo(root);
        Decryptor d = Decryptor.getInstance(info);

        try {
            // By default, use the default Office Password
            String password = Decryptor.DEFAULT_PASSWORD;

            // If they supplied a Password Provider, ask that for the password,
            // and use the provider given one if available (stick with default if
            // not)
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            if (passwordProvider != null) {
                String suppliedPassword = passwordProvider.getPassword(metadata);
                if (suppliedPassword != null) {
                    password = suppliedPassword;
                }
            }

            // Check if we've the right password or not
            if (!d.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream, and delegate the resulting OOXML
            // file to the regular OOXML parser for normal handling
            OOXMLParser parser = new OOXMLParser();

            parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                    metadata, context);
        } catch (GeneralSecurityException ex) {
            // Security failures during decryption are reported as an encrypted-document error.
            throw new EncryptedDocumentException(ex);
        }
    }
}

From source file:net.sourceforge.docfetcher.parse.MSVisioParser.java

License:Open Source License

/**
 * Returns the text of the given Visio file.
 *
 * @param file the file to read
 * @return the text reported by the extractor
 * @throws ParseException with the "file corrupted" message if the extractor cannot be
 *                        constructed, "file not found" if the file cannot be opened,
 *                        or "file not readable" on any other I/O error
 */
public String renderText(File file) throws ParseException {
    try {
        final InputStream stream = new FileInputStream(file);
        VisioTextExtractor visio = null;
        try {
            visio = new VisioTextExtractor(stream);
        } catch (Exception e) {
            // This can happen if the file has the "vsd" extension, but is not a Visio document
            throw new ParseException(file, Msg.file_corrupted.value());
        } finally {
            // The stream is closed before the text is requested from the extractor.
            stream.close();
        }
        return visio.getText();
    } catch (FileNotFoundException e) {
        throw new ParseException(file, Msg.file_not_found.value());
    } catch (IOException ioe) {
        throw new ParseException(file, Msg.file_not_readable.value());
    }
}

From source file:net.yacy.document.parser.vsdParser.java

License:Open Source License

/**
 * Parses a Visio (vsd) stream into a single-element document array.
 *
 * <p>Text and summary information are read with a {@code VisioTextExtractor}. A failure
 * while extracting is only logged as a warning, and the document is then built from
 * whatever was obtained (possibly empty text and no summary).
 *
 * <p>The previous version ended with a {@code finally} block that tested a local variable
 * which was never assigned, so it always threw {@code Parser.Failure}. That discarded the
 * successful return value as well as the {@code InterruptedException} and the specific
 * failure raised by the catch block. The variable and the {@code finally} block are removed.
 *
 * @param location       URL of the source document; its normal form is the fallback title
 * @param mimeType       MIME type recorded in the resulting document
 * @param charset        not read by this method; the document is always created with "UTF-8"
 * @param scraper        not read by this method
 * @param timezoneOffset not read by this method
 * @param source         stream containing the Visio document
 * @return an array holding exactly one parsed document
 * @throws Parser.Failure       if building the document fails with an exception
 * @throws InterruptedException if an {@code InterruptedException} surfaces while parsing
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    try {
        String contents = "";
        SummaryInformation summary = null;
        try {
            final VisioTextExtractor extractor = new VisioTextExtractor(source);
            contents = extractor.getText();
            summary = extractor.getSummaryInformation();
        } catch (final Exception e) {
            // Best effort: extraction problems are logged and parsing continues.
            ConcurrentLog.warn("vsdParser", e.getMessage());
        }

        String author = null;
        String[] keywords = null;
        String title = null;
        if (summary != null) {
            author = summary.getAuthor();
            if (summary.getKeywords() != null) {
                keywords = summary.getKeywords().split("[ ,;]");
            }
            title = summary.getTitle();
        }

        // The abstract is the first 80 characters (or the trimmed text if shorter),
        // with line breaks and tabs replaced by spaces.
        List<String> abstrct = new ArrayList<String>();
        if (contents.length() > 0) {
            abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim())
                    .replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("\r", " ").replaceAll("\t", " "));
        }

        if (title == null) {
            title = location.toNormalform(true);
        }

        // As the result of parsing this function must return a plasmaParserDocument object
        return new Document[] { new Document(location, // url of the source document
                mimeType, // the documents mime type
                "UTF-8", // charset of the document text
                this, null, // language
                keywords, singleList(title), author, "", null, // an array of section headlines
                abstrct, // an abstract
                0.0f, 0.0f, contents, // the parsed document text
                null, // a map of extracted anchors
                null, null, // a treeset of image URLs
                false, new Date()) };
    } catch (final Exception e) {
        if (e instanceof InterruptedException) {
            throw (InterruptedException) e;
        }

        // if an unexpected error occurs just log the error and raise a new Parser.Failure
        final String errorMsg = "Unable to parse the vsd document '" + location + "':" + e.getMessage();
        AbstractParser.log.severe(errorMsg);
        throw new Parser.Failure(errorMsg, location);
    }
}

From source file:org.apache.tika.parser.microsoft.OfficeParser.java

License:Apache License

/**
 * Parses an OLE2 document rooted at {@code root}: summary metadata is read
 * first, then the detected document type selects the extractor that writes
 * the content to {@code xhtml}.
 *
 * @param root     directory node of the OLE2 document
 * @param context  parse context; consulted for a {@code PasswordProvider} and passed to the extractors
 * @param metadata receives the summary entries and the detected type
 * @param xhtml    handler that receives the extracted content
 * @throws IOException   declared; propagated from the extractors
 * @throws SAXException  declared; propagated from the content handler calls
 * @throws TikaException declared; includes {@code EncryptedDocumentException} raised in the
 *                       ENCRYPTED case (presumably a TikaException subtype - confirm)
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);

    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);

    // The type is only recorded in the metadata when detection succeeded.
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }

    switch (type) {
    // The three SolidWorks types share one branch and produce no content.
    case SOLIDWORKS_PART:
    case SOLIDWORKS_ASSEMBLY:
    case SOLIDWORKS_DRAWING:
        break;
    case PUBLISHER:
        // The whole publisher text is emitted as a single paragraph.
        PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
        xhtml.element("p", publisherTextExtractor.getText());
        break;
    case WORDDOCUMENT:
        new WordExtractor(context).parse(root, xhtml);
        break;
    case POWERPOINT:
        new HSLFExtractor(context).parse(root, xhtml);
        break;
    case WORKBOOK:
    case XLR:
        // Locale comes from the context, falling back to the JVM default.
        Locale locale = context.get(Locale.class, Locale.getDefault());
        new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
        break;
    case PROJECT:
        // We currently can't do anything beyond the metadata
        break;
    case VISIO:
        // Each text entry returned by the extractor becomes its own paragraph.
        VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
        for (String text : visioTextExtractor.getAllText()) {
            xhtml.element("p", text);
        }
        break;
    case OUTLOOK:
        OutlookExtractor extractor = new OutlookExtractor(root, context);

        extractor.parse(xhtml, metadata);
        break;
    case ENCRYPTED:
        EncryptionInfo info = new EncryptionInfo(root);
        Decryptor d = Decryptor.getInstance(info);

        try {
            // By default, use the default Office Password
            String password = Decryptor.DEFAULT_PASSWORD;

            // If they supplied a Password Provider, ask that for the password,
            //  and use the provider given one if available (stick with default if not)
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            if (passwordProvider != null) {
                String suppliedPassword = passwordProvider.getPassword(metadata);
                if (suppliedPassword != null) {
                    password = suppliedPassword;
                }
            }

            // Check if we've the right password or not
            if (!d.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream, and delegate the resulting OOXML
            //  file to the regular OOXML parser for normal handling
            OOXMLParser parser = new OOXMLParser();

            parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                    metadata, context);
        } catch (GeneralSecurityException ex) {
            // Security failures during decryption are reported as an encrypted-document error.
            throw new EncryptedDocumentException(ex);
        }
        // NOTE: there is no break here; control falls through to default, which only breaks.
    default:
        // For unsupported / unhandled types, just the metadata
        //  is extracted, which happened above
        break;
    }
}

From source file:org.codelibs.fess.crawler.extractor.impl.MsVisioExtractor.java

License:Apache License

/**
 * Extracts the text of a Visio document read from the given stream.
 *
 * @param in     stream containing the Visio document; must not be {@code null}
 * @param params not read by this method
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException       if an {@code IOException} occurs during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (null == in) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    try {
        final VisioTextExtractor visio = new VisioTextExtractor(in);
        final String content = visio.getText();
        return new ExtractData(content);
    } catch (final IOException e) {
        throw new ExtractException(e);
    }
}

From source file:org.codelibs.robot.extractor.impl.MsVisioExtractor.java

License:Apache License

/**
 * Extracts the text of a Visio document read from the given stream.
 *
 * @param in     stream containing the Visio document; must not be {@code null}
 * @param params not read by this method
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException     if an {@code IOException} occurs during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (null == in) {
        throw new RobotSystemException("The inputstream is null.");
    }
    try {
        final VisioTextExtractor visio = new VisioTextExtractor(in);
        final String content = visio.getText();
        return new ExtractData(content);
    } catch (final IOException e) {
        throw new ExtractException(e);
    }
}