List of usage examples for org.apache.poi.hdgf.extractor.VisioTextExtractor.getAllText()
public String[] getAllText()
From source file:com.jaeksoft.searchlib.parser.VisioParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { VisioTextExtractor extractor = null; try {//from w w w. j a v a 2 s. c o m extractor = new VisioTextExtractor(streamLimiter.getNewInputStream()); SummaryInformation info = extractor.getSummaryInformation(); ParserResultItem result = getNewParserResultItem(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } String[] texts = extractor.getAllText(); if (texts == null) return; for (String text : texts) result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(text, " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(extractor); } }
From source file:com.opensearchserver.extractor.parser.Visio.java
License:Apache License
/**
 * Extracts summary metadata and text from a Visio document read from the
 * given stream.
 *
 * <p>When summary information is present, title, author, subject, creation
 * date, last-save date, keywords and comments are added to {@code metas}.
 * When the extractor returns a text array, a new parser document is obtained,
 * every text chunk is added to it under {@code CONTENT}, and the result of
 * language detection on {@code CONTENT} is added under {@code LANG_DETECTION}.
 * No parser document is created when the text array is {@code null}.
 *
 * @param inputStream the Visio document content
 * @param extension   not referenced by this implementation
 * @param mimeType    not referenced by this implementation
 * @throws Exception declared by this method and propagated from the calls it makes
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(inputStream);

        final SummaryInformation summary = visio.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // NOTE(review): keywords are stored under CONTENT rather than a
            // dedicated field - confirm this is intended.
            metas.add(CONTENT, summary.getKeywords());
            metas.add(COMMENTS, summary.getComments());
        }

        final String[] chunks = visio.getAllText();
        if (chunks != null) {
            final ParserDocument document = getNewParserDocument();
            for (int i = 0; i < chunks.length; i++) {
                document.add(CONTENT, chunks[i]);
            }
            document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        // The extractor reference is still null if its constructor threw.
        if (visio != null) {
            IOUtils.closeQuietly(visio);
        }
    }
}
From source file:com.opensearchserver.textextractor.parser.Visio.java
License:Open Source License
@Override protected void parseContent(InputStream inputStream) throws Exception { VisioTextExtractor extractor = null; try {//from ww w . jav a 2 s .co m extractor = new VisioTextExtractor(inputStream); SummaryInformation info = extractor.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(CONTENT, info.getKeywords()); metas.add(COMMENTS, info.getComments()); } String[] texts = extractor.getAllText(); if (texts == null) return; ParserDocument result = getNewParserDocument(); for (String text : texts) result.add(CONTENT, text); result.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { if (extractor != null) IOUtils.closeQuietly(extractor); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.OfficeParser.java
License:Apache License
/**
 * Parses an OLE2 document rooted at {@code root}, dispatching on the detected
 * document type.
 *
 * <p>Summary entries are parsed first so that metadata is populated before
 * any content is emitted. The detected type is recorded on the metadata
 * unless it is {@code UNKNOWN}. Content is then written to {@code xhtml} by
 * the extractor matching the type; types without a case below produce no
 * content beyond the metadata.
 *
 * @param root     root directory node of the OLE2 document
 * @param context  parse context, consulted for a {@link Locale} and a
 *                 {@code PasswordProvider}
 * @param metadata metadata object to populate
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   declared by this method
 * @throws SAXException  declared by this method
 * @throws TikaException declared by this method; an
 *                       {@code EncryptedDocumentException} is thrown when the
 *                       password cannot be verified or decryption fails with a
 *                       {@link GeneralSecurityException}
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    // Summaries go first so that metadata is available early.
    new SummaryExtractor(metadata).parseSummaries(root);

    // Then the remaining document entries, by detected type.
    final POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }

    switch (type) {
    case SOLIDWORKS_PART:
    case SOLIDWORKS_ASSEMBLY:
    case SOLIDWORKS_DRAWING:
        // No content is extracted for Solidworks files here; the
        // SolidworksExtractor call for parts is disabled.
        break;

    case PUBLISHER:
        PublisherTextExtractor publisher = new PublisherTextExtractor(root);
        xhtml.element("p", publisher.getText());
        break;

    case WORDDOCUMENT:
        new WordExtractor(context, metadata).parse(root, xhtml);
        break;

    case POWERPOINT:
        new HSLFExtractor(context, metadata).parse(root, xhtml);
        break;

    case WORKBOOK:
    case XLR:
        Locale workbookLocale = context.get(Locale.class, Locale.getDefault());
        new ExcelExtractor(context, metadata).parse(root, xhtml, workbookLocale);
        break;

    case PROJECT:
        // We currently can't do anything beyond the metadata
        break;

    case VISIO:
        VisioTextExtractor visio = new VisioTextExtractor(root);
        String[] paragraphs = visio.getAllText();
        for (int i = 0; i < paragraphs.length; i++) {
            xhtml.element("p", paragraphs[i]);
        }
        break;

    case OUTLOOK:
        OutlookExtractor outlook = new OutlookExtractor(root, context);
        outlook.parse(xhtml, metadata);
        break;

    case ENCRYPTED:
        EncryptionInfo encryptionInfo = new EncryptionInfo(root);
        Decryptor decryptor = Decryptor.getInstance(encryptionInfo);
        try {
            // Use the password from a supplied PasswordProvider when it
            // returns one; otherwise stick with the default Office password.
            PasswordProvider provider = context.get(PasswordProvider.class);
            String supplied = (provider == null) ? null : provider.getPassword(metadata);
            String password = (supplied == null) ? Decryptor.DEFAULT_PASSWORD : supplied;

            // Check if we've the right password or not
            if (!decryptor.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream and delegate the resulting OOXML
            // file to the regular OOXML parser for normal handling.
            OOXMLParser ooxml = new OOXMLParser();
            ooxml.parse(decryptor.getDataStream(root),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
        } catch (GeneralSecurityException ex) {
            throw new EncryptedDocumentException(ex);
        }
        break;
    }
}
From source file:org.apache.tika.parser.microsoft.OfficeParser.java
License:Apache License
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Parse summary entries first, to make metadata available early new SummaryExtractor(metadata).parseSummaries(root); // Parse remaining document entries POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type != POIFSDocumentType.UNKNOWN) { setType(metadata, type.getType()); }/*from w w w. java 2s. c o m*/ switch (type) { case SOLIDWORKS_PART: case SOLIDWORKS_ASSEMBLY: case SOLIDWORKS_DRAWING: break; case PUBLISHER: PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root); xhtml.element("p", publisherTextExtractor.getText()); break; case WORDDOCUMENT: new WordExtractor(context).parse(root, xhtml); break; case POWERPOINT: new HSLFExtractor(context).parse(root, xhtml); break; case WORKBOOK: case XLR: Locale locale = context.get(Locale.class, Locale.getDefault()); new ExcelExtractor(context, metadata).parse(root, xhtml, locale); break; case PROJECT: // We currently can't do anything beyond the metadata break; case VISIO: VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root); for (String text : visioTextExtractor.getAllText()) { xhtml.element("p", text); } break; case OUTLOOK: OutlookExtractor extractor = new OutlookExtractor(root, context); extractor.parse(xhtml, metadata); break; case ENCRYPTED: EncryptionInfo info = new EncryptionInfo(root); Decryptor d = Decryptor.getInstance(info); try { // By default, use the default Office Password String password = Decryptor.DEFAULT_PASSWORD; // If they supplied a Password Provider, ask that for the password, // and use the provider given one if available (stick with default if not) PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { String suppliedPassword = passwordProvider.getPassword(metadata); if (suppliedPassword != null) { password = suppliedPassword; } } // 
Check if we've the right password or not if (!d.verifyPassword(password)) { throw new EncryptedDocumentException(); } // Decrypt the OLE2 stream, and delegate the resulting OOXML // file to the regular OOXML parser for normal handling OOXMLParser parser = new OOXMLParser(); parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context); } catch (GeneralSecurityException ex) { throw new EncryptedDocumentException(ex); } default: // For unsupported / unhandled types, just the metadata // is extracted, which happened above break; } }