List of usage examples for the org.apache.poi.hdgf.extractor.VisioTextExtractor constructor
public VisioTextExtractor(InputStream inp) throws IOException
From source file:com.jaeksoft.searchlib.parser.VisioParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { VisioTextExtractor extractor = null; try {//from w ww .j av a 2s . c o m extractor = new VisioTextExtractor(streamLimiter.getNewInputStream()); SummaryInformation info = extractor.getSummaryInformation(); ParserResultItem result = getNewParserResultItem(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } String[] texts = extractor.getAllText(); if (texts == null) return; for (String text : texts) result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(text, " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(extractor); } }
From source file:com.opensearchserver.extractor.parser.Visio.java
License:Apache License
/**
 * Parses a Visio document: summary properties go into {@code metas}, and each
 * extracted text fragment is added to a new parser document together with the
 * result of language detection.
 *
 * @param inputStream the Visio document content
 * @param extension   file extension hint; not used by this implementation
 * @param mimeType    MIME type hint; not used by this implementation
 * @throws Exception if the document cannot be read or processed
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    VisioTextExtractor visio = null;
    try {
        visio = new VisioTextExtractor(inputStream);
        SummaryInformation summary = visio.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            // NOTE(review): keywords are stored under CONTENT; confirm this is intended.
            metas.add(CONTENT, summary.getKeywords());
            metas.add(COMMENTS, summary.getComments());
        }
        String[] fragments = visio.getAllText();
        if (fragments != null) {
            ParserDocument document = getNewParserDocument();
            for (int i = 0; i < fragments.length; i++) {
                document.add(CONTENT, fragments[i]);
            }
            document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        if (visio != null) {
            IOUtils.closeQuietly(visio);
        }
    }
}
From source file:com.opensearchserver.textextractor.parser.Visio.java
License:Open Source License
@Override protected void parseContent(InputStream inputStream) throws Exception { VisioTextExtractor extractor = null; try {//from w ww .j a v a2 s . c om extractor = new VisioTextExtractor(inputStream); SummaryInformation info = extractor.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(CONTENT, info.getKeywords()); metas.add(COMMENTS, info.getComments()); } String[] texts = extractor.getAllText(); if (texts == null) return; ParserDocument result = getNewParserDocument(); for (String text : texts) result.add(CONTENT, text); result.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { if (extractor != null) IOUtils.closeQuietly(extractor); } }
From source file:com.qwazr.library.poi.VisioParser.java
License:Apache License
@Override public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream, String extension, final String mimeType, final ParserResultBuilder resultBuilder) throws Exception { try (final VisioTextExtractor extractor = new VisioTextExtractor(inputStream)) { final SummaryInformation info = extractor.getSummaryInformation(); if (info != null) { final ParserFieldsBuilder metas = resultBuilder.metas(); metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(CONTENT, info.getKeywords()); metas.add(COMMENTS, info.getComments()); }//from www .ja v a 2 s .c om final String[] texts = extractor.getAllText(); if (texts == null) return; final ParserFieldsBuilder result = resultBuilder.newDocument(); for (String text : texts) result.add(CONTENT, text); result.add(LANG_DETECTION, languageDetection(result, CONTENT, 10000)); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.OfficeParser.java
License:Apache License
/**
 * Dispatches an OLE2 document to the extractor matching its detected type.
 * Summary metadata is always parsed first; types without a dedicated branch
 * contribute metadata only.
 *
 * @param root     root directory node of the OLE2 container
 * @param context  parse context, consulted for the locale and a password provider
 * @param metadata metadata object populated during parsing
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the document cannot be parsed or decrypted
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    // Summary entries go first so that metadata is available early.
    new SummaryExtractor(metadata).parseSummaries(root);

    final POIFSDocumentType detected = POIFSDocumentType.detectType(root);
    if (POIFSDocumentType.UNKNOWN != detected) {
        setType(metadata, detected.getType());
    }

    switch (detected) {
    case SOLIDWORKS_PART:
    case SOLIDWORKS_ASSEMBLY:
    case SOLIDWORKS_DRAWING:
        // No content extraction is performed for Solidworks documents here.
        break;
    case PUBLISHER: {
        PublisherTextExtractor publisher = new PublisherTextExtractor(root);
        xhtml.element("p", publisher.getText());
        break;
    }
    case WORDDOCUMENT:
        new WordExtractor(context, metadata).parse(root, xhtml);
        break;
    case POWERPOINT:
        new HSLFExtractor(context, metadata).parse(root, xhtml);
        break;
    case WORKBOOK:
    case XLR: {
        Locale defaultLocale = Locale.getDefault();
        Locale locale = context.get(Locale.class, defaultLocale);
        new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
        break;
    }
    case PROJECT:
        // We currently can't do anything beyond the metadata
        break;
    case VISIO: {
        VisioTextExtractor visio = new VisioTextExtractor(root);
        String[] fragments = visio.getAllText();
        for (int i = 0; i < fragments.length; i++) {
            xhtml.element("p", fragments[i]);
        }
        break;
    }
    case OUTLOOK: {
        OutlookExtractor outlook = new OutlookExtractor(root, context);
        outlook.parse(xhtml, metadata);
        break;
    }
    case ENCRYPTED: {
        EncryptionInfo encryptionInfo = new EncryptionInfo(root);
        Decryptor decryptor = Decryptor.getInstance(encryptionInfo);
        try {
            // Start from the default Office password; a supplied provider may
            // override it, but only with a non-null password.
            String password = Decryptor.DEFAULT_PASSWORD;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                String supplied = provider.getPassword(metadata);
                if (supplied != null) {
                    password = supplied;
                }
            }

            if (!decryptor.verifyPassword(password)) {
                throw new EncryptedDocumentException();
            }

            // Decrypt the OLE2 stream and hand the resulting OOXML file to
            // the regular OOXML parser.
            OOXMLParser ooxml = new OOXMLParser();
            ooxml.parse(decryptor.getDataStream(root),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
        } catch (GeneralSecurityException ex) {
            throw new EncryptedDocumentException(ex);
        }
        break;
    }
    }
}
From source file:net.sourceforge.docfetcher.parse.MSVisioParser.java
License:Open Source License
public String renderText(File file) throws ParseException { InputStream in = null;// w ww .j a v a2 s .c om try { in = new FileInputStream(file); VisioTextExtractor extractor = null; try { extractor = new VisioTextExtractor(in); } catch (Exception e) { // This can happen if the file has the "vsd" extension, but is not a Visio document throw new ParseException(file, Msg.file_corrupted.value()); } finally { in.close(); } return extractor.getText(); } catch (FileNotFoundException e) { throw new ParseException(file, Msg.file_not_found.value()); } catch (IOException ioe) { throw new ParseException(file, Msg.file_not_readable.value()); } }
From source file:net.yacy.document.parser.vsdParser.java
License:Open Source License
@Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null;//from www . j av a2 s. c o m try { String contents = ""; SummaryInformation summary = null; try { final VisioTextExtractor extractor = new VisioTextExtractor(source); contents = extractor.getText(); summary = extractor.getSummaryInformation(); } catch (final Exception e) { ConcurrentLog.warn("vsdParser", e.getMessage()); } String author = null; String[] keywords = null; String title = null; if (summary != null) { author = summary.getAuthor(); if (summary.getKeywords() != null) { keywords = summary.getKeywords().split("[ ,;]"); } title = summary.getTitle(); } List<String> abstrct = new ArrayList<String>(); if (contents.length() > 0) abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()) .replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("\r", " ").replaceAll("\t", " ")); if (title == null) title = location.toNormalform(true); // As the result of parsing this function must return a plasmaParserDocument object return new Document[] { new Document(location, // url of the source document mimeType, // the documents mime type "UTF-8", // charset of the document text this, null, // language keywords, singleList(title), author, "", null, // an array of section headlines abstrct, // an abstract 0.0f, 0.0f, contents, // the parsed document text null, // a map of extracted anchors null, null, // a treeset of image URLs false, new Date()) }; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; // if an unexpected error occures just log the error and raise a new ParserException final String errorMsg = "Unable to parse the vsd document '" + location + "':" + e.getMessage(); AbstractParser.log.severe(errorMsg); throw new 
Parser.Failure(errorMsg, location); } finally { if (theDoc == null) { // if an unexpected error occures just log the error and raise a new Parser.Failure final String errorMsg = "Unable to parse the vsd document '" + location + "': possibly out of memory"; AbstractParser.log.severe(errorMsg); throw new Parser.Failure(errorMsg, location); } } }
From source file:org.apache.tika.parser.microsoft.OfficeParser.java
License:Apache License
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Parse summary entries first, to make metadata available early new SummaryExtractor(metadata).parseSummaries(root); // Parse remaining document entries POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type != POIFSDocumentType.UNKNOWN) { setType(metadata, type.getType()); }//from w w w . j a v a 2s . c om switch (type) { case SOLIDWORKS_PART: case SOLIDWORKS_ASSEMBLY: case SOLIDWORKS_DRAWING: break; case PUBLISHER: PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root); xhtml.element("p", publisherTextExtractor.getText()); break; case WORDDOCUMENT: new WordExtractor(context).parse(root, xhtml); break; case POWERPOINT: new HSLFExtractor(context).parse(root, xhtml); break; case WORKBOOK: case XLR: Locale locale = context.get(Locale.class, Locale.getDefault()); new ExcelExtractor(context, metadata).parse(root, xhtml, locale); break; case PROJECT: // We currently can't do anything beyond the metadata break; case VISIO: VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root); for (String text : visioTextExtractor.getAllText()) { xhtml.element("p", text); } break; case OUTLOOK: OutlookExtractor extractor = new OutlookExtractor(root, context); extractor.parse(xhtml, metadata); break; case ENCRYPTED: EncryptionInfo info = new EncryptionInfo(root); Decryptor d = Decryptor.getInstance(info); try { // By default, use the default Office Password String password = Decryptor.DEFAULT_PASSWORD; // If they supplied a Password Provider, ask that for the password, // and use the provider given one if available (stick with default if not) PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { String suppliedPassword = passwordProvider.getPassword(metadata); if (suppliedPassword != null) { password = suppliedPassword; } } // 
Check if we've the right password or not if (!d.verifyPassword(password)) { throw new EncryptedDocumentException(); } // Decrypt the OLE2 stream, and delegate the resulting OOXML // file to the regular OOXML parser for normal handling OOXMLParser parser = new OOXMLParser(); parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context); } catch (GeneralSecurityException ex) { throw new EncryptedDocumentException(ex); } default: // For unsupported / unhandled types, just the metadata // is extracted, which happened above break; } }
From source file:org.codelibs.fess.crawler.extractor.impl.MsVisioExtractor.java
License:Apache License
/**
 * Extracts the text of a Visio document read from the given stream.
 *
 * @param in     stream containing the Visio document; must not be null
 * @param params extraction parameters; not used by this implementation
 * @return the extracted text wrapped in an {@link ExtractData}
 * @throws CrawlerSystemException if {@code in} is null
 * @throws ExtractException       if reading the document fails with an I/O error
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (null == in) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    final String text;
    try {
        final VisioTextExtractor visio = new VisioTextExtractor(in);
        text = visio.getText();
    } catch (final IOException e) {
        throw new ExtractException(e);
    }
    return new ExtractData(text);
}
From source file:org.codelibs.robot.extractor.impl.MsVisioExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }//from w w w . j av a 2s . co m try { return new ExtractData(new VisioTextExtractor(in).getText()); } catch (final IOException e) { throw new ExtractException(e); } }