List of usage examples for org.apache.poi.hdgf.extractor.VisioTextExtractor#getSummaryInformation()
public SummaryInformation getSummaryInformation()
From source file:com.jaeksoft.searchlib.parser.VisioParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { VisioTextExtractor extractor = null; try {//from w w w.ja va 2s . co m extractor = new VisioTextExtractor(streamLimiter.getNewInputStream()); SummaryInformation info = extractor.getSummaryInformation(); ParserResultItem result = getNewParserResultItem(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.author, info.getAuthor()); result.addField(ParserFieldEnum.subject, info.getSubject()); } String[] texts = extractor.getAllText(); if (texts == null) return; for (String text : texts) result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(text, " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(extractor); } }
From source file:com.opensearchserver.extractor.parser.Visio.java
License:Apache License
@Override protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception { VisioTextExtractor extractor = null; try {//from w ww . j a va 2s . c o m extractor = new VisioTextExtractor(inputStream); SummaryInformation info = extractor.getSummaryInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(AUTHOR, info.getAuthor()); metas.add(SUBJECT, info.getSubject()); metas.add(CREATION_DATE, info.getCreateDateTime()); metas.add(MODIFICATION_DATE, info.getLastSaveDateTime()); metas.add(CONTENT, info.getKeywords()); metas.add(COMMENTS, info.getComments()); } String[] texts = extractor.getAllText(); if (texts == null) return; ParserDocument result = getNewParserDocument(); for (String text : texts) result.add(CONTENT, text); result.add(LANG_DETECTION, languageDetection(CONTENT, 10000)); } finally { if (extractor != null) IOUtils.closeQuietly(extractor); } }
From source file:com.opensearchserver.textextractor.parser.Visio.java
License:Open Source License
/**
 * Extracts summary metadata and text from a Visio document stream.
 * Metadata is added to {@code metas}; each extracted text fragment is added to
 * a new parser document together with a language-detection entry.
 *
 * @param inputStream the Visio document to read
 * @throws Exception  any failure raised while reading or extracting
 */
@Override
protected void parseContent(InputStream inputStream) throws Exception {
    VisioTextExtractor textExtractor = null;
    try {
        textExtractor = new VisioTextExtractor(inputStream);

        final SummaryInformation properties = textExtractor.getSummaryInformation();
        if (properties != null) {
            metas.add(TITLE, properties.getTitle());
            metas.add(AUTHOR, properties.getAuthor());
            metas.add(SUBJECT, properties.getSubject());
            metas.add(CREATION_DATE, properties.getCreateDateTime());
            metas.add(MODIFICATION_DATE, properties.getLastSaveDateTime());
            // NOTE(review): keywords are stored under CONTENT - confirm this is intended.
            metas.add(CONTENT, properties.getKeywords());
            metas.add(COMMENTS, properties.getComments());
        }

        final String[] extracted = textExtractor.getAllText();
        if (extracted != null) {
            // No parser document is created when the extractor yields no text array.
            final ParserDocument parsed = getNewParserDocument();
            int index = 0;
            while (index < extracted.length) {
                parsed.add(CONTENT, extracted[index]);
                index++;
            }
            parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
        }
    } finally {
        if (textExtractor != null) {
            IOUtils.closeQuietly(textExtractor);
        }
    }
}
From source file:net.yacy.document.parser.vsdParser.java
License:Open Source License
@Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null;//w w w. ja va 2s .com try { String contents = ""; SummaryInformation summary = null; try { final VisioTextExtractor extractor = new VisioTextExtractor(source); contents = extractor.getText(); summary = extractor.getSummaryInformation(); } catch (final Exception e) { ConcurrentLog.warn("vsdParser", e.getMessage()); } String author = null; String[] keywords = null; String title = null; if (summary != null) { author = summary.getAuthor(); if (summary.getKeywords() != null) { keywords = summary.getKeywords().split("[ ,;]"); } title = summary.getTitle(); } List<String> abstrct = new ArrayList<String>(); if (contents.length() > 0) abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()) .replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("\r", " ").replaceAll("\t", " ")); if (title == null) title = location.toNormalform(true); // As the result of parsing this function must return a plasmaParserDocument object return new Document[] { new Document(location, // url of the source document mimeType, // the documents mime type "UTF-8", // charset of the document text this, null, // language keywords, singleList(title), author, "", null, // an array of section headlines abstrct, // an abstract 0.0f, 0.0f, contents, // the parsed document text null, // a map of extracted anchors null, null, // a treeset of image URLs false, new Date()) }; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; // if an unexpected error occures just log the error and raise a new ParserException final String errorMsg = "Unable to parse the vsd document '" + location + "':" + e.getMessage(); AbstractParser.log.severe(errorMsg); throw new Parser.Failure(errorMsg, 
location); } finally { if (theDoc == null) { // if an unexpected error occures just log the error and raise a new Parser.Failure final String errorMsg = "Unable to parse the vsd document '" + location + "': possibly out of memory"; AbstractParser.log.severe(errorMsg); throw new Parser.Failure(errorMsg, location); } } }