List of usage examples for org.apache.poi.hdgf.extractor.VisioTextExtractor#getText()
@Override
public String getText()
From source file:net.sourceforge.docfetcher.parse.MSVisioParser.java
License:Open Source License
public String renderText(File file) throws ParseException { InputStream in = null;/* w ww .java 2 s . c o m*/ try { in = new FileInputStream(file); VisioTextExtractor extractor = null; try { extractor = new VisioTextExtractor(in); } catch (Exception e) { // This can happen if the file has the "vsd" extension, but is not a Visio document throw new ParseException(file, Msg.file_corrupted.value()); } finally { in.close(); } return extractor.getText(); } catch (FileNotFoundException e) { throw new ParseException(file, Msg.file_not_found.value()); } catch (IOException ioe) { throw new ParseException(file, Msg.file_not_readable.value()); } }
From source file:net.yacy.document.parser.vsdParser.java
License:Open Source License
@Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null;/*from ww w . j a v a 2s . com*/ try { String contents = ""; SummaryInformation summary = null; try { final VisioTextExtractor extractor = new VisioTextExtractor(source); contents = extractor.getText(); summary = extractor.getSummaryInformation(); } catch (final Exception e) { ConcurrentLog.warn("vsdParser", e.getMessage()); } String author = null; String[] keywords = null; String title = null; if (summary != null) { author = summary.getAuthor(); if (summary.getKeywords() != null) { keywords = summary.getKeywords().split("[ ,;]"); } title = summary.getTitle(); } List<String> abstrct = new ArrayList<String>(); if (contents.length() > 0) abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()) .replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("\r", " ").replaceAll("\t", " ")); if (title == null) title = location.toNormalform(true); // As the result of parsing this function must return a plasmaParserDocument object return new Document[] { new Document(location, // url of the source document mimeType, // the documents mime type "UTF-8", // charset of the document text this, null, // language keywords, singleList(title), author, "", null, // an array of section headlines abstrct, // an abstract 0.0f, 0.0f, contents, // the parsed document text null, // a map of extracted anchors null, null, // a treeset of image URLs false, new Date()) }; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; // if an unexpected error occures just log the error and raise a new ParserException final String errorMsg = "Unable to parse the vsd document '" + location + "':" + e.getMessage(); AbstractParser.log.severe(errorMsg); throw new 
Parser.Failure(errorMsg, location); } finally { if (theDoc == null) { // if an unexpected error occures just log the error and raise a new Parser.Failure final String errorMsg = "Unable to parse the vsd document '" + location + "': possibly out of memory"; AbstractParser.log.severe(errorMsg); throw new Parser.Failure(errorMsg, location); } } }
From source file:org.paxle.parser.msoffice.impl.MsVisioParser.java
License:Open Source License
@Override protected void extractText(POIFSFileSystem fs, IParserDocument parserDoc) throws ParserException, IOException { // extract plain text final VisioTextExtractor parser = new VisioTextExtractor(fs); final String text = parser.getText(); if (text != null && text.length() > 0) { parserDoc.append(text);/*from ww w . j ava2s. c o m*/ } }