List of usage examples for org.apache.poi.hwpf.extractor WordExtractor getHeaderText
@Deprecated
public String getHeaderText()
From source file:net.yacy.document.parser.docParser.java
License:Open Source License
@SuppressWarnings("deprecation") @Override/* ww w . j av a2 s . c om*/ public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { final WordExtractor extractor; try { extractor = new WordExtractor(source); } catch (final Exception e) { throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location); } final StringBuilder contents = new StringBuilder(80); try { contents.append(extractor.getText().trim()); contents.append(' '); contents.append(extractor.getHeaderText()); contents.append(' '); contents.append(extractor.getFooterText()); } catch (final Exception e) { throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location); } String title = (contents.length() > 240) ? contents.substring(0, 240) : contents.toString().trim(); title = title.replaceAll("\r", " ").replaceAll("\n", " ").replaceAll("\t", " ").trim(); if (title.length() > 80) title = title.substring(0, 80); int l = title.length(); while (true) { title = title.replaceAll(" ", " "); if (title.length() == l) break; l = title.length(); } // get keywords (for yacy as array) final String keywords = extractor.getSummaryInformation().getKeywords(); final String[] keywlist; if (keywords != null && !keywords.isEmpty()) { keywlist = CommonPattern.COMMA.split(keywords); } else { keywlist = null; } final String subject = extractor.getSummaryInformation().getSubject(); List<String> descriptions = new ArrayList<String>(); if (subject != null && !subject.isEmpty()) descriptions.add(subject); Document[] docs; docs = new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title), extractor.getSummaryInformation().getAuthor(), // constuctor can handle null extractor.getDocSummaryInformation().getCompany(), // publisher null, descriptions, 0.0f, 0.0f, 
contents.toString(), null, null, null, false, new Date()) }; return docs; }
From source file:org.olat.search.service.document.file.WordDocument.java
License:Apache License
private void collectWordDocument(final POIFSFileSystem filesystem, final StringBuilder sb) throws IOException { final WordExtractor extractor = new WordExtractor(filesystem); addTextIfAny(sb, extractor.getHeaderText()); for (final String paragraph : extractor.getParagraphText()) { sb.append(paragraph).append(' '); }//from w w w. j a v a 2 s . c o m for (final String paragraph : extractor.getFootnoteText()) { sb.append(paragraph).append(' '); } for (final String paragraph : extractor.getCommentsText()) { sb.append(paragraph).append(' '); } for (final String paragraph : extractor.getEndnoteText()) { sb.append(paragraph).append(' '); } addTextIfAny(sb, extractor.getFooterText()); }
From source file:uk.ac.liverpool.MSOffice.MSWord.java
License:Open Source License
/**
 * Renders the text of the Word document attached to {@code parent} as a simple HTML page.
 *
 * <p>The {@code HWPFDocument} is looked up under the key {@code "worddoc"} on the node's
 * document. The page contains the header text, then the body paragraphs, the footnotes
 * and the endnotes, each wrapped in its own {@code <p>} element. All extracted text is
 * HTML-escaped before it is inserted, so characters such as {@code <} or {@code &} in
 * the Word document cannot break or inject markup.
 *
 * @param parent the node whose document holds the Word document
 * @return the complete HTML page as a string
 */
private String toHTML(INode parent) {
    HWPFDocument wor = (HWPFDocument) parent.getDocument().getValue("worddoc");
    WordExtractor wx = new WordExtractor(wor);

    StringBuilder b = new StringBuilder();
    // NOTE(review): "14pts" below is not a valid CSS length unit ("pt" is); the string is
    // left unchanged here because altering it would change how the page renders.
    b.append("<html><head>"
            + "<META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
            + "<style type=\"text/css\">\n"
            + "body {\n"
            + " color: black; background-color: white;\n"
            + " font-size: 14pts;\n"
            + " padding: 10px;}\n"
            + "\n"
            + "a:link { color: blue; }\n"
            + "a:visited { color: magenta; }\n"
            + "a:hover { color: red; }\n"
            + "a:active { color: red; }\n"
            + "\n"
            + "a:link, a:visited, \n"
            + "a:active, a:hover {\n"
            + " text-decoration: underline;\n"
            + "}\n"
            + "\n"
            + "p {\n"
            + " margin-top: 10px;\n"
            + "}\n"
            + "text { padding: 5px; }\n"
            + "\n"
            + "pre { font-family: monospace; }\n"
            + "\n\n"
            + "h1 { font-size: 24pt; font-weight: bold; margin: 10px 0px; }\n"
            + "h2 { font-size: 18pt; font-weight: bold; margin: 9px 0px; }\n"
            + "h3 { font-size: 14pt; font-weight: bold; margin: 7px 0px; }\n"
            + "h4 { font-size: 12pt; font-weight: bold; margin: 6px 0px; }\n"
            + "h5 { font-size: 10pt; font-weight: bold; margin: 5px 0px; }\n"
            + "h6 { font-size: 9pt; font-weight: bold; margin: 5px 0px; }\n"
            + "</style>");
    b.append("<title>").append("Text extracion contents of the word document (APACHE POI):").append("</title>");
    b.append("</head>\n");
    b.append("<body>\n");

    b.append("<p>").append(escapeHtml(wx.getHeaderText())).append("</p>\n");
    appendEscapedParagraphs(b, wx.getParagraphText());
    appendEscapedParagraphs(b, wx.getFootnoteText());
    appendEscapedParagraphs(b, wx.getEndnoteText());

    b.append("</body></html>");
    return b.toString();
}

/** Appends each entry of {@code paragraphs} to {@code out} as an HTML-escaped {@code <p>} element. */
private void appendEscapedParagraphs(StringBuilder out, String[] paragraphs) {
    for (String paragraph : paragraphs) {
        out.append("<p>").append(escapeHtml(paragraph)).append("</p>\n");
    }
}

/** Replaces the HTML-significant characters in {@code text} with entities; {@code null} yields an empty string. */
private String escapeHtml(String text) {
    if (text == null) {
        return "";
    }
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            case '\'':
                escaped.append("&#39;");
                break;
            default:
                escaped.append(c);
                break;
        }
    }
    return escaped.toString();
}