List of usage examples for the method org.apache.poi.hwpf.extractor.WordExtractor#getFooterText()
@Deprecated
public String getFooterText()
From source file: net.yacy.document.parser.docParser.java
License:Open Source License
@SuppressWarnings("deprecation") @Override//ww w. ja v a 2s. c o m public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { final WordExtractor extractor; try { extractor = new WordExtractor(source); } catch (final Exception e) { throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location); } final StringBuilder contents = new StringBuilder(80); try { contents.append(extractor.getText().trim()); contents.append(' '); contents.append(extractor.getHeaderText()); contents.append(' '); contents.append(extractor.getFooterText()); } catch (final Exception e) { throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location); } String title = (contents.length() > 240) ? contents.substring(0, 240) : contents.toString().trim(); title = title.replaceAll("\r", " ").replaceAll("\n", " ").replaceAll("\t", " ").trim(); if (title.length() > 80) title = title.substring(0, 80); int l = title.length(); while (true) { title = title.replaceAll(" ", " "); if (title.length() == l) break; l = title.length(); } // get keywords (for yacy as array) final String keywords = extractor.getSummaryInformation().getKeywords(); final String[] keywlist; if (keywords != null && !keywords.isEmpty()) { keywlist = CommonPattern.COMMA.split(keywords); } else { keywlist = null; } final String subject = extractor.getSummaryInformation().getSubject(); List<String> descriptions = new ArrayList<String>(); if (subject != null && !subject.isEmpty()) descriptions.add(subject); Document[] docs; docs = new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title), extractor.getSummaryInformation().getAuthor(), // constuctor can handle null extractor.getDocSummaryInformation().getCompany(), // publisher null, descriptions, 0.0f, 0.0f, 
contents.toString(), null, null, null, false, new Date()) }; return docs; }
From source file: org.olat.search.service.document.file.WordDocument.java
License:Apache License
/**
 * Appends the textual content of a Word document to {@code sb}.
 *
 * <p>The parts are written in this order: header text, paragraphs, footnotes, comments,
 * endnotes, footer text. Header and footer go through {@code addTextIfAny}; every other
 * entry is appended followed by a single space.
 *
 * @param filesystem the POI filesystem the Word document is read from
 * @param sb         the builder that receives the extracted text
 * @throws IOException declared by the signature; presumably raised when the document cannot be read
 */
private void collectWordDocument(final POIFSFileSystem filesystem, final StringBuilder sb) throws IOException {
    final WordExtractor wordExtractor = new WordExtractor(filesystem);

    addTextIfAny(sb, wordExtractor.getHeaderText());

    final String[] paragraphs = wordExtractor.getParagraphText();
    for (int i = 0; i < paragraphs.length; i++) {
        sb.append(paragraphs[i]);
        sb.append(' ');
    }

    final String[] footnotes = wordExtractor.getFootnoteText();
    for (int i = 0; i < footnotes.length; i++) {
        sb.append(footnotes[i]);
        sb.append(' ');
    }

    final String[] comments = wordExtractor.getCommentsText();
    for (int i = 0; i < comments.length; i++) {
        sb.append(comments[i]);
        sb.append(' ');
    }

    final String[] endnotes = wordExtractor.getEndnoteText();
    for (int i = 0; i < endnotes.length; i++) {
        sb.append(endnotes[i]);
        sb.append(' ');
    }

    addTextIfAny(sb, wordExtractor.getFooterText());
}