Example usage for org.apache.poi.hwpf.extractor WordExtractor getHeaderText

List of usage examples for org.apache.poi.hwpf.extractor WordExtractor getHeaderText

Introduction

On this page you can find example usages of org.apache.poi.hwpf.extractor WordExtractor getHeaderText.

Prototype

@Deprecated
public String getHeaderText() 

Source Link

Document

Grab the text from the headers

Usage

From source file:net.yacy.document.parser.docParser.java

License:Open Source License

/**
 * Parses a binary Word document from {@code source} into a single-element
 * {@link Document} array.
 *
 * <p>The document text is the trimmed body text followed by the header text and
 * the footer text, separated by single spaces. The title is derived from the
 * beginning of that text (at most 80 characters, whitespace collapsed).
 * Keywords, subject, author and company are read from the OLE2 summary
 * property sets when those are available.
 *
 * @param location       URL of the document; passed to the resulting {@link Document}
 *                       and to any {@link Parser.Failure} that is raised
 * @param mimeType       MIME type passed to the resulting {@link Document}
 * @param charset        not used by this method; the result is always declared as UTF-8
 * @param scraper        not used by this method
 * @param timezoneOffset not used by this method
 * @param source         stream the Word document is read from
 * @return an array containing exactly one parsed document
 * @throws Parser.Failure if the extractor cannot be created or the text cannot be extracted
 */
@SuppressWarnings("deprecation")
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    final WordExtractor extractor;
    try {
        extractor = new WordExtractor(source);
    } catch (final Exception e) {
        throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
    }

    final StringBuilder contents = new StringBuilder(80);
    try {
        contents.append(extractor.getText().trim());
        contents.append(' ');
        contents.append(extractor.getHeaderText());
        contents.append(' ');
        contents.append(extractor.getFooterText());
    } catch (final Exception e) {
        throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
    }

    // Derive the title from the leading part of the text: flatten line breaks and
    // tabs, cut to 80 characters, then collapse runs of spaces.
    String title = (contents.length() > 240) ? contents.substring(0, 240) : contents.toString().trim();
    title = title.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').trim();
    if (title.length() > 80) {
        title = title.substring(0, 80);
    }
    // single pass; same result as repeatedly replacing two spaces by one until stable
    title = title.replaceAll(" {2,}", " ");

    // The summary property sets may be missing or unreadable, in which case the
    // extractor returns null for them; treat that as "no metadata" instead of
    // failing with a NullPointerException.
    final boolean hasSummary = extractor.getSummaryInformation() != null;
    final boolean hasDocSummary = extractor.getDocSummaryInformation() != null;

    // get keywords (for yacy as array)
    final String keywords = hasSummary ? extractor.getSummaryInformation().getKeywords() : null;
    final String[] keywlist;
    if (keywords != null && !keywords.isEmpty()) {
        keywlist = CommonPattern.COMMA.split(keywords);
    } else {
        keywlist = null;
    }

    final String subject = hasSummary ? extractor.getSummaryInformation().getSubject() : null;
    final List<String> descriptions = new ArrayList<String>();
    if (subject != null && !subject.isEmpty()) {
        descriptions.add(subject);
    }

    final String author = hasSummary ? extractor.getSummaryInformation().getAuthor() : null;
    final String publisher = hasDocSummary ? extractor.getDocSummaryInformation().getCompany() : null;

    return new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title),
            author, // constructor can handle null
            publisher, // company is used as publisher
            null, descriptions, 0.0f, 0.0f, contents.toString(), null, null, null, false, new Date()) };
}

From source file:org.olat.search.service.document.file.WordDocument.java

License:Apache License

/**
 * Appends the textual content of a Word document to {@code sb}.
 *
 * <p>The order is: header text, body paragraphs, footnotes, comments,
 * endnotes, footer text. Header and footer go through {@code addTextIfAny};
 * every other piece is appended followed by a single space.
 *
 * @param filesystem the OLE2 file system the Word document is read from
 * @param sb         the buffer that receives the extracted text
 * @throws IOException if the extractor cannot read the document
 */
private void collectWordDocument(final POIFSFileSystem filesystem, final StringBuilder sb) throws IOException {
    final WordExtractor wordExtractor = new WordExtractor(filesystem);

    addTextIfAny(sb, wordExtractor.getHeaderText());

    appendEachFollowedBySpace(sb, wordExtractor.getParagraphText());
    appendEachFollowedBySpace(sb, wordExtractor.getFootnoteText());
    appendEachFollowedBySpace(sb, wordExtractor.getCommentsText());
    appendEachFollowedBySpace(sb, wordExtractor.getEndnoteText());

    addTextIfAny(sb, wordExtractor.getFooterText());
}

/** Appends every element of {@code pieces} to {@code target}, each followed by one space. */
private void appendEachFollowedBySpace(final StringBuilder target, final String[] pieces) {
    for (int i = 0; i < pieces.length; i++) {
        target.append(pieces[i]);
        target.append(' ');
    }
}

From source file:uk.ac.liverpool.MSOffice.MSWord.java

License:Open Source License

/**
 * Renders the text of the Word document attached to {@code parent} as a
 * simple HTML page.
 *
 * <p>The page contains a fixed style sheet and title, then one {@code <p>}
 * element for the header text and one for each body paragraph, footnote and
 * endnote, in that order. All text taken from the document is HTML-escaped
 * before it is inserted, so characters such as {@code <} or {@code &} in the
 * document cannot break or inject markup.
 *
 * @param parent node whose document holds the {@code HWPFDocument} under the key "worddoc"
 * @return the complete HTML page as a string
 */
private String toHTML(INode parent) {
    HWPFDocument wor = (HWPFDocument) parent.getDocument().getValue("worddoc");
    WordExtractor wx = new WordExtractor(wor);
    StringBuilder b = new StringBuilder();

    // NOTE(review): "14pts" in the body rule is not a valid CSS unit (probably
    // meant "14pt"); left unchanged here because fixing it alters rendering.
    b.append("<html><head>" + "<META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">"
            + "<style type=\"text/css\">\n" + "body {\n" + "   color: black; background-color: white;\n"
            + "   font-size: 14pts;\n" + "   padding: 10px;}\n" + "\n" + "a:link { color: blue; }\n"
            + "a:visited { color: magenta; }\n" + "a:hover { color: red; }\n" + "a:active { color: red; }\n"
            + "\n" + "a:link, a:visited, \n" + "a:active, a:hover {\n" + "   text-decoration: underline;\n"
            + "}\n" + "\n" + "p {\n" + "   margin-top: 10px;\n" + "}\n" + "text { padding: 5px; }\n" + "\n"
            + "pre { font-family: monospace; }\n" + "\n\n"
            + "h1 { font-size: 24pt; font-weight: bold; margin: 10px 0px; }\n"
            + "h2 { font-size: 18pt; font-weight: bold; margin: 9px 0px; }\n"
            + "h3 { font-size: 14pt; font-weight: bold; margin: 7px 0px; }\n"
            + "h4 { font-size: 12pt; font-weight: bold; margin: 6px 0px; }\n"
            + "h5 { font-size: 10pt; font-weight: bold; margin: 5px 0px; }\n"
            + "h6 { font-size:  9pt; font-weight: bold; margin: 5px 0px; }\n" + "</style>");
    b.append("<title>").append("Text extracion contents of the word document (APACHE POI):").append("</title>");
    b.append("</head>\n");
    b.append("<body>\n");

    // Document text is untrusted content: escape it before embedding in markup.
    b.append("<p>").append(escapeHtmlText(wx.getHeaderText())).append("</p>\n");

    ArrayList<String> text = new ArrayList<String>();
    text.addAll(Arrays.asList(wx.getParagraphText()));
    text.addAll(Arrays.asList(wx.getFootnoteText()));
    text.addAll(Arrays.asList(wx.getEndnoteText()));

    for (String p : text) {
        b.append("<p>").append(escapeHtmlText(p)).append("</p>\n");
    }
    b.append("</body></html>");
    return b.toString();
}

/**
 * Escapes the characters that are significant in HTML text and attribute
 * values ({@code & < > "}). A {@code null} input yields an empty string.
 */
private String escapeHtmlText(String raw) {
    if (raw == null) {
        return "";
    }
    StringBuilder out = new StringBuilder(raw.length() + 16);
    for (int i = 0; i < raw.length(); i++) {
        char c = raw.charAt(i);
        switch (c) {
        case '&':
            out.append("&amp;");
            break;
        case '<':
            out.append("&lt;");
            break;
        case '>':
            out.append("&gt;");
            break;
        case '"':
            out.append("&quot;");
            break;
        default:
            out.append(c);
            break;
        }
    }
    return out.toString();
}