Example usage for org.apache.poi.hwpf.extractor WordExtractor getDocSummaryInformation

List of usage examples for org.apache.poi.hwpf.extractor WordExtractor getDocSummaryInformation

Introduction

On this page you can find example usages of org.apache.poi.hwpf.extractor.WordExtractor.getDocSummaryInformation().

Prototype

public DocumentSummaryInformation getDocSummaryInformation() 

Source Link

Document

Returns the document summary information metadata of the document.

Usage

From source file: com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java

License:Open Source License

/**
 * Extracts the plain text and the meta information from an MS Word document.<p>
 *
 * If this extractor's {@code version} equals {@code ContentHandler.VERSION_2003}, the stream is
 * read with the HWPF {@code WordExtractor} and its summary / document summary information is
 * stored on this instance. Otherwise the stream is read as an {@code XWPFDocument} and the core
 * properties of the {@code XWPFWordExtractor} are stored instead. In both cases
 * {@code extractMetaInformation()} is called afterwards to build the meta information map
 * (presumably from the properties stored just before - confirm against that method).<p>
 *
 * {@code cleanup()} is invoked in a {@code finally} block, so the per-document state held by
 * this instance is released even when the extraction fails.<p>
 *
 * @param in the stream to read the Word document from
 * @param encoding not used by this implementation
 *
 * @return the extracted text together with the extracted meta information
 *
 * @throws Exception whatever the POI classes or {@code extractMetaInformation()} throw while
 *         the document is processed
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
 */
public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

    String result;
    Map metaInfo;
    try {
        if (version.equals(ContentHandler.VERSION_2003)) {
            // HWPF based extraction
            WordExtractor ex = new WordExtractor(in);
            result = ex.getText();
            this.m_summary = ex.getSummaryInformation();
            this.m_documentSummary = ex.getDocSummaryInformation();
        } else {
            // XWPF based extraction
            XWPFWordExtractor ex = new XWPFWordExtractor(new XWPFDocument(in));
            result = ex.getText();
            cp = ex.getCoreProperties();
        }
        // must run before cleanup(), while the document properties are still set
        metaInfo = extractMetaInformation();
    } finally {
        // free some memory, also when the extraction failed half way through
        cleanup();
    }

    // return the final result
    return new CmsExtractionResult(result, metaInfo);
}

From source file: net.yacy.document.parser.docParser.java

License:Open Source License

/**
 * Parses an MS Word document with POI's {@code WordExtractor} and wraps the result in a
 * single {@code Document}.
 *
 * The document text is the trimmed body text followed by the header text and the footer text,
 * each separated by one blank. The title is derived from the beginning of that text: line
 * breaks and tabs become blanks, the result is cut to at most 80 characters and runs of
 * blanks are collapsed. Keywords, subject, author and company are taken from the document's
 * summary information when it is available; otherwise they are passed on as {@code null}
 * (or, for the subject, left out of the description list).
 *
 * @param location the URL of the document; passed to the {@code Document} and to any failure
 * @param mimeType the mime type that is passed on to the {@code Document}
 * @param charset not used by this implementation; the {@code Document} is created with "UTF-8"
 * @param scraper not used by this implementation
 * @param timezoneOffset not used by this implementation
 * @param source the stream to read the Word document from
 * @return an array holding exactly one {@code Document}
 * @throws Parser.Failure if the extractor cannot be created or the text cannot be read
 * @throws InterruptedException declared by the signature; not thrown by the code in this method
 */
@SuppressWarnings("deprecation")
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // NOTE(review): the extractor is never closed in this method - confirm whether the POI
    // version in use requires an explicit close().
    final WordExtractor extractor;
    try {
        extractor = new WordExtractor(source);
    } catch (final Exception e) {
        throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
    }

    final StringBuilder contents = new StringBuilder(80);
    try {
        contents.append(extractor.getText().trim());
        contents.append(' ');
        contents.append(extractor.getHeaderText());
        contents.append(' ');
        contents.append(extractor.getFooterText());
    } catch (final Exception e) {
        throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
    }

    // build the title from the beginning of the text
    String title = (contents.length() > 240) ? contents.substring(0, 240) : contents.toString().trim();
    title = title.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').trim();
    if (title.length() > 80) {
        title = title.substring(0, 80);
    }
    // collapse every run of blanks into a single blank
    title = title.replaceAll(" {2,}", " ");

    // The summary property streams are optional; per the POI Javadoc the getters may return
    // null when the information could not be read, so guard every access.
    String keywords = null;
    String subject = null;
    String author = null;
    if (extractor.getSummaryInformation() != null) {
        keywords = extractor.getSummaryInformation().getKeywords();
        subject = extractor.getSummaryInformation().getSubject();
        author = extractor.getSummaryInformation().getAuthor();
    }
    String publisher = null;
    if (extractor.getDocSummaryInformation() != null) {
        publisher = extractor.getDocSummaryInformation().getCompany();
    }

    // get keywords (for yacy as array)
    final String[] keywlist;
    if (keywords != null && !keywords.isEmpty()) {
        keywlist = CommonPattern.COMMA.split(keywords);
    } else {
        keywlist = null;
    }

    final List<String> descriptions = new ArrayList<String>();
    if (subject != null && !subject.isEmpty()) {
        descriptions.add(subject);
    }

    final Document doc = new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title),
            author, // constructor can handle null
            publisher, // publisher
            null, descriptions, 0.0f, 0.0f, contents.toString(), null, null, null, false, new Date());

    return new Document[] { doc };
}