Example usage for org.apache.poi.hwpf.extractor WordExtractor getSummaryInformation

List of usage examples for org.apache.poi.hwpf.extractor WordExtractor getSummaryInformation

Introduction

On this page you can find example usages of org.apache.poi.hwpf.extractor.WordExtractor.getSummaryInformation().

Prototype

public SummaryInformation getSummaryInformation() 

Source Link

Document

Returns the summary information metadata for the document.

Usage

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java

License:Open Source License

/**
 * Extracts the plain text and the meta information of a Word document read from a stream.
 *
 * <p>When {@code version} equals {@code ContentHandler.VERSION_2003} the stream is read with
 * {@code WordExtractor} and the summary / document-summary information is stored on this
 * instance before the meta information is built. For every other version the stream is read
 * with {@code XWPFDocument} / {@code XWPFWordExtractor} and the core properties are stored
 * instead.
 *
 * @param in the stream the document is read from
 * @param encoding not used by this implementation
 * @return the extraction result holding the document text and the meta information map
 * @throws Exception if the document cannot be read or parsed
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
 */
public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

    String result;
    Map metaInfo;
    try {
        // first extract the text using the text extraction library
        if (version.equals(ContentHandler.VERSION_2003)) {
            WordExtractor ex = new WordExtractor(in);
            result = ex.getText();
            // extractMetaInformation() takes no arguments, so the summary data is handed
            // over through the instance fields set here
            this.m_summary = ex.getSummaryInformation();
            this.m_documentSummary = ex.getDocSummaryInformation();
            metaInfo = extractMetaInformation();
        } else {
            XWPFDocument doc = new XWPFDocument(in);
            XWPFWordExtractor ex = new XWPFWordExtractor(doc);
            result = ex.getText();
            cp = ex.getCoreProperties();
            metaInfo = extractMetaInformation();
        }
    } finally {
        // free some memory - also when the extraction failed, so that the per-document
        // state stored on this instance does not outlive the call
        cleanup();
    }

    // return the final result
    return new CmsExtractionResult(result, metaInfo);
}

From source file:com.jaeksoft.searchlib.parser.DocParser.java

License:Open Source License

/**
 * Reads a Word document from the given stream and copies its summary fields and its
 * paragraph text into the given parser result.
 *
 * <p>Title, author and subject are added only when the document provides summary
 * information. Every paragraph is split on line feeds and each fragment is added as a
 * separate content field after its consecutive spaces have been replaced.
 *
 * @param result the parser result that receives the extracted fields
 * @param inputStream the stream the document is read from
 * @throws IOException if the document cannot be read
 */
private void currentWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    WordExtractor extractor = null;
    try {
        extractor = new WordExtractor(inputStream);

        // metadata first; the summary information may be absent
        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        // then the body, one content field per line of every paragraph
        for (final String paragraph : extractor.getParagraphText()) {
            for (final String line : paragraph.split("\\n")) {
                result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(line, " "));
            }
        }
    } finally {
        IOUtils.close(extractor);
    }
}

From source file:com.opensearchserver.extractor.parser.Doc.java

License:Apache License

/**
 * Reads a Word document from the given stream, records its summary metadata in
 * {@code metas} and adds every paragraph to a new parser document.
 *
 * <p>The metadata (title, author, subject, creation date, modification date, keywords) is
 * recorded only when the document provides summary information. After all paragraphs have
 * been added, the result of {@code languageDetection(CONTENT, 10000)} is added to the
 * parser document as well.
 *
 * @param inputStream the stream the document is read from
 * @throws IOException if the document cannot be read
 */
private void currentWordExtraction(InputStream inputStream) throws IOException {
    WordExtractor extractor = null;
    try {
        extractor = new WordExtractor(inputStream);

        // metadata first; the summary information may be absent
        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            metas.add(KEYWORDS, summary.getKeywords());
        }

        // then the body, one content entry per paragraph
        final ParserDocument parsed = getNewParserDocument();
        for (final String paragraph : extractor.getParagraphText()) {
            parsed.add(CONTENT, paragraph);
        }
        parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:net.yacy.document.parser.docParser.java

License:Open Source License

/**
 * Parses a Word document read from {@code source} into a single {@code Document}.
 *
 * <p>The document content is the body text followed by the header and footer text. The
 * title is derived from the beginning of that content (at most 80 characters, with line
 * breaks and tabs replaced and runs of spaces collapsed). Keywords, subject, author and
 * company are taken from the summary information when the document provides it; a document
 * without summary information is still parsed, with those values absent.
 *
 * @param location the URL of the document; also attached to any {@code Parser.Failure}
 * @param mimeType the mime type passed on to the created {@code Document}
 * @param charset not used by this parser; the created document is always marked "UTF-8"
 * @param scraper not used by this parser
 * @param timezoneOffset not used by this parser
 * @param source the stream the document is read from
 * @return an array holding exactly one parsed document
 * @throws Parser.Failure if the extractor cannot be created or the text cannot be read
 * @throws InterruptedException declared by the overridden method
 */
@SuppressWarnings("deprecation")
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    final WordExtractor extractor;

    try {
        extractor = new WordExtractor(source);
    } catch (final Exception e) {
        throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
    }

    final StringBuilder contents = new StringBuilder(80);
    try {
        contents.append(extractor.getText().trim());
        contents.append(' ');
        contents.append(extractor.getHeaderText());
        contents.append(' ');
        contents.append(extractor.getFooterText());
    } catch (final Exception e) {
        throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
    }

    // derive a title from the beginning of the content
    String title = (contents.length() > 240) ? contents.substring(0, 240) : contents.toString().trim();
    title = title.replaceAll("\r", " ").replaceAll("\n", " ").replaceAll("\t", " ").trim();
    if (title.length() > 80)
        title = title.substring(0, 80);
    // collapse runs of spaces: repeat until a pass no longer shortens the title
    int l = title.length();
    while (true) {
        title = title.replaceAll("  ", " ");
        if (title.length() == l)
            break;
        l = title.length();
    }

    // The summary and document-summary information are optional parts of a Word file and
    // the getters return null when they are missing, so guard every access instead of
    // failing the whole parse with a NullPointerException.
    final boolean hasSummary = extractor.getSummaryInformation() != null;
    final boolean hasDocSummary = extractor.getDocSummaryInformation() != null;

    // get keywords (for yacy as array)
    final String keywords = hasSummary ? extractor.getSummaryInformation().getKeywords() : null;
    final String[] keywlist;
    if (keywords != null && !keywords.isEmpty()) {
        keywlist = CommonPattern.COMMA.split(keywords);
    } else {
        keywlist = null;
    }

    final String subject = hasSummary ? extractor.getSummaryInformation().getSubject() : null;
    List<String> descriptions = new ArrayList<String>();
    if (subject != null && !subject.isEmpty())
        descriptions.add(subject);

    final String author = hasSummary ? extractor.getSummaryInformation().getAuthor() : null;
    final String company = hasDocSummary ? extractor.getDocSummaryInformation().getCompany() : null;

    Document[] docs;
    docs = new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title),
            author, // constructor can handle null
            company, // publisher
            null, descriptions, 0.0f, 0.0f, contents.toString(), null, null, null, false, new Date()) };

    return docs;
}

From source file:org.jab.docsearch.converters.Word.java

License:Open Source License

/**
 * Reads the Word file named by {@code filename} and stores its metadata and text in
 * {@code documentAuthor}, {@code documentTitle}, {@code documentKeywords} and
 * {@code documentText}.
 *
 * <p>The metadata fields are assigned only when the file provides summary information;
 * a file without it still has its text extracted and the metadata fields keep their
 * current values.
 *
 * @throws ConverterException if {@code filename} is null or the file cannot be read or
 *         parsed
 * @see ConverterInterface#parse()
 */
@Override
public void parse() throws ConverterException {
    if (filename == null) {
        log.error("parse() filename is null");
        throw new ConverterException("Word::parse() filename is null");
    }

    // get metadata and text
    FileInputStream fin = null;
    try {
        fin = new FileInputStream(filename);

        WordExtractor we = new WordExtractor(fin);

        // get meta data; the summary information is optional and the getter returns
        // null when it is missing, which must not abort the text extraction below
        SummaryInformation si = we.getSummaryInformation();
        if (si != null) {
            documentAuthor = si.getAuthor();
            documentTitle = si.getTitle();
            documentKeywords = si.getKeywords();
        }

        // get text
        documentText = we.getText();
    } catch (IOException ioe) {
        log.error("parse() failed at Word file=" + filename, ioe);
        throw new ConverterException("Word::parse() failed at Word file=" + filename, ioe);
    } catch (Exception e) {
        log.error("parse() failed at Word file=" + filename, e);
        throw new ConverterException("Word::parse() failed", e);
    } finally {
        IOUtils.closeQuietly(fin);
    }

    if (log.isDebugEnabled()) {
        log.debug("parse() Word file='" + filename + "'" + Layout.LINE_SEP + "title='" + documentTitle + "'"
                + Layout.LINE_SEP + "author='" + documentAuthor + "'" + Layout.LINE_SEP + "keywords='"
                + documentKeywords + "'");
    }
}