Example usage for the org.apache.poi.hwpf.extractor.Word6Extractor constructor

List of usage examples for the org.apache.poi.hwpf.extractor.Word6Extractor constructor

Introduction

On this page you can find example usages of the org.apache.poi.hwpf.extractor.Word6Extractor constructor.

Prototype

public Word6Extractor(HWPFOldDocument doc) 

Source Link

Document

Create a new Word Extractor

Usage

From source file:com.jaeksoft.searchlib.parser.DocParser.java

License:Open Source License

/**
 * Reads the given stream with a {@link Word6Extractor} and copies its metadata and text
 * into {@code result}.
 *
 * <p>When summary information is available, its title, author and subject are added as
 * fields. The extracted text is then split on newlines and every fragment is added as a
 * content field after being passed through {@code StringUtils.replaceConsecutiveSpaces}
 * with a single space as replacement.
 *
 * @param result      receiver of the extracted fields
 * @param inputStream stream handed to the {@code Word6Extractor} constructor
 * @throws IOException declared by this method; propagated from the extractor calls
 */
private void oldWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    // Declared outside the try so the finally block can close it; it is still null
    // there if the constructor throws.
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        // One content field per newline-separated fragment.
        final String[] fragments = extractor.getText().split("\\n");
        for (int i = 0; i < fragments.length; i++) {
            result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(fragments[i], " "));
        }
    } finally {
        IOUtils.close(extractor);
    }
}

From source file:com.opensearchserver.extractor.parser.Doc.java

License:Apache License

/**
 * Reads the given stream with a {@link Word6Extractor}, records its summary metadata in
 * {@code metas} and stores its paragraphs in a new parser document.
 *
 * <p>Title, author and subject are added only when summary information is present. Each
 * paragraph returned by the extractor is added under {@code CONTENT}, followed by the
 * result of {@code languageDetection(CONTENT, 10000)} under {@code LANG_DETECTION}.
 *
 * @param inputStream stream handed to the {@code Word6Extractor} constructor
 * @throws IOException declared by this method; propagated from the extractor calls
 */
private void oldWordExtraction(InputStream inputStream) throws IOException {
    // Declared outside the try so the finally block can close it; it is still null
    // there if the constructor throws.
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
        }

        final ParserDocument parserDocument = getNewParserDocument();
        // getParagraphText() is deprecated in POI, hence the local suppression.
        @SuppressWarnings("deprecation")
        final String[] paragraphs = extractor.getParagraphText();
        for (int i = 0; i < paragraphs.length; i++) {
            parserDocument.add(CONTENT, paragraphs[i]);
        }
        parserDocument.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:com.qwazr.library.poi.DocParser.java

License:Apache License

/**
 * Reads the given stream with a {@link Word6Extractor} and feeds its metadata and
 * paragraphs into {@code resultBuilder}.
 *
 * <p>The MIME type meta is set to {@code DEFAULT_MIMETYPES[0]}. Title, author and subject
 * are added only when summary information is present. A new document is then created on
 * the builder; every paragraph (if the paragraph array is non-null) is added under
 * {@code CONTENT}, followed by the result of
 * {@code languageDetection(document, CONTENT, 10000)} under {@code LANG_DETECTION}.
 *
 * @param inputStream   stream handed to the {@code Word6Extractor} constructor
 * @param resultBuilder builder receiving the metas and the new document
 * @throws IOException declared by this method; propagated from the extractor calls
 */
private void oldWordExtraction(final InputStream inputStream, final ParserResultBuilder resultBuilder)
        throws IOException {
    // Declared outside the try so the finally block can close it; it is still null
    // there if the constructor throws.
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final ParserFieldsBuilder metaFields = resultBuilder.metas();
        metaFields.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metaFields.add(TITLE, summary.getTitle());
            metaFields.add(AUTHOR, summary.getAuthor());
            metaFields.add(SUBJECT, summary.getSubject());
        }

        final ParserFieldsBuilder documentFields = resultBuilder.newDocument();
        // getParagraphText() is deprecated in POI, hence the local suppression.
        @SuppressWarnings("deprecation")
        final String[] paragraphs = extractor.getParagraphText();
        if (paragraphs != null) {
            for (int i = 0; i < paragraphs.length; i++) {
                documentFields.add(CONTENT, paragraphs[i]);
            }
        }
        documentFields.add(LANG_DETECTION, languageDetection(documentFields, CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Builds an {@link HWPFOldDocument} from the given directory node, extracts its
 * paragraphs with a {@link Word6Extractor} and emits each one as a {@code <p>} element
 * on the supplied handler.
 *
 * @param root  directory node the old-format document is constructed from
 * @param xhtml handler that receives one {@code p} element per paragraph
 * @throws IOException   declared by this method; see the POI constructors
 * @throws SAXException  declared by this method; see {@code xhtml.element}
 * @throws TikaException declared by this method
 */
protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    final Word6Extractor word6 = new Word6Extractor(new HWPFOldDocument(root));

    final String[] paragraphs = word6.getParagraphText();
    for (int i = 0; i < paragraphs.length; i++) {
        xhtml.element("p", paragraphs[i]);
    }
}