List of usage examples for org.apache.poi.hwpf.extractor.WordExtractor#getDocSummaryInformation()
public DocumentSummaryInformation getDocSummaryInformation()
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java
License:Open Source License
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) *//*from ww w. j a va2s . c om*/ public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { // first extract the text using the text actraction libary // WordTextExtractorFactory factory = new WordTextExtractorFactory(); // TextExtractor wordExtractor = factory.textExtractor(getStreamCopy(in)); // String result = wordExtractor.getText(); // String result = wordExtractor.extractText(getStreamCopy(in)); StringBuffer content = new StringBuffer(); // org.apache.poi.hwpf // HWPFDocument doc = new HWPFDocument(in); // Range range = doc.getRange(); // int paragraphCount = range.numParagraphs();// ? // for (int i = 0; i < paragraphCount; i++) {// ????? // Paragraph pp = range.getParagraph(i); // content.append(pp.text()); // } String result = null; Map metaInfo = null; if (version.equals(ContentHandler.VERSION_2003)) { WordExtractor ex = new WordExtractor(in); result = ex.getText(); SummaryInformation info = ex.getSummaryInformation(); this.m_summary = info; this.m_documentSummary = ex.getDocSummaryInformation(); metaInfo = extractMetaInformation(); } else { XWPFDocument doc = new XWPFDocument(in); XWPFWordExtractor ex = new XWPFWordExtractor(doc); result = ex.getText(); cp = ex.getCoreProperties(); metaInfo = extractMetaInformation(); // SummaryInformation info = doc.getSummaryInformation(); // this.m_summary = info; // this.m_documentSummary = doc.getDocSummaryInformation(); } // result = removeControlChars(result); // String result = content.toString(); // now extract the meta information using POI // POIFSReader reader = new POIFSReader(); // reader.registerListener(this); // reader.read(getStreamCopy(getStreamCopy(in))); // free some memory cleanup(); // return the final result return new CmsExtractionResult(result, metaInfo); }
From source file:net.yacy.document.parser.docParser.java
License:Open Source License
@SuppressWarnings("deprecation") @Override//from w w w .j ava2s. co m public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { final WordExtractor extractor; try { extractor = new WordExtractor(source); } catch (final Exception e) { throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location); } final StringBuilder contents = new StringBuilder(80); try { contents.append(extractor.getText().trim()); contents.append(' '); contents.append(extractor.getHeaderText()); contents.append(' '); contents.append(extractor.getFooterText()); } catch (final Exception e) { throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location); } String title = (contents.length() > 240) ? contents.substring(0, 240) : contents.toString().trim(); title = title.replaceAll("\r", " ").replaceAll("\n", " ").replaceAll("\t", " ").trim(); if (title.length() > 80) title = title.substring(0, 80); int l = title.length(); while (true) { title = title.replaceAll(" ", " "); if (title.length() == l) break; l = title.length(); } // get keywords (for yacy as array) final String keywords = extractor.getSummaryInformation().getKeywords(); final String[] keywlist; if (keywords != null && !keywords.isEmpty()) { keywlist = CommonPattern.COMMA.split(keywords); } else { keywlist = null; } final String subject = extractor.getSummaryInformation().getSubject(); List<String> descriptions = new ArrayList<String>(); if (subject != null && !subject.isEmpty()) descriptions.add(subject); Document[] docs; docs = new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title), extractor.getSummaryInformation().getAuthor(), // constuctor can handle null extractor.getDocSummaryInformation().getCompany(), // publisher null, descriptions, 0.0f, 0.0f, 
contents.toString(), null, null, null, false, new Date()) }; return docs; }