List of usage examples for the org.apache.poi.hwpf.extractor.Word6Extractor constructor Word6Extractor
public Word6Extractor(HWPFOldDocument doc)
From source file:com.jaeksoft.searchlib.parser.DocParser.java
License:Open Source License
/**
 * Extracts summary metadata and text from a document read with {@code Word6Extractor}
 * and records them on {@code result}.
 *
 * <p>When summary information is available, its title, author and subject are added as
 * fields. The extracted text is then split on newline characters and every fragment is
 * added as a content field after being passed through
 * {@code StringUtils.replaceConsecutiveSpaces(fragment, " ")}.
 *
 * @param result      receives the title, author, subject and content fields
 * @param inputStream stream handed to the {@code Word6Extractor} constructor
 * @throws IOException declared for failures raised while reading the document
 */
private void oldWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        for (String line : extractor.getText().split("\\n")) {
            result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(line, " "));
        }
    } finally {
        // Still null here if the constructor threw; it is handed to IOUtils.close either way.
        IOUtils.close(extractor);
    }
}
From source file:com.opensearchserver.extractor.parser.Doc.java
License:Apache License
/**
 * Extracts summary metadata and paragraph text from a document read with
 * {@code Word6Extractor}.
 *
 * <p>When summary information is available, its title, author and subject are added to
 * {@code metas}. A new parser document is then obtained from
 * {@code getNewParserDocument()}, every extracted paragraph is added to it under
 * {@code CONTENT}, and the result of {@code languageDetection(CONTENT, 10000)} is added
 * under {@code LANG_DETECTION}.
 *
 * @param inputStream stream handed to the {@code Word6Extractor} constructor
 * @throws IOException declared for failures raised while reading the document
 */
private void oldWordExtraction(InputStream inputStream) throws IOException {
    Word6Extractor extractor = null;
    try {
        extractor = new Word6Extractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
        }

        // The parser document is requested before the paragraphs are read, as in the original order.
        final ParserDocument parserDocument = getNewParserDocument();
        @SuppressWarnings("deprecation")
        final String[] paragraphs = extractor.getParagraphText();
        for (int i = 0; i < paragraphs.length; i++) {
            parserDocument.add(CONTENT, paragraphs[i]);
        }
        parserDocument.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        // Still null here if the constructor threw; it is handed to closeQuietly either way.
        IOUtils.closeQuietly(extractor);
    }
}
From source file:com.qwazr.library.poi.DocParser.java
License:Apache License
private void oldWordExtraction(final InputStream inputStream, final ParserResultBuilder resultBuilder) throws IOException { Word6Extractor word6 = null;//from w w w .j a v a2 s .c o m try { word6 = new Word6Extractor(inputStream); final ParserFieldsBuilder metas = resultBuilder.metas(); metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]); SummaryInformation si = word6.getSummaryInformation(); if (si != null) { metas.add(TITLE, si.getTitle()); metas.add(AUTHOR, si.getAuthor()); metas.add(SUBJECT, si.getSubject()); } final ParserFieldsBuilder document = resultBuilder.newDocument(); @SuppressWarnings("deprecation") String[] paragraphes = word6.getParagraphText(); if (paragraphes != null) for (String paragraph : paragraphes) document.add(CONTENT, paragraph); document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000)); } finally { IOUtils.closeQuietly(word6); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
/**
 * Emits the paragraphs of an old-format Word document as {@code <p>} elements.
 *
 * <p>An {@code HWPFOldDocument} is built from {@code root}, wrapped in a
 * {@code Word6Extractor}, and each string returned by {@code getParagraphText()} is
 * written through {@code xhtml.element("p", ...)} in the order returned.
 *
 * @param root  directory node the {@code HWPFOldDocument} is constructed from
 * @param xhtml handler that receives one {@code p} element per paragraph
 * @throws IOException   declared by this method's signature
 * @throws SAXException  declared by this method's signature
 * @throws TikaException declared by this method's signature
 */
protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    final Word6Extractor word6 = new Word6Extractor(new HWPFOldDocument(root));
    final String[] paragraphs = word6.getParagraphText();
    for (int i = 0; i < paragraphs.length; i++) {
        xhtml.element("p", paragraphs[i]);
    }
}