List of usage examples for org.apache.poi.hwpf.extractor.WordExtractor#getText()
public String getText()
From source file:avoking.com.documentos.scheduler.core.Core.java
/**
 * Reads the text of a Word document supplied as a stream.
 *
 * @param doc stream from which the POI file system is built
 * @return the text reported by the extractor's {@code getText()}
 * @throws IOException propagated from the POI constructors
 */
private String leerDoc(InputStream doc) throws IOException {
    final WordExtractor extractor = new WordExtractor(new POIFSFileSystem(doc));
    return extractor.getText();
}
From source file:axiom.util.TextExtractor.java
License:Open Source License
/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param is stream handed directly to the {@code WordExtractor} constructor
 * @return the text reported by the extractor's {@code getText()}
 * @throws Exception whatever the extractor raises while reading the stream
 */
public static String msWordExtractor(InputStream is) throws Exception {
    return new WordExtractor(is).getText();
}
From source file:com.artech.prototype2.bardakov.utils.impl.MultiParserImpl.java
/** * doc/docx/*from ww w. j a v a 2 s.c o m*/ * @param FilePath - * @return ?? ? */ private ArrayList<String> getListOfWordsFromDoc(String FilePath) { FileInputStream fis; List<String> result = new ArrayList<String>(); if (FilePath.substring(FilePath.length() - 1).equals("x")) { //is a docx try { fis = new FileInputStream(new File(FilePath)); XWPFDocument doc = new XWPFDocument(fis); XWPFWordExtractor extract = new XWPFWordExtractor(doc); // System.out.println(extract.getText()); StringBuilder builder = new StringBuilder(); builder.append(extract.getText()); String[] words = builder.toString().split(" "); for (String s : words) { result.add(s); } } catch (IOException e) { e.printStackTrace(); } } else { //is not a docx try { fis = new FileInputStream(new File(FilePath)); HWPFDocument doc = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(doc); StringBuilder builder = new StringBuilder(); builder.append(extractor.getText()); String[] words = builder.toString().split(" "); for (String s : words) { result.add(s); } } catch (IOException e) { e.printStackTrace(); } } return (ArrayList<String>) result; }
From source file:com.docdoku.server.esindexer.ESTools.java
License:Open Source License
/**
 * Converts a Microsoft Word stream to plain text.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream} and probed with
 * {@code POIFSFileSystem.hasPOIFSHeader}: when the header is present the HWPF
 * {@code WordExtractor} is used, otherwise the stream is parsed as an {@code XWPFDocument}.
 * The wrapping stream (and therefore {@code inputStream}) is closed before returning.
 *
 * @param inputStream the document content
 * @return the extracted text
 * @throws IOException if probing or reading the stream fails
 */
private static String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream buffered = new BufferedInputStream(inputStream)) {
        if (!POIFSFileSystem.hasPOIFSHeader(buffered)) {
            XWPFDocument ooxml = new XWPFDocument(buffered);
            return new XWPFWordExtractor(ooxml).getText();
        }
        return new WordExtractor(buffered).getText();
    }
}
From source file:com.docdoku.server.IndexerBean.java
License:Open Source License
@Asynchronous @Lock(LockType.WRITE)//from w w w.java2 s .com public void addToIndex(String fullName, String pathName) { IndexWriter indexWriter = null; Directory indexDir = null; try { indexDir = FSDirectory.open(new File(indexPath)); indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.LIMITED); int ext = pathName.lastIndexOf('.'); String extension = ""; if (ext != -1) { extension = pathName.substring(ext); } if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp") || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf") || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) { final StringBuilder text = new StringBuilder(); ZipInputStream zipOpenDoc = new ZipInputStream( new BufferedInputStream(new FileInputStream(pathName))); ZipEntry zipEntry; while ((zipEntry = zipOpenDoc.getNextEntry()) != null) { if (zipEntry.getName().equals("content.xml")) { SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); SAXParser parser = saxParserFactory.newSAXParser(); parser.parse(zipOpenDoc, new DefaultHandler() { @Override public void characters(char[] ch, int start, int length) throws SAXException { for (int i = start; i < start + length; i++) { text.append(ch[i]); } text.append("\r\n"); } }); break; } } zipOpenDoc.close(); Reader contentReader = new StringReader(text.toString()); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".doc")) { //MSWord Document InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName)); WordExtractor wordExtractor = new WordExtractor(wordStream); Reader contentReader = new StringReader(wordExtractor.getText()); wordStream.close(); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".ppt") || extension.equals(".pps")) { //MSPowerPoint Document InputStream pptStream = new 
BufferedInputStream(new FileInputStream(pathName)); PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream); Reader contentReader = new StringReader(pptExtractor.getText(true, true)); pptStream.close(); addDoc(indexWriter, contentReader, fullName); pptExtractor.close(); contentReader.close(); } else if (extension.equals(".txt")) { //Text Document Reader contentReader = new BufferedReader(new FileReader(pathName)); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".xls")) { //MSExcelExtractor Document //InputStream excelStream=new BufferedInputStream(new FileInputStream(pathName)); //ExcelExtractor excelExtractor= new ExcelExtractor(excelStream); //Reader contentReader=new StringReader(excelExtractor.getText()); //excelStream.close(); //addDoc(indexWriter,contentReader,fullName); //excelExtractor.close(); //contentReader.close(); } else if (extension.equals(".html") || extension.equals(".htm")) { } else if (extension.equals(".csv")) { } else if (extension.equals(".xml")) { } else if (extension.equals(".rtf")) { } else if (extension.equals(".pdf")) { } else if (extension.equals(".msg")) { } } catch (CorruptIndexException ex) { throw new EJBException(ex); } catch (LockObtainFailedException ex) { try { if (IndexWriter.isLocked(indexDir)) { IndexWriter.unlock(indexDir); } } catch (IOException pIOEx) { throw new EJBException(pIOEx); } throw new EJBException(ex); } catch (ParserConfigurationException ex) { throw new EJBException(ex); } catch (SAXException ex) { throw new EJBException(ex); } catch (IOException ex) { throw new EJBException(ex); } finally { try { if (indexWriter != null) { indexWriter.close(); } } catch (IOException ex) { throw new EJBException(ex); } } }
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java
License:Open Source License
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) *///w ww.j a v a 2 s . c om public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { // first extract the text using the text actraction libary // WordTextExtractorFactory factory = new WordTextExtractorFactory(); // TextExtractor wordExtractor = factory.textExtractor(getStreamCopy(in)); // String result = wordExtractor.getText(); // String result = wordExtractor.extractText(getStreamCopy(in)); StringBuffer content = new StringBuffer(); // org.apache.poi.hwpf // HWPFDocument doc = new HWPFDocument(in); // Range range = doc.getRange(); // int paragraphCount = range.numParagraphs();// ? // for (int i = 0; i < paragraphCount; i++) {// ????? // Paragraph pp = range.getParagraph(i); // content.append(pp.text()); // } String result = null; Map metaInfo = null; if (version.equals(ContentHandler.VERSION_2003)) { WordExtractor ex = new WordExtractor(in); result = ex.getText(); SummaryInformation info = ex.getSummaryInformation(); this.m_summary = info; this.m_documentSummary = ex.getDocSummaryInformation(); metaInfo = extractMetaInformation(); } else { XWPFDocument doc = new XWPFDocument(in); XWPFWordExtractor ex = new XWPFWordExtractor(doc); result = ex.getText(); cp = ex.getCoreProperties(); metaInfo = extractMetaInformation(); // SummaryInformation info = doc.getSummaryInformation(); // this.m_summary = info; // this.m_documentSummary = doc.getDocSummaryInformation(); } // result = removeControlChars(result); // String result = content.toString(); // now extract the meta information using POI // POIFSReader reader = new POIFSReader(); // reader.registerListener(this); // reader.read(getStreamCopy(getStreamCopy(in))); // free some memory cleanup(); // return the final result return new CmsExtractionResult(result, metaInfo); }
From source file:com.isotrol.impe3.idx.oc.extractors.ExtractorMsWord.java
License:Open Source License
/** * Extrae el texto de un fichero word./*w w w . j a va 2 s .c om*/ * @param in * @return String. Devuelve el texto crudo * @throws Exception */ public static String extractText(InputStream in) throws Exception { String result = ""; HWPFDocument doc = new HWPFDocument(in); WordExtractor we = new WordExtractor(doc); result = we.getText(); // Eliminamos los caracteres que no nos sirven para indexar. result = ExtractorUtil.removeControlChars(result); return result; }
From source file:com.jgaap.generics.DocumentHelper.java
License:Open Source License
/**
 * Extracts the text of a Word document and returns it as a character array.
 *
 * @param inputStream
 *            An input stream pointing to the Word document to be read.
 * @return the characters of the text reported by the extractor
 * @throws IOException propagated from the POI constructors
 */
static private char[] loadMSWord(InputStream inputStream) throws IOException {
    HWPFDocument document = new HWPFDocument(new POIFSFileSystem(inputStream));
    return new WordExtractor(document).getText().toCharArray();
}
From source file:com.virtusa.isq.vtaf.runtime.SeleniumTestBase.java
License:Apache License
/**
 * Reads a {@code .doc} file and returns its text.
 *
 * <p>The text is also printed to standard output. On an {@link IOException} the stack trace
 * is printed and the failure is passed to {@code reportresult} and {@code checkTrue}; the
 * method then returns whatever was read so far ({@code null} if nothing).
 *
 * @param fileName
 *            the file name
 * @return the document text, or {@code null} if reading failed before any text was extracted
 */
public final String readDocFile(final String fileName) {
    String docContent = null;
    File file = new File(fileName);
    // try-with-resources closes the stream on the failure path too; the original only
    // closed it after a successful read.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        WordExtractor we = new WordExtractor(new HWPFDocument(fis));
        docContent = we.getText();
        System.out.println("MS Word(.doc) Document Red, Content:" + docContent);
    } catch (IOException e) {
        e.printStackTrace();
        final String message =
                "CheckDocument command NODECOUNT : Execption occured. Actual error : " + e.getMessage();
        reportresult(true, "CHECK DOCUMENT :", "FAILED", message);
        checkTrue(false, false, message);
    }
    return docContent;
}
From source file:de.catma.document.source.contenthandler.Doc2TxtExtractor.java
License:Open Source License
private void extractTo(String path, String outputPath) throws IOException { BufferedInputStream bis = new BufferedInputStream(new File(path).toURI().toURL().openStream()); try {//from w w w.j a v a2 s . c o m WordExtractor we = new WordExtractor(bis); String buf = we.getText(); System.out.println("before: " + buf.length()); if (FileOSType.getFileOSType(buf).equals(FileOSType.UNIX)) { buf = FileOSType.convertUnixToDos(buf); } System.out.println("after: " + buf.length()); StringBuilder builder = new StringBuilder(); addTest(buf, 10, 20, builder); addTest(buf, 10, 80, builder); if (outputPath != null) { Writer repoSourceFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outputPath), "UTF-8")); try { repoSourceFileWriter.append(buf); repoSourceFileWriter.append(builder.toString()); } finally { if (repoSourceFileWriter != null) { repoSourceFileWriter.close(); } } } else { System.out.println(buf); System.out.println(builder.toString()); } } finally { if (bis != null) { bis.close(); } } }