List of usage examples for the org.apache.poi.hwpf.extractor.WordExtractor constructor
public WordExtractor(HWPFDocument doc)
From source file:at.tugraz.sss.serv.SSFileU.java
License:Apache License
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception { final Document document = new Document(); final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath)); final HWPFDocument word = new HWPFDocument(fs); final WordExtractor we = new WordExtractor(word); final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath); final PdfWriter writer = PdfWriter.getInstance(document, out); final Range range = word.getRange(); document.open();/* ww w . ja v a 2s .co m*/ writer.setPageEmpty(true); document.newPage(); writer.setPageEmpty(true); String[] paragraphs = we.getParagraphText(); for (int i = 0; i < paragraphs.length; i++) { org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i); // CharacterRun run = pr.getCharacterRun(i); // run.setBold(true); // run.setCapitalized(true); // run.setItalic(true); paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", ""); System.out.println("Length:" + paragraphs[i].length()); System.out.println("Paragraph" + i + ": " + paragraphs[i].toString()); // add the paragraph to the document document.add(new Paragraph(paragraphs[i])); } document.close(); }
From source file:at.tugraz.sss.serv.util.SSFileU.java
License:Apache License
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr { try {/*from www . ja va 2 s.co m*/ final Document document = new Document(); final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath)); final HWPFDocument word = new HWPFDocument(fs); final WordExtractor we = new WordExtractor(word); final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath); final PdfWriter writer = PdfWriter.getInstance(document, out); final Range range = word.getRange(); document.open(); writer.setPageEmpty(true); document.newPage(); writer.setPageEmpty(true); String[] paragraphs = we.getParagraphText(); for (int i = 0; i < paragraphs.length; i++) { org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i); // CharacterRun run = pr.getCharacterRun(i); // run.setBold(true); // run.setCapitalized(true); // run.setItalic(true); paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", ""); System.out.println("Length:" + paragraphs[i].length()); System.out.println("Paragraph" + i + ": " + paragraphs[i].toString()); // add the paragraph to the document document.add(new Paragraph(paragraphs[i])); } document.close(); } catch (Exception error) { SSServErrReg.regErrThrow(error); } }
From source file:avoking.com.documentos.scheduler.core.Core.java
/**
 * Reads a Word document from the given stream and returns its text.
 *
 * @param doc stream from which the POIFS file system is built
 * @return the text reported by {@code WordExtractor.getText()}
 * @throws IOException if the stream cannot be read as a POIFS file system
 */
private String leerDoc(InputStream doc) throws IOException {
    return new WordExtractor(new POIFSFileSystem(doc)).getText();
}
From source file:axiom.util.TextExtractor.java
License:Open Source License
/**
 * Returns the text of the Word document supplied on the given stream.
 *
 * @param is stream handed to the {@code WordExtractor} constructor
 * @return the text reported by {@code WordExtractor.getText()}
 * @throws Exception if the extractor cannot be created from the stream
 */
public static String msWordExtractor(InputStream is) throws Exception {
    final WordExtractor extractor = new WordExtractor(is);
    final String text = extractor.getText();
    return text;
}
From source file:br.com.schumaker.beta.doc.ReadDocMaster.java
public static void main(String[] args) { try {// w w w . j a v a2 s. c om File file = new File( "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc"); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument doc = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(doc); for (String rawText : extractor.getParagraphText()) { String text = extractor.stripFields(rawText); if (text.length() > 10) System.out.println(text.trim()); } } catch (Exception exep) { } }
From source file:com.artech.prototype2.bardakov.utils.impl.MultiParserImpl.java
/** * doc/docx//from w ww . j a va 2 s . c o m * @param FilePath - * @return ?? ? */ private ArrayList<String> getListOfWordsFromDoc(String FilePath) { FileInputStream fis; List<String> result = new ArrayList<String>(); if (FilePath.substring(FilePath.length() - 1).equals("x")) { //is a docx try { fis = new FileInputStream(new File(FilePath)); XWPFDocument doc = new XWPFDocument(fis); XWPFWordExtractor extract = new XWPFWordExtractor(doc); // System.out.println(extract.getText()); StringBuilder builder = new StringBuilder(); builder.append(extract.getText()); String[] words = builder.toString().split(" "); for (String s : words) { result.add(s); } } catch (IOException e) { e.printStackTrace(); } } else { //is not a docx try { fis = new FileInputStream(new File(FilePath)); HWPFDocument doc = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(doc); StringBuilder builder = new StringBuilder(); builder.append(extractor.getText()); String[] words = builder.toString().split(" "); for (String s : words) { result.add(s); } } catch (IOException e) { e.printStackTrace(); } } return (ArrayList<String>) result; }
From source file:com.docdoku.server.esindexer.ESTools.java
License:Open Source License
/**
 * Returns the text of a Word document, using the binary extractor when the stream starts
 * with a POIFS header and the XWPF extractor otherwise.
 *
 * @param inputStream stream containing the document; it is wrapped in a buffered stream that
 *                    is closed before this method returns
 * @return the extracted text
 * @throws IOException if the stream cannot be read
 */
private static String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        if (!POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            XWPFDocument ooxmlDocument = new XWPFDocument(wordStream);
            return new XWPFWordExtractor(ooxmlDocument).getText();
        }
        return new WordExtractor(wordStream).getText();
    }
}
From source file:com.docdoku.server.IndexerBean.java
License:Open Source License
@Asynchronous @Lock(LockType.WRITE)//w w w. j a v a 2s . c o m public void addToIndex(String fullName, String pathName) { IndexWriter indexWriter = null; Directory indexDir = null; try { indexDir = FSDirectory.open(new File(indexPath)); indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.LIMITED); int ext = pathName.lastIndexOf('.'); String extension = ""; if (ext != -1) { extension = pathName.substring(ext); } if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp") || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf") || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) { final StringBuilder text = new StringBuilder(); ZipInputStream zipOpenDoc = new ZipInputStream( new BufferedInputStream(new FileInputStream(pathName))); ZipEntry zipEntry; while ((zipEntry = zipOpenDoc.getNextEntry()) != null) { if (zipEntry.getName().equals("content.xml")) { SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); SAXParser parser = saxParserFactory.newSAXParser(); parser.parse(zipOpenDoc, new DefaultHandler() { @Override public void characters(char[] ch, int start, int length) throws SAXException { for (int i = start; i < start + length; i++) { text.append(ch[i]); } text.append("\r\n"); } }); break; } } zipOpenDoc.close(); Reader contentReader = new StringReader(text.toString()); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".doc")) { //MSWord Document InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName)); WordExtractor wordExtractor = new WordExtractor(wordStream); Reader contentReader = new StringReader(wordExtractor.getText()); wordStream.close(); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".ppt") || extension.equals(".pps")) { //MSPowerPoint Document InputStream pptStream = new 
BufferedInputStream(new FileInputStream(pathName)); PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream); Reader contentReader = new StringReader(pptExtractor.getText(true, true)); pptStream.close(); addDoc(indexWriter, contentReader, fullName); pptExtractor.close(); contentReader.close(); } else if (extension.equals(".txt")) { //Text Document Reader contentReader = new BufferedReader(new FileReader(pathName)); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".xls")) { //MSExcelExtractor Document //InputStream excelStream=new BufferedInputStream(new FileInputStream(pathName)); //ExcelExtractor excelExtractor= new ExcelExtractor(excelStream); //Reader contentReader=new StringReader(excelExtractor.getText()); //excelStream.close(); //addDoc(indexWriter,contentReader,fullName); //excelExtractor.close(); //contentReader.close(); } else if (extension.equals(".html") || extension.equals(".htm")) { } else if (extension.equals(".csv")) { } else if (extension.equals(".xml")) { } else if (extension.equals(".rtf")) { } else if (extension.equals(".pdf")) { } else if (extension.equals(".msg")) { } } catch (CorruptIndexException ex) { throw new EJBException(ex); } catch (LockObtainFailedException ex) { try { if (IndexWriter.isLocked(indexDir)) { IndexWriter.unlock(indexDir); } } catch (IOException pIOEx) { throw new EJBException(pIOEx); } throw new EJBException(ex); } catch (ParserConfigurationException ex) { throw new EJBException(ex); } catch (SAXException ex) { throw new EJBException(ex); } catch (IOException ex) { throw new EJBException(ex); } finally { try { if (indexWriter != null) { indexWriter.close(); } } catch (IOException ex) { throw new EJBException(ex); } } }
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java
License:Open Source License
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) *///from ww w . j a v a 2s.com public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { // first extract the text using the text actraction libary // WordTextExtractorFactory factory = new WordTextExtractorFactory(); // TextExtractor wordExtractor = factory.textExtractor(getStreamCopy(in)); // String result = wordExtractor.getText(); // String result = wordExtractor.extractText(getStreamCopy(in)); StringBuffer content = new StringBuffer(); // org.apache.poi.hwpf // HWPFDocument doc = new HWPFDocument(in); // Range range = doc.getRange(); // int paragraphCount = range.numParagraphs();// ? // for (int i = 0; i < paragraphCount; i++) {// ????? // Paragraph pp = range.getParagraph(i); // content.append(pp.text()); // } String result = null; Map metaInfo = null; if (version.equals(ContentHandler.VERSION_2003)) { WordExtractor ex = new WordExtractor(in); result = ex.getText(); SummaryInformation info = ex.getSummaryInformation(); this.m_summary = info; this.m_documentSummary = ex.getDocSummaryInformation(); metaInfo = extractMetaInformation(); } else { XWPFDocument doc = new XWPFDocument(in); XWPFWordExtractor ex = new XWPFWordExtractor(doc); result = ex.getText(); cp = ex.getCoreProperties(); metaInfo = extractMetaInformation(); // SummaryInformation info = doc.getSummaryInformation(); // this.m_summary = info; // this.m_documentSummary = doc.getDocSummaryInformation(); } // result = removeControlChars(result); // String result = content.toString(); // now extract the meta information using POI // POIFSReader reader = new POIFSReader(); // reader.registerListener(this); // reader.read(getStreamCopy(getStreamCopy(in))); // free some memory cleanup(); // return the final result return new CmsExtractionResult(result, metaInfo); }
From source file:com.isotrol.impe3.idx.oc.extractors.ExtractorMsWord.java
License:Open Source License
/** * Extrae el texto de un fichero word./*from www .j a v a 2 s.c om*/ * @param in * @return String. Devuelve el texto crudo * @throws Exception */ public static String extractText(InputStream in) throws Exception { String result = ""; HWPFDocument doc = new HWPFDocument(in); WordExtractor we = new WordExtractor(doc); result = we.getText(); // Eliminamos los caracteres que no nos sirven para indexar. result = ExtractorUtil.removeControlChars(result); return result; }