List of usage examples for org.apache.poi.xwpf.extractor XWPFWordExtractor getText
public String getText()
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
private static Stream getTextFromDOCX(InputStream doc) throws GenericSearchException { long time = System.currentTimeMillis(); boolean errorFlag = Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors()); XWPFWordExtractor wordExtractor = null; try {//from w ww. j a va 2s . c om wordExtractor = new XWPFWordExtractor(OPCPackage.open(doc)); StringBuffer buffer = new StringBuffer(wordExtractor.getText().trim()); Stream stream = new Stream(); stream.write(buffer.toString().getBytes(Constants.XML_CHARACTER_ENCODING)); stream.lock(); if (logger.isDebugEnabled()) { logger.debug("extracting text from docx needed " + (System.currentTimeMillis() - time)); } return stream; } catch (Exception e) { if (errorFlag) { logger.warn("", e); return createErrorStream(docxTextExtractionErrorString); } else { throw new GenericSearchException("cannot parse docx-file", e); } } finally { wordExtractor = null; } }
From source file:edu.ur.ir.index.DefaultWordXmlTextExtractor.java
License:Apache License
/**
 * Extracts the text from a Word XML (.docx) document.
 *
 * @param f the document file to read
 * @return the extracted text, or {@code null} when the file is empty, too large,
 *         or yields no non-whitespace text
 * @throws Exception if the document cannot be opened or its text cannot be extracted
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    log.debug("Getting text for file " + f.getAbsolutePath());
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String text = null;
    OPCPackage p = null;
    try {
        p = XWPFDocument.openPackage(f.getAbsolutePath());
        XWPFDocument wordDocument = new XWPFDocument(p);
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(wordDocument);
        String myText = wordExtractor.getText();
        if (myText != null && myText.trim().length() > 0) {
            text = myText;
        }
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        log.error("could not get text for word document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        if (p != null) {
            // The package was opened from a file path; close() is meant to SAVE it and
            // can rewrite the source file. revert() releases it without writing anything.
            p.revert();
        }
    }
    return text;
}
From source file:eu.modelwriter.ide.ui.command.ExtractTextHandler.java
License:Open Source License
/**
 * Extracts text from the given .docx {@link IFile} into a workspace file with the same
 * path and a "txt" extension. An existing text file at that path is deleted first.
 * Failures are reported to the plug-in log.
 *
 * @param file
 *            the .docx {@link IFile}
 */
private void exctractDocx(final IFile file) {
    try {
        final FileInputStream fis = new FileInputStream(file.getLocation().toFile());
        try {
            final XWPFDocument docx = new XWPFDocument(fis);
            try {
                final XWPFWordExtractor we = new XWPFWordExtractor(docx);
                try {
                    final IPath textPath = file.getFullPath().removeFileExtension().addFileExtension("txt");
                    final IFile textFile = ResourcesPlugin.getWorkspace().getRoot().getFile(textPath);
                    if (textFile.exists()) {
                        textFile.delete(true, new NullProgressMonitor());
                    }
                    // NOTE(review): getBytes() uses the platform default charset - confirm
                    // this matches the encoding expected for the created workspace file.
                    textFile.create(new ByteArrayInputStream(we.getText().getBytes()), true,
                            new NullProgressMonitor());
                } finally {
                    // Closes run in finally blocks so a failure above no longer leaks
                    // the extractor, the document or the file handle.
                    we.close();
                }
            } finally {
                docx.close();
            }
        } finally {
            fis.close();
        }
    } catch (IOException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    } catch (CoreException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    }
}
From source file:eu.transkribus.languageresources.extractor.docx.DocxExtractor.java
@Override public String extractTextFromDocument(String pathToFile, String splitCharacter) { try {//from www . java2s.com XWPFDocument docx = new XWPFDocument(new FileInputStream(pathToFile)); XWPFWordExtractor we = new XWPFWordExtractor(docx); return we.getText(); } catch (IOException ex) { throw new RuntimeException("Could not find docx for given path: " + pathToFile); } }
From source file:File.DOCX.ReadDocx.java
public void ReadAll(String path, String filename) { try {//from w w w . j a v a 2 s .c o m FileInputStream fis = new FileInputStream(path + filename + ".doc"); XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis)); XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc); System.out.println(extractor.getText()); } catch (Exception ex) { ex.printStackTrace(); } }
From source file:FilesHandlers.WordHandler.java
public String[] getDocContentByLine(String docName) throws IOException { XWPFDocument docx = new XWPFDocument(new FileInputStream(workingDirectory.concat(docName))); //using XWPFWordExtractor Class XWPFWordExtractor we = new XWPFWordExtractor(docx); String content = we.getText(); int total = countOccurrences(content, '\n'); String[] res = new String[total]; int latest = 0; String row = ""; for (int i = 0; i < content.length(); i++) { if (content.charAt(i) == '\n') { res[latest] = row;// w w w . j av a 2 s . c o m row = ""; latest++; } else { row = row.concat("" + content.charAt(i)); } } return res; }
From source file:indexer.Indexer.java
/** * @param filePath//from www. j a v a2 s. c o m * @return */ // KIND OF USELESS public static String getText(String filePath) { File file; String fd = null; XWPFWordExtractor extractor; try { file = new File(filePath); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); XWPFDocument document = new XWPFDocument(fis); extractor = new XWPFWordExtractor(document); fd = extractor.getText(); } catch (IOException exep) { exep.printStackTrace(); } return fd; }
From source file:LAB.ReportCreate.java
private void ReadDocActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_ReadDocActionPerformed try {// w ww . ja va 2s . c o m JFileChooser s = new JFileChooser(); s.showOpenDialog(null); XWPFDocument d = new XWPFDocument(new FileInputStream(s.getSelectedFile())); XWPFWordExtractor extract = new XWPFWordExtractor(d); contest.setText(extract.getText()); } catch (Exception e) { JOptionPane.showMessageDialog(null, "Do you want to leave"); } }
From source file:lisa.ExtractText.java
License:Open Source License
/**
 * Returns the text of the .docx file at the given path.
 *
 * @param file path of the document to read
 * @return the extracted text, or an empty string if anything goes wrong
 *         (the exception is passed to {@code Common.createLog})
 */
private static String parseDOCX(String file) {
    try {
        BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file));
        try {
            XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
            return word.getText();
        } finally {
            // Always release the file handle, including when parsing fails.
            isr.close();
        }
    } catch (Exception e) {
        Common.createLog(e);
        return "";
    }
}
From source file:mc.program.Importer.java
public void importDOCX() { try {/*from w w w. ja v a 2 s . c om*/ // Set up objects for getting from .docx file FileInputStream fis = new FileInputStream(sourceFile.getAbsolutePath()); XWPFDocument document = new XWPFDocument(fis); XWPFWordExtractor extractor = new XWPFWordExtractor(document); // Extract text String fileData = extractor.getText(); // Put text into array list Scanner scanner = new Scanner(fileData); while (scanner.hasNext()) { sourceText.add(scanner.next()); } fis.close(); extractor.close(); } catch (Exception ex) { System.out.print(ex); } }