Example usage for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.xwpf.extractor.XWPFWordExtractor.getText().

Prototype

public String getText()

Source Link

Usage

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the text of a docx document read from {@code doc} and returns it as a
 * locked {@code Stream}.
 *
 * The extracted text is trimmed and encoded with
 * {@code Constants.XML_CHARACTER_ENCODING} before being written to the stream.
 *
 * @param doc the docx content to parse
 * @return the stream holding the extracted text, or the stream produced by
 *         {@code createErrorStream(docxTextExtractionErrorString)} when extraction
 *         fails and the configuration says text-extraction errors are ignored
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromDOCX(InputStream doc) throws GenericSearchException {
    final long startMillis = System.currentTimeMillis();
    final boolean ignoreErrors = Boolean
            .parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    try {
        final XWPFWordExtractor extractor = new XWPFWordExtractor(OPCPackage.open(doc));
        final String text = extractor.getText().trim();

        final Stream result = new Stream();
        result.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        result.lock();

        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from docx needed " + (System.currentTimeMillis() - startMillis));
        }
        return result;
    } catch (Exception e) {
        // Strict mode: surface the failure to the caller with its cause attached.
        if (!ignoreErrors) {
            throw new GenericSearchException("cannot parse docx-file", e);
        }
        // Lenient mode: record the failure and hand back the error placeholder stream.
        logger.warn("", e);
        return createErrorStream(docxTextExtractionErrorString);
    }
}

From source file:edu.ur.ir.index.DefaultWordXmlTextExtractor.java

License:Apache License

/**
 * Extracts the text of a Word document by opening the file as an OPC package
 * and reading it through {@link XWPFDocument} and {@link XWPFWordExtractor}.
 *
 * @param f the file to read
 * @return the extracted text, or {@code null} when the file is empty, when
 *         {@code isFileTooLarge(f)} reports it as too large, or when the
 *         extracted text is {@code null} or only whitespace
 * @throws Exception if the document cannot be opened or read; the failure is
 *         logged before it is rethrown (an {@link OutOfMemoryError} is logged
 *         and rethrown as well)
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    log.debug("Getting text for file " + f.getAbsolutePath());
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String extracted = null;
    OPCPackage pkg = null;
    try {
        pkg = XWPFDocument.openPackage(f.getAbsolutePath());
        XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(pkg));

        String raw = extractor.getText();
        boolean blank = raw == null || raw.trim().equals("");
        if (!blank) {
            extracted = raw;
        }
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        log.error("could not get text for word document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        // Always release the package, whether extraction succeeded or not.
        if (pkg != null) {
            try {
                pkg.close();
            } catch (IOException closeFailure) {
                log.debug(closeFailure);
            }
        }
    }
    return extracted;
}

From source file:eu.modelwriter.ide.ui.command.ExtractTextHandler.java

License:Open Source License

/**
 * Extracts text from the given .docx {@link IFile} and stores it in a workspace
 * file with the same path but a {@code txt} extension.
 *
 * An existing file at the target path is deleted before the new one is created.
 * {@link IOException} and {@link CoreException} failures are reported to the
 * plug-in log and not propagated.
 *
 * @param file
 *            the .docx {@link IFile}
 */
private void exctractDocx(final IFile file) {
    // try-with-resources closes the extractor, the document and the stream even when
    // extraction or file creation fails; the original closed them only on success.
    try (FileInputStream fis = new FileInputStream(file.getLocation().toFile());
            XWPFDocument docx = new XWPFDocument(fis);
            XWPFWordExtractor we = new XWPFWordExtractor(docx)) {
        final IPath textPath = file.getFullPath().removeFileExtension().addFileExtension("txt");
        final IFile textFile = ResourcesPlugin.getWorkspace().getRoot().getFile(textPath);
        if (textFile.exists()) {
            textFile.delete(true, new NullProgressMonitor());
        }
        // NOTE(review): getBytes() encodes with the platform default charset - confirm
        // this matches the charset expected for the created workspace file.
        textFile.create(new ByteArrayInputStream(we.getText().getBytes()), true, new NullProgressMonitor());
    } catch (IOException | CoreException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    }
}

From source file:eu.transkribus.languageresources.extractor.docx.DocxExtractor.java

/**
 * Reads the docx file at {@code pathToFile} and returns its text as produced by
 * {@link XWPFWordExtractor#getText()}.
 *
 * @param pathToFile path of the docx file to read
 * @param splitCharacter not used by this implementation
 * @return the extracted text
 * @throws RuntimeException if the file cannot be opened or read; the underlying
 *         {@link IOException} is attached as the cause
 */
@Override
public String extractTextFromDocument(String pathToFile, String splitCharacter) {
    // The stream is closed on every path; the original never closed it.
    try (FileInputStream in = new FileInputStream(pathToFile)) {
        XWPFDocument docx = new XWPFDocument(in);
        XWPFWordExtractor we = new XWPFWordExtractor(docx);
        return we.getText();
    } catch (IOException ex) {
        // Keep the original exception as the cause so the real failure stays diagnosable.
        throw new RuntimeException("Could not find docx for given path: " + pathToFile, ex);
    }
}

From source file:File.DOCX.ReadDocx.java

/**
 * Prints the text of the document at {@code path + filename + ".doc"} to
 * standard output.
 *
 * Any exception raised while opening or reading the document is caught and its
 * stack trace is printed; nothing is propagated to the caller.
 *
 * NOTE(review): the path always gets a ".doc" suffix although the content is
 * parsed with {@code XWPFDocument} - confirm the suffix is intended.
 *
 * @param path directory prefix, concatenated as-is in front of {@code filename}
 * @param filename file name without the ".doc" suffix
 */
public void ReadAll(String path, String filename) {
    // The stream is closed on every path; the original never closed it.
    try (FileInputStream fis = new FileInputStream(path + filename + ".doc")) {
        XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis));
        XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
        System.out.println(extractor.getText());
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:FilesHandlers.WordHandler.java

/**
 * Reads the document named {@code docName} from {@code workingDirectory} and
 * returns its extracted text split at {@code '\n'} characters.
 *
 * Each array element is one line without its terminating newline. Text after
 * the last newline, if any, is returned as a final element (previously it was
 * silently dropped).
 *
 * @param docName file name, concatenated as-is onto {@code workingDirectory}
 * @return the lines of the document text, in order; empty if the text is empty
 * @throws IOException if the file cannot be opened or read
 */
public String[] getDocContentByLine(String docName) throws IOException {
    String content;
    // The stream is closed on every path; the original never closed it.
    try (FileInputStream in = new FileInputStream(workingDirectory.concat(docName))) {
        XWPFDocument docx = new XWPFDocument(in);
        XWPFWordExtractor we = new XWPFWordExtractor(docx);
        content = we.getText();
    }

    // Count the line terminators to size the result array.
    int newlines = 0;
    for (int at = content.indexOf('\n'); at >= 0; at = content.indexOf('\n', at + 1)) {
        newlines++;
    }

    // A non-empty remainder after the last '\n' is kept as an extra trailing element.
    boolean hasTail = content.lastIndexOf('\n') < content.length() - 1;
    String[] res = new String[hasTail ? newlines + 1 : newlines];

    // Slice with indexOf/substring instead of per-character String.concat,
    // which was quadratic in the line length.
    int start = 0;
    for (int i = 0; i < newlines; i++) {
        int end = content.indexOf('\n', start);
        res[i] = content.substring(start, end);
        start = end + 1;
    }
    if (hasTail) {
        res[newlines] = content.substring(start);
    }

    return res;
}

From source file:indexer.Indexer.java

/**
 * Reads the document at {@code filePath} and returns its text as produced by
 * {@code XWPFWordExtractor.getText()}.
 *
 * @param filePath path of the document to read
 * @return the extracted text, or {@code null} if an {@link IOException} occurred
 *         (its stack trace is printed)
 */
// KIND OF USELESS
public static String getText(String filePath) {
    String fd = null;
    File file = new File(filePath);
    // The stream is closed on every path; the original never closed it.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        XWPFDocument document = new XWPFDocument(fis);
        XWPFWordExtractor extractor = new XWPFWordExtractor(document);
        fd = extractor.getText();
    } catch (IOException exep) {
        exep.printStackTrace();
    }
    return fd;
}

From source file:LAB.ReportCreate.java

/**
 * Lets the user pick a file, extracts its text with {@code XWPFWordExtractor}
 * and shows that text in {@code contest}.
 *
 * Nothing happens when the user dismisses the chooser without approving a file.
 * If opening or reading the file fails, a message dialog is shown.
 *
 * @param evt the triggering action event (not used)
 */
private void ReadDocActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_ReadDocActionPerformed
    try {
        JFileChooser s = new JFileChooser();
        // Honour cancel/close: the original ignored the result and went on to read
        // whatever getSelectedFile() returned, including null.
        if (s.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
            return;
        }
        // The stream is closed on every path; the original never closed it.
        try (FileInputStream in = new FileInputStream(s.getSelectedFile())) {
            XWPFDocument d = new XWPFDocument(in);
            XWPFWordExtractor extract = new XWPFWordExtractor(d);
            contest.setText(extract.getText());
        }
    } catch (Exception e) {
        // NOTE(review): this text does not describe a read failure - confirm the intended wording.
        JOptionPane.showMessageDialog(null, "Do you want to leave");
    }

}

From source file:lisa.ExtractText.java

License:Open Source License

/**
 * Reads the docx file at the given path and returns its text as produced by
 * {@code XWPFWordExtractor.getText()}.
 *
 * @param file path of the docx file to read
 * @return the extracted text, or an empty string if any exception occurred
 *         (the exception is passed to {@code Common.createLog})
 */
private static String parseDOCX(String file) {
    // The stream is closed on every path; the original never closed it.
    try (BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file))) {
        XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
        return word.getText();
    } catch (Exception e) {
        Common.createLog(e);
        return "";
    }
}

From source file:mc.program.Importer.java

/**
 * Reads the text of {@code sourceFile} and appends each whitespace-delimited
 * token (as returned by {@link Scanner#next()}) to {@code sourceText}.
 *
 * Any exception is caught and printed to standard output; nothing is propagated.
 */
public void importDOCX() {
    // try-with-resources closes the extractor and the stream even when reading
    // fails; the original closed them only on the success path.
    try (FileInputStream fis = new FileInputStream(sourceFile.getAbsolutePath());
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(fis))) {
        // Extract text
        String fileData = extractor.getText();

        // Put text into array list
        try (Scanner scanner = new Scanner(fileData)) {
            while (scanner.hasNext()) {
                sourceText.add(scanner.next());
            }
        }
    } catch (Exception ex) {
        System.out.print(ex);
    }
}