Example usage for org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()

List of usage examples for org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()

Introduction

On this page you can find usage examples for the getText() method of org.apache.poi.xwpf.extractor.XWPFWordExtractor.

Prototype

public String getText() 

Source Link

Usage

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the text of a DOCX document and returns it as a locked {@code Stream}.
 *
 * <p>The extracted text is trimmed and encoded with
 * {@code Constants.XML_CHARACTER_ENCODING}. If extraction fails and the
 * "ignore text extraction errors" configuration value parses to {@code true},
 * the failure is logged and the result of
 * {@code createErrorStream(docxTextExtractionErrorString)} is returned instead.
 *
 * @param doc stream delivering the DOCX content
 * @return a locked stream containing the extracted text, or the error stream
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromDOCX(InputStream doc) throws GenericSearchException {
    long time = System.currentTimeMillis();
    boolean errorFlag = Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    OPCPackage pkg = null;
    try {
        pkg = OPCPackage.open(doc);
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(pkg);
        String text = wordExtractor.getText().trim();
        Stream stream = new Stream();
        stream.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        stream.lock();
        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from docx needed " + (System.currentTimeMillis() - time));
        }
        return stream;
    } catch (Exception e) {
        if (errorFlag) {
            logger.warn("", e);
            return createErrorStream(docxTextExtractionErrorString);
        } else {
            throw new GenericSearchException("cannot parse docx-file", e);
        }
    } finally {
        // Release the package; revert() closes it without trying to save anything.
        if (pkg != null) {
            try {
                pkg.revert();
            } catch (RuntimeException closeFailure) {
                // A failure while releasing must not replace the real result or exception.
                logger.warn("", closeFailure);
            }
        }
    }
}

From source file:edu.ur.ir.index.DefaultWordXmlTextExtractor.java

License:Apache License

/**
 * Extracts the text from a Word OOXML (.docx) document using the XWPF API.
 *
 * <p>Returns {@code null} without opening the file when {@code isFileTooLarge(f)}
 * is true or the file is empty, and also when the extracted text is
 * {@code null} or blank.
 *
 * @param f the document to read
 * @return the extracted text, or {@code null} as described above
 * @throws Exception any failure raised while opening or parsing the document;
 *         it is logged and rethrown unchanged
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    log.debug("Getting text for file " + f.getAbsolutePath());
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String text = null;
    OPCPackage p = null;
    try {
        p = XWPFDocument.openPackage(f.getAbsolutePath());
        XWPFDocument wordDocument = new XWPFDocument(p);
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(wordDocument);

        String myText = wordExtractor.getText();
        if (myText != null && !myText.trim().equals("")) {
            text = myText;
        }
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw (oome);
    } catch (Exception e) {
        log.error("could not get text for word document " + f.getAbsolutePath(), e);
        throw (e);
    } finally {
        if (p != null) {
            try {
                // revert() closes the package without writing it back. close() on a
                // package opened read/write from a path saves it to that file, which
                // a read-only text extraction must not do.
                p.revert();
            } catch (RuntimeException e) {
                log.debug(e);
            }
        }
    }
    return text;
}

From source file:eu.modelwriter.ide.ui.command.ExtractTextHandler.java

License:Open Source License

/**
 * Extracts text from the given .docx {@link IFile} and stores it in a sibling
 * workspace file with the same name and a "txt" extension, replacing any
 * existing file of that name. Failures are logged through the plug-in log.
 *
 * @param file
 *            the .docx {@link IFile}
 */
private void exctractDocx(final IFile file) {
    try {
        final String text = readDocxText(file);
        final IPath textPath = file.getFullPath().removeFileExtension().addFileExtension("txt");
        final IFile textFile = ResourcesPlugin.getWorkspace().getRoot().getFile(textPath);
        if (textFile.exists()) {
            textFile.delete(true, new NullProgressMonitor());
        }
        // NOTE(review): getBytes() uses the platform default charset — confirm this
        // matches the charset the workspace expects for the created file.
        textFile.create(new ByteArrayInputStream(text.getBytes()), true, new NullProgressMonitor());
    } catch (IOException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    } catch (CoreException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    }
}

/**
 * Reads the text of the given .docx file, closing the extractor, the document
 * and the underlying stream even when reading fails.
 *
 * @param file
 *            the .docx {@link IFile}
 * @return the text returned by the extractor
 * @throws IOException
 *             if the file cannot be opened or parsed
 */
private String readDocxText(final IFile file) throws IOException {
    final FileInputStream fis = new FileInputStream(file.getLocation().toFile());
    try {
        final XWPFDocument docx = new XWPFDocument(fis);
        try {
            final XWPFWordExtractor we = new XWPFWordExtractor(docx);
            try {
                return we.getText();
            } finally {
                we.close();
            }
        } finally {
            docx.close();
        }
    } finally {
        fis.close();
    }
}

From source file:eu.transkribus.languageresources.extractor.docx.DocxExtractor.java

/**
 * Returns the text of the .docx file at the given path.
 *
 * @param pathToFile path of the .docx file to read
 * @param splitCharacter not used by this implementation
 * @return the text returned by the extractor
 * @throws RuntimeException if the file cannot be opened or read; the
 *         underlying {@link IOException} is attached as the cause
 */
@Override
public String extractTextFromDocument(String pathToFile, String splitCharacter) {
    try {
        FileInputStream in = new FileInputStream(pathToFile);
        try {
            XWPFDocument docx = new XWPFDocument(in);
            XWPFWordExtractor we = new XWPFWordExtractor(docx);
            return we.getText();
        } finally {
            // Release the file handle whether or not parsing succeeded.
            in.close();
        }
    } catch (IOException ex) {
        // Keep the original cause so a parse failure can be told apart from a missing file.
        throw new RuntimeException("Could not find docx for given path: " + pathToFile, ex);
    }
}

From source file:File.DOCX.ReadDocx.java

/**
 * Reads the document at {@code path + filename + ".doc"} with the XWPF API
 * and prints its text to standard output. Any failure is reported by printing
 * the stack trace.
 *
 * @param path directory prefix, concatenated as-is in front of the file name
 * @param filename file name without the extension
 */
public void ReadAll(String path, String filename) {
    try {
        // NOTE(review): the ".doc" suffix is used with the OOXML (XWPF) reader —
        // confirm the files really are OOXML documents.
        FileInputStream fis = new FileInputStream(path + filename + ".doc");
        try {
            XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis));
            XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
            System.out.println(extractor.getText());
        } finally {
            // Release the file handle whether or not parsing succeeded.
            fis.close();
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:FilesHandlers.WordHandler.java

/**
 * Returns the text of a .docx file split into rows at each {@code '\n'}.
 *
 * <p>The file is located by concatenating {@code workingDirectory} and
 * {@code docName}. The result array is sized by
 * {@code countOccurrences(content, '\n')}; text after the last newline is not
 * included in the result.
 *
 * @param docName file name appended to the working directory
 * @return one entry per newline-terminated row, without the newline
 * @throws IOException if the file cannot be opened or read
 */
public String[] getDocContentByLine(String docName) throws IOException {
    String content;
    FileInputStream in = new FileInputStream(workingDirectory.concat(docName));
    try {
        XWPFDocument docx = new XWPFDocument(in);

        //using XWPFWordExtractor Class
        XWPFWordExtractor we = new XWPFWordExtractor(docx);
        content = we.getText();
    } finally {
        // Release the file handle whether or not parsing succeeded.
        in.close();
    }

    // presumably countOccurrences returns the number of '\n' characters — verify against its definition
    int total = countOccurrences(content, '\n');
    String[] res = new String[total];

    // Slice each row directly out of the text instead of rebuilding it one
    // character at a time, which was quadratic in the row length.
    int latest = 0;
    int start = 0;
    int end;
    while ((end = content.indexOf('\n', start)) >= 0) {
        res[latest] = content.substring(start, end);
        latest++;
        start = end + 1;
    }

    return res;
}

From source file:indexer.Indexer.java

/**
 * Returns the text of the .docx file at the given path.
 *
 * @param filePath path of the .docx file to read
 * @return the text returned by the extractor, or {@code null} if an
 *         {@link IOException} occurred (its stack trace is printed)
 */
// KIND OF USELESS
public static String getText(String filePath) {
    String fd = null;
    try {
        File file = new File(filePath);
        FileInputStream fis = new FileInputStream(file.getAbsolutePath());
        try {
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            fd = extractor.getText();
        } finally {
            // Release the file handle whether or not parsing succeeded.
            fis.close();
        }
    } catch (IOException exep) {
        exep.printStackTrace();
    }
    return fd;
}

From source file:LAB.ReportCreate.java

/**
 * Lets the user pick a file, reads it as a .docx document and shows its text
 * in {@code contest}. Any exception results in a message dialog.
 *
 * @param evt the action event that triggered this handler (not used)
 */
private void ReadDocActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_ReadDocActionPerformed
    try {
        JFileChooser s = new JFileChooser();
        // NOTE(review): the dialog result is ignored; when no file is selected
        // getSelectedFile() is null and the resulting exception triggers the
        // message dialog below — confirm this is the intended cancel behaviour.
        s.showOpenDialog(null);
        FileInputStream in = new FileInputStream(s.getSelectedFile());
        try {
            XWPFDocument d = new XWPFDocument(in);
            XWPFWordExtractor extract = new XWPFWordExtractor(d);
            contest.setText(extract.getText());
        } finally {
            // Release the file handle whether or not parsing succeeded.
            in.close();
        }

    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Do you want to leave");
    }

}

From source file:lisa.ExtractText.java

License:Open Source License

/**
 * Returns the text of the .docx file at the given path.
 *
 * @param file path of the .docx file to read
 * @return the text returned by the extractor, or an empty string if any
 *         exception occurred (the exception is passed to {@code Common.createLog})
 */
private static String parseDOCX(String file) {
    try {
        BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file));
        try {
            XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
            return word.getText();
        } finally {
            // Release the file handle whether or not parsing succeeded.
            isr.close();
        }
    } catch (Exception e) {
        Common.createLog(e);
        return "";
    }
}

From source file:mc.program.Importer.java

/**
 * Reads {@code sourceFile} as a .docx document and appends each
 * whitespace-delimited token of its text to {@code sourceText}. Any exception
 * is printed to standard output.
 */
public void importDOCX() {
    try {
        // Set up objects for getting from .docx file
        FileInputStream fis = new FileInputStream(sourceFile.getAbsolutePath());
        try {
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            try {
                // Extract text
                String fileData = extractor.getText();

                // Put text into array list
                Scanner scanner = new Scanner(fileData);
                while (scanner.hasNext()) {
                    sourceText.add(scanner.next());
                }
            } finally {
                // Close on failure as well as on success.
                extractor.close();
            }
        } finally {
            fis.close();
        }
    } catch (Exception ex) {
        System.out.print(ex);
    }
}