Example usage for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.xwpf.extractor.XWPFWordExtractor.getText().

Prototype

public String getText()

Source Link

Usage

From source file:me.philnate.textmanager.utils.WordCount.java

License:Open Source License

/**
 * opens the given file, if it's a .doc or .docx file and returns the number
 * of words within the document/*from   w ww  .j  a va 2  s. co m*/
 * 
 * @param file
 * @return
 * @throws FileNotFoundException
 * @throws IOException
 */
public static long countFile(File file) throws FileNotFoundException, IOException {
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        if (file.getName().endsWith(".docx")) {
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            return linecount(extractor.getText());
        } else if (file.getName().endsWith(".doc")) {
            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            return WordCount.linecount(extractor.getText());
        } else {
            throw new IllegalArgumentException("Can't handle non doc(X) files");
        }
    }
}

From source file:modelo.Lectura.java

/**
 * Reads a .docx stream and stores the extracted text through
 * {@code setTextoDeDocx}. Any failure is reported on standard output and
 * not rethrown.
 *
 * @param docx stream supplying the .docx content
 */
private void leerDocx(InputStream docx) {
    try {
        // Build a document that POI understands from the stream, then
        // create the extractor that pulls the text content out of it.
        XWPFDocument documento = new XWPFDocument(docx);
        XWPFWordExtractor extractor = new XWPFWordExtractor(documento);
        String texto = extractor.getText();
        setTextoDeDocx(texto);
    } catch (Exception e) {
        // NOTE(review): JOptionPane.ERROR_MESSAGE is an int constant, so its
        // numeric value is appended to the printed text. This looks like a
        // leftover from a message-dialog call - confirm the intent.
        String mensaje = "Fallo al leer del archivo.\n" + e.toString() + "Error en archivo"
                + javax.swing.JOptionPane.ERROR_MESSAGE;
        System.out.println(mensaje);
    }
}

From source file:myexamples.WordDocsExamples.Test1.java

/**
 * Shows a Swing file chooser and, if the user approves a selection, prints
 * the selected file's name followed by the text extracted from it as a
 * .docx document. Does nothing when the dialog is not approved.
 *
 * @throws IOException if the selected file cannot be opened or parsed
 */
public static void simpleFileReading() throws IOException {
    JFileChooser chooser = new JFileChooser();
    if (chooser.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
        return;
    }
    System.out.println(chooser.getSelectedFile().getName());
    // try-with-resources releases the file handle even when parsing fails.
    try (FileInputStream fis = new FileInputStream(chooser.getSelectedFile())) {
        XWPFWordExtractor extract = new XWPFWordExtractor(new XWPFDocument(fis));
        System.out.println(extract.getText());
    }
}

From source file:net.ontopia.topicmaps.classify.OOXMLWordFormatModule.java

License:Apache License

/**
 * Extracts the text of the given content, read as an OOXML Word package,
 * and passes it to the handler as a single region named "document".
 *
 * @param cc      supplies the raw content through {@code getContent()}
 * @param handler receives the startRegion / text / endRegion calls
 * @throws OntopiaRuntimeException wrapping any exception raised while
 *             opening the package, extracting text or calling the handler
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        ByteArrayInputStream bytes = new ByteArrayInputStream(cc.getContent());
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(OPCPackage.open(bytes));
        char[] chars = wordExtractor.getText().toCharArray();
        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Extracts the text of every given file, tokenizes it with a
 * {@code StandardAnalyzer} configured with the default English stop set,
 * and builds two lookups:
 * <ul>
 * <li>{@code lookup1}: token -> names of the files containing it</li>
 * <li>{@code lookup2}: file name -> tokens found in that file</li>
 * </ul>
 * Each (token, file name) pair is stored at most once per lookup.
 * Files that cannot be parsed are reported on standard output and
 * contribute no tokens.
 *
 * @param listOfFile the files to process
 * @return a {@code TextExtractPar} built from the two lookups
 * @throws FileNotFoundException if one of the files cannot be opened
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    for (File file : listOfFile) {

        // Progress report: print j % when the running file counter equals
        // the file count that corresponds to (j + 1) percent of the input.
        // NOTE(review): `counter` is declared outside this method
        // (presumably a static field shared between calls) - confirm.
        for (int j = 0; j < 100; j++) {
            if (counter == (int) ((j + 1) * listOfFile.length / 100)) {
                System.out.println("Number of files read equals " + j + " %");
                break;
            }
        }

        counter++;

        // The stream is always closed once the file's text has been
        // extracted, whichever format branch handled it and even when a
        // parser throws an unchecked exception.
        List<String> lines;
        FileInputStream fis = new FileInputStream(file);
        try {
            lines = extractLinesFromFile(file, fis);
        } finally {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read stream fails.
            }
        }

        for (String line : lines) {

            CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();

            // A standard tokenizer that eliminates stop words is used; a
            // stemming tokenizer such as Porter could be used instead.
            // The English noise set removes words such as "the", "a", etc.
            Analyzer analyzer = new StandardAnalyzer(noise);
            List<String> tokens = Tokenizer.tokenizeString(analyzer, line);

            for (String token : tokens) {

                // Record each token only once per file.
                if (!lookup2.get(file.getName()).contains(token)) {
                    lookup2.put(file.getName(), token);
                }

                // Record each file only once per token.
                if (!lookup1.get(token).contains(file.getName())) {
                    lookup1.put(token, file.getName());
                }
            }
        }
    }

    return new TextExtractPar(lookup1, lookup2);
}

/**
 * Returns the textual content of one file as a list of strings, choosing
 * the parser from the file name's extension. Returns an empty list when
 * the file cannot be parsed.
 */
private static List<String> extractLinesFromFile(File file, FileInputStream fis) {
    List<String> lines = new ArrayList<String>();
    String name = file.getName();

    if (name.endsWith(".docx")) {
        // Whole document text is added as a single entry.
        try {
            XWPFWordExtractor ex = new XWPFWordExtractor(new XWPFDocument(fis));
            lines.add(ex.getText());
        } catch (IOException e) {
            System.out.println("File not read: " + file.getName());
        }

    } else if (name.endsWith(".pptx")) {
        try {
            XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(OPCPackage.open(fis));
            lines.add(xw.getText());
        } catch (XmlException | IOException | OpenXML4JException e) {
            System.out.println("File not read: " + file.getName());
        }

    } else if (name.endsWith(".xlsx")) {
        try {
            XSSFExcelExtractor xe = new XSSFExcelExtractor(OPCPackage.open(fis));
            lines.add(xe.getText());
        } catch (XmlException | IOException | OpenXML4JException e) {
            // InvalidFormatException is a subclass of OpenXML4JException
            // and is therefore covered here as well.
            System.out.println("File not read: " + file.getName());
        }

    } else if (name.endsWith(".doc")) {
        // The legacy format is read from the File directly, not from fis;
        // one entry is added per paragraph.
        // NOTE(review): the NPOIFSFileSystem is not closed here - confirm
        // whether the POI version in use exposes close() and add it.
        try {
            NPOIFSFileSystem fs = new NPOIFSFileSystem(file);
            WordExtractor extractor = new WordExtractor(fs.getRoot());
            for (String rawText : extractor.getParagraphText()) {
                lines.add(extractor.stripFields(rawText));
            }
        } catch (IOException e) {
            System.out.println("File not read: " + file.getName());
        }

    } else if (name.endsWith(".pdf")) {
        try {
            PDFParser parser = new PDFParser(fis);
            parser.parse();
            PDDocument pdf = new PDDocument(parser.getDocument());
            try {
                lines.add(new PDFTextStripper().getText(pdf));
            } finally {
                // Release the resources held by the parsed document.
                pdf.close();
            }
        } catch (IOException e) {
            System.out.println("File not read: " + file.getName());
        }

    } else if (name.endsWith(".gif") || name.endsWith(".jpeg") || name.endsWith(".wmv")
            || name.endsWith(".mpeg") || name.endsWith(".mp4")) {
        // Media files are indexed by their file name only. The extensions
        // must be alternatives: a name cannot end with all of them at once.
        lines.add(file.getName());

    } else {
        // Any other extension is treated as raw UTF-8 text, one entry per line.
        try {
            return Files.readLines(file, Charsets.UTF_8);
        } catch (IOException e) {
            System.out.println("File not read: " + file.getName());
        }
    }

    return lines;
}

From source file:org.encuestame.business.search.IndexerFile.java

License:Apache License

/**
 * Extract word document content./*from   w ww  .j  a v a2  s .  com*/
 * @param wde
 * @return
 */
public static String extractContentWordDocument(final XWPFWordExtractor wde) {
    String bodyText = null;
    try {
        bodyText = wde.getText();
    } catch (Exception e) {
        log.error("ERROR extracting content Word Document-------->" + e);
    }
    return bodyText;
}

From source file:org.encuestame.business.search.SearchUtils.java

License:Apache License

/**
 * Create Document Word.
 * Extracts the text of the given Word (.docx) file and passes the file and
 * its text to {@code SearchUtils.addFields}. If the text cannot be
 * extracted, the failure is logged at debug level and {@code null} is
 * passed as the body text.
 *
 * @param file {@link File} to read
 * @return {@link Document} returned by {@code SearchUtils.addFields}
 * @throws POIXMLException kept from the original signature
 * @throws Exception if the file cannot be opened, the stream cannot be
 *             closed, or {@code addFields} fails
 */
public static Document createWordDocument(final File file) throws POIXMLException, Exception {
    String bodyText = null;
    // Opened outside the try so a missing file still propagates to the caller.
    InputStream is = new FileInputStream(file);
    try {
        XWPFWordExtractor wde = new XWPFWordExtractor(new XWPFDocument(is));
        bodyText = wde.getText();
    } catch (Exception e) {
        // Best effort: an unreadable document is still handed to addFields,
        // just without body text.
        log.debug("Unable to extract text from Word document " + file, e);
    } finally {
        // Release the file handle whether or not extraction succeeded.
        is.close();
    }
    return SearchUtils.addFields(file, bodyText);
}

From source file:org.exoplatform.services.document.impl.MSXWordDocumentReader.java

License:Open Source License

/**
 * Returns only the text from .docx file content.
 * The stream is closed before this method returns, whether it succeeds or
 * fails; a failure to close is only traced.
 *
 * @param is an input stream with .docx file content.
 * @return the trimmed text of the document, or an empty string when the
 *         stream reports no available bytes.
 * @throws IllegalArgumentException if {@code is} is null.
 * @throws IOException if checking the stream's availability fails.
 * @throws DocumentReadException if the document cannot be opened.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }
    try {
        if (is.available() == 0) {
            return "";
        }

        final XWPFDocument document;
        try {
            document = SecurityHelper
                    .doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<XWPFDocument>() {
                        public XWPFDocument run() throws Exception {
                            return new XWPFDocument(is);
                        }
                    });
        } catch (IOException e) {
            throw new DocumentReadException("Can't open message.", e);
        } catch (OpenXML4JRuntimeException e) {
            throw new DocumentReadException("Can't open message.", e);
        }

        final XWPFWordExtractor wordExtractor = new XWPFWordExtractor(document);
        String extracted = SecurityHelper.doPrivilegedAction(new PrivilegedAction<String>() {
            public String run() {
                return wordExtractor.getText();
            }
        });
        return extracted.trim();
    } finally {
        // `is` is known to be non-null here: the null case threw above.
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
}

From source file:org.kimios.kernel.index.filters.ExcelXFilter.java

License:Open Source License

/**
 * Parses the stream as a Word (.docx) document and returns its extracted text.
 * The stream is not closed by this method.
 * NOTE(review): the enclosing source file is named ExcelXFilter, yet the
 * Word (XWPF) classes are used here - confirm this is intended.
 *
 * @param in stream supplying the document content
 * @return the text returned by the extractor
 * @throws IOException if the document cannot be read
 */
public String getBody(InputStream in) throws IOException {
    return new XWPFWordExtractor(new XWPFDocument(in)).getText();
}

From source file:org.nuxeo.ecm.platform.template.tests.TestOOoConvert.java

License:Apache License

/**
 * Converts the HTML test resource to .docx through the converter registered
 * for that mime-type pair and checks that the resulting document contains
 * the expected heading text. The test is skipped (via assumeTrue) when the
 * converter is not available.
 */
@Test
public void testOfficeConverter4() throws Exception {
    ConversionService cs = Framework.getService(ConversionService.class);

    BlobHolder bh = getBlobFromPath("data/testMe.html", "text/html");
    String converterName = cs.getConverterName(bh.getBlob().getMimeType(),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    assertEquals("any2docx", converterName);

    boolean isAvailable = cs.isConverterAvailable(converterName).isAvailable();
    assumeTrue(isAvailable);

    BlobHolder result = cs.convert(converterName, bh, null);
    File docxFile = Framework.createTempFile("docxfile", "docx");
    try {
        result.getBlob().transferTo(docxFile);

        String text;
        // Close the stream before the file is deleted; an open handle can
        // prevent deletion on some platforms.
        try (FileInputStream in = new FileInputStream(docxFile)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in));
            text = extractor.getText();
        }

        assertTrue(text.length() > 0);
        assertTrue(text.contains("Titre 1"));
    } finally {
        // Remove the temporary file even when an assertion fails.
        docxFile.delete();
    }
}