Example usage for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

List of usage examples for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.xwpf.extractor.XWPFWordExtractor.getText().

Prototype

public String getText() 

Source Link

Usage

From source file:me.philnate.textmanager.utils.WordCount.java

License:Open Source License

/**
 * opens the given file, if it's a .doc or .docx file and returns the number
 * of words within the document/*from   w ww  .j  a va 2  s. co m*/
 * 
 * @param file
 * @return
 * @throws FileNotFoundException
 * @throws IOException
 */
public static long countFile(File file) throws FileNotFoundException, IOException {
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        if (file.getName().endsWith(".docx")) {
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            return linecount(extractor.getText());
        } else if (file.getName().endsWith(".doc")) {
            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            return WordCount.linecount(extractor.getText());
        } else {
            throw new IllegalArgumentException("Can't handle non doc(X) files");
        }
    }
}

From source file:modelo.Lectura.java

/**
 * Reads a .docx document from the given stream and stores its text through
 * {@code setTextoDeDocx}. Any failure is reported on standard output and is
 * not rethrown.
 *
 * @param docx stream supplying the .docx content
 */
private void leerDocx(InputStream docx) {
    try {
        // Build a document POI understands from the stream, then hand that
        // document to the extractor that pulls out its text content.
        XWPFWordExtractor xwpf_we = new XWPFWordExtractor(new XWPFDocument(docx));

        setTextoDeDocx(xwpf_we.getText());
    } catch (Exception e) {
        // Only text belongs in the message: JOptionPane.ERROR_MESSAGE is an int
        // dialog-type constant and would merely append a stray number.
        System.out.println("Fallo al leer del archivo.\n" + e.toString() + "Error en archivo");
    }

}

From source file:myexamples.WordDocsExamples.Test1.java

/**
 * Lets the user pick a file with a {@link JFileChooser}, prints the file's
 * name and then prints the text extracted from it as a .docx document.
 * Does nothing when the dialog is not approved.
 *
 * @throws IOException if the chosen file cannot be opened or parsed
 */
public static void simpleFileReading() throws IOException {
    JFileChooser chooser = new JFileChooser();
    if (chooser.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
        return;
    }

    System.out.println(chooser.getSelectedFile().getName());
    FileInputStream fis = new FileInputStream(chooser.getSelectedFile());
    try {
        XWPFDocument doc = new XWPFDocument(fis);
        XWPFWordExtractor extract = new XWPFWordExtractor(doc);
        System.out.println(extract.getText());
    } finally {
        // Release the file handle even when parsing or extraction fails.
        fis.close();
    }
}

From source file:net.ontopia.topicmaps.classify.OOXMLWordFormatModule.java

License:Apache License

/**
 * Extracts the text of the OOXML Word document held in {@code cc} and passes
 * it to {@code handler} as a single region named "document".
 *
 * @param cc      supplies the document bytes through {@code getContent()}
 * @param handler receives the start-region, text and end-region callbacks
 * @throws OntopiaRuntimeException wrapping any exception raised while reading
 *                                 the document or notifying the handler
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        ByteArrayInputStream rawBytes = new ByteArrayInputStream(cc.getContent());
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(OPCPackage.open(rawBytes));
        char[] chars = wordExtractor.getText().toCharArray();

        // The whole document is reported as one region.
        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Extracts the text of every file in {@code listOfFile}, tokenizes it and
 * builds two lookups: token -> names of the files containing it, and
 * file name -> distinct tokens found in that file.
 *
 * <p>The extraction strategy is chosen from the file-name suffix: .docx,
 * .pptx, .xlsx, .doc and .pdf are parsed with their respective extractors;
 * media files (.gif, .jpeg, .wmv, .mpeg, .mp4) contribute only their file
 * name; every other file is read as UTF-8 text lines. A file that fails with
 * one of the caught checked exceptions is reported on standard output and
 * contributes no tokens.
 *
 * @param listOfFile files to process
 * @return a {@code TextExtractPar} built from the two lookups
 * @throws FileNotFoundException if one of the files cannot be opened
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    for (File file : listOfFile) {

        // Progress report: print the first percentage step whose threshold
        // equals the number of files counted so far.
        for (int j = 0; j < 100; j++) {

            if (counter == (int) ((j + 1) * listOfFile.length / 100)) {
                System.out.println("Number of files read equals " + j + " %");
                break;
            }
        }

        String name = file.getName();
        List<String> lines = new ArrayList<String>();
        counter++;
        FileInputStream fis = new FileInputStream(file);

        try {

            ///////////////////// .docx /////////////////////////////
            if (name.endsWith(".docx")) {
                try {
                    XWPFDocument doc = new XWPFDocument(fis);
                    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
                    lines.add(ex.getText());
                } catch (IOException e) {
                    System.out.println("File not read: " + name);
                }
            }

            ///////////////////// .pptx /////////////////////////////
            else if (name.endsWith(".pptx")) {
                try {
                    OPCPackage ppt = OPCPackage.open(fis);
                    XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt);
                    lines.add(xw.getText());
                } catch (XmlException e) {
                    System.out.println("File not read: " + name);
                } catch (IOException e) {
                    System.out.println("File not read: " + name);
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + name);
                }
            }

            ///////////////////// .xlsx /////////////////////////////
            else if (name.endsWith(".xlsx")) {
                try {
                    OPCPackage xls = OPCPackage.open(fis);
                    XSSFExcelExtractor xe = new XSSFExcelExtractor(xls);
                    lines.add(xe.getText());
                } catch (InvalidFormatException e) {
                    System.out.println("File not read: " + name);
                } catch (IOException e) {
                    System.out.println("File not read: " + name);
                } catch (XmlException e) {
                    System.out.println("File not read: " + name);
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + name);
                }
            }

            ///////////////////// .doc /////////////////////////////
            else if (name.endsWith(".doc")) {
                try {
                    // Opened from the File rather than from fis, so it holds its
                    // own handle that has to be released separately.
                    NPOIFSFileSystem fs = new NPOIFSFileSystem(file);
                    try {
                        WordExtractor extractor = new WordExtractor(fs.getRoot());
                        for (String rawText : extractor.getParagraphText()) {
                            lines.add(extractor.stripFields(rawText));
                        }
                    } finally {
                        fs.close();
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + name);
                }
            }

            ///////////////////// .pdf /////////////////////////////
            else if (name.endsWith(".pdf")) {
                try {
                    PDFParser parser = new PDFParser(fis);
                    parser.parse();
                    COSDocument cd = parser.getDocument();
                    PDFTextStripper stripper = new PDFTextStripper();
                    PDDocument pdf = new PDDocument(cd);
                    try {
                        lines.add(stripper.getText(pdf));
                    } finally {
                        pdf.close();
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + name);
                }
            }

            ///////////////////// Media files: gif, jpeg, wmv, mpeg, mp4 ///////
            // Any ONE of these suffixes marks a media file, so the tests are
            // joined with ||; a name can never end with all of them at once.
            else if (name.endsWith(".gif") || name.endsWith(".jpeg") || name.endsWith(".wmv")
                    || name.endsWith(".mpeg") || name.endsWith(".mp4")) {

                // Media content is not parsed; only the file name is indexed.
                lines.add(name);
            }

            ///////////////////// raw text extensions ////////////////////////
            else {
                try {
                    lines = Files.readLines(file, Charsets.UTF_8);
                } catch (IOException e) {
                    System.out.println("File not read: " + name);
                }
            }

        } finally {
            // Release the stream whichever branch ran, including when an
            // unchecked exception escapes from one of the parsers.
            try {
                fis.close();
            } catch (IOException ioex) {
                // Nothing useful can be done if closing fails.
            }
        }

        ///////////////////// Begin word extraction /////////////////////////

        for (String line : lines) {

            CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();

            // A standard analyzer is used with the English stop-word set so
            // that noise words such as "the" or "a" are eliminated. A stemming
            // analyzer (e.g. Porter) could be used instead.
            Analyzer analyzer = new StandardAnalyzer(noise);
            List<String> tokens = Tokenizer.tokenizeString(analyzer, line);

            for (String word : tokens) {

                // Record each word only once per file.
                if (!lookup2.get(name).contains(word)) {
                    lookup2.put(name, word);
                }

                // Record each file only once per word.
                if (!lookup1.get(word).contains(name)) {
                    lookup1.put(word, name);
                }
            }
        }

    }

    return new TextExtractPar(lookup1, lookup2);

}

From source file:org.encuestame.business.search.IndexerFile.java

License:Apache License

/**
 * Extract word document content./*from   w ww  .j  a v a2  s .  com*/
 * @param wde
 * @return
 */
public static String extractContentWordDocument(final XWPFWordExtractor wde) {
    String bodyText = null;
    try {
        bodyText = wde.getText();
    } catch (Exception e) {
        log.error("ERROR extracting content Word Document-------->" + e);
    }
    return bodyText;
}

From source file:org.encuestame.business.search.SearchUtils.java

License:Apache License

/**
 * Creates a search {@link Document} from a Word (.docx) file.
 *
 * <p>If the text cannot be extracted, the failure is logged at debug level and
 * the document is still built, with a {@code null} body text.
 *
 * @param file {@link File} to read
 * @return {@link Document} produced by {@code SearchUtils.addFields}
 * @throws POIXMLException declared by the signature; exceptions raised while
 *                         parsing the file are caught and logged here
 * @throws Exception if the file cannot be opened, or propagated from
 *                   {@code SearchUtils.addFields}
 */
public static Document createWordDocument(final File file) throws POIXMLException, Exception {
    String bodyText = null;
    final InputStream is = new FileInputStream(file);
    try {
        XWPFDocument wd = new XWPFDocument(is);
        XWPFWordExtractor wde = new XWPFWordExtractor(wd);
        bodyText = wde.getText();
    } catch (Exception e) {
        log.debug(e);
    } finally {
        // Always release the file handle; a failure to close must not prevent
        // the document from being built.
        try {
            is.close();
        } catch (Exception closeFailure) {
            log.debug(closeFailure);
        }
    }
    return SearchUtils.addFields(file, bodyText);
}

From source file:org.exoplatform.services.document.impl.MSXWordDocumentReader.java

License:Open Source License

/**
 * Extracts the text of a .docx document and returns it trimmed.
 *
 * <p>A non-null stream is always closed before this method completes, whether
 * or not extraction succeeds.
 *
 * @param is an input stream with .docx file content; must not be {@code null}
 * @return the trimmed text of the document, or an empty string when the
 *         stream reports no available bytes
 * @throws IllegalArgumentException if {@code is} is {@code null}
 * @throws IOException if querying the stream for available bytes fails
 * @throws DocumentReadException if the document cannot be opened
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }
    try {
        if (is.available() == 0) {
            return "";
        }

        // Parsing is performed as a privileged action.
        PrivilegedExceptionAction<XWPFDocument> openAction = new PrivilegedExceptionAction<XWPFDocument>() {
            public XWPFDocument run() throws Exception {
                return new XWPFDocument(is);
            }
        };

        XWPFDocument document;
        try {
            document = SecurityHelper.doPrivilegedIOExceptionAction(openAction);
        } catch (IOException e) {
            throw new DocumentReadException("Can't open message.", e);
        } catch (OpenXML4JRuntimeException e) {
            throw new DocumentReadException("Can't open message.", e);
        }

        final XWPFWordExtractor wordExtractor = new XWPFWordExtractor(document);
        String extracted = SecurityHelper.doPrivilegedAction(new PrivilegedAction<String>() {
            public String run() {
                return wordExtractor.getText();
            }
        });
        return extracted.trim();
    } finally {
        // Closing is best effort: a failure is only traced, never rethrown.
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
}

From source file:org.kimios.kernel.index.filters.ExcelXFilter.java

License:Open Source License

/**
 * Returns the text extracted from the document supplied by {@code in}.
 * The stream is parsed as an {@code XWPFDocument}.
 *
 * @param in stream supplying the document content
 * @return the text returned by the extractor
 * @throws IOException if the document cannot be read
 */
public String getBody(InputStream in) throws IOException {
    return new XWPFWordExtractor(new XWPFDocument(in)).getText();
}

From source file:org.nuxeo.ecm.platform.template.tests.TestOOoConvert.java

License:Apache License

/**
 * Converts an HTML blob to .docx with the "any2docx" converter and checks
 * that the resulting document contains the expected text. The test is skipped
 * (via {@code assumeTrue}) when the converter is not available.
 */
@Test
public void testOfficeConverter4() throws Exception {
    ConversionService cs = Framework.getService(ConversionService.class);

    BlobHolder bh = getBlobFromPath("data/testMe.html", "text/html");
    String converterName = cs.getConverterName(bh.getBlob().getMimeType(),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    assertEquals("any2docx", converterName);

    boolean isAvailable = cs.isConverterAvailable(converterName).isAvailable();
    assumeTrue(isAvailable);

    BlobHolder result = cs.convert(converterName, bh, null);
    File docxFile = Framework.createTempFile("docxfile", "docx");
    try {
        result.getBlob().transferTo(docxFile);

        String text;
        FileInputStream in = new FileInputStream(docxFile);
        try {
            XWPFDocument doc = new XWPFDocument(in);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            text = extractor.getText();
        } finally {
            // Close before deleting: an open handle can block the delete.
            in.close();
        }

        assertTrue(text.length() > 0);
        assertTrue(text.contains("Titre 1"));
    } finally {
        // Remove the temp file even when an assertion fails.
        docxFile.delete();
    }
}