Usage examples for org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
public String getText()
From source file:me.philnate.textmanager.utils.WordCount.java
License:Open Source License
/** * opens the given file, if it's a .doc or .docx file and returns the number * of words within the document/*from w ww .j a va 2 s. co m*/ * * @param file * @return * @throws FileNotFoundException * @throws IOException */ public static long countFile(File file) throws FileNotFoundException, IOException { try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) { if (file.getName().endsWith(".docx")) { XWPFDocument document = new XWPFDocument(fis); XWPFWordExtractor extractor = new XWPFWordExtractor(document); return linecount(extractor.getText()); } else if (file.getName().endsWith(".doc")) { HWPFDocument document = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(document); return WordCount.linecount(extractor.getText()); } else { throw new IllegalArgumentException("Can't handle non doc(X) files"); } } }
From source file:modelo.Lectura.java
private void leerDocx(InputStream docx) { //Se crea un documento que la POI entiende pasandole el stream //instanciamos el obj para extraer contenido pasando el documento try {//from w ww . ja v a 2 s. c o m XWPFWordExtractor xwpf_we = new XWPFWordExtractor(new XWPFDocument(docx)); setTextoDeDocx(xwpf_we.getText()); } catch (Exception e) { System.out.println("Fallo al leer del archivo.\n" + e.toString() + "Error en archivo" + javax.swing.JOptionPane.ERROR_MESSAGE); } }
From source file:myexamples.WordDocsExamples.Test1.java
/**
 * Lets the user pick a file with a {@link JFileChooser}, prints the chosen
 * file's name and then prints the text extracted from it as a .docx
 * document. Nothing is printed when the dialog is not approved.
 *
 * @throws IOException if the selected file cannot be opened or read
 */
public static void simpleFileReading() throws IOException {
    JFileChooser chooser = new JFileChooser();
    if (chooser.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
        return;
    }
    System.out.println(chooser.getSelectedFile().getName());
    // try-with-resources: the stream used to be left open after extraction,
    // leaking the file handle.
    try (FileInputStream fis = new FileInputStream(chooser.getSelectedFile())) {
        XWPFDocument doc = new XWPFDocument(fis);
        XWPFWordExtractor extract = new XWPFWordExtractor(doc);
        System.out.println(extract.getText());
    }
}
From source file:net.ontopia.topicmaps.classify.OOXMLWordFormatModule.java
License:Apache License
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) { try {//from ww w.j a v a2 s . c o m OPCPackage opc = OPCPackage.open(new ByteArrayInputStream(cc.getContent())); XWPFWordExtractor extractor = new XWPFWordExtractor(opc); String s = extractor.getText(); char[] c = s.toCharArray(); handler.startRegion("document"); handler.text(c, 0, c.length); handler.endRegion(); } catch (Exception e) { throw new OntopiaRuntimeException(e); } }
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }/*from w w w. j a va 2 s . com*/ } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }
From source file:org.encuestame.business.search.IndexerFile.java
License:Apache License
/** * Extract word document content./*from w ww .j a v a2 s . com*/ * @param wde * @return */ public static String extractContentWordDocument(final XWPFWordExtractor wde) { String bodyText = null; try { bodyText = wde.getText(); } catch (Exception e) { log.error("ERROR extracting content Word Document-------->" + e); } return bodyText; }
From source file:org.encuestame.business.search.SearchUtils.java
License:Apache License
/**
 * Creates the search {@link Document} for a Word (.docx) file.
 * <p>
 * The file's text is extracted and passed to
 * {@code SearchUtils.addFields}. When extraction fails the failure is
 * logged at debug level and the document is built with a {@code null}
 * body text.
 *
 * @param file {@link File} the Word file to read
 * @return {@link Document} returned by {@code SearchUtils.addFields}
 * @throws POIXMLException kept from the original signature; extraction
 *         failures themselves are caught and logged here
 * @throws Exception if the file cannot be opened or its stream cannot be
 *         closed
 */
public static Document createWordDocument(final File file) throws POIXMLException, Exception {
    String bodyText = null;
    // try-with-resources: the stream used to stay open after extraction,
    // leaking one file handle per call.
    try (InputStream is = new FileInputStream(file)) {
        try {
            XWPFDocument wd = new XWPFDocument(is);
            XWPFWordExtractor wde = new XWPFWordExtractor(wd);
            bodyText = wde.getText();
        } catch (Exception e) {
            log.debug(e);
        }
    }
    return SearchUtils.addFields(file, bodyText);
}
From source file:org.exoplatform.services.document.impl.MSXWordDocumentReader.java
License:Open Source License
/**
 * Extracts the plain text of a .docx document. A non-null stream is closed
 * before this method returns, whether extraction succeeds or fails.
 *
 * @param is an input stream with .docx file content.
 * @return the trimmed document text, or an empty string when the stream
 *         reports no available bytes.
 * @throws IllegalArgumentException if {@code is} is {@code null}
 * @throws IOException if querying the stream for available bytes fails
 * @throws DocumentReadException if the document cannot be opened
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }

    final String extracted;
    try {
        if (is.available() == 0) {
            return "";
        }

        PrivilegedExceptionAction<XWPFDocument> openAction = new PrivilegedExceptionAction<XWPFDocument>() {
            public XWPFDocument run() throws Exception {
                return new XWPFDocument(is);
            }
        };

        final XWPFDocument document;
        try {
            document = SecurityHelper.doPrivilegedIOExceptionAction(openAction);
        } catch (IOException e) {
            throw new DocumentReadException("Can't open message.", e);
        } catch (OpenXML4JRuntimeException e) {
            throw new DocumentReadException("Can't open message.", e);
        }

        final XWPFWordExtractor wordExtractor = new XWPFWordExtractor(document);
        extracted = SecurityHelper.doPrivilegedAction(new PrivilegedAction<String>() {
            public String run() {
                return wordExtractor.getText();
            }
        });
    } finally {
        // The stream is known to be non-null here; a failing close is only traced.
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
    return extracted.trim();
}
From source file:org.kimios.kernel.index.filters.ExcelXFilter.java
License:Open Source License
public String getBody(InputStream in) throws IOException { XWPFDocument doc = new XWPFDocument(in); XWPFWordExtractor ex = new XWPFWordExtractor(doc); String text = ex.getText(); return text;//w w w . j a v a 2s . co m }
From source file:org.nuxeo.ecm.platform.template.tests.TestOOoConvert.java
License:Apache License
@Test public void testOfficeConverter4() throws Exception { ConversionService cs = Framework.getService(ConversionService.class); BlobHolder bh = getBlobFromPath("data/testMe.html", "text/html"); String converterName = cs.getConverterName(bh.getBlob().getMimeType(), "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); assertEquals("any2docx", converterName); boolean isAvailable = cs.isConverterAvailable(converterName).isAvailable(); assumeTrue(isAvailable);//from w w w . j av a2s .c o m BlobHolder result = cs.convert(converterName, bh, null); File docxFile = Framework.createTempFile("docxfile", "docx"); result.getBlob().transferTo(docxFile); XWPFDocument doc = new XWPFDocument(new FileInputStream(docxFile)); XWPFWordExtractor extractor = new XWPFWordExtractor(doc); String text = extractor.getText(); assertTrue(text.length() > 0); assertTrue(text.contains("Titre 1")); docxFile.delete(); }