List of usage examples for org.apache.poi.xwpf.extractor.XWPFWordExtractor.getText()
public String getText()
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
public static String getDocxText(File file) { try {// w w w. ja va 2 s.c o m XWPFDocument docx = new XWPFDocument(new FileInputStream(file)); XWPFWordExtractor extractor = new XWPFWordExtractor(docx); String text = extractor.getText(); return text; } catch (Exception e) { e.printStackTrace(); } return null; }
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java
License:Open Source License
/** * Write document content to document artifact as its raw content * * @param registry/*from ww w . j a v a 2 s . c o m*/ * @param documentResource * @return * @throws RegistryException * @throws IOException * @throws APIManagementException */ private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException { GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY); GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID()); String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE); String contentString = null; if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) { Association fileAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_FILE_ASSOCIATION); Association fileAssociation; if (fileAssociations.length < 1) { String error = "No document associated to API"; log.error(error); throw new APIManagementException(error); } //a file document can have one file association fileAssociation = fileAssociations[0]; String contentPath = fileAssociation.getDestinationPath(); if (!registry.resourceExists(contentPath)) { String error = "API not found at " + contentPath; log.error(error); throw new APIManagementException(error); } Resource contentResource = registry.get(contentPath); String fileName = ((ResourceImpl) contentResource).getName(); String extension = FilenameUtils.getExtension(fileName); InputStream inputStream = null; try { inputStream = contentResource.getContentStream(); switch (extension) { case APIConstants.PDF_EXTENSION: PDFParser pdfParser = new PDFParser(inputStream); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); contentString = stripper.getText(new PDDocument(cosDocument)); break; case APIConstants.DOC_EXTENSION: { 
POIFSFileSystem pfs = new POIFSFileSystem(inputStream); WordExtractor msWord2003Extractor = new WordExtractor(pfs); contentString = msWord2003Extractor.getText(); break; } case APIConstants.DOCX_EXTENSION: XWPFDocument doc = new XWPFDocument(inputStream); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); contentString = msWord2007Extractor.getText(); break; case APIConstants.XLS_EXTENSION: { POIFSFileSystem pfs = new POIFSFileSystem(inputStream); ExcelExtractor extractor = new ExcelExtractor(pfs); contentString = extractor.getText(); break; } case APIConstants.XLSX_EXTENSION: XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets); contentString = xssfExcelExtractor.getText(); break; case APIConstants.PPT_EXTENSION: { POIFSFileSystem fs = new POIFSFileSystem(inputStream); PowerPointExtractor extractor = new PowerPointExtractor(fs); contentString = extractor.getText(); break; } case APIConstants.PPTX_EXTENSION: XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream); XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow); contentString = xslfPowerPointExtractor.getText(); break; case APIConstants.TXT_EXTENSION: case APIConstants.WSDL_EXTENSION: case APIConstants.XML_DOC_EXTENSION: BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); break; } } finally { IOUtils.closeQuietly(inputStream); } } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) { Association contentAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION); Association contentAssociation; //an inline document can have one or no content associations if (contentAssociations.length == 
1) { contentAssociation = contentAssociations[0]; String contentPath = contentAssociation.getDestinationPath(); if (registry.resourceExists(contentPath)) { Resource contentResource = registry.get(contentPath); InputStream instream = null; BufferedReader reader = null; String line; try { instream = contentResource.getContentStream(); reader = new BufferedReader(new InputStreamReader(instream)); StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); } finally { if (reader != null) { IOUtils.closeQuietly(reader); } } } } } return contentString; }
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.MSWordIndexerTest.java
License:Open Source License
@Test public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception { POIFSFileSystem poiFS = Mockito.mock(POIFSFileSystem.class); WordExtractor wordExtractor = Mockito.mock(WordExtractor.class); XWPFWordExtractor xwpfExtractor = Mockito.mock(XWPFWordExtractor.class); XWPFDocument xwpfDocument = Mockito.mock(XWPFDocument.class); PowerMockito.whenNew(POIFSFileSystem.class).withArguments(Mockito.anyObject()) .thenThrow(OfficeXmlFileException.class).thenReturn(poiFS).thenThrow(APIManagementException.class); PowerMockito.whenNew(WordExtractor.class).withArguments(poiFS).thenReturn(wordExtractor); PowerMockito.whenNew(XWPFDocument.class).withParameterTypes(InputStream.class).withArguments(Mockito.any()) .thenReturn(xwpfDocument);//w w w .j av a 2 s . co m PowerMockito.whenNew(XWPFWordExtractor.class).withArguments(xwpfDocument).thenReturn(xwpfExtractor); Mockito.when(wordExtractor.getText()).thenReturn(""); Mockito.when(xwpfExtractor.getText()).thenReturn(""); MSWordIndexer indexer = new MSWordIndexer(); IndexDocument wordDoc = indexer.getIndexedDocument(file2Index); // should return the default media type when media type is not defined in file2Index if (!"application/pdf".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index file2Index.mediaType = "text/html"; wordDoc = indexer.getIndexedDocument(file2Index); if (!"text/html".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index even if exception occurred while reading the file file2Index.mediaType = "text/html"; wordDoc = indexer.getIndexedDocument(file2Index); if (!"text/html".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } }
From source file:org.wso2.carbon.pc.core.DocumentIndexer.java
License:Open Source License
@Override public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData) throws SolrException, RegistryException { try {/*from w w w . j a v a 2s .c o m*/ String wordText = null; try { //Extract MSWord 2003 document files POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data)); WordExtractor msWord2003Extractor = new WordExtractor(fs); wordText = msWord2003Extractor.getText(); } catch (OfficeXmlFileException e) { //if 2003 extraction failed, try with MSWord 2007 document files extractor XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data)); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); wordText = msWord2007Extractor.getText(); } catch (Exception e) { //The reason for not throwing an exception is that since this is an indexer that runs in the background //throwing an exception might lead to adverse behaviors in the client side and might lead to //other files not being indexed String msg = "Failed to extract the document while indexing"; log.error(msg, e); } IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null); Map<String, List<String>> fields = new HashMap<String, List<String>>(); fields.put("path", Arrays.asList(fileData.path)); if (fileData.mediaType != null) { fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType)); } else { fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf")); } indexDoc.setFields(fields); return indexDoc; } catch (IOException e) { String msg = "Failed to write to the index"; log.error(msg, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg); } }
From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java
License:Open Source License
private void processWordFile(Path path) { try {// ww w . ja v a2s . co m XWPFDocument doc = new XWPFDocument(Files.newInputStream(path)); XWPFWordExtractor extractor = new XWPFWordExtractor(doc); List<List<String>> data = new ArrayList<>(); DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy"); DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm"); Arrays.asList(extractor.getText().split("\n")).stream().forEach((line) -> { try { df1.parse(line.split("\t")[0]); List<String> list = new ArrayList<>(); list.addAll(Arrays.asList(line.split("\t"))); data.add(list); } catch (ParseException pe) { } if (line.startsWith("\t")) data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t"))); }); List<TrainingScheduleEntry> entries = new ArrayList<>(); for (List<String> event : data) { TrainingScheduleEntry entry = new TrainingScheduleEntry(); entry.setStart(df2.parse(event.get(0) + " " + event.get(1))); entry.setEnd(df2.parse(event.get(0) + " " + event.get(2))); entry.setGroup(event.get(4)); entry.setTitle(event.get(5)); entry.setNotes(event.get(6).length() > 6 ? 
event.get(6).substring(6) : event.get(6)); if (event.size() > 13) { for (int i = 7; i < 7 + event.size() - 13; i++) { entry.setNotes(entry.getNotes() + " " + event.get(i)); } } entry.setInstructor(event.get(event.size() - 6).trim()); entry.setUniform(event.get(event.size() - 5)); entry.setLocation(event.get(event.size() - 2)); entries.add(entry); } if (!entries.isEmpty()) { Collections.sort(entries); try (OutputStream os = Files.newOutputStream(processedPath(path)); ObjectOutputStream oos = new ObjectOutputStream(os)) { oos.writeObject(entries); } logger.info("Processed " + path); Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE); Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE); DateFormat df = new SimpleDateFormat("MMM d, yyyy"); String payload = APNS.newPayload().category("scheduleCategory") .alertTitle("Training Schedule Received") .alertBody(entries.size() + " events found for " + (start.before(end) ? df.format(start) + " - " + df.format(end) : df.format(start))) .sound("default").customField("schedule", path.getParent().getFileName().toString() + "/" + FilenameUtils.getBaseName(path.getFileName().toString())) .build(); PushDevices.getDevices().stream().forEach((device) -> { PushUtils.getService().push(device, payload); }); } } catch (Exception e) { logger.error("Failed to process training schedule file: " + path, e); FAILED.add(path); } }
From source file:ru.lisaprog.parser.ExtractText.java
License:Open Source License
public static String parseDOCX(String file) { try {/*from www.j a va 2 s .c o m*/ BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file)); XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr)); return word.getText(); } catch (Exception e) { // Common.createLog(e); return ""; } }
From source file:steffen.haertlein.file.FileObject.java
License:Apache License
/**
 * Reads the DOCX file {@code f} and appends each line of its text to {@code lines}.
 *
 * <p>Failures are reported to the user in an error dialog and the stack trace is printed;
 * no exception is propagated to the caller.
 */
private void readWordDocument() {
    // try-with-resources closes the file stream on every exit path, not only on success.
    try (FileInputStream fs = new FileInputStream(f)) {
        XWPFDocument document = new XWPFDocument(OPCPackage.open(fs));
        XWPFWordExtractor docxReader = new XWPFWordExtractor(document);
        try {
            String text = docxReader.getText();
            String[] docxLines = text.split("\n");
            for (String line : docxLines) {
                lines.add(line);
            }
        } finally {
            // Close the extractor even if extraction throws.
            docxReader.close();
        }
    } catch (InvalidFormatException e) {
        JOptionPane.showMessageDialog(null, "InvalidFormatException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        JOptionPane.showMessageDialog(null, "FileNotFoundException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (IOException e) {
        JOptionPane.showMessageDialog(null, "IOException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}