List of usage examples for the org.apache.poi.xwpf.extractor.XWPFWordExtractor constructor XWPFWordExtractor(XWPFDocument)
public XWPFWordExtractor(XWPFDocument document)
From source file:org.terrier.indexing.POIDocument.java
License:Mozilla Public License
protected POITextExtractor getExtractor(String filename, InputStream docStream) throws IOException { //Word .doc: if (filename.endsWith(".doc")) { return new WordExtractor(docStream); }/*from www .j a v a 2 s.c o m*/ //Word .docx: if (filename.endsWith(".docx")) { return new XWPFWordExtractor(new XWPFDocument(docStream)); } //Powertpoint .ppt: if (filename.endsWith(".ppt")) { return new PowerPointExtractor(docStream); } //Powertpoint .pptx: if (filename.endsWith(".pptx")) { return new XSLFPowerPointExtractor(new XMLSlideShow(docStream)); } //Publisher .pub: if (filename.endsWith(".pub")) { return new PublisherTextExtractor(docStream); } //Excel: .xls: if (filename.endsWith(".xls")) { return new ExcelExtractor(new POIFSFileSystem(docStream)); } //Excel: .xlsx: if (filename.endsWith(".xlsx")) { return new org.apache.poi.xssf.extractor.XSSFExcelExtractor(new XSSFWorkbook(docStream)); } //Visio: .vsd: if (filename.endsWith(".vsd")) { return new VisioTextExtractor(docStream); } return null; }
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
public static String getDocxText(File file) { try {//from www . ja v a2 s.c o m XWPFDocument docx = new XWPFDocument(new FileInputStream(file)); XWPFWordExtractor extractor = new XWPFWordExtractor(docx); String text = extractor.getText(); return text; } catch (Exception e) { e.printStackTrace(); } return null; }
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java
License:Open Source License
/** * Write document content to document artifact as its raw content * * @param registry//from w w w . j a va 2 s . c o m * @param documentResource * @return * @throws RegistryException * @throws IOException * @throws APIManagementException */ private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException { GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY); GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID()); String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE); String contentString = null; if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) { Association fileAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_FILE_ASSOCIATION); Association fileAssociation; if (fileAssociations.length < 1) { String error = "No document associated to API"; log.error(error); throw new APIManagementException(error); } //a file document can have one file association fileAssociation = fileAssociations[0]; String contentPath = fileAssociation.getDestinationPath(); if (!registry.resourceExists(contentPath)) { String error = "API not found at " + contentPath; log.error(error); throw new APIManagementException(error); } Resource contentResource = registry.get(contentPath); String fileName = ((ResourceImpl) contentResource).getName(); String extension = FilenameUtils.getExtension(fileName); InputStream inputStream = null; try { inputStream = contentResource.getContentStream(); switch (extension) { case APIConstants.PDF_EXTENSION: PDFParser pdfParser = new PDFParser(inputStream); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); contentString = stripper.getText(new PDDocument(cosDocument)); break; case APIConstants.DOC_EXTENSION: { POIFSFileSystem 
pfs = new POIFSFileSystem(inputStream); WordExtractor msWord2003Extractor = new WordExtractor(pfs); contentString = msWord2003Extractor.getText(); break; } case APIConstants.DOCX_EXTENSION: XWPFDocument doc = new XWPFDocument(inputStream); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); contentString = msWord2007Extractor.getText(); break; case APIConstants.XLS_EXTENSION: { POIFSFileSystem pfs = new POIFSFileSystem(inputStream); ExcelExtractor extractor = new ExcelExtractor(pfs); contentString = extractor.getText(); break; } case APIConstants.XLSX_EXTENSION: XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets); contentString = xssfExcelExtractor.getText(); break; case APIConstants.PPT_EXTENSION: { POIFSFileSystem fs = new POIFSFileSystem(inputStream); PowerPointExtractor extractor = new PowerPointExtractor(fs); contentString = extractor.getText(); break; } case APIConstants.PPTX_EXTENSION: XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream); XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow); contentString = xslfPowerPointExtractor.getText(); break; case APIConstants.TXT_EXTENSION: case APIConstants.WSDL_EXTENSION: case APIConstants.XML_DOC_EXTENSION: BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); break; } } finally { IOUtils.closeQuietly(inputStream); } } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) { Association contentAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION); Association contentAssociation; //an inline document can have one or no content associations if (contentAssociations.length == 1) { 
contentAssociation = contentAssociations[0]; String contentPath = contentAssociation.getDestinationPath(); if (registry.resourceExists(contentPath)) { Resource contentResource = registry.get(contentPath); InputStream instream = null; BufferedReader reader = null; String line; try { instream = contentResource.getContentStream(); reader = new BufferedReader(new InputStreamReader(instream)); StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); } finally { if (reader != null) { IOUtils.closeQuietly(reader); } } } } } return contentString; }
From source file:org.wso2.carbon.pc.core.DocumentIndexer.java
License:Open Source License
@Override public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData) throws SolrException, RegistryException { try {/*from w w w . j a v a2 s . c o m*/ String wordText = null; try { //Extract MSWord 2003 document files POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data)); WordExtractor msWord2003Extractor = new WordExtractor(fs); wordText = msWord2003Extractor.getText(); } catch (OfficeXmlFileException e) { //if 2003 extraction failed, try with MSWord 2007 document files extractor XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data)); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); wordText = msWord2007Extractor.getText(); } catch (Exception e) { //The reason for not throwing an exception is that since this is an indexer that runs in the background //throwing an exception might lead to adverse behaviors in the client side and might lead to //other files not being indexed String msg = "Failed to extract the document while indexing"; log.error(msg, e); } IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null); Map<String, List<String>> fields = new HashMap<String, List<String>>(); fields.put("path", Arrays.asList(fileData.path)); if (fileData.mediaType != null) { fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType)); } else { fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf")); } indexDoc.setFields(fields); return indexDoc; } catch (IOException e) { String msg = "Failed to write to the index"; log.error(msg, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg); } }
From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java
License:Open Source License
private void processWordFile(Path path) { try {//from ww w . j a v a 2s. c o m XWPFDocument doc = new XWPFDocument(Files.newInputStream(path)); XWPFWordExtractor extractor = new XWPFWordExtractor(doc); List<List<String>> data = new ArrayList<>(); DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy"); DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm"); Arrays.asList(extractor.getText().split("\n")).stream().forEach((line) -> { try { df1.parse(line.split("\t")[0]); List<String> list = new ArrayList<>(); list.addAll(Arrays.asList(line.split("\t"))); data.add(list); } catch (ParseException pe) { } if (line.startsWith("\t")) data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t"))); }); List<TrainingScheduleEntry> entries = new ArrayList<>(); for (List<String> event : data) { TrainingScheduleEntry entry = new TrainingScheduleEntry(); entry.setStart(df2.parse(event.get(0) + " " + event.get(1))); entry.setEnd(df2.parse(event.get(0) + " " + event.get(2))); entry.setGroup(event.get(4)); entry.setTitle(event.get(5)); entry.setNotes(event.get(6).length() > 6 ? 
event.get(6).substring(6) : event.get(6)); if (event.size() > 13) { for (int i = 7; i < 7 + event.size() - 13; i++) { entry.setNotes(entry.getNotes() + " " + event.get(i)); } } entry.setInstructor(event.get(event.size() - 6).trim()); entry.setUniform(event.get(event.size() - 5)); entry.setLocation(event.get(event.size() - 2)); entries.add(entry); } if (!entries.isEmpty()) { Collections.sort(entries); try (OutputStream os = Files.newOutputStream(processedPath(path)); ObjectOutputStream oos = new ObjectOutputStream(os)) { oos.writeObject(entries); } logger.info("Processed " + path); Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE); Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE); DateFormat df = new SimpleDateFormat("MMM d, yyyy"); String payload = APNS.newPayload().category("scheduleCategory") .alertTitle("Training Schedule Received") .alertBody(entries.size() + " events found for " + (start.before(end) ? df.format(start) + " - " + df.format(end) : df.format(start))) .sound("default").customField("schedule", path.getParent().getFileName().toString() + "/" + FilenameUtils.getBaseName(path.getFileName().toString())) .build(); PushDevices.getDevices().stream().forEach((device) -> { PushUtils.getService().push(device, payload); }); } } catch (Exception e) { logger.error("Failed to process training schedule file: " + path, e); FAILED.add(path); } }
From source file:rocky.sizecounter.SizeCounterUtil.java
License:Apache License
/** * Count Word's number of page from input directory. * /* w w w .j a v a 2 s.c om*/ * @param filePath . * @return Number of A4 pages */ public static int countWordFile(String filePath) { FileInputStream fis = null; int page = 0; try { fis = new FileInputStream(filePath); if (CommonUtil.getExtension(filePath).equals("doc")) { // When file is .DOC HWPFDocument doc = new HWPFDocument(fis); page = doc.getDocProperties().getCPg(); } else if (CommonUtil.getExtension(filePath).equals("docx")) { // When file is .DOCX XWPFDocument doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); page = ex.getExtendedProperties().getUnderlyingProperties().getPages(); } } catch (FileNotFoundException ex) { LOG.warn("File " + filePath + " not found", ex); } catch (IOException ex) { LOG.warn("Invalid when reading file.", ex); } catch (Exception ex) { LOG.warn("Can not count file " + filePath, ex); } finally { if (fis != null) { try { fis.close(); } catch (IOException ex) { LOG.warn("Close the file input stream", ex); } } } return page; }
From source file:ru.lisaprog.parser.ExtractText.java
License:Open Source License
public static String parseDOCX(String file) { try {/* w w w . j a va 2s . co m*/ BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file)); XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr)); return word.getText(); } catch (Exception e) { // Common.createLog(e); return ""; } }
From source file:steffen.haertlein.file.FileObject.java
License:Apache License
private void readWordDocument() { try {//from www . j av a 2s. co m FileInputStream fs = new FileInputStream(f); XWPFDocument document; document = new XWPFDocument(OPCPackage.open(fs)); XWPFWordExtractor docxReader = new XWPFWordExtractor(document); String text = docxReader.getText(); docxReader.close(); String[] docxLines = text.split("\n"); for (String line : docxLines) { lines.add(line); } fs.close(); } catch (InvalidFormatException e) { JOptionPane.showMessageDialog(null, "InvalidFormatException in readWordDocument", "Fehler", JOptionPane.ERROR_MESSAGE); e.printStackTrace(); } catch (FileNotFoundException e) { JOptionPane.showMessageDialog(null, "FileNotFoundException in readWordDocument", "Fehler", JOptionPane.ERROR_MESSAGE); e.printStackTrace(); } catch (IOException e) { JOptionPane.showMessageDialog(null, "IOException in readWordDocument", "Fehler", JOptionPane.ERROR_MESSAGE); e.printStackTrace(); } }