Example usage for the org.apache.poi.xwpf.extractor.XWPFWordExtractor constructor XWPFWordExtractor(XWPFDocument)

Introduction

On this page you can find example usages of the constructor XWPFWordExtractor(XWPFDocument) of the class org.apache.poi.xwpf.extractor.XWPFWordExtractor.

Prototype

public XWPFWordExtractor(XWPFDocument document)

Source Link

Usage

From source file:org.terrier.indexing.POIDocument.java

License:Mozilla Public License

/**
 * Selects a POI text extractor for an Office document based on the file name's extension.
 *
 * <p>The extension is compared case-insensitively, so {@code REPORT.DOCX} is treated the same
 * way as {@code report.docx}.
 *
 * @param filename name of the document; only its suffix is inspected
 * @param docStream stream the document is read from
 * @return an extractor for the document, or {@code null} if the suffix is not one of
 *         .doc, .docx, .ppt, .pptx, .pub, .xls, .xlsx or .vsd
 * @throws IOException propagated from the POI constructors that read {@code docStream}
 */
protected POITextExtractor getExtractor(String filename, InputStream docStream) throws IOException {
    // Normalise once so that upper- or mixed-case extensions are recognised as well.
    final String name = filename.toLowerCase(java.util.Locale.ROOT);

    // Word .doc:
    if (name.endsWith(".doc")) {
        return new WordExtractor(docStream);
    }
    // Word .docx:
    if (name.endsWith(".docx")) {
        return new XWPFWordExtractor(new XWPFDocument(docStream));
    }
    // PowerPoint .ppt:
    if (name.endsWith(".ppt")) {
        return new PowerPointExtractor(docStream);
    }
    // PowerPoint .pptx:
    if (name.endsWith(".pptx")) {
        return new XSLFPowerPointExtractor(new XMLSlideShow(docStream));
    }
    // Publisher .pub:
    if (name.endsWith(".pub")) {
        return new PublisherTextExtractor(docStream);
    }
    // Excel .xls:
    if (name.endsWith(".xls")) {
        return new ExcelExtractor(new POIFSFileSystem(docStream));
    }
    // Excel .xlsx:
    if (name.endsWith(".xlsx")) {
        return new org.apache.poi.xssf.extractor.XSSFExcelExtractor(new XSSFWorkbook(docStream));
    }
    // Visio .vsd:
    if (name.endsWith(".vsd")) {
        return new VisioTextExtractor(docStream);
    }
    // Unknown extension: callers receive null, exactly as before.
    return null;
}

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Extracts the plain text of a .docx file.
 *
 * @param file the Word (OOXML) document to read
 * @return the extracted text, or {@code null} if the file could not be opened or parsed
 */
public static String getDocxText(File file) {
    // try-with-resources releases the file handle on every path, including parse failures.
    try (FileInputStream in = new FileInputStream(file)) {
        XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in));
        return extractor.getText();
    } catch (Exception e) {
        // Best-effort helper: report the problem and fall through to the null result.
        e.printStackTrace();
    }
    return null;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java

License:Open Source License

/**
 * Fetches the textual content of a documentation resource so that it can be written to the
 * document artifact as its raw content.
 *
 * <p>For {@code FILE} documents the single associated file is read and converted to text
 * according to its extension (PDF, Word, Excel, PowerPoint, or the line-based TXT / WSDL /
 * XML_DOC extensions). For {@code INLINE} documents the associated content resource, if there
 * is exactly one and it exists, is read line by line.
 *
 * @param registry registry used to resolve the artifact, its associations and its content
 * @param documentResource registry resource of the document whose content is wanted
 * @return the extracted text, or {@code null} when the source type is neither FILE nor INLINE,
 *         the file extension is not handled, or an inline document has no readable content
 * @throws RegistryException propagated from registry access
 * @throws IOException if reading or parsing the content stream fails
 * @throws APIManagementException thrown here when a FILE document has no file association or
 *         the associated content path does not exist
 */
private String fetchDocumentContent(Registry registry, Resource documentResource)
        throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry,
            APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);

    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        Association fileAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_FILE_ASSOCIATION);
        Association fileAssociation;

        if (fileAssociations.length < 1) {
            String error = "No document associated to API";
            log.error(error);
            throw new APIManagementException(error);
        }

        //a file document can have one file association
        fileAssociation = fileAssociations[0];
        String contentPath = fileAssociation.getDestinationPath();

        if (!registry.resourceExists(contentPath)) {
            String error = "API not found at " + contentPath;
            log.error(error);
            throw new APIManagementException(error);
        }

        Resource contentResource = registry.get(contentPath);

        String fileName = ((ResourceImpl) contentResource).getName();
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
            case APIConstants.PDF_EXTENSION: {
                PDFParser pdfParser = new PDFParser(inputStream);
                pdfParser.parse();
                COSDocument cosDocument = pdfParser.getDocument();
                PDDocument pdDocument = new PDDocument(cosDocument);
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    contentString = stripper.getText(pdDocument);
                } finally {
                    // Release the parsed PDF (and its backing COSDocument); previously it
                    // was never closed, leaking its scratch resources on every PDF indexed.
                    pdDocument.close();
                }
                break;
            }
            case APIConstants.DOC_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                contentString = msWord2003Extractor.getText();
                break;
            }
            case APIConstants.DOCX_EXTENSION:
                XWPFDocument doc = new XWPFDocument(inputStream);
                XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                contentString = msWord2007Extractor.getText();
                break;
            case APIConstants.XLS_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(pfs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.XLSX_EXTENSION:
                XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                contentString = xssfExcelExtractor.getText();
                break;
            case APIConstants.PPT_EXTENSION: {
                POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.PPTX_EXTENSION:
                XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                contentString = xslfPowerPointExtractor.getText();
                break;
            case APIConstants.TXT_EXTENSION:
            case APIConstants.WSDL_EXTENSION:
            case APIConstants.XML_DOC_EXTENSION:
                // NOTE(review): lines are concatenated without a separator, so words on
                // adjacent lines are joined; confirm this is what the index expects.
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                String line;
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
                break;
            }
        } finally {
            // Closes the content stream for every branch, including unhandled extensions.
            IOUtils.closeQuietly(inputStream);
        }

    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        Association contentAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION);
        Association contentAssociation;

        //an inline document can have one or no content associations
        if (contentAssociations.length == 1) {
            contentAssociation = contentAssociations[0];
            String contentPath = contentAssociation.getDestinationPath();

            if (registry.resourceExists(contentPath)) {
                Resource contentResource = registry.get(contentPath);

                InputStream instream = null;
                BufferedReader reader = null;
                String line;
                try {
                    instream = contentResource.getContentStream();
                    reader = new BufferedReader(new InputStreamReader(instream));
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                } finally {
                    // Closing the reader also closes the wrapped content stream.
                    if (reader != null) {
                        IOUtils.closeQuietly(reader);
                    }
                }
            }
        }
    }
    return contentString;
}

From source file:org.wso2.carbon.pc.core.DocumentIndexer.java

License:Open Source License

/**
 * Builds the index document for a Word file handed over by the asynchronous indexer.
 *
 * <p>The bytes in {@code fileData.data} are first parsed as a binary Word document; if POI
 * reports an {@code OfficeXmlFileException}, they are parsed again as an OOXML (.docx)
 * document. Any other extraction failure is logged and the document is indexed with
 * {@code null} text.
 *
 * @param fileData path, media type and raw bytes of the file to index
 * @return the index document carrying the extracted text plus "path" and media-type fields
 * @throws SolrException if the OOXML fallback fails with an {@link IOException}
 * @throws RegistryException declared by the overridden method; not thrown by this body
 */
@Override
public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData)
        throws SolrException, RegistryException {
    try {
        String wordText = null;
        try {
            //Extract MSWord 2003 document files
            POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));

            WordExtractor msWord2003Extractor = new WordExtractor(fs);
            wordText = msWord2003Extractor.getText();

        } catch (OfficeXmlFileException e) {
            //if 2003 extraction failed, try with MSWord 2007 document files extractor
            XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data));

            XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
            wordText = msWord2007Extractor.getText();

        } catch (Exception e) {
            //The reason for not throwing an exception is that since this is an indexer that runs in the background
            //throwing an exception might lead to adverse behaviors in the client side and might lead to
            //other files not being indexed
            String msg = "Failed to extract the document while indexing";
            log.error(msg, e);
        }
        IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null);

        Map<String, List<String>> fields = new HashMap<String, List<String>>();
        fields.put("path", Arrays.asList(fileData.path));
        if (fileData.mediaType != null) {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType));
        } else {
            // NOTE(review): a PDF media type is an odd default for a Word indexer; kept
            // unchanged because consumers of the index may rely on it. Confirm.
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf"));
        }

        indexDoc.setFields(fields);

        return indexDoc;

    } catch (IOException e) {
        // Only the .docx fallback above can reach this handler.
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Pass the cause along so callers that catch the SolrException see the root failure.
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg, e);
    }
}

From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java

License:Open Source License

/**
 * Parses a training-schedule .docx file, stores the resulting entries and notifies devices.
 *
 * <p>The document text is split into lines and each line into tab-separated cells. A line
 * whose first cell parses as {@code "MMM dd, yyyy"} starts a new row; a line that begins
 * with a tab is appended to the most recent row. Each row becomes a
 * {@code TrainingScheduleEntry}. If any entries were found they are sorted, serialized to
 * {@code processedPath(path)} and announced with a push notification to every device
 * returned by {@code PushDevices.getDevices()}.
 *
 * <p>Any exception is logged and the path is added to {@code FAILED}.
 *
 * @param path location of the .docx schedule to process
 */
private void processWordFile(Path path) {
    try {
        final String text;
        // Close the file handle as soon as the document text has been extracted.
        try (java.io.InputStream in = Files.newInputStream(path)) {
            text = new XWPFWordExtractor(new XWPFDocument(in)).getText();
        }

        List<List<String>> data = new ArrayList<>();
        DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy");
        DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm");
        for (String line : text.split("\n")) {
            String[] cells = line.split("\t");
            // A line made only of tabs splits into zero cells; it cannot start a dated row.
            if (cells.length > 0) {
                try {
                    df1.parse(cells[0]);
                    data.add(new ArrayList<>(Arrays.asList(cells)));
                } catch (ParseException ignored) {
                    // Not a dated row; it may still be a continuation line, handled below.
                }
            }
            // Continuation lines extend the latest row; those appearing before any dated
            // row have nothing to extend and are skipped instead of failing the whole file.
            if (line.startsWith("\t") && !data.isEmpty()) {
                data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t")));
            }
        }

        List<TrainingScheduleEntry> entries = new ArrayList<>();
        for (List<String> event : data) {
            TrainingScheduleEntry entry = new TrainingScheduleEntry();
            entry.setStart(df2.parse(event.get(0) + " " + event.get(1)));
            entry.setEnd(df2.parse(event.get(0) + " " + event.get(2)));
            entry.setGroup(event.get(4));
            entry.setTitle(event.get(5));
            // The first six characters of the notes cell are dropped when it is longer than that.
            entry.setNotes(event.get(6).length() > 6 ? event.get(6).substring(6) : event.get(6));
            if (event.size() > 13) {
                // Rows with more than 13 cells carry extra notes cells starting at index 7.
                for (int i = 7; i < 7 + event.size() - 13; i++) {
                    entry.setNotes(entry.getNotes() + " " + event.get(i));
                }
            }
            // The remaining fields are addressed from the end of the row.
            entry.setInstructor(event.get(event.size() - 6).trim());
            entry.setUniform(event.get(event.size() - 5));
            entry.setLocation(event.get(event.size() - 2));
            entries.add(entry);
        }

        if (!entries.isEmpty()) {
            Collections.sort(entries);

            try (OutputStream os = Files.newOutputStream(processedPath(path));
                    ObjectOutputStream oos = new ObjectOutputStream(os)) {
                oos.writeObject(entries);
            }
            logger.info("Processed " + path);
            Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE);
            Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE);
            DateFormat df = new SimpleDateFormat("MMM d, yyyy");
            String payload = APNS.newPayload().category("scheduleCategory")
                    .alertTitle("Training Schedule Received")
                    .alertBody(entries.size() + " events found for "
                            + (start.before(end) ? df.format(start) + " - " + df.format(end)
                                    : df.format(start)))
                    .sound("default").customField("schedule", path.getParent().getFileName().toString() + "/"
                            + FilenameUtils.getBaseName(path.getFileName().toString()))
                    .build();
            PushDevices.getDevices().stream().forEach((device) -> {
                PushUtils.getService().push(device, payload);
            });
        }
    } catch (Exception e) {
        logger.error("Failed to process training schedule file: " + path, e);
        FAILED.add(path);
    }
}

From source file:rocky.sizecounter.SizeCounterUtil.java

License:Apache License

/**
 * Count Word's number of page from input directory.
 * /* w  w  w .j a v a 2 s.c  om*/
 * @param filePath .
 * @return Number of A4 pages
 */
public static int countWordFile(String filePath) {
    FileInputStream fis = null;
    int page = 0;
    try {
        fis = new FileInputStream(filePath);

        if (CommonUtil.getExtension(filePath).equals("doc")) { // When file is .DOC
            HWPFDocument doc = new HWPFDocument(fis);
            page = doc.getDocProperties().getCPg();
        } else if (CommonUtil.getExtension(filePath).equals("docx")) { // When file is .DOCX
            XWPFDocument doc = new XWPFDocument(fis);
            XWPFWordExtractor ex = new XWPFWordExtractor(doc);
            page = ex.getExtendedProperties().getUnderlyingProperties().getPages();
        }
    } catch (FileNotFoundException ex) {
        LOG.warn("File " + filePath + " not found", ex);
    } catch (IOException ex) {
        LOG.warn("Invalid when reading file.", ex);
    } catch (Exception ex) {
        LOG.warn("Can not count file " + filePath, ex);
    } finally {
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ex) {
                LOG.warn("Close the file input stream", ex);
            }
        }
    }
    return page;
}

From source file:ru.lisaprog.parser.ExtractText.java

License:Open Source License

/**
 * Extracts the plain text of a .docx file.
 *
 * @param file path of the Word (OOXML) document to read
 * @return the extracted text, or an empty string if the file cannot be opened or parsed
 */
public static String parseDOCX(String file) {
    // try-with-resources closes the stream on success and on failure.
    try (BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file))) {
        XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
        return word.getText();
    } catch (Exception e) {
        // Failures are deliberately reported as an empty result; logging through
        // Common.createLog(e) was disabled in the original and stays disabled here.
        return "";
    }
}

From source file:steffen.haertlein.file.FileObject.java

License:Apache License

/**
 * Reads the .docx file {@code f} and appends each line of its text to {@code lines}.
 *
 * <p>Failures are reported to the user in an error dialog and printed to standard error;
 * nothing is rethrown.
 */
private void readWordDocument() {
    // try-with-resources guarantees the file handle is released even if parsing fails.
    try (FileInputStream fs = new FileInputStream(f)) {
        XWPFDocument document = new XWPFDocument(OPCPackage.open(fs));
        XWPFWordExtractor docxReader = new XWPFWordExtractor(document);
        String text;
        try {
            text = docxReader.getText();
        } finally {
            // Close the extractor even when text extraction throws.
            docxReader.close();
        }
        for (String line : text.split("\n")) {
            lines.add(line);
        }
    } catch (InvalidFormatException e) {
        JOptionPane.showMessageDialog(null, "InvalidFormatException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        JOptionPane.showMessageDialog(null, "FileNotFoundException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (IOException e) {
        JOptionPane.showMessageDialog(null, "IOException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}