Example usage for the org.apache.poi.xwpf.extractor.XWPFWordExtractor constructor XWPFWordExtractor(XWPFDocument)

List of usage examples for the org.apache.poi.xwpf.extractor.XWPFWordExtractor constructor XWPFWordExtractor(XWPFDocument)

Introduction

On this page you can find example usages of the org.apache.poi.xwpf.extractor.XWPFWordExtractor constructor XWPFWordExtractor(XWPFDocument).

Prototype

public XWPFWordExtractor(XWPFDocument document) 

Source Link

Usage

From source file:org.terrier.indexing.POIDocument.java

License:Mozilla Public License

/**
 * Picks the POI text extractor matching the extension of {@code filename}.
 *
 * The extension check is a case-sensitive {@code endsWith} test on the file name.
 *
 * @param filename  name of the document; only its suffix is inspected
 * @param docStream stream positioned at the start of the document content
 * @return an extractor for the detected Office format, or {@code null} when the
 *         suffix is not one of the supported ones
 * @throws IOException if the chosen extractor fails while reading the stream
 */
protected POITextExtractor getExtractor(String filename, InputStream docStream) throws IOException {
    final POITextExtractor extractor;
    if (filename.endsWith(".doc")) {
        // Word (binary format)
        extractor = new WordExtractor(docStream);
    } else if (filename.endsWith(".docx")) {
        // Word (OOXML)
        extractor = new XWPFWordExtractor(new XWPFDocument(docStream));
    } else if (filename.endsWith(".ppt")) {
        // PowerPoint (binary format)
        extractor = new PowerPointExtractor(docStream);
    } else if (filename.endsWith(".pptx")) {
        // PowerPoint (OOXML)
        extractor = new XSLFPowerPointExtractor(new XMLSlideShow(docStream));
    } else if (filename.endsWith(".pub")) {
        // Publisher
        extractor = new PublisherTextExtractor(docStream);
    } else if (filename.endsWith(".xls")) {
        // Excel (binary format)
        extractor = new ExcelExtractor(new POIFSFileSystem(docStream));
    } else if (filename.endsWith(".xlsx")) {
        // Excel (OOXML)
        extractor = new org.apache.poi.xssf.extractor.XSSFExcelExtractor(new XSSFWorkbook(docStream));
    } else if (filename.endsWith(".vsd")) {
        // Visio
        extractor = new VisioTextExtractor(docStream);
    } else {
        // Unsupported suffix: callers receive null.
        extractor = null;
    }
    return extractor;
}

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Extracts the plain text of a .docx file.
 *
 * @param file the .docx file to read
 * @return the extracted text, or {@code null} if the file could not be opened or parsed
 *         (the failure's stack trace is printed)
 */
public static String getDocxText(File file) {
    // try-with-resources guarantees the file handle is released even when parsing fails;
    // previously the stream was never closed.
    try (FileInputStream in = new FileInputStream(file)) {
        XWPFDocument docx = new XWPFDocument(in);
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        return extractor.getText();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java

License:Open Source License

/**
 * Extracts the text content of a documentation resource.
 *
 * For a FILE source the associated file is read and converted to text according to its
 * extension (PDF, Word, Excel, PowerPoint, or plain text/WSDL/XML). For an INLINE source the
 * associated content resource is read as text.
 *
 * NOTE: for the line-based readers the lines are concatenated without any separator, so
 * line breaks of the original content are not preserved in the returned string.
 *
 * @param registry         registry used to resolve the document artifact and its associations
 * @param documentResource the documentation resource whose content is wanted
 * @return the extracted text, or {@code null} when the source type is neither FILE nor INLINE,
 *         the file extension is not one of the handled ones, or an inline document has no
 *         (or no existing) content association
 * @throws RegistryException      if a registry lookup fails
 * @throws IOException            if the content cannot be read or parsed
 * @throws APIManagementException if a FILE document has no file association or the associated
 *                                content path does not exist
 */
private String fetchDocumentContent(Registry registry, Resource documentResource)
        throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry,
            APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);

    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        Association[] fileAssociations = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_FILE_ASSOCIATION);
        Association fileAssociation;

        if (fileAssociations.length < 1) {
            String error = "No document associated to API";
            log.error(error);
            throw new APIManagementException(error);
        }

        //a file document can have one file association
        fileAssociation = fileAssociations[0];
        String contentPath = fileAssociation.getDestinationPath();

        if (!registry.resourceExists(contentPath)) {
            String error = "API not found at " + contentPath;
            log.error(error);
            throw new APIManagementException(error);
        }

        Resource contentResource = registry.get(contentPath);

        String fileName = ((ResourceImpl) contentResource).getName();
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
            case APIConstants.PDF_EXTENSION: {
                PDFParser pdfParser = new PDFParser(inputStream);
                pdfParser.parse();
                COSDocument cosDocument = pdfParser.getDocument();
                // Wrap the parsed document first so that it is always closed, even when
                // creating the stripper or extracting the text fails. Previously the
                // PDDocument was never closed.
                PDDocument pdDocument = new PDDocument(cosDocument);
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    contentString = stripper.getText(pdDocument);
                } finally {
                    pdDocument.close();
                }
                break;
            }
            case APIConstants.DOC_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                contentString = msWord2003Extractor.getText();
                break;
            }
            case APIConstants.DOCX_EXTENSION:
                XWPFDocument doc = new XWPFDocument(inputStream);
                XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                contentString = msWord2007Extractor.getText();
                break;
            case APIConstants.XLS_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(pfs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.XLSX_EXTENSION:
                XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                contentString = xssfExcelExtractor.getText();
                break;
            case APIConstants.PPT_EXTENSION: {
                POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.PPTX_EXTENSION:
                XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                contentString = xslfPowerPointExtractor.getText();
                break;
            case APIConstants.TXT_EXTENSION:
            case APIConstants.WSDL_EXTENSION:
            case APIConstants.XML_DOC_EXTENSION:
                // Plain-text formats: read line by line; lines are joined without separators.
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                String line;
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
                break;
            }
        } finally {
            // Closing the raw stream also covers every reader/extractor layered on top of it.
            IOUtils.closeQuietly(inputStream);
        }

    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        Association[] contentAssociations = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION);
        Association contentAssociation;

        //an inline document can have one or no content associations
        if (contentAssociations.length == 1) {
            contentAssociation = contentAssociations[0];
            String contentPath = contentAssociation.getDestinationPath();

            if (registry.resourceExists(contentPath)) {
                Resource contentResource = registry.get(contentPath);

                InputStream instream = null;
                BufferedReader reader = null;
                String line;
                try {
                    instream = contentResource.getContentStream();
                    reader = new BufferedReader(new InputStreamReader(instream));
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                } finally {
                    if (reader != null) {
                        IOUtils.closeQuietly(reader);
                    }
                }
            }
        }
    }
    return contentString;
}

From source file:org.wso2.carbon.pc.core.DocumentIndexer.java

License:Open Source License

/**
 * Builds the index document for a Word file.
 *
 * The binary Word format is tried first; if POI reports the data as an Office XML file the
 * OOXML extractor is used instead. Any other extraction failure in the first attempt is
 * logged and the document is indexed with {@code null} text.
 *
 * @param fileData the file to index (raw bytes, path and optional media type)
 * @return the index document carrying the extracted text plus "path" and media-type fields
 * @throws SolrException if an {@link IOException} escapes the extraction (only the OOXML
 *                       fallback can raise one past the inner handlers)
 */
@Override
public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData)
        throws SolrException, RegistryException {
    try {
        String extractedText = null;
        try {
            // First attempt: MS Word 2003 (binary) format.
            WordExtractor binaryExtractor = new WordExtractor(
                    new POIFSFileSystem(new ByteArrayInputStream(fileData.data)));
            extractedText = binaryExtractor.getText();
        } catch (OfficeXmlFileException e) {
            // The data is Office XML, so fall back to the MS Word 2007 extractor.
            XWPFWordExtractor ooxmlExtractor = new XWPFWordExtractor(
                    new XWPFDocument(new ByteArrayInputStream(fileData.data)));
            extractedText = ooxmlExtractor.getText();
        } catch (Exception e) {
            // Deliberately not rethrown: this indexer runs in the background, and an
            // exception here could cause adverse client-side behaviour and prevent other
            // files from being indexed.
            log.error("Failed to extract the document while indexing", e);
        }
        IndexDocument indexDoc = new IndexDocument(fileData.path, extractedText, null);

        Map<String, List<String>> fields = new HashMap<String, List<String>>();
        fields.put("path", Arrays.asList(fileData.path));
        if (fileData.mediaType == null) {
            // No media type supplied: the field falls back to this fixed value.
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf"));
        } else {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType));
        }
        indexDoc.setFields(fields);

        return indexDoc;
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg);
    }
}

From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java

License:Open Source License

/**
 * Parses a training-schedule .docx file, stores the resulting entries next to it and sends a
 * push notification summarising them.
 *
 * The document text is split into lines. A line whose first tab-separated cell parses as
 * "MMM dd, yyyy" starts a new event row; a line starting with a tab is treated as a
 * continuation whose cells are appended to the most recent event row. Any failure is logged
 * and the path is added to {@code FAILED}.
 *
 * @param path the .docx schedule file to process
 */
private void processWordFile(Path path) {
    try {
        // Read the complete text up front inside try-with-resources so the file handle is
        // released before parsing starts; previously the stream was never closed.
        String text;
        try (java.io.InputStream in = Files.newInputStream(path)) {
            text = new XWPFWordExtractor(new XWPFDocument(in)).getText();
        }

        List<List<String>> data = new ArrayList<>();
        DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy");
        DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm");
        for (String line : text.split("\n")) {
            try {
                // Used purely as a probe: only lines starting with a date open a new row.
                df1.parse(line.split("\t")[0]);
                data.add(new ArrayList<>(Arrays.asList(line.split("\t"))));
            } catch (ParseException ignored) {
                // Not a dated line; it may still be a continuation line handled below.
            }
            if (line.startsWith("\t")) {
                // Continuation: append its cells (minus the leading tab) to the last row.
                data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t")));
            }
        }

        List<TrainingScheduleEntry> entries = new ArrayList<>();
        for (List<String> event : data) {
            TrainingScheduleEntry entry = new TrainingScheduleEntry();
            entry.setStart(df2.parse(event.get(0) + " " + event.get(1)));
            entry.setEnd(df2.parse(event.get(0) + " " + event.get(2)));
            entry.setGroup(event.get(4));
            entry.setTitle(event.get(5));
            // The first six characters of the notes cell are dropped when it is longer than that.
            entry.setNotes(event.get(6).length() > 6 ? event.get(6).substring(6) : event.get(6));
            if (event.size() > 13) {
                // Rows with more than 13 cells carry extra cells that are folded into the notes.
                for (int i = 7; i < 7 + event.size() - 13; i++) {
                    entry.setNotes(entry.getNotes() + " " + event.get(i));
                }
            }
            // The trailing cells are addressed relative to the end of the row.
            entry.setInstructor(event.get(event.size() - 6).trim());
            entry.setUniform(event.get(event.size() - 5));
            entry.setLocation(event.get(event.size() - 2));
            entries.add(entry);
        }

        if (!entries.isEmpty()) {
            Collections.sort(entries);

            try (OutputStream os = Files.newOutputStream(processedPath(path));
                    ObjectOutputStream oos = new ObjectOutputStream(os)) {
                oos.writeObject(entries);
            }
            logger.info("Processed " + path);
            Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE);
            Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE);
            DateFormat df = new SimpleDateFormat("MMM d, yyyy");
            String payload = APNS.newPayload().category("scheduleCategory")
                    .alertTitle("Training Schedule Received")
                    .alertBody(entries.size() + " events found for "
                            + (start.before(end) ? df.format(start) + " - " + df.format(end)
                                    : df.format(start)))
                    .sound("default").customField("schedule", path.getParent().getFileName().toString() + "/"
                            + FilenameUtils.getBaseName(path.getFileName().toString()))
                    .build();
            PushDevices.getDevices().stream().forEach((device) -> {
                PushUtils.getService().push(device, payload);
            });
        }
    } catch (Exception e) {
        logger.error("Failed to process training schedule file: " + path, e);
        FAILED.add(path);
    }
}

From source file:rocky.sizecounter.SizeCounterUtil.java

License:Apache License

/**
 * Count Word's number of page from input directory.
 * /* w  w  w .j a v a 2 s.c  om*/
 * @param filePath .
 * @return Number of A4 pages
 */
public static int countWordFile(String filePath) {
    FileInputStream fis = null;
    int page = 0;
    try {
        fis = new FileInputStream(filePath);

        if (CommonUtil.getExtension(filePath).equals("doc")) { // When file is .DOC
            HWPFDocument doc = new HWPFDocument(fis);
            page = doc.getDocProperties().getCPg();
        } else if (CommonUtil.getExtension(filePath).equals("docx")) { // When file is .DOCX
            XWPFDocument doc = new XWPFDocument(fis);
            XWPFWordExtractor ex = new XWPFWordExtractor(doc);
            page = ex.getExtendedProperties().getUnderlyingProperties().getPages();
        }
    } catch (FileNotFoundException ex) {
        LOG.warn("File " + filePath + " not found", ex);
    } catch (IOException ex) {
        LOG.warn("Invalid when reading file.", ex);
    } catch (Exception ex) {
        LOG.warn("Can not count file " + filePath, ex);
    } finally {
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ex) {
                LOG.warn("Close the file input stream", ex);
            }
        }
    }
    return page;
}

From source file:ru.lisaprog.parser.ExtractText.java

License:Open Source License

/**
 * Extracts the plain text of a .docx file.
 *
 * @param file path of the .docx file
 * @return the extracted text, or an empty string if the file cannot be opened or parsed
 */
public static String parseDOCX(String file) {
    // try-with-resources releases the file handle on every path; previously the stream
    // was never closed.
    try (BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file))) {
        XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
        return word.getText();
    } catch (Exception e) {
        // Failures are intentionally reported to the caller as empty text (no logging here).
        return "";
    }
}

From source file:steffen.haertlein.file.FileObject.java

License:Apache License

/**
 * Reads the Word document {@code f} and appends each of its text lines to {@code lines}.
 *
 * Failures are reported to the user through an error dialog and the stack trace is printed;
 * in that case nothing is appended.
 */
private void readWordDocument() {
    try {
        String text;
        // Both the file stream and the extractor are closed on every path, including when
        // opening or parsing fails; previously they were only closed on success.
        try (FileInputStream fs = new FileInputStream(f);
                XWPFWordExtractor docxReader = new XWPFWordExtractor(
                        new XWPFDocument(OPCPackage.open(fs)))) {
            text = docxReader.getText();
        }
        // Lines are appended only after the resources were closed successfully.
        for (String line : text.split("\n")) {
            lines.add(line);
        }
    } catch (InvalidFormatException e) {
        JOptionPane.showMessageDialog(null, "InvalidFormatException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        JOptionPane.showMessageDialog(null, "FileNotFoundException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (IOException e) {
        JOptionPane.showMessageDialog(null, "IOException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}