Example usage for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

Introduction

This page collects example usages of the method org.apache.poi.xwpf.extractor.XWPFWordExtractor.getText().

Prototype

public String getText()

Source Link

Usage

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Extracts the text of a .docx file using {@link XWPFWordExtractor#getText()}.
 *
 * @param file the document to read
 * @return the extracted text, or {@code null} if the file cannot be opened or parsed
 *         (the exception's stack trace is printed in that case)
 */
public static String getDocxText(File file) {
    // try-with-resources: the file handle is released whether or not parsing succeeds.
    try (FileInputStream in = new FileInputStream(file)) {
        XWPFDocument docx = new XWPFDocument(in);
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        return extractor.getText();
    } catch (Exception e) {
        // Best-effort helper: report the failure and fall through to the null result.
        e.printStackTrace();
    }
    return null;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java

License:Open Source License

/**
 * Write document content to document artifact as its raw content
 *
 * @param registry/*from ww  w  .  j  a  v a 2  s  . c o  m*/
 * @param documentResource
 * @return
 * @throws RegistryException
 * @throws IOException
 * @throws APIManagementException
 */
private String fetchDocumentContent(Registry registry, Resource documentResource)
        throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry,
            APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);

    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        Association fileAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_FILE_ASSOCIATION);
        Association fileAssociation;

        if (fileAssociations.length < 1) {
            String error = "No document associated to API";
            log.error(error);
            throw new APIManagementException(error);
        }

        //a file document can have one file association
        fileAssociation = fileAssociations[0];
        String contentPath = fileAssociation.getDestinationPath();

        if (!registry.resourceExists(contentPath)) {
            String error = "API not found at " + contentPath;
            log.error(error);
            throw new APIManagementException(error);
        }

        Resource contentResource = registry.get(contentPath);

        String fileName = ((ResourceImpl) contentResource).getName();
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
            case APIConstants.PDF_EXTENSION:
                PDFParser pdfParser = new PDFParser(inputStream);
                pdfParser.parse();
                COSDocument cosDocument = pdfParser.getDocument();
                PDFTextStripper stripper = new PDFTextStripper();
                contentString = stripper.getText(new PDDocument(cosDocument));
                break;
            case APIConstants.DOC_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                contentString = msWord2003Extractor.getText();
                break;
            }
            case APIConstants.DOCX_EXTENSION:
                XWPFDocument doc = new XWPFDocument(inputStream);
                XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                contentString = msWord2007Extractor.getText();
                break;
            case APIConstants.XLS_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(pfs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.XLSX_EXTENSION:
                XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                contentString = xssfExcelExtractor.getText();
                break;
            case APIConstants.PPT_EXTENSION: {
                POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.PPTX_EXTENSION:
                XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                contentString = xslfPowerPointExtractor.getText();
                break;
            case APIConstants.TXT_EXTENSION:
            case APIConstants.WSDL_EXTENSION:
            case APIConstants.XML_DOC_EXTENSION:
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                String line;
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
                break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }

    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        Association contentAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION);
        Association contentAssociation;

        //an inline document can have one or no content associations
        if (contentAssociations.length == 1) {
            contentAssociation = contentAssociations[0];
            String contentPath = contentAssociation.getDestinationPath();

            if (registry.resourceExists(contentPath)) {
                Resource contentResource = registry.get(contentPath);

                InputStream instream = null;
                BufferedReader reader = null;
                String line;
                try {
                    instream = contentResource.getContentStream();
                    reader = new BufferedReader(new InputStreamReader(instream));
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                } finally {
                    if (reader != null) {
                        IOUtils.closeQuietly(reader);
                    }
                }
            }
        }
    }
    return contentString;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.MSWordIndexerTest.java

License:Open Source License

/**
 * Indexes the same file three times against mocked POI constructors and checks the media type
 * field of each resulting document. The stubbed {@code POIFSFileSystem} constructor throws
 * {@code OfficeXmlFileException} on the first call, returns a mock on the second and throws
 * {@code APIManagementException} on the third.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    // Legacy (.doc) collaborators.
    POIFSFileSystem mockedFileSystem = Mockito.mock(POIFSFileSystem.class);
    WordExtractor mockedDocExtractor = Mockito.mock(WordExtractor.class);
    Mockito.when(mockedDocExtractor.getText()).thenReturn("");

    // OOXML (.docx) collaborators.
    XWPFDocument mockedDocx = Mockito.mock(XWPFDocument.class);
    XWPFWordExtractor mockedDocxExtractor = Mockito.mock(XWPFWordExtractor.class);
    Mockito.when(mockedDocxExtractor.getText()).thenReturn("");

    // Constructor interception: the POIFSFileSystem stub changes its outcome on each call.
    PowerMockito.whenNew(POIFSFileSystem.class)
            .withArguments(Mockito.anyObject())
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(mockedFileSystem)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(WordExtractor.class).withArguments(mockedFileSystem).thenReturn(mockedDocExtractor);
    PowerMockito.whenNew(XWPFDocument.class)
            .withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(mockedDocx);
    PowerMockito.whenNew(XWPFWordExtractor.class).withArguments(mockedDocx).thenReturn(mockedDocxExtractor);

    MSWordIndexer indexer = new MSWordIndexer();

    // should return the default media type when media type is not defined in file2Index
    failUnlessMediaType(indexer.getIndexedDocument(file2Index), "application/pdf");

    // should return the media type we have set in the file2Index
    file2Index.mediaType = "text/html";
    failUnlessMediaType(indexer.getIndexedDocument(file2Index), "text/html");

    // should return the media type we have set in the file2Index even if exception occurred while reading the file
    file2Index.mediaType = "text/html";
    failUnlessMediaType(indexer.getIndexedDocument(file2Index), "text/html");
}

/** Fails the test unless the first media-type field value of {@code indexed} equals {@code expected}. */
private void failUnlessMediaType(IndexDocument indexed, String expected) {
    if (!expected.equals(indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) {
        Assert.fail();
    }
}

From source file:org.wso2.carbon.pc.core.DocumentIndexer.java

License:Open Source License

/**
 * Builds an index document from the bytes of a Word file.
 * <p>
 * The data is first read as a Word 2003 file; if POI reports it is an OOXML file, it is read
 * again as a Word 2007 file. Any other extraction failure is logged and the document is indexed
 * with {@code null} text. The media type field is taken from {@code fileData.mediaType}, or
 * {@code "application/pdf"} when none is set.
 *
 * @param fileData path, raw bytes and optional media type of the file to index
 * @return the index document carrying the extracted text plus "path" and media type fields
 * @throws SolrException     if an {@link IOException} escapes the extraction; the
 *                           {@code IOException} is attached as the cause
 * @throws RegistryException declared by the interface; not thrown by the visible code
 */
@Override
public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData)
        throws SolrException, RegistryException {
    try {
        String wordText = null;
        try {
            //Extract MSWord 2003 document files
            POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));

            WordExtractor msWord2003Extractor = new WordExtractor(fs);
            wordText = msWord2003Extractor.getText();

        } catch (OfficeXmlFileException e) {
            //if 2003 extraction failed, try with MSWord 2007 document files extractor
            XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data));

            XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
            wordText = msWord2007Extractor.getText();

        } catch (Exception e) {
            //The reason for not throwing an exception is that since this is an indexer that runs in the background
            //throwing an exception might lead to adverse behaviors in the client side and might lead to
            //other files not being indexed
            String msg = "Failed to extract the document while indexing";
            log.error(msg, e);
        }
        IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null);

        Map<String, List<String>> fields = new HashMap<String, List<String>>();
        fields.put("path", Arrays.asList(fileData.path));
        if (fileData.mediaType != null) {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType));
        } else {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf"));
        }

        indexDoc.setFields(fields);

        return indexDoc;

    } catch (IOException e) {
        // Exceptions thrown inside the OfficeXmlFileException handler are not seen by the sibling
        // catch (Exception) above, so an IOException from the Word 2007 fallback lands here.
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Pass the IOException along as the cause so callers can see the root failure.
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg, e);
    }
}

From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java

License:Open Source License

/**
 * Parses a training schedule from a .docx file, serializes the entries to
 * {@code processedPath(path)} and pushes a notification to every registered device.
 * <p>
 * The extracted text is split into lines. A line whose first tab-separated cell parses as
 * {@code "MMM dd, yyyy"} starts a new event row; a line starting with a tab is appended to the
 * most recent row. Any exception marks the file as failed by adding it to {@code FAILED}.
 *
 * @param path the Word file to process
 */
private void processWordFile(Path path) {
    try {
        String text;
        // Close the file stream as soon as the document has been read.
        try (java.io.InputStream in = Files.newInputStream(path)) {
            XWPFDocument doc = new XWPFDocument(in);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            text = extractor.getText();
        }

        List<List<String>> data = new ArrayList<>();
        DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy");
        DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm");
        for (String line : text.split("\n")) {
            String[] cells = line.split("\t");
            // A line made only of tabs splits to an empty array; it cannot start a row.
            if (cells.length > 0) {
                try {
                    df1.parse(cells[0]);
                    List<String> list = new ArrayList<>();
                    list.addAll(Arrays.asList(cells));
                    data.add(list);
                } catch (ParseException ignored) {
                    // Not a dated line, so it does not start a new event row.
                }
            }
            // Continuation line: append its cells to the current row. Skipped when no row
            // exists yet, so a leading tab line no longer fails the whole file.
            if (line.startsWith("\t") && !data.isEmpty()) {
                data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t")));
            }
        }

        List<TrainingScheduleEntry> entries = new ArrayList<>();
        for (List<String> event : data) {
            TrainingScheduleEntry entry = new TrainingScheduleEntry();
            entry.setStart(df2.parse(event.get(0) + " " + event.get(1)));
            entry.setEnd(df2.parse(event.get(0) + " " + event.get(2)));
            entry.setGroup(event.get(4));
            entry.setTitle(event.get(5));
            // NOTE(review): drops the first 6 characters of cell 6 when it is longer than that;
            // presumably a fixed label prefix. Confirm against a sample document.
            entry.setNotes(event.get(6).length() > 6 ? event.get(6).substring(6) : event.get(6));
            // Rows with more than 13 cells: the extra cells after index 6 are appended to the notes.
            if (event.size() > 13) {
                for (int i = 7; i < 7 + event.size() - 13; i++) {
                    entry.setNotes(entry.getNotes() + " " + event.get(i));
                }
            }
            // The trailing cells are addressed from the end of the row.
            entry.setInstructor(event.get(event.size() - 6).trim());
            entry.setUniform(event.get(event.size() - 5));
            entry.setLocation(event.get(event.size() - 2));
            entries.add(entry);
        }

        if (!entries.isEmpty()) {
            Collections.sort(entries);

            try (OutputStream os = Files.newOutputStream(processedPath(path));
                    ObjectOutputStream oos = new ObjectOutputStream(os)) {
                oos.writeObject(entries);
            }
            logger.info("Processed " + path);
            Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE);
            Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE);
            DateFormat df = new SimpleDateFormat("MMM d, yyyy");
            String payload = APNS.newPayload().category("scheduleCategory")
                    .alertTitle("Training Schedule Received")
                    .alertBody(entries.size() + " events found for "
                            + (start.before(end) ? df.format(start) + " - " + df.format(end)
                                    : df.format(start)))
                    .sound("default").customField("schedule", path.getParent().getFileName().toString() + "/"
                            + FilenameUtils.getBaseName(path.getFileName().toString()))
                    .build();
            PushDevices.getDevices().stream().forEach((device) -> {
                PushUtils.getService().push(device, payload);
            });
        }
    } catch (Exception e) {
        logger.error("Failed to process training schedule file: " + path, e);
        FAILED.add(path);
    }
}

From source file:ru.lisaprog.parser.ExtractText.java

License:Open Source License

/**
 * Extracts the text of a .docx file.
 *
 * @param file path of the document to read
 * @return the text from {@link XWPFWordExtractor#getText()}, or an empty string if the file
 *         cannot be opened or parsed
 */
public static String parseDOCX(String file) {
    // try-with-resources: the file handle is released whether or not parsing succeeds.
    try (BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file))) {
        XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
        return word.getText();
    } catch (Exception e) {
        // Best-effort: failures yield an empty result. Logging was disabled in the original:
        //         Common.createLog(e);
        return "";
    }
}

From source file:steffen.haertlein.file.FileObject.java

License:Apache License

/**
 * Reads the Word document {@code f} and appends each line of its text to {@code lines}.
 * <p>
 * Failures are reported to the user in an error dialog and the stack trace is printed;
 * nothing is rethrown.
 */
private void readWordDocument() {
    // try-with-resources: the file stream is closed even when opening or parsing fails.
    try (FileInputStream fs = new FileInputStream(f)) {
        XWPFDocument document = new XWPFDocument(OPCPackage.open(fs));
        XWPFWordExtractor docxReader = new XWPFWordExtractor(document);
        String text;
        try {
            text = docxReader.getText();
        } finally {
            // Close the extractor even if text extraction throws.
            docxReader.close();
        }
        for (String line : text.split("\n")) {
            lines.add(line);
        }
    } catch (InvalidFormatException e) {
        JOptionPane.showMessageDialog(null, "InvalidFormatException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        JOptionPane.showMessageDialog(null, "FileNotFoundException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (IOException e) {
        JOptionPane.showMessageDialog(null, "IOException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}