Example usage for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

List of usage examples for org.apache.poi.xwpf.extractor XWPFWordExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.xwpf.extractor.XWPFWordExtractor.getText().

Prototype

public String getText() 

Source Link

Usage

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Extracts the plain text of a .docx (OOXML Word) file.
 *
 * @param file the .docx file to read
 * @return the text produced by {@code XWPFWordExtractor.getText()}, or {@code null}
 *         if any exception occurs while opening or parsing the file (the stack
 *         trace is printed in that case)
 */
public static String getDocxText(File file) {
    // try-with-resources releases the file handle even when parsing fails;
    // previously the stream was never closed.
    try (FileInputStream in = new FileInputStream(file)) {
        XWPFDocument docx = new XWPFDocument(in);
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        return extractor.getText();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java

License:Open Source License

/**
 * Write document content to document artifact as its raw content
 *
 * @param registry/*from ww  w  .  j  a  v a 2  s  . c o  m*/
 * @param documentResource
 * @return
 * @throws RegistryException
 * @throws IOException
 * @throws APIManagementException
 */
private String fetchDocumentContent(Registry registry, Resource documentResource)
        throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry,
            APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);

    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        Association fileAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_FILE_ASSOCIATION);
        Association fileAssociation;

        if (fileAssociations.length < 1) {
            String error = "No document associated to API";
            log.error(error);
            throw new APIManagementException(error);
        }

        //a file document can have one file association
        fileAssociation = fileAssociations[0];
        String contentPath = fileAssociation.getDestinationPath();

        if (!registry.resourceExists(contentPath)) {
            String error = "API not found at " + contentPath;
            log.error(error);
            throw new APIManagementException(error);
        }

        Resource contentResource = registry.get(contentPath);

        String fileName = ((ResourceImpl) contentResource).getName();
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
            case APIConstants.PDF_EXTENSION:
                PDFParser pdfParser = new PDFParser(inputStream);
                pdfParser.parse();
                COSDocument cosDocument = pdfParser.getDocument();
                PDFTextStripper stripper = new PDFTextStripper();
                contentString = stripper.getText(new PDDocument(cosDocument));
                break;
            case APIConstants.DOC_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                contentString = msWord2003Extractor.getText();
                break;
            }
            case APIConstants.DOCX_EXTENSION:
                XWPFDocument doc = new XWPFDocument(inputStream);
                XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                contentString = msWord2007Extractor.getText();
                break;
            case APIConstants.XLS_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(pfs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.XLSX_EXTENSION:
                XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                contentString = xssfExcelExtractor.getText();
                break;
            case APIConstants.PPT_EXTENSION: {
                POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.PPTX_EXTENSION:
                XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                contentString = xslfPowerPointExtractor.getText();
                break;
            case APIConstants.TXT_EXTENSION:
            case APIConstants.WSDL_EXTENSION:
            case APIConstants.XML_DOC_EXTENSION:
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                String line;
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
                break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }

    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        Association contentAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION);
        Association contentAssociation;

        //an inline document can have one or no content associations
        if (contentAssociations.length == 1) {
            contentAssociation = contentAssociations[0];
            String contentPath = contentAssociation.getDestinationPath();

            if (registry.resourceExists(contentPath)) {
                Resource contentResource = registry.get(contentPath);

                InputStream instream = null;
                BufferedReader reader = null;
                String line;
                try {
                    instream = contentResource.getContentStream();
                    reader = new BufferedReader(new InputStreamReader(instream));
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                } finally {
                    if (reader != null) {
                        IOUtils.closeQuietly(reader);
                    }
                }
            }
        }
    }
    return contentString;
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.MSWordIndexerTest.java

License:Open Source License

/**
 * Verifies the media-type field of the document returned by
 * {@code MSWordIndexer.getIndexedDocument} across three consecutive calls,
 * with all POI constructors replaced by PowerMockito stubs.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    // Mocked POI collaborators handed back by the stubbed constructors below.
    POIFSFileSystem poiFS = Mockito.mock(POIFSFileSystem.class);
    WordExtractor legacyExtractor = Mockito.mock(WordExtractor.class);
    XWPFWordExtractor docxExtractor = Mockito.mock(XWPFWordExtractor.class);
    XWPFDocument docxDocument = Mockito.mock(XWPFDocument.class);

    // Successive POIFSFileSystem constructions: first throws OfficeXmlFileException,
    // second yields the mock, third throws APIManagementException.
    PowerMockito.whenNew(POIFSFileSystem.class)
            .withArguments(Mockito.anyObject())
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(poiFS)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(WordExtractor.class).withArguments(poiFS).thenReturn(legacyExtractor);
    PowerMockito.whenNew(XWPFDocument.class)
            .withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(docxDocument);
    PowerMockito.whenNew(XWPFWordExtractor.class).withArguments(docxDocument).thenReturn(docxExtractor);
    Mockito.when(legacyExtractor.getText()).thenReturn("");
    Mockito.when(docxExtractor.getText()).thenReturn("");
    MSWordIndexer indexer = new MSWordIndexer();

    // First call: no media type set on file2Index, so the default is expected.
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    Object reportedMediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"application/pdf".equals(reportedMediaType)) {
        Assert.fail();
    }

    // Second call: the media type set on file2Index must be echoed back.
    file2Index.mediaType = "text/html";
    indexed = indexer.getIndexedDocument(file2Index);
    reportedMediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(reportedMediaType)) {
        Assert.fail();
    }

    // Third call: same expectation even if an exception occurred while reading the file.
    file2Index.mediaType = "text/html";
    indexed = indexer.getIndexedDocument(file2Index);
    reportedMediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(reportedMediaType)) {
        Assert.fail();
    }
}

From source file:org.wso2.carbon.pc.core.DocumentIndexer.java

License:Open Source License

/**
 * Builds the index document for a Word file held in memory.
 *
 * <p>The bytes are first parsed as a legacy (OLE2) Word document; if POI reports
 * that the data is actually OOXML, parsing is retried with the .docx extractor.
 * Any other failure of the legacy attempt is logged and the document is indexed
 * with {@code null} text.
 *
 * @param fileData path, raw bytes and optional media type of the file to index
 * @return an index document carrying the extracted text (possibly {@code null}),
 *         a "path" field, and a media-type field that falls back to
 *         "application/pdf" when {@code fileData.mediaType} is {@code null}
 * @throws SolrException if an {@link IOException} escapes extraction, which from
 *         the visible code can only originate in the .docx fallback; the
 *         original exception is attached as the cause
 * @throws RegistryException declared by the overridden method; not thrown by the
 *         code visible here
 */
@Override
public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData)
        throws SolrException, RegistryException {
    try {
        String wordText = null;
        try {
            //Extract MSWord 2003 document files
            POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));

            WordExtractor msWord2003Extractor = new WordExtractor(fs);
            wordText = msWord2003Extractor.getText();

        } catch (OfficeXmlFileException e) {
            //if 2003 extraction failed, try with MSWord 2007 document files extractor
            // NOTE(review): exceptions raised in this fallback are NOT handled by the
            // sibling catch below; an IOException reaches the outer handler and
            // runtime exceptions propagate to the caller.
            XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data));

            XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
            wordText = msWord2007Extractor.getText();

        } catch (Exception e) {
            //The reason for not throwing an exception is that since this is an indexer that runs in the background
            //throwing an exception might lead to adverse behaviors in the client side and might lead to
            //other files not being indexed
            String msg = "Failed to extract the document while indexing";
            log.error(msg, e);
        }
        IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null);

        Map<String, List<String>> fields = new HashMap<String, List<String>>();
        fields.put("path", Arrays.asList(fileData.path));
        if (fileData.mediaType != null) {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType));
        } else {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf"));
        }

        indexDoc.setFields(fields);

        return indexDoc;

    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the root cause attached so callers can see why extraction failed.
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg, e);
    }
}

From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java

License:Open Source License

/**
 * Parses a training-schedule Word document, serialises the resulting entries
 * next to it and pushes a notification summarising the schedule.
 *
 * <p>The extracted text is split into lines. A line whose first tab-separated
 * cell parses as a "MMM dd, yyyy" date starts a new event row; a line that
 * begins with a tab is treated as a continuation and its cells are appended to
 * the most recent event row. All other lines are ignored.
 *
 * <p>Any exception is logged and the path is added to {@code FAILED}; nothing
 * is rethrown.
 *
 * @param path the .docx file to process
 */
private void processWordFile(Path path) {
    try {
        String text;
        // Close the file handle as soon as the text has been extracted;
        // previously the stream was never closed.
        try (java.io.InputStream in = Files.newInputStream(path)) {
            XWPFDocument doc = new XWPFDocument(in);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            text = extractor.getText();
        }

        List<List<String>> data = new ArrayList<>();
        DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy");
        DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm");
        for (String line : text.split("\n")) {
            if (line.startsWith("\t")) {
                // Continuation of the previous event row. A continuation that appears
                // before any event row has nothing to attach to and is skipped
                // (this used to index data at -1 and fail the whole file).
                if (!data.isEmpty()) {
                    data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t")));
                }
                continue;
            }
            // A line that does not start with a tab always yields at least one cell.
            String[] cells = line.split("\t");
            try {
                df1.parse(cells[0]);
            } catch (ParseException notAnEventRow) {
                // First cell is not a date: this line does not start an event.
                continue;
            }
            data.add(new ArrayList<>(Arrays.asList(cells)));
        }

        List<TrainingScheduleEntry> entries = new ArrayList<>();
        for (List<String> event : data) {
            TrainingScheduleEntry entry = new TrainingScheduleEntry();
            entry.setStart(df2.parse(event.get(0) + " " + event.get(1)));
            entry.setEnd(df2.parse(event.get(0) + " " + event.get(2)));
            entry.setGroup(event.get(4));
            entry.setTitle(event.get(5));
            // Drop the first six characters of the notes cell when it is longer than that.
            entry.setNotes(event.get(6).length() > 6 ? event.get(6).substring(6) : event.get(6));
            if (event.size() > 13) {
                // Rows with more than 13 cells carry extra note cells starting at index 7.
                for (int i = 7; i < 7 + event.size() - 13; i++) {
                    entry.setNotes(entry.getNotes() + " " + event.get(i));
                }
            }
            // The trailing columns are addressed from the end of the row.
            entry.setInstructor(event.get(event.size() - 6).trim());
            entry.setUniform(event.get(event.size() - 5));
            entry.setLocation(event.get(event.size() - 2));
            entries.add(entry);
        }

        if (!entries.isEmpty()) {
            Collections.sort(entries);

            try (OutputStream os = Files.newOutputStream(processedPath(path));
                    ObjectOutputStream oos = new ObjectOutputStream(os)) {
                oos.writeObject(entries);
            }
            logger.info("Processed " + path);
            Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE);
            Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE);
            DateFormat df = new SimpleDateFormat("MMM d, yyyy");
            String payload = APNS.newPayload().category("scheduleCategory")
                    .alertTitle("Training Schedule Received")
                    .alertBody(entries.size() + " events found for "
                            + (start.before(end) ? df.format(start) + " - " + df.format(end)
                                    : df.format(start)))
                    .sound("default").customField("schedule", path.getParent().getFileName().toString() + "/"
                            + FilenameUtils.getBaseName(path.getFileName().toString()))
                    .build();
            PushDevices.getDevices().stream().forEach((device) -> {
                PushUtils.getService().push(device, payload);
            });
        }
    } catch (Exception e) {
        logger.error("Failed to process training schedule file: " + path, e);
        FAILED.add(path);
    }
}

From source file:ru.lisaprog.parser.ExtractText.java

License:Open Source License

/**
 * Extracts the plain text of a .docx file.
 *
 * @param file path of the .docx file to read
 * @return the text produced by {@code XWPFWordExtractor.getText()}, or an empty
 *         string if any exception occurs while opening or parsing the file
 */
public static String parseDOCX(String file) {
    // try-with-resources closes the file handle on both the success and the
    // failure path; previously the stream was never closed.
    try (BufferedInputStream isr = new BufferedInputStream(new FileInputStream(file))) {
        XWPFWordExtractor word = new XWPFWordExtractor(new XWPFDocument(isr));
        return word.getText();
    } catch (Exception ignored) {
        // Best-effort extraction: failures are deliberately reported to the
        // caller as empty text rather than as an exception.
        return "";
    }
}

From source file:steffen.haertlein.file.FileObject.java

License:Apache License

/**
 * Reads the Word document referenced by the field {@code f}, splits its text on
 * newlines and appends every line to {@code lines}.
 *
 * <p>Failures are reported to the user in an error dialog and the stack trace is
 * printed; nothing is rethrown.
 */
private void readWordDocument() {
    // try-with-resources closes the file even when opening the package or
    // extracting the text fails; previously fs.close() was skipped on any exception.
    try (FileInputStream fs = new FileInputStream(f)) {
        XWPFDocument document = new XWPFDocument(OPCPackage.open(fs));
        XWPFWordExtractor docxReader = new XWPFWordExtractor(document);
        String text;
        try {
            text = docxReader.getText();
        } finally {
            // Always release the extractor, even if getText() throws.
            docxReader.close();
        }
        for (String line : text.split("\n")) {
            lines.add(line);
        }
    } catch (InvalidFormatException e) {
        JOptionPane.showMessageDialog(null, "InvalidFormatException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        JOptionPane.showMessageDialog(null, "FileNotFoundException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    } catch (IOException e) {
        JOptionPane.showMessageDialog(null, "IOException in readWordDocument", "Fehler",
                JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}