Example usage for the org.apache.poi.xslf.extractor.XSLFPowerPointExtractor constructor

List of usage examples for the org.apache.poi.xslf.extractor.XSLFPowerPointExtractor constructor

Introduction

On this page you can find example usages of the org.apache.poi.xslf.extractor.XSLFPowerPointExtractor constructor.

Prototype

public XSLFPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException 

Source Link

Usage

From source file:com.docdoku.server.esindexer.ESTools.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document supplied as a stream.
 *
 * <p>The stream is buffered and probed with {@code POIFSFileSystem.hasPOIFSHeader}: when the
 * header is present the binary {@code PowerPointExtractor} is used, otherwise the stream is
 * loaded as an {@code XMLSlideShow} and read with {@code XSLFPowerPointExtractor}. Both
 * extractors are asked for text with every option flag set to {@code true}.
 *
 * <p>The buffered stream (and therefore {@code inputStream}) and the extractor are closed
 * before this method returns, whether it completes normally or throws.
 *
 * @param inputStream the PowerPoint document content; closed by this method
 * @return the text returned by the extractor
 * @throws IOException if the stream cannot be read or a resource cannot be closed
 */
private static String microsoftPowerPointDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream pptStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(pptStream)) {
            // Binary (OLE2) presentation: release the extractor as soon as the text is read.
            try (PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream)) {
                return pptExtractor.getText(true, true);
            }
        }
        // No OLE2 header: treat the content as an OOXML presentation.
        try (XSLFPowerPointExtractor pptxExtractor = new XSLFPowerPointExtractor(
                new XMLSlideShow(pptStream))) {
            return pptxExtractor.getText(true, true, true);
        }
    }
}

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java

License:Open Source License

/**
 * Reads the text of a PowerPoint 2007 (.pptx) presentation from a stream.
 *
 * <p>The stream is loaded as an {@code XMLSlideShow} and handed to an
 * {@code XSLFPowerPointExtractor}. The extractor's core properties are stored in
 * {@code this.cp} before the text is read.
 *
 * <p>Any exception is caught, its {@code toString()} is printed to standard output, and
 * {@code null} is returned.
 *
 * @param in stream positioned at the start of the presentation content
 * @return the text returned by the extractor, or {@code null} if an exception occurred
 */
public String readPowerPoint2007(InputStream in) {
    try {
        XSLFPowerPointExtractor pptxExtractor = new XSLFPowerPointExtractor(new XMLSlideShow(in));
        this.cp = pptxExtractor.getCoreProperties();
        return pptxExtractor.getText();
    } catch (Exception failure) {
        System.out.println(failure.toString());
    }
    return null;
}

From source file:com.jaeksoft.searchlib.parser.PptxParser.java

License:Open Source License

/**
 * Parses a .pptx document: copies the limited stream to a temporary file, opens it with
 * POI, and adds the core properties (title, creator, subject, description, keywords) and
 * the text content to a new parser result item. Language detection is then run on the
 * content field.
 *
 * <p>The temporary file is deleted once parsing has finished, whether or not it succeeded.
 *
 * @param streamLimiter source of the document content
 * @param lang          not used by this implementation
 * @throws IOException if copying or parsing fails; {@code OpenXML4JException} and
 *                     {@code XmlException} are wrapped in an {@code IOException}
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {

    // TODO Optimise if it is already a file
    File tempFile = File.createTempFile("oss", ".pptx");
    try {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(tempFile);
            IOUtils.copy(streamLimiter.getNewInputStream(), fos);
            fos.close();
        } catch (IOException e) {
            IOUtils.close(fos);
            throw e;
        }

        XSLFPowerPointExtractor poiExtractor = null;
        try {
            XSLFSlideShow pptSlideShow = new XSLFSlideShow(tempFile.getAbsolutePath());
            poiExtractor = new XSLFPowerPointExtractor(pptSlideShow);

            ParserResultItem result = getNewParserResultItem();
            CoreProperties info = poiExtractor.getCoreProperties();
            if (info != null) {
                result.addField(ParserFieldEnum.title, info.getTitle());
                result.addField(ParserFieldEnum.creator, info.getCreator());
                result.addField(ParserFieldEnum.subject, info.getSubject());
                result.addField(ParserFieldEnum.description, info.getDescription());
                result.addField(ParserFieldEnum.keywords, info.getKeywords());
            }

            String content = poiExtractor.getText(true, true);
            result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(content, " "));

            result.langDetection(10000, ParserFieldEnum.content);

        } catch (OpenXML4JException e) {
            throw new IOException(e);
        } catch (XmlException e) {
            throw new IOException(e);
        } finally {
            IOUtils.close(poiExtractor);
        }
    } finally {
        // The temp copy is only needed while the extractor is open; without this it
        // accumulates in the temp directory on every parse.
        if (!tempFile.delete()) {
            tempFile.deleteOnExit();
        }
    }

}

From source file:com.krawler.esp.fileparser.ppt.MsPPTParser.java

License:Open Source License

/**
 * Extracts the text of the .pptx file at the given path.
 *
 * <p>An {@code XmlException} is not propagated: its message is printed to standard output
 * and the empty string is returned. Every other exception is propagated to the caller.
 *
 * @param filepath path of the presentation file
 * @return the text returned by the extractor, or {@code ""} if an {@code XmlException} occurred
 * @throws Exception if the file cannot be opened or read
 */
public String extractText(String filepath) throws Exception {
    String resultText = "";
    // NOTE(review): this stream is never read; it is kept so that a missing file still
    // fails here first, as before. It is now closed on every path.
    InputStream input = new BufferedInputStream(new FileInputStream(filepath));
    try {
        XSLFSlideShow xsslsh = new XSLFSlideShow(filepath);
        XMLSlideShow xslsh = new XMLSlideShow(xsslsh);
        XSLFPowerPointExtractor ppt = new XSLFPowerPointExtractor(xslsh);
        resultText = ppt.getText();
    } catch (XmlException e) {
        System.out.print(e.getMessage());
    } finally {
        input.close();
    }
    return resultText;
}

From source file:com.opensearchserver.extractor.parser.Pptx.java

License:Apache License

/**
 * Parses a .pptx file: opens it as a slide show, copies the core properties (title,
 * creator, subject, description, keywords, creation and modification dates) into
 * {@code metas}, then hands the slide show to {@code extractSides}.
 *
 * @param file      the presentation file
 * @param extension not used by this implementation
 * @param mimeType  not used by this implementation
 * @throws Exception if the file cannot be opened or parsed
 */
@Override
protected void parseContent(File file, String extension, String mimeType) throws Exception {

    XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath());
    XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage());

    // Extract metadata
    XSLFPowerPointExtractor poiExtractor = null;
    try {
        poiExtractor = new XSLFPowerPointExtractor(slideshow);
        CoreProperties info = poiExtractor.getCoreProperties();
        if (info != null) {
            metas.add(TITLE, info.getTitle());
            metas.add(CREATOR, info.getCreator());
            metas.add(SUBJECT, info.getSubject());
            metas.add(DESCRIPTION, info.getDescription());
            metas.add(KEYWORDS, info.getKeywords());
            metas.add(CREATION_DATE, info.getCreated());
            metas.add(MODIFICATION_DATE, info.getModified());
        }
    } finally {
        // Guard against the constructor having thrown: an unconditional close() would
        // raise a NullPointerException that hides the original failure.
        if (poiExtractor != null) {
            poiExtractor.close();
        }
    }
    extractSides(slideshow);
}

From source file:com.opensearchserver.textextractor.parser.Pptx.java

License:Open Source License

/**
 * Parses a .pptx file: opens it as a slide show, copies the core properties (title,
 * creator, subject, description, keywords, creation and modification dates) into
 * {@code metas}, then hands the slide show to {@code extractSides}.
 *
 * @param file the presentation file
 * @throws Exception if the file cannot be opened or parsed
 */
@Override
protected void parseContent(File file) throws Exception {

    XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath());
    XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage());

    // Extract metadata
    XSLFPowerPointExtractor poiExtractor = null;
    try {
        poiExtractor = new XSLFPowerPointExtractor(slideshow);
        CoreProperties info = poiExtractor.getCoreProperties();
        if (info != null) {
            metas.add(TITLE, info.getTitle());
            metas.add(CREATOR, info.getCreator());
            metas.add(SUBJECT, info.getSubject());
            metas.add(DESCRIPTION, info.getDescription());
            metas.add(KEYWORDS, info.getKeywords());
            metas.add(CREATION_DATE, info.getCreated());
            metas.add(MODIFICATION_DATE, info.getModified());
        }
    } finally {
        // Guard against the constructor having thrown: an unconditional close() would
        // raise a NullPointerException that hides the original failure.
        if (poiExtractor != null) {
            poiExtractor.close();
        }
    }
    extractSides(slideshow);
}

From source file:com.qwazr.library.poi.PptxParser.java

License:Apache License

/**
 * Parses a .pptx file into the given result builder.
 *
 * <p>Opens the file as a slide show, sets the MIME type meta, copies the core properties
 * (title, creator, subject, description, keywords, creation and modification dates) into
 * the metas, closes the extractor, and finally passes the slide show to
 * {@code extractSides}.
 *
 * @param parameters    not used by this implementation
 * @param filePath      location of the presentation file
 * @param extension     file extension, forwarded to {@code findMimeType}
 * @param mimeType      declared MIME type, forwarded to {@code findMimeType}
 * @param resultBuilder receives the metas and the extracted slides
 * @throws Exception if the file cannot be opened or parsed
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final Path filePath,
        final String extension, final String mimeType, final ParserResultBuilder resultBuilder)
        throws Exception {

    final String absolutePath = filePath.toAbsolutePath().toString();
    final XMLSlideShow presentation = new XMLSlideShow(new XSLFSlideShow(absolutePath).getPackage());

    final ParserFieldsBuilder fields = resultBuilder.metas();
    fields.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));

    // Metadata comes from the package's core properties, when present.
    try (XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(presentation)) {
        final CoreProperties props = extractor.getCoreProperties();
        if (props != null) {
            fields.add(TITLE, props.getTitle());
            fields.add(CREATOR, props.getCreator());
            fields.add(SUBJECT, props.getSubject());
            fields.add(DESCRIPTION, props.getDescription());
            fields.add(KEYWORDS, props.getKeywords());
            fields.add(CREATION_DATE, props.getCreated());
            fields.add(MODIFICATION_DATE, props.getModified());
        }
    }
    extractSides(presentation, resultBuilder);
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the text of a .pptx document into a locked {@code Stream}.
 *
 * <p>The trimmed extractor text is written to the stream using
 * {@code Constants.XML_CHARACTER_ENCODING}. When extraction fails, the outcome depends on
 * the "ignore text extraction errors" configuration flag: if it is true the exception is
 * logged as a warning and an error stream is returned, otherwise a
 * {@code GenericSearchException} wrapping the cause is thrown.
 *
 * <p>The extractor is closed before returning; a failure while closing is only logged.
 *
 * @param doc the .pptx content
 * @return a locked stream containing the extracted text, or an error stream
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromPPTX(InputStream doc) throws GenericSearchException {
    long time = System.currentTimeMillis();
    boolean errorFlag = Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    XSLFPowerPointExtractor powerPointExtractor = null;
    try {
        powerPointExtractor = new XSLFPowerPointExtractor(OPCPackage.open(doc));
        String text = powerPointExtractor.getText(true, true).trim();
        Stream stream = new Stream();
        stream.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        stream.lock();
        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from pptx needed " + (System.currentTimeMillis() - time));
        }
        return stream;
    } catch (Exception e) {
        if (errorFlag) {
            logger.warn("", e);
            return createErrorStream(pptxTextExtractionErrorString);
        } else {
            throw new GenericSearchException("cannot parse pptx-file", e);
        }
    } finally {
        // Nulling the local released nothing; close the extractor explicitly.
        if (powerPointExtractor != null) {
            try {
                powerPointExtractor.close();
            } catch (Exception closeError) {
                // Best effort: a close failure must not replace the result or the original error.
                logger.warn("could not close pptx extractor", closeError);
            }
        }
    }
}

From source file:edu.ur.ir.index.DefaultPowerPointXmlTextExtractor.java

License:Apache License

/**
 * Extract text from a PowerPoint XML (.pptx) document.
 *
 * <p>Returns {@code null} without opening the file when {@code isFileTooLarge(f)} is true
 * or the file is empty. Also returns {@code null} when the extracted text is null or
 * blank. The opened package is always closed; a failure while closing is logged at debug
 * level only.
 *
 * @param f the presentation file
 * @return the extracted text, or {@code null} if there is none
 * @throws Exception if extraction fails; the failure is logged and rethrown
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    OPCPackage p = null;
    try {
        p = XSLFSlideShow.openPackage(f.getAbsolutePath());
        XSLFSlideShow slideShow = new XSLFSlideShow(p);
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slideShow);

        String myText = extractor.getText();
        if (myText != null && !myText.trim().equals("")) {
            return myText;
        }
        return null;
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        log.error("could not get text for PowerPoint document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        if (p != null) {
            try {
                p.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
    }
}

From source file:IO.search.SearchWordFile.java

/**
 * Checks whether one file matches the search keyword and, if so, records its path.
 *
 * <p>The keyword is interpreted in one of three ways:
 * <ul>
 *   <li>it contains a space: it is split on spaces and the parts are passed to
 *       {@code finding} together with the file text;</li>
 *   <li>otherwise it contains a hyphen: it is split on hyphens and the file matches only
 *       if its text contains every part (case-insensitive);</li>
 *   <li>otherwise the file matches if its text contains the keyword (case-insensitive).</li>
 * </ul>
 *
 * <p>Supported file types, selected by extension (case-insensitive): doc, docx, pdf, txt,
 * ppt and pptx. Other files are ignored. Any exception raised while reading or matching
 * is logged at SEVERE level and not propagated.
 *
 * @param scrFile the file to inspect
 * @param word    the search keyword
 */
private void search(File scrFile, String word) {
    // Space-separated keywords mean "any of"; hyphen-separated keywords mean "all of".
    String[] anyOf = null;
    String[] allOf = null;
    if (word.contains(" ")) {
        anyOf = word.split(" ");
    } else if (word.contains("-")) {
        allOf = word.split("-");
        System.out.println("reach");
    }

    String fileName = scrFile.getName();
    try {
        String text;
        if (fileName.matches("^.+\\.(?i)(doc)$")) {
            text = readDocText(scrFile);
        } else if (fileName.matches("^.+\\.(?i)(docx)$")) {
            text = readDocxText(scrFile);
        } else if (fileName.matches("^.+\\.(?i)(pdf)$")) {
            text = readPdfText(scrFile);
        } else if (fileName.matches("^.+\\.(?i)(txt)$")) {
            text = readTxtText(scrFile);
        } else if (fileName.matches("^.+\\.(?i)(ppt)$")) {
            text = readPptText(scrFile);
        } else if (fileName.matches("^.+\\.(?i)(pptx)$")) {
            text = readPptxText(scrFile);
        } else {
            return;
        }
        recordIfMatching(text, word, anyOf, allOf, scrFile);
    } catch (Exception ex) {
        Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
    }
}

/** Applies the keyword rules to the extracted text and records the file path on a match. */
private void recordIfMatching(String text, String word, String[] anyOf, String[] allOf, File scrFile) {
    if (anyOf != null && anyOf.length > 0) {
        finding(text, anyOf, scrFile);
        return;
    }
    String lowerText = text.toLowerCase();
    if (allOf != null && allOf.length > 0) {
        // Every hyphen-separated part must occur in the text.
        for (int i = 0; i < allOf.length; i++) {
            if (!lowerText.contains(allOf[i].toLowerCase())) {
                return;
            }
        }
        nameList.add(scrFile.getPath());
    } else if (lowerText.contains(word.toLowerCase())) {
        System.out.println("true");
        nameList.add(scrFile.getPath());
    }
}

/** Reads the text of a Word 97-2003 (.doc) file and closes the file stream. */
private String readDocText(File scrFile) throws Exception {
    InputStream is = new FileInputStream(scrFile);
    try {
        WordExtractor ex = new WordExtractor(is);
        return ex.getText();
    } finally {
        is.close();
    }
}

/** Reads the text of a .docx file and releases the package without saving it. */
private String readDocxText(File scrFile) throws Exception {
    OPCPackage opcPackage = POIXMLDocument.openPackage(scrFile.getPath());
    try {
        POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
        return extractor.getText();
    } finally {
        // revert() closes without writing back; the file is only being searched.
        opcPackage.revert();
    }
}

/** Reads the text of a .pdf file, closing the document and the file stream. */
private String readPdfText(File scrFile) throws Exception {
    FileInputStream input = new FileInputStream(scrFile);
    PDDocument pdfDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(input);
        pdfParser.parse();
        pdfDocument = pdfParser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } finally {
        try {
            // The document is null when parsing failed before it was created.
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            input.close();
        }
    }
}

/**
 * Reads a .txt file using the charset reported by {@code getCharset}.
 * NOTE(review): lines are concatenated without a separator, as before, so a keyword can
 * match across a line break - confirm this is intended.
 */
private String readTxtText(File scrFile) throws Exception {
    InputStream raw = new FileInputStream(scrFile);
    try {
        BufferedReader in = new BufferedReader(
                new InputStreamReader(raw, getCharset(scrFile.getAbsolutePath())));
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null) {
            sb.append(line);
        }
        return sb.toString();
    } finally {
        raw.close();
    }
}

/** Reads the text of a PowerPoint 97-2003 (.ppt) file, one line break per text group. */
private String readPptText(File scrFile) throws Exception {
    InputStream is = new FileInputStream(scrFile);
    try {
        StringBuilder content = new StringBuilder();
        HSLFSlideShow ss = new HSLFSlideShow(is);
        List<HSLFSlide> slides = ss.getSlides();
        System.out.println("total have " + slides.size() + " page PPT");
        for (HSLFSlide slide : slides) {
            List<List<HSLFTextParagraph>> textParagraphs = slide.getTextParagraphs();
            if (textParagraphs != null) {
                for (List<HSLFTextParagraph> group : textParagraphs) {
                    content.append("\n");
                    for (HSLFTextParagraph paragraph : group) {
                        content.append(paragraph.toString());
                    }
                }
            }
        }
        return content.toString();
    } finally {
        is.close();
    }
}

/** Reads the text of a .pptx file and releases the package without saving it. */
private String readPptxText(File scrFile) throws Exception {
    OPCPackage opcPackage = POIXMLDocument.openPackage(scrFile.getPath());
    try {
        return new XSLFPowerPointExtractor(opcPackage).getText();
    } finally {
        // revert() closes without writing back; the file is only being searched.
        opcPackage.revert();
    }
}