Example usage for org.apache.poi.hwpf.extractor.WordExtractor.getText()

List of usage examples for org.apache.poi.hwpf.extractor.WordExtractor.getText()

Introduction

On this page you can find example usages of org.apache.poi.hwpf.extractor.WordExtractor.getText().

Prototype

public String getText() 

Source Link

Document

Grab the text, based on the WordToTextConverter.

Usage

From source file:avoking.com.documentos.scheduler.core.Core.java

/**
 * Reads a Word document from the given stream and returns its text.
 *
 * @param doc stream that is handed to the {@code POIFSFileSystem} constructor
 * @return the string produced by {@code WordExtractor.getText()}
 * @throws IOException if building the file system or the extractor fails
 */
private String leerDoc(InputStream doc) throws IOException {
    final WordExtractor extractor = new WordExtractor(new POIFSFileSystem(doc));
    return extractor.getText();
}

From source file:axiom.util.TextExtractor.java

License:Open Source License

/**
 * Returns the text of the Word document read from the given stream.
 *
 * @param is stream passed directly to the {@code WordExtractor} constructor
 * @return the string produced by {@code WordExtractor.getText()}
 * @throws Exception whatever the extractor raises while reading the stream
 */
public static String msWordExtractor(InputStream is) throws Exception {
    return new WordExtractor(is).getText();
}

From source file:com.artech.prototype2.bardakov.utils.impl.MultiParserImpl.java

/**
 *      doc/docx/*from ww  w.  j  a  v a 2 s.c o m*/
 * @param FilePath -   
 * @return ?? ?
 */
private ArrayList<String> getListOfWordsFromDoc(String FilePath) {
    FileInputStream fis;
    List<String> result = new ArrayList<String>();
    if (FilePath.substring(FilePath.length() - 1).equals("x")) { //is a docx
        try {
            fis = new FileInputStream(new File(FilePath));
            XWPFDocument doc = new XWPFDocument(fis);
            XWPFWordExtractor extract = new XWPFWordExtractor(doc);
            // System.out.println(extract.getText());
            StringBuilder builder = new StringBuilder();
            builder.append(extract.getText());
            String[] words = builder.toString().split(" ");
            for (String s : words) {
                result.add(s);
            }
        } catch (IOException e) {

            e.printStackTrace();
        }
    } else { //is not a docx
        try {
            fis = new FileInputStream(new File(FilePath));
            HWPFDocument doc = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(doc);
            StringBuilder builder = new StringBuilder();
            builder.append(extractor.getText());
            String[] words = builder.toString().split(" ");
            for (String s : words) {
                result.add(s);
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return (ArrayList<String>) result;
}

From source file:com.docdoku.server.esindexer.ESTools.java

License:Open Source License

/**
 * Extracts the text of a Microsoft Word document, picking the extractor from the stream header.
 *
 * <p>The stream is wrapped in a {@code BufferedInputStream}, which is closed when the method
 * returns or throws.
 *
 * @param inputStream the document content
 * @return the text returned by {@code WordExtractor} when the stream has a POIFS header,
 *         otherwise the text returned by {@code XWPFWordExtractor}
 * @throws IOException if reading the stream fails
 */
private static String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream buffered = new BufferedInputStream(inputStream)) {
        if (!POIFSFileSystem.hasPOIFSHeader(buffered)) {
            // No POIFS header: read the stream as an OOXML document.
            return new XWPFWordExtractor(new XWPFDocument(buffered)).getText();
        }
        return new WordExtractor(buffered).getText();
    }
}

From source file:com.docdoku.server.IndexerBean.java

License:Open Source License

/**
 * Extracts the text of the file at {@code pathName} and adds it to the Lucene index
 * located at {@code indexPath}, under the name {@code fullName}.
 *
 * <p>The extraction strategy is chosen from the file extension (case-sensitive):
 * OpenDocument files, {@code .doc}, {@code .ppt}/{@code .pps} and {@code .txt} are indexed;
 * every other extension is silently ignored. All streams and readers opened here are
 * closed even when extraction or indexing fails.
 *
 * @param fullName name passed to {@code addDoc} for the indexed document
 * @param pathName path of the file whose content is indexed
 * @throws EJBException wrapping any index, parsing or I/O failure
 */
@Asynchronous
@Lock(LockType.WRITE)
public void addToIndex(String fullName, String pathName) {
    IndexWriter indexWriter = null;
    Directory indexDir = null;
    try {
        indexDir = FSDirectory.open(new File(indexPath));
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                IndexWriter.MaxFieldLength.LIMITED);
        int ext = pathName.lastIndexOf('.');
        String extension = "";
        if (ext != -1) {
            extension = pathName.substring(ext);
        }

        if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp")
                || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf")
                || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) {
            //OpenDocument file
            String text = readOpenDocumentText(pathName);
            try (Reader contentReader = new StringReader(text)) {
                addDoc(indexWriter, contentReader, fullName);
            }
        } else if (extension.equals(".doc")) {
            //MSWord Document
            String text;
            try (InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName))) {
                text = new WordExtractor(wordStream).getText();
            }
            try (Reader contentReader = new StringReader(text)) {
                addDoc(indexWriter, contentReader, fullName);
            }
        } else if (extension.equals(".ppt") || extension.equals(".pps")) {
            //MSPowerPoint Document
            String text;
            try (InputStream pptStream = new BufferedInputStream(new FileInputStream(pathName))) {
                PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream);
                try {
                    text = pptExtractor.getText(true, true);
                } finally {
                    pptExtractor.close();
                }
            }
            try (Reader contentReader = new StringReader(text)) {
                addDoc(indexWriter, contentReader, fullName);
            }
        } else if (extension.equals(".txt")) {
            //Text Document
            try (Reader contentReader = new BufferedReader(new FileReader(pathName))) {
                addDoc(indexWriter, contentReader, fullName);
            }
        }
        // Not indexed yet: .xls, .html, .htm, .csv, .xml, .rtf, .pdf, .msg (and anything else).
    } catch (CorruptIndexException ex) {
        throw new EJBException(ex);
    } catch (LockObtainFailedException ex) {
        // Release a lock left on the index directory before reporting the failure.
        try {
            if (IndexWriter.isLocked(indexDir)) {
                IndexWriter.unlock(indexDir);
            }
        } catch (IOException pIOEx) {
            throw new EJBException(pIOEx);
        }
        throw new EJBException(ex);
    } catch (ParserConfigurationException ex) {
        throw new EJBException(ex);
    } catch (SAXException ex) {
        throw new EJBException(ex);
    } catch (IOException ex) {
        throw new EJBException(ex);
    } finally {
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
        } catch (IOException ex) {
            throw new EJBException(ex);
        }
    }
}

/**
 * Returns the character data of the {@code content.xml} entry of an OpenDocument (zip) file.
 *
 * <p>Each SAX {@code characters} event is appended followed by {@code "\r\n"}. If the archive
 * has no {@code content.xml} entry, the result is the empty string.
 *
 * @param pathName path of the OpenDocument file
 * @return the concatenated character data
 * @throws IOException if the archive cannot be read
 * @throws ParserConfigurationException if no SAX parser can be created
 * @throws SAXException if {@code content.xml} cannot be parsed
 */
private static String readOpenDocumentText(String pathName)
        throws IOException, ParserConfigurationException, SAXException {
    final StringBuilder text = new StringBuilder();
    try (ZipInputStream zipOpenDoc = new ZipInputStream(
            new BufferedInputStream(new FileInputStream(pathName)))) {
        ZipEntry zipEntry;
        while ((zipEntry = zipOpenDoc.getNextEntry()) != null) {
            if (zipEntry.getName().equals("content.xml")) {
                // NOTE(review): the parser uses the factory defaults; if indexed files can come
                // from untrusted users, consider disabling external entity resolution here.
                SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
                parser.parse(zipOpenDoc, new DefaultHandler() {

                    @Override
                    public void characters(char[] ch, int start, int length) throws SAXException {
                        text.append(ch, start, length);
                        text.append("\r\n");
                    }
                });
                break;
            }
        }
    }
    return text.toString();
}

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java

License:Open Source License

/**
 * Extracts the text and the meta information of a Word document.
 *
 * <p>When {@code version} equals {@code ContentHandler.VERSION_2003} the stream is read with
 * {@code WordExtractor} and the summary information fields are stored on this instance;
 * otherwise it is read with {@code XWPFWordExtractor} and the core properties are stored.
 * In both cases {@code extractMetaInformation()} is called afterwards, followed by
 * {@code cleanup()}.
 *
 * @param in the document content
 * @param encoding not used by this implementation
 * @return a {@code CmsExtractionResult} holding the extracted text and the meta information
 * @throws Exception if the document cannot be read
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
 */
public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {
    final String text;
    final Map metaInfo;

    if (!version.equals(ContentHandler.VERSION_2003)) {
        XWPFWordExtractor ooxmlExtractor = new XWPFWordExtractor(new XWPFDocument(in));
        text = ooxmlExtractor.getText();
        cp = ooxmlExtractor.getCoreProperties();
        metaInfo = extractMetaInformation();
    } else {
        WordExtractor binaryExtractor = new WordExtractor(in);
        text = binaryExtractor.getText();
        this.m_summary = binaryExtractor.getSummaryInformation();
        this.m_documentSummary = binaryExtractor.getDocSummaryInformation();
        metaInfo = extractMetaInformation();
    }

    // free some memory
    cleanup();

    // return the final result
    return new CmsExtractionResult(text, metaInfo);
}

From source file:com.isotrol.impe3.idx.oc.extractors.ExtractorMsWord.java

License:Open Source License

/**
 * Extrae el texto de un fichero word./*w  w w  .  j a va 2  s  .c  om*/
 * @param in
 * @return String. Devuelve el texto crudo
 * @throws Exception
 */
public static String extractText(InputStream in) throws Exception {

    String result = "";

    HWPFDocument doc = new HWPFDocument(in);

    WordExtractor we = new WordExtractor(doc);
    result = we.getText();

    // Eliminamos los caracteres que no nos sirven para indexar.
    result = ExtractorUtil.removeControlChars(result);

    return result;
}

From source file:com.jgaap.generics.DocumentHelper.java

License:Open Source License

/**
 * Extracts the text of a Word document as a character array.
 *
 * @param inputStream
 *            An input stream pointing to the Word document to be read.
 * @return the characters of the text returned by {@code WordExtractor.getText()}
 * @throws IOException
 *             if the document cannot be read from the stream
 */
static private char[] loadMSWord(InputStream inputStream) throws IOException {
    final HWPFDocument document = new HWPFDocument(new POIFSFileSystem(inputStream));
    return new WordExtractor(document).getText().toCharArray();
}

From source file:com.virtusa.isq.vtaf.runtime.SeleniumTestBase.java

License:Apache License

/**
 * Reads a Word (.doc) file and returns its text, also printing it to standard output.
 *
 * <p>The file stream is closed in all cases, including when the document cannot be parsed.
 * On an {@link IOException} the stack trace is printed and the failure is reported through
 * {@code reportresult} and {@code checkTrue}.
 *
 * @param fileName
 *            the file name
 * @return the document text, or {@code null} if an {@link IOException} occurred and the
 *         failure reporting calls returned normally
 */
public final String readDocFile(final String fileName) {
    String docContent = null;
    File file = new File(fileName);
    // try-with-resources: the stream used to leak when HWPFDocument or getText() threw.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);

        docContent = we.getText();
        System.out.println("MS Word(.doc) Document Red, Content:" + docContent);
    } catch (IOException e) {
        e.printStackTrace();
        reportresult(true, "CHECK DOCUMENT :", "FAILED",
                "CheckDocument command NODECOUNT : Execption occured. Actual error : " + e.getMessage());
        checkTrue(false, false,
                "CheckDocument command NODECOUNT : Execption occured. Actual error : " + e.getMessage());
    }
    return docContent;

}

From source file:de.catma.document.source.contenthandler.Doc2TxtExtractor.java

License:Open Source License

/**
 * Extracts the text of the Word document at {@code path} and writes it either to
 * {@code outputPath} (UTF-8) or, when {@code outputPath} is {@code null}, to standard output.
 *
 * <p>Text detected as {@code FileOSType.UNIX} is converted with
 * {@code FileOSType.convertUnixToDos}. The text length is printed before and after that step.
 * The output of two {@code addTest} calls is appended after the text.
 *
 * @param path location of the Word document
 * @param outputPath target file, or {@code null} to print to standard output
 * @throws IOException if reading the document or writing the output fails
 */
private void extractTo(String path, String outputPath) throws IOException {
    BufferedInputStream input = new BufferedInputStream(new File(path).toURI().toURL().openStream());
    try {
        String text = new WordExtractor(input).getText();
        System.out.println("before: " + text.length());
        if (FileOSType.getFileOSType(text).equals(FileOSType.UNIX)) {
            text = FileOSType.convertUnixToDos(text);
        }
        System.out.println("after: " + text.length());

        // NOTE(review): addTest is defined elsewhere; presumably it appends sample data - confirm.
        StringBuilder samples = new StringBuilder();
        addTest(text, 10, 20, samples);
        addTest(text, 10, 80, samples);

        if (outputPath == null) {
            System.out.println(text);
            System.out.println(samples.toString());
            return;
        }

        Writer fileWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath), "UTF-8"));
        try {
            fileWriter.append(text);
            fileWriter.append(samples.toString());
        } finally {
            fileWriter.close();
        }
    } finally {
        input.close();
    }
}