Example usage for org.apache.poi.hwpf.extractor WordExtractor WordExtractor

List of usage examples for org.apache.poi.hwpf.extractor WordExtractor WordExtractor

Introduction

On this page you can find example usages of the WordExtractor constructor of org.apache.poi.hwpf.extractor.WordExtractor.

Prototype

public WordExtractor(HWPFDocument doc) 

Source Link

Document

Create a new Word Extractor

Usage

From source file:at.tugraz.sss.serv.SSFileU.java

License:Apache License

/**
 * Writes the text of a Word file into a PDF file, adding one PDF paragraph per paragraph
 * returned by the Word text extractor.
 *
 * <p>Only plain text is transferred. The length and text of every paragraph are also
 * printed to standard output.
 *
 * @param docFilePath path of the Word file to read
 * @param pdfFilePath path of the PDF file to write
 * @throws Exception if reading the Word file or writing the PDF fails
 */
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception {

    final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
    final HWPFDocument word = new HWPFDocument(fs);
    final WordExtractor we = new WordExtractor(word);
    final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

    try {
        final Document document = new Document();
        final PdfWriter writer = PdfWriter.getInstance(document, out);

        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        final String[] paragraphs = we.getParagraphText();

        for (int i = 0; i < paragraphs.length; i++) {

            // Remove the line terminators the extractor leaves in each paragraph.
            final String text = paragraphs[i].replaceAll("\\cM?\r?\n", "");

            System.out.println("Length:" + text.length());
            System.out.println("Paragraph" + i + ": " + text);

            // add the paragraph to the document
            document.add(new Paragraph(text));
        }

        document.close();
    } finally {
        // Release the output file even when PDF generation fails part-way.
        out.close();
    }
}

From source file:at.tugraz.sss.serv.util.SSFileU.java

License:Apache License

/**
 * Writes the text of a Word file into a PDF file, adding one PDF paragraph per paragraph
 * returned by the Word text extractor.
 *
 * <p>Only plain text is transferred. The length and text of every paragraph are also
 * printed to standard output. Any failure is handed to {@code SSServErrReg.regErrThrow}.
 *
 * @param docFilePath path of the Word file to read
 * @param pdfFilePath path of the PDF file to write
 * @throws SSErr declared for failures reported through {@code SSServErrReg.regErrThrow}
 */
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr {

    try {
        final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
        final HWPFDocument word = new HWPFDocument(fs);
        final WordExtractor we = new WordExtractor(word);
        final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

        try {
            final Document document = new Document();
            final PdfWriter writer = PdfWriter.getInstance(document, out);

            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            final String[] paragraphs = we.getParagraphText();

            for (int i = 0; i < paragraphs.length; i++) {

                // Remove the line terminators the extractor leaves in each paragraph.
                final String text = paragraphs[i].replaceAll("\\cM?\r?\n", "");

                System.out.println("Length:" + text.length());
                System.out.println("Paragraph" + i + ": " + text);

                // add the paragraph to the document
                document.add(new Paragraph(text));
            }

            document.close();
        } finally {
            // Release the output file even when PDF generation fails part-way.
            out.close();
        }
    } catch (Exception error) {
        SSServErrReg.regErrThrow(error);
    }
}

From source file:avoking.com.documentos.scheduler.core.Core.java

/**
 * Reads a Word document from the given stream and returns its text.
 *
 * @param doc stream positioned at the start of the Word document
 * @return the text reported by the Word extractor
 * @throws IOException if the stream cannot be read as a Word document
 */
private String leerDoc(InputStream doc) throws IOException {
    final WordExtractor extractor = new WordExtractor(new POIFSFileSystem(doc));
    return extractor.getText();
}

From source file:axiom.util.TextExtractor.java

License:Open Source License

/**
 * Returns the text of the Word document supplied by the given stream.
 *
 * @param is stream positioned at the start of the Word document
 * @return the text reported by the Word extractor
 * @throws Exception if the document cannot be read
 */
public static String msWordExtractor(InputStream is) throws Exception {
    return new WordExtractor(is).getText();
}

From source file:br.com.schumaker.beta.doc.ReadDocMaster.java

/**
 * Prints the paragraphs of a hard-coded Word document whose field-stripped text is longer
 * than ten characters.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    File file = new File(
            "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc");
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(file.getAbsolutePath());
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(doc);

        for (String rawText : extractor.getParagraphText()) {
            String text = extractor.stripFields(rawText);
            if (text.length() > 10) {
                System.out.println(text.trim());
            }
        }
    } catch (Exception exep) {
        // Report the failure instead of silently producing no output.
        System.err.println("Could not read " + file + ": " + exep);
    } finally {
        if (fis != null) {
            try {
                fis.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing the input file fails.
            }
        }
    }
}

From source file:com.artech.prototype2.bardakov.utils.impl.MultiParserImpl.java

/**
 * Extracts the text of a doc/docx file and splits it on single spaces.
 *
 * <p>A path ending in {@code "x"} is read as a docx file; any other path is read as a
 * binary doc file. An {@link IOException} is printed to standard error and results in an
 * empty list.
 *
 * @param FilePath path of the document to read
 * @return the space-separated tokens of the document text, or an empty list if the file
 *         could not be read
 */
private ArrayList<String> getListOfWordsFromDoc(String FilePath) {
    ArrayList<String> result = new ArrayList<String>();
    try {
        FileInputStream fis = new FileInputStream(new File(FilePath));
        try {
            String text;
            // endsWith is safe for an empty path, unlike substring(length - 1).
            if (FilePath.endsWith("x")) { //is a docx
                text = new XWPFWordExtractor(new XWPFDocument(fis)).getText();
            } else { //is not a docx
                text = new WordExtractor(new HWPFDocument(fis)).getText();
            }
            for (String word : text.split(" ")) {
                result.add(word);
            }
        } finally {
            // Always release the file handle, whether or not extraction succeeded.
            fis.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}

From source file:com.docdoku.server.esindexer.ESTools.java

License:Open Source License

/**
 * Returns the text of a Word document read from the given stream.
 *
 * <p>The stream is wrapped in a buffer and closed when this method returns. If the
 * buffered stream has a POIFS header it is read with {@code WordExtractor}; otherwise it
 * is read with {@code XWPFWordExtractor}.
 *
 * @param inputStream stream positioned at the start of the document
 * @return the extracted text
 * @throws IOException if the document cannot be read
 */
private static String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream buffered = new BufferedInputStream(inputStream)) {
        if (!POIFSFileSystem.hasPOIFSHeader(buffered)) {
            XWPFWordExtractor ooxmlExtractor = new XWPFWordExtractor(new XWPFDocument(buffered));
            return ooxmlExtractor.getText();
        }
        WordExtractor binaryExtractor = new WordExtractor(buffered);
        return binaryExtractor.getText();
    }
}

From source file:com.docdoku.server.IndexerBean.java

License:Open Source License

/**
 * Extracts the text of the file at {@code pathName} and adds it to the index stored at
 * {@code indexPath} under the name {@code fullName}.
 *
 * <p>The file type is chosen from the text after the last {@code '.'} in
 * {@code pathName}, compared case-sensitively. OpenDocument files, {@code .doc},
 * {@code .ppt}/{@code .pps} and {@code .txt} are indexed; every other extension listed
 * below is recognized but currently adds nothing to the index.
 *
 * <p>Index, parser and I/O failures are rethrown wrapped in {@link EJBException}.
 *
 * @param fullName name passed to {@code addDoc} for the indexed content
 * @param pathName path of the file whose content is indexed
 */
@Asynchronous
@Lock(LockType.WRITE)
public void addToIndex(String fullName, String pathName) {
    IndexWriter indexWriter = null;
    Directory indexDir = null;
    try {
        indexDir = FSDirectory.open(new File(indexPath));
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                IndexWriter.MaxFieldLength.LIMITED);
        // Extension including the leading dot, or "" when the path has no dot.
        int ext = pathName.lastIndexOf('.');
        String extension = "";
        if (ext != -1) {
            extension = pathName.substring(ext);
        }

        // NOTE(review): the input streams below are closed only on the success path;
        // an exception before close() leaves them open.
        if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp")
                || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf")
                || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) {
            // OpenDocument file: a zip archive whose text is taken from the content.xml entry.
            final StringBuilder text = new StringBuilder();
            ZipInputStream zipOpenDoc = new ZipInputStream(
                    new BufferedInputStream(new FileInputStream(pathName)));
            ZipEntry zipEntry;
            while ((zipEntry = zipOpenDoc.getNextEntry()) != null) {
                if (zipEntry.getName().equals("content.xml")) {
                    // NOTE(review): the parser is created with factory defaults; consider
                    // disabling external entities if indexed files can be untrusted.
                    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
                    SAXParser parser = saxParserFactory.newSAXParser();
                    parser.parse(zipOpenDoc, new DefaultHandler() {

                        // Collects character data, ending each reported chunk with CRLF.
                        @Override
                        public void characters(char[] ch, int start, int length) throws SAXException {
                            for (int i = start; i < start + length; i++) {
                                text.append(ch[i]);
                            }
                            text.append("\r\n");
                        }
                    });
                    // Only content.xml is of interest; skip the remaining entries.
                    break;
                }
            }
            zipOpenDoc.close();
            Reader contentReader = new StringReader(text.toString());
            addDoc(indexWriter, contentReader, fullName);
            contentReader.close();
        } else if (extension.equals(".doc")) {
            //MSWord Document
            InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName));
            WordExtractor wordExtractor = new WordExtractor(wordStream);
            Reader contentReader = new StringReader(wordExtractor.getText());
            wordStream.close();
            addDoc(indexWriter, contentReader, fullName);
            contentReader.close();
        } else if (extension.equals(".ppt") || extension.equals(".pps")) {
            //MSPowerPoint Document
            InputStream pptStream = new BufferedInputStream(new FileInputStream(pathName));
            PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream);
            Reader contentReader = new StringReader(pptExtractor.getText(true, true));
            pptStream.close();
            addDoc(indexWriter, contentReader, fullName);
            pptExtractor.close();
            contentReader.close();
        } else if (extension.equals(".txt")) {
            //Text Document
            Reader contentReader = new BufferedReader(new FileReader(pathName));
            addDoc(indexWriter, contentReader, fullName);
            contentReader.close();
        } else if (extension.equals(".xls")) {
            // Excel indexing is disabled; the previous implementation is kept below.
            //MSExcelExtractor Document
            //InputStream excelStream=new BufferedInputStream(new FileInputStream(pathName));
            //ExcelExtractor excelExtractor= new ExcelExtractor(excelStream);
            //Reader contentReader=new StringReader(excelExtractor.getText());
            //excelStream.close();
            //addDoc(indexWriter,contentReader,fullName);
            //excelExtractor.close();
            //contentReader.close();
        } else if (extension.equals(".html") || extension.equals(".htm")) {
            // Recognized extension; nothing is indexed for it.
        } else if (extension.equals(".csv")) {
        } else if (extension.equals(".xml")) {
        } else if (extension.equals(".rtf")) {
        } else if (extension.equals(".pdf")) {
        } else if (extension.equals(".msg")) {
        }
    } catch (CorruptIndexException ex) {
        throw new EJBException(ex);
    } catch (LockObtainFailedException ex) {
        // The index lock could not be obtained: release it if the directory is still
        // locked, then report the original failure.
        try {
            if (IndexWriter.isLocked(indexDir)) {
                IndexWriter.unlock(indexDir);
            }
        } catch (IOException pIOEx) {
            throw new EJBException(pIOEx);
        }
        throw new EJBException(ex);
    } catch (ParserConfigurationException ex) {
        throw new EJBException(ex);
    } catch (SAXException ex) {
        throw new EJBException(ex);
    } catch (IOException ex) {
        throw new EJBException(ex);
    } finally {
        // Always close the writer if it was created.
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
        } catch (IOException ex) {
            throw new EJBException(ex);
        }
    }
}

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsWord.java

License:Open Source License

/**
 * Extracts the text and meta information of a Word document.
 *
 * <p>When {@code version} equals {@code ContentHandler.VERSION_2003} the stream is read
 * with {@code WordExtractor} and the summary information fields are stored on this
 * extractor. Otherwise it is read with {@code XWPFWordExtractor} and its core properties
 * are stored. In both cases {@code extractMetaInformation()} is then called, followed by
 * {@code cleanup()}.
 *
 * @param in stream positioned at the start of the Word document
 * @param encoding not used by this implementation
 * @return the extraction result holding the document text and its meta information
 * @throws Exception if the document cannot be read
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
 */
public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

    String result;
    if (version.equals(ContentHandler.VERSION_2003)) {
        WordExtractor ex = new WordExtractor(in);
        result = ex.getText();
        this.m_summary = ex.getSummaryInformation();
        this.m_documentSummary = ex.getDocSummaryInformation();
    } else {
        XWPFWordExtractor ex = new XWPFWordExtractor(new XWPFDocument(in));
        result = ex.getText();
        cp = ex.getCoreProperties();
    }

    // Must run after the branch above has stored the document's properties.
    Map metaInfo = extractMetaInformation();

    // free some memory
    cleanup();

    // return the final result
    return new CmsExtractionResult(result, metaInfo);
}

From source file:com.isotrol.impe3.idx.oc.extractors.ExtractorMsWord.java

License:Open Source License

/**
 * Extrae el texto de un fichero word./*from   www  .j  a v a 2  s.c  om*/
 * @param in
 * @return String. Devuelve el texto crudo
 * @throws Exception
 */
public static String extractText(InputStream in) throws Exception {

    String result = "";

    HWPFDocument doc = new HWPFDocument(in);

    WordExtractor we = new WordExtractor(doc);
    result = we.getText();

    // Eliminamos los caracteres que no nos sirven para indexar.
    result = ExtractorUtil.removeControlChars(result);

    return result;
}