Example usage for org.apache.poi.hwpf.extractor WordExtractor getParagraphText

List of usage examples for org.apache.poi.hwpf.extractor WordExtractor getParagraphText

Introduction

On this page you can find example usages of org.apache.poi.hwpf.extractor WordExtractor getParagraphText.

Prototype

public String[] getParagraphText() 

Source Link

Document

Gets the text from the Word file as an array with one String per paragraph.

Usage

From source file:at.tugraz.sss.serv.SSFileU.java

License:Apache License

/**
 * Copies the plain text of a Word (.doc) file into a new PDF, adding one PDF
 * paragraph per paragraph returned by {@link WordExtractor#getParagraphText()}.
 *
 * <p>Only text is transferred. Trailing line terminators are stripped from every
 * paragraph before it is added, and each paragraph's length and content are
 * echoed to standard output.
 *
 * @param docFilePath path handed to {@code openFileForRead} to obtain the Word input
 * @param pdfFilePath path handed to {@code openOrCreateFileWithPathForWrite} to obtain the PDF output
 * @throws Exception if the Word file cannot be read or the PDF cannot be written
 */
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception {

    final Document document = new Document();
    final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
    final HWPFDocument word = new HWPFDocument(fs);
    final WordExtractor we = new WordExtractor(word);
    final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

    try {
        final PdfWriter writer = PdfWriter.getInstance(document, out);

        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        final String[] paragraphs = we.getParagraphText();

        // The per-index Range.getParagraph(i) lookup was removed: its result was
        // never used, and it can fail with an index error when the extractor's
        // paragraph array does not line up with the document range.
        for (int i = 0; i < paragraphs.length; i++) {
            // Drop the CR/LF terminator the extractor leaves at the end of a paragraph.
            paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
            System.out.println("Length:" + paragraphs[i].length());
            System.out.println("Paragraph" + i + ": " + paragraphs[i]);

            document.add(new Paragraph(paragraphs[i]));
        }

        document.close();
    } finally {
        // Release the output file even when the conversion fails half-way;
        // closing a stream that is already closed has no effect.
        out.close();
    }
}

From source file:at.tugraz.sss.serv.util.SSFileU.java

License:Apache License

/**
 * Copies the plain text of a Word (.doc) file into a new PDF, adding one PDF
 * paragraph per paragraph returned by {@link WordExtractor#getParagraphText()}.
 *
 * <p>Only text is transferred. Trailing line terminators are stripped from every
 * paragraph before it is added, and each paragraph's length and content are
 * echoed to standard output. Any failure is handed to
 * {@code SSServErrReg.regErrThrow}.
 *
 * @param docFilePath path handed to {@code openFileForRead} to obtain the Word input
 * @param pdfFilePath path handed to {@code openOrCreateFileWithPathForWrite} to obtain the PDF output
 * @throws SSErr presumably raised by {@code SSServErrReg.regErrThrow} when the conversion fails
 */
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr {

    try {
        final Document document = new Document();
        final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
        final HWPFDocument word = new HWPFDocument(fs);
        final WordExtractor we = new WordExtractor(word);
        final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

        try {
            final PdfWriter writer = PdfWriter.getInstance(document, out);

            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            final String[] paragraphs = we.getParagraphText();

            // The per-index Range.getParagraph(i) lookup was removed: its result was
            // never used, and it can fail with an index error when the extractor's
            // paragraph array does not line up with the document range.
            for (int i = 0; i < paragraphs.length; i++) {
                // Drop the CR/LF terminator the extractor leaves at the end of a paragraph.
                paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
                System.out.println("Length:" + paragraphs[i].length());
                System.out.println("Paragraph" + i + ": " + paragraphs[i]);

                document.add(new Paragraph(paragraphs[i]));
            }

            document.close();
        } finally {
            // Release the output file even when the conversion fails half-way;
            // closing a stream that is already closed has no effect.
            out.close();
        }
    } catch (Exception error) {
        SSServErrReg.regErrThrow(error);
    }
}

From source file:br.com.schumaker.beta.doc.ReadDocMaster.java

/**
 * Prints the paragraphs of a Word (.doc) file to standard output, with field
 * codes stripped and surrounding whitespace trimmed. Paragraphs whose stripped
 * text is ten characters or shorter are skipped.
 *
 * @param args optional; when a first argument is present it is used as the path
 *             of the file to read, otherwise the built-in sample path is used
 */
public static void main(String[] args) {
    final String path = (args != null && args.length > 0)
            ? args[0]
            : "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc";

    File file = new File(path);
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(doc);

        for (String rawText : extractor.getParagraphText()) {
            // stripFields is a static helper, so call it on the class.
            String text = WordExtractor.stripFields(rawText);
            if (text.length() > 10) {
                System.out.println(text.trim());
            }
        }
    } catch (Exception exep) {
        // Report the failure instead of exiting silently with no output.
        System.err.println("Could not read " + path + ": " + exep);
    }
}

From source file:com.jaeksoft.searchlib.parser.DocParser.java

License:Open Source License

/**
 * Extracts metadata and text from a Word binary stream into {@code result}.
 *
 * <p>When summary information is available, its title, author and subject are
 * added as fields. Every paragraph is then split on line feeds and each
 * fragment is added as a content field after being passed through
 * {@code StringUtils.replaceConsecutiveSpaces}.
 *
 * @param result      receiver of the extracted fields
 * @param inputStream stream positioned at the start of the Word document
 * @throws IOException if the document cannot be read
 */
private void currentWordExtraction(ParserResultItem result, InputStream inputStream) throws IOException {
    WordExtractor extractor = null;

    try {
        extractor = new WordExtractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            result.addField(ParserFieldEnum.title, summary.getTitle());
            result.addField(ParserFieldEnum.author, summary.getAuthor());
            result.addField(ParserFieldEnum.subject, summary.getSubject());
        }

        for (final String paragraph : extractor.getParagraphText()) {
            for (final String line : paragraph.split("\\n")) {
                final String normalized = StringUtils.replaceConsecutiveSpaces(line, " ");
                result.addField(ParserFieldEnum.content, normalized);
            }
        }
    } finally {
        IOUtils.close(extractor);
    }
}

From source file:com.opensearchserver.extractor.parser.Doc.java

License:Apache License

/**
 * Extracts metadata and text from a Word binary stream.
 *
 * <p>When summary information is available, its title, author, subject,
 * creation date, last-save date and keywords are added to {@code metas}.
 * Each paragraph is then added to a new parser document under {@code CONTENT},
 * followed by a single {@code LANG_DETECTION} entry computed by
 * {@code languageDetection(CONTENT, 10000)}.
 *
 * @param inputStream stream positioned at the start of the Word document
 * @throws IOException if the document cannot be read
 */
private void currentWordExtraction(InputStream inputStream) throws IOException {
    WordExtractor extractor = null;

    try {
        extractor = new WordExtractor(inputStream);

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            metas.add(KEYWORDS, summary.getKeywords());
        }

        final ParserDocument parsed = getNewParserDocument();
        for (final String text : extractor.getParagraphText()) {
            parsed.add(CONTENT, text);
        }
        // Added once, after all paragraphs have been collected.
        parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:com.pdf.GetPdf.java

/**
 * Downloads a Word document and appends its paragraph text to a PDF document.
 *
 * <p>When {@code type} is {@code "doc"} the content is read as a binary Word
 * file and every extracted paragraph is added with its trailing line terminator
 * removed. Any other {@code type} (including {@code null}) is read as .docx:
 * each paragraph body element is added exactly once, in document order. Tables
 * are not rendered.
 *
 * @param document the already opened PDF document that receives the paragraphs
 * @param url      location of the Word document to download
 * @param type     {@code "doc"} for the binary format; anything else is treated as .docx
 * @throws IOException       if the URL cannot be opened or the Word content cannot be read
 * @throws DocumentException if a paragraph cannot be added to {@code document}
 */
public static void docConvert(Document document, String url, String type)
        throws IOException, DocumentException {
    // try-with-resources so the connection is released on every path.
    try (java.io.InputStream in = new URL(url).openStream()) {
        if ("doc".equals(type)) {
            WordExtractor we = new WordExtractor(new HWPFDocument(in));
            for (String paragraph : we.getParagraphText()) {
                // Drop the CR/LF terminator the extractor leaves at the end of a paragraph.
                document.add(new Paragraph(paragraph.replaceAll("\\cM?\r?\n", "")));
            }
        } else {
            XWPFDocument wordDoc = new XWPFDocument(in);

            for (IBodyElement content : wordDoc.getBodyElements()) {
                if (content.getElementType() == BodyElementType.PARAGRAPH) {
                    // Add only this element. Walking content.getBody().getParagraphs()
                    // here would re-add every paragraph of the body once per
                    // paragraph element, duplicating the whole text N times.
                    XWPFParagraph para = (XWPFParagraph) content;
                    document.add(new Paragraph(para.getParagraphText()));
                }
                // Table elements are intentionally skipped: nothing was ever
                // written for them, so the no-op row/cell traversal was removed.
            }
        }
    }
}

From source file:com.unsa.view.MainView.java

License:Creative Commons License

/**
 * Converts the text of a Word (.doc) file into a PDF written next to it.
 *
 * <p>The output path is the input's absolute path with its last three
 * characters replaced by {@code "pdf"}, so the input name is expected to end in
 * a three-letter extension. Failures are printed via {@code printStackTrace()}
 * and the PDF document is closed in every case.
 *
 * @param file1 the Word file to convert
 */
private void DocConverterPDF(File file1) {
    com.lowagie.text.Document document = new com.lowagie.text.Document();

    try {
        String inputPath = file1.getAbsolutePath();
        System.out.println(inputPath);

        NPOIFSFileSystem fs = new NPOIFSFileSystem(new FileInputStream(inputPath));
        HWPFDocument doc = new HWPFDocument(fs.getRoot());
        WordExtractor we = new WordExtractor(doc);

        // Replace the trailing three characters ("doc") with "pdf".
        String output = inputPath.substring(0, inputPath.length() - 3);
        OutputStream fileout = new FileOutputStream(new File(output + "pdf"));

        PdfWriter writer = PdfWriter.getInstance(document, fileout);

        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        // The per-index Range.getParagraph(i) lookup was removed: its result was
        // never used, and it can fail with an index error when the extractor's
        // paragraph array does not line up with the document range.
        for (String paragraph : we.getParagraphText()) {
            // Drop the CR/LF terminator the extractor leaves at the end of a paragraph.
            document.add(new Paragraph(paragraph.replaceAll("\\cM?\r?\n", "")));
        }

    } catch (Exception e) {

        e.printStackTrace();
    } finally {

        document.close();
    }

}

From source file:cv_extractor.DocReader.java

/**
 * Scans every paragraph of a Word (.doc) file for the regular expression held
 * in {@code pattern} and appends the first match of each paragraph to
 * {@code data}. The paragraph count is printed to standard output; failures
 * are printed via {@code printStackTrace()}.
 *
 * @param localFile the Word file to scan
 */
protected static void readDocFile(File localFile) {
    // try-with-resources closes the stream even when parsing fails.
    try (FileInputStream fis = new FileInputStream(localFile.getAbsolutePath())) {
        // HWPF reads the binary .doc format (not .docx).
        HWPFDocument doc = new HWPFDocument(fis);

        WordExtractor we = new WordExtractor(doc);

        String[] paragraphs = we.getParagraphText();

        System.out.println("Total no of paragraph " + paragraphs.length);

        // Compile once; the expression does not change between paragraphs.
        Pattern r = Pattern.compile(pattern);

        for (String para : paragraphs) {
            Matcher m = r.matcher(para);

            if (m.find()) {
                // m.group() is the subsequence matched by the find() above.
                data.add(m.group());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:File.DOC.ReadDoc.java

/**
 * Prints the paragraph count and then every paragraph of the Word file
 * {@code path + namafile + ".doc"} to standard output. Failures are printed via
 * {@code printStackTrace()}.
 *
 * @param path     directory prefix, concatenated as-is in front of the file name
 * @param namafile file name without the {@code .doc} extension
 */
public void Read(String path, String namafile) {
    File file = new File(path + namafile + ".doc");

    // try-with-resources closes the stream even when parsing fails.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);

        WordExtractor we = new WordExtractor(doc);

        String[] paragraphs = we.getParagraphText();

        System.out.println("Total no of paragraph " + paragraphs.length);
        for (String para : paragraphs) {
            System.out.println(para);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:insight.masters.policyanalytics.services.BranchingOriginStanfordKeywords.java

/**
 * Reads a Word (.doc) file and returns its paragraph text concatenated into a
 * single string with all carriage returns and line feeds removed. The same text
 * is echoed to standard output as it is collected.
 *
 * <p>If reading fails, the stack trace is printed and whatever text was
 * collected up to that point (possibly the empty string) is returned.
 *
 * @param datsetspath directory prefix, concatenated as-is in front of the file name
 * @param Document    name of the Word file inside {@code datsetspath}
 * @return the flattened document text; never {@code null}
 */
public static String readfromdoc(String datsetspath, String Document) {
    final StringBuilder extractedtext = new StringBuilder();
    final File file = new File(datsetspath + Document);

    // try-with-resources closes the stream even when parsing fails.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument document = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(document);

        for (String paragraph : extractor.getParagraphText()) {
            // The null guard now covers the append as well as the print;
            // previously only the print was guarded.
            if (paragraph == null) {
                continue;
            }
            String flattened = paragraph.replace("\n", "").replace("\r", "");
            System.out.print(flattened);
            extractedtext.append(flattened);
        }
    } catch (Exception exep) {
        exep.printStackTrace();
    }
    return extractedtext.toString();
}