Example usage for org.apache.poi.hwpf.usermodel Range getParagraph

List of usage examples for org.apache.poi.hwpf.usermodel Range getParagraph

Introduction

In this page you can find the example usage for org.apache.poi.hwpf.usermodel Range getParagraph.

Prototype


public Paragraph getParagraph(int index) 

Source Link

Document

Gets the paragraph at index.

Usage

From source file:at.tugraz.sss.serv.SSFileU.java

License:Apache License

public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception {

    final Document document = new Document();
    final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
    final HWPFDocument word = new HWPFDocument(fs);
    final WordExtractor we = new WordExtractor(word);
    final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);
    final PdfWriter writer = PdfWriter.getInstance(document, out);
    final Range range = word.getRange();

    document.open();/*w  w w  .j a  v  a  2s  .  co  m*/
    writer.setPageEmpty(true);
    document.newPage();
    writer.setPageEmpty(true);

    String[] paragraphs = we.getParagraphText();

    for (int i = 0; i < paragraphs.length; i++) {

        org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i);
        // CharacterRun run = pr.getCharacterRun(i);
        // run.setBold(true);
        // run.setCapitalized(true);
        // run.setItalic(true);
        paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
        System.out.println("Length:" + paragraphs[i].length());
        System.out.println("Paragraph" + i + ": " + paragraphs[i].toString());

        // add the paragraph to the document
        document.add(new Paragraph(paragraphs[i]));
    }

    document.close();
}

From source file:at.tugraz.sss.serv.util.SSFileU.java

License:Apache License

public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr {

    try {//from   w  w w  . j a  v  a2  s  . c  o m
        final Document document = new Document();
        final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
        final HWPFDocument word = new HWPFDocument(fs);
        final WordExtractor we = new WordExtractor(word);
        final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);
        final PdfWriter writer = PdfWriter.getInstance(document, out);
        final Range range = word.getRange();

        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        String[] paragraphs = we.getParagraphText();

        for (int i = 0; i < paragraphs.length; i++) {

            org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i);
            // CharacterRun run = pr.getCharacterRun(i);
            // run.setBold(true);
            // run.setCapitalized(true);
            // run.setItalic(true);
            paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
            System.out.println("Length:" + paragraphs[i].length());
            System.out.println("Paragraph" + i + ": " + paragraphs[i].toString());

            // add the paragraph to the document
            document.add(new Paragraph(paragraphs[i]));
        }

        document.close();
    } catch (Exception error) {
        SSServErrReg.regErrThrow(error);
    }
}

From source file:com.google.gdt.handler.impl.WordHandler.java

License:Open Source License

/**
 * // w  w  w .  ja  va 2s.c  o m
 * @param inputFile
 * @param pLevel
 * @throws IOException
 * @throws InvalidFormatException
 */
@Override
public void handle(String inputFile, ProgressLevel pLevel) throws IOException, InvalidFormatException {
    String outPutFile = getOuputFileName(inputFile);
    OutputStream outputStream = new FileOutputStream(outPutFile);
    InputStream inputStream = new FileInputStream(inputFile);

    HWPFDocument hDocument = new HWPFDocument(inputStream);
    Range range = hDocument.getRange();

    pLevel.setTrFileName(outPutFile);
    pLevel.setValue(0);
    pLevel.setStringPainted(true);
    pLevel.setMaxValue(range.numParagraphs());
    int count = 0;
    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph paragraph = range.getParagraph(i);
        int numCharRuns = paragraph.numCharacterRuns();
        for (int j = 0; j < numCharRuns; j++) {
            if (isInterrupted) {
                outputStream.close();
                new File(outPutFile).delete();
                pLevel.setString("cancelled");
                return;
            }
            CharacterRun charRun = paragraph.getCharacterRun(j);
            String inputText = charRun.text();
            if ((null == inputText) || (inputText.trim().equals("")))
                continue;
            String translatedTxt = inputText;
            //in http post method, all key value pairs are seperated with &
            if (preferenceModel.getTranslatorType() == TranslatorType.HTTP)
                inputText = inputText.replaceAll("&", "and");
            try {
                translatedTxt = translator.translate(translatedTxt);
                charRun.replaceText(inputText, translatedTxt);
            } catch (Exception e) {
                logger.log(Level.SEVERE,
                        "Input File : " + inputFile + " cannot translate the text : " + inputText, e);
            }
        }
        count++;
        pLevel.setValue(count);
    }
    pLevel.setString("done");
    hDocument.write(outputStream);
    outputStream.close();
}

From source file:com.thuvienkhoahoc.wordtomwtext.examples.WordToMwtext.java

License:Apache License

public WordToMwtext(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException {

    // bagd/*from  ww w . ja v  a  2s. c  om*/
    OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8");
    _out = out;
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    int sectionLevel = 0;
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle.getName();
        if (styleName.startsWith("Heading")) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            int headerLevel = Integer.parseInt(styleName.substring(8));
            if (headerLevel > sectionLevel) {
                openSection();
            } else {
                for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {
                    closeSection();
                }
                openSection();
            }
            sectionLevel = headerLevel;
            openTitle(sectionLevel);
            writePlainText(text.trim());
            closeTitle(sectionLevel);
        } else {
            int cruns = p.numCharacterRuns();
            CharacterRun run = p.getCharacterRun(0);
            String fontName = run.getFontName();
            if (fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    for (int x = 0; x < sectionLevel; x++) {
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();

}

From source file:com.unsa.view.MainView.java

License:Creative Commons License

private void DocConverterPDF(File file1) {
    NPOIFSFileSystem fs = null;/*from  w  w w .j  av  a 2  s .c o m*/
    com.lowagie.text.Document document = new com.lowagie.text.Document();

    try {
        System.out.println(file1.getAbsolutePath());
        fs = new NPOIFSFileSystem(new FileInputStream(file1.getAbsolutePath()));
        HWPFDocument doc = new HWPFDocument(fs.getRoot());
        WordExtractor we = new WordExtractor(doc);
        String output = file1.getAbsolutePath().substring(0, file1.getAbsolutePath().length() - 3);
        OutputStream fileout = new FileOutputStream(new File(output + "pdf"));

        PdfWriter writer = PdfWriter.getInstance(document, fileout);

        Range range = doc.getRange();
        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        String[] paragraphs = we.getParagraphText();
        for (int i = 0; i < paragraphs.length; i++) {

            org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i);
            paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
            document.add(new Paragraph(paragraphs[i]));
        }

    } catch (Exception e) {

        e.printStackTrace();
    } finally {

        document.close();
    }

}

From source file:com.xx.platform.util.tools.ms.WordExtractor.java

License:Apache License

/**
 * Get the text from the word file, as an array with one String
 *  per paragraph//w  w  w.ja  v a 2s. co  m
 */
public String[] getParagraphText() {
    String[] ret;

    // Extract using the model code
    try {
        Range r = doc.getRange();

        ret = new String[r.numParagraphs()];
        for (int i = 0; i < ret.length; i++) {
            Paragraph p = r.getParagraph(i);
            ret[i] = p.text();

            // Fix the line ending
            if (ret[i].endsWith("\r")) {
                ret[i] = ret[i] + "\n";
            }
        }
    } catch (Exception e) {
        // Something's up with turning the text pieces into paragraphs
        // Fall back to ripping out the text pieces
        ret = new String[1];
        ret[0] = getTextFromPieces();
    }

    return ret;
}

From source file:com.zhch.example.poi.Word2Forrest.java

License:Apache License

@SuppressWarnings("unused")
public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException {
    OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8"));
    _out = out;/*w w w .j av  a  2s. c o m*/
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    int sectionLevel = 0;
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle.getName();
        if (styleName.startsWith("Heading")) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            int headerLevel = Integer.parseInt(styleName.substring(8));
            if (headerLevel > sectionLevel) {
                openSection();
            } else {
                for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {
                    closeSection();
                }
                openSection();
            }
            sectionLevel = headerLevel;
            openTitle();
            writePlainText(text);
            closeTitle();
        } else {
            int cruns = p.numCharacterRuns();
            CharacterRun run = p.getCharacterRun(0);
            String fontName = run.getFontName();
            if (fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    for (int x = 0; x < sectionLevel; x++) {
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();

}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;//ww  w .j av  a  2 s.c o  m
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        parseWord6(root, xhtml);
        return;
    }

    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(
            document);

    // mj
    extractImageText(xhtml, document);

    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(),
            headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml);
    }

    // Do everything else
    for (String paragraph : wordExtractor.getMainTextboxText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(),
            headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null;) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
        p = pictures.nextUnclaimed();
    }

    // Handle any embeded office documents
    try {
        DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry entry : op) {
            if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            }
        }
    } catch (FileNotFoundException e) {
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, PicturesSource pictures,
        PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    if (countParagraphs(ranges) > 0) {
        xhtml.startElement("div", "class", type);
        for (Range r : ranges) {
            if (r != null) {
                for (int i = 0; i < r.numParagraphs(); i++) {
                    Paragraph p = r.getParagraph(i);

                    String text = p.text();
                    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
                        // Skip empty header or footer paragraphs
                    } else {
                        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures,
                                pictureTable, xhtml);
                    }/*from  w  ww.j a  va 2 s  .  c om*/
                }
            }
        }
        xhtml.endElement("div");
    }
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument document;/*from  ww  w  .  ja  va  2  s  .  c om*/
    try {
        document = new HWPFDocument(root);
    } catch (OldWordFileFormatException e) {
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(
            document);
    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    //  the pictures should be in order, and may be directly
    //  placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(),
            headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    ListManager listManager = new ListManager(document);
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager,
                xhtml);
    }

    // Do everything else
    for (String paragraph : wordExtractor.getMainTextboxText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }

    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(),
            headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null;) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
        p = pictures.nextUnclaimed();
    }

    // Handle any embeded office documents
    try {
        DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry entry : op) {
            if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            }
        }
    } catch (FileNotFoundException e) {
    }
}