List of usage examples for org.apache.poi.hwpf.usermodel.Range#getParagraph
public Paragraph getParagraph(int index)
From source file:at.tugraz.sss.serv.SSFileU.java
License:Apache License
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception { final Document document = new Document(); final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath)); final HWPFDocument word = new HWPFDocument(fs); final WordExtractor we = new WordExtractor(word); final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath); final PdfWriter writer = PdfWriter.getInstance(document, out); final Range range = word.getRange(); document.open();/*w w w .j a v a 2s . co m*/ writer.setPageEmpty(true); document.newPage(); writer.setPageEmpty(true); String[] paragraphs = we.getParagraphText(); for (int i = 0; i < paragraphs.length; i++) { org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i); // CharacterRun run = pr.getCharacterRun(i); // run.setBold(true); // run.setCapitalized(true); // run.setItalic(true); paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", ""); System.out.println("Length:" + paragraphs[i].length()); System.out.println("Paragraph" + i + ": " + paragraphs[i].toString()); // add the paragraph to the document document.add(new Paragraph(paragraphs[i])); } document.close(); }
From source file:at.tugraz.sss.serv.util.SSFileU.java
License:Apache License
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr { try {//from w w w . j a v a2 s . c o m final Document document = new Document(); final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath)); final HWPFDocument word = new HWPFDocument(fs); final WordExtractor we = new WordExtractor(word); final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath); final PdfWriter writer = PdfWriter.getInstance(document, out); final Range range = word.getRange(); document.open(); writer.setPageEmpty(true); document.newPage(); writer.setPageEmpty(true); String[] paragraphs = we.getParagraphText(); for (int i = 0; i < paragraphs.length; i++) { org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i); // CharacterRun run = pr.getCharacterRun(i); // run.setBold(true); // run.setCapitalized(true); // run.setItalic(true); paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", ""); System.out.println("Length:" + paragraphs[i].length()); System.out.println("Paragraph" + i + ": " + paragraphs[i].toString()); // add the paragraph to the document document.add(new Paragraph(paragraphs[i])); } document.close(); } catch (Exception error) { SSServErrReg.regErrThrow(error); } }
From source file:com.google.gdt.handler.impl.WordHandler.java
License:Open Source License
/** * // w w w . ja va 2s.c o m * @param inputFile * @param pLevel * @throws IOException * @throws InvalidFormatException */ @Override public void handle(String inputFile, ProgressLevel pLevel) throws IOException, InvalidFormatException { String outPutFile = getOuputFileName(inputFile); OutputStream outputStream = new FileOutputStream(outPutFile); InputStream inputStream = new FileInputStream(inputFile); HWPFDocument hDocument = new HWPFDocument(inputStream); Range range = hDocument.getRange(); pLevel.setTrFileName(outPutFile); pLevel.setValue(0); pLevel.setStringPainted(true); pLevel.setMaxValue(range.numParagraphs()); int count = 0; for (int i = 0; i < range.numParagraphs(); i++) { Paragraph paragraph = range.getParagraph(i); int numCharRuns = paragraph.numCharacterRuns(); for (int j = 0; j < numCharRuns; j++) { if (isInterrupted) { outputStream.close(); new File(outPutFile).delete(); pLevel.setString("cancelled"); return; } CharacterRun charRun = paragraph.getCharacterRun(j); String inputText = charRun.text(); if ((null == inputText) || (inputText.trim().equals(""))) continue; String translatedTxt = inputText; //in http post method, all key value pairs are seperated with & if (preferenceModel.getTranslatorType() == TranslatorType.HTTP) inputText = inputText.replaceAll("&", "and"); try { translatedTxt = translator.translate(translatedTxt); charRun.replaceText(inputText, translatedTxt); } catch (Exception e) { logger.log(Level.SEVERE, "Input File : " + inputFile + " cannot translate the text : " + inputText, e); } } count++; pLevel.setValue(count); } pLevel.setString("done"); hDocument.write(outputStream); outputStream.close(); }
From source file:com.thuvienkhoahoc.wordtomwtext.examples.WordToMwtext.java
License:Apache License
public WordToMwtext(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException { // bagd/*from ww w . ja v a 2s. c om*/ OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8"); _out = out; _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(sectionLevel); writePlainText(text.trim()); closeTitle(sectionLevel); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:com.unsa.view.MainView.java
License:Creative Commons License
/**
 * Converts the text of a binary Word file into a PDF written next to it.
 *
 * <p>The PDF path is the Word file's absolute path with its last three characters replaced
 * by "pdf". Each extracted paragraph has its line terminator stripped and is added to the
 * PDF as a plain paragraph. Failures are printed to standard error and not propagated.
 *
 * @param file1 the Word document to convert
 */
private void DocConverterPDF(File file1) {
    NPOIFSFileSystem fs = null;
    OutputStream fileout = null;
    com.lowagie.text.Document document = new com.lowagie.text.Document();
    try {
        String docPath = file1.getAbsolutePath();
        System.out.println(docPath);
        fs = new NPOIFSFileSystem(new FileInputStream(docPath));
        HWPFDocument doc = new HWPFDocument(fs.getRoot());
        WordExtractor we = new WordExtractor(doc);

        // Drops the last three characters of the path; assumes a three-letter extension.
        String output = docPath.substring(0, docPath.length() - 3);
        fileout = new FileOutputStream(new File(output + "pdf"));
        PdfWriter writer = PdfWriter.getInstance(document, fileout);

        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        for (String paragraphText : we.getParagraphText()) {
            document.add(new Paragraph(paragraphText.replaceAll("\\cM?\r?\n", "")));
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close in nested finally blocks so one failing close cannot skip the others.
        try {
            document.close();
        } finally {
            try {
                if (fileout != null) {
                    fileout.close();
                }
            } catch (java.io.IOException e) {
                e.printStackTrace();
            } finally {
                if (fs != null) {
                    try {
                        fs.close();
                    } catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }
}
From source file:com.xx.platform.util.tools.ms.WordExtractor.java
License:Apache License
/** * Get the text from the word file, as an array with one String * per paragraph//w w w.ja v a 2s. co m */ public String[] getParagraphText() { String[] ret; // Extract using the model code try { Range r = doc.getRange(); ret = new String[r.numParagraphs()]; for (int i = 0; i < ret.length; i++) { Paragraph p = r.getParagraph(i); ret[i] = p.text(); // Fix the line ending if (ret[i].endsWith("\r")) { ret[i] = ret[i] + "\n"; } } } catch (Exception e) { // Something's up with turning the text pieces into paragraphs // Fall back to ripping out the text pieces ret = new String[1]; ret[0] = getTextFromPieces(); } return ret; }
From source file:com.zhch.example.poi.Word2Forrest.java
License:Apache License
@SuppressWarnings("unused") public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException { OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8")); _out = out;/*w w w .j av a 2s. c o m*/ _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(); writePlainText(text); closeTitle(); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document;//ww w .j av a 2 s.c o m try { document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); // mj extractImageText(xhtml, document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for (Picture p = 
pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (countParagraphs(ranges) > 0) { xhtml.startElement("div", "class", type); for (Range r : ranges) { if (r != null) { for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); String text = p.text(); if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { // Skip empty header or footer paragraphs } else { i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml); }/*from w ww.j a va 2 s . c om*/ } } } xhtml.endElement("div"); } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document;/*from ww w . ja va 2 s . c om*/ try { document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); ListManager listManager = new ListManager(document); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for 
(Picture p = pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }