List of usage examples for org.apache.poi.hwpf.usermodel.Picture#getContent()
public byte[] getContent()
From source file:com.example.minireader.WordViewActivity.java
License:Apache License
/**
 * Saves the picture at index {@code presentPicture} to {@code picturePath} and appends a
 * matching {@code <img>} tag to {@code output}.
 *
 * <p>The tag carries a {@code width} attribute equal to {@code screenWidth} only when the
 * picture could be decoded and is wider than the screen. Failures while writing the picture
 * file or the tag are reported on standard output and do not propagate.
 */
public void writePicture() {
    Picture picture = (Picture) pictures.get(presentPicture);
    byte[] pictureBytes = picture.getContent();
    // decodeByteArray yields null when the bytes are not a decodable bitmap,
    // so the result must be null-checked before it is measured below.
    Bitmap bitmap = BitmapFactory.decodeByteArray(pictureBytes, 0, pictureBytes.length);
    // presumably prepares picturePath for this picture; verify against makePictureFile()
    makePictureFile();
    presentPicture++;

    File myPicture = new File(picturePath);
    FileOutputStream outputPicture = null;
    try {
        outputPicture = new FileOutputStream(myPicture);
        outputPicture.write(pictureBytes);
    } catch (Exception e) {
        System.out.println("outputPicture Exception");
    } finally {
        // Close in finally so a failed write does not leak the file handle.
        if (outputPicture != null) {
            try {
                outputPicture.close();
            } catch (Exception e) {
                System.out.println("outputPicture Exception");
            }
        }
    }

    String imageString = "<img src=\"" + picturePath + "\"";
    if (bitmap != null && bitmap.getWidth() > screenWidth) {
        // Clamp oversized pictures to the screen width.
        imageString = imageString + " " + "width=\"" + screenWidth + "\"";
    }
    imageString = imageString + ">";

    try {
        output.write(imageString.getBytes());
    } catch (Exception e) {
        System.out.println("output Exception");
    }
}
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void extractImageText(XHTMLContentHandler xhtml, HWPFDocument document) { if (Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) { TikaImageHelper helper = new TikaImageHelper(metadata); try {// w w w. j a v a2s . c o m List<Picture> pictures2 = document.getPicturesTable().getAllPictures(); for (Picture picture : pictures2) { ByteArrayInputStream imageData = new ByteArrayInputStream(picture.getContent()); helper.addImage(ImageIO.read(imageData)); } // TODO: find out page number helper.addTextToHandler(xhtml); } catch (Exception e) { e.printStackTrace(); } finally { if (helper != null) { helper.close(); } } } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (!isRendered(cr) || picture == null) { // Oh dear, we've run out... // Probably caused by multiple \u0008 images referencing // the same real image return;//w ww . j a v a 2s .c om } // Which one is it? String extension = picture.suggestFileExtension(); int pictureNumber = pictures.pictureNumber(picture); // Make up a name for the picture // There isn't one in the file, but we need to be able to reference // the picture from the img tag and the embedded resource String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : ""); // Grab the mime type for the picture String mimeType = picture.getMimeType(); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); attr.addAttribute("", "alt", "alt", "CDATA", filename); xhtml.startElement("img", attr); xhtml.endElement("img"); // Have we already output this one? // (Only expose each individual image once) if (!pictures.hasOutput(picture)) { TikaInputStream stream = TikaInputStream.get(picture.getContent()); handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); pictures.recordOutput(picture); } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (!isRendered(cr) || picture == null) { // Oh dear, we've run out... // Probably caused by multiple \u0008 images referencing // the same real image return;//from ww w .ja v a2s.c o m } // Which one is it? String extension = picture.suggestFileExtension(); int pictureNumber = pictures.pictureNumber(picture); // Make up a name for the picture // There isn't one in the file, but we need to be able to reference // the picture from the img tag and the embedded resource String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : ""); // Grab the mime type for the picture String mimeType = picture.getMimeType(); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); attr.addAttribute("", "alt", "alt", "CDATA", filename); xhtml.startElement("img", attr); xhtml.endElement("img"); // Have we already output this one? // (Only expose each individual image once) if (!pictures.hasOutput(picture)) { TikaInputStream stream = TikaInputStream.get(picture.getContent()); handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); pictures.recordOutput(picture); } }
From source file:org.docx4j.convert.in.Doc.java
License:Apache License
private static org.docx4j.wml.P handleP(WordprocessingMLPackage wordMLPackage, HWPFDocument doc, Paragraph p, org.apache.poi.hwpf.model.StyleSheet stylesheet, MainDocumentPart documentPart, org.docx4j.wml.ObjectFactory factory) { org.docx4j.wml.P wmlP = null;/* w w w . ja v a 2 s.c om*/ if (p.getStyleIndex() > 0) { log.debug("Styled paragraph, with index: " + p.getStyleIndex()); String styleName = stylesheet.getStyleDescription(p.getStyleIndex()).getName(); log.debug(styleName); wmlP = documentPart.createStyledParagraphOfText(stripSpace(styleName), null); } else { wmlP = documentPart.createParagraphOfText(null); } // LineSpacingDescriptor lsd = p.getLineSpacing(); // if (lsd==null || lsd.isEmpty()) { // // do nothing // } else { // PPr pPr = wmlP.getPPr(); // if (pPr==null) { // pPr = Context.getWmlObjectFactory().createPPr(); // wmlP.setPPr(pPr); // } // Spacing spacing = // Context.getWmlObjectFactory().createPPrBaseSpacing(); // spacing.setLine(lsd._dyaLine); // not visible // spacing.setLineRule(STLineSpacingRule.AUTO); // pPr.setSpacing(spacing); // } for (int z = 0; z < p.numCharacterRuns(); z++) { // character run CharacterRun run = p.getCharacterRun(z); // No character styles defined in there?? 
org.docx4j.wml.RPr rPr = null; if (run.isBold()) { // TODO - HIGH PRIORITY- handle other run properties // esp underline, font size if (rPr == null) { rPr = factory.createRPr(); } org.docx4j.wml.BooleanDefaultTrue boldOn = factory.createBooleanDefaultTrue(); boldOn.setVal(Boolean.TRUE); rPr.setB(boldOn); } //Process image if (doc instanceof HWPFDocument && ((HWPFDocument) doc).getPicturesTable().hasPicture(run)) { Picture picture = doc.getPicturesTable().extractPicture(run, true); Inline inline; try { BinaryPartAbstractImage imagePart = BinaryPartAbstractImage.createImagePart(wordMLPackage, picture.getContent()); long cx = UnitsOfMeasurement .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getWidthMpt() * ((double) picture.getHorizontalScalingFactor() * 0.00001d))) * 2L; long cy = UnitsOfMeasurement .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getHeightMpt() * ((double) picture.getVerticalScalingFactor() * 0.00001d))) * 2L; inline = imagePart.createImageInline(null, "", ID1++, ID2++, cx, cy, false); org.docx4j.wml.R imgrun = factory.createR(); org.docx4j.wml.Drawing drawing = factory.createDrawing(); imgrun.getContent().add(drawing); drawing.getAnchorOrInline().add(inline); wmlP.getContent().add(imgrun); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else { // character run text String text = run.text(); // show us the text log.debug("Processing: " + text); String cleansed = stripNonValidXMLCharacters(text); // Necessary to avoid org.xml.sax.SAXParseException: An invalid // XML character // (Unicode: 0xb) was found in the element content of the // document. // when trying to open the resulting docx. // ie JAXB happily writes (marshals) it, but doesn't want to // unmarshall. 
if (!text.equals(cleansed)) { log.warn("Cleansed.."); } org.docx4j.wml.Text t = factory.createText(); t.setValue(cleansed); org.docx4j.wml.R wmlRun = factory.createR(); if (rPr != null) { wmlRun.setRPr(rPr); } wmlRun.getRunContent().add(t); wmlP.getParagraphContent().add(wmlRun); } } System.out.println(XmlUtils.marshaltoString(wmlP, true, true)); return wmlP; }
From source file:org.opf_labs.aqua.OfficeAnalyser.java
License:Apache License
public static void main(String[] args) throws Exception { //import org.apache.poi.poifs.dev.POIFSDump; //POIFSDump.main(args); SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument( SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1", "UTF-8", true); xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level SMOutputElement xmlroot = xmldoc.addElement("properties"); // Loop through arguments: for (int i = 0; i < args.length; i++) { SMOutputElement xd = xmlroot.addElement("document"); xd.addAttribute("href", args[i]); HWPFDocument doc = new HWPFDocument(new FileInputStream(args[i])); // SummaryInformation SMOutputElement sie = xd.addElement("SummaryInformation"); sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName()); sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion()); sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor()); sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount()); sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments()); sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime()); sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat()); sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords()); sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor()); sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount()); sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber()); sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount()); sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity()); sie.addElement("Subject").addCharacters("" + 
doc.getSummaryInformation().getSubject()); sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate()); sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle()); sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount()); sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime()); sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted()); sie.addElement("LastSaveDateTime") .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime()); sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail()); // TextTable SMOutputElement tte = xd.addElement("TextTable"); for (TextPiece tp : doc.getTextTable().getTextPieces()) { SMOutputElement tpe = tte.addElement("TextPiece"); tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode()); tpe.addCharacters(tp.getStringBuilder().toString()); }/* w ww .ja v a2s. 
c o m*/ // DocumentSummaryInformation SMOutputElement dsie = xd.addElement("DocumentSummaryInformation"); dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount()); dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount()); dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount()); dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount()); dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount()); dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount()); dsie.addElement("SectionCount") .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount()); dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount()); dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat()); dsie.addElement("PresentationFormat") .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat()); dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany()); dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory()); // Sections for (Object os : doc.getDocumentSummaryInformation().getSections()) { Section s = (Section) os; SMOutputElement se = dsie.addElement("Section"); se.addElement("FormatID").addCharacters("" + s.getFormatID()); se.addElement("CodePage").addCharacters("" + s.getCodepage()); se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount()); for (Property sp : s.getProperties()) { SMOutputElement pe = se.addElement("Property"); pe.addAttribute("class", sp.getValue().getClass().getCanonicalName()); pe.addCharacters(sp.getValue().toString()); } } SMOutputElement fte = xd.addElement("FontTable"); for (Ffn f : 
doc.getFontTable().getFontNames()) { SMOutputElement fe = fte.addElement("Font"); fe.addElement("MainFontName").addCharacters(f.getMainFontName()); try { fe.addElement("AltFontName").addCharacters(f.getAltFontName()); } catch (Exception e) { // Seems to fail, and no safe test found as yet. } fe.addElement("Size").addCharacters("" + f.getSize()); fe.addElement("Weight").addCharacters("" + f.getWeight()); } SMOutputElement pte = xd.addElement("PicturesTable"); for (Picture p : doc.getPicturesTable().getAllPictures()) { SMOutputElement pe = pte.addElement("Picture"); pe.addElement("MimeType").addCharacters(p.getMimeType()); pe.addElement("Width").addCharacters("" + p.getWidth()); pe.addElement("Height").addCharacters("" + p.getHeight()); pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor()); pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor()); pe.addElement("Content").addCharacters("" + p.getContent()); } //parseCompObj( new File(args[i]) ); // This //System.out.println("Dumping " + args[i]); FileInputStream is = new FileInputStream(args[i]); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); DirectoryEntry root = fs.getRoot(); //dump(root); xmldoc.closeRoot(); // important, flushes, closes output } }