Usage examples for org.apache.pdfbox.pdmodel.PDDocument.getDocumentCatalog()
public PDDocumentCatalog getDocumentCatalog()
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, SAXException { // Thank you, Ben Litchfield, for // org.apache.pdfbox.examples.fdf.PrintFields // this code derives from Ben's code PDDocumentCatalog catalog = pdf.getDocumentCatalog(); if (catalog == null) return;//from w ww. j av a 2s . co m PDAcroForm form = catalog.getAcroForm(); if (form == null) return; @SuppressWarnings("rawtypes") List fields = form.getFields(); if (fields == null) return; @SuppressWarnings("rawtypes") ListIterator itr = fields.listIterator(); if (itr == null) return; handler.startElement("div", "class", "acroform"); handler.startElement("ol"); while (itr.hasNext()) { Object obj = itr.next(); if (obj != null && obj instanceof PDField) { processAcroField((PDField) obj, handler, 0); } } handler.endElement("ol"); handler.endElement("div"); }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
/**
 * Runs image-text extraction over every page of the document.
 *
 * <p>Stores the number of pages in {@code pageCount}, resets
 * {@code imageCount} to zero, and then, page by page, hands the page
 * resources to {@code processResources} and asks the helper to add its text
 * to {@code handler} together with the 1-based page number and page count.
 *
 * <p>This is best-effort: any exception raised while processing a page is
 * printed via {@code printStackTrace()} and stops the loop. The helper is
 * closed in all cases.
 *
 * @param pdf the document whose pages are scanned
 */
private void extractImageText(PDDocument pdf) {
    List<?> allPages = pdf.getDocumentCatalog().getAllPages();
    Iterator<?> cursor = allPages.iterator();
    pageCount = allPages.size();
    imageCount = 0;

    TikaImageHelper imageHelper = new TikaImageHelper(this.metadata);
    try {
        for (int pageNumber = 1; cursor.hasNext(); pageNumber++) {
            PDPage currentPdPage = (PDPage) cursor.next();
            processResources(currentPdPage.getResources(), imageHelper);
            imageHelper.addTextToHandler(handler, pageNumber, pageCount);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        imageHelper.close();
    }
}
From source file:modules.PDFFontDependencyExtractorModule.java
License:Apache License
public PDFFontResults extractFontList(File f) throws IOException, InvalidParameterException { PDDocument document; try {//ww w . ja va2s . c om document = PDDocument.load(f); } catch (IOException x) { throw new InvalidParameterException("Not a PDF file"); } SortedSet<FontInformation> ret = new TreeSet<FontInformation>(new Comparator<FontInformation>() { @Override public int compare(FontInformation o1, FontInformation o2) { int a = o1.fontName.compareTo(o2.fontName); if (a != 0) return a; else return o1.fontType.compareTo(o2.fontType); } }); document.getDocumentCatalog().getAllPages(); // The code down here is easier as it gets all the fonts used in the // document. Still, this would inlcude unused fonts, so we get the fonts // page by page and add them to a Hash table. for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) { if (c == null || !(c.getObject() instanceof COSDictionary)) { continue; // System.out.println(c.getObject()); } COSDictionary fontDictionary = (COSDictionary) c.getObject(); // System.out.println(dic.getNameAsString(COSName.BASE_FONT)); // } // } // int pagen = document.getNumberOfPages(); // i=0; // for (int p=0;p<pagen;p++){ // PDPage page = (PDPage)pages.get(p); // PDResources res = page.findResources(); // //for each page resources // if (res==null) continue; // // get the font dictionary // COSDictionary fonts = (COSDictionary) // res.getCOSDictionary().getDictionaryObject( COSName.FONT ); // for( COSName fontName : fonts.keySet() ) { // COSObject font = (COSObject) fonts.getItem( fontName ); // // if the font has already been visited we ingore it // long objectId = font.getObjectNumber().longValue(); // if (ret.get(objectId)!=null) // continue; // if( font==null || ! 
(font.getObject() instanceof COSDictionary) ) // continue; // COSDictionary fontDictionary = (COSDictionary)font.getObject(); // Type MUSt be font if (!fontDictionary.getNameAsString(COSName.TYPE).equals("Font")) { continue; } // get the variables FontInformation fi = new FontInformation(); fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE); String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT); if (baseFont == null) { continue; } if (Arrays.binarySearch(standard14, baseFont) >= 0) { continue; } COSDictionary fontDescriptor = (COSDictionary) fontDictionary.getDictionaryObject(COSName.FONT_DESC); COSBase enc = fontDictionary.getItem(COSName.ENCODING); COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE); fontDictionary.getInt(COSName.FIRST_CHAR); fontDictionary.getInt(COSName.LAST_CHAR); String encoding; boolean toUnicode = uni != null; if (enc == null) { encoding = "standard14"; } if (enc instanceof COSString) { encoding = ((COSString) enc).getString(); } else { encoding = "table"; } fi.isSubset = false; boolean t = true; // Type one and TT can have subsets defineing the basename see 5.5.3 // pdfref 1.6 // if (fi.fontType.lastIndexOf(COSName.TYPE1.getName())!=-1 || // fi.fontType.equals(COSName.TRUE_TYPE.getName()) ) if (baseFont != null) { if (baseFont.length() > 6) { for (int k = 0; k < 6; k++) if (!Character.isUpperCase(baseFont.charAt(k))) { t = false; } if (baseFont.charAt(6) != '+') { t = false; } } else { t = false; } fi.isSubset = t; if (fi.isSubset) { fi.baseName = baseFont.substring(0, 6); baseFont = baseFont.substring(7); } } fi.fontFlags = 0; if (fi.fontType.equals(COSName.TYPE0.getName()) || fi.fontType.equals(COSName.TYPE3.getName())) { fi.isEmbedded = true; } if (fontDescriptor != null) { // in Type1 charset indicates font is subsetted if (fontDescriptor.getItem(COSName.CHAR_SET) != null) { fi.isSubset = true; } if (fontDescriptor.getItem(COSName.FONT_FILE) != null || fontDescriptor.getItem(COSName.FONT_FILE3) != null || 
fontDescriptor.getItem(COSName.FONT_FILE2) != null) { fi.isEmbedded = true; } fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags")); fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY); fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH); } fi.charset = encoding; fi.fontName = baseFont; fi.isToUnicode = toUnicode; fi.encoding = fontDictionary.getNameAsString(COSName.CID_TO_GID_MAP); ret.add(fi); } // for all fonts HashMultimap<String, FontInformation> m = HashMultimap.create(); for (FontInformation ff : ret) { m.put(ff.fontName, ff); } LinkedList<FontInformation> missing = new LinkedList<FontInformation>(); Set<String> k = m.keySet(); for (String kk : k) { Set<FontInformation> s = m.get(kk); if (s.size() < 1) { continue; } if (s.size() > 1) { boolean found = false; FontInformation ff = null; for (FontInformation fonti : s) { if (!fonti.isEmbedded) { ff = fonti; } else { found = true; } } if (!found) { missing.add(ff); } } else { FontInformation ff = s.iterator().next(); if (!ff.isEmbedded) { missing.add(ff); } } } // } // for all pages // Iterator<FontInformation> it = ret.iterator(); // FontInformation prev = null; // LinkedList<FontInformation> toDelete = new // LinkedList<FontInformation>(); // while (it.hasNext()) { // FontInformation current = it.next(); // // if (prev!= null && prev.fontName.equals(current.fontName) && // (prev.fontType.startsWith("CIDFontType") || // current.fontType.startsWith("CIDFontType"))) // toDelete.add(current); // prev = current; // } // // //ret.removeAll(toDelete); // FontInformation[] retArray =toDelete.toArray(new FontInformation[0]); // if (missing.size() == 0) { missing = null; } else { System.out.println("Found missing fonts: " + f); System.out.println(missing); } return new PDFFontResults(new LinkedList<FontInformation>(ret), missing); }
From source file:name.marcelomorales.siqisiqi.pdfbox.CoordinatesGenerator.java
License:Apache License
public void generarPdf(OutputStream os, String template, Map<String, Object> m, String path, String coordenates, float fontSize, float ancho) throws IOException { long t = System.currentTimeMillis(); PDDocument doc = null; try {/*from w w w .j av a2s .c o m*/ doc = PDDocument.load(new File(path)); List pages = doc.getDocumentCatalog().getAllPages(); PDPage sourcePage = (PDPage) pages.get(0); boolean append = sourcePage.getContents() != null; PDPageContentStream contentStream = new PDPageContentStream(doc, sourcePage, append, true); StringReader fileReader = null; try { fileReader = new StringReader(template); List<String> list = CharStreams.readLines(fileReader); boolean textHasBegun = false; float currentOffset = 0f; for (String line : list) { if (line == null) { continue; } if (line.startsWith("#")) { continue; } final Iterable<String> str = Splitter.on(',').omitEmptyStrings().trimResults().split(line); final String[] split = Iterables.toArray(str, String.class); if (split == null || split.length < 4) { continue; } if (Character.isDigit(split[0].charAt(0))) { if (textHasBegun) { contentStream.endText(); } contentStream.beginText(); textHasBegun = true; contentStream.moveTextPositionByAmount(parseFloat(split[0]), parseFloat(split[1])); } else { contentStream.moveTextPositionByAmount(currentOffset, 0); } if (!textHasBegun) { LOGGER.warn("Hay un posible mal uso de un .ree", new Throwable()); contentStream.beginText(); textHasBegun = true; } PDType1Font font; if ("b".equals(split[2])) { font = HELVETICA_BOLD; } else { font = HELVETICA; } contentStream.setFont(font, fontSize); Object text = null; if (split[3].startsWith("\"")) { // TODO: text = substring(split[3], 1, -1); } else { // TODO: text = new PropertyModel(m, split[3]).getObject(); } if (text == null) { LOGGER.warn("Propiedad {} no se encuentra", split[3]); //contentStream.drawString("ERROR: propiedad no encontrada"); contentStream.drawString(" "); } else { String string = text.toString(); currentOffset = 
font.getStringWidth(string) * ancho; contentStream.drawString(string); } } if (textHasBegun) { contentStream.endText(); } } finally { Closeables.closeQuietly(fileReader); } contentStream.close(); try { doc.save(os); } catch (COSVisitorException e) { throw new IOException("Ha ocurrido un error al escribir en el Os", e); } } finally { if (doc != null) { doc.close(); } LOGGER.info("Me ha tomado {} milisegundos hacer el pdf", System.currentTimeMillis() - t); } }
From source file:net.padaf.preflight.helpers.CatalogValidationHelper.java
License:Apache License
@Override public List<ValidationError> innerValidate(DocumentHandler handler) throws ValidationException { List<ValidationError> result = new ArrayList<ValidationError>(0); PDDocument pdfbox = handler.getDocument(); PDDocumentCatalog catalog = pdfbox.getDocumentCatalog(); if (catalog != null) { validateActions(handler, catalog, result); validateLang(handler, catalog, result); validateNames(handler, catalog, result); validateOCProperties(handler, catalog, result); } else {/*w w w .j av a 2 s . com*/ throw new ValidationException("There are no Catalog entry in the Document."); } // ---- Check OutputIntent to know the ICC Profile result.addAll(validateOutputIntent(handler)); return result; }
From source file:net.padaf.preflight.helpers.CatalogValidationHelper.java
License:Apache License
/** * This method checks the content of each OutputIntent. The S entry must * contain GTS_PDFA1. The DestOuputProfile must contain a valid ICC Profile * Stream./*from w w w. j a v a 2 s. com*/ * * If there are more than one OutputIntent, they have to use the same ICC * Profile. * * This method returns a list of ValidationError. It is empty if no errors * have been found. * * @param handler * @return * @throws ValidationException */ public List<ValidationError> validateOutputIntent(DocumentHandler handler) throws ValidationException { List<ValidationError> result = new ArrayList<ValidationError>(0); PDDocument pdDocument = handler.getDocument(); PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); COSDocument cDoc = pdDocument.getDocument(); COSBase cBase = catalog.getCOSDictionary() .getItem(COSName.getPDFName(DOCUMENT_DICTIONARY_KEY_OUTPUT_INTENTS)); COSArray outputIntents = COSUtils.getAsArray(cBase, cDoc); Map<COSObjectKey, Boolean> tmpDestOutputProfile = new HashMap<COSObjectKey, Boolean>(); for (int i = 0; outputIntents != null && i < outputIntents.size(); ++i) { COSDictionary dictionary = COSUtils.getAsDictionary(outputIntents.get(i), cDoc); if (dictionary == null) { result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_INVALID_ENTRY, "OutputIntent object is null or isn't a dictionary")); } else { // ---- S entry is mandatory and must be equals to GTS_PDFA1 String sValue = dictionary.getNameAsString(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_S)); if (!OUTPUT_INTENT_DICTIONARY_VALUE_GTS_PDFA1.equals(sValue)) { result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_S_VALUE_INVALID, "The S entry of the OutputIntent isn't GTS_PDFA1")); continue; } // ---- OutputConditionIdentifier is a mandatory field String outputConditionIdentifier = dictionary .getString(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_OUTPUT_CONDITION_IDENTIFIER)); if (outputConditionIdentifier == null || "".equals(outputConditionIdentifier)) { result.add(new 
ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_INVALID_ENTRY, "The OutputIntentCondition is missing")); continue; } // ---- If OutputConditionIdentifier is "Custom" : // ---- DestOutputProfile and Info are mandatory // ---- DestOutputProfile must be a ICC Profile // ---- Because of PDF/A conforming file needs to specify the color // characteristics, the DestOutputProfile // is checked even if the OutputConditionIdentifier isn't "Custom" COSBase dop = dictionary .getItem(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_DEST_OUTPUT_PROFILE)); ValidationError valer = validateICCProfile(dop, cDoc, tmpDestOutputProfile, handler); if (valer != null) { result.add(valer); continue; } if (OUTPUT_INTENT_DICTIONARY_VALUE_OUTPUT_CONDITION_IDENTIFIER_CUSTOM .equals(outputConditionIdentifier)) { String info = dictionary.getString(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_INFO)); if (info == null || "".equals(info)) { result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_INVALID_ENTRY, "The Info entry of a OutputIntent dictionary is missing")); continue; } } } } return result; }
From source file:net.padaf.preflight.helpers.MetadataValidationHelper.java
License:Apache License
/** * Check if metadata dictionary has no stream filter * /*ww w . j av a2 s. co m*/ * @param doc * @return */ protected List<ValidationError> checkStreamFilterUsage(PDDocument doc) { List<ValidationError> ve = new ArrayList<ValidationError>(); List<?> filters = doc.getDocumentCatalog().getMetadata().getFilters(); if (filters != null && !filters.isEmpty()) { ve.add(new ValidationError(ValidationConstants.ERROR_METADATA_MAIN, "Using stream filter on metadata dictionary is forbidden")); } return ve; }
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/** * @return empty Optional if no metadata has been found *///from w ww .ja va 2 s .co m private static Optional<XMPMetadata> getXMPMetadata(PDDocument document) throws IOException { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDMetadata metaRaw = catalog.getMetadata(); if (metaRaw == null) { return Optional.empty(); } Document parseResult; try (InputStream is = metaRaw.createInputStream()) { parseResult = XMLUtil.parse(is); } XMPMetadata meta = new XMPMetadata(parseResult); meta.addXMLNSMapping(XMPSchemaBibtex.NAMESPACE, XMPSchemaBibtex.class); return Optional.of(meta); }
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/** * Try to write the given BibTexEntries as DublinCore XMP Schemas * * Existing DublinCore schemas in the document are removed * * @param document/*from ww w . ja v a2s . c o m*/ * The pdf document to write to. * @param entries * The BibTeX entries that are written as schemas * @param database * maybenull An optional database which the given BibTeX entries * belong to, which will be used to resolve strings. If the * database is null the strings will not be resolved. * @throws IOException * @throws TransformerException */ private static void writeDublinCore(PDDocument document, Collection<BibEntry> entries, BibDatabase database) throws IOException, TransformerException { Collection<BibEntry> resolvedEntries; if (database == null) { resolvedEntries = entries; } else { resolvedEntries = database.resolveForStrings(entries, false); } PDDocumentCatalog catalog = document.getDocumentCatalog(); PDMetadata metaRaw = catalog.getMetadata(); XMPMetadata meta; if (metaRaw == null) { meta = new XMPMetadata(); } else { meta = new XMPMetadata(XMLUtil.parse(metaRaw.createInputStream())); } // Remove all current Dublin-Core schemas List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE); for (XMPSchema schema : schemas) { schema.getElement().getParentNode().removeChild(schema.getElement()); } for (BibEntry entry : resolvedEntries) { XMPSchemaDublinCore dcSchema = new XMPSchemaDublinCore(meta); XMPUtil.writeToDCSchema(dcSchema, entry, null); meta.addSchema(dcSchema); } // Save to stream and then input that stream to the PDF ByteArrayOutputStream os = new ByteArrayOutputStream(); meta.save(os); ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray()); PDMetadata metadataStream = new PDMetadata(document, is, false); catalog.setMetadata(metadataStream); }
From source file:net.sf.jabref.PdfPreviewPanel.java
License:Open Source License
private void renderPDFFile(File file) { InputStream input;//from www . j a v a 2s. c o m try { input = new FileInputStream(file); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); return; } PDDocument document; try { document = PDDocument.load(input); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return; } @SuppressWarnings("unchecked") List<PDPage> pages = document.getDocumentCatalog().getAllPages(); PDPage page = pages.get(0); BufferedImage image; try { image = page.convertToImage(); } catch (Exception e1) { // silently ignores all rendering exceptions image = null; } if (image != null) { int width = this.getParent().getWidth(); int height = this.getParent().getHeight(); BufferedImage resImage = resizeImage(image, width, height, BufferedImage.TYPE_INT_RGB); ImageIcon icon = new ImageIcon(resImage); picLabel.setText(null); picLabel.setIcon(icon); } else { clearPreview(); } try { document.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }