List of usage examples for com.lowagie.text.pdf PRIndirectReference getGeneration
public int getGeneration()
From source file:com.cyberway.issue.crawler.extractor.PDFParser.java
License:Open Source License
/** * Parse a PdfDictionary, looking for URIs recursively and adding * them to foundURIs/* w ww . j a va2s .c o m*/ * @param entity */ protected void extractURIs(PdfObject entity) { // deal with dictionaries if (entity.isDictionary()) { PdfDictionary dictionary = (PdfDictionary) entity; @SuppressWarnings("unchecked") Set<PdfName> allkeys = dictionary.getKeys(); for (PdfName key : allkeys) { PdfObject value = dictionary.get(key); // see if it's the key is a UR[I,L] if (key.toString().equals("/URI") || key.toString().equals("/URL")) { foundURIs.add(value.toString()); } else { this.extractURIs(value); } } // deal with arrays } else if (entity.isArray()) { PdfArray array = (PdfArray) entity; ArrayList arrayObjects = array.getArrayList(); Iterator objectList = arrayObjects.iterator(); while (objectList.hasNext()) { this.extractURIs((PdfObject) objectList.next()); } // deal with indirect references } else if (entity.getClass() == PRIndirectReference.class) { PRIndirectReference indirect = (PRIndirectReference) entity; // if we've already seen a reference to this object if (haveSeen(indirect.getGeneration(), indirect.getNumber())) { return; // note that we've seen it if it's new } else { markAsSeen(indirect.getGeneration(), indirect.getNumber()); } // dereference the "pointer" and process the object indirect.getReader(); // FIXME: examine side-effects PdfObject direct = PdfReader.getPdfObject(indirect); this.extractURIs(direct); } }
From source file:crawler.PDFParser.java
License:Open Source License
/** * Parse a PdfDictionary, looking for URIs recursively and adding * them to foundURIs/*from w w w . ja v a2 s . com*/ * @param entity */ protected void extractURIs(PdfObject entity) { // deal with dictionaries if (entity.isDictionary()) { PdfDictionary dictionary = (PdfDictionary) entity; @SuppressWarnings("unchecked") Set<PdfName> allkeys = dictionary.getKeys(); for (PdfName key : allkeys) { PdfObject value = dictionary.get(key); // see if it's the key is a UR[I,L] if (key.toString().equals("/URI") || key.toString().equals("/URL")) foundURIs.add(value.toString()); else this.extractURIs(value); } // deal with arrays } else if (entity.isArray()) { PdfArray array = (PdfArray) entity; ArrayList arrayObjects = array.getArrayList(); Iterator objectList = arrayObjects.iterator(); while (objectList.hasNext()) this.extractURIs((PdfObject) objectList.next()); // deal with indirect references } else if (entity.getClass() == PRIndirectReference.class) { PRIndirectReference indirect = (PRIndirectReference) entity; // if we've already seen a reference to this object if (haveSeen(indirect.getGeneration(), indirect.getNumber())) return; // note that we've seen it if it's new else markAsSeen(indirect.getGeneration(), indirect.getNumber()); // dereference the "pointer" and process the object indirect.getReader(); PdfObject direct = PdfReader.getPdfObject(indirect); this.extractURIs(direct); } }
From source file:org.archive.modules.extractor.PDFParser.java
License:Apache License
/** * Parse a PdfDictionary, looking for URIs recursively and adding * them to foundURIs/*ww w.ja v a2s .co m*/ * @param entity */ @SuppressWarnings("unchecked") protected void extractURIs(PdfObject entity) { // deal with dictionaries if (entity.isDictionary()) { PdfDictionary dictionary = (PdfDictionary) entity; Set<PdfName> allkeys = dictionary.getKeys(); for (PdfName key : allkeys) { PdfObject value = dictionary.get(key); // see if it's the key is a UR[I,L] if (key.toString().equals("/URI") || key.toString().equals("/URL")) { foundURIs.add(value.toString()); } else { this.extractURIs(value); } } // deal with arrays } else if (entity.isArray()) { PdfArray array = (PdfArray) entity; for (PdfObject pdfObject : (Iterable<PdfObject>) array.getArrayList()) { this.extractURIs(pdfObject); } // deal with indirect references } else if (entity.getClass() == PRIndirectReference.class) { PRIndirectReference indirect = (PRIndirectReference) entity; // if we've already seen a reference to this object if (haveSeen(indirect.getGeneration(), indirect.getNumber())) { return; // note that we've seen it if it's new } else { markAsSeen(indirect.getGeneration(), indirect.getNumber()); } // dereference the "pointer" and process the object indirect.getReader(); // FIXME: examine side-effects PdfObject direct = PdfReader.getPdfObject(indirect); this.extractURIs(direct); } }