List of usage examples for org.apache.pdfbox.pdmodel.PDDocumentInformation#getCreator()
public String getCreator()
From source file:PDFConverter.java
License:Apache License
/** * Implementation is informed by PDFBox authors. * * @param doc//from w w w. ja v a2 s . co m * @return * @throws IOException */ @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes * with the lucene project. This class will populate the following * fields. 
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> * <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> * <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> * <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> * </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> * <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>ModificationDate</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td> * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td> * <td>From PDF meta-data if available</td> </tr> <tr> * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr> * </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on // Even if the doc is encrypted, apparently you can try. Throw exception if it fails. textdoc.addProperty("encrypted", "YES"); } //create a writer where to append the text content. 
StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setText(writer.getBuffer().toString()); return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:PDFExtractMetadata.java
License:Apache License
/**
 * Passes the title, subject, author, creator and producer of a PDF
 * information dictionary to {@code display}, each with its label, in that
 * order.
 *
 * @param information the information dictionary to read from
 */
private static void showDocumentInformation(PDDocumentInformation information) {
    final String title = information.getTitle();
    display("Title:", title);

    final String subject = information.getSubject();
    display("Subject:", subject);

    final String author = information.getAuthor();
    display("Author:", author);

    final String creator = information.getCreator();
    display("Creator:", creator);

    final String producer = information.getProducer();
    display("Producer:", producer);
}
From source file:adams.flow.transformer.PDFMetaData.java
License:Open Source License
/** * Executes the flow item./*from w w w . j av a 2s.c o m*/ * * @return null if everything is fine, otherwise error message */ @Override protected String doExecute() { String result; File file; SpreadSheet sheet; PDDocument document; PDDocumentInformation info; Row row; Set<String> keys; result = null; // get file if (m_InputToken.getPayload() instanceof File) file = (File) m_InputToken.getPayload(); else file = new PlaceholderFile((String) m_InputToken.getPayload()); sheet = new DefaultSpreadSheet(); sheet.setDataRowClass(SparseDataRow.class); sheet.setName("Meta-Data: " + file.getAbsolutePath()); try { row = sheet.addRow(); document = PDDocument.load(file.getAbsoluteFile()); info = document.getDocumentInformation(); addCell(row, "Title", info.getTitle()); addCell(row, "Subject", info.getSubject()); addCell(row, "Author", info.getAuthor()); addCell(row, "Keywords", info.getKeywords()); addCell(row, "Producer", info.getProducer()); addCell(row, "Creation Date", info.getCreationDate()); addCell(row, "Modification Date", info.getModificationDate()); addCell(row, "Creator", info.getCreator()); addCell(row, "Trapped", info.getTrapped()); keys = info.getMetadataKeys(); for (String key : keys) addCell(row, "Meta-" + key, info.getCustomMetadataValue(key)); } catch (Exception e) { result = handleException("Failed to extract meta-data: ", e); } if (result == null) m_OutputToken = new Token(sheet); return result; }
From source file:com.esri.geoportal.commons.pdf.PdfUtils.java
License:Apache License
/** * Reads metadata values from a PDF file. * /* ww w . j a v a2 s . co m*/ * @param rawBytes the PDF to read * @param defaultTitle title to be used if the PDF metadata doesn't have one * @param geometryServiceUrl url of a <a href="https://developers.arcgis.com/rest/services-reference/geometry-service.htm">geometry service</a> for reprojecting coordinates. * * @return metadata properties or null if the PDF cannot be read. * * @throws IOException on parsing error */ public static Properties readMetadata(byte[] rawBytes, String defaultTitle, String geometryServiceUrl) throws IOException { Properties ret = new Properties(); // Attempt to read in the PDF file try (PDDocument document = PDDocument.load(rawBytes)) { // See if we can read the PDF if (!document.isEncrypted()) { // Get document metadata PDDocumentInformation info = document.getDocumentInformation(); if (info != null) { if (info.getTitle() != null) { ret.put(PROP_TITLE, info.getTitle()); } else { ret.put(PROP_TITLE, defaultTitle); } if (info.getSubject() != null) { ret.put(PROP_SUBJECT, info.getSubject()); } else { StringBuilder psudoSubject = new StringBuilder(""); psudoSubject.append("\nAuthor: " + info.getAuthor()); psudoSubject.append("\nCreator: " + info.getCreator()); psudoSubject.append("\nProducer: " + info.getProducer()); ret.put(PROP_SUBJECT, psudoSubject.toString()); } if (info.getModificationDate() != null) { ret.put(PROP_MODIFICATION_DATE, info.getModificationDate().getTime()); } else { ret.put(PROP_MODIFICATION_DATE, info.getCreationDate().getTime()); } } else { LOG.warn("Got null metadata for PDF file"); return null; } // Attempt to read in geospatial PDF data COSObject measure = document.getDocument().getObjectByType(COSName.getPDFName("Measure")); String bBox = null; if (measure != null) { // This is a Geospatial PDF (i.e. 
Adobe's standard) COSDictionary dictionary = (COSDictionary) measure.getObject(); float[] coords = ((COSArray) dictionary.getItem("GPTS")).toFloatArray(); bBox = generateBbox(coords); } else { PDPage page = document.getPage(0); if (page.getCOSObject().containsKey(COSName.getPDFName("LGIDict"))) { // This is a GeoPDF (i.e. TerraGo's standard) bBox = extractGeoPDFProps(page, geometryServiceUrl); } } if (bBox != null) { ret.put(PROP_BBOX, bBox); } } else { LOG.warn("Cannot read encrypted PDF file"); return null; } } catch (IOException ex) { LOG.error("Exception reading PDF", ex); throw ex; } return ret; }
From source file:com.github.joemcintyre.pdffinish.PDFFinish.java
License:Open Source License
/** * Show metadata from PDF document./*from w w w . j ava 2s .co m*/ * * @param document Loaded PDF document. */ private static void showMetadata(PDDocument document) throws IOException { PDDocumentInformation info = document.getDocumentInformation(); System.out.println("Title: " + info.getTitle()); System.out.println("Author: " + info.getAuthor()); System.out.println("Subject: " + info.getSubject()); System.out.println("Keywords: " + info.getKeywords()); System.out.println("Creator: " + info.getCreator()); System.out.println("Producer: " + info.getProducer()); System.out.println("Creation Date: " + info.getCreationDate()); System.out.println("Modification Date: " + info.getModificationDate()); }
From source file:com.openkm.util.metadata.MetadataExtractor.java
License:Open Source License
/**
 * Extract metadata from PDF.
 *
 * <p>Loads the document from the stream, copies the page count and the
 * information-dictionary fields into a {@link PdfMetadata}, logs the result
 * and closes the loaded document before returning.
 *
 * @param is stream positioned at the start of the PDF data
 * @return the populated metadata holder
 * @throws IOException if the PDF cannot be loaded, read or closed
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);

    try {
        PDDocumentInformation info = doc.getDocumentInformation();
        PdfMetadata md = new PdfMetadata();
        md.setNumberOfPages(doc.getNumberOfPages());
        md.setTitle(info.getTitle());
        md.setAuthor(info.getAuthor());
        md.setSubject(info.getSubject());
        md.setKeywords(info.getKeywords());
        md.setCreator(info.getCreator());
        md.setProducer(info.getProducer());
        md.setTrapped(info.getTrapped());
        md.setCreationDate(info.getCreationDate());
        md.setModificationDate(info.getModificationDate());
        log.info("pdfExtractor: {}", md);
        return md;
    } finally {
        // The document was previously never closed, leaking its resources
        // on every call, including when a getter threw.
        doc.close();
    }
}
From source file:com.wintindustries.pdffilter.pdfcore.PDFTester.java
/**
 * Prints the page count and information-dictionary fields of a PDF to
 * standard output as {@code key=value} lines, followed by the catalog's
 * metadata stream as a string when the catalog has one.
 *
 * @param document the loaded PDF document
 * @throws IOException declared for the metadata accessors used here
 */
static public void printMetadata(PDDocument document) throws IOException {
    final PDDocumentInformation docInfo = document.getDocumentInformation();
    final PDDocumentCatalog catalog = document.getDocumentCatalog();
    final PDMetadata xmp = catalog.getMetadata();
    final java.io.PrintStream out = System.out;

    out.println("Page Count=" + document.getNumberOfPages());
    out.println("Title=" + docInfo.getTitle());
    out.println("Author=" + docInfo.getAuthor());
    out.println("Subject=" + docInfo.getSubject());
    out.println("Keywords=" + docInfo.getKeywords());
    out.println("Creator=" + docInfo.getCreator());
    out.println("Producer=" + docInfo.getProducer());
    out.println("Creation Date=" + formatDate(docInfo.getCreationDate()));
    out.println("Modification Date=" + formatDate(docInfo.getModificationDate()));
    out.println("Trapped=" + docInfo.getTrapped());

    // The metadata stream is optional; nothing more to print without it.
    if (xmp == null) {
        return;
    }
    out.println("Metadata=" + xmp.getInputStreamAsString());
}
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java
License:Apache License
/** * This method will return the key and value pairs stored in the PDF * information. It's the basic information like title, subject, author, * creator, keywords, producer (meaning application) as well as creation * and modification date. The method is provided for debugging purposes. * /* ww w.j a va 2 s .c om*/ * @return Returns <code>key=value</code> pair line by line (using system * dependent newline). */ @SuppressWarnings("unused") private String getPdfInfo() { StringBuffer stringBuffer = new StringBuffer(); if (pdfDocument != null) { PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation(); // Title if (pdfInfo.getTitle() != null) { stringBuffer.append("Title"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getTitle()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Subject if (pdfInfo.getSubject() != null) { stringBuffer.append("Subject"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getSubject()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Keywords if (pdfInfo.getKeywords() != null) { stringBuffer.append("Keywords"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getKeywords()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Author if (pdfInfo.getAuthor() != null) { stringBuffer.append("Author"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getAuthor()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Producer if (pdfInfo.getProducer() != null) { stringBuffer.append("Producer"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getProducer()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Creator if (pdfInfo.getCreator() != null) { stringBuffer.append("Creator"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getCreator()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // CreationDate try { if (pdfInfo.getCreationDate() != null) { stringBuffer.append("CreationDate"); stringBuffer.append("="); 
stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch // ModDate try { if (pdfInfo.getModificationDate() != null) { stringBuffer.append("ModDate"); stringBuffer.append("="); stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch } // end if return stringBuffer.toString(); }
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from w w w . j a v a 2 s . c om private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///w w w . j a v a2 s .c om private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); StringReader reader = new StringReader(contents); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. addTextField(document, "contents", reader); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addTextField(document, "Author", info.getAuthor()); addTextField(document, "CreationDate", info.getCreationDate()); addTextField(document, "Creator", info.getCreator()); addTextField(document, "Keywords", info.getKeywords()); addTextField(document, "ModificationDate", info.getModificationDate()); addTextField(document, "Producer", info.getProducer()); addTextField(document, "Subject", info.getSubject()); addTextField(document, "Title", info.getTitle()); addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. 
addUnindexedField(document, "summary", summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }