List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getTitle
public String getTitle()
From source file:ddf.catalog.transformer.input.pdf.PdfInputTransformer.java
License:Open Source License
/** * @param pdfDocument PDF document/*from w w w . ja v a 2 s. co m*/ * @param metacard A mutable metacard to add the extracted data to */ private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) { PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation(); setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED); setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED); if (usePdfTitleAsTitle) { setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE); } setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME); setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION); setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD); }
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java
License:Apache License
/** * This method will return the key and value pairs stored in the PDF * information. It's the basic information like title, subject, author, * creator, keywords, producer (meaning application) as well as creation * and modification date. The method is provided for debugging purposes. * // w w w .j a v a2 s .co m * @return Returns <code>key=value</code> pair line by line (using system * dependent newline). */ @SuppressWarnings("unused") private String getPdfInfo() { StringBuffer stringBuffer = new StringBuffer(); if (pdfDocument != null) { PDDocumentInformation pdfInfo = pdfDocument.getDocumentInformation(); // Title if (pdfInfo.getTitle() != null) { stringBuffer.append("Title"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getTitle()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Subject if (pdfInfo.getSubject() != null) { stringBuffer.append("Subject"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getSubject()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Keywords if (pdfInfo.getKeywords() != null) { stringBuffer.append("Keywords"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getKeywords()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Author if (pdfInfo.getAuthor() != null) { stringBuffer.append("Author"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getAuthor()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Producer if (pdfInfo.getProducer() != null) { stringBuffer.append("Producer"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getProducer()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // Creator if (pdfInfo.getCreator() != null) { stringBuffer.append("Creator"); stringBuffer.append("="); stringBuffer.append(pdfInfo.getCreator()); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if // CreationDate try { if (pdfInfo.getCreationDate() != null) { stringBuffer.append("CreationDate"); stringBuffer.append("="); stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getCreationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch // ModDate try { if (pdfInfo.getModificationDate() != null) { stringBuffer.append("ModDate"); stringBuffer.append("="); stringBuffer.append(GlobalTools.calendar2String(pdfInfo.getModificationDate(), GlobalTools.DATE_FORMAT_STRING_ISO8601)); stringBuffer.append(GlobalTools.LINESEPARATOR); } // end if } catch (IOException ex) { } // end try..catch } // end if return stringBuffer.toString(); }
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *//*from ww w . j ava 2 s.c om*/ private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from w w w . j ava2s. com private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); StringReader reader = new StringReader(contents); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. addTextField(document, "contents", reader); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addTextField(document, "Author", info.getAuthor()); addTextField(document, "CreationDate", info.getCreationDate()); addTextField(document, "Creator", info.getCreator()); addTextField(document, "Keywords", info.getKeywords()); addTextField(document, "ModificationDate", info.getModificationDate()); addTextField(document, "Producer", info.getProducer()); addTextField(document, "Subject", info.getSubject()); addTextField(document, "Title", info.getTitle()); addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, "summary", summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle()); addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor()); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try {//from w ww. j av a2 s . c o m // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } }
From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java
License:Open Source License
@Override public Reader getText(URI uri, int page, StringBuilder title) throws FileNotFoundException, MalformedURLException, IOException //----------------------------------------------------------------------------------------- { FileWriter writer = null;//from w w w. ja v a2s. co m PDDocument pdf = null; PDFTextStripper stripper = null; java.io.File tmpPdf = null; try { tmpPdf = Utils.uri2File(uri); if (tmpPdf != null) pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true); else pdf = PDDocument.load(uri.toURL(), true); PDDocumentInformation pdfInfo = pdf.getDocumentInformation(); String s = pdfInfo.getTitle(); if ((s == null) || (s.length() == 0)) s = uri.getPath(); if (title != null) title.append(s); stripper = new PDFTextStripper(); if (page >= 0) { stripper.setStartPage(page); stripper.setEndPage(page); } else { stripper.setStartPage(1); stripper.setEndPage(pdf.getNumberOfPages()); } java.io.File f = java.io.File.createTempFile("pdf", ".tmp"); writer = new FileWriter(f); stripper.writeText(pdf, writer); try { writer.close(); writer = null; } catch (Exception _e) { } stripper.resetEngine(); return new FileReader(f); } finally { if (stripper != null) try { stripper.resetEngine(); } catch (Exception _e) { } if (pdf != null) try { pdf.close(); } catch (Exception _e) { } if (writer != null) try { writer.close(); } catch (Exception _e) { } if ((tmpPdf != null) && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0)) tmpPdf.delete(); } }
From source file:net.homeip.donaldm.doxmentor4j.indexers.PDFIndexer.java
License:Open Source License
@Override public long index(String href, URI uri, boolean followLinks, Object... extraParams) throws IOException //----------------------------------------------------------------------------------------------------- { if (m_indexWriter == null) { logger.error("PDFIndexer: index writer is null"); return -1; }//from ww w. ja v a 2s . co m PDDocument pdf = null; PDFTextStripper stripper = null; Reader reader = null; Writer writer = null; java.io.File tmpPdf = null; try { tmpPdf = Utils.uri2File(uri); if (tmpPdf != null) pdf = PDDocument.load(tmpPdf.getAbsolutePath(), true); else pdf = PDDocument.load(uri.toURL(), true); PDDocumentInformation pdfInfo = pdf.getDocumentInformation(); String title = pdfInfo.getTitle(); if ((title == null) || (title.isEmpty())) title = uri.getPath(); stripper = new PDFTextStripper(); int noPages = pdf.getNumberOfPages(); stripper.setSuppressDuplicateOverlappingText(false); if (noPages != PDDocument.UNKNOWN_NUMBER_OF_PAGES) { for (int page = 1; page <= noPages; page++) { stripper.setStartPage(page); stripper.setEndPage(page); writer = new StringWriter(); stripper.writeText(pdf, writer); reader = new StringReader(writer.toString()); Document doc = new Document(); doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO)); doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("page", Integer.toString(page), Field.Store.YES, Field.Index.NO)); if (addDocument(doc)) AjaxIndexer.incrementCount(); try { writer.close(); writer = null; } catch (Exception _e) { } try { reader.close(); reader = null; } catch (Exception _e) { } if ((page % 50) == 0) { try { System.runFinalization(); System.gc(); } catch (Exception _e) { } } } } else { java.io.File f = java.io.File.createTempFile("pdf", ".tmp"); writer = new FileWriter(f); stripper.writeText(pdf, writer); try { writer.close(); writer = null; } catch (Exception _e) { } reader = new FileReader(f); Document doc = new Document(); doc.add(new Field("path", href, Field.Store.YES, Field.Index.NO)); doc.add(new Field("title", title.toString(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("page", "-1", Field.Store.YES, Field.Index.NO)); if (addDocument(doc)) AjaxIndexer.incrementCount(); try { reader.close(); reader = null; } catch (Exception _e) { } try { System.runFinalization(); System.gc(); } catch (Exception _e) { } } return 1; } catch (Exception e) { logger.error("Error indexing PDF text from " + uri.toString(), e); return -1; } finally { if (stripper != null) try { stripper.resetEngine(); } catch (Exception _e) { } if (pdf != null) try { pdf.close(); } catch (Exception _e) { } if (writer != null) try { writer.close(); } catch (Exception _e) { } if (reader != null) try { reader.close(); } catch (Exception _e) { } if ((tmpPdf != null) && (tmpPdf.getAbsolutePath().toLowerCase().indexOf(System.getProperty("java.io.tmpdir")) >= 0)) tmpPdf.delete(); } }
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/** * Analyze if Title embedded in Document Information dictionary and in XMP * properties are synchronized/*from w w w . ja v a2 s . com*/ * * @param dico * Document Information Dictionary * @param dc * Dublin Core Schema * @param ve * The list of validation errors */ protected void analyzeTitleProperty(PDDocumentInformation dico, DublinCoreSchema dc, List<ValidationError> ve) { String title = dico.getTitle(); if (title != null) { if (dc != null) { // Check the x-default value, if not found, check with the first value // found if (dc.getTitle() != null) { if (dc.getTitleValue("x-default") != null) { if (!dc.getTitleValue("x-default").equals(title)) { ve.add(unsynchronizedMetaDataError("Title")); } } else { // This search of first value is made just to keep compatibility // with lot of PDF documents // which use title without lang definition // REM : MAY we have to delete this option in the future Iterator<AbstractField> it = dc.getTitle().getContainer().getAllProperties().iterator(); if (it.hasNext()) { AbstractField tmp = it.next(); if (tmp instanceof TextType) { if (!((TextType) tmp).getStringValue().equals(title)) { ve.add(unsynchronizedMetaDataError("Title")); } } else { ve.add(AbsentXMPPropertyError("Title", "Property is badly defined")); } } else { ve.add(AbsentXMPPropertyError("Title", "Property is not defined")); } } } else { ve.add(AbsentXMPPropertyError("Title", "Property is not defined")); } } else { ve.add(AbsentSchemaMetaDataError("Title", "Dublin Core")); } } }
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibEntry from the * PDDocumentInformation in a PDF file./*from www . j a v a 2 s . c o m*/ * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibEntry. * * @return The bibtex entry found in the document information. */ public static Optional<BibEntry> getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibEntry entry = new BibEntry(); entry.setType("misc"); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if ("entrytype".equals(key)) { entry.setType(value); } else { entry.setField(key, value); } } } // Return empty Optional if no values were found return entry.getFieldNames().isEmpty() ? Optional.empty() : Optional.of(entry); }
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibtexEntry from the * PDDocumentInformation in a PDF file./*w w w . ja va2 s . c o m*/ * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibtexEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibtexEntry. * * @return The bibtex entry found in the document information. */ @SuppressWarnings("unchecked") public static BibtexEntry getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibtexEntry entry = new BibtexEntry(); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if (key.equals("entrytype")) { BibtexEntryType type = BibtexEntryType.getStandardType(value); if (type != null) { entry.setType(type); } } else { entry.setField(key, value); } } } // Return null if no values were found return (!entry.getAllFields().isEmpty() ? entry : null); }