List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getKeywords
public String getKeywords()
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *//*from w w w . j a v a 2s.c o m*/ private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/**
 * Adds the text content and the PDF metadata of the given stream to the
 * Lucene document.
 *
 * @param document         the document to add the contents to
 * @param is               the stream to get the contents from
 * @param documentLocation the location of the document, used just for debug messages
 * @throws IOException if there is an error parsing the document
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        StringWriter textWriter = new StringWriter();
        stripper.writeText(pdfDocument, textWriter);

        // The buffer-to-string conversion is costless: the writer buffer's
        // char array is shared with the string as long as the buffer is not
        // modified afterwards, which does not happen here.
        String text = textWriter.getBuffer().toString();

        // Add the contents as a Reader-valued text field so it gets
        // tokenized and indexed.
        addTextField(document, "contents", new StringReader(text));

        PDDocumentInformation docInfo = pdfDocument.getDocumentInformation();
        if (docInfo != null) {
            addTextField(document, "Author", docInfo.getAuthor());
            addTextField(document, "CreationDate", docInfo.getCreationDate());
            addTextField(document, "Creator", docInfo.getCreator());
            addTextField(document, "Keywords", docInfo.getKeywords());
            addTextField(document, "ModificationDate", docInfo.getModificationDate());
            addTextField(document, "Producer", docInfo.getProducer());
            addTextField(document, "Subject", docInfo.getSubject());
            addTextField(document, "Title", docInfo.getTitle());
            addTextField(document, "Trapped", docInfo.getTrapped());
        }

        // Store the first 1500 characters as an unindexed summary field so
        // it can be returned with hit documents for display.
        String summary = text.substring(0, Math.min(text.length(), 1500));
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle()); addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor()); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try {//w ww. jav a 2 s . co m // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } }
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/** * Analyze if Keyword(s) embedded in Document Information dictionary and in * XMP properties are synchronized/* w w w .ja va 2 s . com*/ * * @param dico * Document Information Dictionary * @param pdf * PDF Schema * @param ve * The list of validation errors */ protected void analyzeKeywordsProperty(PDDocumentInformation dico, AdobePDFSchema pdf, List<ValidationError> ve) { String keyword = dico.getKeywords(); if (keyword != null) { if (pdf != null) { if (pdf.getKeywords() == null) { ve.add(AbsentXMPPropertyError("Keywords", "Property is not defined")); } else { if (!pdf.getKeywordsValue().equals(keyword)) { ve.add(unsynchronizedMetaDataError("Keywords")); } } } else { ve.add(AbsentSchemaMetaDataError("Keywords", "PDF")); } } }
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibEntry from the * PDDocumentInformation in a PDF file./*from ww w . ja v a2 s. c om*/ * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibEntry. * * @return The bibtex entry found in the document information. */ public static Optional<BibEntry> getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibEntry entry = new BibEntry(); entry.setType("misc"); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if ("entrytype".equals(key)) { entry.setType(value); } else { entry.setField(key, value); } } } // Return empty Optional if no values were found return entry.getFieldNames().isEmpty() ? Optional.empty() : Optional.of(entry); }
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibtexEntry from the * PDDocumentInformation in a PDF file.//from www.j av a 2 s. c o m * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibtexEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibtexEntry. * * @return The bibtex entry found in the document information. */ @SuppressWarnings("unchecked") public static BibtexEntry getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibtexEntry entry = new BibtexEntry(); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if (key.equals("entrytype")) { BibtexEntryType type = BibtexEntryType.getStandardType(value); if (type != null) { entry.setType(type); } } else { entry.setField(key, value); } } } // Return null if no values were found return (!entry.getAllFields().isEmpty() ? entry : null); }
From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java
License:Apache License
/** * {@inheritDoc}//from ww w.j a v a2 s .co m */ @Override public void parse(InputStream inputStream, long filesize, ContentParserOptions options, MutableGenericContext context) throws Exception { PDFParser parser = new PDFParser(inputStream); parser.parse(); PDDocument pdfDoc = parser.getPDDocument(); try { if (pdfDoc.isEncrypted()) { // pdfDoc.decrypt("password"); return; } PDDocumentInformation info = pdfDoc.getDocumentInformation(); String title = info.getTitle(); if (title != null) { context.setVariable(VARIABLE_NAME_TITLE, title); } String keywords = info.getKeywords(); if (keywords != null) { context.setVariable(VARIABLE_NAME_KEYWORDS, keywords); } String author = info.getAuthor(); if (author != null) { context.setVariable(VARIABLE_NAME_CREATOR, author); } if (filesize < options.getMaximumBufferSize()) { PDFTextStripper stripper = new PDFTextStripper(); context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc)); } } finally { pdfDoc.close(); } }
From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java
License:Open Source License
/**
 * Extracts the text and basic metadata (title, author, subject, keywords)
 * of a PDF stream into a ParseResult, reporting page-by-page progress to
 * the context and honoring its cancellation flag.
 *
 * @param in      the stream to read the PDF from
 * @param context supplies the progress reporter and the cancelable handle
 * @return the parse result holding the extracted text and metadata
 * @throws ParseException if loading or text extraction fails, including
 *         when the document is password-protected
 */
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();
        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        // Anonymous subclass hooks the page callbacks: startPage reports
        // progress; endPage aborts further extraction (by setting the end
        // page to 0) once the user has canceled.
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);
        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }
        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        // A CryptographyException cause means the empty-password attempt
        // failed, i.e. the document is genuinely password-protected.
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}
From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java
License:Open Source License
@Test public void testZippedPdf() throws Exception { new ZipAndRun(TestFiles.multi_page_pdf) { protected void handleInputStream(InputStream in) throws Exception { PDDocument pdfDoc = PDDocument.load(in); PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.setForceParsing(true); stripper.setSortByPosition(true); stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()) .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject()) .addMiscMetadata(pdInfo.getKeywords()); String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3"); String actualContents = result.getContent().toString().trim(); assertEquals(expectedContents, actualContents); }/*from w w w. j a v a 2 s . c om*/ }; }
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException { PDDocument pdfDoc = null;/*from ww w. j av a 2s . c o m*/ try { // Check if PDF file is encrypted pdfDoc = PDDocument.load(file); if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (Exception e) { throw new ParseException(file, Msg.no_extraction_permission.value()); } } // Get tags and contents PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.writeText(pdfDoc, writer); DocFetcher.getInstance().setExceptionHandlerEnabled(true); PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(), pdInfo.getKeywords(), }; for (String field : metaData) if (field != null) writer.append(" ").append(field); //$NON-NLS-1$ return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]); } catch (IOException e) { throw new ParseException(file, Msg.file_not_readable.value()); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { e.printStackTrace(); } } } }