List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getKeywords
public String getKeywords()
From source file:fr.univ_tours.etu.pdf.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *//*from w w w . j a v a 2s.c o m*/ private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); // create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } stripper.writeText(pdfDocument, writer); String contentsDirty = writer.getBuffer().toString(); //System.out.println(contentsDirty.substring(0,100)); String contents = contentsDirty.replaceAll("\\p{Sm}|\\p{Sk}|\\p{So}", " "); //System.out.println(contents); // addTextField(document, DocFields.CONTENTS, reader); TextField ne = this.getNamedEntities(contents); String lemmas = nlpNeTokenizer.getLemmaString(); //StringReader reader = new StringReader(contents); StringReader reader = new StringReader(lemmas); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
FieldType type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); type.setStored(false); type.setTokenized(true); document.add(new Field(DocFields.CONTENTS, reader, type)); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { document.add(ne);//adding named entities addTextField(document, DocFields.AUTHOR, info.getAuthor()); try {//to avoid issues with CreationDate addUnstoredDate(document, DocFields.CREATION_DATE, info.getCreationDate().getTime()); } catch (Exception e) { System.out.println("Warning: some issue with CreationDate attribute!"); } addTextField(document, DocFields.CREATOR, info.getCreator()); addTextField(document, DocFields.KEYWORDS, info.getKeywords()); addTextField(document, DocFields.SUBJECT, info.getSubject()); addTextField(document, DocFields.TITLE, info.getTitle()); //addTextField(document, "Title", info.getTitle()); //addTextField(document, "ModificationDate", info.getModificationDate()); //addTextField(document, "Producer", info.getProducer()); //addTextField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 1500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addUnindexedField(document, DocFields.SUMMARY, summary); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/**
 * Adds the text content and the PDF metadata of the given stream to the
 * Lucene document.
 *
 * @param document         the document to add the contents to
 * @param is               the stream to get the contents from
 * @param documentLocation the location of the document, used just for debug messages
 * @throws IOException if there is an error parsing the document
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        if (stripper == null) {
            stripper = new PDFTextStripper();
        }
        StringWriter textWriter = new StringWriter();
        stripper.writeText(pdfDocument, textWriter);

        // The buffer-to-string conversion is costless: the writer buffer's
        // char array is shared with the string as long as the buffer is not
        // modified afterwards, which does not happen here.
        String text = textWriter.getBuffer().toString();

        // Add the contents as a Reader-valued text field so it gets
        // tokenized and indexed.
        addTextField(document, "contents", new StringReader(text));

        PDDocumentInformation docInfo = pdfDocument.getDocumentInformation();
        if (docInfo != null) {
            addTextField(document, "Author", docInfo.getAuthor());
            addTextField(document, "CreationDate", docInfo.getCreationDate());
            addTextField(document, "Creator", docInfo.getCreator());
            addTextField(document, "Keywords", docInfo.getKeywords());
            addTextField(document, "ModificationDate", docInfo.getModificationDate());
            addTextField(document, "Producer", docInfo.getProducer());
            addTextField(document, "Subject", docInfo.getSubject());
            addTextField(document, "Title", docInfo.getTitle());
            addTextField(document, "Trapped", docInfo.getTrapped());
        }

        // Store the first 1500 characters as an unindexed summary field so
        // it can be returned with hit documents for display.
        String summary = text.substring(0, Math.min(text.length(), 1500));
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle()); addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor()); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try {//w ww. jav a 2 s . co m // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } }
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/** * Analyze if Keyword(s) embedded in Document Information dictionary and in * XMP properties are synchronized/* w w w .ja va 2 s . com*/ * * @param dico * Document Information Dictionary * @param pdf * PDF Schema * @param ve * The list of validation errors */ protected void analyzeKeywordsProperty(PDDocumentInformation dico, AdobePDFSchema pdf, List<ValidationError> ve) { String keyword = dico.getKeywords(); if (keyword != null) { if (pdf != null) { if (pdf.getKeywords() == null) { ve.add(AbsentXMPPropertyError("Keywords", "Property is not defined")); } else { if (!pdf.getKeywordsValue().equals(keyword)) { ve.add(unsynchronizedMetaDataError("Keywords")); } } } else { ve.add(AbsentSchemaMetaDataError("Keywords", "PDF")); } } }
From source file:net.sf.jabref.logic.xmp.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibEntry from the * PDDocumentInformation in a PDF file./*from ww w . ja v a2 s. c om*/ * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibEntry. * * @return The bibtex entry found in the document information. */ public static Optional<BibEntry> getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibEntry entry = new BibEntry(); entry.setType("misc"); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if ("entrytype".equals(key)) { entry.setType(value); } else { entry.setField(key, value); } } } // Return empty Optional if no values were found return entry.getFieldNames().isEmpty() ? Optional.empty() : Optional.of(entry); }
From source file:net.sf.jabref.util.XMPUtil.java
License:Open Source License
/** * Helper function for retrieving a BibtexEntry from the * PDDocumentInformation in a PDF file.//from www.j av a 2 s. c o m * * To understand how to get hold of a PDDocumentInformation have a look in * the test cases for XMPUtil. * * The BibtexEntry is build by mapping individual fields in the document * information (like author, title, keywords) to fields in a bibtex entry. * * @param di * The document information from which to build a BibtexEntry. * * @return The bibtex entry found in the document information. */ @SuppressWarnings("unchecked") public static BibtexEntry getBibtexEntryFromDocumentInformation(PDDocumentInformation di) { BibtexEntry entry = new BibtexEntry(); String s = di.getAuthor(); if (s != null) { entry.setField("author", s); } s = di.getTitle(); if (s != null) { entry.setField("title", s); } s = di.getKeywords(); if (s != null) { entry.setField("keywords", s); } s = di.getSubject(); if (s != null) { entry.setField("abstract", s); } COSDictionary dict = di.getDictionary(); for (Map.Entry<COSName, COSBase> o : dict.entrySet()) { String key = o.getKey().getName(); if (key.startsWith("bibtex/")) { String value = dict.getString(key); key = key.substring("bibtex/".length()); if (key.equals("entrytype")) { BibtexEntryType type = BibtexEntryType.getStandardType(value); if (type != null) { entry.setType(type); } } else { entry.setField(key, value); } } } // Return null if no values were found return (!entry.getAllFields().isEmpty() ? entry : null); }
From source file:net.sf.mmm.content.parser.impl.pdf.ContentParserPdf.java
License:Apache License
/** * {@inheritDoc}//from ww w.j a v a2 s .co m */ @Override public void parse(InputStream inputStream, long filesize, ContentParserOptions options, MutableGenericContext context) throws Exception { PDFParser parser = new PDFParser(inputStream); parser.parse(); PDDocument pdfDoc = parser.getPDDocument(); try { if (pdfDoc.isEncrypted()) { // pdfDoc.decrypt("password"); return; } PDDocumentInformation info = pdfDoc.getDocumentInformation(); String title = info.getTitle(); if (title != null) { context.setVariable(VARIABLE_NAME_TITLE, title); } String keywords = info.getKeywords(); if (keywords != null) { context.setVariable(VARIABLE_NAME_KEYWORDS, keywords); } String author = info.getAuthor(); if (author != null) { context.setVariable(VARIABLE_NAME_CREATOR, author); } if (filesize < options.getMaximumBufferSize()) { PDFTextStripper stripper = new PDFTextStripper(); context.setVariable(VARIABLE_NAME_TEXT, stripper.getText(pdfDoc)); } } finally { pdfDoc.close(); } }
From source file:net.sourceforge.docfetcher.model.parse.PdfParser.java
License:Open Source License
/**
 * Extracts the text and basic metadata (title, author, subject, keywords)
 * of a PDF stream into a ParseResult, reporting page-by-page progress to
 * the context and honoring its cancellation flag.
 *
 * @param in      the stream to read the PDF from
 * @param context supplies the progress reporter and the cancelable handle
 * @return the parse result holding the extracted text and metadata
 * @throws ParseException if loading or text extraction fails, including
 *         when the document is password-protected
 */
@Override
protected ParseResult parse(@NotNull InputStream in, @NotNull final ParseContext context)
        throws ParseException {
    PDDocument pdfDoc = null;
    try {
        /*
         * TODO post-release-1.1: check if 'force' argument in PDDocument/Stripper increases
         * number of parsed PDF files
         */
        pdfDoc = PDDocument.load(in, true);
        PDDocumentInformation pdInfo;
        final int pageCount;
        try {
            pdInfo = pdfDoc.getDocumentInformation();
            pageCount = pdfDoc.getNumberOfPages();
        } catch (ClassCastException e) {
            // Bug #3529070 and #3528345
            throw new ParseException(e);
        }
        StringWriter writer = new StringWriter();
        /*
         * If the PDF file is encrypted, the PDF stripper will automatically
         * try an empty password.
         *
         * In contrast to the paging PDF parser that is used for the
         * preview, we do not need to call setSortByPosition(true) here
         * because the extracted text will be digested by Lucene anyway.
         */
        // Anonymous subclass hooks the page callbacks: startPage reports
        // progress; endPage aborts further extraction (by setting the end
        // page to 0) once the user has canceled.
        PDFTextStripper stripper = new PDFTextStripper() {
            protected void startPage(PDPage page) throws IOException {
                context.getReporter().subInfo(getCurrentPageNo(), pageCount);
            }

            protected void endPage(PDPage page) throws IOException {
                if (context.getCancelable().isCanceled())
                    setEndPage(0);
            }
        };
        stripper.setForceParsing(true);
        try {
            stripper.writeText(pdfDoc, writer);
        } catch (RuntimeException e) {
            /*
             * PDFTextStripper.writeText can throw various
             * RuntimeExceptions, see bugs #3446010, #3448272, #3444887.
             */
            throw new ParseException(e);
        }
        return new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()).addAuthor(pdInfo.getAuthor())
                .addMiscMetadata(pdInfo.getSubject()).addMiscMetadata(pdInfo.getKeywords());
    } catch (IOException e) {
        // A CryptographyException cause means the empty-password attempt
        // failed, i.e. the document is genuinely password-protected.
        if (e.getCause() instanceof CryptographyException)
            throw new ParseException(Msg.doc_pw_protected.get());
        throw new ParseException(e);
    } finally {
        close(pdfDoc);
    }
}
From source file:net.sourceforge.docfetcher.model.parse.TestParseFromZip.java
License:Open Source License
@Test public void testZippedPdf() throws Exception { new ZipAndRun(TestFiles.multi_page_pdf) { protected void handleInputStream(InputStream in) throws Exception { PDDocument pdfDoc = PDDocument.load(in); PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.setForceParsing(true); stripper.setSortByPosition(true); stripper.writeText(pdfDoc, writer); // Will handle encryption with empty password PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); ParseResult result = new ParseResult(writer.getBuffer()).setTitle(pdInfo.getTitle()) .addAuthor(pdInfo.getAuthor()).addMiscMetadata(pdInfo.getSubject()) .addMiscMetadata(pdInfo.getKeywords()); String expectedContents = Util.join(Util.LS, "page 1", "page 2", "page 3"); String actualContents = result.getContent().toString().trim(); assertEquals(expectedContents, actualContents); }/*from w w w. j a v a 2 s . c om*/ }; }
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException { PDDocument pdfDoc = null;/*from ww w. j av a 2s . c o m*/ try { // Check if PDF file is encrypted pdfDoc = PDDocument.load(file); if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (Exception e) { throw new ParseException(file, Msg.no_extraction_permission.value()); } } // Get tags and contents PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.writeText(pdfDoc, writer); DocFetcher.getInstance().setExceptionHandlerEnabled(true); PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(), pdInfo.getKeywords(), }; for (String field : metaData) if (field != null) writer.append(" ").append(field); //$NON-NLS-1$ return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]); } catch (IOException e) { throw new ParseException(file, Msg.file_not_readable.value()); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { e.printStackTrace(); } } } }