Example usage for org.apache.pdfbox.pdmodel PDDocumentInformation getSubject

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocumentInformation getSubject.

Prototype

public String getSubject()

Source Link

Document

This will get the subject of the document.

Usage

From source file:org.pdfsam.pdf.DefaultPDFBoxLoader.java

License:Open Source License

/**
 * Copies the page count, the PDF version and the document information
 * entries of the given document into the descriptor.
 *
 * @param document   the PDF document to read the values from
 * @param descriptor the descriptor that receives the values
 */
public void accept(PDDocument document, PdfDocumentDescriptor descriptor) {
    descriptor.pages(document.getNumberOfPages());

    String versionText = Float.toString(document.getVersion());
    descriptor.setVersion(getVersion(versionText));

    PDDocumentInformation metadata = document.getDocumentInformation();

    // Entries stored under the keys defined by PdfMetadataKey.
    descriptor.putInformation(PdfMetadataKey.TITLE.getKey(), metadata.getTitle());
    descriptor.putInformation(PdfMetadataKey.AUTHOR.getKey(), metadata.getAuthor());
    descriptor.putInformation(PdfMetadataKey.CREATOR.getKey(), metadata.getCreator());
    descriptor.putInformation(PdfMetadataKey.SUBJECT.getKey(), metadata.getSubject());
    descriptor.putInformation(PdfMetadataKey.KEYWORDS.getKey(), metadata.getKeywords());

    // Entries stored under literal keys.
    descriptor.putInformation("Producer", metadata.getProducer());

    // The formatted creation date is stored only when a creation date is present.
    Optional.ofNullable(metadata.getCreationDate())
            .map(FORMATTER::format)
            .ifPresent(formatted -> descriptor.putInformation("FormattedCreationDate", formatted));
}

From source file:org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java

License:Open Source License

/**
 * Extracts metadata and content from the resource identified by
 * {@code locator} and attaches the results to {@code topic}.
 *
 * The handler is selected from the lower-cased suffix of the locator: PDF,
 * RTF, MS Office, OpenDocument, HTML and plain text are handled explicitly;
 * anything else is classified by MIME type and stored either as text or as
 * binary enrichment. Note that the PDF branch loads the document from the
 * locator itself and does not read {@code inputStream}.
 *
 * Any exception raised during extraction is passed to {@code log} and not
 * propagated.
 *
 * @param locator     location of the resource; its suffix selects the handler
 * @param inputStream stream with the resource content (unused for PDF)
 * @param topicMap    topic map in which type and value topics are created
 * @param topic       topic that receives the extracted data
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) {
    try {
        // Use the last path segment of the locator as the document name.
        String name = locator;
        if (name.indexOf("/") != -1) {
            name = name.substring(name.lastIndexOf("/") + 1);
        } else if (name.indexOf("\\") != -1) {
            name = name.substring(name.lastIndexOf("\\") + 1);
        }
        String lowerCaseLocator = locator.toLowerCase();

        // --- HANDLE PDF ENRICHMENT TEXT ---
        if (lowerCaseLocator.endsWith("pdf")) {
            String content;
            PDDocument doc = PDDocument.load(locator);
            // Close the document even when extraction fails part-way through.
            try {
                PDDocumentInformation info = doc.getDocumentInformation();
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);

                // --- PDF PRODUCER ---
                String producer = info.getProducer();
                if (producer != null && producer.length() > 0) {
                    Topic producerType = createTopic(topicMap, "pdf-producer");
                    setData(topic, producerType, defaultLang, producer.trim());
                }

                // --- PDF MODIFICATION DATE ---
                Calendar mCal = info.getModificationDate();
                if (mCal != null) {
                    String mdate = dateFormatter.format(mCal.getTime());
                    if (mdate != null && mdate.length() > 0) {
                        Topic modificationDateType = createTopic(topicMap, "pdf-modification-date");
                        setData(topic, modificationDateType, defaultLang, mdate.trim());
                    }
                }

                // --- PDF CREATOR ---
                String creator = info.getCreator();
                if (creator != null && creator.length() > 0) {
                    Topic creatorType = createTopic(topicMap, "pdf-creator");
                    setData(topic, creatorType, defaultLang, creator.trim());
                }

                // --- PDF CREATION DATE ---
                Calendar cCal = info.getCreationDate();
                if (cCal != null) {
                    String cdate = dateFormatter.format(cCal.getTime());
                    if (cdate != null && cdate.length() > 0) {
                        Topic creationDateType = createTopic(topicMap, "pdf-creation-date");
                        setData(topic, creationDateType, defaultLang, cdate.trim());
                    }
                }

                // --- PDF AUTHOR ---
                String author = info.getAuthor();
                if (author != null && author.length() > 0) {
                    Topic authorType = createTopic(topicMap, "pdf-author");
                    setData(topic, authorType, defaultLang, author.trim());
                }

                // --- PDF SUBJECT ---
                String subject = info.getSubject();
                if (subject != null && subject.length() > 0) {
                    Topic subjectType = createTopic(topicMap, "pdf-subject");
                    setData(topic, subjectType, defaultLang, subject.trim());
                }

                // --- PDF TITLE ---
                // Read the title entry; the subject is already handled above.
                String title = info.getTitle();
                if (title != null && title.length() > 0) {
                    Topic titleType = createTopic(topicMap, "pdf-title");
                    setData(topic, titleType, defaultLang, title.trim());
                }

                // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
                String keywords = info.getKeywords();
                if (keywords != null && keywords.length() > 0) {
                    Topic keywordType = createTopic(topicMap, "pdf-keyword");
                    for (String rawKeyword : keywords.split(";")) {
                        String keyword = Textbox.trimExtraSpaces(rawKeyword);
                        if (keyword != null && keyword.length() > 0) {
                            Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                            createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
                        }
                    }
                }

                // --- PDF TEXT CONTENT ---
                PDFTextStripper stripper = new PDFTextStripper();
                content = stripper.getText(doc);
            } finally {
                doc.close();
            }
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE RTF DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("rtf")) {
            String content = Textbox.RTF2PlainText(inputStream);
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE OFFICE DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx")
                || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls")
                || lowerCaseLocator.endsWith("vsd")) {
            String content = MSOfficeBox.getText(inputStream);
            if (content != null) {
                setTextEnrichment(topic, topicMap, content, name);
            }
        }

        // --- HANDLE OPENDOCUMENT FILES ---
        else if (lowerCaseLocator.endsWith("odt") || lowerCaseLocator.endsWith("odp")
                || lowerCaseLocator.endsWith("odg") || lowerCaseLocator.endsWith("ods")) {

            org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document
                    .loadDocument(inputStream);
            String content = OpenOfficeBox.getText(oodocument);
            setTextEnrichment(topic, topicMap, content, name);

            org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata();

            // --- OO KEYWORDS ---
            List<String> keywords = meta.getKeywords();
            if (keywords != null && !keywords.isEmpty()) {
                Topic keywordType = createTopic(topicMap, "oo-keyword");
                for (String keyword : keywords) {
                    // Check for null before trimming so one bad entry cannot abort the extraction.
                    if (keyword == null) {
                        continue;
                    }
                    keyword = keyword.trim();
                    if (keyword.length() > 0) {
                        Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                        createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
                    }
                }
            }

            // --- OO TITLE ---
            String title = meta.getTitle();
            if (title != null && title.length() > 0) {
                Topic titleType = createTopic(topicMap, "oo-title");
                setData(topic, titleType, defaultLang, title.trim());
            }

            // --- OO SUBJECT ---
            String subject = meta.getSubject();
            if (subject != null && subject.length() > 0) {
                Topic subjectType = createTopic(topicMap, "oo-subject");
                setData(topic, subjectType, defaultLang, subject.trim());
            }

            // --- OO CREATOR ---
            String author = meta.getCreator();
            if (author != null && author.length() > 0) {
                Topic authorType = createTopic(topicMap, "oo-author");
                setData(topic, authorType, defaultLang, author.trim());
            }

            // --- OO CREATION DATE ---
            Calendar cCal = meta.getCreationDate();
            if (cCal != null) {
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
                String cdate = dateFormatter.format(cCal.getTime());
                if (cdate != null && cdate.length() > 0) {
                    Topic creationDateType = createTopic(topicMap, "oo-creation-date");
                    setData(topic, creationDateType, defaultLang, cdate.trim());
                }
            }

            // --- OO DESCRIPTION ---
            String description = meta.getDescription();
            if (description != null && description.length() > 0) {
                Topic descriptionType = createTopic(topicMap, "oo-description");
                setData(topic, descriptionType, defaultLang, description.trim());
            }

            // --- OO GENERATOR ---
            String generator = meta.getGenerator();
            if (generator != null && generator.length() > 0) {
                Topic generatorType = createTopic(topicMap, "oo-generator");
                setData(topic, generatorType, defaultLang, generator.trim());
            }
        }

        // --- HANDLE HTML DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("html") || lowerCaseLocator.endsWith("htm")) {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE PLAIN TEXT DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("txt") || lowerCaseLocator.endsWith("text")) {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE ANY OTHER DOCUMENTS ---
        else {
            byte[] content = IObox.loadBFile(inputStream);
            String mimeType = "";
            MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector");

            // Collect MIME type candidates from the locator and from the bytes.
            Collection<MimeType> mimeTypes = new ArrayList<MimeType>();
            if (locator != null) {
                if (MimeTypes.getMimeType(locator) != null) {
                    mimeTypes.add(new MimeType(MimeTypes.getMimeType(locator)));
                }
                mimeTypes.addAll(MimeUtil.getMimeTypes(locator));
            }
            mimeTypes.addAll(MimeUtil.getMimeTypes(content));

            // A single textual candidate is enough to treat the content as text.
            boolean isText = false;
            for (MimeType mime : mimeTypes) {
                if (MimeUtil.isTextMimeType(mime)) {
                    isText = true;
                    break;
                }
            }
            if (isText) {
                setTextEnrichment(topic, topicMap, new String(content), name);
            } else {
                // Binary content is stored with the first detected MIME type, or "" when none was found.
                if (!mimeTypes.isEmpty()) {
                    MimeType mime = mimeTypes.iterator().next();
                    mimeType = mime.toString();
                }
                setBinaryEnrichment(topic, topicMap, content, mimeType);
            }
        }
    } catch (Exception e) {
        log(e);
    }
}

From source file:org.wandora.application.tools.extractors.files.SimplePDFExtractor.java

License:Open Source License

/**
 * Loads the PDF identified by {@code locator} and stores its document
 * information entries and text content as occurrences and associations of
 * {@code pdfTopic}.
 *
 * Locators starting with {@code http://} are loaded as URLs, everything else
 * as a local file; {@code inputStream} is not read. When
 * {@code makeVariantFromTitle} is set the title becomes the display name of
 * the topic instead of a "pdf-title" occurrence. When {@code makePageTopics}
 * is set the text is stored per page in separate page topics, otherwise as a
 * single "pdf-text" occurrence.
 *
 * Exceptions are printed to standard error and not propagated.
 *
 * @param locator     URL or file path of the PDF document
 * @param inputStream unused by this implementation
 * @param topicMap    topic map in which type and value topics are created
 * @param pdfTopic    topic that receives the extracted data
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap,
        Topic pdfTopic) {
    PDDocument doc = null;
    try {
        if (locator.startsWith("http://")) {
            doc = PDDocument.load(new URL(locator));
        } else {
            doc = PDDocument.load(new File(locator));
        }
        PDDocumentInformation info = doc.getDocumentInformation();
        DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);

        // --- PDF PRODUCER ---
        String producer = info.getProducer();
        if (producer != null && producer.length() > 0) {
            Topic producerType = createTopic(topicMap, "pdf-producer");
            setData(pdfTopic, producerType, defaultLang, producer.trim());
        }

        // --- PDF MODIFICATION DATE ---
        Calendar mCal = info.getModificationDate();
        if (mCal != null) {
            String mdate = dateFormatter.format(mCal.getTime());
            if (mdate != null && mdate.length() > 0) {
                Topic modificationDateType = createTopic(topicMap, "pdf-modification-date");
                setData(pdfTopic, modificationDateType, defaultLang, mdate.trim());
            }
        }

        // --- PDF CREATOR ---
        String creator = info.getCreator();
        if (creator != null && creator.length() > 0) {
            Topic creatorType = createTopic(topicMap, "pdf-creator");
            setData(pdfTopic, creatorType, defaultLang, creator.trim());
        }

        // --- PDF CREATION DATE ---
        Calendar cCal = info.getCreationDate();
        if (cCal != null) {
            String cdate = dateFormatter.format(cCal.getTime());
            if (cdate != null && cdate.length() > 0) {
                Topic creationDateType = createTopic(topicMap, "pdf-creation-date");
                setData(pdfTopic, creationDateType, defaultLang, cdate.trim());
            }
        }

        // --- PDF AUTHOR ---
        String author = info.getAuthor();
        if (author != null && author.length() > 0) {
            Topic authorType = createTopic(topicMap, "pdf-author");
            setData(pdfTopic, authorType, defaultLang, author.trim());
        }

        // --- PDF SUBJECT ---
        String subject = info.getSubject();
        if (subject != null && subject.length() > 0) {
            Topic subjectType = createTopic(topicMap, "pdf-subject");
            setData(pdfTopic, subjectType, defaultLang, subject.trim());
        }

        // --- PDF TITLE ---
        // Read the title entry; the subject is already handled above.
        String title = info.getTitle();
        if (title != null && title.length() > 0) {
            if (makeVariantFromTitle) {
                pdfTopic.setDisplayName(defaultLang, title);
            } else {
                Topic titleType = createTopic(topicMap, "pdf-title");
                setData(pdfTopic, titleType, defaultLang, title.trim());
            }
        }

        // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
        String keywords = info.getKeywords();
        if (keywords != null && keywords.length() > 0) {
            Topic keywordType = createTopic(topicMap, "pdf-keyword");
            for (String rawKeyword : keywords.split(";")) {
                String keyword = Textbox.trimExtraSpaces(rawKeyword);
                if (keyword != null && keyword.length() > 0) {
                    Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                    createAssociation(topicMap, keywordType, new Topic[] { pdfTopic, keywordTopic });
                }
            }
        }

        // --- PDF TEXT CONTENT ---
        PDFTextStripper stripper = new PDFTextStripper();

        if (makePageTopics) {
            // PDFTextStripper page numbers are 1-based, so iterate 1..pages;
            // a 0-based loop yields an empty first page and skips the last one.
            int pages = doc.getNumberOfPages();
            for (int page = 1; page <= pages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                String pageContent = stripper.getText(doc);
                Topic pageType = createTopic(topicMap, "pdf-page");
                Topic pageTopic = createTopic(topicMap, pdfTopic.getBaseName() + " (page " + page + ")",
                        pageType);
                Topic orderType = createTopic(topicMap, "order");
                Topic orderTopic = createTopic(topicMap, page + ".", orderType);
                Topic contentType = createTopic(topicMap, "pdf-text");
                setData(pageTopic, contentType, defaultLang, pageContent.trim());
                createAssociation(topicMap, pageType, new Topic[] { pdfTopic, pageTopic, orderTopic });
            }
        } else {
            String content = stripper.getText(doc);
            if (content != null && content.length() > 0) {
                Topic contentType = createTopic(topicMap, "pdf-text");
                setData(pdfTopic, contentType, defaultLang, content.trim());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release the document; report a failing close without masking the outcome.
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
    }
}

From source file:org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java

License:Open Source License

/**
 * Extracts text content (and, for PDF, a few document information entries)
 * from the resource identified by {@code locator} and attaches it to
 * {@code textTopic}.
 *
 * The handler is selected from the lower-cased suffix of the locator. PDF
 * documents are loaded from the locator as a URL and do not read
 * {@code inputStream}; RTF and MS Office documents are converted to plain
 * text; everything else is read as text. Exceptions are passed to
 * {@code log} and not propagated.
 *
 * @param locator     location of the resource; its suffix selects the handler
 * @param inputStream stream with the resource content (unused for PDF)
 * @param topicMap    topic map in which type topics are created
 * @param textTopic   topic that receives the extracted data
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap,
        Topic textTopic) {
    try {
        String lowerCaseLocator = locator.toLowerCase();

        // --- HANDLE PDF ENRICHMENT TEXT ---
        if (lowerCaseLocator.endsWith("pdf")) {

            PDDocument doc = PDDocument.load(new URL(locator));
            // Close the document even when extraction fails part-way through.
            try {
                PDDocumentInformation info = doc.getDocumentInformation();

                // --- PDF SUBJECT ---
                String subject = info.getSubject();
                if (subject != null && subject.length() > 0) {
                    Topic subjectType = createTopic(topicMap, "subject");
                    setData(textTopic, subjectType, defaultLang, subject.trim());
                }

                // --- PDF TITLE ---
                String title = info.getTitle();
                if (title != null && title.length() > 0) {
                    Topic titleType = createTopic(topicMap, "title");
                    setData(textTopic, titleType, defaultLang, title.trim());
                }

                // --- PDF KEYWORDS ---
                String keywords = info.getKeywords();
                if (keywords != null && keywords.length() > 0) {
                    Topic keywordType = createTopic(topicMap, "keywords");
                    setData(textTopic, keywordType, defaultLang, keywords.trim());
                }

                // --- PDF TEXT CONTENT ---
                PDFTextStripper stripper = new PDFTextStripper();
                String content = stripper.getText(doc);
                setTextEnrichment(textTopic, topicMap, content);
            } finally {
                doc.close();
            }
        }

        // --- HANDLE RTF DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("rtf")) {
            String content = Textbox.RTF2PlainText(inputStream);
            setTextEnrichment(textTopic, topicMap, content);
        }

        // --- HANDLE OFFICE DOCUMENTS ---
        // "xls" (Excel) replaces the former "xsl" suffix, which looked like a
        // typo; XSL stylesheets are text and now fall through to the text handler.
        else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx")
                || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls")
                || lowerCaseLocator.endsWith("vsd")) {
            String content = MSOfficeBox.getText(inputStream);
            if (content != null) {
                setTextEnrichment(textTopic, topicMap, content);
            }
        }

        // --- HANDLE TXT DOCUMENTS ---
        else {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(textTopic, topicMap, content);
        }
    } catch (Exception e) {
        log(e);
    }
}

From source file:org.wandora.piccolo.utils.crawler.handlers.PDFHandler.java

License:Open Source License

/**
 * Loads the PDF at {@code page}, extracts its text and a few document
 * information entries, and hands them to the crawler as an index document.
 *
 * The PDF is loaded from the URL; {@code in} and {@code depth} are not used
 * by this implementation. Exceptions are printed to standard error and not
 * propagated.
 *
 * @param crawler the crawler that receives the resulting document
 * @param in      unused by this implementation
 * @param depth   unused by this implementation
 * @param page    URL of the PDF document
 */
public void handle(CrawlerAccess crawler, InputStream in, int depth, URL page) {
    try {
        Document d = new Document();

        PDDocument doc = PDDocument.load(page);
        PDDocumentInformation info;
        String content;
        // Close the PDF even when text extraction throws.
        try {
            info = doc.getDocumentInformation();
            PDFTextStripper stripper = new PDFTextStripper();
            content = stripper.getText(doc);
        } finally {
            doc.close();
        }

        d.add(LuceneCrawler.subject(info.getSubject()));
        d.add(LuceneCrawler.title(info.getTitle()));
        d.add(LuceneCrawler.keywords(info.getKeywords()));
        d.add(LuceneCrawler.content(content));
        d.add(LuceneCrawler.location(page.toString()));

        crawler.addObject(d);
    } catch (Exception e) {
        // Covers IOException as well; a failing page must not stop the crawl.
        e.printStackTrace();
    }
}

From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java

License:Apache License

/**
 * Builds XMP metadata from the document information entries of a PDF,
 * attaches it to the document catalog and saves the document.
 *
 * @param args The command line arguments: the file to load, then the file
 *             to save to. Any other argument count invokes {@code usage()}.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
        return;
    }

    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(args[0]);
        if (pdf.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
        PDDocumentInformation docInfo = pdf.getDocumentInformation();

        XMPMetadata xmp = new XMPMetadata();

        // PDF schema: keywords and producer.
        XMPSchemaPDF pdfPart = xmp.addPDFSchema();
        pdfPart.setKeywords(docInfo.getKeywords());
        pdfPart.setProducer(docInfo.getProducer());

        // Basic schema: dates and creator tool; the metadata date is the current time.
        XMPSchemaBasic basicPart = xmp.addBasicSchema();
        basicPart.setModifyDate(docInfo.getModificationDate());
        basicPart.setCreateDate(docInfo.getCreationDate());
        basicPart.setCreatorTool(docInfo.getCreator());
        basicPart.setMetadataDate(new GregorianCalendar());

        // Dublin Core schema: title, fixed creator, subject as description.
        XMPSchemaDublinCore dublinCorePart = xmp.addDublinCoreSchema();
        dublinCorePart.setTitle(docInfo.getTitle());
        dublinCorePart.addCreator("PDFBox");
        dublinCorePart.setDescription(docInfo.getSubject());

        PDMetadata xmpStream = new PDMetadata(pdf);
        xmpStream.importXMPMetadata(xmp);
        docCatalog.setMetadata(xmpStream);

        pdf.save(args[1]);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:org.xstudiosys.pdfxmp.MarkBuilder.java

License:Open Source License

/**
 * Builds XMP metadata from the document information entries of the given
 * document and sets it as the metadata stream of the document catalog.
 * Exceptions are printed to standard error and not propagated.
 *
 * @param document the document whose catalog receives the XMP metadata
 */
public void onComplete(PDDocument document) {
    try {
        PDDocumentCatalog docCatalog = document.getDocumentCatalog();
        PDDocumentInformation docInfo = document.getDocumentInformation();

        XMPMetadata xmp = new XMPMetadata();

        // PDF schema: keywords and producer.
        XMPSchemaPDF pdfPart = xmp.addPDFSchema();
        pdfPart.setKeywords(docInfo.getKeywords());
        pdfPart.setProducer(docInfo.getProducer());

        // Basic schema: dates and creator tool; the metadata date is the current time.
        XMPSchemaBasic basicPart = xmp.addBasicSchema();
        basicPart.setModifyDate(docInfo.getModificationDate());
        basicPart.setCreateDate(docInfo.getCreationDate());
        basicPart.setCreatorTool(docInfo.getCreator());
        basicPart.setMetadataDate(new GregorianCalendar());

        // Dublin Core schema: title, fixed creator, subject as description.
        XMPSchemaDublinCore dublinCorePart = xmp.addDublinCoreSchema();
        dublinCorePart.setTitle(docInfo.getTitle());
        dublinCorePart.addCreator("PDFBox");
        dublinCorePart.setDescription(docInfo.getSubject());

        PDMetadata xmpStream = new PDMetadata(document);
        xmpStream.importXMPMetadata(xmp);
        docCatalog.setMetadata(xmpStream);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:org.xstudiosys.pdfxmp.XMPUtil.java

License:Open Source License

/**
 * Builds a BibtexEntry from the PDDocumentInformation of a PDF file.
 *
 * The author, title, keywords and subject entries are mapped to the bibtex
 * fields "author", "title", "keywords" and "abstract". In addition, every
 * dictionary key starting with "bibtex/" is copied to the field named by
 * the remainder of the key, except "bibtex/entrytype", which selects the
 * entry type when it names a standard type.
 *
 * @param di
 *            The document information from which to build a BibtexEntry.
 *
 * @return The bibtex entry found in the document information, or null if
 *         no field was set.
 */
@SuppressWarnings("unchecked")
public static BibtexEntry getBibtexEntryFromDocumentInformation(PDDocumentInformation di) {

    BibtexEntry result = new BibtexEntry();

    // Standard document information entries.
    String author = di.getAuthor();
    if (author != null) {
        result.setField("author", author);
    }

    String title = di.getTitle();
    if (title != null) {
        result.setField("title", title);
    }

    String keywords = di.getKeywords();
    if (keywords != null) {
        result.setField("keywords", keywords);
    }

    String subject = di.getSubject();
    if (subject != null) {
        result.setField("abstract", subject);
    }

    // Custom entries stored under the "bibtex/" key prefix.
    final String prefix = "bibtex/";
    COSDictionary infoDict = di.getDictionary();
    for (Map.Entry<COSName, COSBase> item : infoDict.entrySet()) {
        String rawKey = item.getKey().getName();
        if (!rawKey.startsWith(prefix)) {
            continue;
        }
        String value = infoDict.getString(rawKey);
        String fieldName = rawKey.substring(prefix.length());
        if (!fieldName.equals("entrytype")) {
            result.setField(fieldName, value);
            continue;
        }
        // Only a recognised standard type replaces the entry's type.
        BibtexEntryType type = BibtexEntryType.getStandardType(value);
        if (type != null) {
            result.setType(type);
        }
    }

    // Return null if no values were found
    if (result.getAllFields().size() > 0) {
        return result;
    }
    return null;
}

From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser.java

License:Apache License

/**
 * Copies document information, custom information entries, XMP PDF/A
 * identification and Adobe extension data of a PDF into the metadata object.
 *
 * The "pdf:version" value is first set from the document header and may then
 * be overwritten by the PDF/A identification and by an Adobe extension level.
 *
 * @param document the PDF document to read from
 * @param metadata the metadata object to fill
 * @throws TikaException declared by the signature; not thrown directly in this method body
 */
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());

    // Standard document information entries.
    addMetadata(metadata, Metadata.TITLE, info.getTitle());
    addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
    addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "pdf:creator", info.getCreator());
    addMetadata(metadata, "pdf:producer", info.getProducer());
    addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
    addMetadata(metadata, Metadata.LAST_MODIFIED, info.getModificationDate());

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> standardKeys = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    if (info.getCOSObject() != null && info.getCOSObject().keySet() != null) {
        for (COSName key : info.getCOSObject().keySet()) {
            String name = key.getName();
            if (standardKeys.contains(name)) {
                continue;
            }
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
        }
    }

    // ANJ Extensions:
    // Add other data of interest:
    metadata.set("pdf:version", String.valueOf(document.getDocument().getVersion()));
    metadata.set("pdf:numPages", String.valueOf(document.getNumberOfPages()));
    //metadata.set("pdf:cryptoMode", ""+getCryptoModeAsString(reader));
    //metadata.set("pdf:openedWithFullPermissions", ""+reader.isOpenedWithFullPermissions());
    metadata.set("pdf:encrypted", String.valueOf(document.isEncrypted()));
    //metadata.set("pdf:metadataEncrypted", ""+document.isMetadataEncrypted());
    //metadata.set("pdf:128key", ""+reader.is128Key());
    //metadata.set("pdf:tampered", ""+reader.isTampered());
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            XMPMetadata xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
            // There is a special class for grabbing data in the PDF schema - not sure it will add much here:
            // Could parse xmp:CreatorTool and pdf:Producer etc. etc. out of here.
            // NOTE(review): the result is unused; the call is retained in case it can fail - confirm before removing.
            xmp.getPDFSchema();
            // Added a PDF/A schema class:
            xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class);
            XMPSchemaPDFA pdfaSchema = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class);
            if (pdfaSchema != null) {
                metadata.set("pdfaid:part", pdfaSchema.getPart());
                metadata.set("pdfaid:conformance", pdfaSchema.getConformance());
                String pdfaVersion = "A-" + pdfaSchema.getPart() + pdfaSchema.getConformance().toLowerCase();
                //metadata.set("pdfa:version", version );
                metadata.set("pdf:version", pdfaVersion);
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        log.error("XMP Parsing failed: " + e);
        metadata.set("pdf:metadata-xmp-parse-failed", String.valueOf(e));
    }

    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        COSName adobeName = COSName.getPDFName("ADBE");
        for (COSName extName : extensions.keySet()) {
            if (!extName.equals(adobeName)) {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
                continue;
            }
            // It's an Adobe one, interpret it to determine the extension level:
            COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
            if (adobeExt != null) {
                String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                int level = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                metadata.set("pdf:version", baseVersion + " Adobe Extension Level " + level);
            }
            // TODO WARN if this embedded version is inconsistent with document header version?
        }
    }
    // End Of ANJ Extensions.
}

From source file:zhaw.PDFIndexer.java

License:Apache License

/**
 * This will add the contents to the lucene document.
 *
 * The text of the PDF is added as the contents field, followed by the
 * document information entries (author, dates, keywords, subject, title)
 * and a summary made of at most the first 500 characters of the text.
 * Encrypted documents are decrypted with the empty password. A failure
 * while extracting text is reported and indexing continues with whatever
 * text was written so far.
 *
 * @param document
 *            The document to add the contents to.
 * @param is
 *            The stream to get the contents from.
 * @param documentLocation
 *            The location of the document, used just for debug messages.
 * @throws IOException
 *             If there is an error parsing the document, or if it cannot
 *             be decrypted; the original exception is attached as cause.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);
        if (pdfDocument.isEncrypted()) {
            // Just try using the default password and move on
            pdfDocument.decrypt("");
        }

        // create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        PDFTextStripper stripper = new PDFTextStripper();
        try {
            stripper.writeText(pdfDocument, writer);
        } catch (Exception e) {
            // Best effort: keep indexing with the text extracted so far, but say what failed.
            System.out.println("Error in stripper.writeText() for document(" + documentLocation + "): " + e);
        }
        String contents = writer.getBuffer().toString();

        StringReader reader = new StringReader(contents);
        addTextField(document, Indexer.contents, reader);
        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            addTextField(document, Indexer.Author, info.getAuthor());
            try {
                addTextField(document, Indexer.created, info.getCreationDate());
            } catch (IOException ignored) {
                // ignore, bad date but continue with indexing
            }

            addTextField(document, Indexer.keywords, info.getKeywords());
            try {
                addTextField(document, Indexer.modified, info.getModificationDate());
            } catch (IOException ignored) {
                // ignore, bad date but continue with indexing
            }
            addTextField(document, "Subject", info.getSubject());
            addTextField(document, Indexer.Title, info.getTitle());
        }
        int summarySize = Math.min(contents.length(), 500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and
        // returned with hit documents for display.
        addUnindexedField(document, Indexer.summary, summary);
    } catch (CryptographyException e) {
        // initCause keeps the original stack trace available to callers.
        IOException wrapped = new IOException("Error decrypting document(" + documentLocation + "): " + e);
        wrapped.initCause(e);
        throw wrapped;
    } catch (InvalidPasswordException e) {
        // they didn't supply a password and the default of "" was wrong.
        IOException wrapped = new IOException(
                "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
        wrapped.initCause(e);
        throw wrapped;
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}