Example usage of org.apache.pdfbox.pdmodel.PDDocumentInformation#getProducer()

Introduction

On this page you can find example usages of the getProducer() method of org.apache.pdfbox.pdmodel.PDDocumentInformation.

Prototype

public String getProducer()

Source Link

Document

This will get the producer of the document.

Usage

From source file:org.encuestame.business.search.DataFile.java

License:Apache License

/**
 * Extract Metadata in PDF Documents./* w w w .j  a v a  2 s  .c  om*/
 * @param pdDoc
 * @return
 */
public static AttachmentIndex extractMetadataPDFDocument(final PDDocument pdDoc) {
    PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
    author = docInfo.getAuthor();
    title = docInfo.getTitle();
    producer = docInfo.getProducer();
    subject = docInfo.getSubject();
    AttachmentIndex attachmentMetadata = IndexerFile.addMetadatatoBean(author, title, producer, subject);
    return attachmentMetadata;
}

From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java

License:Open Source License

/**
 * Reads Dublin Core style properties from a PDF document.
 * <p>
 * The XMP metadata of the document catalog (Dublin Core, PDF and Basic schemas)
 * is consulted first. Only when that yields no property at all is the document
 * information dictionary used as a fallback. Failures while reading a single
 * value are logged as warnings and do not abort the extraction.
 * <p>
 * The stream is closed before this method returns, whether or not the document
 * could be loaded.
 *
 * @param is the stream holding the PDF content; must not be {@code null}
 * @return the collected properties, possibly empty
 * @throws IOException if the privileged action failed with an {@link IOException}
 * @throws DocumentReadException if the document is encrypted and cannot be
 *         decrypted with the empty password
 * @throws IllegalArgumentException if {@code is} is {@code null}
 */
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }

                Properties props = new Properties();
                PDDocument pdDocument = null;
                try {
                    // Load inside the try block so that the finally block also closes
                    // the stream when the document cannot be parsed.
                    pdDocument = PDDocument.load(is);

                    if (pdDocument.isEncrypted()) {
                        try {
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }

                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();

                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null)
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null)
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getCreators() != null) {
                                    // Every creator is stored under the same key, so the
                                    // last one in the list is the one that remains.
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getDates() != null) {
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }

                            try {
                                if (pdf.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            // Both dates are stored under DCMetaData.DATE; when present, the
                            // modification date replaces the creation date.
                            try {
                                if (basic.getCreateDate() != null)
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (basic.getModifyDate() != null)
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }

                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }

                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();

                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null)
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null)
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getModificationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null)
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null)
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }

                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    try {
                        if (pdDocument != null) {
                            pdDocument.close();
                        }
                    } finally {
                        // Close the stream even when closing the document failed.
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });

    } catch (PrivilegedActionException pae) {
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof DocumentReadException) {
            // Part of the declared contract: surface it as-is instead of wrapping it.
            throw (DocumentReadException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}

From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java

License:Apache License

/**
 * This will add the contents to the lucene document.
 * <p>
 * The extracted text is added as the "contents" and "stemmedcontents" fields,
 * the entries of the document information dictionary are added as individual
 * fields, and the first 500 characters of the text are added as "summary".
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 * @param stripper The text stripper to use; when {@code null} a new
 *        {@link PDFTextStripper} is created, otherwise the given one is reset
 *        before use.
 *
 * @throws IOException If there is an error parsing the document, or if it is
 *         encrypted and cannot be decrypted with the empty password.
 */
private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper)
        throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            pdfDocument.decrypt("");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        } else {
            stripper.resetEngine();
        }
        stripper.writeText(pdfDocument, writer);

        String contents = writer.getBuffer().toString();
        // Add the extracted text under both field names.
        addField(document, "contents", contents);

        addField(document, "stemmedcontents", contents);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            addField(document, "Author", info.getAuthor());
            try {
                addField(document, "CreationDate", info.getCreationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            addField(document, "Creator", info.getCreator());
            addField(document, "Keywords", info.getKeywords());
            try {
                addField(document, "ModificationDate", info.getModificationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            addField(document, "Producer", info.getProducer());
            addField(document, "Subject", info.getSubject());
            addField(document, "Title", info.getTitle());
            addField(document, "Trapped", info.getTrapped());
        }
        // The summary is at most the first 500 characters of the extracted text.
        int summarySize = Math.min(contents.length(), 500);
        String summary = contents.substring(0, summarySize);
        addField(document, "summary", summary);
        addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages()));
    } catch (CryptographyException e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new IOException("Error decrypting document(" + documentLocation + "): " + e, e);
    } catch (InvalidPasswordException e) {
        //they didn't supply a password and the default of "" was wrong.
        throw new IOException(
                "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java

License:Apache License

/**
 * Loads the PDF from {@code in} and records its basic metadata.
 * <p>
 * Page count, orientation of the first page, version and encryption state are
 * always recorded. The entries of the document information dictionary are
 * read only when the document is not encrypted. File-attachment annotations
 * are collected per page into {@code pages}.
 *
 * @return {@code true} once all metadata has been read
 * @throws Exception if loading the document or reading any value fails
 */
public boolean check() throws Exception {
    try (PDDocument pdf = PDDocument.load(in)) {
        PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
        PDPageable printable = new PDPageable(pdf);
        PageFormat firstPageFormat = printable.getPageFormat(0);

        encrypted = pdf.isEncrypted();
        pageCount = pdf.getNumberOfPages();
        orientation = ORIENTATION_STRINGS[firstPageFormat.getOrientation()];

        // Start with the version from the file header; a non-empty version in the
        // catalog takes precedence over it.
        version = String.valueOf(pdf.getDocument().getVersion());
        String versionFromCatalog = docCatalog.getVersion();
        if (versionFromCatalog != null && !versionFromCatalog.isEmpty()) {
            version = versionFromCatalog;
        }

        if (!encrypted) {
            PDDocumentInformation docInfo = pdf.getDocumentInformation();
            author = docInfo.getAuthor();
            creationDate = docInfo.getCreationDate();
            creator = docInfo.getCreator();
            keywords = docInfo.getKeywords();
            modificationDate = docInfo.getModificationDate();
            producer = docInfo.getProducer();
            subject = docInfo.getSubject();
            title = docInfo.getTitle();
        }

        // Collect the file attachments of every page; page numbers start at 1.
        int currentPageNumber = 0;
        for (Object rawPage : docCatalog.getAllPages()) {
            currentPageNumber++;
            PdfPageMetadata pageInfo = new PdfPageMetadata();
            pageInfo.setPageNumber(currentPageNumber);

            for (PDAnnotation candidate : ((PDPage) rawPage).getAnnotations()) {
                if (!(candidate instanceof PDAnnotationFileAttachment)) {
                    continue;
                }
                PdfAttachmentMetadata attachmentInfo = new PdfAttachmentMetadata();

                PDAnnotationFileAttachment attachment = (PDAnnotationFileAttachment) candidate;
                PDComplexFileSpecification spec = (PDComplexFileSpecification) attachment.getFile();
                PDEmbeddedFile embedded = spec.getEmbeddedFile();

                attachmentInfo.setSubject(attachment.getSubject());
                attachmentInfo.setName(spec.getFilename());
                attachmentInfo.setCreationDate(embedded.getCreationDate());
                attachmentInfo.setModificationDate(embedded.getModDate());
                attachmentInfo.setMimeType(embedded.getSubtype());
                attachmentInfo.setData(embedded.getByteArray());

                pageInfo.addAttachment(attachmentInfo);
            }
            pages.add(pageInfo);
        }
        return true;
    }
}

From source file:org.mycore.media.MCRMediaPDFParser.java

License:Open Source License

/**
 * Parse file and store metadata in related Object.
 * /*  www .  j  a v a2s. c  om*/
 * @return MCRMediaObject
 *              can be held any MCRMediaObject
 * @see MCRMediaObject#clone()
 */
@SuppressWarnings("unchecked")
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists())
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        media.fileName = file.getName();
        media.fileSize = file.length();
        media.folderName = (file.getAbsolutePath()).replace(file.getName(), "");

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        PDPage page = (PDPage) pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        LOGGER.error(e.getMessage());
        throw new Exception(e.getMessage());
    } finally {
        pdf.close();
    }

    return media;
}

From source file:org.nuxeo.pdf.PDFInfo.java

License:Open Source License

/**
 * After building the object with the correct constructor, and after
 * possibly having set some parsing property (<code>setParseWithXMP()</code>
 * for example), this method will extract the information from the PDF.
 * <p>/*from ww w.j a  v  a2  s .c  o  m*/
 * After extraction, caller get the info: Either all of them (
 * <code>toHashMap()</code> or <code>toString()</code>) or individual info
 * (see all getters)
 *
 * @throws ClientException
 *
 * @since 5.9.5
 */
public void run() throws ClientException {

    // In case the caller calls several time the run() method
    if (!alreadyParsed) {

        fileName = pdfBlob.getFilename();
        // Getting the file size os ok only if the blob is already backed by
        // a
        // File. If it is pure Stream, we give up
        File pdfFile = BlobHelper.getFileFromBlob(pdfBlob);
        if (pdfFile == null) {
            fileSize = -1;
        } else {
            fileSize = pdfFile.length();
        }

        try {
            pdfDoc = PDDocument.load(pdfBlob.getStream());

            isEncrypted = pdfDoc.isEncrypted();
            if (isEncrypted) {
                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
            }

            numberOfPages = pdfDoc.getNumberOfPages();
            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
            pageLayout = checkNotNull(docCatalog.getPageLayout());
            pdfVersion = "" + pdfDoc.getDocument().getVersion();

            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
            author = checkNotNull(docInfo.getAuthor());
            contentCreator = checkNotNull(docInfo.getCreator());
            keywords = checkNotNull(docInfo.getKeywords());
            creationDate = docInfo.getCreationDate();
            modificationDate = docInfo.getModificationDate();
            producer = checkNotNull(docInfo.getProducer());
            subject = checkNotNull(docInfo.getSubject());
            title = checkNotNull(docInfo.getTitle());

            // Getting dimension is a bit tricky
            mediaBoxWidthInPoints = -1;
            mediaBoxHeightInPoints = -1;
            cropBoxWidthInPoints = -1;
            cropBoxHeightInPoints = -1;
            List<PDPage> allPages = docCatalog.getAllPages();
            boolean gotMediaBox = false;
            boolean gotCropBox = false;
            for (PDPage page : allPages) {

                if (page != null) {
                    PDRectangle r = page.findMediaBox();
                    if (r != null) {
                        mediaBoxWidthInPoints = r.getWidth();
                        mediaBoxHeightInPoints = r.getHeight();
                        gotMediaBox = true;
                    }
                    r = page.findCropBox();
                    if (r != null) {
                        cropBoxWidthInPoints = r.getWidth();
                        cropBoxHeightInPoints = r.getHeight();
                        gotCropBox = true;
                    }
                }
                if (gotMediaBox && gotCropBox) {
                    break;
                }
            }

            if (doXMP) {
                xmp = null;
                PDMetadata metadata = docCatalog.getMetadata();
                if (metadata != null) {
                    xmp = "";
                    InputStream xmlInputStream = metadata.createInputStream();

                    InputStreamReader isr = new InputStreamReader(xmlInputStream);
                    BufferedReader reader = new BufferedReader(isr);
                    String line;
                    do {
                        line = reader.readLine();
                        if (line != null) {
                            xmp += line + "\n";
                        }
                    } while (line != null);
                    reader.close();
                }
            }

        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
            throw new ClientException(/*
                                       * "Cannot get PDF info: " +
                                       * e.getMessage(),
                                       */e);
        } finally {
            if (pdfDoc != null) {
                try {
                    pdfDoc.close();
                } catch (IOException e) {
                    // Ignore
                }
                pdfDoc = null;
            }
            alreadyParsed = true;
        }
    }
}

From source file:org.pdfsam.pdf.DefaultPDFBoxLoader.java

License:Open Source License

/**
 * Copies the page count, the PDF version and the entries of the document
 * information dictionary from the document into the descriptor. The formatted
 * creation date is added only when the document has a creation date.
 *
 * @param document the loaded PDF document to read from
 * @param descriptor the descriptor to populate
 */
public void accept(PDDocument document, PdfDocumentDescriptor descriptor) {
    descriptor.pages(document.getNumberOfPages());

    String rawVersion = Float.toString(document.getVersion());
    descriptor.setVersion(getVersion(rawVersion));

    PDDocumentInformation docInfo = document.getDocumentInformation();
    descriptor.putInformation(PdfMetadataKey.TITLE.getKey(), docInfo.getTitle());
    descriptor.putInformation(PdfMetadataKey.AUTHOR.getKey(), docInfo.getAuthor());
    descriptor.putInformation(PdfMetadataKey.CREATOR.getKey(), docInfo.getCreator());
    descriptor.putInformation(PdfMetadataKey.SUBJECT.getKey(), docInfo.getSubject());
    descriptor.putInformation(PdfMetadataKey.KEYWORDS.getKey(), docInfo.getKeywords());
    descriptor.putInformation("Producer", docInfo.getProducer());

    Optional.ofNullable(docInfo.getCreationDate())
            .map(FORMATTER::format)
            .ifPresent(formatted -> descriptor.putInformation("FormattedCreationDate", formatted));
}

From source file:org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java

License:Open Source License

/**
 * Extracts metadata and content from a document and attaches them to the given topic.
 * <p>
 * The handling is chosen by the suffix of the lower-cased locator: PDF, RTF,
 * MS Office, OpenDocument, HTML and plain text are handled explicitly; any
 * other content is stored as text or binary enrichment depending on its
 * detected MIME types. The PDF branch loads the document from
 * {@code locator}; all other branches read from {@code inputStream}.
 * <p>
 * Any exception is caught and passed to {@code log}.
 *
 * @param locator the location of the document; its suffix selects the handler
 * @param inputStream the stream to read non-PDF content from
 * @param topicMap the topic map in which type and keyword topics are created
 * @param topic the topic that receives the extracted data
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) {
    try {
        String name = locator;
        if (name.indexOf("/") != -1) {
            name = name.substring(name.lastIndexOf("/") + 1);
        } else if (name.indexOf("\\") != -1) {
            name = name.substring(name.lastIndexOf("\\") + 1);
        }
        String lowerCaseLocator = locator.toLowerCase();

        // --- HANDLE PDF ENRICHMENT TEXT ---
        if (lowerCaseLocator.endsWith("pdf")) {
            String content;
            PDDocument doc = PDDocument.load(locator);
            try {
                PDDocumentInformation info = doc.getDocumentInformation();
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);

                // --- PDF PRODUCER ---
                String producer = info.getProducer();
                if (producer != null && producer.length() > 0) {
                    Topic producerType = createTopic(topicMap, "pdf-producer");
                    setData(topic, producerType, defaultLang, producer.trim());
                }

                // --- PDF MODIFICATION DATE ---
                Calendar mCal = info.getModificationDate();
                if (mCal != null) {
                    String mdate = dateFormatter.format(mCal.getTime());
                    if (mdate != null && mdate.length() > 0) {
                        Topic modificationDateType = createTopic(topicMap, "pdf-modification-date");
                        setData(topic, modificationDateType, defaultLang, mdate.trim());
                    }
                }

                // --- PDF CREATOR ---
                String creator = info.getCreator();
                if (creator != null && creator.length() > 0) {
                    Topic creatorType = createTopic(topicMap, "pdf-creator");
                    setData(topic, creatorType, defaultLang, creator.trim());
                }

                // --- PDF CREATION DATE ---
                Calendar cCal = info.getCreationDate();
                if (cCal != null) {
                    String cdate = dateFormatter.format(cCal.getTime());
                    if (cdate != null && cdate.length() > 0) {
                        Topic creationDateType = createTopic(topicMap, "pdf-creation-date");
                        setData(topic, creationDateType, defaultLang, cdate.trim());
                    }
                }

                // --- PDF AUTHOR ---
                String author = info.getAuthor();
                if (author != null && author.length() > 0) {
                    Topic authorType = createTopic(topicMap, "pdf-author");
                    setData(topic, authorType, defaultLang, author.trim());
                }

                // --- PDF SUBJECT ---
                String subject = info.getSubject();
                if (subject != null && subject.length() > 0) {
                    Topic subjectType = createTopic(topicMap, "pdf-subject");
                    setData(topic, subjectType, defaultLang, subject.trim());
                }

                // --- PDF TITLE ---
                // Read the title entry; the subject is handled separately above.
                String title = info.getTitle();
                if (title != null && title.length() > 0) {
                    Topic titleType = createTopic(topicMap, "pdf-title");
                    setData(topic, titleType, defaultLang, title.trim());
                }

                // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
                String keywords = info.getKeywords();
                if (keywords != null && keywords.length() > 0) {
                    Topic keywordType = createTopic(topicMap, "pdf-keyword");
                    String[] keywordArray = keywords.split(";");
                    String keyword = null;
                    for (int i = 0; i < keywordArray.length; i++) {
                        keyword = Textbox.trimExtraSpaces(keywordArray[i]);
                        if (keyword != null && keyword.length() > 0) {
                            Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                            createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
                        }
                    }
                }

                // --- PDF TEXT CONTENT ---
                PDFTextStripper stripper = new PDFTextStripper();
                content = stripper.getText(doc);
            } finally {
                // Release the document even when a metadata or text extraction step fails.
                doc.close();
            }
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE RTF DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("rtf")) {
            String content = Textbox.RTF2PlainText(inputStream);
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE OFFICE DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx")
                || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls")
                || lowerCaseLocator.endsWith("vsd")) {
            String content = MSOfficeBox.getText(inputStream);
            if (content != null) {
                setTextEnrichment(topic, topicMap, content, name);
            }
        }

        // --- HANDLE OPENDOCUMENT FILES ---
        else if (lowerCaseLocator.endsWith("odt") || lowerCaseLocator.endsWith("odp")
                || lowerCaseLocator.endsWith("odg") || lowerCaseLocator.endsWith("ods")) {

            org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document
                    .loadDocument(inputStream);
            String content = OpenOfficeBox.getText(oodocument);
            setTextEnrichment(topic, topicMap, content, name);

            org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata();

            // --- OO KEYWORDS ---
            List<String> keywords = meta.getKeywords();
            if (keywords != null && !keywords.isEmpty()) {
                Topic keywordType = createTopic(topicMap, "oo-keyword");
                for (String keyword : keywords) {
                    keyword = keyword.trim();
                    if (keyword != null && keyword.length() > 0) {
                        Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                        createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
                    }
                }
            }

            // --- OO TITLE ---
            String title = meta.getTitle();
            if (title != null && title.length() > 0) {
                Topic titleType = createTopic(topicMap, "oo-title");
                setData(topic, titleType, defaultLang, title.trim());
            }

            // --- OO SUBJECT ---
            String subject = meta.getSubject();
            if (subject != null && subject.length() > 0) {
                Topic subjectType = createTopic(topicMap, "oo-subject");
                setData(topic, subjectType, defaultLang, subject.trim());
            }

            // --- OO CREATOR ---
            String author = meta.getCreator();
            if (author != null && author.length() > 0) {
                Topic authorType = createTopic(topicMap, "oo-author");
                setData(topic, authorType, defaultLang, author.trim());
            }

            // --- OO CREATION DATE ---
            Calendar cCal = meta.getCreationDate();
            if (cCal != null) {
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
                String cdate = dateFormatter.format(cCal.getTime());
                if (cdate != null && cdate.length() > 0) {
                    Topic creationDateType = createTopic(topicMap, "oo-creation-date");
                    setData(topic, creationDateType, defaultLang, cdate.trim());
                }
            }

            // --- OO DESCRIPTION ---
            String description = meta.getDescription();
            if (description != null && description.length() > 0) {
                Topic descriptionType = createTopic(topicMap, "oo-description");
                setData(topic, descriptionType, defaultLang, description.trim());
            }

            // --- OO GENERATOR ---
            String generator = meta.getGenerator();
            if (generator != null && generator.length() > 0) {
                Topic generatorType = createTopic(topicMap, "oo-generator");
                setData(topic, generatorType, defaultLang, generator.trim());
            }
        }

        // --- HANDLE HTML DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("html") || lowerCaseLocator.endsWith("htm")) {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE PLAIN TEXT DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("txt") || lowerCaseLocator.endsWith("text")) {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE ANY OTHER DOCUMENTS ---
        else {
            byte[] content = IObox.loadBFile(inputStream);
            String mimeType = "";
            MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector");
            // Collect candidate MIME types from the locator and from the content itself.
            Collection<MimeType> mimeTypes = new ArrayList<MimeType>();
            if (locator != null) {
                if (MimeTypes.getMimeType(locator) != null) {
                    mimeTypes.add(new MimeType(MimeTypes.getMimeType(locator)));
                }
                mimeTypes.addAll(MimeUtil.getMimeTypes(locator));
            }
            mimeTypes.addAll(MimeUtil.getMimeTypes(content));
            // A single textual candidate is enough to treat the content as text.
            boolean isText = false;
            for (MimeType mime : mimeTypes) {
                if (MimeUtil.isTextMimeType(mime)) {
                    isText = true;
                    break;
                }
            }
            if (isText) {
                setTextEnrichment(topic, topicMap, new String(content), name);
            } else {
                // Use the first candidate as the MIME type; it stays empty if there is none.
                if (!mimeTypes.isEmpty()) {
                    MimeType mime = mimeTypes.iterator().next();
                    mimeType = mime.toString();
                }
                setBinaryEnrichment(topic, topicMap, content, mimeType);
            }
        }
    } catch (Exception e) {
        log(e);
    }
}

From source file:org.wandora.application.tools.extractors.files.SimplePDFExtractor.java

License:Open Source License

/**
 * Extracts document information and text content of a PDF into the given topic map.
 *
 * <p>The document is loaded from {@code locator}: as a URL when the locator starts with
 * {@code http://} or {@code https://}, otherwise as a local file path. The
 * {@code inputStream} argument is not read by this method.
 *
 * <p>Producer, modification date, creator, creation date, author, subject and title are
 * stored as occurrences of {@code pdfTopic} (the title becomes the display name instead
 * when {@code makeVariantFromTitle} is set). Keywords are split on semicolons and each
 * one is associated with {@code pdfTopic}. The text content is stored either as a single
 * occurrence of {@code pdfTopic} or, when {@code makePageTopics} is set, as one topic
 * per page.
 *
 * <p>Any exception is caught and its stack trace printed; the document is always closed.
 *
 * @param locator     URL or file path of the PDF document
 * @param inputStream unused by this method
 * @param topicMap    topic map receiving the created topics and associations
 * @param pdfTopic    topic representing the PDF document
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap,
        Topic pdfTopic) {
    PDDocument doc = null;
    try {
        if (locator.startsWith("http://") || locator.startsWith("https://")) {
            doc = PDDocument.load(new URL(locator));
        } else {
            doc = PDDocument.load(new File(locator));
        }
        PDDocumentInformation info = doc.getDocumentInformation();
        DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);

        // --- PDF PRODUCER ---
        String producer = info.getProducer();
        if (producer != null && producer.length() > 0) {
            Topic producerType = createTopic(topicMap, "pdf-producer");
            setData(pdfTopic, producerType, defaultLang, producer.trim());
        }

        // --- PDF MODIFICATION DATE ---
        Calendar mCal = info.getModificationDate();
        if (mCal != null) {
            String mdate = dateFormatter.format(mCal.getTime());
            if (mdate != null && mdate.length() > 0) {
                Topic modificationDateType = createTopic(topicMap, "pdf-modification-date");
                setData(pdfTopic, modificationDateType, defaultLang, mdate.trim());
            }
        }

        // --- PDF CREATOR ---
        String creator = info.getCreator();
        if (creator != null && creator.length() > 0) {
            Topic creatorType = createTopic(topicMap, "pdf-creator");
            setData(pdfTopic, creatorType, defaultLang, creator.trim());
        }

        // --- PDF CREATION DATE ---
        Calendar cCal = info.getCreationDate();
        if (cCal != null) {
            String cdate = dateFormatter.format(cCal.getTime());
            if (cdate != null && cdate.length() > 0) {
                Topic creationDateType = createTopic(topicMap, "pdf-creation-date");
                setData(pdfTopic, creationDateType, defaultLang, cdate.trim());
            }
        }

        // --- PDF AUTHOR ---
        String author = info.getAuthor();
        if (author != null && author.length() > 0) {
            Topic authorType = createTopic(topicMap, "pdf-author");
            setData(pdfTopic, authorType, defaultLang, author.trim());
        }

        // --- PDF SUBJECT ---
        String subject = info.getSubject();
        if (subject != null && subject.length() > 0) {
            Topic subjectType = createTopic(topicMap, "pdf-subject");
            setData(pdfTopic, subjectType, defaultLang, subject.trim());
        }

        // --- PDF TITLE ---
        // Read the title entry itself; the subject is already handled above.
        String title = info.getTitle();
        if (title != null && title.length() > 0) {
            if (makeVariantFromTitle) {
                pdfTopic.setDisplayName(defaultLang, title);
            } else {
                Topic titleType = createTopic(topicMap, "pdf-title");
                setData(pdfTopic, titleType, defaultLang, title.trim());
            }
        }

        // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
        String keywords = info.getKeywords();
        if (keywords != null && keywords.length() > 0) {
            Topic keywordType = createTopic(topicMap, "pdf-keyword");
            for (String rawKeyword : keywords.split(";")) {
                String keyword = Textbox.trimExtraSpaces(rawKeyword);
                if (keyword != null && keyword.length() > 0) {
                    Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                    createAssociation(topicMap, keywordType, new Topic[] { pdfTopic, keywordTopic });
                }
            }
        }

        // --- PDF TEXT CONTENT ---
        PDFTextStripper stripper = new PDFTextStripper();

        if (makePageTopics) {
            int pages = doc.getNumberOfPages();
            // PDFTextStripper page numbers are 1-based, so iterate 1..pages inclusive;
            // starting at 0 would extract nothing for the first topic and skip the last page.
            for (int page = 1; page <= pages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                String pageContent = stripper.getText(doc);
                Topic pageType = createTopic(topicMap, "pdf-page");
                Topic pageTopic = createTopic(topicMap, pdfTopic.getBaseName() + " (page " + page + ")",
                        pageType);
                Topic orderType = createTopic(topicMap, "order");
                Topic orderTopic = createTopic(topicMap, page + ".", orderType);
                Topic contentType = createTopic(topicMap, "pdf-text");
                setData(pageTopic, contentType, defaultLang, pageContent.trim());
                createAssociation(topicMap, pageType, new Topic[] { pdfTopic, pageTopic, orderTopic });
            }
        } else {
            String content = stripper.getText(doc);
            if (content != null && content.length() > 0) {
                Topic contentType = createTopic(topicMap, "pdf-text");
                setData(pdfTopic, contentType, defaultLang, content.trim());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on every path; report the close failure itself, not the earlier exception.
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
    }
}

From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java

License:Apache License

/**
 * This will print the documents data./*from   w w w  .j a va 2 s  .co m*/
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
    } else {
        PDDocument document = null;

        try {
            document = PDDocument.load(args[0]);
            if (document.isEncrypted()) {
                System.err.println("Error: Cannot add metadata to encrypted document.");
                System.exit(1);
            }
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentInformation info = document.getDocumentInformation();

            XMPMetadata metadata = new XMPMetadata();

            XMPSchemaPDF pdfSchema = metadata.addPDFSchema();
            pdfSchema.setKeywords(info.getKeywords());
            pdfSchema.setProducer(info.getProducer());

            XMPSchemaBasic basicSchema = metadata.addBasicSchema();
            basicSchema.setModifyDate(info.getModificationDate());
            basicSchema.setCreateDate(info.getCreationDate());
            basicSchema.setCreatorTool(info.getCreator());
            basicSchema.setMetadataDate(new GregorianCalendar());

            XMPSchemaDublinCore dcSchema = metadata.addDublinCoreSchema();
            dcSchema.setTitle(info.getTitle());
            dcSchema.addCreator("PDFBox");
            dcSchema.setDescription(info.getSubject());

            PDMetadata metadataStream = new PDMetadata(document);
            metadataStream.importXMPMetadata(metadata);
            catalog.setMetadata(metadataStream);

            document.save(args[1]);
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}