Example usage for org.apache.pdfbox.pdmodel PDDocumentInformation getTitle

List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getTitle

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocumentInformation getTitle.

Prototype

public String getTitle() 

Source Link

Document

This will get the title of the document.

Usage

From source file:org.nuxeo.pdf.PDFInfo.java

License:Open Source License

/**
 * After building the object with the correct constructor, and after
 * possibly having set some parsing property (<code>setParseWithXMP()</code>
 * for example), this method will extract the information from the PDF.
 * <p>/*ww  w .j  a  v  a  2  s .c o m*/
 * After extraction, caller get the info: Either all of them (
 * <code>toHashMap()</code> or <code>toString()</code>) or individual info
 * (see all getters)
 *
 * @throws ClientException
 *
 * @since 5.9.5
 */
public void run() throws ClientException {

    // In case the caller calls several time the run() method
    if (!alreadyParsed) {

        fileName = pdfBlob.getFilename();
        // Getting the file size os ok only if the blob is already backed by
        // a
        // File. If it is pure Stream, we give up
        File pdfFile = BlobHelper.getFileFromBlob(pdfBlob);
        if (pdfFile == null) {
            fileSize = -1;
        } else {
            fileSize = pdfFile.length();
        }

        try {
            pdfDoc = PDDocument.load(pdfBlob.getStream());

            isEncrypted = pdfDoc.isEncrypted();
            if (isEncrypted) {
                pdfDoc.openProtection(new StandardDecryptionMaterial(password));
            }

            numberOfPages = pdfDoc.getNumberOfPages();
            PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog();
            pageLayout = checkNotNull(docCatalog.getPageLayout());
            pdfVersion = "" + pdfDoc.getDocument().getVersion();

            PDDocumentInformation docInfo = pdfDoc.getDocumentInformation();
            author = checkNotNull(docInfo.getAuthor());
            contentCreator = checkNotNull(docInfo.getCreator());
            keywords = checkNotNull(docInfo.getKeywords());
            creationDate = docInfo.getCreationDate();
            modificationDate = docInfo.getModificationDate();
            producer = checkNotNull(docInfo.getProducer());
            subject = checkNotNull(docInfo.getSubject());
            title = checkNotNull(docInfo.getTitle());

            // Getting dimension is a bit tricky
            mediaBoxWidthInPoints = -1;
            mediaBoxHeightInPoints = -1;
            cropBoxWidthInPoints = -1;
            cropBoxHeightInPoints = -1;
            List<PDPage> allPages = docCatalog.getAllPages();
            boolean gotMediaBox = false;
            boolean gotCropBox = false;
            for (PDPage page : allPages) {

                if (page != null) {
                    PDRectangle r = page.findMediaBox();
                    if (r != null) {
                        mediaBoxWidthInPoints = r.getWidth();
                        mediaBoxHeightInPoints = r.getHeight();
                        gotMediaBox = true;
                    }
                    r = page.findCropBox();
                    if (r != null) {
                        cropBoxWidthInPoints = r.getWidth();
                        cropBoxHeightInPoints = r.getHeight();
                        gotCropBox = true;
                    }
                }
                if (gotMediaBox && gotCropBox) {
                    break;
                }
            }

            if (doXMP) {
                xmp = null;
                PDMetadata metadata = docCatalog.getMetadata();
                if (metadata != null) {
                    xmp = "";
                    InputStream xmlInputStream = metadata.createInputStream();

                    InputStreamReader isr = new InputStreamReader(xmlInputStream);
                    BufferedReader reader = new BufferedReader(isr);
                    String line;
                    do {
                        line = reader.readLine();
                        if (line != null) {
                            xmp += line + "\n";
                        }
                    } while (line != null);
                    reader.close();
                }
            }

        } catch (IOException | BadSecurityHandlerException | CryptographyException e) {
            throw new ClientException(/*
                                       * "Cannot get PDF info: " +
                                       * e.getMessage(),
                                       */e);
        } finally {
            if (pdfDoc != null) {
                try {
                    pdfDoc.close();
                } catch (IOException e) {
                    // Ignore
                }
                pdfDoc = null;
            }
            alreadyParsed = true;
        }
    }
}

From source file:org.nuxeo.pdf.test.PDFPageExtractorTest.java

License:Open Source License

/**
 * Calls <code>extract(5, 9, ...)</code> with explicit title, subject and
 * author values, then checks the type and file name of the returned blob
 * and the document information stored in the resulting PDF.
 */
@Test
public void testExtractPages_WithSetInfo() throws Exception {

    final String baseName = pdfFileBlob.getFilename().replace(".pdf", "");
    final PDFPageExtractor extractor = new PDFPageExtractor(pdfFileBlob);

    final Blob result = extractor.extract(5, 9, null, "One Upon a Time", "Fairyland", "Cool Author");
    assertTrue(result instanceof FileBlob);
    assertEquals(baseName + "-5-9.pdf", result.getFilename());

    // Reload the extracted PDF and verify the values written to it
    final PDDocument extractedDoc = PDDocument.load(result.getStream());
    utils.track(extractedDoc);
    final PDDocumentInformation info = extractedDoc.getDocumentInformation();
    assertEquals("One Upon a Time", info.getTitle());
    assertEquals("Fairyland", info.getSubject());
    assertEquals("Cool Author", info.getAuthor());

    extractedDoc.close();
    utils.untrack(extractedDoc);
}

From source file:org.nuxeo.pdf.test.PDFUtilsTest.java

License:Open Source License

/**
 * Checks <code>PDFUtils.setInfos()</code>: a call with null/empty values
 * must leave title, subject and author untouched, and a call with real
 * values must replace all three.
 */
@Test
public void test_setInfos() throws Exception {

    final PDDocument pdf = PDDocument.load(pdfFile);
    utils.track(pdf);

    // Step 1: the unmodified document has the expected values
    final PDDocumentInformation originalInfo = pdf.getDocumentInformation();
    assertEquals("Untitled 3", originalInfo.getTitle());
    assertNull(originalInfo.getSubject());
    assertNull(originalInfo.getAuthor());

    // Step 2: null and empty arguments must not change anything
    PDFUtils.setInfos(pdf, null, "", null);
    PDDocumentInformation updatedInfo = pdf.getDocumentInformation();
    assertEquals(originalInfo.getTitle(), updatedInfo.getTitle());
    assertEquals(originalInfo.getSubject(), updatedInfo.getSubject());
    assertEquals(originalInfo.getAuthor(), updatedInfo.getAuthor());

    // Step 3: real values are written to the document
    PDFUtils.setInfos(pdf, "The Title", "The Subject", "The Author");
    updatedInfo = pdf.getDocumentInformation();
    assertEquals("The Title", updatedInfo.getTitle());
    assertEquals("The Subject", updatedInfo.getSubject());
    assertEquals("The Author", updatedInfo.getAuthor());

    pdf.close();
    utils.untrack(pdf);
}

From source file:org.paxle.parser.pdf.impl.PdfParser.java

License:Open Source License

/**
 * Copies the metadata of the PDF document into the parser document.
 * <p>
 * Title, author, subject (stored as the summary), keywords and the last
 * modification date are transferred. Null or empty values are skipped, and
 * nothing is done when the PDF has no document information.
 *
 * @param parserDoc the parser document receiving the metadata
 * @param pddDoc the PDF document whose document information is read
 * @throws IOException if reading the document information fails
 */
protected void extractMetaData(IParserDocument parserDoc, PDDocument pddDoc) throws IOException {
    // extract metadata
    final PDDocumentInformation metadata = pddDoc.getDocumentInformation();
    if (metadata == null) {
        return;
    }

    // document title
    final String title = metadata.getTitle();
    if (title != null && title.length() > 0) {
        parserDoc.setTitle(title);
    }

    // document author(s)
    final String author = metadata.getAuthor();
    if (author != null && author.length() > 0) {
        parserDoc.setAuthor(author);
    }

    // subject
    final String summary = metadata.getSubject();
    if (summary != null && summary.length() > 0) {
        parserDoc.setSummary(summary);
    }

    // keywords: separated by commas, semicolons and/or whitespace
    final String keywords = metadata.getKeywords();
    if (keywords != null && keywords.length() > 0) {
        // Drop leading separators and split on runs of separators so that
        // input such as "a, b" or ";a" does not produce empty keywords.
        final String trimmed = keywords.replaceFirst("^[,;\\s]+", "");
        if (!trimmed.isEmpty()) {
            parserDoc.setKeywords(Arrays.asList(trimmed.split("[,;\\s]+")));
        }
    }

    // last modification date
    final Calendar lastMod = metadata.getModificationDate();
    if (lastMod != null) {
        parserDoc.setLastChanged(lastMod.getTime());
    }
}

From source file:org.pdfsam.pdf.DefaultPDFBoxLoader.java

License:Open Source License

/**
 * Fills the descriptor with the page count, the version and the document
 * information entries read from the given document.
 *
 * @param document the loaded PDF document to read from
 * @param descriptor the descriptor receiving the values
 */
public void accept(PDDocument document, PdfDocumentDescriptor descriptor) {
    descriptor.pages(document.getNumberOfPages());

    String versionText = Float.toString(document.getVersion());
    descriptor.setVersion(getVersion(versionText));

    PDDocumentInformation docInfo = document.getDocumentInformation();
    descriptor.putInformation(PdfMetadataKey.TITLE.getKey(), docInfo.getTitle());
    descriptor.putInformation(PdfMetadataKey.AUTHOR.getKey(), docInfo.getAuthor());
    descriptor.putInformation(PdfMetadataKey.CREATOR.getKey(), docInfo.getCreator());
    descriptor.putInformation(PdfMetadataKey.SUBJECT.getKey(), docInfo.getSubject());
    descriptor.putInformation(PdfMetadataKey.KEYWORDS.getKey(), docInfo.getKeywords());
    descriptor.putInformation("Producer", docInfo.getProducer());

    // The formatted creation date is stored only when the document has one
    Optional.ofNullable(docInfo.getCreationDate())
            .map(FORMATTER::format)
            .ifPresent(formatted -> descriptor.putInformation("FormattedCreationDate", formatted));
}

From source file:org.terrier.indexing.PDFDocument.java

License:Mozilla Public License

/**
 * Returns the reader of text, which is suitable for parsing terms out of,
 * and which is created by converting the file represented by
 * parameter is. This method involves running the stream
 * through the PDFBox text stripper.
 * <p>
 * Documents larger than 300Mb are skipped and an empty reader is returned.
 * The "title" property is set from the PDF title when available and
 * enabled, otherwise from the file name.
 *
 * @param is the input stream that represents the document's file.
 * @return Reader a reader that is fed to an indexer.
 * @throws RuntimeException if the document cannot be decrypted or its text
 *             cannot be extracted
 */
protected Reader getReader(InputStream is) {

    if ((Files.length(filename) / 1048576) > 300) {
        logger.info("Skipping document " + filename + " because it's size exceeds 300Mb");
        return new StringReader("");
    }

    PDDocument pdfDocument = null;
    Reader rtr = null;
    try {
        pdfDocument = PDDocument.load(is);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            pdfDocument.decrypt("");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(pdfDocument, writer);

        // Some PDFs come out with another character where spaces should be.
        // If a candidate character occurs more than half as often as the
        // space, treat it as a space.
        String contents = writer.getBuffer().toString();
        int spaceCount = StringUtils.countMatches(contents, " ");
        for (char badChar : new char[] { '\u00A0', '\u2029', '#' }) {
            final int count = StringUtils.countMatches(contents, "" + badChar);
            if (count > spaceCount / 2) {
                contents = contents.replace(badChar, ' ');
                spaceCount += count;
            }
        }
        rtr = new StringReader(contents);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null && USE_PDF_TITLE) {
            // NOTE(review): getTitle() may return null; confirm that
            // setProperty accepts a null value.
            setProperty("title", info.getTitle());
        } else {
            setProperty("title", new java.io.File(super.filename).getName());
        }
    } catch (CryptographyException e) {
        // Keep the original exception as the cause so the stack trace survives
        throw new RuntimeException("Error decrypting PDF document: " + e, e);
    } catch (InvalidPasswordException e) {
        //they didn't suppply a password and the default of "" was wrong.
        throw new RuntimeException("Error: The PDF document is encrypted and will not be indexed.", e);
    } catch (Exception e) {
        throw new RuntimeException("Error extracting PDF document", e);
    } finally {
        if (pdfDocument != null) {
            try {
                pdfDocument.close();
            } catch (IOException ignored) {
                // Best effort: a failure to close must not hide the result
                // or the exception raised above.
            }
        }
    }
    return rtr;
}

From source file:org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java

License:Open Source License

/**
 * Extracts text (and, for PDFs, subject/title/keywords) from the resource
 * and stores it on the given topic.
 * <p>
 * The handler is chosen from the locator's suffix: PDF documents are loaded
 * from the locator URL, while RTF, office and plain-text content is read
 * from the input stream. Any exception is logged and not rethrown.
 *
 * @param locator the resource locator; its suffix selects the handler
 * @param inputStream the stream of the resource content (not used for PDFs)
 * @param topicMap the topic map in which type topics are created
 * @param textTopic the topic that receives the extracted data
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap,
        Topic textTopic) {
    try {
        String lowerCaseLocator = locator.toLowerCase();

        // --- HANDLE PDF ENRICHMENT TEXT ---
        if (lowerCaseLocator.endsWith("pdf")) {

            PDDocument doc = PDDocument.load(new URL(locator));
            try {
                PDDocumentInformation info = doc.getDocumentInformation();

                // --- PDF SUBJECT ---
                String subject = info.getSubject();
                if (subject != null && subject.length() > 0) {
                    Topic subjectType = createTopic(topicMap, "subject");
                    setData(textTopic, subjectType, defaultLang, subject.trim());
                }

                // --- PDF TITLE ---
                String title = info.getTitle();
                if (title != null && title.length() > 0) {
                    Topic titleType = createTopic(topicMap, "title");
                    setData(textTopic, titleType, defaultLang, title.trim());
                }

                // --- PDF KEYWORDS ---
                String keywords = info.getKeywords();
                if (keywords != null && keywords.length() > 0) {
                    Topic keywordType = createTopic(topicMap, "keywords");
                    setData(textTopic, keywordType, defaultLang, keywords.trim());
                }

                // --- PDF TEXT CONTENT ---
                PDFTextStripper stripper = new PDFTextStripper();
                String content = stripper.getText(doc);
                setTextEnrichment(textTopic, topicMap, content);
            } finally {
                // Close the document even when extraction fails
                doc.close();
            }
        }

        // --- HANDLE RTF DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("rtf")) {
            String content = Textbox.RTF2PlainText(inputStream);
            setTextEnrichment(textTopic, topicMap, content);
        }

        // --- HANDLE OFFICE DOCUMENTS ---
        // NOTE(review): "xsl" may be a typo for "xls" — confirm before changing.
        else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx")
                || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xsl")
                || lowerCaseLocator.endsWith("vsd")) {
            String content = MSOfficeBox.getText(inputStream);
            if (content != null) {
                setTextEnrichment(textTopic, topicMap, content);
            }
        }

        // --- HANDLE TXT DOCUMENTS ---
        else {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(textTopic, topicMap, content);
        }
    } catch (Exception e) {
        log(e);
    }
}

From source file:org.wandora.piccolo.utils.crawler.handlers.PDFHandler.java

License:Open Source License

/**
 * Loads the PDF at the given URL, extracts its subject, title, keywords and
 * text content, and hands them to the crawler as a single document.
 * <p>
 * Any exception is caught and its stack trace printed; nothing is added to
 * the crawler in that case.
 *
 * @param crawler the crawler that receives the built document
 * @param in not used by this handler; the PDF is loaded from page
 * @param depth not used by this handler
 * @param page the URL of the PDF; also stored as the document location
 */
public void handle(CrawlerAccess crawler, InputStream in, int depth, URL page) {
    try {
        Document d = new Document();

        final String subject;
        final String title;
        final String keywords;
        final String content;

        PDDocument doc = PDDocument.load(page);
        try {
            // Read everything needed while the document is still open
            PDDocumentInformation info = doc.getDocumentInformation();
            PDFTextStripper stripper = new PDFTextStripper();
            content = stripper.getText(doc);
            subject = info.getSubject();
            title = info.getTitle();
            keywords = info.getKeywords();
        } finally {
            // Close the document even when text extraction fails
            doc.close();
        }

        d.add(LuceneCrawler.subject(subject));
        d.add(LuceneCrawler.title(title));
        d.add(LuceneCrawler.keywords(keywords));
        d.add(LuceneCrawler.content(content));
        d.add(LuceneCrawler.location(page.toString()));

        crawler.addObject(d);
    } catch (Exception e) {
        // IOException was handled identically, so one branch is enough
        e.printStackTrace();
    }
}

From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java

License:Apache License

/**
 * This will print the documents data./*from w  w w .ja  va  2 s .  c o  m*/
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
    } else {
        PDDocument document = null;

        try {
            document = PDDocument.load(args[0]);
            if (document.isEncrypted()) {
                System.err.println("Error: Cannot add metadata to encrypted document.");
                System.exit(1);
            }
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentInformation info = document.getDocumentInformation();

            XMPMetadata metadata = new XMPMetadata();

            XMPSchemaPDF pdfSchema = metadata.addPDFSchema();
            pdfSchema.setKeywords(info.getKeywords());
            pdfSchema.setProducer(info.getProducer());

            XMPSchemaBasic basicSchema = metadata.addBasicSchema();
            basicSchema.setModifyDate(info.getModificationDate());
            basicSchema.setCreateDate(info.getCreationDate());
            basicSchema.setCreatorTool(info.getCreator());
            basicSchema.setMetadataDate(new GregorianCalendar());

            XMPSchemaDublinCore dcSchema = metadata.addDublinCoreSchema();
            dcSchema.setTitle(info.getTitle());
            dcSchema.addCreator("PDFBox");
            dcSchema.setDescription(info.getSubject());

            PDMetadata metadataStream = new PDMetadata(document);
            metadataStream.importXMPMetadata(metadata);
            catalog.setMetadata(metadataStream);

            document.save(args[1]);
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:org.xstudiosys.pdfxmp.MarkBuilder.java

License:Open Source License

/**
 * Builds XMP metadata from the document information of the given document
 * and sets it on the document catalog.
 * <p>
 * Any exception is caught and its stack trace printed.
 *
 * @param document the document whose catalog receives the XMP metadata
 */
public void onComplete(PDDocument document) {
    try {
        final PDDocumentCatalog docCatalog = document.getDocumentCatalog();
        final PDDocumentInformation docInfo = document.getDocumentInformation();

        final XMPMetadata xmp = new XMPMetadata();

        // PDF schema: keywords and producer
        final XMPSchemaPDF pdf = xmp.addPDFSchema();
        pdf.setKeywords(docInfo.getKeywords());
        pdf.setProducer(docInfo.getProducer());

        // Basic schema: dates and creator tool; metadata date is "now"
        final XMPSchemaBasic basic = xmp.addBasicSchema();
        basic.setModifyDate(docInfo.getModificationDate());
        basic.setCreateDate(docInfo.getCreationDate());
        basic.setCreatorTool(docInfo.getCreator());
        basic.setMetadataDate(new GregorianCalendar());

        // Dublin Core schema: title, fixed creator and description
        final XMPSchemaDublinCore dublinCore = xmp.addDublinCoreSchema();
        dublinCore.setTitle(docInfo.getTitle());
        dublinCore.addCreator("PDFBox");
        dublinCore.setDescription(docInfo.getSubject());

        // Wrap the XMP packet in a metadata stream and attach it
        final PDMetadata xmpStream = new PDMetadata(document);
        xmpStream.importXMPMetadata(xmp);
        docCatalog.setMetadata(xmpStream);
    } catch (Exception e) {
        e.printStackTrace();
    }
}