Example usage for org.apache.pdfbox.pdmodel PDDocument close

List of usage examples for org.apache.pdfbox.pdmodel PDDocument close

Introduction

This page collects usage examples for the close() method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

This will close the underlying COSDocument object.

Usage

From source file:org.titans.fyp.webcrawler.PageCollector.java

License:Open Source License

/**
 * Downloads the PDF at the given URL and prints its extracted text to standard output.
 *
 * <p>The scheme of {@code pdfURL} is replaced with {@code https}. The argument must
 * contain {@code "://"}; otherwise the array access below throws
 * {@link ArrayIndexOutOfBoundsException} before any download is attempted.
 *
 * <p>Failures while downloading or parsing are reported on standard error and do not
 * propagate to the caller.
 *
 * @param pdfURL URL of the PDF, including a scheme
 */
private static void pdfToText(String pdfURL) {

    pdfURL = "https://" + pdfURL.split("://")[1];

    PDDocument pddDocument = null;
    try {
        pddDocument = PDDocument.load((new URL(pdfURL)).openStream());
        PDFTextStripper textStripper = new PDFTextStripper();
        String doc = textStripper.getText(pddDocument);
        System.out.println(doc);
    } catch (Exception e) {
        // Still best-effort, but make the failure visible instead of discarding it.
        System.err.println("Failed to extract text from " + pdfURL + ": " + e);
    } finally {
        // Release the document even when text extraction fails.
        if (pddDocument != null) {
            try {
                pddDocument.close();
            } catch (Exception closeFailure) {
                System.err.println("Failed to close PDF " + pdfURL + ": " + closeFailure);
            }
        }
    }
}

From source file:org.tnc.doctrack.behaviours.docTrackBehaviours.java

License:Open Source License

/**
 * Walks every page of a PDF and passes each page's resources to
 * {@code processResources} to look for QR codes.
 *
 * <p>The result of the first page is always kept; the result of a later page replaces
 * it only when that later result is non-null and non-empty, so a page without a QR
 * code no longer discards a code found on an earlier page.
 * NOTE(review): this assumes {@code processResources} returns {@code null} or an empty
 * array when nothing is found - confirm against its implementation.
 *
 * @param PDF stream containing the PDF document; it is not closed by this method
 * @return the decoded results kept as described above, or {@code null} if the document
 *         has no pages or nothing was returned for them
 * @throws IOException if the document does not permit content extraction
 * @throws Exception if loading the document or processing a page fails
 */
private Result[] extractQRfromPDF(InputStream PDF) throws Exception {
    System.out.println("TNC - DocTrack  - extractQRfromPDF starting....");
    //Initialize variable for QR decoding.

    PDDocument document = null;
    String password = "";
    String prefix = null;
    boolean addKey = false;
    Result[] QR = null;
    try {
        //read PDF document
        document = PDDocument.loadNonSeq(PDF, null, password);
        //Check permission to PDF
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            System.out.println(
                    "TNC - DocTrack  Error - extractQRfromPDF - You do not have permission to extract images from PDF.");
            throw new IOException(
                    "TNC - DocTrack  Error - extractQRfromPDF - You do not have permission to extract images from PDF.");
        }
        //Iterate through the PDF pages.
        List<?> pages = document.getDocumentCatalog().getAllPages();
        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();
            // extract all XObjectImages which are part of the page resources
            System.out.println("TNC - DocTrack  - extractQRfromPDF - Try to process image and find QR code");
            Result[] pageResult = processResources(resources, prefix, addKey);
            // Keep what was already found unless this page actually produced something.
            if (QR == null || (pageResult != null && pageResult.length > 0)) {
                QR = pageResult;
            }
        }

    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception ignored) {
                // Deliberate best-effort close: a failure here must not mask the
                // result or an exception raised in the try block.
            }
        }
    }
    // Arrays.toString prints the elements; concatenating the array itself would only
    // print its type and identity hash.
    System.out.println("TNC - DocTrack  - extractQRfromPDF finished. QR code string : "
            + java.util.Arrays.toString(QR));
    return QR;
}

From source file:org.ujmp.pdfbox.ImportMatrixPDF.java

License:Open Source License

/**
 * Extracts the text of a PDF file and wraps it in a matrix via
 * {@code Matrix.Factory.linkToValue}.
 *
 * @param file the PDF file to read
 * @return the matrix produced by {@code Matrix.Factory.linkToValue} for the extracted text
 * @throws IOException if the file cannot be loaded, parsed or closed
 */
public static final Matrix fromFile(File file) throws IOException {
    PDDocument pdd = PDDocument.load(file);
    final String text;
    try {
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdd);
    } finally {
        // Close the document even when text extraction fails.
        pdd.close();
    }
    return Matrix.Factory.linkToValue(text);
}

From source file:org.ujmp.pdfbox.ImportMatrixPDF.java

License:Open Source License

/**
 * Extracts the text of a PDF read from a stream and wraps it in a matrix via
 * {@code Matrix.Factory.linkToValue}.
 *
 * @param inputStream stream containing the PDF; this method does not close it explicitly
 * @return the matrix produced by {@code Matrix.Factory.linkToValue} for the extracted text
 * @throws IOException if the stream cannot be parsed or the document cannot be closed
 */
public static final Matrix fromStream(InputStream inputStream) throws IOException {
    PDDocument pdd = PDDocument.load(inputStream);
    final String text;
    try {
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdd);
    } finally {
        // Close the document even when text extraction fails.
        pdd.close();
    }
    return Matrix.Factory.linkToValue(text);
}

From source file:org.ujmp.pdfbox.PdfUtil.java

License:Open Source License

/**
 * Returns the text content of a PDF file.
 *
 * @param file the PDF file to read
 * @return the text extracted by {@code PDFTextStripper}
 * @throws IOException if the file cannot be loaded, parsed or closed
 */
public static final String getTextFromFile(File file) throws IOException {
    PDDocument pdd = PDDocument.load(file);
    try {
        PDFTextStripper pts = new PDFTextStripper();
        return pts.getText(pdd);
    } finally {
        // Close the document even when text extraction fails.
        pdd.close();
    }
}

From source file:org.ujmp.pdfbox.PdfUtil.java

License:Open Source License

/**
 * Returns the text content of a PDF read from a stream.
 *
 * @param inputStream stream containing the PDF; this method does not close it explicitly
 * @return the text extracted by {@code PDFTextStripper}
 * @throws IOException if the stream cannot be parsed or the document cannot be closed
 */
public static final String getTextFromStream(InputStream inputStream) throws IOException {
    PDDocument pdd = PDDocument.load(inputStream);
    try {
        PDFTextStripper pts = new PDFTextStripper();
        return pts.getText(pdd);
    } finally {
        // Close the document even when text extraction fails.
        pdd.close();
    }
}

From source file:org.vesalainen.ham.pdf.RfaxTest.java

License:Open Source License

/**
 * Loads {@code rfax.pdf} and, unless it is encrypted, writes its extracted text to
 * {@code src/main/resources/rfax.txt}.
 *
 * @throws IOException if the PDF cannot be read or the text file cannot be written
 */
public void test() throws IOException {
    PDDocument document = PDDocument.load(new File("rfax.pdf"));
    try {
        if (!document.isEncrypted()) {
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(document);
            try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) {
                bw.write(text);
            }
        }
    } finally {
        // Close the document even when extraction or writing fails.
        document.close();
    }
}

From source file:org.wandora.application.gui.simple.SimpleTextPane.java

License:Open Source License

/**
 * Loads the content of a file into this text pane.
 *
 * <p>Files larger than {@code MAX_TEXT_SIZE} are rejected with a warning dialog.
 * Otherwise the user is asked whether to store the content as a data URI. If so, the
 * pane text becomes the external form of a {@code DataURL} built from the file. If not,
 * the file is converted to text according to its extension (rtf, pdf, MS Office
 * formats, docx) or read as-is for anything else, and the text is fed to
 * {@code read(Reader, Object)}.
 *
 * <p>Exceptions are printed and passed to {@code wandora.handleError}; they do not
 * propagate to the caller.
 *
 * @param file the file to load; {@code null} is ignored
 */
public void load(File file) {
    if (file != null) {
        if (file.length() > MAX_TEXT_SIZE) {
            WandoraOptionPane.showMessageDialog(wandora, "File size is too big.", "File size is too big",
                    WandoraOptionPane.WARNING_MESSAGE);
        } else {
            try {
                int a = WandoraOptionPane.showConfirmDialog(wandora, "Store the file content as a data URI?",
                        "Make data URI?", WandoraOptionPane.QUESTION_MESSAGE);
                if (a == WandoraOptionPane.YES_OPTION) {
                    DataURL url = new DataURL(file);
                    setText(url.toExternalForm());
                } else {
                    Object desc = getStyledDocument();

                    Reader inputReader = null;
                    String content = "";

                    String filename = file.getPath().toLowerCase();
                    // Without a '.', lastIndexOf yields -1 and the whole path is used.
                    String extension = filename.substring(Math.max(filename.lastIndexOf(".") + 1, 0));

                    // --- handle rtf files ---
                    // NOTE(review): the FileInputStream is not closed here; confirm
                    // whether Textbox.RTF2PlainText closes it.
                    if ("rtf".equals(extension)) {
                        content = Textbox.RTF2PlainText(new FileInputStream(file));
                        inputReader = new StringReader(content);
                    }

                    // --- handle pdf files ---
                    if ("pdf".equals(extension)) {
                        try {
                            PDDocument doc = PDDocument.load(file);
                            try {
                                PDFTextStripper stripper = new PDFTextStripper();
                                content = stripper.getText(doc);
                            } finally {
                                // Close the document even when text extraction fails.
                                doc.close();
                            }
                            inputReader = new StringReader(content);
                        } catch (Exception e) {
                            // On failure inputReader stays null, so the file is read
                            // as-is by the fallback below.
                            System.out.println("No PDF support!");
                        }
                    }

                    // --- handle MS office files ---
                    // NOTE(review): the FileInputStream is not closed here; confirm
                    // whether MSOfficeBox.getText closes it.
                    if ("doc".equals(extension) || "ppt".equals(extension) || "xls".equals(extension)
                            || "vsd".equals(extension) || "odt".equals(extension)) {
                        content = MSOfficeBox.getText(new FileInputStream(file));
                        if (content != null) {
                            inputReader = new StringReader(content);
                        }
                    }

                    if ("docx".equals(extension)) {
                        content = MSOfficeBox.getDocxText(file);
                        if (content != null) {
                            inputReader = new StringReader(content);
                        }
                    }

                    // --- handle everything else ---
                    if (inputReader == null) {
                        inputReader = new FileReader(file);
                    }
                    try {
                        read(inputReader, desc);
                    } finally {
                        // Release the reader (possibly a FileReader) even when read fails.
                        inputReader.close();
                    }
                    setCaretPosition(0);
                }
            } catch (MalformedURLException mfue) {
                mfue.printStackTrace();
                wandora.handleError(mfue);
            } catch (IOException ioe) {
                ioe.printStackTrace();
                wandora.handleError(ioe);
            } catch (Exception e) {
                e.printStackTrace();
                wandora.handleError(e);
            }
        }
    }
}

From source file:org.wandora.application.tools.extractors.email.SimpleEmailExtractor.java

License:Open Source License

/**
 * Extracts the content of one message part and stores it as occurrence data on the
 * given email topic, choosing the occurrence type from the part's content type.
 *
 * <p>Plain text, HTML and XML content is stored from {@code part.getContent()}.
 * MS Word, Excel and PowerPoint content is converted with {@code MSOfficeBox.getText}.
 * PDF content is converted with {@code PDFTextStripper}. Multipart content is processed
 * recursively, one body part at a time. Any other type is logged as unsupported and is
 * stored only when {@code shouldExtractUnknownContentTypeAttachments} is set.
 *
 * <p>Every {@code Exception} and {@code Error} raised while processing is logged and
 * not propagated.
 *
 * @param map        topic map in which occurrence type topics are created
 * @param emailTopic topic that receives the extracted data
 * @param part       message part to extract
 */
public void extractContent(TopicMap map, Topic emailTopic, Part part) {
    try {
        Object content = part.getContent();
        String contentType = part.getContentType();
        String lowerCaseType = contentType.toLowerCase();

        if (lowerCaseType.startsWith("text/plain")) {
            Topic textContentType = createTopic(map, "text-content");
            String stringContent = (content != null ? content.toString() : "");
            setData(emailTopic, textContentType, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("text/html")) {
            Topic htmlTextContentType = createTopic(map, "html-text-content");
            String stringContent = (content != null ? content.toString() : "");
            setData(emailTopic, htmlTextContentType, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("text/xml") || lowerCaseType.startsWith("application/xml")) {
            Topic contentTypeTopic = createTopic(map, "xml-content");
            String stringContent = (content != null ? content.toString() : "");
            setData(emailTopic, contentTypeTopic, defaultLang, stringContent);
        } else if (lowerCaseType.startsWith("application/msword")
                || lowerCaseType.startsWith("application/x-msword")
                || lowerCaseType.startsWith("application/x-ms-word")
                || lowerCaseType.startsWith("application/x-word")) {
            Topic contentTypeTopic = createTopic(map, "ms-word-text-content");
            String stringContent = MSOfficeBox.getText(part.getInputStream());
            setData(emailTopic, contentTypeTopic, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("application/msexcel")
                || lowerCaseType.startsWith("application/x-msexcel")
                || lowerCaseType.startsWith("application/x-ms-excel")
                || lowerCaseType.startsWith("application/x-excel")
                || lowerCaseType.startsWith("application/vnd.ms-excel")) {
            Topic contentTypeTopic = createTopic(map, "ms-excel-text-content");
            String stringContent = MSOfficeBox.getText(part.getInputStream());
            setData(emailTopic, contentTypeTopic, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("application/powerpoint")
                || lowerCaseType.startsWith("application/x-mspowerpoint")
                || lowerCaseType.startsWith("application/x-ms-powerpoint")
                || lowerCaseType.startsWith("application/x-powerpoint")
                || lowerCaseType.startsWith("application/vnd.ms-powerpoint")) {
            Topic contentTypeTopic = createTopic(map, "ms-powerpoint-text-content");
            String stringContent = MSOfficeBox.getText(part.getInputStream());
            setData(emailTopic, contentTypeTopic, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("application/pdf")) {
            Topic contentTypeTopic = createTopic(map, "pdf-text-content");
            String stringContent = "";
            try {
                PDDocument doc = PDDocument.load(part.getInputStream());
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    stringContent = stripper.getText(doc);
                } finally {
                    // Close the document even when text extraction fails.
                    doc.close();
                }
            } catch (Exception e) {
                // On failure the occurrence is still written, with whatever text was
                // obtained so far (the empty string if extraction itself failed).
                System.out.println("No PDF support!");
            }
            setData(emailTopic, contentTypeTopic, defaultLang, stringContent.trim());
        } else if (lowerCaseType.startsWith("multipart")) {
            Multipart multipart = (Multipart) content;
            BodyPart bodypart = null;
            int c = multipart.getCount();
            for (int i = 0; i < c; i++) {
                bodypart = multipart.getBodyPart(i);
                extractContent(map, emailTopic, bodypart);
            }
        } else {
            // Strip content-type parameters (everything from the first ';') for the log.
            if (contentType.indexOf(";") > -1) {
                contentType = contentType.substring(0, contentType.indexOf(";"));
            }
            log("Unsupported attachment type '" + contentType + "' found.");

            if (shouldExtractUnknownContentTypeAttachments) {
                log("Processing anyway...");
                Topic contentTypeTopic = createTopic(map, "unknown-content");
                // A non-String content object raises ClassCastException here, which
                // the catch below logs.
                String unknownContent = (String) content;
                setData(emailTopic, contentTypeTopic, defaultLang, unknownContent);
            }
        }
    } catch (Exception e) {
        log(e);
    } catch (Error e) {
        log(e);
    }
}

From source file:org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java

License:Open Source License

/**
 * Extracts text and metadata from a document and attaches them to the given topic.
 *
 * <p>The handler is chosen from the ending of the lower-cased locator:
 * <ul>
 *   <li>pdf - document information fields (producer, dates, creator, author, subject,
 *       title, keywords) plus the text content;</li>
 *   <li>rtf, MS Office, HTML and plain text - the text content only;</li>
 *   <li>OpenDocument (odt, odp, odg, ods) - the text content plus office metadata;</li>
 *   <li>anything else - the raw bytes, stored as text when a detected MIME type is
 *       textual and as binary otherwise.</li>
 * </ul>
 *
 * <p>Any exception raised while processing is logged and not propagated.
 *
 * @param locator     name or address of the document; selects the handler and supplies
 *                    the name passed to {@code setTextEnrichment}
 * @param inputStream stream with the document content
 * @param topicMap    topic map in which type and keyword topics are created
 * @param topic       topic that receives the extracted data
 */
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) {
    try {
        // Use the last path segment of the locator as the document name.
        String name = locator;
        if (name.indexOf("/") != -1) {
            name = name.substring(name.lastIndexOf("/") + 1);
        } else if (name.indexOf("\\") != -1) {
            name = name.substring(name.lastIndexOf("\\") + 1);
        }
        String lowerCaseLocator = locator.toLowerCase();

        // --- HANDLE PDF ENRICHMENT TEXT ---
        if (lowerCaseLocator.endsWith("pdf")) {
            // NOTE(review): the PDF is loaded from the locator string, not from
            // inputStream - confirm that this is intended.
            PDDocument doc = PDDocument.load(locator);
            String content;
            try {
                PDDocumentInformation info = doc.getDocumentInformation();
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);

                // --- PDF PRODUCER ---
                String producer = info.getProducer();
                if (producer != null && producer.length() > 0) {
                    Topic producerType = createTopic(topicMap, "pdf-producer");
                    setData(topic, producerType, defaultLang, producer.trim());
                }

                // --- PDF MODIFICATION DATE ---
                Calendar mCal = info.getModificationDate();
                if (mCal != null) {
                    String mdate = dateFormatter.format(mCal.getTime());
                    if (mdate != null && mdate.length() > 0) {
                        Topic modificationDateType = createTopic(topicMap, "pdf-modification-date");
                        setData(topic, modificationDateType, defaultLang, mdate.trim());
                    }
                }

                // --- PDF CREATOR ---
                String creator = info.getCreator();
                if (creator != null && creator.length() > 0) {
                    Topic creatorType = createTopic(topicMap, "pdf-creator");
                    setData(topic, creatorType, defaultLang, creator.trim());
                }

                // --- PDF CREATION DATE ---
                Calendar cCal = info.getCreationDate();
                if (cCal != null) {
                    String cdate = dateFormatter.format(cCal.getTime());
                    if (cdate != null && cdate.length() > 0) {
                        Topic creationDateType = createTopic(topicMap, "pdf-creation-date");
                        setData(topic, creationDateType, defaultLang, cdate.trim());
                    }
                }

                // --- PDF AUTHOR ---
                String author = info.getAuthor();
                if (author != null && author.length() > 0) {
                    Topic authorType = createTopic(topicMap, "pdf-author");
                    setData(topic, authorType, defaultLang, author.trim());
                }

                // --- PDF SUBJECT ---
                String subject = info.getSubject();
                if (subject != null && subject.length() > 0) {
                    Topic subjectType = createTopic(topicMap, "pdf-subject");
                    setData(topic, subjectType, defaultLang, subject.trim());
                }

                // --- PDF TITLE ---
                // Read the title field; the subject is already stored above.
                String title = info.getTitle();
                if (title != null && title.length() > 0) {
                    Topic titleType = createTopic(topicMap, "pdf-title");
                    setData(topic, titleType, defaultLang, title.trim());
                }

                // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) ---
                String keywords = info.getKeywords();
                if (keywords != null && keywords.length() > 0) {
                    Topic keywordType = createTopic(topicMap, "pdf-keyword");
                    String[] keywordArray = keywords.split(";");
                    String keyword = null;
                    for (int i = 0; i < keywordArray.length; i++) {
                        keyword = Textbox.trimExtraSpaces(keywordArray[i]);
                        if (keyword != null && keyword.length() > 0) {
                            Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                            createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
                        }
                    }
                }

                // --- PDF TEXT CONTENT ---
                PDFTextStripper stripper = new PDFTextStripper();
                content = stripper.getText(doc);
            } finally {
                // Close the document even when metadata or text extraction fails.
                doc.close();
            }
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE RTF DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("rtf")) {
            String content = Textbox.RTF2PlainText(inputStream);
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE OFFICE DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx")
                || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls")
                || lowerCaseLocator.endsWith("vsd")) {
            String content = MSOfficeBox.getText(inputStream);
            if (content != null) {
                setTextEnrichment(topic, topicMap, content, name);
            }
        }

        // --- HANDLE OPENDOCUMENT FILES ---
        else if (lowerCaseLocator.endsWith("odt") || lowerCaseLocator.endsWith("odp")
                || lowerCaseLocator.endsWith("odg") || lowerCaseLocator.endsWith("ods")) {

            org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document
                    .loadDocument(inputStream);
            String content = OpenOfficeBox.getText(oodocument);
            setTextEnrichment(topic, topicMap, content, name);

            org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata();

            // --- OO KEYWORDS ---
            List<String> keywords = meta.getKeywords();
            if (keywords != null && !keywords.isEmpty()) {
                Topic keywordType = createTopic(topicMap, "oo-keyword");
                for (String keyword : keywords) {
                    keyword = keyword.trim();
                    if (keyword != null && keyword.length() > 0) {
                        Topic keywordTopic = createTopic(topicMap, keyword, keywordType);
                        createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic });
                    }
                }
            }

            // --- OO TITLE ---
            String title = meta.getTitle();
            if (title != null && title.length() > 0) {
                Topic titleType = createTopic(topicMap, "oo-title");
                setData(topic, titleType, defaultLang, title.trim());
            }

            // --- OO SUBJECT ---
            String subject = meta.getSubject();
            if (subject != null && subject.length() > 0) {
                Topic subjectType = createTopic(topicMap, "oo-subject");
                setData(topic, subjectType, defaultLang, subject.trim());
            }

            // --- OO CREATOR ---
            String author = meta.getCreator();
            if (author != null && author.length() > 0) {
                Topic authorType = createTopic(topicMap, "oo-author");
                setData(topic, authorType, defaultLang, author.trim());
            }

            // --- OO CREATION DATE ---
            Calendar cCal = meta.getCreationDate();
            if (cCal != null) {
                DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT);
                String cdate = dateFormatter.format(cCal.getTime());
                if (cdate != null && cdate.length() > 0) {
                    Topic creationDateType = createTopic(topicMap, "oo-creation-date");
                    setData(topic, creationDateType, defaultLang, cdate.trim());
                }
            }

            // --- OO DESCRIPTION ---
            String description = meta.getDescription();
            if (description != null && description.length() > 0) {
                Topic descriptionType = createTopic(topicMap, "oo-description");
                setData(topic, descriptionType, defaultLang, description.trim());
            }

            // --- OO GENERATOR ---
            String generator = meta.getGenerator();
            if (generator != null && generator.length() > 0) {
                Topic generatorType = createTopic(topicMap, "oo-generator");
                setData(topic, generatorType, defaultLang, generator.trim());
            }
        }

        // --- HANDLE HTML DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("html") || lowerCaseLocator.endsWith("htm")) {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE PLAIN TEXT DOCUMENTS ---
        else if (lowerCaseLocator.endsWith("txt") || lowerCaseLocator.endsWith("text")) {
            String content = IObox.loadFile(new InputStreamReader(inputStream));
            setTextEnrichment(topic, topicMap, content, name);
        }

        // --- HANDLE ANY OTHER DOCUMENTS ---
        else {
            byte[] content = IObox.loadBFile(inputStream);
            String mimeType = "";
            MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector");
            // Collect candidate MIME types from the locator and from the bytes.
            Collection<MimeType> mimeTypes = new ArrayList<MimeType>();
            if (locator != null) {
                if (MimeTypes.getMimeType(locator) != null) {
                    mimeTypes.add(new MimeType(MimeTypes.getMimeType(locator)));
                }
                mimeTypes.addAll(MimeUtil.getMimeTypes(locator));
            }
            mimeTypes.addAll(MimeUtil.getMimeTypes(content));
            // Treat the content as text as soon as any candidate type is textual.
            boolean isText = false;
            for (MimeType mime : mimeTypes) {
                if (MimeUtil.isTextMimeType(mime)) {
                    isText = true;
                    break;
                }
            }
            if (isText) {
                setTextEnrichment(topic, topicMap, new String(content), name);
            } else {
                // Binary content is labelled with the first candidate type, if any.
                if (!mimeTypes.isEmpty()) {
                    MimeType mime = mimeTypes.iterator().next();
                    mimeType = mime.toString();
                }
                setBinaryEnrichment(topic, topicMap, content, mimeType);
            }
        }
    } catch (Exception e) {
        log(e);
    }
}