List of usage examples for org.apache.pdfbox.pdmodel.PDDocument.close()
@Override public void close() throws IOException
From source file:org.titans.fyp.webcrawler.PageCollector.java
License:Open Source License
private static void pdfToText(String pdfURL) { pdfURL = "https://" + pdfURL.split("://")[1]; // System.out.println(pdfURL); try {//from ww w .java2s.c om PDDocument pddDocument = PDDocument.load((new URL(pdfURL)).openStream()); PDFTextStripper textStripper = new PDFTextStripper(); String doc = textStripper.getText(pddDocument); pddDocument.close(); System.out.println(doc); } catch (Exception e) { e.getMessage(); } }
From source file:org.tnc.doctrack.behaviours.docTrackBehaviours.java
License:Open Source License
private Result[] extractQRfromPDF(InputStream PDF) throws Exception { System.out.println("TNC - DocTrack - extractQRfromPDF starting...."); //Initialize variable for QR decoding. PDDocument document = null; String password = ""; String prefix = null;//from ww w . java 2 s.c o m boolean addKey = false; Result[] QR = null; try { //read PDF document document = PDDocument.loadNonSeq(PDF, null, password); //Check permission to PDF AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { System.out.println( "TNC - DocTrack Error - extractQRfromPDF - You do not have permission to extract images from PDF."); throw new IOException( "TNC - DocTrack Error - extractQRfromPDF - You do not have permission to extract images from PDF."); } //Iterate throw the PDF pages. List<?> pages = document.getDocumentCatalog().getAllPages(); Iterator<?> iter = pages.iterator(); while (iter.hasNext()) { PDPage page = (PDPage) iter.next(); PDResources resources = page.getResources(); // extract all XObjectImages which are part of the page resources System.out.println("TNC - DocTrack - extractQRfromPDF - Try to process image and find QR code"); QR = processResources(resources, prefix, addKey); } } finally { if ((document != null)) { try { document.close(); } catch (Exception e) { } } } System.out.println("TNC - DocTrack - extractQRfromPDF finished. QR code string : " + QR); return QR; }
From source file:org.ujmp.pdfbox.ImportMatrixPDF.java
License:Open Source License
/**
 * Extracts the text of a PDF file and passes it to {@code Matrix.Factory.linkToValue}.
 *
 * @param file the PDF file to read
 * @return the matrix produced by {@code Matrix.Factory.linkToValue} for the extracted text
 * @throws IOException if the file cannot be loaded, stripped, or closed
 */
public static final Matrix fromFile(File file) throws IOException {
    final String text;
    // try-with-resources releases the document even when text extraction fails.
    try (PDDocument pdd = PDDocument.load(file)) {
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdd);
    }
    return Matrix.Factory.linkToValue(text);
}
From source file:org.ujmp.pdfbox.ImportMatrixPDF.java
License:Open Source License
/**
 * Extracts the text of a PDF read from a stream and passes it to
 * {@code Matrix.Factory.linkToValue}.
 *
 * <p>The stream itself is not closed here; only the loaded document is.
 *
 * @param inputStream stream containing the PDF document
 * @return the matrix produced by {@code Matrix.Factory.linkToValue} for the extracted text
 * @throws IOException if the document cannot be loaded, stripped, or closed
 */
public static final Matrix fromStream(InputStream inputStream) throws IOException {
    final String text;
    // try-with-resources releases the document even when text extraction fails.
    try (PDDocument pdd = PDDocument.load(inputStream)) {
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdd);
    }
    return Matrix.Factory.linkToValue(text);
}
From source file:org.ujmp.pdfbox.PdfUtil.java
License:Open Source License
public static final String getTextFromFile(File file) throws IOException { PDDocument pdd = PDDocument.load(file); PDFTextStripper pts = new PDFTextStripper(); String text = pts.getText(pdd); pdd.close(); return text;//w ww .ja v a 2s .c om }
From source file:org.ujmp.pdfbox.PdfUtil.java
License:Open Source License
/**
 * Returns the text content of a PDF read from a stream, as extracted by
 * {@code PDFTextStripper}.
 *
 * <p>The stream itself is not closed here; only the loaded document is.
 *
 * @param inputStream stream containing the PDF document
 * @return the extracted text
 * @throws IOException if the document cannot be loaded, stripped, or closed
 */
public static final String getTextFromStream(InputStream inputStream) throws IOException {
    // try-with-resources releases the document even when text extraction fails.
    try (PDDocument pdd = PDDocument.load(inputStream)) {
        PDFTextStripper pts = new PDFTextStripper();
        return pts.getText(pdd);
    }
}
From source file:org.vesalainen.ham.pdf.RfaxTest.java
License:Open Source License
public void test() throws IOException { PDDocument document = PDDocument.load(new File("rfax.pdf")); if (!document.isEncrypted()) { PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(document); try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) { bw.write(text);//from w w w . j av a 2s .c om } } document.close(); }
From source file:org.wandora.application.gui.simple.SimpleTextPane.java
License:Open Source License
public void load(File file) { if (file != null) { if (file.length() > MAX_TEXT_SIZE) { WandoraOptionPane.showMessageDialog(wandora, "File size is too big.", "File size is too big", WandoraOptionPane.WARNING_MESSAGE); } else {/*ww w . j a v a 2s .c om*/ try { int a = WandoraOptionPane.showConfirmDialog(wandora, "Store the file content as a data URI?", "Make data URI?", WandoraOptionPane.QUESTION_MESSAGE); if (a == WandoraOptionPane.YES_OPTION) { DataURL url = new DataURL(file); setText(url.toExternalForm()); } else { Object desc = getStyledDocument(); Reader inputReader = null; String content = ""; String filename = file.getPath().toLowerCase(); String extension = filename.substring(Math.max(filename.lastIndexOf(".") + 1, 0)); // --- handle rtf files --- if ("rtf".equals(extension)) { content = Textbox.RTF2PlainText(new FileInputStream(file)); inputReader = new StringReader(content); } // --- handle pdf files --- if ("pdf".equals(extension)) { try { PDDocument doc = PDDocument.load(file); PDFTextStripper stripper = new PDFTextStripper(); content = stripper.getText(doc); doc.close(); inputReader = new StringReader(content); } catch (Exception e) { System.out.println("No PDF support!"); } } // --- handle MS office files --- if ("doc".equals(extension) || "ppt".equals(extension) || "xls".equals(extension) || "vsd".equals(extension) || "odt".equals(extension)) { content = MSOfficeBox.getText(new FileInputStream(file)); if (content != null) { inputReader = new StringReader(content); } } if ("docx".equals(extension)) { content = MSOfficeBox.getDocxText(file); if (content != null) { inputReader = new StringReader(content); } } // --- handle everything else --- if (inputReader == null) { inputReader = new FileReader(file); } read(inputReader, desc); inputReader.close(); setCaretPosition(0); } } catch (MalformedURLException mfue) { mfue.printStackTrace(); wandora.handleError(mfue); } catch (IOException ioe) { ioe.printStackTrace(); wandora.handleError(ioe); } catch 
(Exception e) { e.printStackTrace(); wandora.handleError(e); } } } }
From source file:org.wandora.application.tools.extractors.email.SimpleEmailExtractor.java
License:Open Source License
/**
 * Stores the content of one mail part as data on the given email topic, choosing the data type
 * topic from the part's content type.
 *
 * <p>Plain text, HTML and XML parts are stored from {@code part.getContent()}; Word, Excel and
 * PowerPoint parts are converted with {@code MSOfficeBox.getText}; PDF parts are converted
 * with {@code PDFTextStripper}; multipart content is processed recursively, one body part at
 * a time. Any other type is logged as unsupported and stored only when
 * {@code shouldExtractUnknownContentTypeAttachments} is set and the content is a string.
 * Exceptions and errors are logged, not propagated.
 *
 * @param map topic map in which data type topics are created
 * @param emailTopic topic that receives the extracted content
 * @param part mail part to extract
 */
public void extractContent(TopicMap map, Topic emailTopic, Part part) {
    try {
        Object content = part.getContent();
        String contentType = part.getContentType();
        String lowerCaseType = contentType.toLowerCase();

        if (lowerCaseType.startsWith("text/plain")) {
            Topic textContentType = createTopic(map, "text-content");
            String stringContent = (content != null ? content.toString() : "");
            setData(emailTopic, textContentType, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("text/html")) {
            Topic htmlTextContentType = createTopic(map, "html-text-content");
            String stringContent = (content != null ? content.toString() : "");
            setData(emailTopic, htmlTextContentType, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("text/xml") || lowerCaseType.startsWith("application/xml")) {
            Topic contentTypeTopic = createTopic(map, "xml-content");
            String stringContent = (content != null ? content.toString() : "");
            setData(emailTopic, contentTypeTopic, defaultLang, stringContent);
        } else if (lowerCaseType.startsWith("application/msword")
                || lowerCaseType.startsWith("application/x-msword")
                || lowerCaseType.startsWith("application/x-ms-word")
                || lowerCaseType.startsWith("application/x-word")) {
            Topic contentTypeTopic = createTopic(map, "ms-word-text-content");
            String stringContent = MSOfficeBox.getText(part.getInputStream());
            setData(emailTopic, contentTypeTopic, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("application/msexcel")
                || lowerCaseType.startsWith("application/x-msexcel")
                || lowerCaseType.startsWith("application/x-ms-excel")
                || lowerCaseType.startsWith("application/x-excel")
                || lowerCaseType.startsWith("application/vnd.ms-excel")) {
            Topic contentTypeTopic = createTopic(map, "ms-excel-text-content");
            String stringContent = MSOfficeBox.getText(part.getInputStream());
            setData(emailTopic, contentTypeTopic, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("application/powerpoint")
                || lowerCaseType.startsWith("application/x-mspowerpoint")
                || lowerCaseType.startsWith("application/x-ms-powerpoint")
                || lowerCaseType.startsWith("application/x-powerpoint")
                || lowerCaseType.startsWith("application/vnd.ms-powerpoint")) {
            Topic contentTypeTopic = createTopic(map, "ms-powerpoint-text-content");
            String stringContent = MSOfficeBox.getText(part.getInputStream());
            setData(emailTopic, contentTypeTopic, defaultLang, Textbox.trimExtraSpaces(stringContent));
        } else if (lowerCaseType.startsWith("application/pdf")) {
            Topic contentTypeTopic = createTopic(map, "pdf-text-content");
            String stringContent = "";
            // try-with-resources releases the document and the part's stream even on failure.
            try (java.io.InputStream in = part.getInputStream();
                    PDDocument doc = PDDocument.load(in)) {
                PDFTextStripper stripper = new PDFTextStripper();
                stringContent = stripper.getText(doc);
            } catch (Exception e) {
                // Best effort: an unreadable PDF is stored as empty text, but keep the cause.
                System.out.println("No PDF support!");
                log(e);
            }
            setData(emailTopic, contentTypeTopic, defaultLang, stringContent.trim());
        } else if (lowerCaseType.startsWith("multipart")) {
            Multipart multipart = (Multipart) content;
            int c = multipart.getCount();
            for (int i = 0; i < c; i++) {
                BodyPart bodypart = multipart.getBodyPart(i);
                extractContent(map, emailTopic, bodypart);
            }
        } else {
            // Drop content type parameters before reporting the type.
            if (contentType.indexOf(";") > -1) {
                contentType = contentType.substring(0, contentType.indexOf(";"));
            }
            log("Unsupported attachment type '" + contentType + "' found.");
            if (shouldExtractUnknownContentTypeAttachments) {
                log("Processing anyway...");
                Topic contentTypeTopic = createTopic(map, "unknown-content");
                // Only string content can be stored; anything else would fail the cast.
                if (content == null || content instanceof String) {
                    String unknownContent = (String) content;
                    setData(emailTopic, contentTypeTopic, defaultLang, unknownContent);
                } else {
                    log("Skipping attachment content of non-text Java type '"
                            + content.getClass().getName() + "'.");
                }
            }
        }
    } catch (Exception e) {
        log(e);
    } catch (Error e) {
        log(e);
    }
}
From source file:org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) { try {//from w ww .j a v a2 s.c om String name = locator; if (name.indexOf("/") != -1) { name = name.substring(name.lastIndexOf("/") + 1); } else if (name.indexOf("\\") != -1) { name = name.substring(name.lastIndexOf("\\") + 1); } String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(locator); PDDocumentInformation info = doc.getDocumentInformation(); DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); // --- PDF PRODUCER --- String producer = info.getProducer(); if (producer != null && producer.length() > 0) { Topic producerType = createTopic(topicMap, "pdf-producer"); setData(topic, producerType, defaultLang, producer.trim()); } // --- PDF MODIFICATION DATE --- Calendar mCal = info.getModificationDate(); if (mCal != null) { String mdate = dateFormatter.format(mCal.getTime()); if (mdate != null && mdate.length() > 0) { Topic modificationDateType = createTopic(topicMap, "pdf-modification-date"); setData(topic, modificationDateType, defaultLang, mdate.trim()); } } // --- PDF CREATOR --- String creator = info.getCreator(); if (creator != null && creator.length() > 0) { Topic creatorType = createTopic(topicMap, "pdf-creator"); setData(topic, creatorType, defaultLang, creator.trim()); } // --- PDF CREATION DATE --- Calendar cCal = info.getCreationDate(); if (cCal != null) { String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "pdf-creation-date"); setData(topic, creationDateType, defaultLang, cdate.trim()); } } // --- PDF AUTHOR --- String author = info.getAuthor(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "pdf-author"); setData(topic, authorType, defaultLang, author.trim()); } // --- PDF SUBJECT --- 
String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "pdf-subject"); setData(topic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getSubject(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "pdf-title"); setData(topic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "pdf-keyword"); String[] keywordArray = keywords.split(";"); String keyword = null; for (int i = 0; i < keywordArray.length; i++) { keyword = Textbox.trimExtraSpaces(keywordArray[i]); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic }); } } } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); doc.close(); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(topic, topicMap, content, name); } } else if (lowerCaseLocator.endsWith("odt") || lowerCaseLocator.endsWith("odp") || lowerCaseLocator.endsWith("odg") || lowerCaseLocator.endsWith("ods")) { org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document .loadDocument(inputStream); String content = OpenOfficeBox.getText(oodocument); 
setTextEnrichment(topic, topicMap, content, name); org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata(); // --- OO KEYWORDS --- List<String> keywords = meta.getKeywords(); if (keywords != null && !keywords.isEmpty()) { Topic keywordType = createTopic(topicMap, "oo-keyword"); for (String keyword : keywords) { keyword = keyword.trim(); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic }); } } } // --- OO TITLE --- String title = meta.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "oo-title"); setData(topic, titleType, defaultLang, title.trim()); } // --- OO SUBJECT --- String subject = meta.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "oo-subject"); setData(topic, subjectType, defaultLang, subject.trim()); } // --- OO CREATOR --- String author = meta.getCreator(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "oo-author"); setData(topic, authorType, defaultLang, author.trim()); } // --- OO CREATION DATE --- Calendar cCal = meta.getCreationDate(); if (cCal != null) { DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "oo-creation-date"); setData(topic, creationDateType, defaultLang, cdate.trim()); } } // --- OO DESCRIPTION --- String description = meta.getDescription(); if (description != null && description.length() > 0) { Topic descriptionType = createTopic(topicMap, "oo-description"); setData(topic, descriptionType, defaultLang, description.trim()); } // --- OO GENERATOR --- String generator = meta.getGenerator(); if (generator != null && generator.length() > 0) { Topic generatorType = createTopic(topicMap, 
"oo-generator"); setData(topic, generatorType, defaultLang, generator.trim()); } } else if (lowerCaseLocator.endsWith("html") || lowerCaseLocator.endsWith("htm")) { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(topic, topicMap, content, name); } else if (lowerCaseLocator.endsWith("txt") || lowerCaseLocator.endsWith("text")) { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE ANY OTHER DOCUMENTS --- else { byte[] content = IObox.loadBFile(inputStream); String mimeType = ""; MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector"); Collection<MimeType> mimeTypes = new ArrayList(); if (locator != null) { if (MimeTypes.getMimeType(locator) != null) { mimeTypes.add(new MimeType(MimeTypes.getMimeType(locator))); } mimeTypes.addAll(MimeUtil.getMimeTypes(locator)); } mimeTypes.addAll(MimeUtil.getMimeTypes(content)); boolean isText = false; for (MimeType mime : mimeTypes) { if (MimeUtil.isTextMimeType(mime)) { isText = true; break; } } if (isText) { setTextEnrichment(topic, topicMap, new String(content), name); } else { if (!mimeTypes.isEmpty()) { MimeType mime = mimeTypes.iterator().next(); mimeType = mime.toString(); } setBinaryEnrichment(topic, topicMap, content, mimeType); } } } catch (Exception e) { log(e); } }