List of usage examples for org.apache.pdfbox.pdmodel PDDocumentInformation getKeywords
public String getKeywords()
From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from w w w . jav a 2s . c om private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } else { stripper.resetEngine(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. addField(document, "contents", contents); addField(document, "stemmedcontents", contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addField(document, "Author", info.getAuthor()); try { addField(document, "CreationDate", info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Creator", info.getCreator()); addField(document, "Keywords", info.getKeywords()); try { addField(document, "ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Producer", info.getProducer()); addField(document, "Subject", info.getSubject()); addField(document, "Title", info.getTitle()); addField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addField(document, "summary", summary); addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages())); } catch (CryptographyException e) { throw new IOException("Error decrypting document(" + documentLocation + "): " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. throw new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed."); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.mitre.xtext.converters.PDFConverter.java
License:Apache License
/** Implementation is informed by PDFBox authors. *///from www.j ava 2 s . c o m @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes with * the lucene project. This class will populate the following fields. * <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the Lucene * document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Creator</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Keywords</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>ModificationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Producer</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Subject</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Trapped</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Encrypted</td> <td>From PDF * meta-data if available</td> </tr> </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on /** * * Exception in thread "main" java.lang.NoClassDefFoundError: * org/bouncycastle/jce/provider/BouncyCastleProvider at * org.apache.pdfbox.pdmodel.PDDocument.openProtection(PDDocument.java:1090) * at * org.apache.pdfbox.pdmodel.PDDocument.decrypt(PDDocument.java:594) * * CRYPTO stuff -- load BouncyCastle crypto JAR files. try { * pdfDocument.decrypt(""); } catch (CryptographyException e) { * throw new IOException("Error decrypting document(" + pdf_file * + "): " + e); } catch (InvalidPasswordException e) { //they * didn't suppply a password and the default of "" was wrong. * throw new IOException( "Error: The document(" + pdf_file + ") * is encrypted "); } finally { if (pdfDocument != null) { * pdfDocument.close();} } */ textdoc.addProperty("encrypted", "YES"); } else { //create a writer where to append the text content. StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setPayload(writer.getBuffer().toString()); } return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java
License:Apache License
public boolean check() throws Exception { try (PDDocument document = PDDocument.load(in)) { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDPageable pageable = new PDPageable(document); PageFormat firstPage = pageable.getPageFormat(0); encrypted = document.isEncrypted(); pageCount = document.getNumberOfPages(); orientation = ORIENTATION_STRINGS[firstPage.getOrientation()]; version = String.valueOf(document.getDocument().getVersion()); String catalogVersion = catalog.getVersion(); if (catalogVersion != null && !catalogVersion.isEmpty()) { // According to specs version saved here should be determining instead // the version in header. It is barely used, though. version = catalogVersion;/*from w w w . ja va2s .c om*/ } if (!encrypted) { PDDocumentInformation metadata = document.getDocumentInformation(); author = metadata.getAuthor(); creationDate = metadata.getCreationDate(); creator = metadata.getCreator(); keywords = metadata.getKeywords(); modificationDate = metadata.getModificationDate(); producer = metadata.getProducer(); subject = metadata.getSubject(); title = metadata.getTitle(); } // extract all attached files from all pages int pageNumber = 0; for (Object page : catalog.getAllPages()) { pageNumber += 1; PdfPageMetadata pageMetadata = new PdfPageMetadata(); pageMetadata.setPageNumber(pageNumber); for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata(); PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile(); attachmentMetadata.setSubject(fann.getSubject()); attachmentMetadata.setName(fileSpec.getFilename()); attachmentMetadata.setCreationDate(embeddedFile.getCreationDate()); attachmentMetadata.setModificationDate(embeddedFile.getModDate()); attachmentMetadata.setMimeType(embeddedFile.getSubtype()); attachmentMetadata.setData(embeddedFile.getByteArray()); pageMetadata.addAttachment(attachmentMetadata); } } pages.add(pageMetadata); } return true; } }
From source file:org.mycore.media.MCRMediaPDFParser.java
License:Open Source License
/** * Parse file and store metadata in related Object. * //w ww .ja va2 s . co m * @return MCRMediaObject * can be held any MCRMediaObject * @see MCRMediaObject#clone() */ @SuppressWarnings("unchecked") public synchronized MCRMediaObject parse(File file) throws Exception { if (!file.exists()) throw new IOException("File \"" + file.getName() + "\" doesn't exists!"); MCRPDFObject media = new MCRPDFObject(); LOGGER.info("parse " + file.getName() + "..."); PDDocument pdf = PDDocument.load(file); try { media.fileName = file.getName(); media.fileSize = file.length(); media.folderName = (file.getAbsolutePath()).replace(file.getName(), ""); PDPageTree pages = pdf.getDocumentCatalog().getPages(); media.numPages = pdf.getNumberOfPages(); PDPage page = (PDPage) pages.get(0); PDRectangle rect = page.getMediaBox(); media.width = Math.round(rect.getWidth()); media.height = Math.round(rect.getHeight()); PDDocumentInformation info = pdf.getDocumentInformation(); if (info != null) { media.tags = new MCRMediaTagObject(); media.tags.author = info.getAuthor(); media.tags.creator = info.getCreator(); media.tags.producer = info.getProducer(); media.tags.title = info.getTitle(); media.tags.subject = info.getSubject(); media.tags.keywords = info.getKeywords(); } } catch (Exception e) { LOGGER.error(e.getMessage()); throw new Exception(e.getMessage()); } finally { pdf.close(); } return media; }
From source file:org.nuxeo.pdf.PDFInfo.java
License:Open Source License
/** * After building the object with the correct constructor, and after * possibly having set some parsing property (<code>setParseWithXMP()</code> * for example), this method will extract the information from the PDF. * <p>//from ww w. j av a2 s . c om * After extraction, caller get the info: Either all of them ( * <code>toHashMap()</code> or <code>toString()</code>) or individual info * (see all getters) * * @throws ClientException * * @since 5.9.5 */ public void run() throws ClientException { // In case the caller calls several time the run() method if (!alreadyParsed) { fileName = pdfBlob.getFilename(); // Getting the file size os ok only if the blob is already backed by // a // File. If it is pure Stream, we give up File pdfFile = BlobHelper.getFileFromBlob(pdfBlob); if (pdfFile == null) { fileSize = -1; } else { fileSize = pdfFile.length(); } try { pdfDoc = PDDocument.load(pdfBlob.getStream()); isEncrypted = pdfDoc.isEncrypted(); if (isEncrypted) { pdfDoc.openProtection(new StandardDecryptionMaterial(password)); } numberOfPages = pdfDoc.getNumberOfPages(); PDDocumentCatalog docCatalog = pdfDoc.getDocumentCatalog(); pageLayout = checkNotNull(docCatalog.getPageLayout()); pdfVersion = "" + pdfDoc.getDocument().getVersion(); PDDocumentInformation docInfo = pdfDoc.getDocumentInformation(); author = checkNotNull(docInfo.getAuthor()); contentCreator = checkNotNull(docInfo.getCreator()); keywords = checkNotNull(docInfo.getKeywords()); creationDate = docInfo.getCreationDate(); modificationDate = docInfo.getModificationDate(); producer = checkNotNull(docInfo.getProducer()); subject = checkNotNull(docInfo.getSubject()); title = checkNotNull(docInfo.getTitle()); // Getting dimension is a bit tricky mediaBoxWidthInPoints = -1; mediaBoxHeightInPoints = -1; cropBoxWidthInPoints = -1; cropBoxHeightInPoints = -1; List<PDPage> allPages = docCatalog.getAllPages(); boolean gotMediaBox = false; boolean gotCropBox = false; for (PDPage page : allPages) { if (page != null) { PDRectangle r = page.findMediaBox(); if (r != null) { mediaBoxWidthInPoints = r.getWidth(); mediaBoxHeightInPoints = r.getHeight(); gotMediaBox = true; } r = page.findCropBox(); if (r != null) { cropBoxWidthInPoints = r.getWidth(); cropBoxHeightInPoints = r.getHeight(); gotCropBox = true; } } if (gotMediaBox && gotCropBox) { break; } } if (doXMP) { xmp = null; PDMetadata metadata = docCatalog.getMetadata(); if (metadata != null) { xmp = ""; InputStream xmlInputStream = metadata.createInputStream(); InputStreamReader isr = new InputStreamReader(xmlInputStream); BufferedReader reader = new BufferedReader(isr); String line; do { line = reader.readLine(); if (line != null) { xmp += line + "\n"; } } while (line != null); reader.close(); } } } catch (IOException | BadSecurityHandlerException | CryptographyException e) { throw new ClientException(/* * "Cannot get PDF info: " + * e.getMessage(), */e); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { // Ignore } pdfDoc = null; } alreadyParsed = true; } } }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract metadata from the PDF-document. *///from ww w. j av a 2 s.c om protected void extractMetaData(IParserDocument parserDoc, PDDocument pddDoc) throws IOException { // extract metadata final PDDocumentInformation metadata = pddDoc.getDocumentInformation(); if (metadata == null) return; // document title final String title = metadata.getTitle(); if (title != null && title.length() > 0) parserDoc.setTitle(title); // document author(s) final String author = metadata.getAuthor(); if (author != null && author.length() > 0) parserDoc.setAuthor(author); ; // subject final String summary = metadata.getSubject(); if (summary != null && summary.length() > 0) parserDoc.setSummary(summary); // keywords final String keywords = metadata.getKeywords(); if (keywords != null && keywords.length() > 0) { String[] keywordArray = keywords.split("[,;\\s]"); if (keywordArray != null && keywordArray.length > 0) { parserDoc.setKeywords(Arrays.asList(keywordArray)); } } // last modification date final Calendar lastMod = metadata.getModificationDate(); if (lastMod != null) { parserDoc.setLastChanged(lastMod.getTime()); } }
From source file:org.pdfsam.pdf.DefaultPDFBoxLoader.java
License:Open Source License
public void accept(PDDocument document, PdfDocumentDescriptor descriptor) { descriptor.pages(document.getNumberOfPages()); descriptor.setVersion(getVersion(Float.toString(document.getVersion()))); PDDocumentInformation info = document.getDocumentInformation(); descriptor.putInformation(PdfMetadataKey.TITLE.getKey(), info.getTitle()); descriptor.putInformation(PdfMetadataKey.AUTHOR.getKey(), info.getAuthor()); descriptor.putInformation(PdfMetadataKey.CREATOR.getKey(), info.getCreator()); descriptor.putInformation(PdfMetadataKey.SUBJECT.getKey(), info.getSubject()); descriptor.putInformation(PdfMetadataKey.KEYWORDS.getKey(), info.getKeywords()); descriptor.putInformation("Producer", info.getProducer()); Optional.ofNullable(info.getCreationDate()).map(FORMATTER::format) .ifPresent(c -> descriptor.putInformation("FormattedCreationDate", c)); }
From source file:org.wandora.application.tools.extractors.files.SimpleDocumentExtractor.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic topic) { try {//from w ww. j ava 2s . c om String name = locator; if (name.indexOf("/") != -1) { name = name.substring(name.lastIndexOf("/") + 1); } else if (name.indexOf("\\") != -1) { name = name.substring(name.lastIndexOf("\\") + 1); } String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(locator); PDDocumentInformation info = doc.getDocumentInformation(); DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); // --- PDF PRODUCER --- String producer = info.getProducer(); if (producer != null && producer.length() > 0) { Topic producerType = createTopic(topicMap, "pdf-producer"); setData(topic, producerType, defaultLang, producer.trim()); } // --- PDF MODIFICATION DATE --- Calendar mCal = info.getModificationDate(); if (mCal != null) { String mdate = dateFormatter.format(mCal.getTime()); if (mdate != null && mdate.length() > 0) { Topic modificationDateType = createTopic(topicMap, "pdf-modification-date"); setData(topic, modificationDateType, defaultLang, mdate.trim()); } } // --- PDF CREATOR --- String creator = info.getCreator(); if (creator != null && creator.length() > 0) { Topic creatorType = createTopic(topicMap, "pdf-creator"); setData(topic, creatorType, defaultLang, creator.trim()); } // --- PDF CREATION DATE --- Calendar cCal = info.getCreationDate(); if (cCal != null) { String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "pdf-creation-date"); setData(topic, creationDateType, defaultLang, cdate.trim()); } } // --- PDF AUTHOR --- String author = info.getAuthor(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "pdf-author"); setData(topic, authorType, defaultLang, author.trim()); } // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "pdf-subject"); setData(topic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getSubject(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "pdf-title"); setData(topic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "pdf-keyword"); String[] keywordArray = keywords.split(";"); String keyword = null; for (int i = 0; i < keywordArray.length; i++) { keyword = Textbox.trimExtraSpaces(keywordArray[i]); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic }); } } } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); doc.close(); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xls") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(topic, topicMap, content, name); } } else if (lowerCaseLocator.endsWith("odt") || lowerCaseLocator.endsWith("odp") || lowerCaseLocator.endsWith("odg") || lowerCaseLocator.endsWith("ods")) { org.odftoolkit.simple.Document oodocument = org.odftoolkit.simple.Document .loadDocument(inputStream); String content = OpenOfficeBox.getText(oodocument); setTextEnrichment(topic, topicMap, content, name); org.odftoolkit.simple.meta.Meta meta = oodocument.getOfficeMetadata(); // --- OO KEYWORDS --- List<String> keywords = meta.getKeywords(); if (keywords != null && !keywords.isEmpty()) { Topic keywordType = createTopic(topicMap, "oo-keyword"); for (String keyword : keywords) { keyword = keyword.trim(); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { topic, keywordTopic }); } } } // --- OO TITLE --- String title = meta.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "oo-title"); setData(topic, titleType, defaultLang, title.trim()); } // --- OO SUBJECT --- String subject = meta.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "oo-subject"); setData(topic, subjectType, defaultLang, subject.trim()); } // --- OO CREATOR --- String author = meta.getCreator(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "oo-author"); setData(topic, authorType, defaultLang, author.trim()); } // --- OO CREATION DATE --- Calendar cCal = meta.getCreationDate(); if (cCal != null) { DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "oo-creation-date"); setData(topic, creationDateType, defaultLang, cdate.trim()); } } // --- OO DESCRIPTION --- String description = meta.getDescription(); if (description != null && description.length() > 0) { Topic descriptionType = createTopic(topicMap, "oo-description"); setData(topic, descriptionType, defaultLang, description.trim()); } // --- OO GENERATOR --- String generator = meta.getGenerator(); if (generator != null && generator.length() > 0) { Topic generatorType = createTopic(topicMap, "oo-generator"); setData(topic, generatorType, defaultLang, generator.trim()); } } else if (lowerCaseLocator.endsWith("html") || lowerCaseLocator.endsWith("htm")) { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(topic, topicMap, content, name); } else if (lowerCaseLocator.endsWith("txt") || lowerCaseLocator.endsWith("text")) { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(topic, topicMap, content, name); } // --- HANDLE ANY OTHER DOCUMENTS --- else { byte[] content = IObox.loadBFile(inputStream); String mimeType = ""; MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector"); Collection<MimeType> mimeTypes = new ArrayList(); if (locator != null) { if (MimeTypes.getMimeType(locator) != null) { mimeTypes.add(new MimeType(MimeTypes.getMimeType(locator))); } mimeTypes.addAll(MimeUtil.getMimeTypes(locator)); } mimeTypes.addAll(MimeUtil.getMimeTypes(content)); boolean isText = false; for (MimeType mime : mimeTypes) { if (MimeUtil.isTextMimeType(mime)) { isText = true; break; } } if (isText) { setTextEnrichment(topic, topicMap, new String(content), name); } else { if (!mimeTypes.isEmpty()) { MimeType mime = mimeTypes.iterator().next(); mimeType = mime.toString(); } setBinaryEnrichment(topic, topicMap, content, mimeType); } } } catch (Exception e) { log(e); } }
From source file:org.wandora.application.tools.extractors.files.SimplePDFExtractor.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic pdfTopic) {//from ww w.j a va 2 s . c o m PDDocument doc = null; try { if (locator.startsWith("http://")) { doc = PDDocument.load(new URL(locator)); } else { doc = PDDocument.load(new File(locator)); } PDDocumentInformation info = doc.getDocumentInformation(); DateFormat dateFormatter = new SimpleDateFormat(DEFAULT_DATE_FORMAT); // --- PDF PRODUCER --- String producer = info.getProducer(); if (producer != null && producer.length() > 0) { Topic producerType = createTopic(topicMap, "pdf-producer"); setData(pdfTopic, producerType, defaultLang, producer.trim()); } // --- PDF MODIFICATION DATE --- Calendar mCal = info.getModificationDate(); if (mCal != null) { String mdate = dateFormatter.format(mCal.getTime()); if (mdate != null && mdate.length() > 0) { Topic modificationDateType = createTopic(topicMap, "pdf-modification-date"); setData(pdfTopic, modificationDateType, defaultLang, mdate.trim()); } } // --- PDF CREATOR --- String creator = info.getCreator(); if (creator != null && creator.length() > 0) { Topic creatorType = createTopic(topicMap, "pdf-creator"); setData(pdfTopic, creatorType, defaultLang, creator.trim()); } // --- PDF CREATION DATE --- Calendar cCal = info.getCreationDate(); if (cCal != null) { String cdate = dateFormatter.format(cCal.getTime()); if (cdate != null && cdate.length() > 0) { Topic creationDateType = createTopic(topicMap, "pdf-creation-date"); setData(pdfTopic, creationDateType, defaultLang, cdate.trim()); } } // --- PDF AUTHOR --- String author = info.getAuthor(); if (author != null && author.length() > 0) { Topic authorType = createTopic(topicMap, "pdf-author"); setData(pdfTopic, authorType, defaultLang, author.trim()); } // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "pdf-subject"); setData(pdfTopic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getSubject(); if (title != null && title.length() > 0) { if (makeVariantFromTitle) { pdfTopic.setDisplayName(defaultLang, title); } else { Topic titleType = createTopic(topicMap, "pdf-title"); setData(pdfTopic, titleType, defaultLang, title.trim()); } } // --- PDF KEYWORDS (SEPARATED WITH SEMICOLON) --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "pdf-keyword"); String[] keywordArray = keywords.split(";"); String keyword = null; for (int i = 0; i < keywordArray.length; i++) { keyword = Textbox.trimExtraSpaces(keywordArray[i]); if (keyword != null && keyword.length() > 0) { Topic keywordTopic = createTopic(topicMap, keyword, keywordType); createAssociation(topicMap, keywordType, new Topic[] { pdfTopic, keywordTopic }); } } } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = new String(); if (makePageTopics) { int pages = doc.getNumberOfPages(); String pageContent = null; for (int i = 0; i < pages; i++) { stripper.setStartPage(i); stripper.setEndPage(i); pageContent = stripper.getText(doc); Topic pageType = createTopic(topicMap, "pdf-page"); Topic pageTopic = createTopic(topicMap, pdfTopic.getBaseName() + " (page " + i + ")", pageType); Topic orderType = createTopic(topicMap, "order"); Topic orderTopic = createTopic(topicMap, i + ".", orderType); Topic contentType = createTopic(topicMap, "pdf-text"); setData(pageTopic, contentType, defaultLang, pageContent.trim()); createAssociation(topicMap, pageType, new Topic[] { pdfTopic, pageTopic, orderTopic }); } } else { content = stripper.getText(doc); } if (!makePageTopics && content != null && content.length() > 0) { Topic contentType = createTopic(topicMap, "pdf-text"); setData(pdfTopic, contentType, defaultLang, content.trim()); } doc.close(); } catch (Exception e) { e.printStackTrace(); try { if (doc != null) doc.close(); } catch (Exception ix) { e.printStackTrace(); } } }
From source file:org.wandora.application.tools.extractors.fng.ExtractFNGTextEnrichment.java
License:Open Source License
public void _extractTopicsFromStream(String locator, InputStream inputStream, TopicMap topicMap, Topic textTopic) {//from ww w. j ava2s . c o m try { String lowerCaseLocator = locator.toLowerCase(); // --- HANDLE PDF ENRICHMENT TEXT --- if (lowerCaseLocator.endsWith("pdf")) { PDDocument doc = PDDocument.load(new URL(locator)); PDDocumentInformation info = doc.getDocumentInformation(); // --- PDF SUBJECT --- String subject = info.getSubject(); if (subject != null && subject.length() > 0) { Topic subjectType = createTopic(topicMap, "subject"); setData(textTopic, subjectType, defaultLang, subject.trim()); } // --- PDF TITLE --- String title = info.getTitle(); if (title != null && title.length() > 0) { Topic titleType = createTopic(topicMap, "title"); setData(textTopic, titleType, defaultLang, title.trim()); } // --- PDF KEYWORDS --- String keywords = info.getKeywords(); if (keywords != null && keywords.length() > 0) { Topic keywordType = createTopic(topicMap, "keywords"); setData(textTopic, keywordType, defaultLang, keywords.trim()); } // --- PDF TEXT CONTENT --- PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(doc); setTextEnrichment(textTopic, topicMap, content); doc.close(); } // --- HANDLE RTF DOCUMENTS --- else if (lowerCaseLocator.endsWith("rtf")) { String content = Textbox.RTF2PlainText(inputStream); setTextEnrichment(textTopic, topicMap, content); } // --- HANDLE OFFICE DOCUMENTS --- else if (lowerCaseLocator.endsWith("doc") || lowerCaseLocator.endsWith("docx") || lowerCaseLocator.endsWith("ppt") || lowerCaseLocator.endsWith("xsl") || lowerCaseLocator.endsWith("vsd")) { String content = MSOfficeBox.getText(inputStream); if (content != null) { setTextEnrichment(textTopic, topicMap, content); } } // --- HANDLE TXT DOCUMENTS --- else { String content = IObox.loadFile(new InputStreamReader(inputStream)); setTextEnrichment(textTopic, topicMap, content); } } catch (Exception e) { log(e); } }