Usage examples for the no-argument constructor DefaultHandler() of org.xml.sax.helpers.DefaultHandler
DefaultHandler
From source file:org.apache.solr.handler.dataimport.CustomTikaEntityProcessor.java
@Override public Map<String, Object> nextRow() { if (done)// w ww .j a v a 2s . co m return null; Map<String, Object> row = new HashMap<>(); String filePath = context.getResolvedEntityAttribute(URL); /* * Changed from original source * Required for later change */ @SuppressWarnings("unchecked") DataSource<InputStream> dataSource = context.getDataSource(); /* * Changed from original source * When dataSource is an InputStreamReader, create a new InputStream to handle this * */ InputStream is = null; if (InputStream.class.isInstance(dataSource)) { is = dataSource.getData(filePath); } else { try { is = new FileInputStream(new File(filePath)); } catch (FileNotFoundException e) { LOG.warn("Unable to create InputStream of " + filePath); } } ContentHandler contentHandler = null; Metadata metadata = new Metadata(); /* * Changed from original source * metadata is not able to determine the PDF fileformat without the filepath * see also: http://stackoverflow.com/questions/5507565/extracting-text-from-documents-of-unknown-content-type */ metadata.set(Metadata.RESOURCE_NAME_KEY, filePath); StringWriter sw = new StringWriter(); try { if ("html".equals(format)) { contentHandler = getHtmlHandler(sw); } else if ("xml".equals(format)) { contentHandler = getXmlContentHandler(sw); } else if ("text".equals(format)) { contentHandler = getTextContentHandler(sw); } else if ("none".equals(format)) { contentHandler = new DefaultHandler(); } } catch (TransformerConfigurationException e) { wrapAndThrow(SEVERE, e, "Unable to create content handler"); } Parser tikaParser = null; if (parser.equals(AUTO_PARSER)) { tikaParser = new AutoDetectParser(tikaConfig); } else { tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class); } try { ParseContext context = new ParseContext();//here /* * Changed from original source * makes it possible to index the content files contained in zip files * see also: https://issues.apache.org/jira/browse/SOLR-2332 and 
https://issues.apache.org/jira/secure/attachment/12469108/SOLR-2332.patch */ context.set(Parser.class, tikaParser); if ("identity".equals(htmlMapper)) { context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); } tikaParser.parse(is, contentHandler, metadata, context); } catch (Exception e) { /* * Changed from original source * print to log that file can't be read, instead of throwing error and stopping indexing */ //wrapAndThrow(SEVERE, e, "Unable to read content"); LOG.warn("Unable to read content of " + filePath); } IOUtils.closeQuietly(is); for (Map<String, String> field : context.getAllEntityFields()) { if (!"true".equals(field.get("meta"))) continue; String col = field.get(COLUMN); String s = metadata.get(col); if (s != null) row.put(col, s); } if (!"none".equals(format)) row.put("text", sw.toString()); done = true; return row; }
From source file:org.apache.solr.handler.dataimport.TikaEntityProcessor.java
/**
 * Produces the single row for this entity by running a Tika parser over the stream that the
 * data source returns for the resolved {@code URL} entity attribute.
 *
 * <p>The row contains one entry per entity field flagged {@code meta="true"} for which the
 * parsed metadata holds a value, plus a {@code "text"} entry with the extracted content
 * unless the format is {@code "none"}; {@code tryToAddLatLon} is then given the metadata and
 * the row.
 *
 * @return the row for the document on the first call, {@code null} on every later call
 * @throws DataImportHandlerException with {@code SKIP_ROW} when parsing fails and
 *         {@code onError} is {@code SKIP}; other failures are passed to {@code wrapAndThrow}
 */
@Override
public Map<String, Object> nextRow() {
    if (done) {
        return null;
    }
    Map<String, Object> row = new HashMap<>();
    DataSource<InputStream> dataSource = context.getDataSource();
    InputStream is = dataSource.getData(context.getResolvedEntityAttribute(URL));
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();

    try {
        ContentHandler contentHandler = null;
        try {
            if ("html".equals(format)) {
                contentHandler = getHtmlHandler(sw);
            } else if ("xml".equals(format)) {
                contentHandler = getXmlContentHandler(sw);
            } else if ("text".equals(format)) {
                contentHandler = getTextContentHandler(sw);
            } else if ("none".equals(format)) {
                contentHandler = new DefaultHandler();
            }
        } catch (TransformerConfigurationException e) {
            wrapAndThrow(SEVERE, e, "Unable to create content handler");
        }

        Parser tikaParser;
        if (parser.equals(AUTO_PARSER)) {
            tikaParser = new AutoDetectParser(tikaConfig);
        } else {
            tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
        }

        try {
            // Named parseContext so it does not shadow the entity-processor 'context' field.
            ParseContext parseContext = new ParseContext();
            if ("identity".equals(htmlMapper)) {
                parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
            }
            if (extractEmbedded) {
                // Registering the parser in the context lets it recurse into embedded documents.
                parseContext.set(Parser.class, tikaParser);
            }
            tikaParser.parse(is, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            if (SKIP.equals(onError)) {
                // Keep the original failure as the cause so the skipped row can be diagnosed.
                throw new DataImportHandlerException(DataImportHandlerException.SKIP_ROW,
                        "Document skipped :" + e.getMessage(), e);
            }
            wrapAndThrow(SEVERE, e, "Unable to read content");
        }
    } finally {
        // Every error path above throws, so the stream must be closed here to avoid a leak.
        IOUtils.closeQuietly(is);
    }

    for (Map<String, String> field : context.getAllEntityFields()) {
        if (!"true".equals(field.get("meta"))) {
            continue;
        }
        String col = field.get(COLUMN);
        String s = metadata.get(col);
        if (s != null) {
            row.put(col, s);
        }
    }
    if (!"none".equals(format)) {
        row.put("text", sw.toString());
    }
    tryToAddLatLon(metadata, row);
    done = true;
    return row;
}
From source file:org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.java
@Override public boolean processFileResource(FileResource fileResource) { Parser wrapped = parserFactory.getParser(tikaConfig); RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory); ParseContext context = new ParseContext(); // if (parseRecursively == true) { context.set(Parser.class, parser); // }// ww w. ja v a 2 s . c om //try to open outputstream first OutputStream os = getOutputStream(fsOSFactory, fileResource); if (os == null) { LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH)); return false; } //try to open the inputstream before the parse. //if the parse hangs or throws a nasty exception, at least there will //be a zero byte file there so that the batchrunner can skip that problematic //file during the next run. InputStream is = getInputStream(fileResource); if (is == null) { IOUtils.closeQuietly(os); return false; } Throwable thrown = null; List<Metadata> metadataList = null; Metadata containerMetadata = fileResource.getMetadata(); try { parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), containerMetadata, context); metadataList = parser.getMetadata(); } catch (Throwable t) { thrown = t; metadataList = parser.getMetadata(); if (metadataList == null) { metadataList = new LinkedList<>(); } Metadata m = null; if (metadataList.size() == 0) { m = containerMetadata; } else { //take the top metadata item m = metadataList.remove(0); } String stackTrace = ExceptionUtils.getFilteredStackTrace(t); m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime", stackTrace); metadataList.add(0, m); } finally { IOUtils.closeQuietly(is); } Writer writer = null; try { writer = new OutputStreamWriter(os, getOutputEncoding()); JsonMetadataList.toJson(metadataList, writer); } catch (Exception e) { //this is a stop the world kind of thing LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e)); throw new RuntimeException(e); } finally { 
flushAndClose(writer); } if (thrown != null) { if (thrown instanceof Error) { throw (Error) thrown; } else { return false; } } return true; }
From source file:org.apache.tika.parser.epub.EpubParser.java
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Because an EPub file is often made up of multiple XHTML files, // we need explicit control over the start and end of the document XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument();// w ww.ja v a 2 s . co m ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml)); ZipInputStream zip = new ZipInputStream(stream); ZipEntry entry = zip.getNextEntry(); while (entry != null) { if (entry.getName().equals("mimetype")) { String type = IOUtils.toString(zip, UTF_8); //often has trailing new lines if (type != null) { type = type.trim(); } metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals("metadata.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".opf")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".html") || entry.getName().endsWith(".xhtml")) { content.parse(zip, childHandler, metadata, context); } entry = zip.getNextEntry(); } // Finish everything xhtml.endDocument(); }
From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java
public void testJPEGIPTC() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("Washington", metadata.get(IPTC.CITY)); assertEquals("United States", metadata.get(IPTC.COUNTRY)); assertEquals("US", metadata.get(IPTC.COUNTRY_CODE)); assertEquals(/*from w ww .j ava 2 s . com*/ "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.", metadata.get(IPTC.DESCRIPTION)); assertEquals( "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.", metadata.get(Metadata.DESCRIPTION)); assertEquals("Rock Creek Park", metadata.get(IPTC.HEADLINE)); assertEquals("Downstream", metadata.get(Metadata.TITLE)); assertEquals("intellectual genre", metadata.get(IPTC.INTELLECTUAL_GENRE)); List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS)); assertTrue(iptcKeywords.contains("stream")); assertTrue(iptcKeywords.contains("park")); assertTrue(iptcKeywords.contains("bank")); assertEquals(5, iptcKeywords.size()); List<String> tikaKeywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS)); assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("stream")); assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("park")); assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("bank")); assertEquals(5, tikaKeywords.size()); assertEquals("DC", metadata.get(IPTC.PROVINCE_OR_STATE)); List<String> iptcSceneCode = Arrays.asList(metadata.getValues(IPTC.SCENE_CODE)); assertEquals(2, iptcSceneCode.size()); assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 1")); assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 2")); List<String> iptcSubjectCode = Arrays.asList(metadata.getValues(IPTC.SUBJECT_CODE)); 
assertEquals(2, iptcSubjectCode.size()); assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 1")); assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 2")); assertEquals("Rock Creek Park", metadata.get(IPTC.SUBLOCATION)); GregorianCalendar calendar = new GregorianCalendar(); calendar.set(Calendar.YEAR, 2011); calendar.set(Calendar.MONTH, 7); calendar.set(Calendar.DATE, 31); calendar.set(Calendar.HOUR_OF_DAY, 12); calendar.set(Calendar.MINUTE, 0); calendar.set(Calendar.SECOND, 0); calendar.set(Calendar.MILLISECOND, 0); calendar.setTimeZone(TimeZone.getTimeZone("UTC")); assertEquals(calendar.getTime(), metadata.getDate(IPTC.DATE_CREATED)); assertEquals("Ray Gauss II", metadata.get(IPTC.DESCRIPTION_WRITER)); assertEquals("instructions", metadata.get(IPTC.INSTRUCTIONS)); assertEquals("job identifier", metadata.get(IPTC.JOB_ID)); assertEquals("Downstream", metadata.get(IPTC.TITLE)); assertTrue(metadata.get(IPTC.COPYRIGHT_NOTICE).contains("Ray Gauss II")); List<String> creators = Arrays.asList(metadata.getValues(IPTC.CREATOR)); assertTrue(Arrays.toString(creators.toArray()).contains("Ray Gauss II")); assertEquals("DAM Architect", metadata.get(IPTC.CREATORS_JOB_TITLE)); assertEquals("provider", metadata.get(IPTC.CREDIT_LINE)); assertEquals("rights usage terms", metadata.get(IPTC.RIGHTS_USAGE_TERMS)); assertEquals("source", metadata.get(IPTC.SOURCE)); assertEquals("1234 Some Road", metadata.get(IPTC.CONTACT_INFO_ADDRESS)); assertEquals("Atlanta", metadata.get(IPTC.CONTACT_INFO_CITY)); assertEquals("US", metadata.get(IPTC.CONTACT_INFO_COUNTRY)); List<String> ciWorkEmails = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_EMAIL)); // Photoshop does not support true multi-value here assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("info@alfresco.com")); assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("other@example.com")); List<String> ciWorkTels = 
Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_PHONE)); // Photoshop does not support true multi-value here assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-1234")); assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-4321")); assertEquals("30339", metadata.get(IPTC.CONTACT_INFO_POSTAL_CODE)); assertEquals("GA", metadata.get(IPTC.CONTACT_INFO_STATE_PROVINCE)); List<String> ciWorkUrls = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_WEB_URL)); // Photoshop does not support true multi-value here assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://alfresco.com")); assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://example.com")); assertEquals("rocky 1 and rocky 2 are big", metadata.get(IPTC.ADDITIONAL_MODEL_INFO)); List<String> orgCodes = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_CODE)); assertEquals(2, orgCodes.size()); assertEquals("ASPP", orgCodes.get(0)); assertEquals("OTHER_ORG", orgCodes.get(1)); // List<String> cvTerms = Arrays.asList(metadata.getValues(IPTC.CONTROLLED_VOCABULARY_TERM)); List<String> modelAges = Arrays.asList(metadata.getValues(IPTC.MODEL_AGE)); assertEquals(2, modelAges.size()); assertEquals("1000", modelAges.get(0)); assertEquals("1001", modelAges.get(1)); List<String> orgNames = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_NAME)); assertEquals(2, orgNames.size()); assertEquals("ASPP", orgNames.get(0)); assertEquals("Other Org", orgNames.get(1)); List<String> peopleShown = Arrays.asList(metadata.getValues(IPTC.PERSON)); assertEquals(2, peopleShown.size()); assertEquals("rocky 1", peopleShown.get(0)); assertEquals("rocky 2", peopleShown.get(1)); assertEquals("http://cv.iptc.org/newscodes/digitalsourcetype/digitalCapture", metadata.get(IPTC.DIGITAL_SOURCE_TYPE)); assertEquals("Photo Bike Tour", metadata.get(IPTC.EVENT)); assertEquals("RGAUSS", metadata.get(IPTC.IMAGE_SUPPLIER_ID)); assertEquals("Ray Gauss II", metadata.get(IPTC.IMAGE_SUPPLIER_NAME)); 
assertEquals("supplier image ID", metadata.get(IPTC.IMAGE_SUPPLIER_IMAGE_ID)); assertEquals("3456", metadata.get(IPTC.MAX_AVAIL_HEIGHT)); assertEquals("5184", metadata.get(IPTC.MAX_AVAIL_WIDTH)); assertEquals("1.2.0", metadata.get(IPTC.PLUS_VERSION)); List<String> copyrightOwnerIds = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_ID)); assertEquals(1, copyrightOwnerIds.size()); assertEquals("RGAUSS", copyrightOwnerIds.get(0)); // assertEquals("", copyrightOwnerIds.get(1)); // TODO: Get ExifTool to preserve empty values List<String> copyrightOwnerNames = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_NAME)); assertEquals(2, copyrightOwnerNames.size()); assertEquals("Ray Gauss II", copyrightOwnerNames.get(0)); assertEquals("GG", copyrightOwnerNames.get(1)); List<String> imageCreatorIds = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_ID)); assertEquals(1, imageCreatorIds.size()); assertEquals("RGAUSS", imageCreatorIds.get(0)); // assertEquals("", imageCreatorIds.get(1)); // TODO: Get ExifTool to preserve empty values assertTrue(metadata.isMultiValued(IPTC.IMAGE_CREATOR_NAME)); List<String> imageCreatorNames = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_NAME)); assertEquals(2, imageCreatorNames.size()); assertEquals("Ray Gauss II", imageCreatorNames.get(0)); assertEquals("GG", imageCreatorNames.get(1)); List<String> licensorIds = Arrays.asList(metadata.getValues(IPTC.LICENSOR_ID)); assertEquals("RGAUSS", licensorIds.get(0)); assertTrue(metadata.isMultiValued(IPTC.LICENSOR_NAME)); List<String> licensorNames = Arrays.asList(metadata.getValues(IPTC.LICENSOR_NAME)); assertEquals(2, licensorNames.size()); assertEquals("Ray Gauss II", licensorNames.get(0)); assertEquals("GG", licensorNames.get(1)); // Photoshop does not support licensor addresses, cities, or countries List<String> licensorEmails = Arrays.asList(metadata.getValues(IPTC.LICENSOR_EMAIL)); assertEquals("r@example.com", licensorEmails.get(0)); // assertEquals("", 
licensorEmails.get(1)); // TODO: Get ExifTool to preserve empty values List<String> licensorTel1 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_1)); assertEquals("555-5555", licensorTel1.get(0)); // assertEquals("", licensorTel1.get(1)); // TODO: Get ExifTool to preserve empty values List<String> licensorTel2 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_2)); assertEquals("555-4444", licensorTel2.get(0)); // assertEquals("", licensorTel2.get(1)); // TODO: Get ExifTool to preserve empty values List<String> licensorUrls = Arrays.asList(metadata.getValues(IPTC.LICENSOR_URL)); assertEquals("http://rgauss.com", licensorUrls.get(0)); // assertEquals("", licensorUrls.get(1)); // TODO: Get ExifTool to preserve empty values assertEquals("Age Unknown", metadata.get(IPTC.MINOR_MODEL_AGE_DISCLOSURE)); List<String> modelReleaseIds = Arrays.asList(metadata.getValues(IPTC.MODEL_RELEASE_ID)); assertEquals("model release id 1", modelReleaseIds.get(0)); assertEquals("model release id 2", modelReleaseIds.get(1)); assertEquals("Not Applicable", metadata.get(IPTC.MODEL_RELEASE_STATUS)); List<String> propertyReleaseIds = Arrays.asList(metadata.getValues(IPTC.PROPERTY_RELEASE_ID)); assertEquals("prop release id 1", propertyReleaseIds.get(0)); assertEquals("prop release id 2", propertyReleaseIds.get(1)); assertEquals("Not Applicable", metadata.get(IPTC.PROPERTY_RELEASE_STATUS)); List<String> aoCopyright = Arrays .asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE)); assertEquals("Ray Gauss II", aoCopyright.get(0)); // assertEquals("", aoCopyright.get(1)); // TODO: Get ExifTool to preserve empty values // assertEquals("", aoCopyright.get(2)); // TODO: Get ExifTool to preserve empty values List<String> aoCreator = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_CREATOR)); assertEquals("Mother Nature", aoCreator.get(0)); assertEquals("Man", aoCreator.get(1)); assertEquals("Mother Nature", aoCreator.get(2)); List<String> 
aoDateCreated = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED)); assertEquals("1890:01:01", aoDateCreated.get(0)); // assertEquals("", aoDateCreated.get(1)); // TODO: Get ExifTool to preserve empty values assertEquals("1901:02:01", aoDateCreated.get(1)); // assertEquals("", aoDateCreated.get(2)); // TODO: Get ExifTool to preserve empty values List<String> aoSource = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE)); assertEquals("National Park Service", aoSource.get(0)); // assertEquals("", aoSource.get(1)); // TODO: Get ExifTool to preserve empty values // assertEquals("", aoSource.get(2)); // TODO: Get ExifTool to preserve empty values List<String> aoSourceInventoryNum = Arrays .asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER)); assertEquals("123456", aoSourceInventoryNum.get(0)); // assertEquals("", aoSourceInventoryNum.get(1)); // TODO: Get ExifTool to preserve empty values assertEquals("654321", aoSourceInventoryNum.get(1)); // This should be index 2, TODO: Get ExifTool to preserve empty values List<String> aoSourceTitles = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_TITLE)); assertEquals("Rock Creek Stream Bank", aoSourceTitles.get(0)); assertEquals("Pollution", aoSourceTitles.get(1)); assertEquals("Some Tree", aoSourceTitles.get(2)); List<String> locationShownCity = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_CITY)); assertEquals("Washington", locationShownCity.get(0)); // assertEquals("", locationShownCity.get(1)); // TODO: Get ExifTool to preserve empty values List<String> locationShownCountryCode = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_CODE)); assertEquals("US", locationShownCountryCode.get(0)); // assertEquals("", locationShownCountryCode.get(1)); // TODO: Get ExifTool to preserve empty values List<String> locationShownCountryName = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_NAME)); assertEquals("United 
States", locationShownCountryName.get(0)); // assertEquals("", locationShownCountryName.get(1)); // TODO: Get ExifTool to preserve empty values List<String> locationShownState = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_PROVINCE_OR_STATE)); assertEquals("D.C.", locationShownState.get(0)); // assertEquals("", locationShownState.get(1)); // TODO: Get ExifTool to preserve empty values List<String> locationShownSublocation = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_SUBLOCATION)); assertEquals("Rock Creek Park Sub", locationShownSublocation.get(0)); assertEquals("Stream Section", locationShownSublocation.get(1)); List<String> locationShownWorldRegion = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_WORLD_REGION)); assertEquals("North America", locationShownWorldRegion.get(0)); // assertEquals("", locationShownWorldRegion.get(1)); // TODO: Get ExifTool to preserve empty values assertEquals("Washington", metadata.get(IPTC.LOCATION_CREATED_CITY)); assertEquals("US", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_CODE)); assertEquals("United States", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_NAME)); assertEquals("D.C.", metadata.get(IPTC.LOCATION_CREATED_PROVINCE_OR_STATE)); assertEquals("Rock Creek Park", metadata.get(IPTC.LOCATION_CREATED_SUBLOCATION)); assertEquals("North America", metadata.get(IPTC.LOCATION_CREATED_WORLD_REGION)); assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted()); assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID)); List<String> registryEntryOrgIds = Arrays .asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID)); assertEquals(2, registryEntryOrgIds.size()); assertEquals("PLUS", registryEntryOrgIds.get(0)); // assertEquals("", registryEntryOrgIds.get(1)); // TODO: Get ExifTool to preserve empty values assertEquals("ORG 2", registryEntryOrgIds.get(1)); // This should be index 2, TODO: Get ExifTool to preserve empty values 
assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted()); assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID)); List<String> registryEntryItemIds = Arrays.asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID)); assertEquals(registryEntryItemIds.size(), 3); assertEquals("100-ABC-ABC-555", registryEntryItemIds.get(0)); assertEquals("11223344", registryEntryItemIds.get(1)); assertEquals("55667788", registryEntryItemIds.get(2)); }
From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java
/**
 * Checks that custom XMP properties handed to an {@code ExiftoolImageParser} as
 * pass-through properties show up in the metadata under their own names.
 */
public void testJPEGCustomXmp() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");

    ArrayList<Property> passthroughXmpProperties = new ArrayList<Property>(2);
    passthroughXmpProperties.add(Property.internalText("XMP-custom:Text"));
    passthroughXmpProperties.add(Property.internalText("XMP-custom:TextML"));
    Parser passthroughParser = new ExiftoolImageParser(null, passthroughXmpProperties);

    // try-with-resources so the test resource is closed even when parsing throws.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg")) {
        passthroughParser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    assertEquals("customTextField", metadata.get("XMP-custom:Text"));
    assertEquals("customMultilineField", metadata.get("XMP-custom:TextML"));
}
From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java
/**
 * Parses {@code testJPEG.jpg}, checks the reported image width and traces every metadata
 * entry that was produced.
 */
public void testJPEG() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    // try-with-resources so the test resource is closed even when parsing throws.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG.jpg")) {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    for (String name : metadata.names()) {
        logger.trace("JPEG-- " + name + "=" + metadata.get(name));
    }
}
From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java
public void testPNGIPTC() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/png"); InputStream stream = getClass().getResourceAsStream("/test-documents/testPNG_IPTC.png"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); for (String name : metadata.names()) { logger.trace("PNG-- " + name + "=" + metadata.get(name)); }// www .j av a 2 s. c om assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH)); assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE)); }
From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java
public void testTIFFIPTC() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); InputStream stream = getClass().getResourceAsStream("/test-documents/testTIFF_IPTC.tif"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); for (String name : metadata.names()) { logger.trace("TIFF-- " + name + "=" + metadata.get(name)); }// ww w .j a v a 2 s. c o m List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS)); assertTrue(iptcKeywords.contains("garden")); assertTrue(iptcKeywords.contains("cat")); assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE)); }
From source file:org.apache.tika.parser.image.WebPParserTest.java
@Test public void testSimple() throws Exception { Metadata metadata = new Metadata(); InputStream stream = getClass().getResourceAsStream("/test-documents/testWebp_Alpha_Lossy.webp"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("301", metadata.get("Image Height")); assertEquals("400", metadata.get("Image Width")); assertEquals("true", metadata.get("Has Alpha")); assertEquals("false", metadata.get("Is Animation")); assertEquals("image/webp", metadata.get(Metadata.CONTENT_TYPE)); IOUtils.closeQuietly(stream);/*from w ww . j a v a2s.com*/ metadata = new Metadata(); stream = getClass().getResourceAsStream("/test-documents/testWebp_Alpha_Lossless.webp"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); //unfortunately, there isn't much metadata in lossless assertEquals("image/webp", metadata.get(Metadata.CONTENT_TYPE)); }