Example usage for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()

List of usage examples for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()

Introduction

On this page you can find example usages of the DefaultHandler() constructor of the class org.xml.sax.helpers.DefaultHandler.

Prototype

public DefaultHandler()

Source Link

Usage

From source file:org.apache.solr.handler.dataimport.CustomTikaEntityProcessor.java

/**
 * Produces the single row for this entity by running a Tika parser over the
 * resolved file.
 *
 * <p>The row contains every entity field flagged {@code meta="true"} for which
 * the parsed metadata holds a value, plus a {@code "text"} entry with the
 * extracted content unless {@code format} is {@code "none"}.
 *
 * <p>A file that cannot be opened or parsed is logged as a warning (including
 * the cause) and does not abort the import; the row is then built from
 * whatever metadata is available.
 *
 * @return the row, or {@code null} once the row has already been returned
 */
@Override
public Map<String, Object> nextRow() {
    if (done)
        return null;
    Map<String, Object> row = new HashMap<>();
    String filePath = context.getResolvedEntityAttribute(URL);

    /*
     * Changed from original source
     * Required for later change
     */
    @SuppressWarnings("unchecked")
    DataSource<InputStream> dataSource = context.getDataSource();

    Metadata metadata = new Metadata();

    /*
     * Changed from original source
     * metadata is not able to determine the PDF fileformat without the filepath
     * see also: http://stackoverflow.com/questions/5507565/extracting-text-from-documents-of-unknown-content-type
     */
    metadata.set(Metadata.RESOURCE_NAME_KEY, filePath);

    StringWriter sw = new StringWriter();

    InputStream is = null;
    try {
        /*
         * Changed from original source
         * When dataSource is an InputStreamReader, create a new InputStream to handle this
         *
         * NOTE(review): this tests whether the DataSource object itself is an
         * InputStream, which presumably is never the case, so the file branch
         * would always be taken - confirm the intended check.
         */
        if (InputStream.class.isInstance(dataSource)) {
            is = dataSource.getData(filePath);
        } else {
            try {
                is = new FileInputStream(new File(filePath));
            } catch (FileNotFoundException e) {
                // Keep going: the parse below fails on the null stream and is logged as well.
                LOG.warn("Unable to create InputStream of " + filePath, e);
            }
        }

        // Stays null when format is none of html/xml/text/none.
        ContentHandler contentHandler = null;
        try {
            if ("html".equals(format)) {
                contentHandler = getHtmlHandler(sw);
            } else if ("xml".equals(format)) {
                contentHandler = getXmlContentHandler(sw);
            } else if ("text".equals(format)) {
                contentHandler = getTextContentHandler(sw);
            } else if ("none".equals(format)) {
                contentHandler = new DefaultHandler();
            }
        } catch (TransformerConfigurationException e) {
            wrapAndThrow(SEVERE, e, "Unable to create content handler");
        }

        Parser tikaParser = null;
        if (parser.equals(AUTO_PARSER)) {
            tikaParser = new AutoDetectParser(tikaConfig);
        } else {
            tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
        }

        try {
            // Named parseContext so it does not shadow the entity-processor field "context".
            ParseContext parseContext = new ParseContext();
            /*
             * Changed from original source
             * makes it possible to index the content files contained in zip files
             * see also: https://issues.apache.org/jira/browse/SOLR-2332 and https://issues.apache.org/jira/secure/attachment/12469108/SOLR-2332.patch
             */
            parseContext.set(Parser.class, tikaParser);
            if ("identity".equals(htmlMapper)) {
                parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
            }
            tikaParser.parse(is, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            /*
             * Changed from original source
             * print to log that file can't be read, instead of throwing error and stopping indexing
             */
            LOG.warn("Unable to read content of " + filePath, e);
        }
    } finally {
        // Release the stream even when handler or parser creation throws.
        IOUtils.closeQuietly(is);
    }

    for (Map<String, String> field : context.getAllEntityFields()) {
        if (!"true".equals(field.get("meta")))
            continue;
        String col = field.get(COLUMN);
        String s = metadata.get(col);
        if (s != null)
            row.put(col, s);
    }
    if (!"none".equals(format))
        row.put("text", sw.toString());
    done = true;
    return row;
}

From source file:org.apache.solr.handler.dataimport.TikaEntityProcessor.java

/**
 * Produces the single row for this entity by running a Tika parser over the
 * stream the data source returns for the resolved URL attribute.
 *
 * <p>The row contains every entity field flagged {@code meta="true"} for which
 * the parsed metadata holds a value, plus a {@code "text"} entry with the
 * extracted content unless {@code format} is {@code "none"}.
 *
 * @return the row, or {@code null} once the row has already been returned
 * @throws DataImportHandlerException with {@code SKIP_ROW} when parsing fails
 *         and {@code onError} equals {@code SKIP}; other parse failures are
 *         passed to {@code wrapAndThrow}
 */
@Override
public Map<String, Object> nextRow() {
    if (done)
        return null;
    Map<String, Object> row = new HashMap<>();
    DataSource<InputStream> dataSource = context.getDataSource();
    InputStream is = dataSource.getData(context.getResolvedEntityAttribute(URL));
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    try {
        // Stays null when format is none of html/xml/text/none.
        ContentHandler contentHandler = null;
        try {
            if ("html".equals(format)) {
                contentHandler = getHtmlHandler(sw);
            } else if ("xml".equals(format)) {
                contentHandler = getXmlContentHandler(sw);
            } else if ("text".equals(format)) {
                contentHandler = getTextContentHandler(sw);
            } else if ("none".equals(format)) {
                contentHandler = new DefaultHandler();
            }
        } catch (TransformerConfigurationException e) {
            wrapAndThrow(SEVERE, e, "Unable to create content handler");
        }
        Parser tikaParser = null;
        if (parser.equals(AUTO_PARSER)) {
            tikaParser = new AutoDetectParser(tikaConfig);
        } else {
            tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
        }
        try {
            // Named parseContext so it does not shadow the entity-processor field "context".
            ParseContext parseContext = new ParseContext();
            if ("identity".equals(htmlMapper)) {
                parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
            }
            if (extractEmbedded) {
                parseContext.set(Parser.class, tikaParser);
            }
            tikaParser.parse(is, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            if (SKIP.equals(onError)) {
                throw new DataImportHandlerException(DataImportHandlerException.SKIP_ROW,
                        "Document skipped :" + e.getMessage());
            }
            wrapAndThrow(SEVERE, e, "Unable to read content");
        }
    } finally {
        // The failure paths above always throw, so the stream must be released here.
        IOUtils.closeQuietly(is);
    }
    for (Map<String, String> field : context.getAllEntityFields()) {
        if (!"true".equals(field.get("meta")))
            continue;
        String col = field.get(COLUMN);
        String s = metadata.get(col);
        if (s != null)
            row.put(col, s);
    }
    if (!"none".equals(format))
        row.put("text", sw.toString());
    tryToAddLatLon(metadata, row);
    done = true;
    return row;
}

From source file:org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.java

/**
 * Parses one file resource with a {@link RecursiveParserWrapper} and writes the
 * collected metadata list as JSON to the resource's output stream.
 *
 * @param fileResource the resource to parse
 * @return {@code true} when the parse finished without throwing; {@code false}
 *         when no output or input stream could be obtained or when the parse
 *         threw a non-{@link Error} throwable
 */
@Override
public boolean processFileResource(FileResource fileResource) {

    RecursiveParserWrapper parser = new RecursiveParserWrapper(parserFactory.getParser(tikaConfig),
            contentHandlerFactory);
    ParseContext context = new ParseContext();
    // the wrapper is always registered, so embedded documents go through it as well
    context.set(Parser.class, parser);

    // the output stream is obtained before anything else
    OutputStream os = getOutputStream(fsOSFactory, fileResource);
    if (os == null) {
        LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH));
        return false;
    }

    // The input stream is opened only after the output stream exists: if the
    // parse hangs or blows up, a zero byte file is left behind so that the
    // batch runner can skip the problematic file on its next run.
    InputStream is = getInputStream(fileResource);
    if (is == null) {
        IOUtils.closeQuietly(os);
        return false;
    }

    Throwable failure = null;
    List<Metadata> metadataList = null;
    Metadata containerMetadata = fileResource.getMetadata();
    try {
        parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), containerMetadata, context);
        metadataList = parser.getMetadata();
    } catch (Throwable t) {
        failure = t;
        metadataList = parser.getMetadata();
        if (metadataList == null) {
            metadataList = new LinkedList<>();
        }
        // record the stack trace on the top metadata item, or on the container
        // metadata when the parser collected nothing
        Metadata top = metadataList.isEmpty() ? containerMetadata : metadataList.remove(0);
        top.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime",
                ExceptionUtils.getFilteredStackTrace(t));
        metadataList.add(0, top);
    } finally {
        IOUtils.closeQuietly(is);
    }

    Writer writer = null;
    try {
        writer = new OutputStreamWriter(os, getOutputEncoding());
        JsonMetadataList.toJson(metadataList, writer);
    } catch (Exception e) {
        // failing to write the output is treated as fatal
        LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e));
        throw new RuntimeException(e);
    } finally {
        flushAndClose(writer);
    }

    // errors are propagated once the output has been written; other throwables
    // only mark the resource as failed
    if (failure instanceof Error) {
        throw (Error) failure;
    }
    return failure == null;
}

From source file:org.apache.tika.parser.epub.EpubParser.java

/**
 * Reads the given stream as a zip archive and dispatches each entry by name:
 * the {@code mimetype} entry sets the content type, {@code metadata.xml} and
 * {@code *.opf} entries go to the metadata parser, and {@code *.html} /
 * {@code *.xhtml} entries go to the content parser. All other entries are
 * ignored.
 *
 * @param stream   the EPUB data
 * @param handler  receives the XHTML SAX events
 * @param metadata populated while parsing
 * @param context  passed on to the delegate parsers
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // An EPUB usually bundles several XHTML files, so the document start and
    // end are emitted here rather than by the per-entry parsers.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));

    ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        String name = entry.getName();
        if (name.equals("mimetype")) {
            String type = IOUtils.toString(zip, UTF_8);
            // the value frequently ends with newlines
            metadata.set(Metadata.CONTENT_TYPE, type == null ? null : type.trim());
        } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
            content.parse(zip, childHandler, metadata, context);
        }
    }

    // close the document opened above
    xhtml.endDocument();
}

From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java

/**
 * Parses {@code testJPEG_IPTC_EXT.jpg} with the parser under test and checks
 * the extracted IPTC metadata values field by field.
 */
public void testJPEGIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // getResourceAsStream returns null when the resource is missing
        if (stream != null) {
            stream.close();
        }
    }

    assertEquals("Washington", metadata.get(IPTC.CITY));
    assertEquals("United States", metadata.get(IPTC.COUNTRY));
    assertEquals("US", metadata.get(IPTC.COUNTRY_CODE));

    assertEquals(
            "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.",
            metadata.get(IPTC.DESCRIPTION));
    assertEquals(
            "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.",
            metadata.get(Metadata.DESCRIPTION));

    assertEquals("Rock Creek Park", metadata.get(IPTC.HEADLINE));
    assertEquals("Downstream", metadata.get(Metadata.TITLE));

    assertEquals("intellectual genre", metadata.get(IPTC.INTELLECTUAL_GENRE));

    List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS));
    assertTrue(iptcKeywords.contains("stream"));
    assertTrue(iptcKeywords.contains("park"));
    assertTrue(iptcKeywords.contains("bank"));
    assertEquals(5, iptcKeywords.size());
    List<String> tikaKeywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("stream"));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("park"));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("bank"));
    assertEquals(5, tikaKeywords.size());

    assertEquals("DC", metadata.get(IPTC.PROVINCE_OR_STATE));

    List<String> iptcSceneCode = Arrays.asList(metadata.getValues(IPTC.SCENE_CODE));
    assertEquals(2, iptcSceneCode.size());
    assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 1"));
    assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 2"));

    List<String> iptcSubjectCode = Arrays.asList(metadata.getValues(IPTC.SUBJECT_CODE));
    assertEquals(2, iptcSubjectCode.size());
    assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 1"));
    assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 2"));

    assertEquals("Rock Creek Park", metadata.get(IPTC.SUBLOCATION));

    // Expected creation date: 2011-08-31 12:00:00.000 UTC (Calendar months are zero-based, 7 = August)
    GregorianCalendar calendar = new GregorianCalendar();
    calendar.set(Calendar.YEAR, 2011);
    calendar.set(Calendar.MONTH, 7);
    calendar.set(Calendar.DATE, 31);
    calendar.set(Calendar.HOUR_OF_DAY, 12);
    calendar.set(Calendar.MINUTE, 0);
    calendar.set(Calendar.SECOND, 0);
    calendar.set(Calendar.MILLISECOND, 0);
    calendar.setTimeZone(TimeZone.getTimeZone("UTC"));
    assertEquals(calendar.getTime(), metadata.getDate(IPTC.DATE_CREATED));

    assertEquals("Ray Gauss II", metadata.get(IPTC.DESCRIPTION_WRITER));
    assertEquals("instructions", metadata.get(IPTC.INSTRUCTIONS));
    assertEquals("job identifier", metadata.get(IPTC.JOB_ID));
    assertEquals("Downstream", metadata.get(IPTC.TITLE));
    assertTrue(metadata.get(IPTC.COPYRIGHT_NOTICE).contains("Ray Gauss II"));

    List<String> creators = Arrays.asList(metadata.getValues(IPTC.CREATOR));
    assertTrue(Arrays.toString(creators.toArray()).contains("Ray Gauss II"));

    assertEquals("DAM Architect", metadata.get(IPTC.CREATORS_JOB_TITLE));
    assertEquals("provider", metadata.get(IPTC.CREDIT_LINE));
    assertEquals("rights usage terms", metadata.get(IPTC.RIGHTS_USAGE_TERMS));
    assertEquals("source", metadata.get(IPTC.SOURCE));
    assertEquals("1234 Some Road", metadata.get(IPTC.CONTACT_INFO_ADDRESS));
    assertEquals("Atlanta", metadata.get(IPTC.CONTACT_INFO_CITY));
    assertEquals("US", metadata.get(IPTC.CONTACT_INFO_COUNTRY));

    List<String> ciWorkEmails = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_EMAIL));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("info@alfresco.com"));
    assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("other@example.com"));

    List<String> ciWorkTels = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_PHONE));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-1234"));
    assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-4321"));

    assertEquals("30339", metadata.get(IPTC.CONTACT_INFO_POSTAL_CODE));
    assertEquals("GA", metadata.get(IPTC.CONTACT_INFO_STATE_PROVINCE));

    List<String> ciWorkUrls = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_WEB_URL));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://alfresco.com"));
    assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://example.com"));

    assertEquals("rocky 1 and rocky 2 are big", metadata.get(IPTC.ADDITIONAL_MODEL_INFO));

    List<String> orgCodes = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_CODE));
    assertEquals(2, orgCodes.size());
    assertEquals("ASPP", orgCodes.get(0));
    assertEquals("OTHER_ORG", orgCodes.get(1));

    // List<String> cvTerms = Arrays.asList(metadata.getValues(IPTC.CONTROLLED_VOCABULARY_TERM));

    List<String> modelAges = Arrays.asList(metadata.getValues(IPTC.MODEL_AGE));
    assertEquals(2, modelAges.size());
    assertEquals("1000", modelAges.get(0));
    assertEquals("1001", modelAges.get(1));

    List<String> orgNames = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_NAME));
    assertEquals(2, orgNames.size());
    assertEquals("ASPP", orgNames.get(0));
    assertEquals("Other Org", orgNames.get(1));

    List<String> peopleShown = Arrays.asList(metadata.getValues(IPTC.PERSON));
    assertEquals(2, peopleShown.size());
    assertEquals("rocky 1", peopleShown.get(0));
    assertEquals("rocky 2", peopleShown.get(1));

    assertEquals("http://cv.iptc.org/newscodes/digitalsourcetype/digitalCapture",
            metadata.get(IPTC.DIGITAL_SOURCE_TYPE));
    assertEquals("Photo Bike Tour", metadata.get(IPTC.EVENT));

    assertEquals("RGAUSS", metadata.get(IPTC.IMAGE_SUPPLIER_ID));
    assertEquals("Ray Gauss II", metadata.get(IPTC.IMAGE_SUPPLIER_NAME));
    assertEquals("supplier image ID", metadata.get(IPTC.IMAGE_SUPPLIER_IMAGE_ID));
    assertEquals("3456", metadata.get(IPTC.MAX_AVAIL_HEIGHT));
    assertEquals("5184", metadata.get(IPTC.MAX_AVAIL_WIDTH));
    assertEquals("1.2.0", metadata.get(IPTC.PLUS_VERSION));

    List<String> copyrightOwnerIds = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_ID));
    assertEquals(1, copyrightOwnerIds.size());
    assertEquals("RGAUSS", copyrightOwnerIds.get(0));
    // assertEquals("", copyrightOwnerIds.get(1)); // TODO: Get ExifTool to preserve empty values

    List<String> copyrightOwnerNames = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_NAME));
    assertEquals(2, copyrightOwnerNames.size());
    assertEquals("Ray Gauss II", copyrightOwnerNames.get(0));
    assertEquals("GG", copyrightOwnerNames.get(1));

    List<String> imageCreatorIds = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_ID));
    assertEquals(1, imageCreatorIds.size());
    assertEquals("RGAUSS", imageCreatorIds.get(0));
    // assertEquals("", imageCreatorIds.get(1)); // TODO: Get ExifTool to preserve empty values

    assertTrue(metadata.isMultiValued(IPTC.IMAGE_CREATOR_NAME));
    List<String> imageCreatorNames = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_NAME));
    assertEquals(2, imageCreatorNames.size());
    assertEquals("Ray Gauss II", imageCreatorNames.get(0));
    assertEquals("GG", imageCreatorNames.get(1));

    List<String> licensorIds = Arrays.asList(metadata.getValues(IPTC.LICENSOR_ID));
    assertEquals("RGAUSS", licensorIds.get(0));

    assertTrue(metadata.isMultiValued(IPTC.LICENSOR_NAME));
    List<String> licensorNames = Arrays.asList(metadata.getValues(IPTC.LICENSOR_NAME));
    assertEquals(2, licensorNames.size());
    assertEquals("Ray Gauss II", licensorNames.get(0));
    assertEquals("GG", licensorNames.get(1));

    // Photoshop does not support licensor addresses, cities, or countries

    List<String> licensorEmails = Arrays.asList(metadata.getValues(IPTC.LICENSOR_EMAIL));
    assertEquals("r@example.com", licensorEmails.get(0));
    // assertEquals("", licensorEmails.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorTel1 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_1));
    assertEquals("555-5555", licensorTel1.get(0));
    // assertEquals("", licensorTel1.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorTel2 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_2));
    assertEquals("555-4444", licensorTel2.get(0));
    // assertEquals("", licensorTel2.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorUrls = Arrays.asList(metadata.getValues(IPTC.LICENSOR_URL));
    assertEquals("http://rgauss.com", licensorUrls.get(0));
    // assertEquals("", licensorUrls.get(1)); // TODO: Get ExifTool to preserve empty values

    assertEquals("Age Unknown", metadata.get(IPTC.MINOR_MODEL_AGE_DISCLOSURE));
    List<String> modelReleaseIds = Arrays.asList(metadata.getValues(IPTC.MODEL_RELEASE_ID));
    assertEquals("model release id 1", modelReleaseIds.get(0));
    assertEquals("model release id 2", modelReleaseIds.get(1));
    assertEquals("Not Applicable", metadata.get(IPTC.MODEL_RELEASE_STATUS));

    List<String> propertyReleaseIds = Arrays.asList(metadata.getValues(IPTC.PROPERTY_RELEASE_ID));
    assertEquals("prop release id 1", propertyReleaseIds.get(0));
    assertEquals("prop release id 2", propertyReleaseIds.get(1));
    assertEquals("Not Applicable", metadata.get(IPTC.PROPERTY_RELEASE_STATUS));

    List<String> aoCopyright = Arrays
            .asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE));
    assertEquals("Ray Gauss II", aoCopyright.get(0));
    // assertEquals("", aoCopyright.get(1)); // TODO: Get ExifTool to preserve empty values
    // assertEquals("", aoCopyright.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoCreator = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_CREATOR));
    assertEquals("Mother Nature", aoCreator.get(0));
    assertEquals("Man", aoCreator.get(1));
    assertEquals("Mother Nature", aoCreator.get(2));
    List<String> aoDateCreated = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED));
    assertEquals("1890:01:01", aoDateCreated.get(0));
    // assertEquals("", aoDateCreated.get(1)); // TODO: Get ExifTool to preserve empty values
    assertEquals("1901:02:01", aoDateCreated.get(1));
    // assertEquals("", aoDateCreated.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoSource = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE));
    assertEquals("National Park Service", aoSource.get(0));
    // assertEquals("", aoSource.get(1)); // TODO: Get ExifTool to preserve empty values
    // assertEquals("", aoSource.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoSourceInventoryNum = Arrays
            .asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER));
    assertEquals("123456", aoSourceInventoryNum.get(0));
    // assertEquals("", aoSourceInventoryNum.get(1)); // TODO: Get ExifTool to preserve empty values
    assertEquals("654321", aoSourceInventoryNum.get(1)); // This should be index 2, TODO: Get ExifTool to preserve empty values
    List<String> aoSourceTitles = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_TITLE));
    assertEquals("Rock Creek Stream Bank", aoSourceTitles.get(0));
    assertEquals("Pollution", aoSourceTitles.get(1));
    assertEquals("Some Tree", aoSourceTitles.get(2));

    List<String> locationShownCity = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_CITY));
    assertEquals("Washington", locationShownCity.get(0));
    // assertEquals("", locationShownCity.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownCountryCode = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_CODE));
    assertEquals("US", locationShownCountryCode.get(0));
    // assertEquals("", locationShownCountryCode.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownCountryName = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_NAME));
    assertEquals("United States", locationShownCountryName.get(0));
    // assertEquals("", locationShownCountryName.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownState = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_PROVINCE_OR_STATE));
    assertEquals("D.C.", locationShownState.get(0));
    // assertEquals("", locationShownState.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownSublocation = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_SUBLOCATION));
    assertEquals("Rock Creek Park Sub", locationShownSublocation.get(0));
    assertEquals("Stream Section", locationShownSublocation.get(1));
    List<String> locationShownWorldRegion = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_WORLD_REGION));
    assertEquals("North America", locationShownWorldRegion.get(0));
    // assertEquals("", locationShownWorldRegion.get(1)); // TODO: Get ExifTool to preserve empty values

    assertEquals("Washington", metadata.get(IPTC.LOCATION_CREATED_CITY));
    assertEquals("US", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_CODE));
    assertEquals("United States", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_NAME));
    assertEquals("D.C.", metadata.get(IPTC.LOCATION_CREATED_PROVINCE_OR_STATE));
    assertEquals("Rock Creek Park", metadata.get(IPTC.LOCATION_CREATED_SUBLOCATION));
    assertEquals("North America", metadata.get(IPTC.LOCATION_CREATED_WORLD_REGION));

    assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted());
    assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID));
    List<String> registryEntryOrgIds = Arrays
            .asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID));
    assertEquals(2, registryEntryOrgIds.size());
    assertEquals("PLUS", registryEntryOrgIds.get(0));
    // assertEquals("", registryEntryOrgIds.get(1)); // TODO: Get ExifTool to preserve empty values
    assertEquals("ORG 2", registryEntryOrgIds.get(1)); // This should be index 2, TODO: Get ExifTool to preserve empty values

    // NOTE(review): this repeats the ORGANISATION_ID check from the previous group; it was
    // presumably meant to check REGISTRY_ENTRY_CREATED_ITEM_ID - confirm before changing.
    assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted());
    assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID));
    List<String> registryEntryItemIds = Arrays.asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID));
    // expected value first, so a failure message reports the values the right way round
    assertEquals(3, registryEntryItemIds.size());
    assertEquals("100-ABC-ABC-555", registryEntryItemIds.get(0));
    assertEquals("11223344", registryEntryItemIds.get(1));
    assertEquals("55667788", registryEntryItemIds.get(2));

}

From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java

/**
 * Parses {@code testJPEG_IPTC_EXT.jpg} with an {@code ExiftoolImageParser}
 * configured with two pass-through XMP properties and checks that both custom
 * values end up in the metadata.
 */
public void testJPEGCustomXmp() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    ArrayList<Property> passthroughXmpProperties = new ArrayList<Property>(2);
    passthroughXmpProperties.add(Property.internalText("XMP-custom:Text"));
    passthroughXmpProperties.add(Property.internalText("XMP-custom:TextML"));
    Parser passthroughParser = new ExiftoolImageParser(null, passthroughXmpProperties);
    InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg");
    try {
        passthroughParser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // getResourceAsStream returns null when the resource is missing
        if (stream != null) {
            stream.close();
        }
    }

    assertEquals("customTextField", metadata.get("XMP-custom:Text"));
    assertEquals("customMultilineField", metadata.get("XMP-custom:TextML"));
}

From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java

/**
 * Parses {@code testJPEG.jpg} with the parser under test, checks the reported
 * image width and trace-logs every extracted metadata entry.
 */
public void testJPEG() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // getResourceAsStream returns null when the resource is missing
        if (stream != null) {
            stream.close();
        }
    }

    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    for (String name : metadata.names()) {
        logger.trace("JPEG-- " + name + "=" + metadata.get(name));
    }
}

From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java

/**
 * Parses {@code testPNG_IPTC.png} with the parser under test, trace-logs every
 * extracted metadata entry and checks the image width and IPTC headline.
 */
public void testPNGIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/png");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testPNG_IPTC.png");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // getResourceAsStream returns null when the resource is missing
        if (stream != null) {
            stream.close();
        }
    }

    for (String name : metadata.names()) {
        logger.trace("PNG-- " + name + "=" + metadata.get(name));
    }
    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE));
}

From source file:org.apache.tika.parser.exiftool.ExifToolImageParserTest.java

/**
 * Parses {@code testTIFF_IPTC.tif} with the parser under test, trace-logs every
 * extracted metadata entry and checks the IPTC keywords and headline.
 */
public void testTIFFIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testTIFF_IPTC.tif");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // getResourceAsStream returns null when the resource is missing
        if (stream != null) {
            stream.close();
        }
    }

    for (String name : metadata.names()) {
        logger.trace("TIFF-- " + name + "=" + metadata.get(name));
    }
    List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS));
    assertTrue(iptcKeywords.contains("garden"));
    assertTrue(iptcKeywords.contains("cat"));
    assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE));
}

From source file:org.apache.tika.parser.image.WebPParserTest.java

/**
 * Parses a lossy and a lossless WebP test image with the parser under test and
 * checks the metadata each one yields.
 */
@Test
public void testSimple() throws Exception {
    Metadata metadata = new Metadata();
    InputStream stream = getClass().getResourceAsStream("/test-documents/testWebp_Alpha_Lossy.webp");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // close even when the parse fails, so a failing run does not leak the stream
        IOUtils.closeQuietly(stream);
    }

    assertEquals("301", metadata.get("Image Height"));
    assertEquals("400", metadata.get("Image Width"));
    assertEquals("true", metadata.get("Has Alpha"));
    assertEquals("false", metadata.get("Is Animation"));
    assertEquals("image/webp", metadata.get(Metadata.CONTENT_TYPE));

    metadata = new Metadata();
    stream = getClass().getResourceAsStream("/test-documents/testWebp_Alpha_Lossless.webp");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        IOUtils.closeQuietly(stream);
    }

    //unfortunately, there isn't much metadata in lossless
    assertEquals("image/webp", metadata.get(Metadata.CONTENT_TYPE));

}