com.cloudera.navigator.navigator_partner.extraction.HdfsMetadataExtraction.java Source code

Introduction

Here is the source code for com.cloudera.navigator.navigator_partner.extraction.HdfsMetadataExtraction.java

Source

/*
 * Copyright (c) 2016 Cloudera, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.navigator.navigator_partner.extraction;

import com.cloudera.nav.sdk.client.MetadataExtractor;
import com.cloudera.nav.sdk.client.MetadataResultSet;
import com.cloudera.nav.sdk.client.NavApiCient;
import com.cloudera.nav.sdk.client.NavigatorPlugin;
import com.cloudera.nav.sdk.client.writer.ResultSet;
import com.cloudera.nav.sdk.model.Source;
import com.cloudera.nav.sdk.model.SourceType;
import com.cloudera.nav.sdk.model.entities.EntityType;
import com.cloudera.nav.sdk.model.entities.HdfsEntity;
import com.google.common.base.Preconditions;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * This is a sample program that runs different metadata extraction queries and
 * metadata updates on HDFS entities. It shows how to create tags and custom properties.
 *
 * Program arguments:
 * 1. path to a config file: see examples/src/main/resources/sample.conf
 * 2. (optional, not used by main below) output path where the extraction marker
 *    for the next incremental run could be written, as sketched in saveMarker()
 *
 */
public class HdfsMetadataExtraction {

    public static void main(String[] args) throws IOException {
        // handle arguments
        Preconditions.checkArgument(args.length >= 1);
        String configFilePath = args[0];
        String marker = null;

        NavigatorPlugin navPlugin = NavigatorPlugin.fromConfigFile(configFilePath);
        NavApiCient client = navPlugin.getClient();

        MetadataExtractor extractor = new MetadataExtractor(client, null);

        // Run filtered examples
        getHDFSEntities(client, extractor, marker);
        addCustomTags(navPlugin, extractor, marker);

    }
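
    /**
     * A minimal sketch, not part of the original sample: the class comment above
     * mentions an optional output-path argument for the extraction marker, which
     * main() never uses. This shows how such a marker could be persisted for the
     * next incremental run, assuming MetadataResultSet exposes the updated cursor
     * via getMarker(); the markerPath argument is a hypothetical file location.
     */
    public static void saveMarker(MetadataExtractor extractor, String marker,
            String markerPath) throws IOException {
        MetadataResultSet resultSet = extractor.extractMetadata(marker, null,
                "sourceType:HDFS", null);
        // look at the extracted entities before persisting the cursor
        getFirstResult(resultSet.getEntities());
        String nextMarker = resultSet.getMarker();
        try (PrintWriter writer = new PrintWriter(markerPath, "UTF-8")) {
            writer.println(nextMarker);
        }
    }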

    /**
     * Shows how to retrieve various HDFS entities.
     * Comments show the equivalent calls using the REST API.
     *
     * @param client - Navigator client used to communicate with the Navigator service
     * @param extractor - used to extract the metadata
     * @param marker - set to null here; in general a string cursor returned by the server when extraction occurs
     */
    public static void getHDFSEntities(NavApiCient client, MetadataExtractor extractor, String marker) {

        //retrieve all HDFS source types
        //REST API equivalent
        //curl 'http://fkader-nav-1.vpc.cloudera.com:7187/api/v9/entities?query=((sourceType:HDFS))' -u admin:admin -X GET
        Iterable<Map<String, Object>> hdfsAll = extractor.extractMetadata(marker, null, "sourceType:HDFS", null)
                .getEntities();
        getFirstResult(hdfsAll);

        //retrieve by SourceId (unique entity)
        //REST API equivalent
        // curl 'http://fkader-nav-1.vpc.cloudera.com:7187/api/v9/entities?query=((sourceType:HDFS)AND(sourceId:xxxxx))' -u admin:admin -X GET
        Source hdfsSource = client.getOnlySource(SourceType.HDFS);
        Iterable<Map<String, Object>> hdfsSingleSource = extractor.extractMetadata(marker, null,
                "sourceType:HDFS AND sourceId:" + hdfsSource.getIdentity(), null).getEntities();
        getFirstResult(hdfsSingleSource);

        //retrieve only sample_07 files in HDFS 
        //REST API equivalent
        //curl 'http://fkader-nav-1.vpc.cloudera.com:7187/api/v9/entities?query=((type:FILE)AND(sourceType:HDFS)AND(originalName:sample_07))' -u admin:admin -X GET
        Iterable<Map<String, Object>> hdfsFile = extractor
                .extractMetadata(marker, null, "sourceType:HDFS AND type:FILE AND originalName:sample_07", null)
                .getEntities();
        getFirstResult(hdfsFile);

        //retrieve all directories in HDFS
        //REST API equivalent
        //curl 'http://fkader-nav-1.vpc.cloudera.com:7187/api/v9/entities?query=((type:DIRECTORY)AND(sourceType:HDFS))' -u admin:admin -X GET
        Iterable<Map<String, Object>> hdfsDir = extractor
                .extractMetadata(marker, null, "sourceType:HDFS AND type:DIRECTORY", null).getEntities();
        getFirstResult(hdfsDir);

    }
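
    /**
     * A minimal sketch, not part of the original sample: the same extractMetadata()
     * query can be scoped to one directory subtree by filtering on the parentPath
     * field printed by getFirstResult(). The "/user/admin" path is only an
     * illustrative value; substitute a path that exists in your cluster.
     */
    public static void getHDFSEntitiesUnderPath(MetadataExtractor extractor, String marker) {
        Iterable<Map<String, Object>> hdfsUnderPath = extractor
                .extractMetadata(marker, null,
                        "sourceType:HDFS AND parentPath:\"/user/admin\"", null)
                .getEntities();
        getFirstResult(hdfsUnderPath);
    }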

    /**
     * Creates custom metadata, as tags and key:value properties, for an HDFS FILE (sample_07).
     * Comments show the equivalent calls using the REST API.
     *
     * @param navPlugin - Navigator plugin used to communicate with the Navigator service
     * @param extractor - used to extract the metadata
     * @param marker - set to null here; in general a string cursor returned by the server when extraction occurs
     */
    public static void addCustomTags(NavigatorPlugin navPlugin, MetadataExtractor extractor, String marker) {

        //REST API equivalent
        //curl 'http://fkader-nav-1.vpc.cloudera.com:7187/api/v9/entities?query=((type:FILE)AND(sourceType:HDFS)AND(originalName:sample_07))' -u admin:admin -X GET

        StringBuilder querybuff = new StringBuilder();
        querybuff.append("sourceType:HDFS")
                //.append(" AND sourceId:a09b0233cc58ff7d601eaa68673a20c6")
                .append(" AND type:FILE")
                .append(" AND originalName:sample_07");

        //run the query to find the target file (its sourceId and path are needed below)
        Iterable<Map<String, Object>> hdfsFiles = extractor.extractMetadata(marker, null, querybuff.toString(), null)
                .getEntities();

        //add tags and properties using the REST API (the values shown are illustrative):
        //curl http://fkader-nav-1.vpc.cloudera.com:7187/api/v9/entities/ -u admin:admin -X POST -H "Content-Type: application/json" -d '{"sourceId":"a09b0233cc58ff7d601eaa68673a20c6","parentPath":"/user/admin","originalName":"sample_09","name":"navtest_custom_props","description":"Navigator Custom Properties", "properties":{"creator":"partnerCreated"}, "tags":["tag1", "tag2"]}'

        Iterator<Map<String, Object>> iterHdfsFiles = hdfsFiles.iterator();
        if (iterHdfsFiles.hasNext()) {
            Map<String, Object> result = iterHdfsFiles.next();

            HdfsEntity modifiedSampleFile = new HdfsEntity();

            modifiedSampleFile.setSourceId(result.get("sourceId").toString());
            modifiedSampleFile.setEntityType(EntityType.FILE);
            modifiedSampleFile.setFileSystemPath(result.get("fileSystemPath").toString());
            modifiedSampleFile.setName(result.get("originalName").toString());
            modifiedSampleFile.setDescription("Navigator Custom Properties");
            modifiedSampleFile.addTags("tag1");
            modifiedSampleFile.addTags("tag2");

            //key:value custom properties
            Map<String, String> props = new HashMap<String, String>();
            props.put("creator", "testpartnerCreated");
            modifiedSampleFile.addProperties(props);

            //create new namespace
            //   modifiedSampleFile.setNamespace("partnernamespace");

            modifiedSampleFile.setProperties(props);

            ResultSet results = navPlugin.write(modifiedSampleFile);
            if (results.hasErrors()) {
                throw new RuntimeException(results.toString());
            }
            System.out.println("successfully updated the sample HDFS file");

        }
    }
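
    /**
     * A minimal sketch, not part of the original sample: re-extract the file that
     * addCustomTags() just wrote and print its tags and properties, to confirm the
     * update is visible to extraction. The "tags" and "properties" keys are assumed
     * to be present in the extractor's result map for a tagged entity.
     */
    public static void verifyCustomTags(MetadataExtractor extractor, String marker) {
        Iterable<Map<String, Object>> tagged = extractor
                .extractMetadata(marker, null,
                        "sourceType:HDFS AND type:FILE AND originalName:sample_07", null)
                .getEntities();
        Iterator<Map<String, Object>> iterator = tagged.iterator();
        if (iterator.hasNext()) {
            Map<String, Object> result = iterator.next();
            System.out.println("tags: " + result.get("tags")
                    + " properties: " + result.get("properties"));
        }
    }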

    private static void getFirstResult(Iterable<Map<String, Object>> iterable) {
        // In real usage, iterate through results and process each metadata object
        Iterator<Map<String, Object>> iterator = iterable.iterator();
        if (iterator.hasNext()) {
            Map<String, Object> result = iterator.next();
            System.out.println("source: " + result.get("sourceType") + " type: " + result.get("type")
                    + " originalName: " + result.get("originalName") + " fileSystemPath: "
                    + result.get("fileSystemPath") + " parentPath: " + result.get("parentPath"));
        } else {
            System.out.println("no elements found");
        }
    }
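
    /**
     * A minimal sketch, not part of the original sample: iterate over every result
     * rather than only the first one, as the comment in getFirstResult() suggests
     * real callers would do.
     */
    public static void printAllResults(Iterable<Map<String, Object>> iterable) {
        int count = 0;
        for (Map<String, Object> result : iterable) {
            System.out.println("originalName: " + result.get("originalName")
                    + " fileSystemPath: " + result.get("fileSystemPath"));
            count++;
        }
        System.out.println("total results: " + count);
    }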

}