com.pinterest.secor.parser.DataLakeMessageParser.java Source code

Java tutorial

Introduction

Here is the source code for com.pinterest.secor.parser.DataLakeMessageParser.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.pinterest.secor.parser;

import com.pinterest.secor.common.SecorConfig;
import com.pinterest.secor.message.Message;
import net.minidev.json.JSONObject;
import net.minidev.json.JSONValue;
import org.joda.time.LocalDateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * DataLakeMessageParser extracts timestamp field (specified by 'message.timestamp.name')
 * usually named timestamp in analytics.
 * It uses the ISODateTimeFormat from joda-time library. Used by elasticsearch / logstash
 *
 * @see http://joda-time.sourceforge.net/apidocs/org/joda/time/format/ISODateTimeFormat.html
 *
 * @author Rahul  Bhatia (rbhatia@opentable.com)
 *
 */
public class DataLakeMessageParser extends MessageParser {
    private static final Logger LOG = LoggerFactory.getLogger(DataLakeMessageParser.class);
    protected static final String defaultType = "untyped";
    protected static final String defaultDate = "1970/01/01/00";

    public DataLakeMessageParser(SecorConfig config) {
        super(config);
    }

    @Override
    public ParsedMessage parse(final Message message) throws Exception {
        jsonObject = (JSONObject) JSONValue.parse(message.getPayload());
        try {
            if (shouldIncludeinLake()) {
                return super.parse(message);
            }
            return null;
        } finally {
            jsonObject = null;
        }
    }

    @Override
    public String[] extractPartitions(Message message) {
        JSONObject jsonObject = (JSONObject) JSONValue.parse(message.getPayload());
        String result[] = { defaultType, defaultDate };

        if (jsonObject != null) {
            Object fieldType = jsonObject.get(mConfig.getMessageTypeName()); //type
            Object fieldValue = jsonObject.get(mConfig.getMessageTimestampName()); //@timestamp
            if (fieldType != null) {
                result[0] = sanitizePath(fieldType.toString());
            }
            if (fieldValue != null) {
                try {

                    DateTimeFormatter inputFormatter = ISODateTimeFormat.dateOptionalTimeParser();
                    LocalDateTime datetime = LocalDateTime.parse(fieldValue.toString(), inputFormatter);
                    result[1] = datetime.toString(mConfig.getMessageTimestampBucketFormat());
                } catch (Exception e) {
                    LOG.warn("date = " + fieldValue.toString() + " could not be parsed with ISODateTimeFormat."
                            + " Using date default=" + defaultDate);
                }
            }
        }

        return result;
    }

    private String sanitizePath(String path_type) {
        //Accept only lowercase underscores and hypens
        return path_type.replaceAll("\\.", "-").replaceAll("[^a-zA-Z0-9-_]", "").replaceAll("---", "-")
                .replaceAll("--", "-").replaceAll("___", "_").replaceAll("__", "_").toLowerCase();
    }

    private boolean shouldIncludeinLake() {
        if (jsonObject == null) {
            return false;
        }
        if (!jsonObject.containsKey("properties")) {
            return false;
        }
        if (!jsonObject.containsKey("event")) {
            return false;
        }
        final Object eventObject = jsonObject.get("event");
        final Object propertiesObject = jsonObject.get("properties");
        if (!(propertiesObject instanceof JSONObject)) {
            return false;
        }
        final JSONObject properties = (JSONObject) propertiesObject;

        if (!(eventObject instanceof String)) {
            return false;
        }

        final String event_name = (String) eventObject;

        final boolean ret = "session_start".equals(event_name);
        if (ret) {
            ++IncludeCount;
            final Instant now = Instant.now();
            if (now.isAfter(lastFilterLog.plus(FILTER_LOG_THRESH))) {
                LOG.info("filtered For session_start payloads for Data LAke: {}", IncludeCount);
                lastFilterLog = now;
            }
        }
        return ret;
    }

}