// Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.apex.malhar.lib.dedup;

import java.util.concurrent.Future;

import javax.validation.constraints.NotNull;

import org.joda.time.Duration;
import org.joda.time.Instant;

import org.apache.apex.malhar.lib.state.managed.ManagedTimeUnifiedStateImpl;
import org.apache.apex.malhar.lib.state.managed.MovingBoundaryTimeBucketAssigner;
import org.apache.apex.malhar.lib.util.PojoUtils;
import org.apache.apex.malhar.lib.util.PojoUtils.Getter;
import org.apache.hadoop.classification.InterfaceStability.Evolving;

import com.datatorrent.api.Context;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.Context.PortContext;
import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.Operator.ActivationListener;
import com.datatorrent.api.StreamCodec;
import com.datatorrent.api.annotation.InputPortFieldAnnotation;
import com.datatorrent.netlet.util.Slice;

/**
 * Time based deduper will de-duplicate incoming POJO tuples and classify them into the following:
 * 1. Unique
 * 2. Duplicate
 * 3. Expired
 *
 * Since this is de-duplicating in a stream of tuples, and we cannot store all incoming keys indefinitely,
 * we use the concept of expiry, where incoming tuples expire after a specified period of time. In this case,
 * we choose to expire an entire bucket of data as a unit. This requires the user to specify the bucketing
 * structure in advance in order for the operator to function. Here are the parameters for specifying the
 * bucketing structure:
 * 1. {@link #expireBefore} (in seconds) - This is the total time period during which a tuple stays in the
 * system and blocks any other tuple with the same key.
 * 2. {@link #bucketSpan} (in seconds) - This is the unit which describes how large a bucket can be.
 * Typically this should be defined depending on the use case. For example, if we have {@link #expireBefore}
 * set to 1 hour, then typically we would be clubbing data in the order of minutes, so a {@link #bucketSpan} of
 * around 1 minute or 5 minutes would make sense. Note that in this case, the entire data worth 1 minute or
 * 5 minutes will expire as a whole. Setting it to 1 minute would make the number of time buckets in the system
 * to be 1 hour / 1 minute = 60 buckets. Similarly setting {@link #bucketSpan} to 5 minutes would make number
 * of buckets to be 12. Note that having too many or too few buckets could have a performance impact. If unsure,
 * set the {@link #bucketSpan} to be ~ sqrt({@link #expireBefore}). This way the number of buckets and bucket span
 * are balanced.
 * 3. {@link #referenceInstant} - The reference point from which to start the time which is used for expiry.
 * Setting the {@link #referenceInstant} to say, r seconds from the epoch, would initialize the start of expiry
 * to be from that instant = r. The start and end of the expiry window periodically move by the span of a single
 * bucket. Refer {@link MovingBoundaryTimeBucketAssigner} for details.
 *
 * Additionally, it also needs the following parameters:
 * 1. {@link #keyExpression} - The java expression to extract the key fields in the incoming tuple (POJO)
 * 2. {@link #timeExpression} - (Optional) The java expression to extract the time field in the incoming
 * tuple (POJO). In case there is no time field in the tuple, i.e. this expression is left unset, system time,
 * when the tuple is processed, will be used.
 *
 *
 * @since 3.5.0
 */
@Evolving
public class TimeBasedDedupOperator extends AbstractDeduper<Object>
    implements ActivationListener<Context>
{
  // Required properties

  /** Java expression evaluated against each tuple to obtain its de-duplication key. */
  @NotNull
  private String keyExpression;

  /**
   * Java expression evaluated against each tuple to obtain its event time.
   * Optional: when left {@code null}, {@link #getTime(Object)} falls back to system time.
   */
  private String timeExpression;

  // NOTE(review): @NotNull can never fail on a primitive long, so the two annotations below do
  // not enforce anything. They are kept as-is to avoid changing validation or tooling behaviour;
  // consider a range constraint such as @Min(1) if non-positive values must be rejected.

  /** Length of a single time bucket, in seconds. */
  @NotNull
  private long bucketSpan;

  /** Expiry period, in seconds. */
  @NotNull
  private long expireBefore;

  // Optional

  /** Reference instant in seconds from the epoch; defaults to the moment this object is created. */
  private long referenceInstant = new Instant().getMillis() / 1000;

  private transient Class<?> pojoClass;

  /** Getter for the tuple time; {@code null} when no {@link #timeExpression} is configured. */
  private transient Getter<Object, Long> timeGetter;

  private transient Getter<Object, Object> keyGetter;

  private transient StreamCodec<Object> streamCodec;

  /**
   * Creates the operator backed by a {@link ManagedTimeUnifiedStateImpl} managed state.
   */
  public TimeBasedDedupOperator()
  {
    managedState = new ManagedTimeUnifiedStateImpl();
  }

  /**
   * Input port for the POJO tuples to de-duplicate. During port setup it records the tuple class
   * from the port attributes and creates the stream codec that is also used to serialize keys.
   */
  @InputPortFieldAnnotation(schemaRequired = true)
  public final transient DefaultInputPort<Object> input = new DefaultInputPort<Object>()
  {
    @Override
    public void setup(PortContext context)
    {
      pojoClass = context.getAttributes().get(PortContext.TUPLE_CLASS);
      streamCodec = getDeduperStreamCodec();
    }

    @Override
    public void process(Object tuple)
    {
      processTuple(tuple);
    }

    @Override
    public StreamCodec<Object> getStreamCodec()
    {
      return streamCodec;
    }
  };

  /**
   * Returns the time associated with the tuple.
   *
   * @param tuple incoming POJO
   * @return the value produced by the configured time expression, or the current system time when
   *         no time expression is configured
   */
  @Override
  protected long getTime(Object tuple)
  {
    if (timeGetter == null) {
      // No time field configured: use processing time, as documented on the class.
      // NOTE(review): assumes the managed state expects epoch milliseconds - confirm.
      return System.currentTimeMillis();
    }
    return timeGetter.get(tuple);
  }

  /**
   * Extracts the key from the tuple and serializes it with the operator's stream codec.
   *
   * @param tuple incoming POJO
   * @return the serialized key
   */
  @Override
  protected Slice getKey(Object tuple)
  {
    Object key = keyGetter.get(tuple);
    return streamCodec.toByteArray(key);
  }

  /**
   * Creates the stream codec for the input port, built from {@link #keyExpression}.
   *
   * @return a new {@link DeduperStreamCodec}
   */
  protected StreamCodec<Object> getDeduperStreamCodec()
  {
    return new DeduperStreamCodec(keyExpression);
  }

  /**
   * Configures a {@link MovingBoundaryTimeBucketAssigner} from the bucketing properties (all given
   * in seconds), installs it on the managed state and then delegates to the parent setup.
   *
   * @param context operator context
   */
  @Override
  public void setup(OperatorContext context)
  {
    MovingBoundaryTimeBucketAssigner timeBucketAssigner = new MovingBoundaryTimeBucketAssigner();
    timeBucketAssigner.setBucketSpan(Duration.standardSeconds(bucketSpan));
    timeBucketAssigner.setExpireBefore(Duration.standardSeconds(expireBefore));
    // referenceInstant is kept in seconds; Instant takes milliseconds.
    timeBucketAssigner.setReferenceInstant(new Instant(referenceInstant * 1000));
    managedState.setTimeBucketAssigner(timeBucketAssigner);
    super.setup(context);
  }

  /**
   * Builds the key getter and, when a time expression is configured, the time getter for the
   * tuple class recorded by the input port.
   *
   * @param context activation context (unused)
   */
  @Override
  public void activate(Context context)
  {
    timeGetter = (timeExpression == null)
        ? null
        : PojoUtils.createGetter(pojoClass, timeExpression, Long.class);
    keyGetter = PojoUtils.createGetter(pojoClass, keyExpression, Object.class);
  }

  /**
   * Nothing to release on deactivation.
   */
  @Override
  public void deactivate()
  {
  }

  /**
   * Asynchronously looks up the tuple's key in the managed state, using the tuple's time.
   *
   * @param tuple incoming POJO
   * @return future of the stored value for the key
   */
  @Override
  protected Future<Slice> getAsyncManagedState(Object tuple)
  {
    return ((ManagedTimeUnifiedStateImpl)managedState).getAsync(getTime(tuple), getKey(tuple));
  }

  /**
   * Records the tuple's key in the managed state under the tuple's time. Only the presence of
   * the key matters, so an empty slice is stored as the value.
   *
   * @param tuple incoming POJO
   */
  @Override
  protected void putManagedState(Object tuple)
  {
    ((ManagedTimeUnifiedStateImpl)managedState).put(getTime(tuple), getKey(tuple), new Slice(null, 0, 0));
  }

  /**
   * @return the key expression
   */
  public String getKeyExpression()
  {
    return keyExpression;
  }

  /**
   * Sets the key expression
   * @param keyExpression java expression that extracts the key from the incoming tuple
   */
  public void setKeyExpression(String keyExpression)
  {
    this.keyExpression = keyExpression;
  }

  /**
   * @return the time expression, or {@code null} when system time is used
   */
  public String getTimeExpression()
  {
    return timeExpression;
  }

  /**
   * Sets the time expression
   * @param timeExpression java expression that extracts the time from the incoming tuple;
   *                       may be {@code null} to use system time instead
   */
  public void setTimeExpression(String timeExpression)
  {
    this.timeExpression = timeExpression;
  }

  /**
   * @return the length of a single time bucket (in seconds)
   */
  public long getBucketSpan()
  {
    return bucketSpan;
  }

  /**
   * Sets the length of a single time bucket (in seconds)
   * @param bucketSpan bucket length in seconds
   */
  public void setBucketSpan(long bucketSpan)
  {
    this.bucketSpan = bucketSpan;
  }

  /**
   * @return the expiry time (in seconds)
   */
  public long getExpireBefore()
  {
    return expireBefore;
  }

  /**
   * Sets the expiry time (in seconds). Any event with time before this is considered to be expired.
   * @param expireBefore expiry period in seconds
   */
  public void setExpireBefore(long expireBefore)
  {
    this.expireBefore = expireBefore;
  }

  /**
   * @return the reference instant (in seconds from the epoch)
   */
  public long getReferenceInstant()
  {
    return referenceInstant;
  }

  /**
   * Sets the reference instant (in seconds from the epoch).
   * By default this is the time when the application is started.
   * @param referenceInstant reference instant in seconds from the epoch
   */
  public void setReferenceInstant(long referenceInstant)
  {
    this.referenceInstant = referenceInstant;
  }
}