org.apache.apex.malhar.stream.sample.complete.TwitterAutoComplete.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.apex.malhar.stream.sample.complete.TwitterAutoComplete.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.apex.malhar.stream.sample.complete;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.joda.time.Duration;

import org.apache.apex.malhar.contrib.twitter.TwitterSampleInput;
import org.apache.apex.malhar.lib.function.Function;
import org.apache.apex.malhar.lib.util.KeyValPair;
import org.apache.apex.malhar.lib.window.TriggerOption;
import org.apache.apex.malhar.lib.window.Tuple;
import org.apache.apex.malhar.lib.window.Window;
import org.apache.apex.malhar.lib.window.WindowOption;

import org.apache.apex.malhar.stream.api.ApexStream;
import org.apache.apex.malhar.stream.api.CompositeStreamTransform;
import org.apache.apex.malhar.stream.api.WindowedStream;
import org.apache.apex.malhar.stream.api.impl.StreamFactory;

import org.apache.hadoop.conf.Configuration;

import com.datatorrent.api.DAG;
import com.datatorrent.api.StreamingApplication;
import com.datatorrent.api.annotation.ApplicationAnnotation;

import static org.apache.apex.malhar.stream.api.Option.Options.name;

/**
 * Auto Complete Hashtag Example with real-time Twitter input. In order to run this application, you need to create an app
 * at https://apps.twitter.com, then generate your consumer and access keys and tokens, and enter that information
 * accordingly in /resources/META-INF/properties.xml.
 *
 * The authentication requires the following 4 pieces of information:
 * Your application consumer key,
 * Your application consumer secret,
 * Your twitter access token, and
 * Your twitter access token secret.
 *
 * @since 3.5.0
 */
@ApplicationAnnotation(name = "TwitterAutoComplete")
public class TwitterAutoComplete implements StreamingApplication {
    /**
     * Helper for checking whether a string consists solely of 7-bit US-ASCII characters.
     */
    public static class StringUtils {
        /**
         * Retained only so that existing references to this field keep compiling.
         * {@link #isAscii(String)} does not use it: a {@link CharsetEncoder} is stateful and is
         * documented as unsafe for use by multiple concurrent threads, so a shared static
         * instance must not back a static helper.
         */
        static CharsetEncoder encoder = Charset.forName("US-ASCII").newEncoder();

        /**
         * Reports whether every character of {@code v} lies in the US-ASCII range (0x00-0x7F).
         * The check is stateless and therefore safe to call from any number of threads.
         *
         * @param v the text to inspect; must not be {@code null}
         * @return {@code true} if {@code v} is empty or contains only US-ASCII characters,
         *         {@code false} as soon as a character above 0x7F is found
         * @throws NullPointerException if {@code v} is {@code null}
         */
        public static boolean isAscii(String v) {
            for (int i = 0; i < v.length(); i++) {
                // Anything above 0x7F (including surrogates) cannot be encoded as US-ASCII.
                if (v.charAt(i) > 0x7F) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * FlatMap function that extracts all hashtags from the text of a tweet.
     */
    private static class ExtractHashtags implements Function.FlatMapFunction<String, String> {

        /**
         * A '#' followed by one or more non-whitespace characters. Compiled once and shared,
         * since a compiled {@link Pattern} is immutable; only the per-call {@link Matcher} carries state.
         */
        private static final Pattern HASHTAG = Pattern.compile("#\\S+");

        /**
         * Finds every hashtag in {@code input}.
         *
         * @param input the tweet text to scan
         * @return the hashtags in order of appearance, each with its leading '#' removed;
         *         empty when the text contains no hashtag
         */
        @Override
        public Iterable<String> f(String input) {
            List<String> result = new LinkedList<>();
            Matcher m = HASHTAG.matcher(input);
            while (m.find()) {
                // Drop the leading '#' so that only the tag text is emitted.
                result.add(m.group().substring(1));
            }
            return result;
        }
    }

    /**
     * Composite transform that fans every {@link CompletionCandidate} out to its prefixes and then
     * accumulates the candidates per prefix with {@link TopNByKey}.
     * Lower latency, but more expensive.
     */
    private static class ComputeTopFlat extends
            CompositeStreamTransform<WindowedStream<CompletionCandidate>, WindowedStream<Tuple.WindowedTuple<KeyValPair<String, List<CompletionCandidate>>>>> {
        private final int candidatesPerPrefix;
        private final int minPrefix;

        /**
         * @param candidatesPerPrefix value passed to {@link TopNByKey#setN}
         * @param minPrefix shortest prefix length handed to {@link AllPrefixes}
         */
        public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
            this.minPrefix = minPrefix;
            this.candidatesPerPrefix = candidatesPerPrefix;
        }

        /**
         * Builds the prefix-extraction and per-prefix accumulation stages on top of {@code input}.
         *
         * @param input windowed stream of completion candidates
         * @return windowed stream of (prefix, candidate list) pairs
         */
        @Override
        public WindowedStream<Tuple.WindowedTuple<KeyValPair<String, List<CompletionCandidate>>>> compose(
                WindowedStream<CompletionCandidate> input) {
            TopNByKey perPrefixTopN = new TopNByKey();
            perPrefixTopN.setN(candidatesPerPrefix);

            // Prefix lengths run from minPrefix up to at most 3 characters.
            WindowedStream<KeyValPair<String, CompletionCandidate>> prefixed = input
                    .<KeyValPair<String, CompletionCandidate>, WindowedStream<KeyValPair<String, CompletionCandidate>>>flatMap(
                            new AllPrefixes(minPrefix, 3), name("Extract Prefixes"));

            Function.ToKeyValue<KeyValPair<String, CompletionCandidate>, String, CompletionCandidate> wrapInGlobalWindow =
                    new Function.ToKeyValue<KeyValPair<String, CompletionCandidate>, String, CompletionCandidate>() {
                        @Override
                        public Tuple<KeyValPair<String, CompletionCandidate>> f(
                                KeyValPair<String, CompletionCandidate> tuple) {
                            // TODO: Should be removed after Auto-wrapping is supported.
                            return new Tuple.WindowedTuple<>(Window.GlobalWindow.INSTANCE, tuple);
                        }
                    };

            return prefixed.accumulateByKey(perPrefixTopN, wrapInGlobalWindow, name("TopNByKey"));
        }
    }

    /**
     * FlatMap function that extracts all prefixes of the hashtag in the input CompletionCandidate and outputs
     * KeyValPairs of the lower-cased prefix and the CompletionCandidate.
     */
    private static class AllPrefixes
            implements Function.FlatMapFunction<CompletionCandidate, KeyValPair<String, CompletionCandidate>> {
        private final int minPrefix;
        private final int maxPrefix;

        /** Emits every prefix, starting with the empty one (length 0). */
        public AllPrefixes() {
            this(0, Integer.MAX_VALUE);
        }

        /**
         * @param minPrefix length of the shortest prefix to emit
         */
        public AllPrefixes(int minPrefix) {
            this(minPrefix, Integer.MAX_VALUE);
        }

        /**
         * @param minPrefix length of the shortest prefix to emit
         * @param maxPrefix length of the longest prefix to emit (capped at the word length)
         */
        public AllPrefixes(int minPrefix, int maxPrefix) {
            this.minPrefix = minPrefix;
            this.maxPrefix = maxPrefix;
        }

        /**
         * Produces one pair per prefix length between {@code minPrefix} and
         * {@code min(word length, maxPrefix)}, inclusive.
         *
         * @param input candidate whose value is the word to take prefixes of
         * @return pairs of (lower-cased prefix, {@code input}); empty when the word is shorter than
         *         {@code minPrefix}
         */
        @Override
        public Iterable<KeyValPair<String, CompletionCandidate>> f(CompletionCandidate input) {
            List<KeyValPair<String, CompletionCandidate>> result = new LinkedList<>();
            String word = input.getValue();
            int longest = Math.min(word.length(), maxPrefix);
            for (int i = minPrefix; i <= longest; i++) {
                // Locale.ROOT keeps the prefix keys identical regardless of the JVM default locale
                // (e.g. a Turkish default would otherwise map 'I' to dotless 'ı').
                result.add(new KeyValPair<>(word.substring(0, i).toLowerCase(Locale.ROOT), input));
            }
            return result;
        }
    }

    /**
     * A composite stream transform that takes a stream of tokens as input and returns,
     * per prefix, a list of completion candidates built from the token counts.
     */
    public static class ComputeTopCompletions extends
            CompositeStreamTransform<WindowedStream<String>, WindowedStream<Tuple.WindowedTuple<KeyValPair<String, List<CompletionCandidate>>>>> {
        private final int candidatesPerPrefix;
        // Stored by the constructor; compose() below does not read it.
        private final boolean recursive;

        /**
         * @param candidatesPerPrefix forwarded to {@link ComputeTopFlat}
         * @param recursive stored on the instance
         */
        protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
            this.recursive = recursive;
            this.candidatesPerPrefix = candidatesPerPrefix;
        }

        /**
         * Static factory for this transform.
         *
         * @param candidatesPerPrefix forwarded to the constructor
         * @param recursive forwarded to the constructor
         * @return a new transform instance
         */
        public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
            return new ComputeTopCompletions(candidatesPerPrefix, recursive);
        }

        /**
         * Counts the tokens by key, turns each (token, count) pair into a {@link CompletionCandidate} and
         * hands the candidates to {@link ComputeTopFlat} with a minimum prefix length of 1.
         *
         * @param inputStream windowed stream of tokens
         * @return windowed stream of (prefix, candidate list) pairs
         */
        @Override
        @SuppressWarnings("unchecked")
        public WindowedStream<Tuple.WindowedTuple<KeyValPair<String, List<CompletionCandidate>>>> compose(
                WindowedStream<String> inputStream) {

            // Each token contributes a count of one under its own text as key.
            Function.ToKeyValue<String, String, Long> tokenToUnitCount =
                    new Function.ToKeyValue<String, String, Long>() {
                        @Override
                        public Tuple<KeyValPair<String, Long>> f(String input) {
                            KeyValPair<String, Long> one = new KeyValPair<>(input, 1L);
                            return new Tuple.PlainTuple<>(one);
                        }
                    };

            Function.MapFunction<Tuple.WindowedTuple<KeyValPair<String, Long>>, CompletionCandidate> countToCandidate =
                    new Function.MapFunction<Tuple.WindowedTuple<KeyValPair<String, Long>>, CompletionCandidate>() {
                        @Override
                        public CompletionCandidate f(Tuple.WindowedTuple<KeyValPair<String, Long>> input) {
                            KeyValPair<String, Long> counted = input.getValue();
                            return new CompletionCandidate(counted.getKey(), counted.getValue());
                        }
                    };

            WindowedStream<Tuple.WindowedTuple<KeyValPair<String, Long>>> counts =
                    inputStream.countByKey(tokenToUnitCount, name("Hashtag Count"));

            ApexStream<CompletionCandidate> candidates =
                    counts.map(countToCandidate, name("KeyValPair to CompletionCandidate"));

            return candidates.addCompositeStreams(new ComputeTopFlat(candidatesPerPrefix, 1));
        }
    }

    /**
     * Filter function used to filter out tweets that contain non-ASCII characters.
     */
    static class ASCIIFilter implements Function.FilterFunction<String> {
        /**
         * @param input the tweet text to test
         * @return the result of {@link StringUtils#isAscii(String)} for {@code input}
         */
        @Override
        public boolean f(String input) {
            final boolean asciiOnly = StringUtils.isAscii(input);
            return asciiOnly;
        }
    }

    /**
     * Populate the dag with High-Level API: tweets are filtered, split into hashtags, windowed and
     * run through {@link ComputeTopCompletions}, and the result is printed.
     *
     * @param dag the DAG that the assembled stream is populated into
     * @param conf application configuration; not read by this method
     */
    @Override
    public void populateDAG(DAG dag, Configuration conf) {
        TwitterSampleInput twitterSource = new TwitterSampleInput();
        WindowOption globalWindow = new WindowOption.GlobalWindow();

        // Tweets -> filtered tweets -> individual hashtags.
        ApexStream<String> hashtags = StreamFactory
                .fromInput(twitterSource, twitterSource.text, name("tweetSampler"))
                .filter(new ASCIIFilter(), name("ACSII Filter"))
                .flatMap(new ExtractHashtags(), name("Extract Hashtags"));

        // Accumulating panes with an early firing every 10 seconds.
        TriggerOption earlyEveryTenSeconds = new TriggerOption()
                .accumulatingFiredPanes()
                .withEarlyFiringsAtEvery(Duration.standardSeconds(10));

        ApexStream<Tuple.WindowedTuple<KeyValPair<String, List<CompletionCandidate>>>> completions = hashtags
                .window(globalWindow, earlyEveryTenSeconds)
                .addCompositeStreams(ComputeTopCompletions.top(10, true))
                .print();

        completions.populateDag(dag);
    }
}