org.apache.mahout.freqtermsets.ParallelFPStreamReducer.java Source code

Introduction

Here is the source code for org.apache.mahout.freqtermsets.ParallelFPStreamReducer.java. This reducer merges the transaction trees shipped for each group, optionally folds in patterns mined in an earlier time window (read back from a Lucene index), and then runs FPGrowth to emit the top-K frequent patterns per group.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.freqtermsets;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.mutable.MutableLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.QueryParser.Operator;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.freqtermsets.convertors.ContextStatusUpdater;
import org.apache.mahout.freqtermsets.convertors.ContextWriteOutputCollector;
import org.apache.mahout.freqtermsets.convertors.integer.IntegerStringOutputConverter;
import org.apache.mahout.freqtermsets.convertors.string.TopKStringPatterns;
import org.apache.mahout.freqtermsets.fpgrowth.FPGrowth;
import org.apache.mahout.freqtermsets.stream.TimeWeightFunction;
import org.apache.mahout.math.list.IntArrayList;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import org.apache.mahout.math.map.OpenObjectIntHashMap;

import ca.uwaterloo.twitter.ItemSetIndexBuilder;
import ca.uwaterloo.twitter.ItemSetSimilarity;
import ca.uwaterloo.twitter.TokenIterator;
import ca.uwaterloo.twitter.TwitterAnalyzer;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * Takes each group of transactions, runs vanilla FPGrowth on it, and outputs
 * the top-K frequent patterns for each group. If an index of patterns mined in
 * an earlier window is available, those patterns are first folded into the
 * group's tree with their support decayed by age.
 */
public class ParallelFPStreamReducer extends Reducer<IntWritable, TransactionTree, Text, TopKStringPatterns> {

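    /**
     * Lucene Collector that folds patterns mined in an earlier window into the
     * current tree: each hit's itemset is mapped back to integer ids through
     * stringIdMap, its stored support is decayed by the age of its window, and
     * the result is added to extendedTree. Itemsets already seen in a previous
     * hit are skipped.
     */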
    class OldPatternsCollector extends Collector {
        private int docBase;
        private IndexReader reader;
        private Set<Set<String>> encounteredDocs = Sets.newHashSet();
        private final TaskInputOutputContext<?, ?, ?, ?> context;
        private final TransactionTree extendedTree;
        //    private long readerTime;

        private OldPatternsCollector(TaskInputOutputContext<?, ?, ?, ?> context, TransactionTree extendedTree) {
            super();
            this.context = context;
            this.extendedTree = extendedTree;
        }

        @Override
        public void setScorer(Scorer scorer) throws IOException {
        }

        @Override
        public void setNextReader(IndexReader reader, int docBase) throws IOException {
            this.docBase = docBase;
            this.reader = reader;
            //      this.readerTime = Long.parseLong( 
            //          ((MMapDirectory)reader.directory()).getFile().getParentFile().getParentFile().getName());
        }

        @Override
        public void collect(int docId) throws IOException {
            Document doc = reader.document(docId); // docId is segment-relative; adding docBase here would exceed this reader's maxDoc
            String[] terms = reader.getTermFreqVector(docId, ItemSetIndexBuilder.AssocField.ITEMSET.name)
                    .getTerms();
            Set<String> termSet = Sets.newHashSet(Arrays.asList(terms));
            if (!encounteredDocs.add(termSet)) {
                // this itemset was already folded in from another hit
                return;
            }

            // Set<Integer> appearingGroups = Sets.newHashSet();
            IntArrayList pattern = new IntArrayList(terms.length);
            for (String t : terms) {
                if (stringIdMap.containsKey(t)) {
                    int tId = stringIdMap.get(t);
                    pattern.add(tId);
                    // appearingGroups.add(PFPGrowth.getGroupHash(tId, numGroups));
                } else {
                    context.setStatus(
                            "Parallel FPGrowth: Term from previous pattern not part of the current fList: " + t);
                }
            }

            float patternFreq = Float
                    .parseFloat(doc.getFieldable(ItemSetIndexBuilder.AssocField.SUPPORT.name).stringValue());
            // No need to divide because originally every transaction was added once per group
            // // will be added that many times
            // patternFreq /= appearingGroups.size();

            //      long docStartTime = readerTime;
            long docStartTime = Long.parseLong(
                    doc.getFieldable(ItemSetIndexBuilder.AssocField.WINDOW_STARTTIME.name).stringValue());

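            // Decay the stored support by the age of the window it was mined in,
            // relative to the start of the current interval.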
            long support = Math.round(timeWeight.apply(patternFreq, docStartTime, intervalStart));

            extendedTree.addPattern(pattern, support);
        }

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }
    }

    private IndexReader fisIxReader;
    //  private MultiReader fisIxMultiReader;
    private IndexSearcher fisSearcher;
    private ItemSetSimilarity fisSimilarity;
    private QueryParser fisQparser;
    private Analyzer ANALYZER = new TwitterAnalyzer();

    public static final String MIN_WORDS_FOR_LANG_ID = "lenLangId";
    public static final int MIN_WORDS_FOR_LANG_ID_DEFAULT = 3;

    private final OpenIntObjectHashMap<String> idStringMap = new OpenIntObjectHashMap<String>();
    private final OpenObjectIntHashMap<String> stringIdMap = new OpenObjectIntHashMap<String>();

    private TimeWeightFunction timeWeight;
    private long mostRecentTime;

    private int maxHeapSize = 50;

    private int minSupport = 3;

    private int numGroups;

    private int minWordsForLangDetection;
    private boolean repeatHashTag;
    private long intervalStart;
    private long intervalEnd;
    private long windowSize;
    private long endTimestamp;

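    /**
     * Adapts the tree's (IntArrayList, Long) iterator to the (List<Integer>, Long)
     * pairs expected by the generic FPGrowth API. Currently unused: the call that
     * needed it is commented out in reduce(), which passes the tree directly.
     */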
    private static class IteratorAdapter implements Iterator<Pair<List<Integer>, Long>> {
        private Iterator<Pair<IntArrayList, Long>> innerIter;

        private IteratorAdapter(Iterator<Pair<IntArrayList, Long>> transactionIter) {
            innerIter = transactionIter;
        }

        @Override
        public boolean hasNext() {
            return innerIter.hasNext();
        }

        @Override
        public Pair<List<Integer>, Long> next() {
            Pair<IntArrayList, Long> innerNext = innerIter.next();
            return new Pair<List<Integer>, Long>(innerNext.getFirst().toList(), innerNext.getSecond());
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    @Override
    protected void reduce(IntWritable key, Iterable<TransactionTree> values, Context context) throws IOException {

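        // Merge every transaction tree shipped for this group into a single tree,
        // counting patterns so the Lucene query below can size its clause limit.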
        TransactionTree cTree = new TransactionTree();
        int numPatterns = 0;
        for (TransactionTree tr : values) {
            for (Pair<IntArrayList, Long> p : tr) {
                cTree.addPattern(p.getFirst(), p.getSecond());
                ++numPatterns;
            }
        }

        //    if (fisIxMultiReader != null) {
        if (fisIxReader != null) {
            BooleanQuery.setMaxClauseCount(Math.max(1, numPatterns)); // guard: setMaxClauseCount rejects values below 1
            BooleanQuery allPatternsQuery = new BooleanQuery();

            Iterator<Pair<IntArrayList, Long>> cTreeIter = cTree.iterator(true);
            while (cTreeIter.hasNext()) {
                IntArrayList newPatternIds = cTreeIter.next().getFirst();
                if (newPatternIds.size() == 1) {
                    // This is already carried over by loading the older flists
                    continue;
                }
                StringBuilder newPatternStr = new StringBuilder();
                for (int i = 0; i < newPatternIds.size(); ++i) {
                    int id = newPatternIds.getQuick(i);
                    String str = idStringMap.get(id);
                    newPatternStr.append(str).append(" ");
                }
                try {
                    allPatternsQuery.add(fisQparser.parse(newPatternStr.toString()), Occur.SHOULD);
                    // fisSearcher.search(fisQparser.parse(newPatternStr.toString()), oldPatternsCollector);
                } catch (ParseException e) {
                    context.setStatus("Parallel FPGrowth: caught a parse exception: " + e.getMessage());
                    continue;
                }
            }

            fisSearcher.search(allPatternsQuery, new OldPatternsCollector(context, cTree));

        }

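        // Build a frequency-descending f-list from the merged tree, then mine it
        // for the top-K patterns, writing results through the context collector.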
        List<Pair<Integer, Long>> localFList = Lists.newArrayList();
        for (Entry<Integer, MutableLong> fItem : cTree.generateFList().entrySet()) {
            localFList.add(new Pair<Integer, Long>(fItem.getKey(), fItem.getValue().toLong()));
        }

        Collections.sort(localFList, new CountDescendingPairComparator<Integer, Long>());

        FPGrowth<Integer> fpGrowth = new FPGrowth<Integer>();
        fpGrowth.generateTopKFrequentPatterns(
                //new IteratorAdapter(cTree.iterator()),
                cTree, localFList, minSupport, maxHeapSize, null,
                new IntegerStringOutputConverter(
                        new ContextWriteOutputCollector<IntWritable, TransactionTree, Text, TopKStringPatterns>(
                                context),
                        idStringMap, minWordsForLangDetection, repeatHashTag),
                new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
                key.get(), numGroups);

    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        super.setup(context);
        Configuration conf = context.getConfiguration();
        Parameters params = new Parameters(conf.get(PFPGrowth.PFP_PARAMETERS, ""));

        intervalStart = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
        intervalEnd = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
        windowSize = Long
                .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(intervalEnd - intervalStart)));
        endTimestamp = Math.min(intervalEnd, intervalStart + windowSize - 1);

        PFPGrowth.loadEarlierFHashMaps(context, params, intervalStart, idStringMap, stringIdMap);

        maxHeapSize = Integer.valueOf(params.get(PFPGrowth.MAX_HEAPSIZE, "50"));
        minSupport = Integer.valueOf(params.get(PFPGrowth.MIN_SUPPORT, "3"));

        numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);

        minWordsForLangDetection = params.getInt(MIN_WORDS_FOR_LANG_ID, MIN_WORDS_FOR_LANG_ID_DEFAULT);
        repeatHashTag = Boolean.parseBoolean(params.get(TokenIterator.PARAM_REPEAT_HASHTAG, "false"));

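        // Note: maxPatternLoadLag is currently unused; the lag check that consumed
        // it is commented out in the window-scanning loop below.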
        long maxPatternLoadLag = Long.parseLong(
                params.get(PFPGrowth.PARAM_MAX_PATTERN_LOAD_LAG, PFPGrowth.DEFAULT_MAX_PATTERN_LOAD_LAG));

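        // The output layout is assumed to be <timeRoot>/<windowStartTime>/<run>/index.
        // Scan sibling windows for the most recent one that started before this
        // interval and has an index of its mined patterns.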
        Path mostRecentPath = null;
        Path outPath = new Path(params.get(PFPGrowth.OUTPUT));
        Path timeRoot = outPath.getParent().getParent();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] otherWindows = fs.listStatus(timeRoot);
        //    List<IndexReader> earlierIndexes = Lists
        //        .<IndexReader> newArrayListWithCapacity(otherWindows.length - 1);
        for (int f = otherWindows.length - 1; f >= 0; --f) {
            Path p = otherWindows[f].getPath();
            long pathStartTime = Long.parseLong(p.getName());
            // should have used end time, but it doesn't make a difference,
            // AS LONG AS windows don't overlap
            //      long timeDifference = intervalStart - pathStartTime;
            //      if (timeDifference > 0 && timeDifference <= maxPatternLoadLag) {
            if (pathStartTime < intervalStart && pathStartTime > mostRecentTime) {
                p = fs.listStatus(p)[0].getPath();
                p = new Path(p, "index");
                if (fs.exists(p)) {
                    mostRecentTime = pathStartTime;
                    mostRecentPath = p;
                    //          File indexDir = FileUtils.toFile(p.toUri().toURL());
                    //          // FIXME: this will work only on local filesystem.. like many other parts of the code
                    //          Directory fisdir = new MMapDirectory(indexDir);
                    //          IndexReader fisIxReader = IndexReader.open(fisdir);
                    //          earlierIndexes.add(fisIxReader);
                }
            }
        }
        if (mostRecentPath != null) {
            //    if(!earlierIndexes.isEmpty()) {
            //      fisIxMultiReader = new MultiReader(earlierIndexes.toArray(new IndexReader[0]));
            Directory fisdir = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
            fisIxReader = IndexReader.open(fisdir);
            //      fisSearcher = new IndexSearcher(fisIxMultiReader);
            fisSearcher = new IndexSearcher(fisIxReader);
            fisSimilarity = new ItemSetSimilarity();
            fisSearcher.setSimilarity(fisSimilarity);

            fisQparser = new QueryParser(Version.LUCENE_36, ItemSetIndexBuilder.AssocField.ITEMSET.name, ANALYZER);
            fisQparser.setDefaultOperator(Operator.AND);

            timeWeight = TimeWeightFunction.getDefault(params);
        }
    }
}
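
Usage

For context, here is how a reducer with this signature plugs into a Hadoop job. The fragment below is a minimal, hypothetical sketch rather than the project's actual driver: the class name FPStreamDriverSketch and the commented-out mapper name are invented for illustration, the real PFPGrowth driver sets many more parameters (interval start and end, window size, number of groups), and the input/output paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.freqtermsets.PFPGrowth;
import org.apache.mahout.freqtermsets.ParallelFPStreamReducer;
import org.apache.mahout.freqtermsets.TransactionTree;
import org.apache.mahout.freqtermsets.convertors.string.TopKStringPatterns;

// Hypothetical driver sketch -- shows only how the reducer's generic signature
// maps onto a Hadoop job, under the assumptions stated above.
public class FPStreamDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // setup() deserializes this string back into a Parameters object, so the
        // driver serializes one into the job configuration.
        Parameters params = new Parameters();
        params.set(PFPGrowth.OUTPUT, args[1]);
        // ... PARAM_INTERVAL_START, PARAM_INTERVAL_END, etc. would be set here
        conf.set(PFPGrowth.PFP_PARAMETERS, params.toString());

        Job job = new Job(conf, "ParallelFPStream");
        job.setJarByClass(ParallelFPStreamReducer.class);

        // Types must match Reducer<IntWritable, TransactionTree, Text, TopKStringPatterns>.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(TransactionTree.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(TopKStringPatterns.class);

        job.setReducerClass(ParallelFPStreamReducer.class);
        // job.setMapperClass(ParallelFPStreamMapper.class); // hypothetical mapper that groups transactions

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}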