com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj.java Source code

Java tutorial

Introduction

Here is the source code for com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import org.apache.commons.lang3.mutable.MutableLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import com.cg.mapreduce.fpgrowth.mahout.fpm.CountDescendingPairComparator;
import com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.StatusUpdater;
import com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.TopKPatternsOutputConverter;
import com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.TransactionIterator;
import com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth.Pattern;
import com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth.FrequentPatternMaxHeap;

/**
 * Implementation of PFGrowth Algorithm.
 *
 * <p>The public entry points work on transactions whose items are of type
 * {@code A}. Internally every frequent item is mapped to a dense integer id,
 * an {@link FPTree} is built from the id-encoded transactions, and the top-K
 * patterns per requested item are mined from that tree and handed to an
 * output converter that maps the ids back to {@code A}.
 *
 * @param <A> object type used as the cell items in a transaction list
 */
@Deprecated
public class FPGrowthObj<A extends Comparable<? super A>> {

    private static final Logger log = LoggerFactory.getLogger(FPGrowthObj.class);

    /**
     * Reads every record of the sequence file at {@code path} into memory.
     *
     * @param conf
     *          Hadoop configuration used to open the sequence file
     * @param path
     *          location of the sequence file to read
     * @return one pair per record: the record key rendered with
     *         {@code toString()} and a fresh {@link TopKStringPatterns} built
     *         from the record value's pattern list
     */
    public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Configuration conf, Path path) {
        List<Pair<String, TopKStringPatterns>> ret = Lists.newArrayList();
        // key is feature value is count
        for (Pair<Writable, TopKStringPatterns> record : new SequenceFileIterable<Writable, TopKStringPatterns>(
                path, true, conf)) {
            // The iterable is opened with its boolean flag set to true (presumably "reuse key/value
            // instances" - confirm against SequenceFileIterable), so the key is turned into a String and
            // the patterns are wrapped in a new TopKStringPatterns instead of storing the record objects.
            ret.add(new Pair<String, TopKStringPatterns>(record.getFirst().toString(),
                    new TopKStringPatterns(record.getSecond().getPatterns())));
        }
        return ret;
    }

    /**
     * Generate the Feature Frequency list from the given transactions, keeping
     * only the features whose accumulated frequency is {@code >= minSupport}.
     *
     * <p>Each transaction carries a weight (the second element of the pair);
     * that weight is added to the support of every item in the transaction.
     *
     * @param transactions
     *          Iterator over the transaction database
     * @param minSupport
     *          minimum support a feature needs to be included (inclusive)
     * @return the List of features and their associated frequency as a Pair,
     *         sorted with {@link CountDescendingPairComparator}
     */
    public final List<Pair<A, Long>> generateFList(Iterator<Pair<List<A>, Long>> transactions, int minSupport) {

        Map<A, MutableLong> attributeSupport = Maps.newHashMap();
        while (transactions.hasNext()) {
            Pair<List<A>, Long> transaction = transactions.next();
            for (A attribute : transaction.getFirst()) {
                // Single lookup: the map never stores null values, so null means "not seen yet".
                MutableLong support = attributeSupport.get(attribute);
                if (support != null) {
                    support.add(transaction.getSecond().longValue());
                } else {
                    attributeSupport.put(attribute, new MutableLong(transaction.getSecond()));
                }
            }
        }

        List<Pair<A, Long>> fList = Lists.newArrayList();
        for (Entry<A, MutableLong> e : attributeSupport.entrySet()) {
            long value = e.getValue().longValue();
            if (value >= minSupport) {
                fList.add(new Pair<A, Long>(e.getKey(), value));
            }
        }

        Collections.sort(fList, new CountDescendingPairComparator<A, Long>());

        return fList;
    }

    /**
     * Generate Top K Frequent Patterns for every feature in returnableFeatures
     * given a stream of transactions and the minimum support.
     *
     * @param transactionStream
     *          Iterator of transaction
     * @param frequencyList
     *          list of frequent features and their support value; entries below
     *          {@code minSupport} are ignored wherever they appear in the list
     * @param minSupport
     *          minimum support of the transactions
     * @param k
     *          Number of top frequent patterns to keep
     * @param returnableFeatures
     *          set of features for which the frequent patterns are mined. If the
     *          set is empty or null, then top K patterns for every frequent item (an item
     *          whose support >= minSupport) is generated
     * @param output
     *          The output collector to which the generated patterns are
     *          written
     * @param updater
     *          status updater handed through to the mining routines
     * @throws IOException
     *           propagated from the underlying mining/output calls
     */
    public final void generateTopKFrequentPatterns(Iterator<Pair<List<A>, Long>> transactionStream,
            Collection<Pair<A, Long>> frequencyList, long minSupport, int k, Collection<A> returnableFeatures,
            OutputCollector<A, List<Pair<List<A>, Long>>> output, StatusUpdater updater) throws IOException {

        Map<Integer, A> reverseMapping = Maps.newHashMap();
        Map<A, Integer> attributeIdMapping = Maps.newHashMap();

        // Pass 1: give every frequent feature a dense integer id, in iteration order.
        int id = 0;
        for (Pair<A, Long> feature : frequencyList) {
            A attrib = feature.getFirst();
            Long frequency = feature.getSecond();
            if (frequency >= minSupport) {
                attributeIdMapping.put(attrib, id);
                reverseMapping.put(id++, attrib);
            }
        }

        // Pass 2: record the frequency of each id. Infrequent entries are skipped rather than
        // terminating the loop, so the result does not depend on frequencyList being sorted
        // by descending support (pass 1 makes no such assumption either).
        long[] attributeFrequency = new long[attributeIdMapping.size()];
        for (Pair<A, Long> feature : frequencyList) {
            A attrib = feature.getFirst();
            Long frequency = feature.getSecond();
            if (frequency < minSupport) {
                continue;
            }
            attributeFrequency[attributeIdMapping.get(attrib)] = frequency;
        }

        log.info("Number of unique items {}", frequencyList.size());

        // Translate the requested features to ids; unknown/infrequent ones are dropped.
        Collection<Integer> returnFeatures = Sets.newHashSet();
        if (returnableFeatures != null && !returnableFeatures.isEmpty()) {
            for (A attrib : returnableFeatures) {
                Integer attribId = attributeIdMapping.get(attrib);
                if (attribId != null) {
                    returnFeatures.add(attribId);
                    log.info("Adding Pattern {}=>{}", attrib, attribId);
                }
            }
        } else {
            // No explicit request: mine every frequent feature.
            for (int j = 0; j < attributeIdMapping.size(); j++) {
                returnFeatures.add(j);
            }
        }

        log.info("Number of unique pruned items {}", attributeIdMapping.size());
        generateTopKFrequentPatterns(new TransactionIterator<A>(transactionStream, attributeIdMapping),
                attributeFrequency, minSupport, k, returnFeatures,
                new TopKPatternsOutputConverter<A>(output, reverseMapping), updater);
    }

    /**
     * Top K FpGrowth Algorithm.
     *
     * <p>Walks the tree's attributes via {@code attrIterableRev()} and, for each
     * one contained in {@code requiredFeatures}, mines its patterns, stores them
     * in the returned map and forwards them to {@code outputCollector}.
     *
     * @param tree
     *          to be mined
     * @param minSupportValue
     *          minimum support of the pattern to keep
     * @param k
     *          Number of top frequent patterns to keep
     * @param requiredFeatures
     *          Set of integer id's of features to mine
     * @param outputCollector
     *          the Collector class which converts the given frequent pattern in
     *          integer to A
     * @param updater
     *          status updater passed on to {@code growth}
     * @return Top K Frequent Patterns for each feature and their support
     * @throws IOException
     *           if the output collector fails
     */
    private Map<Integer, FrequentPatternMaxHeap> fpGrowth(FPTree tree, long minSupportValue, int k,
            Collection<Integer> requiredFeatures, TopKPatternsOutputConverter<A> outputCollector,
            StatusUpdater updater) throws IOException {

        Map<Integer, FrequentPatternMaxHeap> patterns = Maps.newHashMap();
        for (int attribute : tree.attrIterableRev()) {
            if (requiredFeatures.contains(attribute)) {
                log.info("Mining FTree Tree for all patterns with {}", attribute);
                MutableLong minSupport = new MutableLong(minSupportValue);
                FrequentPatternMaxHeap frequentPatterns = growth(tree, minSupport, k, attribute, updater);
                patterns.put(attribute, frequentPatterns);
                outputCollector.collect(attribute, frequentPatterns);

                // Carry the threshold forward; it can only grow, never shrink.
                minSupportValue = Math.max(minSupportValue, minSupport.longValue() / 2);
                log.info("Found {} Patterns with Least Support {}", frequentPatterns.count(),
                        frequentPatterns.leastSupport());
            }
        }
        return patterns;
    }

    /**
     * Internal TopKFrequentPattern Generation algorithm, which represents the A's
     * as integers and transforms features to use only integers.
     *
     * <p>Builds an {@link FPTree} from all transactions and then mines it with
     * {@code fpGrowth}; results are delivered through
     * {@code topKPatternsOutputCollector} rather than returned.
     *
     * @param transactions
     *          Transaction database Iterator
     * @param attributeFrequency
     *          array representing the Frequency of the corresponding attribute id
     * @param minSupport
     *          minimum support of the pattern to be mined
     * @param k
     *          Max value of the Size of the Max-Heap in which Patterns are held
     * @param returnFeatures
     *          the id's of the features for which Top K patterns have to be mined
     * @param topKPatternsOutputCollector
     *          the outputCollector which transforms the given Pattern in integer
     *          format to the corresponding A Format
     * @param updater
     *          status updater passed on to the mining step
     * @throws IOException
     *           if the output collector fails
     */
    private void generateTopKFrequentPatterns(Iterator<Pair<int[], Long>> transactions, long[] attributeFrequency,
            long minSupport, int k, Collection<Integer> returnFeatures,
            TopKPatternsOutputConverter<A> topKPatternsOutputCollector, StatusUpdater updater) throws IOException {

        FPTree tree = new FPTree(attributeFrequency, minSupport);

        // Constructing initial FPTree from the list of transactions
        int i = 0;
        while (transactions.hasNext()) {
            Pair<int[], Long> transaction = transactions.next();
            // FPTree.accumulate is called with a List<Integer>, so box the id array.
            List<Integer> iLst = Lists.newArrayList();
            int[] iArr = transaction.getFirst();
            for (int anIArr : iArr) {
                iLst.add(anIArr);
            }
            tree.accumulate(iLst, transaction.getSecond());
            i++;
            if (i % 10000 == 0) {
                log.info("FPTree Building: Read {} Transactions", i);
            }
        }

        fpGrowth(tree, minSupport, k, returnFeatures, topKPatternsOutputCollector, updater);
    }

    /**
     * Run FP Growth recursively on tree, for the given target attribute.
     *
     * @param tree
     *          tree to mine
     * @param minSupportMutable
     *          current support threshold; only read here and passed down the recursion
     * @param k
     *          capacity handed to every {@link FrequentPatternMaxHeap} created
     * @param currentAttribute
     *          id of the attribute whose patterns are mined
     * @param updater
     *          status updater; only forwarded to recursive calls in this method
     * @return heap of patterns containing {@code currentAttribute}; an empty heap
     *         when the attribute's header count is below the threshold
     */
    private static FrequentPatternMaxHeap growth(FPTree tree, MutableLong minSupportMutable, int k,
            int currentAttribute, StatusUpdater updater) {

        long currentAttributeCount = tree.headerCount(currentAttribute);

        // Infrequent attribute: nothing to mine.
        if (currentAttributeCount < minSupportMutable.longValue()) {
            return new FrequentPatternMaxHeap(k, true);
        }

        FPTree condTree = tree.createMoreFreqConditionalTree(currentAttribute);

        // Split the conditional tree into two parts: p (may be null, handled below) is mined
        // with mineSinglePrefix, q is mined recursively.
        Pair<FPTree, FPTree> pAndQ = condTree.splitSinglePrefix();
        FPTree p = pAndQ.getFirst();
        FPTree q = pAndQ.getSecond();

        FrequentPatternMaxHeap prefixPats = null;
        if (p != null) {
            prefixPats = mineSinglePrefix(p, k);
        }

        FrequentPatternMaxHeap suffixPats = new FrequentPatternMaxHeap(k, true);

        // Seed the suffix patterns with the single-item pattern {currentAttribute}.
        Pattern thisPat = new Pattern();
        thisPat.add(currentAttribute, currentAttributeCount);
        suffixPats.insert(thisPat);

        for (int attr : q.attrIterableRev()) {
            mergeHeap(suffixPats, growth(q, minSupportMutable, k, attr, updater), currentAttribute,
                    currentAttributeCount, true);
        }

        if (prefixPats != null) {
            return cross(prefixPats, suffixPats, k);
        }

        return suffixPats;
    }

    /**
     * Return a set of patterns which are the cross product of the patterns
     * in pPats and qPats, plus a copy of every pattern in qPats on its own.
     *
     * @param pPats
     *          first operand of the cross product
     * @param qPats
     *          second operand; its patterns are also inserted unmodified
     * @param k
     *          capacity of the resulting heap
     * @return a new heap; neither input heap is inserted into by this method
     */
    private static FrequentPatternMaxHeap cross(FrequentPatternMaxHeap pPats, FrequentPatternMaxHeap qPats, int k) {
        FrequentPatternMaxHeap pats = new FrequentPatternMaxHeap(k, true);

        for (Pattern p : pPats.getHeap()) {
            int[] pints = p.getPattern();
            for (Pattern q : qPats.getHeap()) {
                int[] qints = q.getPattern();

                // Loops are bounded by length(), not by the array size
                // (presumably the backing array can be larger than the pattern - confirm in Pattern).
                Pattern pq = new Pattern();
                for (int pi = 0; pi < p.length(); pi++) {
                    pq.add(pints[pi], p.support());
                }
                for (int qi = 0; qi < q.length(); qi++) {
                    pq.add(qints[qi], q.support());
                }
                pats.insert(pq);
            }
        }

        // Each q pattern is also kept without any p items.
        for (Pattern q : qPats.getHeap()) {
            Pattern qq = new Pattern();
            int[] qints = q.getPattern();
            for (int qi = 0; qi < q.length(); qi++) {
                qq.add(qints[qi], q.support());
            }
            pats.insert(qq);
        }

        return pats;
    }

    /**
     * Mine all frequent patterns that can be created by following a prefix
     * that is common to all sets in the given tree.
     *
     * @param tree
     *          tree whose single-child chain starting at the root is followed
     * @param k
     *          capacity of the heaps created
     * @return heap of the patterns built while walking the chain; empty when the
     *         root does not have exactly one child
     */
    private static FrequentPatternMaxHeap mineSinglePrefix(FPTree tree, int k) {
        FrequentPatternMaxHeap pats = new FrequentPatternMaxHeap(k, true);
        FPTree.FPNode currNode = tree.root();
        // Descend while the path does not branch.
        while (currNode.numChildren() == 1) {
            currNode = currNode.children().iterator().next();
            FrequentPatternMaxHeap singlePat = new FrequentPatternMaxHeap(k, true);
            Pattern p = new Pattern();
            p.add(currNode.attribute(), currNode.count());
            singlePat.insert(p);
            // Combine the new node with everything found so far, then add the node by itself.
            pats = cross(singlePat, pats, k);
            pats.insert(p);
        }

        return pats;
    }

    /**
     * Adds {@code returnedPatterns} to {@code frequentPatterns} via
     * {@code addAll(returnedPatterns, attribute, count)} and, when requested and
     * the heap reports {@code count} as addable, also inserts the single-item
     * pattern for {@code attribute}.
     *
     * @param frequentPatterns
     *          heap that receives the patterns (modified in place)
     * @param returnedPatterns
     *          patterns to merge in
     * @param attribute
     *          attribute id passed to {@code addAll} and used for the single-item pattern
     * @param count
     *          support passed to {@code addAll} and used for the single-item pattern
     * @param addAttribute
     *          whether the single-item pattern should be inserted
     */
    private static void mergeHeap(FrequentPatternMaxHeap frequentPatterns, FrequentPatternMaxHeap returnedPatterns,
            int attribute, long count, boolean addAttribute) {
        frequentPatterns.addAll(returnedPatterns, attribute, count);
        if (frequentPatterns.addable(count) && addAttribute) {
            Pattern p = new Pattern();
            p.add(attribute, count);
            frequentPatterns.insert(p);
        }
    }
}