Usage examples for the org.apache.mahout.common.Pair constructor
public Pair(A first, B second)
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.integer.IntegerStringOutputConverter.java
License:Apache License
@Override public void collect(Integer key, List<Pair<List<Integer>, Long>> value) throws IOException { String stringKey = featureReverseMap.get(key); List<Pair<List<String>, Long>> stringValues = Lists.newArrayList(); for (Pair<List<Integer>, Long> e : value) { List<String> pattern = Lists.newArrayList(); for (Integer i : e.getFirst()) { pattern.add(featureReverseMap.get(i)); }//from w ww . j av a 2 s . com stringValues.add(new Pair<List<String>, Long>(pattern, e.getSecond())); } collector.collect(new Text(stringKey), new TopKStringPatterns(stringValues)); }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java
License:Apache License
/**
 * Replaces the current contents of {@code frequentPatterns} with patterns read from
 * {@code in}.
 *
 * <p>Layout consumed, in order: an int pattern count, then for each pattern an int item
 * count, a long support, and that many UTF strings (the items).
 *
 * @param in source to read the serialized patterns from
 * @throws IOException if any read on {@code in} fails
 */
@Override
public void readFields(DataInput in) throws IOException {
  frequentPatterns.clear();
  int remainingPatterns = in.readInt();
  while (remainingPatterns > 0) {
    List<String> itemset = Lists.newArrayList();
    int remainingItems = in.readInt();
    // The support is stored before the items, so it must be read first.
    long support = in.readLong();
    while (remainingItems > 0) {
      itemset.add(in.readUTF());
      remainingItems--;
    }
    frequentPatterns.add(new Pair<List<String>, Long>(itemset, support));
    remainingPatterns--;
  }
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.TopKPatternsOutputConverter.java
License:Apache License
@Override public void collect(Integer key, FrequentPatternMaxHeap value) throws IOException { List<Pair<List<A>, Long>> perAttributePatterns = Lists.newArrayList(); PriorityQueue<Pattern> t = value.getHeap(); while (!t.isEmpty()) { Pattern itemSet = t.poll(); List<A> frequentPattern = Lists.newArrayList(); for (int j = 0; j < itemSet.length(); j++) { frequentPattern.add(reverseMapping.get(itemSet.getPattern()[j])); }//from w w w. ja v a 2 s. c om Collections.sort(frequentPattern); Pair<List<A>, Long> returnItemSet = new Pair<List<A>, Long>(frequentPattern, itemSet.support()); perAttributePatterns.add(returnItemSet); } Collections.reverse(perAttributePatterns); //?? collector.collect(reverseMapping.get(key), perAttributePatterns); }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.TransactionIterator.java
License:Apache License
public TransactionIterator(Iterator<Pair<List<T>, Long>> transactions, final Map<T, Integer> attributeIdMapping) { transactionBuffer = new int[attributeIdMapping.size()]; delegate = Iterators.transform(transactions, new Function<Pair<List<T>, Long>, Pair<int[], Long>>() { @Override//from w w w. j ava 2s . com public Pair<int[], Long> apply(Pair<List<T>, Long> from) { if (from == null) { return null; } int index = 0; for (T attribute : from.getFirst()) { if (attributeIdMapping.containsKey(attribute)) { transactionBuffer[index++] = attributeIdMapping.get(attribute); } } int[] transactionList = new int[index]; System.arraycopy(transactionBuffer, 0, transactionList, 0, index); return new Pair<int[], Long>(transactionList, from.getSecond()); } }); }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth.FPGrowth.java
License:Apache License
public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Configuration conf, Path path) { List<Pair<String, TopKStringPatterns>> ret = Lists.newArrayList(); // key is feature value is count for (Pair<Writable, TopKStringPatterns> record : new SequenceFileIterable<Writable, TopKStringPatterns>( path, true, conf)) {/*w w w . java 2 s . com*/ ret.add(new Pair<String, TopKStringPatterns>(record.getFirst().toString(), new TopKStringPatterns(record.getSecond().getPatterns()))); } return ret; }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth.FPGrowth.java
License:Apache License
/** * Generate the Feature Frequency list from the given transaction whose * frequency > minSupport/*from w w w. jav a2 s . c om*/ * * @param transactions * Iterator over the transaction database * @param minSupport * minSupport of the feature to be included * @return the List of features and their associated frequency as a Pair */ public final List<Pair<A, Long>> generateFList(Iterator<Pair<List<A>, Long>> transactions, int minSupport) { Map<A, MutableLong> attributeSupport = Maps.newHashMap(); while (transactions.hasNext()) { Pair<List<A>, Long> transaction = transactions.next(); for (A attribute : transaction.getFirst()) { if (attributeSupport.containsKey(attribute)) { attributeSupport.get(attribute).add(transaction.getSecond().longValue()); } else { attributeSupport.put(attribute, new MutableLong(transaction.getSecond())); } } } List<Pair<A, Long>> fList = Lists.newArrayList(); for (Entry<A, MutableLong> e : attributeSupport.entrySet()) { long value = e.getValue().longValue(); if (value >= minSupport) { fList.add(new Pair<A, Long>(e.getKey(), value)); } } Collections.sort(fList, new CountDescendingPairComparator<A, Long>()); return fList; }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPTree.java
License:Apache License
/** * Return a pair of trees that result from separating a common prefix * (if one exists) from the lower portion of this tree. *///w ww . ja v a 2s .com public Pair<FPTree, FPTree> splitSinglePrefix() { if (root.numChildren() != 1) { return new Pair<FPTree, FPTree>(null, this); } LongArrayList pAttrCountList = new LongArrayList(); LongArrayList qAttrCountList = attrCountList.copy(); FPNode currNode = root; while (currNode.numChildren() == 1) { currNode = currNode.children().iterator().next(); if (pAttrCountList.size() <= currNode.attribute()) { pAttrCountList.setSize(currNode.attribute() + 1); } pAttrCountList.set(currNode.attribute(), currNode.count()); qAttrCountList.set(currNode.attribute(), 0); } FPTree pTree = new FPTree(pAttrCountList, minSupport); FPTree qTree = new FPTree(qAttrCountList, minSupport); recursivelyAddPrefixPats(pTree, qTree, root, null); return new Pair<FPTree, FPTree>(pTree, qTree); }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.ParallelFPGrowthReducer.java
License:Apache License
/**
 * Merges all transaction trees received for a group into one tree and mines the top
 * frequent patterns from it, writing the results through {@code context}.
 *
 * <p>A local frequency list is built from the merged tree and sorted with
 * {@code CountDescendingPairComparator}. When {@code useFP2} is set, mining is delegated
 * to {@code FPGrowthIds} using the {@code freqList} field; otherwise a new
 * {@code FPGrowth} instance is used with the local frequency list.
 *
 * @param key     group id; used to look up the group's members
 * @param values  transaction trees to merge
 * @param context reducer context that receives the output and the status updates
 * @throws IOException declared by this override; presumably raised when pattern
 *                     generation or writing fails - confirm against the callees
 */
@Override
protected void reduce(IntWritable key, Iterable<TransactionTree> values, Context context) throws IOException {
  // Fold every incoming tree into a single combined tree.
  TransactionTree mergedTree = new TransactionTree();
  for (TransactionTree tree : values) {
    for (Pair<IntArrayList, Long> pattern : tree) {
      mergedTree.addPattern(pattern.getFirst(), pattern.getSecond());
    }
  }

  // Frequency list local to this group, sorted by the count-descending comparator.
  List<Pair<Integer, Long>> localFList = Lists.newArrayList();
  for (Entry<Integer, MutableLong> entry : mergedTree.generateFList().entrySet()) {
    localFList.add(new Pair<Integer, Long>(entry.getKey(), entry.getValue().toLong()));
  }
  Collections.sort(localFList, new CountDescendingPairComparator<Integer, Long>());

  if (useFP2) {
    FPGrowthIds.generateTopKFrequentPatterns(
        mergedTree.iterator(),
        freqList,
        minSupport,
        maxHeapSize,
        PFPGrowth.getGroupMembers(key.get(), maxPerGroup, numFeatures),
        new IntegerStringOutputConverter(
            new ContextWriteOutputCollector<IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
            featureReverseMap),
        new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(context));
  } else {
    FPGrowth<Integer> miner = new FPGrowth<Integer>();
    miner.generateTopKFrequentPatterns(
        new IteratorAdapter(mergedTree.iterator()),
        localFList,
        minSupport,
        maxHeapSize,
        Sets.newHashSet(PFPGrowth.getGroupMembers(key.get(), maxPerGroup, numFeatures).toList()),
        new IntegerStringOutputConverter(
            new ContextWriteOutputCollector<IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
            featureReverseMap),
        new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(context));
  }
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/** * Generates the fList from the serialized string representation * /* w ww.j av a2 s .co m*/ * @return Deserialized Feature Frequency List */ public static List<Pair<String, Long>> readFList(Configuration conf) throws IOException { List<Pair<String, Long>> list = Lists.newArrayList(); Path[] files = HadoopUtil.getCachedFiles(conf); if (files.length != 1) { throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')'); } for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(files[0], true, conf)) { list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get())); } return list; }
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Reads the feature frequency list from the files matching {@code FILE_PATTERN} under the
 * {@code PARALLEL_COUNTING} directory of the configured output path.
 *
 * <p>Only features whose count is at least the {@code MIN_SUPPORT} parameter (read with
 * the fallback {@code "3"}) are kept. The result is ordered by descending count, with ties
 * broken by ascending feature name.
 *
 * @param params job parameters; {@code MIN_SUPPORT} and {@code OUTPUT} are read
 * @return feature frequency list
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
  int minSupport = Integer.parseInt(params.get(MIN_SUPPORT, "3"));
  Configuration conf = new Configuration();
  Path countingDir = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

  // Highest count first; equal counts fall back to the feature name.
  Comparator<Pair<String, Long>> byCountThenName = new Comparator<Pair<String, Long>>() {
    @Override
    public int compare(Pair<String, Long> left, Pair<String, Long> right) {
      int byCount = right.getSecond().compareTo(left.getSecond());
      return byCount != 0 ? byCount : left.getFirst().compareTo(right.getFirst());
    }
  };
  PriorityQueue<Pair<String, Long>> ordered =
      new PriorityQueue<Pair<String, Long>>(11, byCountThenName);

  for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
      new Path(countingDir, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
    long count = record.getSecond().get();
    if (count < minSupport) {
      continue;
    }
    ordered.add(new Pair<String, Long>(record.getFirst().toString(), count));
  }

  // Draining the queue yields the entries in comparator order.
  List<Pair<String, Long>> fList = Lists.newArrayList();
  while (!ordered.isEmpty()) {
    fList.add(ordered.poll());
  }
  return fList;
}