Usage examples for org.apache.mahout.common.Pair#getFirst()
public A getFirst()
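Pair is a simple generic two-element tuple: getFirst() returns the first element with its declared type A, and getSecond() returns the second. Before the project-specific examples below, here is a minimal standalone sketch; the class name PairGetFirstExample and the sample values are made up for illustration only.

import org.apache.mahout.common.Pair;

public class PairGetFirstExample {
  public static void main(String[] args) {
    // Construct a Pair; the type parameters fix the element types.
    Pair<String, Integer> termCount = new Pair<String, Integer>("mahout", 42);

    // getFirst() returns the first element (a String here), getSecond() the second.
    String term = termCount.getFirst();
    int count = termCount.getSecond();

    System.out.println(term + " occurs " + count + " times");
  }
}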
From source file:ClassifierHD.java
License:Apache License
public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
  Map<String, Integer> dictionnary = new HashMap<String, Integer>();
  for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) {
    dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
  }
  return dictionnary;
}
From source file:ClassifierHD.java
License:Apache License
public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
  Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
  for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(
      documentFrequencyPath, true, conf)) {
    documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
  }
  return documentFrequency;
}
From source file:DisplayClustering.java
License:Apache License
/**
 * This method plots points and colors them according to their cluster
 * membership, rather than drawing ellipses.
 *
 * As of commit, this method is used only by K-means spectral clustering.
 * Since the cluster assignments are set within the eigenspace of the data, it
 * is not inherent that the original data cluster as they would in K-means:
 * that is, as symmetric gaussian mixtures.
 *
 * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
 * output is not directly usable. Rather, the cluster assignments from the raw
 * output need to be transferred back to the original data. As such, this
 * method will read the SequenceFile cluster results of K-means and transfer
 * the cluster assignments to the original data, coloring them appropriately.
 *
 * @param g2
 * @param data
 */
protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
  double sx = (double) res / DS;
  g2.setTransform(AffineTransform.getScaleInstance(sx, sx));

  g2.setColor(Color.BLACK);
  Vector dv = new DenseVector(2).assign(SIZE / 2.0);
  plotRectangle(g2, new DenseVector(2).assign(2), dv);
  plotRectangle(g2, new DenseVector(2).assign(-2), dv);

  // plot the sample data, colored according to the cluster they belong to
  dv.assign(0.03);
  Path clusteredPointsPath = new Path(data, "clusteredPoints");
  Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
  Map<Integer, Color> colors = new HashMap<Integer, Color>();
  int point = 0;
  for (Pair<IntWritable, WeightedVectorWritable> record : new SequenceFileIterable<IntWritable, WeightedVectorWritable>(
      inputPath, new Configuration())) {
    int clusterId = record.getFirst().get();
    VectorWritable v = SAMPLE_DATA.get(point++);
    Integer key = clusterId;
    if (!colors.containsKey(key)) {
      colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
    }
    plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
  }
}
From source file:ac.keio.sslab.nlp.lda.RowIdJob.java
License:Apache License
@SuppressWarnings("deprecation") @Override/*from w w w .j a va 2 s . c o m*/ public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); Path outputPath = getOutputPath(); Path indexPath = new Path(outputPath, "docIndex"); Path matrixPath = new Path(outputPath, "matrix"); try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class, Text.class); SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath, IntWritable.class, VectorWritable.class)) { IntWritable docId = new IntWritable(); int i = 0; int numCols = 0; for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>( getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) { VectorWritable value = record.getSecond(); docId.set(i); indexWriter.append(docId, record.getFirst()); matrixWriter.append(docId, value); i++; numCols = value.get().size(); } log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath); return 0; } }
From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.DistributedRowMatrix.java
License:Apache License
@Override
public Iterator<MatrixSlice> iterateAll() {
  try {
    Path pathPattern = rowPath;
    if (FileSystem.get(conf).getFileStatus(rowPath).isDir()) {
      pathPattern = new Path(rowPath, "*");
    }
    return Iterators.transform(
        new SequenceFileDirIterator<IntWritable, VectorWritable>(pathPattern, PathType.GLOB,
            PathFilters.logsCRCFilter(), null, true, conf),
        new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
          @Override
          public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
            return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
          }
        });
  } catch (IOException ioe) {
    throw new IllegalStateException(ioe);
  }
}
From source file:com.caseystella.ingest.SparseVectorsFromSequenceFiles.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option inputDirOpt = DefaultOptionCreator.inputOption().create();
  Option outputDirOpt = DefaultOptionCreator.outputOption().create();

  Option minSupportOpt = obuilder.withLongName("minSupport")
      .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
      .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

  Option analyzerNameOpt = obuilder.withLongName("analyzerName")
      .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
      .withDescription("The class name of the analyzer").withShortName("a").create();

  Option libJarsOpt = obuilder.withLongName("libjars")
      .withArgument(abuilder.withName("libjars").withMinimum(1).withMaximum(1).create())
      .withDescription("The default arg for libjars").withShortName("libjars").create();

  Option chunkSizeOpt = obuilder.withLongName("chunkSize")
      .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
      .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

  Option weightOpt = obuilder.withLongName("weight").withRequired(false)
      .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
      .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

  Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
      .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
      .withDescription("The minimum document frequency. Default is 1").withShortName("md").create();

  Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
      .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
      .withDescription("The max percentage of docs for the DF. Can be used to remove really high frequency terms."
          + " Expressed as an integer between 0 and 100. Default is 99. If maxDFSigma is also set, it will override this value.")
      .withShortName("x").create();

  Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
      .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
      .withDescription(
          "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
              + " Can be used to remove really high frequency terms."
              + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
              + "will be filtered out. Default is -1.0. Overrides maxDFPercent")
      .withShortName("xs").create();

  Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
      .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
      .withDescription("(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR)
      .withShortName("ml").create();

  Option numReduceTasksOpt = obuilder.withLongName("numReducers")
      .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
      .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();

  Option powerOpt = obuilder.withLongName("norm").withRequired(false)
      .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
      .withDescription("The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
          + "Must be greater or equal to 0. The default is not to normalize")
      .withShortName("n").create();

  Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
      .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
      .withShortName("lnorm").create();

  Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
      .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
      .withDescription("(Optional) The maximum size of ngrams to create"
          + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
      .withShortName("ng").create();

  Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
      .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
      .withShortName("seq").create();

  Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
      .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
      .withShortName("nv").create();

  Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
      .withDescription("If set, overwrite the output directory").withShortName("ow").create();

  Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

  Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
      .withOption(libJarsOpt).withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt)
      .withOption(minDFOpt).withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt)
      .withOption(powerOpt).withOption(minLLROpt).withOption(numReduceTasksOpt)
      .withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
      .withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt).withOption(logNormalizeOpt)
      .create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    CommandLine cmdLine = parser.parse(args);

    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return -1;
    }

    Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
    Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

    int chunkSize = 100;
    if (cmdLine.hasOption(chunkSizeOpt)) {
      chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
    }
    int minSupport = 2;
    if (cmdLine.hasOption(minSupportOpt)) {
      String minSupportString = (String) cmdLine.getValue(minSupportOpt);
      minSupport = Integer.parseInt(minSupportString);
    }

    int maxNGramSize = 1;
    if (cmdLine.hasOption(maxNGramSizeOpt)) {
      try {
        maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
      } catch (NumberFormatException ex) {
        log.warn("Could not parse ngram size option");
      }
    }
    log.info("Maximum n-gram size is: {}", maxNGramSize);

    if (cmdLine.hasOption(overwriteOutput)) {
      HadoopUtil.delete(getConf(), outputDir);
    }

    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
    if (cmdLine.hasOption(minLLROpt)) {
      minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
    }
    log.info("Minimum LLR value: {}", minLLRValue);

    int reduceTasks = 1;
    if (cmdLine.hasOption(numReduceTasksOpt)) {
      reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
    }
    log.info("Number of reduce tasks: {}", reduceTasks);

    Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
    if (cmdLine.hasOption(analyzerNameOpt)) {
      String className = cmdLine.getValue(analyzerNameOpt).toString();
      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
      // try instantiating it, b/c there isn't any point in setting it if
      // you can't instantiate it
      ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
    }

    boolean processIdf;
    if (cmdLine.hasOption(weightOpt)) {
      String wString = cmdLine.getValue(weightOpt).toString();
      if ("tf".equalsIgnoreCase(wString)) {
        processIdf = false;
      } else if ("tfidf".equalsIgnoreCase(wString)) {
        processIdf = true;
      } else {
        throw new OptionException(weightOpt);
      }
    } else {
      processIdf = true;
    }

    int minDf = 1;
    if (cmdLine.hasOption(minDFOpt)) {
      minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
    }
    int maxDFPercent = 99;
    if (cmdLine.hasOption(maxDFPercentOpt)) {
      maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
    }
    double maxDFSigma = -1.0;
    if (cmdLine.hasOption(maxDFSigmaOpt)) {
      maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
    }

    float norm = PartialVectorMerger.NO_NORMALIZING;
    if (cmdLine.hasOption(powerOpt)) {
      String power = cmdLine.getValue(powerOpt).toString();
      if ("INF".equals(power)) {
        norm = Float.POSITIVE_INFINITY;
      } else {
        norm = Float.parseFloat(power);
      }
    }

    boolean logNormalize = false;
    if (cmdLine.hasOption(logNormalizeOpt)) {
      logNormalize = true;
    }

    Configuration conf = getConf();
    Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    //TODO: move this into DictionaryVectorizer, and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
    DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

    boolean sequentialAccessOutput = false;
    if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
      sequentialAccessOutput = true;
    }

    boolean namedVectors = false;
    if (cmdLine.hasOption(namedVectorOpt)) {
      namedVectors = true;
    }

    boolean shouldPrune = maxDFSigma >= 0.0;
    String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
        : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

    if (!processIdf) {
      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport,
          maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize, sequentialAccessOutput,
          namedVectors);
    } else {
      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport,
          maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize, sequentialAccessOutput,
          namedVectors);
    }

    Pair<Long[], List<Path>> docFrequenciesFeatures = null;
    // Should document frequency features be processed
    if (shouldPrune || processIdf) {
      docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
          chunkSize);
    }

    long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
    if (shouldPrune) {
      Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
      Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

      // Calculate the standard deviation
      double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
      long vectorCount = docFrequenciesFeatures.getFirst()[1];
      maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

      // Prune the term frequency vectors
      Path tfDir = new Path(outputDir, tfDirName);
      Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
      Path prunedPartialTFDir = new Path(outputDir,
          DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
      if (processIdf) {
        HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
            docFrequenciesFeatures, -1.0f, false, reduceTasks);
      } else {
        HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
            docFrequenciesFeatures, norm, logNormalize, reduceTasks);
      }
      HadoopUtil.delete(new Configuration(conf), tfDir);
    }

    if (processIdf) {
      TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
          outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize, sequentialAccessOutput,
          namedVectors, reduceTasks);
    }
  } catch (OptionException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  }
  return 0;
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.AggregatorMapper.java
License:Apache License
@Override
protected void map(Text key, TopKStringPatterns values, Context context)
    throws IOException, InterruptedException {
  for (Pair<List<String>, Long> pattern : values.getPatterns()) {
    for (String item : pattern.getFirst()) {
      List<Pair<List<String>, Long>> patternSingularList = Lists.newArrayList();
      patternSingularList.add(pattern);
      context.setStatus("Aggregator Mapper:Grouping Patterns for " + item);
      context.write(new Text(item), new TopKStringPatterns(patternSingularList));
    }
  }
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.integer.IntegerStringOutputConverter.java
License:Apache License
@Override
public void collect(Integer key, List<Pair<List<Integer>, Long>> value) throws IOException {
  String stringKey = featureReverseMap.get(key);
  List<Pair<List<String>, Long>> stringValues = Lists.newArrayList();
  for (Pair<List<Integer>, Long> e : value) {
    List<String> pattern = Lists.newArrayList();
    for (Integer i : e.getFirst()) {
      pattern.add(featureReverseMap.get(i));
    }
    stringValues.add(new Pair<List<String>, Long>(pattern, e.getSecond()));
  }
  collector.collect(new Text(stringKey), new TopKStringPatterns(stringValues));
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java
License:Apache License
public TopKStringPatterns merge(TopKStringPatterns pattern, int heapSize) {
  List<Pair<List<String>, Long>> patterns = Lists.newArrayList();
  Iterator<Pair<List<String>, Long>> myIterator = frequentPatterns.iterator();
  Iterator<Pair<List<String>, Long>> otherIterator = pattern.iterator();
  Pair<List<String>, Long> myItem = null;
  Pair<List<String>, Long> otherItem = null;
  for (int i = 0; i < heapSize; i++) {
    if (myItem == null && myIterator.hasNext()) {
      myItem = myIterator.next();
    }
    if (otherItem == null && otherIterator.hasNext()) {
      otherItem = otherIterator.next();
    }
    if (myItem != null && otherItem != null) {
      int cmp = myItem.getSecond().compareTo(otherItem.getSecond());
      if (cmp == 0) {
        cmp = myItem.getFirst().size() - otherItem.getFirst().size();
        if (cmp == 0) {
          for (int j = 0; j < myItem.getFirst().size(); j++) {
            cmp = myItem.getFirst().get(j).compareTo(otherItem.getFirst().get(j));
            if (cmp != 0) {
              break;
            }
          }
        }
      }
      if (cmp <= 0) {
        patterns.add(otherItem);
        if (cmp == 0) {
          myItem = null;
        }
        otherItem = null;
      } else if (cmp > 0) {
        patterns.add(myItem);
        myItem = null;
      }
    } else if (myItem != null) {
      patterns.add(myItem);
      myItem = null;
    } else if (otherItem != null) {
      patterns.add(otherItem);
      otherItem = null;
    } else {
      break;
    }
  }
  return new TopKStringPatterns(patterns);
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java
License:Apache License
@Override
public void write(DataOutput out) throws IOException {
  out.writeInt(frequentPatterns.size());
  for (Pair<List<String>, Long> pattern : frequentPatterns) {
    out.writeInt(pattern.getFirst().size());
    out.writeLong(pattern.getSecond());
    for (String item : pattern.getFirst()) {
      out.writeUTF(item);
    }
  }
}