Example usage for org.apache.mahout.common Pair getFirst

Introduction

This page collects example usages of org.apache.mahout.common.Pair getFirst, drawn from open source projects.

Prototype

public A getFirst() 
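
Before the project examples below, here is a minimal, self-contained sketch (with made-up values) showing how getFirst() pairs with getSecond():

import org.apache.mahout.common.Pair;

public class PairGetFirstDemo {
    public static void main(String[] args) {
        // Pair is an immutable 2-tuple; getFirst() returns its first component.
        Pair<String, Integer> entry = new Pair<String, Integer>("mahout", 42);
        System.out.println(entry.getFirst());  // prints "mahout"
        System.out.println(entry.getSecond()); // prints 42
    }
}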

Usage

From source file: ClassifierHD.java

License: Apache License

public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
    Map<String, Integer> dictionnary = new HashMap<String, Integer>();
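    // Each entry maps a term (Text) to its integer index (IntWritable) in the dictionary.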
    for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true,
            conf)) {
        dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
    }
    return dictionnary;
}

From source file: ClassifierHD.java

License: Apache License

public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
    Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
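    // Each entry maps a term's integer index (IntWritable) to its document frequency (LongWritable).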
    for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(
            documentFrequencyPath, true, conf)) {
        documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
    }
    return documentFrequency;
}
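
A hedged sketch of how these two helpers might be wired together; the model paths below are hypothetical and depend on where seq2sparse wrote its dictionary and df-count output:

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class ClassifierHDSetup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical paths; adjust to your seq2sparse output layout.
        Map<String, Integer> dictionary = ClassifierHD.readDictionnary(conf,
                new Path("model/dictionary.file-0"));
        Map<Integer, Long> documentFrequency = ClassifierHD.readDocumentFrequency(conf,
                new Path("model/df-count/part-r-00000"));
        System.out.println(dictionary.size() + " terms, "
                + documentFrequency.size() + " document frequencies loaded");
    }
}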

From source file: DisplayClustering.java

License: Apache License

/**
 * This method plots points and colors them according to their cluster
 * membership, rather than drawing ellipses.
 *
 * As of this commit, this method is used only by K-means spectral clustering.
 * Since the cluster assignments are set within the eigenspace of the data, it
 * is not inherent that the original data cluster as they would in K-means:
 * that is, as symmetric Gaussian mixtures.
 *
 * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
 * output is not directly usable. Rather, the cluster assignments from the raw
 * output need to be transferred back to the original data. As such, this
 * method will read the SequenceFile cluster results of K-means and transfer
 * the cluster assignments to the original data, coloring them appropriately.
 *
 * @param g2 the graphics context to draw into
 * @param data the directory containing the clustering output
 */
protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
    double sx = (double) res / DS;
    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));

    g2.setColor(Color.BLACK);
    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
    plotRectangle(g2, new DenseVector(2).assign(2), dv);
    plotRectangle(g2, new DenseVector(2).assign(-2), dv);

    // plot the sample data, colored according to the cluster they belong to
    dv.assign(0.03);

    Path clusteredPointsPath = new Path(data, "clusteredPoints");
    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
    Map<Integer, Color> colors = new HashMap<Integer, Color>();
    int point = 0;
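    // Each record pairs a cluster id (getFirst) with a weighted vector (getSecond).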
    for (Pair<IntWritable, WeightedVectorWritable> record : new SequenceFileIterable<IntWritable, WeightedVectorWritable>(
            inputPath, new Configuration())) {
        int clusterId = record.getFirst().get();
        VectorWritable v = SAMPLE_DATA.get(point++);
        Integer key = clusterId;
        if (!colors.containsKey(key)) {
            colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
        }
        plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
    }
}

From source file: ac.keio.sslab.nlp.lda.RowIdJob.java

License: Apache License

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path outputPath = getOutputPath();
    Path indexPath = new Path(outputPath, "docIndex");
    Path matrixPath = new Path(outputPath, "matrix");

    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class,
            Text.class);
            SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath,
                    IntWritable.class, VectorWritable.class)) {
        IntWritable docId = new IntWritable();
        int i = 0;
        int numCols = 0;
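        // Re-key each (document name, vector) pair with a sequential integer row id.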
        for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
            VectorWritable value = record.getSecond();
            docId.set(i);
            indexWriter.append(docId, record.getFirst());
            matrixWriter.append(docId, value);
            i++;
            numCols = value.get().size();
        }

        log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
        return 0;
    }
}

From source file: at.illecker.hadoop.rootbeer.examples.matrixmultiplication.DistributedRowMatrix.java

License: Apache License

@Override
public Iterator<MatrixSlice> iterateAll() {
    try {
        Path pathPattern = rowPath;
        if (FileSystem.get(conf).getFileStatus(rowPath).isDir()) {
            pathPattern = new Path(rowPath, "*");
        }
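        // Lazily transform each (row index, row vector) pair into a MatrixSlice.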
        return Iterators.transform(
                new SequenceFileDirIterator<IntWritable, VectorWritable>(pathPattern, PathType.GLOB,
                        PathFilters.logsCRCFilter(), null, true, conf),
                new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
                    @Override
                    public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
                        return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
                    }
                });
    } catch (IOException ioe) {
        throw new IllegalStateException(ioe);
    }
}

From source file: com.caseystella.ingest.SparseVectorsFromSequenceFiles.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();
    Option libJarsOpt = obuilder.withLongName("libjars")
            .withArgument(abuilder.withName("libjars").withMinimum(1).withMaximum(1).create())
            .withDescription("The default arg for libjars").withShortName("libjars").create();
    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(libJarsOpt).withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt)
            .withOption(minDFOpt).withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt)
            .withOption(powerOpt).withOption(minLLROpt).withOption(numReduceTasksOpt)
            .withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
            .withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt).withOption(logNormalizeOpt)
            .create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.AggregatorMapper.java

License: Apache License

@Override
protected void map(Text key, TopKStringPatterns values, Context context)
        throws IOException, InterruptedException {
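    // Emit each pattern once per item it contains, so downstream grouping is by item.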
    for (Pair<List<String>, Long> pattern : values.getPatterns()) {
        for (String item : pattern.getFirst()) {
            List<Pair<List<String>, Long>> patternSingularList = Lists.newArrayList();
            patternSingularList.add(pattern);
            context.setStatus("Aggregator Mapper:Grouping Patterns for " + item);
            context.write(new Text(item), new TopKStringPatterns(patternSingularList));
        }
    }

}

From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.integer.IntegerStringOutputConverter.java

License: Apache License

@Override
public void collect(Integer key, List<Pair<List<Integer>, Long>> value) throws IOException {
    String stringKey = featureReverseMap.get(key);
    List<Pair<List<String>, Long>> stringValues = Lists.newArrayList();
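    // Translate each integer pattern back to strings via the reverse feature map.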
    for (Pair<List<Integer>, Long> e : value) {
        List<String> pattern = Lists.newArrayList();
        for (Integer i : e.getFirst()) {
            pattern.add(featureReverseMap.get(i));
        }
        stringValues.add(new Pair<List<String>, Long>(pattern, e.getSecond()));
    }
    collector.collect(new Text(stringKey), new TopKStringPatterns(stringValues));
}

From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java

License: Apache License

public TopKStringPatterns merge(TopKStringPatterns pattern, int heapSize) {
    List<Pair<List<String>, Long>> patterns = Lists.newArrayList();
    Iterator<Pair<List<String>, Long>> myIterator = frequentPatterns.iterator();
    Iterator<Pair<List<String>, Long>> otherIterator = pattern.iterator();
    Pair<List<String>, Long> myItem = null;
    Pair<List<String>, Long> otherItem = null;
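    // Merge the two sorted lists by descending support (ties broken by pattern
    // length, then item order), keeping at most heapSize patterns.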
    for (int i = 0; i < heapSize; i++) {
        if (myItem == null && myIterator.hasNext()) {
            myItem = myIterator.next();
        }
        if (otherItem == null && otherIterator.hasNext()) {
            otherItem = otherIterator.next();
        }
        if (myItem != null && otherItem != null) {
            int cmp = myItem.getSecond().compareTo(otherItem.getSecond());
            if (cmp == 0) {
                cmp = myItem.getFirst().size() - otherItem.getFirst().size();
                if (cmp == 0) {
                    for (int j = 0; j < myItem.getFirst().size(); j++) {
                        cmp = myItem.getFirst().get(j).compareTo(otherItem.getFirst().get(j));
                        if (cmp != 0) {
                            break;
                        }
                    }
                }
            }
            if (cmp <= 0) {
                patterns.add(otherItem);
                if (cmp == 0) {
                    myItem = null;
                }
                otherItem = null;
            } else if (cmp > 0) {
                patterns.add(myItem);
                myItem = null;
            }
        } else if (myItem != null) {
            patterns.add(myItem);
            myItem = null;
        } else if (otherItem != null) {
            patterns.add(otherItem);
            otherItem = null;
        } else {
            break;
        }
    }
    return new TopKStringPatterns(patterns);
}

From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java

License: Apache License

@Override
public void write(DataOutput out) throws IOException {
    out.writeInt(frequentPatterns.size());
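    // Format: number of patterns, then for each pattern its length, support, and items.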
    for (Pair<List<String>, Long> pattern : frequentPatterns) {
        out.writeInt(pattern.getFirst().size());
        out.writeLong(pattern.getSecond());
        for (String item : pattern.getFirst()) {
            out.writeUTF(item);
        }
    }
}