List of usage examples for org.apache.mahout.common.Pair.toString()
@Override
public String toString()
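Before the file excerpts, here is a minimal, self-contained sketch of what Pair.toString() produces on its own. The demo class and sample values are hypothetical; the exact rendering (a tuple-like form of the two elements) may vary by Mahout version.

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.common.Pair;

public class PairToStringDemo {
    public static void main(String[] args) {
        // A (pattern, support-count) pair like those used in the excerpts below.
        Pair<List<String>, Long> pattern =
                new Pair<>(Arrays.asList("milk", "bread"), 42L);
        // Pair.toString() renders both elements, e.g. something like ([milk, bread],42).
        System.out.println(pattern.toString());
    }
}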
From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java
License: Apache License
@Override
public String toString() {
    StringBuilder sb = new StringBuilder();
    String sep = "";
    // Concatenate each frequent (pattern, support-count) pair, separated by ", ".
    for (Pair<List<String>, Long> pattern : frequentPatterns) {
        sb.append(sep);
        sb.append(pattern.toString());
        sep = ", ";
    }
    return sb.toString();
}
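A possible way to exercise this method, assuming the copied class keeps stock Mahout's constructor that takes a collection of patterns (the demo class and sample data are made up):

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.common.Pair;
import com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns;

public class TopKStringPatternsDemo {
    public static void main(String[] args) {
        List<Pair<List<String>, Long>> patterns = Arrays.asList(
                new Pair<>(Arrays.asList("milk", "bread"), 42L),
                new Pair<>(Arrays.asList("milk", "butter"), 17L));
        // toString() above joins each Pair's own toString() with ", ".
        System.out.println(new TopKStringPatterns(patterns));
    }
}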
From source file: parse_wikipedia.ParseWikipedia.java
License: Apache License
public static void runJob(String input, String output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // Split the XML dump on <page> elements so each map call sees one article.
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                    + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    log.info("Input: {} Out: {}", input, output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(WikipediaParser.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(ParseWikipedia.class);

    // Remove any previous output, then run the parse job to completion.
    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // TF-IDF: tokenize the parsed pages, build term-frequency vectors,
    // compute document frequencies, and convert to TF-IDF vectors.
    Path tokenizedDocumentsPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    Path termFrequencyVectorsPath = new Path(output, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
    Path tfidfPath = new Path(outPath, "tfidf");
    DocumentProcessor.tokenizeDocuments(outPath, StandardAnalyzer.class, tokenizedDocumentsPath, conf);
    DictionaryVectorizer.createTermFrequencyVectors(tokenizedDocumentsPath, outPath,
            DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf, 1, 1, 0.0f,
            PartialVectorMerger.NO_NORMALIZING, true, 1, 100, false, false);
    Pair<Long[], List<Path>> documentFrequencies =
            TFIDFConverter.calculateDF(termFrequencyVectorsPath, tfidfPath, conf, 100);
    // Pair.toString() here prints the document-frequency counts and the partial-vector paths.
    System.out.println(documentFrequencies.toString());
    TFIDFConverter.processTfIdf(termFrequencyVectorsPath, tfidfPath, conf, documentFrequencies,
            1, 100, PartialVectorMerger.NO_NORMALIZING, false, false, false, 1);
}
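A hypothetical driver for this job; the class name and paths below are placeholders, not from the original source:

public class ParseWikipediaDriver {
    public static void main(String[] args) throws Exception {
        // Parse a Wikipedia XML dump into sequence files, then build TF-IDF vectors.
        ParseWikipedia.runJob("/data/wikipedia/enwiki-pages.xml", "/data/wikipedia/output");
    }
}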