List of usage examples for org.apache.mahout.common.Pair.toString()
@Override
public String toString()
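Before the file excerpts, here is a minimal, self-contained sketch of what Pair.toString() produces on its own. The demo class and sample values are hypothetical; the exact rendering (a tuple-like form of the two elements) may vary by Mahout version.

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.common.Pair;

public class PairToStringDemo {
    public static void main(String[] args) {
        // A (pattern, support-count) pair like those used in the excerpts below.
        Pair<List<String>, Long> pattern =
                new Pair<>(Arrays.asList("milk", "bread"), 42L);
        // Pair.toString() renders both elements, e.g. something like ([milk, bread],42).
        System.out.println(pattern.toString());
    }
}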
From source file: com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns.java
License: Apache License
@Override
public String toString() {
    StringBuilder sb = new StringBuilder();
    String sep = "";
    // Concatenate each frequent (pattern, support-count) pair, separated by ", ".
    for (Pair<List<String>, Long> pattern : frequentPatterns) {
        sb.append(sep);
        sb.append(pattern.toString());
        sep = ", ";
    }
    return sb.toString();
}
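A possible way to exercise this method, assuming the copied class keeps stock Mahout's constructor that takes a collection of patterns (the demo class and sample data are made up):

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.common.Pair;
import com.cg.mapreduce.fpgrowth.mahout.fpm.convertors.string.TopKStringPatterns;

public class TopKStringPatternsDemo {
    public static void main(String[] args) {
        List<Pair<List<String>, Long>> patterns = Arrays.asList(
                new Pair<>(Arrays.asList("milk", "bread"), 42L),
                new Pair<>(Arrays.asList("milk", "butter"), 17L));
        // toString() above joins each Pair's own toString() with ", ".
        System.out.println(new TopKStringPatterns(patterns));
    }
}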
From source file: parse_wikipedia.ParseWikipedia.java
License: Apache License
public static void runJob(String input, String output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // Split the XML dump on <page> elements so each map call sees one article.
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                    + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    log.info("Input: {} Out: {}", input, output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(WikipediaParser.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(ParseWikipedia.class);

    // Remove any previous output, then run the parse job to completion.
    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // TF-IDF: tokenize the parsed pages, build term-frequency vectors,
    // compute document frequencies, and convert to TF-IDF vectors.
    Path tokenizedDocumentsPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    Path termFrequencyVectorsPath = new Path(output, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
    Path tfidfPath = new Path(outPath, "tfidf");
    DocumentProcessor.tokenizeDocuments(outPath, StandardAnalyzer.class, tokenizedDocumentsPath, conf);
    DictionaryVectorizer.createTermFrequencyVectors(tokenizedDocumentsPath, outPath,
            DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf, 1, 1, 0.0f,
            PartialVectorMerger.NO_NORMALIZING, true, 1, 100, false, false);
    Pair<Long[], List<Path>> documentFrequencies =
            TFIDFConverter.calculateDF(termFrequencyVectorsPath, tfidfPath, conf, 100);
    // Pair.toString() here prints the document-frequency counts and the partial-vector paths.
    System.out.println(documentFrequencies.toString());
    TFIDFConverter.processTfIdf(termFrequencyVectorsPath, tfidfPath, conf, documentFrequencies,
            1, 100, PartialVectorMerger.NO_NORMALIZING, false, false, false, 1);
}
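A hypothetical driver for this job; the class name and paths below are placeholders, not from the original source:

public class ParseWikipediaDriver {
    public static void main(String[] args) throws Exception {
        // Parse a Wikipedia XML dump into sequence files, then build TF-IDF vectors.
        ParseWikipedia.runJob("/data/wikipedia/enwiki-pages.xml", "/data/wikipedia/output");
    }
}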