Java tutorial
/** * Copyright [2012] [Datasalt Systems S.L.] * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.datasalt.pangool.examples.topicalwordcount; import com.datasalt.pangool.io.ITuple; import com.datasalt.pangool.io.Tuple; import com.datasalt.pangool.io.TupleFile; import com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.ToolRunner; import org.junit.Test; import java.io.IOException; import static org.junit.Assert.assertEquals; public class TestTopicFingerprint extends AbstractHadoopTestLibrary { public final static String INPUT = TestTopicFingerprint.class.getName() + "-input"; public final static String OUTPUT = TestTopicFingerprint.class.getName() + "-output"; @Test public void test() throws Exception { trash(OUTPUT); Configuration conf = new Configuration(); createInput(INPUT, conf); ToolRunner.run(getConf(), new TopicFingerprint(), new String[] { INPUT, OUTPUT, 2 + "" }); Path outPath = new Path(OUTPUT + "/part-r-00000"); FileSystem fs = FileSystem.get(outPath.toUri(), conf); TupleFile.Reader reader = new TupleFile.Reader(fs, conf, outPath); Tuple tuple = new Tuple(reader.getSchema()); // The order in the output file is deterministic (we have sorted by topic, count) reader.next(tuple); assertEquals(1, tuple.get("topic")); assertEquals("a", tuple.get("word").toString()); reader.next(tuple); assertEquals(1, tuple.get("topic")); assertEquals("c", tuple.get("word").toString()); reader.next(tuple); assertEquals(2, tuple.get("topic")); assertEquals("a", tuple.get("word").toString()); reader.next(tuple); assertEquals(2, tuple.get("topic")); assertEquals("b", tuple.get("word").toString()); // Check the named output reader.close(); outPath = new Path(OUTPUT + "/" + TopicFingerprint.OUTPUT_TOTALCOUNT + "/" + "part-r-00000"); reader = new TupleFile.Reader(fs, conf, outPath); tuple = new Tuple(reader.getSchema()); reader.next(tuple); assertEquals(1, tuple.get("topic")); assertEquals(15, tuple.get("totalcount")); reader.next(tuple); assertEquals(2, tuple.get("topic")); assertEquals(19, tuple.get("totalcount")); reader.close(); trash(INPUT, OUTPUT); } public void createInput(String input, Configuration conf) throws IOException, InterruptedException { Path inPath = new Path(input); FileSystem fs = FileSystem.get(inPath.toUri(), conf); TupleFile.Writer writer = new TupleFile.Writer(fs, conf, inPath, TopicalWordCount.getSchema()); // Topic 1, words: { a, 10 } { b, 1 } , { c, 5 } // Top 2 words = a(10), c(5) ITuple tuple = new Tuple(TopicalWordCount.getSchema()); tuple.set("word", "a"); tuple.set("topic", 1); tuple.set("count", 10); writer.append(tuple); tuple.set("word", "b"); tuple.set("topic", 1); tuple.set("count", 1); writer.append(tuple); tuple.set("word", "c"); tuple.set("topic", 1); tuple.set("count", 5); writer.append(tuple); // Topic 2, words: { a, 10 } { b, 9 } , { c, 5 } // Top 2 words = a(10), b(9) tuple.set("word", "a"); tuple.set("topic", 2); tuple.set("count", 10); writer.append(tuple); tuple.set("word", "b"); tuple.set("topic", 2); tuple.set("count", 9); writer.append(tuple); tuple.set("word", "c"); tuple.set("topic", 2); tuple.set("count", 5); writer.append(tuple); writer.close(); } }