com.datasalt.pangool.examples.topicalwordcount.TestTopicFingerprint.java Source code

Java tutorial

Introduction

Here is the source code for com.datasalt.pangool.examples.topicalwordcount.TestTopicFingerprint.java

Source

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.examples.topicalwordcount;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.io.TupleFile;
import com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * End-to-end test for {@link TopicFingerprint}.
 * <p>
 * Writes a small tuple file of (word, topic, count) records, runs the job with "2" as its third argument, and
 * verifies the records found in the main output file and in the {@link TopicFingerprint#OUTPUT_TOTALCOUNT} named
 * output.
 */
public class TestTopicFingerprint extends AbstractHadoopTestLibrary {

    public final static String INPUT = TestTopicFingerprint.class.getName() + "-input";
    public final static String OUTPUT = TestTopicFingerprint.class.getName() + "-output";

    /**
     * Runs the job over the fixture written by {@link #createInput(String, Configuration)} and checks that each
     * output file contains exactly the expected records, in the expected order.
     *
     * @throws Exception if the fixture cannot be written, the job fails, or an output file cannot be read
     */
    @Test
    public void test() throws Exception {
        trash(OUTPUT);

        Configuration conf = new Configuration();

        createInput(INPUT, conf);
        ToolRunner.run(getConf(), new TopicFingerprint(), new String[] { INPUT, OUTPUT, 2 + "" });

        Path outPath = new Path(OUTPUT + "/part-r-00000");
        FileSystem fs = FileSystem.get(outPath.toUri(), conf);
        TupleFile.Reader reader = new TupleFile.Reader(fs, conf, outPath);
        try {
            Tuple tuple = new Tuple(reader.getSchema());

            // The order in the output file is deterministic (we have sorted by topic, count)
            assertNextWord(reader, tuple, 1, "a");
            assertNextWord(reader, tuple, 1, "c");
            assertNextWord(reader, tuple, 2, "a");
            assertNextWord(reader, tuple, 2, "b");

            // Without this check a job that emitted additional words for the last topic would go unnoticed.
            assertFalse("Unexpected extra record in " + outPath, reader.next(tuple));
        } finally {
            // Close even when an assertion fails so the file handle is not leaked.
            reader.close();
        }

        // Check the named output
        outPath = new Path(OUTPUT + "/" + TopicFingerprint.OUTPUT_TOTALCOUNT + "/" + "part-r-00000");
        reader = new TupleFile.Reader(fs, conf, outPath);
        try {
            Tuple tuple = new Tuple(reader.getSchema());

            assertNextTotalCount(reader, tuple, 1, 15);
            assertNextTotalCount(reader, tuple, 2, 19);

            assertFalse("Unexpected extra record in " + outPath, reader.next(tuple));
        } finally {
            reader.close();
        }

        trash(INPUT, OUTPUT);
    }

    /**
     * Writes the test fixture: three (word, topic, count) records for topic 1 and three for topic 2.
     *
     * @param input path of the tuple file to create
     * @param conf configuration used to resolve the file system and to create the writer
     * @throws IOException if the file cannot be created, written or closed
     * @throws InterruptedException declared for callers; kept so the method signature stays unchanged
     */
    public void createInput(String input, Configuration conf) throws IOException, InterruptedException {
        Path inPath = new Path(input);
        FileSystem fs = FileSystem.get(inPath.toUri(), conf);
        TupleFile.Writer writer = new TupleFile.Writer(fs, conf, inPath, TopicalWordCount.getSchema());
        try {
            // A single tuple instance is reused for every record, as each append happens before the next set.
            ITuple tuple = new Tuple(TopicalWordCount.getSchema());

            // Topic 1, words: { a, 10 } { b, 1 } , { c, 5 }
            // Top 2 words = a(10), c(5)
            appendWordCount(writer, tuple, "a", 1, 10);
            appendWordCount(writer, tuple, "b", 1, 1);
            appendWordCount(writer, tuple, "c", 1, 5);

            // Topic 2, words: { a, 10 } { b, 9 } , { c, 5 }
            // Top 2 words = a(10), b(9)
            appendWordCount(writer, tuple, "a", 2, 10);
            appendWordCount(writer, tuple, "b", 2, 9);
            appendWordCount(writer, tuple, "c", 2, 5);
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            writer.close();
        }
    }

    /** Reads the next record and asserts that it exists and carries the given topic and word. */
    private static void assertNextWord(TupleFile.Reader reader, Tuple tuple, int topic, String word)
            throws IOException {
        assertTrue("Missing record for topic " + topic + ", word " + word, reader.next(tuple));
        assertEquals(topic, tuple.get("topic"));
        assertEquals(word, tuple.get("word").toString());
    }

    /** Reads the next record and asserts that it exists and carries the given topic and total count. */
    private static void assertNextTotalCount(TupleFile.Reader reader, Tuple tuple, int topic, int totalCount)
            throws IOException {
        assertTrue("Missing total count record for topic " + topic, reader.next(tuple));
        assertEquals(topic, tuple.get("topic"));
        assertEquals(totalCount, tuple.get("totalcount"));
    }

    /** Fills the reusable tuple with one (word, topic, count) record and appends it to the writer. */
    private static void appendWordCount(TupleFile.Writer writer, ITuple tuple, String word, int topic, int count)
            throws IOException {
        tuple.set("word", word);
        tuple.set("topic", topic);
        tuple.set("count", count);
        writer.append(tuple);
    }
}