com.yahoo.glimmer.indexing.generator.IndexRecordWriterTest.java Source code

Java tutorial

Introduction

Here is the source code for com.yahoo.glimmer.indexing.generator.IndexRecordWriterTest.java

Source

package com.yahoo.glimmer.indexing.generator;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import it.unimi.di.big.mg4j.index.BitStreamIndex;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.IndexIterator;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.TaskType;
import org.jmock.Expectations;
import org.jmock.Mockery;
import org.jmock.lib.legacy.ClassImposteriser;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.yahoo.glimmer.indexing.RDFDocumentFactory;
import com.yahoo.glimmer.indexing.generator.IndexRecordWriter.OutputFormat;

/**
 * Writes a small, hand-built set of term, posting and document-size records
 * through the {@link RecordWriter} obtained from
 * {@link IndexRecordWriter.OutputFormat}, then re-opens the index files it
 * produced under {@link #INDEX_TMP_DIR} and checks their contents.
 * <p>
 * The same key and value instances are reused for every write and mutated
 * in between.
 */
public class IndexRecordWriterTest {
    private static final Path INDEX_TMP_DIR = new Path("/tmp/IndexRecordWriterTest");
    private Mockery context;
    private TaskInputOutputContext<?, ?, ?, ?> taskContext;
    private Configuration conf;
    private final FileSystem fs = new RawLocalFileSystem();
    private final TaskAttemptID taskAttemptID = new TaskAttemptID("taskId", 8, TaskType.REDUCE, 88, 888);

    /**
     * Creates the mocked task context and a configuration that points the
     * output directory at {@link #INDEX_TMP_DIR} and declares 8 documents.
     */
    @Before
    public void before() throws IOException, URISyntaxException {
        context = new Mockery();
        // TaskInputOutputContext is mocked via the class imposteriser.
        context.setImposteriser(ClassImposteriser.INSTANCE);
        taskContext = context.mock(TaskInputOutputContext.class, "taskContext");
        conf = new Configuration();

        conf.set("mapred.output.dir", INDEX_TMP_DIR.toString());
        conf.setLong(TripleIndexGenerator.NUMBER_OF_DOCUMENTS, 8);

        fs.initialize(new URI("file:///"), new Configuration());
    }

    /**
     * Recursively deletes the indexes written by the test.
     */
    @After
    public void after() throws IOException {
        // fs.deleteOnExit(INDEX_TMP_DIR) was noted not to work, so delete explicitly.
        // Safety net: refuse to do a recursive delete anywhere outside /tmp.
        if (!INDEX_TMP_DIR.toString().startsWith("/tmp/")) {
            throw new AssertionError("Not removing test indexes as they are not in /tmp as expected.");
        }
        fs.delete(INDEX_TMP_DIR, true);
    }

    /**
     * Writes three terms spread over the alignment index and the two field
     * indexes "index0" and "index1", followed by document sizes, and then
     * verifies the three indexes read back from disk.
     */
    @Test
    public void test() throws Exception {
        context.checking(new Expectations() {
            {
                allowing(taskContext).getConfiguration();
                will(returnValue(conf));
                allowing(taskContext).getTaskAttemptID();
                will(returnValue(taskAttemptID));
            }
        });
        OutputFormat outputFormat = new IndexRecordWriter.OutputFormat();

        conf.setStrings("RdfFieldNames", "index0", "index1");
        conf.setEnum("IndexType", RDFDocumentFactory.IndexType.VERTICAL);

        RecordWriter<IntWritable, IndexRecordWriterValue> recordWriter = outputFormat.getRecordWriter(taskContext);

        IntWritable key = new IntWritable();
        IndexRecordWriterTermValue termValue = new IndexRecordWriterTermValue();
        IndexRecordWriterDocValue docValue = new IndexRecordWriterDocValue();
        IndexRecordWriterSizeValue sizeValue = new IndexRecordWriterSizeValue();

        // ---- term1 ----
        termValue.setTerm("term1");
        // ALIGNMENT_INDEX: the alignment index doesn't have positions/counts.
        key.set(DocumentMapper.ALIGNMENT_INDEX);
        writeTerm(recordWriter, key, termValue, 1, 0, 0);
        writeDoc(recordWriter, key, docValue, 0); // term1 occurs in index 0

        // Index 0
        key.set(0);
        writeTerm(recordWriter, key, termValue, 3, 6, 15 + 12 + 18);
        writeDoc(recordWriter, key, docValue, 3, 11, 15);
        writeDoc(recordWriter, key, docValue, 4, 12);
        writeDoc(recordWriter, key, docValue, 7, 14, 17, 18);

        // ---- term2 ----
        termValue.setTerm("term2");
        // ALIGNMENT_INDEX: the alignment index doesn't have positions/counts.
        key.set(DocumentMapper.ALIGNMENT_INDEX);
        writeTerm(recordWriter, key, termValue, 2, 0, 0);
        writeDoc(recordWriter, key, docValue, 0); // term2 occurs in index 0 & 1
        writeDoc(recordWriter, key, docValue, 1); // term2 occurs in index 0 & 1

        // Index 0
        key.set(0);
        writeTerm(recordWriter, key, termValue, 2, 4, 19 + 16);
        writeDoc(recordWriter, key, docValue, 1, 10, 19);
        writeDoc(recordWriter, key, docValue, 7, 13, 16);

        // Index 1
        key.set(1);
        writeTerm(recordWriter, key, termValue, 1, 1, 14);
        writeDoc(recordWriter, key, docValue, 1, 14);

        // ---- term3 ----
        termValue.setTerm("term3");
        // ALIGNMENT_INDEX: the alignment index doesn't have positions/counts.
        key.set(DocumentMapper.ALIGNMENT_INDEX);
        writeTerm(recordWriter, key, termValue, 1, 0, 0);
        // writeDoc() clears the occurrences left over from the index 1 posting
        // above before writing, matching the term1/term2 alignment writes.
        writeDoc(recordWriter, key, docValue, 1); // term3 occurs in index 1

        // Index 1
        key.set(1);
        writeTerm(recordWriter, key, termValue, 1, 2, 11);
        writeDoc(recordWriter, key, docValue, 3, 10, 11);

        // ---- Doc sizes ----
        key.set(0);
        writeSize(recordWriter, key, sizeValue, 0, 3);
        writeSize(recordWriter, key, sizeValue, 3, 1);
        writeSize(recordWriter, key, sizeValue, 4, 10);
        writeSize(recordWriter, key, sizeValue, 6, 2);

        key.set(1);
        writeSize(recordWriter, key, sizeValue, 3, 3);
        writeSize(recordWriter, key, sizeValue, 6, 5);

        recordWriter.close(taskContext);

        // Check the written indexes..

        Path workPath = outputFormat.getDefaultWorkFile(taskContext, "");
        System.out.println("Default work file is " + workPath.toString());
        String dir = workPath.toUri().getPath();

        BitStreamIndex index0 = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/index0", true, true);
        assertEquals(8, index0.numberOfDocuments);
        assertEquals(2, index0.numberOfTerms);
        assertTrue(index0.hasPositions);
        // term1
        checkOccurrences(index0.documents(0), 3, "(3:11,15) (4:12) (7:14,17,18)");
        // term2
        checkOccurrences(index0.documents(1), 2, "(1:10,19) (7:13,16)");
        assertEquals("[3, 0, 0, 1, 10, 0, 2, 0]", index0.sizes.toString());

        BitStreamIndex index1 = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/index1", true, true);
        assertEquals(8, index1.numberOfDocuments);
        assertEquals(2, index1.numberOfTerms);
        assertTrue(index1.hasPositions);
        // term2
        checkOccurrences(index1.documents(0), 1, "(1:14)");
        // term3
        checkOccurrences(index1.documents(1), 1, "(3:10,11)");
        assertEquals("[0, 0, 0, 3, 0, 0, 5, 0]", index1.sizes.toString());

        BitStreamIndex indexAlignment = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/alignment", true);
        assertEquals(8, indexAlignment.numberOfDocuments);
        assertEquals(3, indexAlignment.numberOfTerms);
        assertFalse(indexAlignment.hasPositions);
        // term1
        assertEquals(1, indexAlignment.documents(0).frequency());
        // term2
        assertEquals(2, indexAlignment.documents(1).frequency());
        // term3
        assertEquals(1, indexAlignment.documents(2).frequency());
    }

    /**
     * Sets the statistics on the reused term value (its term text is left as
     * is) and writes it under the given key.
     */
    private static void writeTerm(RecordWriter<IntWritable, IndexRecordWriterValue> writer, IntWritable key,
            IndexRecordWriterTermValue termValue, int termFrequency, int occurrenceCount, int sumOfMaxTermPositions)
            throws IOException, InterruptedException {
        termValue.setTermFrequency(termFrequency);
        termValue.setOccurrenceCount(occurrenceCount);
        termValue.setSumOfMaxTermPositions(sumOfMaxTermPositions);
        writer.write(key, termValue);
    }

    /**
     * Resets the reused doc value to the given document and occurrences
     * (none when {@code occurrences} is empty) and writes it under the given key.
     */
    private static void writeDoc(RecordWriter<IntWritable, IndexRecordWriterValue> writer, IntWritable key,
            IndexRecordWriterDocValue docValue, int document, int... occurrences) throws IOException, InterruptedException {
        docValue.setDocument(document);
        docValue.clearOccerrences();
        for (int occurrence : occurrences) {
            docValue.addOccurrence(occurrence);
        }
        writer.write(key, docValue);
    }

    /**
     * Sets document and size on the reused size value and writes it under the given key.
     */
    private static void writeSize(RecordWriter<IntWritable, IndexRecordWriterValue> writer, IntWritable key,
            IndexRecordWriterSizeValue sizeValue, int document, int size) throws IOException, InterruptedException {
        sizeValue.setDocument(document);
        sizeValue.setSize(size);
        writer.write(key, sizeValue);
    }

    /**
     * Asserts the frequency reported by the iterator and that its postings,
     * rendered as space separated {@code (document:position,position,...)}
     * groups, equal {@code expected}.
     *
     * @param documents
     *            iterator over one term's postings; it is consumed by this call.
     * @param expectedFrequency
     *            value {@code documents.frequency()} must return.
     * @param expected
     *            expected rendering of all postings.
     */
    private void checkOccurrences(IndexIterator documents, int expectedFrequency, String expected) throws IOException {
        assertEquals(expectedFrequency, documents.frequency());
        StringBuilder actual = new StringBuilder();
        long document;
        // Iterate until the end-of-list marker rather than on mayHaveNext(),
        // which is allowed to report false positives.
        while ((document = documents.nextDocument()) != IndexIterator.END_OF_LIST) {
            if (actual.length() > 0) {
                actual.append(' ');
            }
            actual.append('(').append(document).append(':');
            int position;
            boolean first = true;
            while ((position = documents.nextPosition()) != IndexIterator.END_OF_POSITIONS) {
                if (first) {
                    first = false;
                } else {
                    actual.append(',');
                }
                actual.append(position);
            }
            actual.append(')');
        }
        assertEquals(expected, actual.toString());
    }

}