org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapperTest.java Source code

Introduction

Here is the source code for org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapperTest.java. The test's @Before method feeds a sample Wikipedia XML dump through WikipediaMapper into a mock Accumulo instance, populating a metadata table, a document table, a term index, and a reverse term index; testViewAllData() then scans and prints each table.
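
Before the full listing, here is a minimal, self-contained sketch of the write-then-scan pattern on a mock instance that the test is built around. It sticks to the same Accumulo 1.4-era calls the test uses (getConnector with a string password, createBatchWriter with memory/latency/thread arguments); the table name "demo" and the sample cell are illustrative, not part of the original test.

import java.util.Map.Entry;

import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.mock.MockInstance;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.hadoop.io.Text;

public class MockInstanceSketch {
    public static void main(String[] args) throws Exception {
        // In-memory Accumulo; nothing touches disk or a running cluster.
        MockInstance instance = new MockInstance();
        Connector conn = instance.getConnector("root", "pass");
        conn.tableOperations().create("demo");

        // Same createBatchWriter signature the test uses: max memory, max latency, write threads.
        BatchWriter writer = conn.createBatchWriter("demo", 1000L, 1000L, 1);
        Mutation m = new Mutation(new Text("row1"));
        m.put(new Text("colf"), new Text("colq"), new Value("hello".getBytes()));
        writer.addMutation(m);
        writer.close();

        // Scan the table back, as debugQuery() does in the test below.
        Scanner scanner = conn.createScanner("demo", new Authorizations());
        for (Entry<Key, Value> entry : scanner) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}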

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.examples.wikisearch.ingest;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map.Entry;

import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.mock.MockInstance;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Loads sample Wikipedia data into a mock Accumulo instance via
 * {@link WikipediaMapper} and prints the contents of the resulting tables.
 */
public class WikipediaMapperTest {

    private static final String METADATA_TABLE_NAME = "wikiMetadata";

    private static final String TABLE_NAME = "wiki";

    private static final String INDEX_TABLE_NAME = "wikiIndex";

    private static final String RINDEX_TABLE_NAME = "wikiReverseIndex";

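    /**
     * Routes each mutation the mapper emits to the BatchWriter registered for
     * its destination table; the map output key carries the table name.
     */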
    private class MockAccumuloRecordWriter extends RecordWriter<Text, Mutation> {
        @Override
        public void write(Text key, Mutation value) throws IOException, InterruptedException {
            try {
                writerMap.get(key).addMutation(value);
            } catch (MutationsRejectedException e) {
                throw new IOException("Error adding mutation", e);
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                for (BatchWriter w : writerMap.values()) {
                    w.flush();
                    w.close();
                }
            } catch (MutationsRejectedException e) {
                throw new IOException("Error closing Batch Writer", e);
            }
        }

    }

    private Connector c = null;
    private Configuration conf = new Configuration();
    private HashMap<Text, BatchWriter> writerMap = new HashMap<Text, BatchWriter>();

    @Before
    public void setup() throws Exception {

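        // Each mapper record is one <page>...</page> element; a single partition and group keeps the test output in one shard.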
        conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
        conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
        conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
        conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
        conf.set(WikipediaConfiguration.NUM_GROUPS, "1");

        MockInstance i = new MockInstance();
        c = i.getConnector("root", "pass");
        // Reset state: drop any tables left over from a previous run, then recreate them.
        // (An unguarded delete would throw TableNotFoundException on a fresh instance.)
        for (String table : new String[] {METADATA_TABLE_NAME, TABLE_NAME, INDEX_TABLE_NAME, RINDEX_TABLE_NAME}) {
            if (c.tableOperations().exists(table))
                c.tableOperations().delete(table);
            c.tableOperations().create(table);
        }

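        // One BatchWriter per table; MockAccumuloRecordWriter looks them up by table name.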
        writerMap.put(new Text(METADATA_TABLE_NAME), c.createBatchWriter(METADATA_TABLE_NAME, 1000L, 1000L, 1));
        writerMap.put(new Text(TABLE_NAME), c.createBatchWriter(TABLE_NAME, 1000L, 1000L, 1));
        writerMap.put(new Text(INDEX_TABLE_NAME), c.createBatchWriter(INDEX_TABLE_NAME, 1000L, 1000L, 1));
        writerMap.put(new Text(RINDEX_TABLE_NAME), c.createBatchWriter(RINDEX_TABLE_NAME, 1000L, 1000L, 1));

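        // Pre-Hadoop-2 mapreduce API: TaskAttemptContext (and Mapper.Context below) are concrete classes that can be instantiated directly.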
        TaskAttemptID id = new TaskAttemptID();
        TaskAttemptContext context = new TaskAttemptContext(conf, id);

        RawLocalFileSystem fs = new RawLocalFileSystem();
        fs.setConf(conf);

        URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
        Assert.assertNotNull(url);
        File data = new File(url.toURI());
        Path tmpFile = new Path(data.getAbsolutePath());

        // Set up the mapper's inputs: a split covering the whole file, the aggregating record reader, an output committer, and the mock record writer.
        InputSplit split = new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null);
        AggregatingRecordReader rr = new AggregatingRecordReader();
        Path ocPath = new Path(tmpFile, "oc");
        OutputCommitter oc = new FileOutputCommitter(ocPath, context);
        fs.deleteOnExit(ocPath);
        StandaloneStatusReporter sr = new StandaloneStatusReporter();
        rr.initialize(split, context);
        MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
        WikipediaMapper mapper = new WikipediaMapper();

        // Load data into Mock Accumulo
        Mapper<LongWritable, Text, Text, Mutation>.Context con = mapper.new Context(conf, id, rr, rw, oc, sr,
                split);
        mapper.run(con);

        // Flush and close record writers.
        rw.close(context);

    }

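    /** Scans the given table with the "all" authorization and prints every entry. */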
    private void debugQuery(String tableName) throws Exception {
        Scanner s = c.createScanner(tableName, new Authorizations("all"));
        Range r = new Range();
        s.setRange(r);
        for (Entry<Key, Value> entry : s) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }

    @Test
    public void testViewAllData() throws Exception {
        debugQuery(METADATA_TABLE_NAME);
        debugQuery(TABLE_NAME);
        debugQuery(INDEX_TABLE_NAME);
        debugQuery(RINDEX_TABLE_NAME);
    }
}
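
To exercise the test outside an IDE or build tool, one option is JUnit 4's programmatic runner. This is a sketch: the runner class name is illustrative, and it assumes WikipediaMapperTest, its dependencies, and the enwiki-20110901-001.xml resource it reads are all on the classpath, in the same package.

import org.junit.runner.JUnitCore;
import org.junit.runner.Result;
import org.junit.runner.notification.Failure;

public class RunWikipediaMapperTest {
    public static void main(String[] args) {
        // Runs the @Before setup() and then testViewAllData().
        Result result = JUnitCore.runClasses(WikipediaMapperTest.class);
        for (Failure failure : result.getFailures()) {
            System.out.println(failure);
        }
        System.out.println("Success: " + result.wasSuccessful());
    }
}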