org.apache.hadoop.contrib.index.mapred.TestIndexUpdater.java Source code

Introduction

Here is the source code for org.apache.hadoop.contrib.index.mapred.TestIndexUpdater.java, a JUnit test that brings up in-process mini DFS and MapReduce clusters, runs the contrib index updater job several times, and verifies the Lucene index shards it produces.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.contrib.index.mapred;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

import junit.framework.TestCase;

public class TestIndexUpdater extends TestCase {

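    // Formats shard indices as zero-padded five-digit directory names (00000, 00001, ...).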
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
        NUMBER_FORMAT.setMinimumIntegerDigits(5);
        NUMBER_FORMAT.setGroupingUsed(false);
    }

    // Note: "we only allow 0 or 1 reducer in local mode" (LocalJobRunner), so
    // this test runs against mini DFS and MapReduce clusters rather than in
    // local mode.
    private Configuration conf;
    private Path localInputPath = new Path(System.getProperty("build.test") + "/sample/data.txt");
    private Path inputPath = new Path("/myexample/data.txt");
    private Path outputPath = new Path("/myoutput");
    private Path indexPath = new Path("/myindex");
    private int initNumShards = 3;
    private int numMapTasks = 5;

    private int numDataNodes = 3;
    private int numTaskTrackers = 3;

    private int numRuns = 3;
    private int numDocsPerRun = 10; // num of docs in local input path

    private FileSystem fs;
    private MiniDFSCluster dfsCluster;
    private MiniMRCluster mrCluster;

    public TestIndexUpdater() throws IOException {
        super();
        if (System.getProperty("hadoop.log.dir") == null) {
            String base = new File(".").getPath(); // getAbsolutePath();
            System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs");
        }
        conf = new Configuration();
    }

    protected void setUp() throws Exception {
        super.setUp();
        try {
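            // Bring up an in-process HDFS cluster with numDataNodes data nodes.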
            dfsCluster = new MiniDFSCluster(conf, numDataNodes, true, (String[]) null);

            fs = dfsCluster.getFileSystem();
            if (fs.exists(inputPath)) {
                fs.delete(inputPath, true);
            }
            fs.copyFromLocalFile(localInputPath, inputPath);

            if (fs.exists(outputPath)) {
                // do not create the output dir; the MapReduce job creates it
                fs.delete(outputPath, true);
            }

            if (fs.exists(indexPath)) {
                fs.delete(indexPath, true);
            }

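            // Start an in-process MapReduce cluster, with numTaskTrackers task
            // trackers, on top of the mini DFS.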
            mrCluster = new MiniMRCluster(numTaskTrackers, fs.getUri().toString(), 1);

        } catch (IOException e) {
            if (dfsCluster != null) {
                dfsCluster.shutdown();
                dfsCluster = null;
            }

            if (fs != null) {
                fs.close();
                fs = null;
            }

            if (mrCluster != null) {
                mrCluster.shutdown();
                mrCluster = null;
            }

            throw e;
        }

    }

    protected void tearDown() throws Exception {
        if (dfsCluster != null) {
            dfsCluster.shutdown();
            dfsCluster = null;
        }

        if (fs != null) {
            fs.close();
            fs = null;
        }

        if (mrCluster != null) {
            mrCluster.shutdown();
            mrCluster = null;
        }

        super.tearDown();
    }

    public void testIndexUpdater() throws IOException {
        IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
        // max field length, compound file and number of segments will be checked
        // later
        iconf.setIndexMaxFieldLength(2);
        iconf.setIndexUseCompoundFile(true);
        iconf.setIndexMaxNumSegments(1);

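        // A version number and generation of -1 indicate shards with no
        // existing index commit.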
        long versionNumber = -1;
        long generation = -1;

        for (int i = 0; i < numRuns; i++) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }

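            // Each run works with one more shard than the previous run.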
            Shard[] shards = new Shard[initNumShards + i];
            for (int j = 0; j < shards.length; j++) {
                shards[j] = new Shard(versionNumber, new Path(indexPath, NUMBER_FORMAT.format(j)).toString(),
                        generation);
            }
            run(i + 1, shards);
        }
    }

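    // Note: the numRuns parameter shadows the field of the same name; here it
    // is the number of completed runs, including the current one.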
    private void run(int numRuns, Shard[] shards) throws IOException {
        IIndexUpdater updater = new IndexUpdater();
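        // Run the index update MapReduce job over the input, updating all shards.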
        updater.run(conf, new Path[] { inputPath }, outputPath, numMapTasks, shards);

        // verify the done files: the reducer leaves one done file per shard
        Path[] doneFileNames = new Path[shards.length];
        int count = 0;
        FileStatus[] fileStatus = fs.listStatus(outputPath);
        for (int i = 0; i < fileStatus.length; i++) {
            FileStatus[] doneFiles = fs.listStatus(fileStatus[i].getPath());
            for (int j = 0; j < doneFiles.length; j++) {
                doneFileNames[count++] = doneFiles[j].getPath();
            }
        }
        assertEquals(shards.length, count);
        for (int i = 0; i < count; i++) {
            assertTrue(doneFileNames[i].getName().startsWith(IndexUpdateReducer.DONE.toString()));
        }

        // verify the index
        IndexReader[] readers = new IndexReader[shards.length];
        for (int i = 0; i < shards.length; i++) {
            Directory dir = new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), false, conf);
            readers[i] = IndexReader.open(dir);
        }

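        // A MultiReader lets a single searcher query all shards at once.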
        IndexReader reader = new MultiReader(readers);
        IndexSearcher searcher = new IndexSearcher(reader);
        Hits hits = searcher.search(new TermQuery(new Term("content", "apache")));

        assertEquals(numRuns * numDocsPerRun, hits.length());

        int[] counts = new int[numDocsPerRun];
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            counts[Integer.parseInt(doc.get("id"))]++;
        }

        for (int i = 0; i < numDocsPerRun; i++) {
            assertEquals(numRuns, counts[i]);
        }

        // max field length is 2, so "dot" is also indexed but not "org"
        hits = searcher.search(new TermQuery(new Term("content", "dot")));
        assertEquals(numRuns, hits.length());

        hits = searcher.search(new TermQuery(new Term("content", "org")));
        assertEquals(0, hits.length());

        searcher.close();
        reader.close();

        // open and close an index writer with KeepOnlyLastCommitDeletionPolicy
        // to remove earlier checkpoints
        for (int i = 0; i < shards.length; i++) {
            Directory dir = new FileSystemDirectory(fs, new Path(shards[i].getDirectory()), false, conf);
            IndexWriter writer = new IndexWriter(dir, false, null, new KeepOnlyLastCommitDeletionPolicy());
            writer.close();
        }

        // verify the number of segments; this must be done after a writer with
        // KeepOnlyLastCommitDeletionPolicy has run, so that earlier checkpoints
        // are removed and, with compound files on and at most one segment, each
        // shard directory holds exactly one .cfs file
        for (int i = 0; i < shards.length; i++) {
            PathFilter cfsFilter = new PathFilter() {
                public boolean accept(Path path) {
                    return path.getName().endsWith(".cfs");
                }
            };
            FileStatus[] cfsFiles = fs.listStatus(new Path(shards[i].getDirectory()), cfsFilter);
            assertEquals(1, cfsFiles.length);
        }
    }

}
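
Example

The assertions on "dot" and "org" above hinge on Lucene's max field length
setting (iconf.setIndexMaxFieldLength(2)): once the analyzer has produced that
many tokens for a field, the rest of the field is dropped from the index. The
standalone sketch below reproduces that effect with the same era of Lucene API
the test uses. The content string "apache dot org extra terms" and the class
name MaxFieldLengthSketch are illustrative assumptions, not the test's actual
input data.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;

public class MaxFieldLengthSketch {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
        writer.setMaxFieldLength(2); // index only the first two tokens per field
        Document doc = new Document();
        // Hypothetical content: only "apache" and "dot" make it into the index.
        doc.add(new Field("content", "apache dot org extra terms",
                Field.Store.NO, Field.Index.TOKENIZED));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(dir);
        Hits hits = searcher.search(new TermQuery(new Term("content", "dot")));
        System.out.println(hits.length()); // 1: "dot" is within the limit
        hits = searcher.search(new TermQuery(new Term("content", "org")));
        System.out.println(hits.length()); // 0: "org" was truncated away
        searcher.close();
    }
}

This mirrors why the query for "org" in the test above is expected to return
zero hits while "dot" still matches.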