org.meresco.lucene.MerescoClustererTest.java Source code

Java tutorial

Introduction

Here is the source code for org.meresco.lucene.MerescoClustererTest.java

Source

/* begin license *
 *
 * "Meresco Lucene" is a set of components and tools to integrate Lucene (based on PyLucene) into Meresco
 *
 * Copyright (C) 2015 Koninklijke Bibliotheek (KB) http://www.kb.nl
 * Copyright (C) 2015-2016 Seecr (Seek You Too B.V.) http://seecr.nl
 *
 * This file is part of "Meresco Lucene"
 *
 * "Meresco Lucene" is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * "Meresco Lucene" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "Meresco Lucene"; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * end license */

package org.meresco.lucene;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.meresco.lucene.Lucene.UninitializedException;
import org.meresco.lucene.search.InterpolateEpsilon;
import org.meresco.lucene.search.MerescoCluster;
import org.meresco.lucene.search.MerescoCluster.DocScore;
import org.meresco.lucene.search.MerescoClusterer;

/**
 * Tests for {@link MerescoClusterer}, run against a small index of fifteen documents that fall
 * into three groups of five with identical text per group.
 *
 * <p>The tests address documents by Lucene doc id (0..14), not by the {@code "id:N"} identifier
 * used when indexing. The expected values below show the two do not line up in insertion order
 * (for example, doc id 0 is expected in the "something else" group), which is why the
 * multi-strategy test resolves identifiers through the index reader.
 */
public class MerescoClustererTest extends SeecrTestCase {
    /** Name of the only field that is indexed; it is stored with term vectors. */
    private static final String FIELD = "termvector.field";

    /** Total number of documents added in {@link #setUp()}. */
    private static final int DOC_COUNT = 15;

    private Lucene lucene;

    /**
     * Creates a fresh index in the test's temporary directory and fills it with three groups of
     * five documents each.
     */
    @Override
    @Before
    public void setUp() throws Exception {
        super.setUp();

        LuceneSettings settings = new LuceneSettings();
        settings.commitCount = 1;
        lucene = new Lucene(this.tmpDir, settings);
        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
        fieldType.setStoreTermVectors(true);

        addDocuments(0, 5, "aap noot noot noot vuur", fieldType);
        addDocuments(5, 10, "something else", fieldType);
        addDocuments(10, 15, "iets anders", fieldType);
        lucene.maybeCommitAfterUpdate();
    }

    /**
     * Closes the index and then runs the base-class cleanup.
     *
     * <p>The base-class cleanup runs even when closing the index fails, and the close is skipped
     * when {@link #setUp()} failed before the index was created (JUnit still invokes
     * {@code @After} methods in that case).
     */
    @Override
    @After
    public void tearDown() throws Exception {
        try {
            if (lucene != null) {
                lucene.close();
            }
        } finally {
            super.tearDown();
        }
    }

    /** Clusters all documents with a single unfiltered strategy and checks each group's top terms. */
    @Test
    public void testClusterOnTermVectors() throws IOException, UninitializedException {
        ClusterConfig clusterConfig = new ClusterConfig()
                .addStrategy(new ClusterStrategy(0.5, 1).addField(FIELD, 1.0, null));
        MerescoClusterer merescoClusterer = new MerescoClusterer(getIndexReader(), clusterConfig);
        collectAllAndFinish(merescoClusterer);

        MerescoCluster cluster1 = merescoClusterer.cluster(0);
        assertEquals(5, cluster1.topDocs.length);
        assertEquals(2, cluster1.topTerms.length);
        assertArrayEquals(new String[] { "else", "something" }, topTermsOf(cluster1));

        MerescoCluster cluster2 = merescoClusterer.cluster(5);
        assertEquals(5, cluster2.topDocs.length);
        assertArrayEquals(new String[] { "noot", "aap", "vuur" }, topTermsOf(cluster2));

        MerescoCluster cluster3 = merescoClusterer.cluster(10);
        assertEquals(5, cluster3.topDocs.length);
        assertArrayEquals(new String[] { "anders", "iets" }, topTermsOf(cluster3));

        assertNotSame(cluster1.topDocs, cluster2.topDocs);
        assertNotSame(cluster1.topDocs, cluster3.topDocs);
    }

    /**
     * Clusters with a strategy whose field is filtered on "noot": only the documents containing
     * that term are expected to end up in a cluster; the other doc ids yield no cluster at all.
     */
    @Test
    public void testClusteringWithFieldFilter() throws IOException, UninitializedException {
        ClusterConfig clusterConfig = new ClusterConfig()
                .addStrategy(new ClusterStrategy(0.5, 1).addField(FIELD, 1.0, "noot"));
        MerescoClusterer merescoClusterer = new MerescoClusterer(getIndexReader(), clusterConfig);
        collectAllAndFinish(merescoClusterer);

        assertNull(merescoClusterer.cluster(0));

        MerescoCluster cluster2 = merescoClusterer.cluster(5);
        assertEquals(5, cluster2.topDocs.length);
        assertArrayEquals(new String[] { "noot", "aap", "vuur" }, topTermsOf(cluster2));

        assertNull(merescoClusterer.cluster(10));
    }

    /**
     * Clusters with three strategies (two filtered, one unfiltered) and a custom
     * {@link InterpolateEpsilon} that verifies the arguments it receives. Every document must end
     * up in a cluster that contains exactly the five documents of its own group.
     */
    @Test
    public void testClusteringOnVectorsMultipleStrategies() throws Exception {
        ClusterConfig clusterConfig = new ClusterConfig(42);
        clusterConfig.addStrategy(new ClusterStrategy(0.5, 2).addField(FIELD, 1.0, "vuur"));
        clusterConfig.addStrategy(new ClusterStrategy(0.4, 1).addField(FIELD, 1.0, null));
        clusterConfig.addStrategy(new ClusterStrategy(0.4, 2).addField(FIELD, 1.0, "anders"));

        // Pass-through interpolation that checks the values handed in by the clusterer:
        // hits/sliceSize come from the constructor below, 42 from the ClusterConfig above.
        InterpolateEpsilon interpolateEpsilon = new InterpolateEpsilon() {
            @Override
            public double interpolateEpsilon(int hits, int sliceSize, double clusteringEps,
                    int clusterMoreRecords) {
                assertEquals(100, hits);
                assertEquals(10, sliceSize);
                assertTrue(clusteringEps >= 0.4);
                assertEquals(42, clusterMoreRecords);
                return clusteringEps;
            }
        };
        IndexReader indexReader = getIndexReader();
        MerescoClusterer merescoClusterer = new MerescoClusterer(indexReader, clusterConfig, interpolateEpsilon,
                100, 10);
        collectAllAndFinish(merescoClusterer);

        Set<String> firstGroup = new HashSet<>(Arrays.asList("id:4", "id:0", "id:1", "id:2", "id:3"));
        Set<String> secondGroup = new HashSet<>(Arrays.asList("id:8", "id:7", "id:6", "id:5", "id:9"));
        Set<String> thirdGroup = new HashSet<>(Arrays.asList("id:10", "id:11", "id:12", "id:13", "id:14"));

        assertEquals(3, merescoClusterer.clusters.size());
        for (int docId = 0; docId < DOC_COUNT; docId++) {
            String theID = indexReader.document(docId).get(Lucene.ID_FIELD);
            MerescoCluster cluster = merescoClusterer.cluster(docId);
            Set<String> ids = new HashSet<>();
            for (DocScore ds : cluster.topDocs) {
                ids.add(indexReader.document(ds.docId).get(Lucene.ID_FIELD));
            }
            assertTrue(ids.contains(theID));

            // The numeric suffix of the identifier decides which group the document belongs to.
            int idOrd = Integer.parseInt(theID.split(":")[1]);
            if (0 <= idOrd && idOrd <= 4) {
                assertEquals(firstGroup, ids);
            } else if (5 <= idOrd && idOrd <= 9) {
                assertEquals(secondGroup, ids);
            } else {
                assertEquals(thirdGroup, ids);
            }
        }
    }

    /**
     * Adds one document per identifier {@code "id:" + i} for {@code from <= i < to}, each with
     * the given text in the term-vector field.
     */
    private void addDocuments(int from, int to, String text, FieldType fieldType) throws Exception {
        for (int i = from; i < to; i++) {
            Document doc = new Document();
            doc.add(new Field(FIELD, text, fieldType));
            lucene.addDocument("id:" + i, doc);
        }
    }

    /** Feeds doc ids 0..{@link #DOC_COUNT}-1 to the clusterer and then finishes it. */
    private static void collectAllAndFinish(MerescoClusterer clusterer)
            throws IOException, UninitializedException {
        for (int docId = 0; docId < DOC_COUNT; docId++) {
            clusterer.collect(docId);
        }
        clusterer.finish();
    }

    /** Returns the term strings of the cluster's top terms, in the order the cluster reports them. */
    private static String[] topTermsOf(MerescoCluster cluster) {
        String[] terms = new String[cluster.topTerms.length];
        for (int i = 0; i < terms.length; i++) {
            terms[i] = cluster.topTerms[i].term;
        }
        return terms;
    }

    /**
     * Returns the index reader of a searcher acquired from the index's manager.
     *
     * <p>NOTE(review): the object returned by {@code acquire()} is never handed back to the
     * manager here. If the manager is reference counted this leaks one reference per call --
     * confirm and release it once the reader is no longer needed.
     */
    private IndexReader getIndexReader() throws IOException, UninitializedException {
        return lucene.data.getManager().acquire().searcher.getIndexReader();
    }
}