ch.usi.inf.lidr.merging.SSL.java Source code

Introduction

Here is the source code for ch.usi.inf.lidr.merging.SSL.java
Source

/*
 * Copyright (C) 2013  Ilya Markov
 * 
 * Full copyright notice can be found in LICENSE. 
 */
package ch.usi.inf.lidr.merging;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.math.stat.regression.SimpleRegression;

import ch.usi.inf.lidr.utils.ScoredEntity;

/**
 * The semi-supervised learning (SSL) results merging/score normalization algorithm.
 * SSL first learns the transformation function between source-specific
 * and centralized (normalized) document scores
 * based on a set of training data-points.
 * Then this function is applied to source-specific document scores,
 * thus normalizing them.
 * 
 * <p>
 * This implementation assumes that each search engine
 * uses its own scoring function,
 * possibly different from scoring functions of other search engines
 * (this corresponds to the "multiple search engine types" case in the paper, section 3.3).
 * </p>
 * 
 * @author Ilya Markov
 * 
 * @see "A semisupervised learning method to merge search engine results",
 *       Luo Si and Jamie Callan.
 *       <i>ACM Transactions on Information Systems</i>, pages 457-491, 2003.
 */
public final class SSL implements ResultsMerging {

    /**
     * The mapping between sample documents and their centralized (normalized) scores for a given query.
     * Used for training SSL.
     * 
     * <p>
     * <b>IMPORTANT:</b> the mapping must be set for each query using {@link #setSampleDocuments(List)}
     * before normalizing document scores.
     * </p>
     * 
     * @see #setSampleDocuments(List)
     */
    private final Map<Object, Double> centrScores = new HashMap<Object, Double>();

    /**
     * Sets the list of sampled documents.
     * This list must contain documents,
     * sampled from all sources of information/search engines.
     * Scores of all documents must be calculated by one
     * single scoring function for a given query. 
     * 
     * <p>
     * In other words, the following steps must be performed
     * in order to obtain the <code>sampleScoredDocs</code> list.
     * <ol>
     * <li>Sample a number of documents from each
     * source of information/search engine.</li>
     * <li>Create a single index out of all these documents.</li>
     * <li>Run a given query on this index.</li>
     * <li>Wrap obtained results into a list of {@link ScoredEntity} objects.</li>
     * <li>Pass this list to the current method.</li>
     * </ol>
     * </p>
     * 
     * <p>
     * <b>IMPORTANT:</b> this method must be called for each query
     * before performing normalization.
     * </p>
     * 
     * @param sampleScoredDocs The list of documents,
     *       sampled from all sources of information/search engines,
     *       with scores calculated by one single scoring function.
     * 
     * @throws NullPointerException
     *       if <code>sampleScoredDocs</code> is <code>null</code>.
     * 
     * @see #centrScores
     */
    public void setSampleDocuments(List<ScoredEntity<Object>> sampleScoredDocs) {
        if (sampleScoredDocs == null) {
            throw new NullPointerException("The list of sample scored documents is null.");
        }

        for (ScoredEntity<Object> scoredDocument : sampleScoredDocs) {
            centrScores.put(scoredDocument.getEntity(), scoredDocument.getScore());
        }
    }

    /**
     * <b>IMPORTANT:</b> {@link #setSampleDocuments(List)} must be called before running normalization.
     * 
     * @see ch.usi.inf.lidr.norm.ScoreNormalization#normalize(List<ScoredEntity<T>>)
     * @see #setSampleDocuments(List)
     */
    @Override
    public <T> List<ScoredEntity<T>> normalize(List<ScoredEntity<T>> unnormScoredDocs) {
        if (unnormScoredDocs == null) {
            throw new NullPointerException("The list of scored documents is null.");
        }

        SimpleRegression regression = getRegression(unnormScoredDocs);
        //TODO: backup with CORI
        if (regression.getN() < 3) {
            return new ArrayList<ScoredEntity<T>>();
        }

        List<ScoredEntity<T>> normScoredDocs = new ArrayList<ScoredEntity<T>>(unnormScoredDocs.size());
        for (int i = 0; i < unnormScoredDocs.size(); i++) {
            ScoredEntity<T> normScoredDoc = new ScoredEntity<T>(unnormScoredDocs.get(i).getEntity(),
                    regression.getSlope() * unnormScoredDocs.get(i).getScore() + regression.getIntercept());
            normScoredDocs.add(normScoredDoc);
        }

        return normScoredDocs;
    }

    /**
     * Creates and returns a {@link SimpleRegression}
     * for a given list of scored documents <code>scoredDocs</code>.
     * This regression maps unnormalized scores in <code>scoredDocs</code>
     * to normalized/centralized scores in <code>centrScores</code>.
     * Documents that appear both in <code>scoredDocs</code>
     * and <code>centrScores</code> are used as a training for the regression.
     * According to the original paper, only first 10
     * documents are considered for training.
     * 
     * @param scoredDocs The list of scored documents.
     * 
     * @return The {@link SimpleRegression} with filled-in training data.
     */
    private <T> SimpleRegression getRegression(List<ScoredEntity<T>> scoredDocs) {
        SimpleRegression regression = new SimpleRegression();

        Set<Double> xData = new HashSet<Double>();
        for (ScoredEntity<T> scoredDocument : scoredDocs) {
            Object docId = scoredDocument.getEntity();
            double specificScore = scoredDocument.getScore();

            if (centrScores.containsKey(docId) && !xData.contains(specificScore)) {
                regression.addData(specificScore, centrScores.get(docId));
                xData.add(specificScore);

                if (regression.getN() >= 10) {
                    return regression;
                }
            }
        }

        return regression;
    }
}