edu.txstate.dmlab.clusteringwiki.preprocess.CollectionContext.java Source code

Introduction

Here is the source code for edu.txstate.dmlab.clusteringwiki.preprocess.CollectionContext.java
Source

package edu.txstate.dmlab.clusteringwiki.preprocess;

/**
 *  ClusteringWiki - personalized and collaborative clustering of search results
 *  Copyright (C) 2010  Texas State University-San Marcos
 *  
 *  Contact: http://dmlab.cs.txstate.edu
 * 
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 * 
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 * 
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.StringTokenizer;

import org.apache.lucene.util.OpenBitSet;
import org.springframework.util.StringUtils;

import edu.txstate.dmlab.clusteringwiki.app.ApplicationSettings;
import edu.txstate.dmlab.clusteringwiki.cluster.ClusterDocument;
import edu.txstate.dmlab.clusteringwiki.cluster.IClusterDocument;
import edu.txstate.dmlab.clusteringwiki.sources.ICWSearchResult;
import edu.txstate.dmlab.clusteringwiki.sources.ICWSearchResultCol;
import edu.txstate.dmlab.clusteringwiki.sources.ICWSearchResult.FIELDS;
import edu.txstate.dmlab.clusteringwiki.util.CharUtils;
import edu.txstate.dmlab.clusteringwiki.util.TokenUtils;
import edu.txstate.dmlab.clusteringwiki.util.ArrayUtils;

/**
 * Defines a context for information about the document 
 * collection being analyzed.  It encapsulates methods to
 * analyze a collection of documentsToCluster as well as retrieve
 * specific information about them or the collection needed
 * by other methods.
 * 
 * @author David C. Anastasiu
 *
 */
public class CollectionContext implements ICollectionContext {

    /**
     * Initially received document collection
     */
    public final ICWSearchResultCol docs;

    /**
     * Cache for number of documents to be processed
     */
    public final int docsLength;

    /**
     * Fields in ICWSearchResult that comprise the document
     * and should be analyzed.  Note that the "document" filed in
     * the ICWSearchResult will generally by dynamically retrieved on
     * first call and can cause app slowness
     */
    public final FIELDS[] fields;

    /**
     * Query that should be used when performing query related
     * analysis
     */
    public final String query;

    /**
     * Query after it has been stemmed
     */
    public String analyzedQuery;

    /**
     * Analyzed query as array of terms
     */
    public final int[] queryTerms;

    /**
     * Map for quickly finding the numeric id of a string version of a word.
     */
    public final Map<String, Integer> allWordsIndex = new HashMap<String, Integer>();

    /**
     * Cache for how many words are stored in the collection
     */
    public final int allWordsLength;

    /**
     * Map for quickly finding the numeric id of a string version of a term.
     */
    public final Map<String, Integer> allTermsIndex = new HashMap<String, Integer>();

    /**
     * Cache for how many terms there are in the collection
     */
    public final int allTermsLength;

    /**
     * Map for all term ids matched back to the term given this id
     */
    public final List<String> allTermsReverseIndex = new ArrayList<String>();

    /**
     * Keep track of document indexes that term is found in
     */
    public final List<OpenBitSet> allTermsDocumentIndexes = new ArrayList<OpenBitSet>();

    /**
     * Processed documentsToCluster that should be clustered
     */
    public final List<IClusterDocument> allDocs;

    /**
     * Some results have the same URL + title, causing non-result set duplicate paths problems
     * customResultLabels identifies duplicates and adds a prefix for copies of result labels
     */
    protected final Map<Integer, String> customResultLabels = new HashMap<Integer, String>();

    /**
     * Index of result nodes, key = result label, value = page indexes in allDocs
     */
    protected final Map<String, Set<Integer>> resultLabelIndex = new HashMap<String, Set<Integer>>();

    /**
     * Max path field size in db
     * @see ApplicationSettings
     */
    protected final int maxPathFieldSize = ApplicationSettings.getMaxPathFieldSize();

    /**
     * Ordered set of all noun phrases encountered in the document collection
     * Provides an index for each phrase
     */
    protected final List<String> allNounPhrases = new ArrayList<String>(0);

    /**
     * Reverse lookup index for noun phrases
     */
    protected final Map<String, Integer> nounPhraseIndex = new HashMap<String, Integer>(0);

    /**
     * Index for what documents a phrase in found in
     */
    protected final Map<Integer, Set<Integer>> nounPhraseDocIndex = new HashMap<Integer, Set<Integer>>(0);

    /**
     * @return the search returned docs
     */
    public ICWSearchResultCol getDocs() {
        return this.docs;
    }

    /**
     * @return the fields
     */
    public FIELDS[] getFields() {
        return this.fields;
    }

    /**
     * @return the query
     */
    public String getQuery() {
        return this.query;
    }

    /**
     * @return the analyzedQuery
     */
    public String getAnalyzedQuery() {
        return this.analyzedQuery;
    }

    /**
     * @return the queryTerms
     */
    public int[] getQueryTerms() {
        return queryTerms;
    }

    /**
     * @return the processed list of cluster documentsToCluster
     */
    public List<IClusterDocument> getAllDocs() {
        return this.allDocs;
    }

    /**
     * @return the customResultLabels
     */
    public Map<Integer, String> getCustomResultLabels() {
        return this.customResultLabels;
    }

    /**
     * @return the resultLabelIndex
     */
    public Map<String, Set<Integer>> getResultLabelIndex() {
        return this.resultLabelIndex;
    }

    /**
     * Initialize the document collection context by providing
     * the documentsToCluster to be analyzed, a list of fields within
     * those documentsToCluster to be analyzed, 
     * and the query that was posed on this collection
     * @param docs Collection of received documentsToCluster
     * @param fields Array of ICWSearchResult document fields to analyze 
     * @param query String of executed query 
     */
    public CollectionContext(ICWSearchResultCol theDocs, FIELDS[] theFields, String theQuery) {
        this.docs = theDocs;
        this.fields = theFields;
        this.query = theQuery;

        //analyze query
        this.queryTerms = analyzeQuery(theQuery);

        //get results from collection
        ICWSearchResult[] results = this.docs.getResults();

        if (results == null || results.length == 0)
            throw new IllegalArgumentException("No results received");

        this.docsLength = results.length;
        this.allDocs = new ArrayList<IClusterDocument>(docsLength);

        //for each retrieved document, build document to cluster, which also
        //does initial processing of text. also identify any duplicate result 
        //labels and tag them as such
        for (int i = 0; i < this.docsLength; i++) {
            IClusterDocument doc = new ClusterDocument(i, results[i], this);
            //add reference to document
            this.allDocs.add(doc);

            //register terms in doc with the allTermsDocumentIndexes index
            final int[] terms = doc.getTerms();
            for (int t : terms)
                this.setTermDocumentIndex(t, i);

            //build result label index and customResultLabels
            String key = getResultLabel(doc);
            Set<Integer> p = this.resultLabelIndex.get(key);
            if (p == null) {
                p = new HashSet<Integer>();
            } else {
                // register duplicate result label
                this.customResultLabels.put(i, p.size() + "-");
                key = getResultLabel(doc);
                p = this.resultLabelIndex.get(key);
                if (p == null)
                    p = new HashSet<Integer>();
            }
            p.add(i);
            this.resultLabelIndex.put(key, p);

        }

        this.allWordsLength = this.allWordsIndex.size();
        this.allTermsLength = this.allTermsIndex.size();

    }

    /**
     * Analyzes query and created queryTerms array, then
     * returns analyzed query string
     * @param query
     * @return
     */
    public int[] analyzeQuery(String query) {
        StringTokenizer st = new StringTokenizer(query, ApplicationSettings.TOKEN_CHARS, true);
        List<Integer> qTerms = new ArrayList<Integer>();
        List<String> qTermStrings = new ArrayList<String>();
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();

            if (!tok.equals(" ") && tok.length() > 0) {
                while (CharUtils.startsWithPhraseSeparator(tok))
                    tok = tok.substring(1);
                while (CharUtils.endsWithPhraseSeparator(tok))
                    tok = tok.substring(0, tok.length() - 1);

                final short tt = TokenUtils.getFullTokenType(tok);
                final boolean isStopWord = TokenUtils.isStopWord(tt);
                final boolean isPunctuation = TokenUtils.isPunctuation(tt);
                if (!isStopWord && !isPunctuation) {
                    final String tm = edu.txstate.dmlab.clusteringwiki.util.StringUtils.stemWord(tok);
                    if (tm.length() >= ApplicationSettings.MINIMUM_TERM_LENGTH) {
                        final int tid = this.getTermId(tm); // add token
                        this.getWordId(tok); //add word to allWordsIndex
                        qTerms.add(tid);
                        qTermStrings.add(tm);
                    } else {
                        this.getWordId(tok);
                    }
                }
            }
        }

        this.analyzedQuery = StringUtils.collectionToDelimitedString(qTermStrings, " ");

        int[] terms = ArrayUtils.intColToArray(qTerms);

        return terms;

    }

    /**
     * A Suffix tree phrase is represented as an array of term ids
     * which is a subphrase of a number of represented documents.
     * Get the words label best represented within the first k
     * documents the phrase covers.
     * @param phraseDocs Stack of documents (indexes in allDocs array) covered by the phrase
     * @param phraseTerms array of allTermsIndex indexes that form the phrase
     */
    public String getPhraseLabel(final Stack<Integer> phraseDocs, final int[] phraseTerms) {
        final Map<String, Set<Integer>> index = new HashMap<String, Set<Integer>>();
        int i = 0;
        for (int phraseDoc : phraseDocs) {
            IClusterDocument d = this.allDocs.get(phraseDoc);
            if (d == null)
                continue;
            final String l = d.getWordPhraseString(phraseTerms);
            if (l == null)
                continue;
            Set<Integer> cover = index.get(l);
            if (cover == null)
                cover = new HashSet<Integer>();
            cover.add(phraseDoc);
            index.put(l, cover);
            i++;
            if (i + 1 % ApplicationSettings.LABEL_SET_DOCUMENT_SIZE == 0 && index.size() > 0)
                break;
        }
        if (index.size() > 0) {
            int coverSize = 0;
            String chosen = "";
            for (String s : index.keySet()) {
                final int compSize = index.get(s).size();
                if (compSize > coverSize) {
                    chosen = s;
                    coverSize = compSize;
                }
            }
            return chosen;
        }
        return "Unrepresented label";
    }

    /**
     * Register that a term exists in a given document in the 
     * allTermsDocumentIndexes array
     * @param termIndex
     * @param docIndex
     */
    protected void setTermDocumentIndex(int termIndex, int docIndex) {
        while (this.allTermsDocumentIndexes.size() < termIndex + 1)
            this.allTermsDocumentIndexes.add(new OpenBitSet(this.docsLength));
        OpenBitSet termDocIndex = this.allTermsDocumentIndexes.get(termIndex);
        termDocIndex.fastSet(docIndex);
    }

    /**
     * Get documents bit set that contain all terms in a phrase
     * @param terms
     * @return
     */
    public OpenBitSet getDocumentsContainingTerms(int[] terms) {
        if (terms == null || terms.length < 1)
            return new OpenBitSet(this.countDocs());
        OpenBitSet docs = (OpenBitSet) this.allTermsDocumentIndexes.get(terms[0]).clone();
        for (int i = 1; i < terms.length; i++)
            docs.and(this.allTermsDocumentIndexes.get(terms[i]));
        return docs;
    }

    /**
     * Get a unique term id for a given term in the collection
     * @param term
     * @param documentIndex
     * @return
     */
    public Integer getTermId(String term) {
        Integer id = this.allTermsIndex.get(term);
        if (id == null) {
            id = this.allTermsIndex.size();
            this.allTermsIndex.put(term, id);
            if (ApplicationSettings.KEEP_REVERSE_TERM_INDEX)
                this.allTermsReverseIndex.set(id, term);
        }
        return id;
    }

    /**
     * Get a unique word id for a given word in the collection
     * @param term
     * @return
     */
    public Integer getWordId(String word) {
        Integer id = this.allWordsIndex.get(word);
        if (id == null) {
            Integer i = this.allWordsIndex.size();
            this.allWordsIndex.put(word, i);
            return i;
        }
        return id;
    }

    /**
     * Number of documentsToCluster in the context
     * @return
     */
    public int countDocs() {
        return this.docsLength;
    }

    /**
     * Result label is built from the url, and a possible 
     * prefix if result url is not unique in the result list
     * @param d
     * @return
     */
    public String getResultLabel(IClusterDocument d) {
        String label = d.getResultDoc().getUrl().trim();
        String prefix = this.customResultLabels.get(d.getIndex());
        if (prefix != null)
            label = prefix + label;
        if (label.length() > this.maxPathFieldSize)
            return label.substring(0, this.maxPathFieldSize - 1);
        return label;
    }

}