/*
 *  ________________________________________________________________________________________
 *
 *  Y O O R E E K A
 *  A library for data mining, machine learning, soft computing, and mathematical analysis
 *  ________________________________________________________________________________________
 *
 *  The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web"
 *  (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms
 *  are valuable in any software application.
 *
 *  Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
 *  Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.
 *
 *  Certain library functions depend on other Open Source software libraries, which are covered
 *  by different license agreements. See the NOTICE file distributed with this work for additional
 *  information regarding copyright ownership and licensing.
 *
 *  Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software distributed under
 *  the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 *  either express or implied. See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */
package intelligentWebAlgorithms.algos.search.ranking;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.SimpleFSDirectory;

import intelligentWebAlgorithms.algos.search.lucene.analyzer.TextDocumentTerms;
import intelligentWebAlgorithms.util.internet.crawling.core.CrawlDataProcessor;
import intelligentWebAlgorithms.util.parsing.common.ProcessedDocument;

public class DocRankMatrixBuilder implements CrawlDataProcessor {

    private int termsToKeep = 0;

    private String indexDir;
    private PageRankMatrixH matrixH;

    public DocRankMatrixBuilder(String indexDir) {
        this.indexDir = indexDir;
    }

    private PageRankMatrixH buildMatrixH(IndexReader idxR) throws IOException {

        // only consider URLs with fetched and parsed content
        List<Integer> allDocs = getProcessedDocs(idxR);

        PageRankMatrixH docMatrix = new PageRankMatrixH(allDocs.size());

        for (int i = 0, n = allDocs.size(); i < n; i++) {

            for (int j = 0, k = allDocs.size(); j < k; j++) {

                double similarity = 0.0d;

                // look up the documents by the doc ids collected in
                // getProcessedDocs, not by the loop counters themselves
                Document docX = idxR.document(allDocs.get(i));
                String xURL = docX.get("url");

                if (i == j) {
                    // Avoid shameless self-promotion ;-)
                    docMatrix.addLink(xURL, xURL, similarity);
                } else {

                    TextDocumentTerms xDocumentTerms =
                            new TextDocumentTerms(docX.get("content"));

                    Document docY = idxR.document(allDocs.get(j));
                    TextDocumentTerms yDocumentTerms =
                            new TextDocumentTerms(docY.get("content"));

                    similarity = getImportance(xDocumentTerms, yDocumentTerms);

                    // add link from docX to docY
                    String yURL = docY.get("url");
                    docMatrix.addLink(xURL, yURL, similarity);
                }
            }
        }

        docMatrix.calculate();

        return docMatrix;
    }

    /*
     * Checks whether the index entry belongs to the category on which we want
     * to run DocRank.
     */
    private boolean eligibleForDocRank(String doctype) {
        return ProcessedDocument.TYPE_MSWORD.equalsIgnoreCase(doctype);
    }

    public PageRankMatrixH getH() {
        return matrixH;
    }

    /*
     * Calculates the importance of document Y in the context of document X.
     */
    private double getImportance(TextDocumentTerms xTerms,
            TextDocumentTerms yTerms) {

        // sharedTerms is the intersection of the two term sets; copy the key
        // set first so that retainAll does not mutate xTerms' underlying map
        Set<String> sharedTerms = new HashSet<String>(xTerms.getTf().keySet());
        sharedTerms.retainAll(yTerms.getTf().keySet());

        double sharedTermsSum = 0.0;

        // Notice that this way of assigning importance is not symmetric.
        // That is, if you swap X with Y then you get a different value,
        // unless the frequencies are equal, of course!
        double xF, yF;
        for (String term : sharedTerms) {
            xF = xTerms.getTf().get(term).doubleValue();
            yF = yTerms.getTf().get(term).doubleValue();
            // tanh squashes the frequency ratio into (0, 1), so after rounding
            // a shared term contributes 1 only when yF/xF is roughly 0.55 or more
            sharedTermsSum += Math.round(Math.tanh(yF / xF));
        }

        return sharedTermsSum;
    }

    /*
     * Collects doc ids from the index for documents with a matching doc type.
     */
    private List<Integer> getProcessedDocs(IndexReader idxR) throws IOException {

        List<Integer> docs = new ArrayList<Integer>();

        // hasDeletions() does not change while we iterate, so check it once;
        // as in the original logic, an index with deletions yields no docs
        if (!idxR.hasDeletions()) {
            for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
                Document doc = idxR.document(i);
                if (eligibleForDocRank(doc.get("doctype"))) {
                    docs.add(i);
                }
            }
        }

        return docs;
    }

    /**
     * @return the termsToKeep
     */
    public int getTermsToKeep() {
        return termsToKeep;
    }

    public void run() {
        try {
            DirectoryReader idxR = DirectoryReader.open(new SimpleFSDirectory(
                    new File(indexDir)));
            matrixH = buildMatrixH(idxR);
        } catch (Exception e) {
            throw new RuntimeException("Error while building matrix: ", e);
        }
    }

    /**
     * @param termsToKeep
     *            the termsToKeep to set
     */
    public void setTermsToKeep(int termsToKeep) {
        this.termsToKeep = termsToKeep;
    }
}
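
/*
 * Minimal usage sketch. The class name below and the index path are
 * hypothetical placeholders; the sketch assumes an existing Lucene index
 * whose documents carry the "url", "content", and "doctype" fields that
 * DocRankMatrixBuilder reads above.
 */
class DocRankMatrixBuilderExample {

    public static void main(String[] args) {

        // Point the builder at the directory holding the crawl's Lucene index
        // (placeholder path)
        DocRankMatrixBuilder builder =
                new DocRankMatrixBuilder("/tmp/crawl-index");

        // Opens the index, scores every eligible document pair with
        // getImportance, and assembles the H matrix
        builder.run();

        // The resulting hyperlink matrix H can then feed the
        // DocRank (PageRank-style) computation
        PageRankMatrixH h = builder.getH();
    }
}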