ivory.core.data.stat.PrefixEncodedGlobalStatsWithIndex.java Source code

Introduction

Here is the source code for ivory.core.data.stat.PrefixEncodedGlobalStatsWithIndex.java
Source

/*
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.data.stat;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.PrefixEncodedLexicographicallySortedDictionary;

import java.io.IOException;
import java.util.NoSuchElementException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.map.HMapKI;
import edu.umd.cloud9.util.map.HMapKL;

/**
 * In-memory view of per-term global statistics (document frequency and
 * collection frequency) keyed by the terms of a prefix-encoded, sorted
 * dictionary.
 *
 * <p>The dictionary is read in the constructor. Statistics are loaded on demand
 * through {@link #loadDFStats} and {@link #loadCFStats}; each stats file must
 * contain exactly one value per dictionary term. Optionally, the statistics of
 * the first N entries of the id-to-term array are copied into hash maps that
 * are consulted before the dictionary lookup (presumably that array is ordered
 * by decreasing frequency -- confirm against the code that writes it).
 *
 * <p>Lookups by term return {@code -1} (or {@code null} for
 * {@link #getStats(String)}) when the term is not in the dictionary.
 */
public class PrefixEncodedGlobalStatsWithIndex {

    /**
     * logger
     */
    private static final Logger LOGGER = Logger.getLogger(PrefixEncodedGlobalStatsWithIndex.class);
    static {
        LOGGER.setLevel(Level.WARN);
    }

    /** Upper bound on the fraction of dictionary terms copied into a frequent-term cache. */
    private static final float MAX_CACHED_FRACTION = 0.2f;

    Configuration conf = new Configuration();

    // Assigned by the constructors: the default file system of conf, or the one
    // supplied by the caller.
    FileSystem fileSys;

    PrefixEncodedLexicographicallySortedDictionary prefixSet = new PrefixEncodedLexicographicallySortedDictionary();

    // Document frequencies indexed by dictionary term id, plus optional cache.
    int[] dfs = null;
    HMapKI<String> frequentTermsDfs = null;

    // Collection frequencies indexed by dictionary term id, plus optional cache.
    long[] cfs = null;
    HMapKL<String> frequentTermsCfs = null;

    // Entry i holds a dictionary term id; only the leading entries are used, to
    // pick the terms that go into the frequent-term caches.
    int[] idToTerm = null;

    /**
     * Reads the dictionary from {@code prefixSetPath} using the default file
     * system of a fresh {@link Configuration}.
     *
     * @param prefixSetPath location of the serialized dictionary
     * @throws IOException if the file system cannot be obtained or the
     *         dictionary cannot be read
     */
    public PrefixEncodedGlobalStatsWithIndex(Path prefixSetPath) throws IOException {
        fileSys = FileSystem.get(conf);
        loadPrefixSet(prefixSetPath);
    }

    /**
     * Reads the dictionary from {@code prefixSetPath} using the given file system.
     *
     * @param prefixSetPath location of the serialized dictionary
     * @param fs file system used for this and all later reads
     * @throws IOException if the dictionary cannot be read
     */
    public PrefixEncodedGlobalStatsWithIndex(Path prefixSetPath, FileSystem fs) throws IOException {
        fileSys = fs;
        loadPrefixSet(prefixSetPath);
    }

    /** Deserializes the dictionary, closing the stream even if reading fails. */
    private void loadPrefixSet(Path prefixSetPath) throws IOException {
        FSDataInputStream termsInput = fileSys.open(prefixSetPath);
        try {
            prefixSet.readFields(termsInput);
        } finally {
            termsInput.close();
        }
    }

    /**
     * Loads document frequencies and, optionally, the frequent-term DF cache.
     *
     * <p>If {@code cachedFrequentPercent} is outside {@code [0, 1]} only the DF
     * array is loaded and the method returns. Otherwise the fraction is capped
     * at 0.2 and that share of the dictionary is cached.
     *
     * @param dfStatsPath file holding one DF value per dictionary term
     * @param idToTermPath file holding the id-to-term array
     * @param cachedFrequentPercent fraction of terms to cache, in {@code [0, 1]}
     * @param keepIDToTermMap when {@code false}, the id-to-term array is
     *        released at the end of this call (including one retained by an
     *        earlier call)
     * @throws IOException if a file cannot be read
     * @throws RuntimeException if the DF file length differs from the dictionary size
     */
    public void loadDFStats(Path dfStatsPath, Path idToTermPath, float cachedFrequentPercent,
            boolean keepIDToTermMap) throws IOException {
        loadDfs(dfStatsPath);

        if (cachedFrequentPercent < 0 || cachedFrequentPercent > 1.0) {
            return;
        }

        if (cachedFrequentPercent > 0 || keepIDToTermMap) {
            loadIdToTerm(idToTermPath);
            int cachedFrequent = cachedTermCount(cachedFrequentPercent, dfs.length);
            if (cachedFrequent > 0) {
                loadFrequentDfMap(cachedFrequent);
            }
            if (!keepIDToTermMap) {
                idToTerm = null;
            }
        }
    }

    /** Number of terms to cache: the requested fraction, capped, of {@code totalTerms}. */
    private static int cachedTermCount(float cachedFrequentPercent, int totalTerms) {
        return (int) (Math.min(cachedFrequentPercent, MAX_CACHED_FRACTION) * totalTerms);
    }

    /** Reads the DF array (an int count followed by that many VInts); no-op if already loaded. */
    private void loadDfs(Path dfStatsPath) throws IOException {
        if (dfs != null) {
            return;
        }
        FSDataInputStream dfStatsInput = fileSys.open(dfStatsPath);
        try {
            int l = dfStatsInput.readInt();
            if (l != prefixSet.size()) {
                throw new RuntimeException("df length mismatch: " + l + "\t" + prefixSet.size());
            }
            int[] loaded = new int[l];
            for (int i = 0; i < l; i++) {
                loaded[i] = WritableUtils.readVInt(dfStatsInput);
            }
            // Publish only a fully read array so a failed read can be retried.
            dfs = loaded;
        } finally {
            dfStatsInput.close();
        }
    }

    /** Reads the id-to-term array (an int count followed by that many ints); no-op if already loaded. */
    private void loadIdToTerm(Path idToTermPath) throws IOException {
        if (idToTerm != null) {
            return;
        }
        FSDataInputStream idToTermInput = fileSys.open(idToTermPath);
        try {
            LOGGER.info("Loading id to term array ...");
            int k = idToTermInput.readInt();
            int[] loaded = new int[k];
            for (int i = 0; i < k; i++) {
                loaded[i] = idToTermInput.readInt();
            }
            idToTerm = loaded;
            LOGGER.info("Loading done.");
        } finally {
            idToTermInput.close();
        }
    }

    /** Caches the DF of the terms named by the first {@code n} id-to-term entries; no-op if already built. */
    private void loadFrequentDfMap(int n) {
        if (frequentTermsDfs != null) {
            return;
        }
        frequentTermsDfs = new HMapKI<String>();
        // Never read past either array.
        int limit = Math.min(n, Math.min(dfs.length, idToTerm.length));
        for (int i = 0; i < limit; i++) {
            int termId = idToTerm[i];
            frequentTermsDfs.put(prefixSet.getTerm(termId), dfs[termId]);
        }
    }

    /**
     * Returns the document frequency of {@code term}.
     *
     * @param term the term to look up
     * @return the DF, or {@code -1} if the term is not in the dictionary
     * @throws IllegalStateException if the term is in the dictionary but DF
     *         stats have not been loaded
     */
    public int getDF(String term) {
        if (frequentTermsDfs != null) {
            try {
                int df = frequentTermsDfs.get(term);
                if (LOGGER.isInfoEnabled()) {
                    LOGGER.info("[cached] df of " + term + ": " + df);
                }
                return df;
            } catch (NoSuchElementException ignored) {
                // Term is not in the frequent-term cache; use the dictionary lookup below.
            }
        }
        int index = lookupIndex(term);
        if (index < 0) {
            return -1;
        }
        requireDfsLoaded();
        return dfs[index];
    }

    /**
     * Loads collection frequencies and, optionally, the frequent-term CF cache.
     *
     * <p>Same contract as {@link #loadDFStats}, applied to the CF array.
     *
     * @param cfStatsPath file holding one CF value per dictionary term
     * @param idToTermPath file holding the id-to-term array
     * @param cachedFrequentPercent fraction of terms to cache, in {@code [0, 1]}
     * @param keepIDToTermMap when {@code false}, the id-to-term array is
     *        released at the end of this call (including one retained by an
     *        earlier call)
     * @throws IOException if a file cannot be read
     * @throws RuntimeException if the CF file length differs from the dictionary size
     */
    public void loadCFStats(Path cfStatsPath, Path idToTermPath, float cachedFrequentPercent,
            boolean keepIDToTermMap) throws IOException {
        loadCfs(cfStatsPath);

        if (cachedFrequentPercent < 0 || cachedFrequentPercent > 1.0) {
            return;
        }

        if (cachedFrequentPercent > 0 || keepIDToTermMap) {
            loadIdToTerm(idToTermPath);
            // Size the cache from the CF array: the DF array may not be loaded.
            int cachedFrequent = cachedTermCount(cachedFrequentPercent, cfs.length);
            if (cachedFrequent > 0) {
                loadFrequentCfMap(cachedFrequent);
            }
            if (!keepIDToTermMap) {
                idToTerm = null;
            }
        }
    }

    /**
     * Reads the CF array (an int count followed by that many VLongs); no-op if
     * already loaded.
     *
     * @param cfStatsPath file holding one CF value per dictionary term
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if the file length differs from the dictionary size
     */
    public void loadCfs(Path cfStatsPath) throws IOException {
        if (cfs != null) {
            return;
        }
        FSDataInputStream cfStatsInput = fileSys.open(cfStatsPath);
        try {
            int l = cfStatsInput.readInt();
            if (l != prefixSet.size()) {
                throw new RuntimeException("cf length mismatch: " + l + "\t" + prefixSet.size());
            }
            long[] loaded = new long[l];
            for (int i = 0; i < l; i++) {
                loaded[i] = WritableUtils.readVLong(cfStatsInput);
            }
            // Publish only a fully read array so a failed read can be retried.
            cfs = loaded;
        } finally {
            cfStatsInput.close();
        }
    }

    /** Caches the CF of the terms named by the first {@code n} id-to-term entries; no-op if already built. */
    private void loadFrequentCfMap(int n) {
        if (frequentTermsCfs != null) {
            return;
        }
        frequentTermsCfs = new HMapKL<String>();
        // Never read past either array.
        int limit = Math.min(n, Math.min(cfs.length, idToTerm.length));
        for (int i = 0; i < limit; i++) {
            int termId = idToTerm[i];
            frequentTermsCfs.put(prefixSet.getTerm(termId), cfs[termId]);
        }
    }

    /**
     * Returns the collection frequency of {@code term}.
     *
     * @param term the term to look up
     * @return the CF, or {@code -1} if the term is not in the dictionary
     * @throws IllegalStateException if the term is in the dictionary but CF
     *         stats have not been loaded
     */
    public long getCF(String term) {
        // Guard on the CF cache itself; the DF cache may exist without it (and vice versa).
        if (frequentTermsCfs != null) {
            try {
                long cf = frequentTermsCfs.get(term);
                if (LOGGER.isInfoEnabled()) {
                    LOGGER.info("[cached] cf of " + term + ": " + cf);
                }
                return cf;
            } catch (NoSuchElementException ignored) {
                // Term is not in the frequent-term cache; use the dictionary lookup below.
            }
        }
        int index = lookupIndex(term);
        if (index < 0) {
            return -1;
        }
        requireCfsLoaded();
        return cfs[index];
    }

    /**
     * Returns both statistics of {@code term}.
     *
     * <p>The caches are used only when both hold the term; otherwise the values
     * come from the full arrays.
     *
     * @param term the term to look up
     * @return a (df, cf) pair, or {@code null} if the term is not in the dictionary
     * @throws IllegalStateException if the full arrays are needed but DF or CF
     *         stats have not been loaded
     */
    public PairOfIntLong getStats(String term) {
        PairOfIntLong p = new PairOfIntLong();
        if (frequentTermsDfs != null) {
            try {
                int df = frequentTermsDfs.get(term);
                if (LOGGER.isInfoEnabled()) {
                    LOGGER.info("[cached] df of " + term + ": " + df);
                }
                if (frequentTermsCfs != null) {
                    long cf = frequentTermsCfs.get(term);
                    if (LOGGER.isInfoEnabled()) {
                        LOGGER.info("[cached] cf of " + term + ": " + cf);
                    }
                    p.set(df, cf);
                    return p;
                }
            } catch (NoSuchElementException ignored) {
                // Term is missing from one of the caches; use the dictionary lookup below.
            }
        }
        int index = lookupIndex(term);
        if (index < 0) {
            return null;
        }
        requireDfsLoaded();
        requireCfsLoaded();
        p.set(dfs[index], cfs[index]);
        return p;
    }

    /**
     * Returns both statistics of the term with the given dictionary id.
     *
     * @param index dictionary term id
     * @return a (df, cf) pair, or {@code null} if {@code index} is negative
     * @throws IllegalStateException if DF or CF stats have not been loaded
     * @throws ArrayIndexOutOfBoundsException if {@code index} is not smaller
     *         than the number of terms
     */
    public PairOfIntLong getStats(int index) {
        if (index < 0) {
            return null;
        }
        requireDfsLoaded();
        requireCfsLoaded();
        PairOfIntLong p = new PairOfIntLong();
        p.set(dfs[index], cfs[index]);
        return p;
    }

    /** Resolves a term to its dictionary id, logging the result at INFO level. */
    private int lookupIndex(String term) {
        int index = prefixSet.getId(term);
        if (LOGGER.isInfoEnabled()) {
            LOGGER.info("index of " + term + ": " + index);
        }
        return index;
    }

    /** Fails with a descriptive message instead of a bare NPE when DF stats are missing. */
    private void requireDfsLoaded() {
        if (dfs == null) {
            throw new IllegalStateException("DF-Stats must be loaded first!");
        }
    }

    /** Fails with a descriptive message instead of a bare NPE when CF stats are missing. */
    private void requireCfsLoaded() {
        if (cfs == null) {
            throw new IllegalStateException("CF-Stats must be loaded first!");
        }
    }

    /**
     * @return the number of terms in the dictionary
     */
    public int length() {
        return prefixSet.size();
    }

    /**
     * Prints the dictionary window size, its length, and up to the first 100
     * terms to standard output, each followed by whichever of its DF and CF
     * values are loaded.
     */
    public void printKeys() {
        System.out.println("Window: " + this.prefixSet.getWindowSize());
        System.out.println("Length: " + this.length());
        int shown = Math.min(length(), 100);
        for (int i = 0; i < shown; i++) {
            System.out.print(i + "\t" + prefixSet.getTerm(i));
            if (dfs != null) {
                System.out.print("\t" + dfs[i]);
            }
            if (cfs != null) {
                System.out.print("\t" + cfs[i]);
            }
            System.out.println();
        }
    }

    /**
     * Manual smoke test: loads the statistics of a local index at a hard-coded
     * path and prints the DF of a few sample terms.
     *
     * @param args unused
     * @throws Exception if the index cannot be read
     */
    public static void main(String[] args) throws Exception {
        String indexPath = "c:/Research/ivory-workspace";

        Configuration conf = new Configuration();
        FileSystem fileSys = FileSystem.getLocal(conf);
        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fileSys);

        Path termsFilePath = new Path(env.getIndexTermsData());

        Path dfByTermFilePath = new Path(env.getDfByTermData());
        Path cfByTermFilePath = new Path(env.getCfByTermData());

        Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());

        System.out.println("PrefixEncodedGlobalStats");

        PrefixEncodedGlobalStatsWithIndex globalStatsMap = new PrefixEncodedGlobalStatsWithIndex(termsFilePath);
        System.out.println("PrefixEncodedGlobalStats1");
        globalStatsMap.loadDFStats(dfByTermFilePath, idToTermFilePath, 0.2f, true);
        System.out.println("PrefixEncodedGlobalStats2");
        globalStatsMap.loadCFStats(cfByTermFilePath, idToTermFilePath, 0.2f, false);
        System.out.println("PrefixEncodedGlobalStats3");

        int nTerms = globalStatsMap.length();
        System.out.println("nTerms: " + nTerms);

        String[] sampleTerms = { "0046", "00565", "01338", "01hz", "03x", "0278x",
                "0081", "0183", "0244", "032" };
        for (String term : sampleTerms) {
            System.out.println(term + "\t" + globalStatsMap.getDF(term));
        }
    }
}