ivory.core.data.stat.PrefixEncodedGlobalStats.java Source code

Java tutorial

Introduction

Here is the source code for ivory.core.data.stat.PrefixEncodedGlobalStats.java

Source

/*
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.data.stat;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.PrefixEncodedLexicographicallySortedDictionary;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.pair.PairOfIntLong;

/**
 * In-memory lookup of per-term global statistics, keyed by a prefix-encoded,
 * lexicographically sorted term dictionary.
 *
 * <p>The dictionary is read once at construction time. The two statistics
 * arrays ({@code df} and {@code cf} — presumably document frequency and
 * collection frequency; confirm against the code that writes these files) are
 * loaded on demand with {@link #loadDFStats(Path)} and
 * {@link #loadCFStats(Path)}. Each statistics file is expected to start with a
 * 4-byte entry count that equals the dictionary size, followed by that many
 * variable-length encoded values, one per term id.
 */
public class PrefixEncodedGlobalStats {
    private static final Logger LOG = Logger.getLogger(PrefixEncodedGlobalStats.class);

    /** Upper bound on the number of entries written by {@link #printKeys()}. */
    private static final int MAX_PRINTED_KEYS = 100;

    /** Index location used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INDEX_PATH = "c:/Research/ivory-workspace";

    PrefixEncodedLexicographicallySortedDictionary prefixSet = new PrefixEncodedLexicographicallySortedDictionary();

    Configuration conf = new Configuration();
    // Assigned in the constructors: resolving the default file system eagerly
    // would make the (Path, FileSystem) constructor fail whenever the default
    // file system cannot be obtained, even though it is never used there.
    FileSystem fileSys;
    int[] df = null;
    long[] cf = null;

    // Most recently opened streams. They are always closed again before the
    // method that opened them returns.
    FSDataInputStream termsInput = null;
    FSDataInputStream dfStatsInput = null;
    FSDataInputStream cfStatsInput = null;

    /**
     * Reads the term dictionary from {@code prefixSetPath} using the file
     * system returned by {@code FileSystem.get} for a fresh configuration.
     *
     * @param prefixSetPath location of the serialized dictionary
     * @throws IOException if the file system cannot be obtained or the
     *         dictionary cannot be read
     */
    public PrefixEncodedGlobalStats(Path prefixSetPath) throws IOException {
        fileSys = FileSystem.get(conf);
        loadDictionary(prefixSetPath);
    }

    /**
     * Reads the term dictionary from {@code prefixSetPath} on the given file
     * system. The same file system becomes the default for the single-argument
     * {@code load*Stats} methods.
     *
     * @param prefixSetPath location of the serialized dictionary
     * @param fs file system used to open {@code prefixSetPath}
     * @throws IOException if the dictionary cannot be read
     */
    public PrefixEncodedGlobalStats(Path prefixSetPath, FileSystem fs) throws IOException {
        fileSys = fs;
        loadDictionary(prefixSetPath);
    }

    /** Deserializes the dictionary, closing the stream even when reading fails. */
    private void loadDictionary(Path prefixSetPath) throws IOException {
        termsInput = fileSys.open(prefixSetPath);
        try {
            prefixSet.readFields(termsInput);
        } finally {
            termsInput.close();
        }
    }

    /**
     * Loads the df array from {@code dfStatsPath} using this instance's file
     * system.
     *
     * @param dfStatsPath location of the df statistics file
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if the stored entry count differs from the
     *         dictionary size
     */
    public void loadDFStats(Path dfStatsPath) throws IOException {
        loadDFStats(dfStatsPath, fileSys);
    }

    /**
     * Loads the df array from {@code dfStatsPath} on the given file system.
     * The array becomes visible to the getters only after every entry has been
     * read successfully.
     *
     * @param dfStatsPath location of the df statistics file
     * @param fs file system used to open {@code dfStatsPath}
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if the stored entry count differs from the
     *         dictionary size
     */
    public void loadDFStats(Path dfStatsPath, FileSystem fs) throws IOException {
        dfStatsInput = fs.open(dfStatsPath);
        try {
            int count = dfStatsInput.readInt();
            if (count != prefixSet.size()) {
                throw new RuntimeException("df length mismatch: " + count + "\t" + prefixSet.size());
            }
            int[] loaded = new int[count];
            for (int i = 0; i < count; i++) {
                loaded[i] = WritableUtils.readVInt(dfStatsInput);
            }
            // Publish only a completely filled array.
            df = loaded;
        } finally {
            dfStatsInput.close();
        }
    }

    /**
     * Loads the cf array from {@code cfStatsPath} using this instance's file
     * system.
     *
     * @param cfStatsPath location of the cf statistics file
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if the stored entry count differs from the
     *         dictionary size
     */
    public void loadCFStats(Path cfStatsPath) throws IOException {
        loadCFStats(cfStatsPath, fileSys);
    }

    /**
     * Loads the cf array from {@code cfStatsPath} on the given file system.
     * The array becomes visible to the getters only after every entry has been
     * read successfully.
     *
     * @param cfStatsPath location of the cf statistics file
     * @param fs file system used to open {@code cfStatsPath}
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if the stored entry count differs from the
     *         dictionary size
     */
    public void loadCFStats(Path cfStatsPath, FileSystem fs) throws IOException {
        cfStatsInput = fs.open(cfStatsPath);
        try {
            int count = cfStatsInput.readInt();
            if (count != prefixSet.size()) {
                throw new RuntimeException("cf length mismatch: " + count + "\t" + prefixSet.size());
            }
            long[] loaded = new long[count];
            for (int i = 0; i < count; i++) {
                loaded[i] = WritableUtils.readVLong(cfStatsInput);
            }
            // Publish only a completely filled array.
            cf = loaded;
        } finally {
            cfStatsInput.close();
        }
    }

    /**
     * Returns the df value stored for {@code term}.
     *
     * @param term term to look up in the dictionary
     * @return the df value, or {@code -1} when the dictionary reports a
     *         negative id for the term
     * @throws IllegalStateException if the df statistics have not been loaded
     */
    public int getDF(String term) {
        requireDfLoaded();
        int index = lookup(term);
        if (index < 0) {
            return -1;
        }
        return df[index];
    }

    /**
     * Returns the cf value stored for {@code term}.
     *
     * @param term term to look up in the dictionary
     * @return the cf value, or {@code -1} when the dictionary reports a
     *         negative id for the term
     * @throws IllegalStateException if the cf statistics have not been loaded
     */
    public long getCF(String term) {
        requireCfLoaded();
        int index = lookup(term);
        if (index < 0) {
            return -1;
        }
        return cf[index];
    }

    /**
     * Returns the (df, cf) pair stored for {@code term}.
     *
     * @param term term to look up in the dictionary
     * @return a new pair, or {@code null} when the dictionary reports a
     *         negative id for the term
     * @throws IllegalStateException if the term is known but either
     *         statistics array has not been loaded
     */
    public PairOfIntLong getStats(String term) {
        return getStats(lookup(term));
    }

    /**
     * Returns the (df, cf) pair stored at the given term id.
     *
     * @param index term id, used directly as the array position
     * @return a new pair, or {@code null} when {@code index} is negative
     * @throws IllegalStateException if {@code index} is non-negative but
     *         either statistics array has not been loaded
     * @throws ArrayIndexOutOfBoundsException if {@code index} is not smaller
     *         than the length of the loaded arrays
     */
    public PairOfIntLong getStats(int index) {
        if (index < 0) {
            return null;
        }
        requireDfLoaded();
        requireCfLoaded();
        PairOfIntLong p = new PairOfIntLong();
        p.set(df[index], cf[index]);
        return p;
    }

    /** Resolves a term to its dictionary id and traces the result at debug level. */
    private int lookup(String term) {
        int index = prefixSet.getId(term);
        // Guarded so that the message is not even built on the lookup path
        // unless debug logging is switched on.
        if (LOG.isDebugEnabled()) {
            LOG.debug("index of " + term + ": " + index);
        }
        return index;
    }

    /** Fails with a descriptive error when the df array is not available. */
    private void requireDfLoaded() {
        if (df == null) {
            throw new IllegalStateException("DF-Stats must be loaded first!");
        }
    }

    /** Fails with a descriptive error when the cf array is not available. */
    private void requireCfLoaded() {
        if (cf == null) {
            throw new IllegalStateException("CF-Stats must be loaded first!");
        }
    }

    /**
     * Returns the number of terms in the dictionary.
     *
     * @return the dictionary size
     */
    public int length() {
        return prefixSet.size();
    }

    /**
     * Prints the dictionary window size, the number of terms and up to the
     * first 100 terms to standard output, one tab-separated line per term with
     * the df and cf values appended when the respective array is loaded.
     */
    public void printKeys() {
        System.out.println("Window: " + this.prefixSet.getWindowSize());
        System.out.println("Length: " + this.length());
        int limit = Math.min(length(), MAX_PRINTED_KEYS);
        for (int i = 0; i < limit; i++) {
            StringBuilder line = new StringBuilder();
            line.append(i).append("\t").append(prefixSet.getTerm(i));
            if (df != null) {
                line.append("\t").append(df[i]);
            }
            if (cf != null) {
                line.append("\t").append(cf[i]);
            }
            System.out.println(line);
        }
    }

    /**
     * Manual smoke test: loads the dictionary and both statistics files of an
     * index from the local file system and prints the df value of a few sample
     * terms.
     *
     * @param args optional; {@code args[0]} is the index path. When absent the
     *        built-in default path is used.
     * @throws Exception if the index files cannot be located or read
     */
    public static void main(String[] args) throws Exception {
        String indexPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;

        Configuration conf = new Configuration();
        FileSystem fileSys = FileSystem.getLocal(conf);

        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fileSys);

        Path termsFilePath = new Path(env.getIndexTermsData());

        Path dfByTermFilePath = new Path(env.getDfByTermData());
        Path cfByTermFilePath = new Path(env.getCfByTermData());

        System.out.println("PrefixEncodedGlobalStats");
        // Use the same local file system the environment was created with;
        // the one-argument constructor would fall back to the configured
        // default file system, which need not be the local one.
        PrefixEncodedGlobalStats globalStatsMap = new PrefixEncodedGlobalStats(termsFilePath, fileSys);
        System.out.println("PrefixEncodedGlobalStats1");
        globalStatsMap.loadDFStats(dfByTermFilePath);
        System.out.println("PrefixEncodedGlobalStats2");
        globalStatsMap.loadCFStats(cfByTermFilePath);
        System.out.println("PrefixEncodedGlobalStats3");

        int nTerms = globalStatsMap.length();
        System.out.println("nTerms: " + nTerms);

        String[] sampleTerms = {
            "0046", "00565", "01338", "01hz", "03x", "0278x",
            "0081", "0183", "0244", "032"
        };
        for (String term : sampleTerms) {
            System.out.println(term + "\t" + globalStatsMap.getDF(term));
        }
    }
}