edu.ucsb.cs.hybrid.mappers.SingleS_Runner.java Source code

Introduction

Here is the source code for edu.ucsb.cs.hybrid.mappers.SingleS_Runner.java, a custom Hadoop MapRunner that reads its split of vectors into an in-memory inverted index (the set S) and then streams the input partitions past that index to compute all-pairs similarity scores.

Source

/**
 * Copyright 2012-2013 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: maha alabduljalil <maha (at) cs.ucsb.edu>
 * @since Jul 27, 2012
 */

package edu.ucsb.cs.hybrid.mappers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunner;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import edu.ucsb.cs.hybrid.Config;
import edu.ucsb.cs.hybrid.io.OneMapReader;
import edu.ucsb.cs.hybrid.io.Reader;
import edu.ucsb.cs.types.DocDocWritable;
import edu.ucsb.cs.types.FeatureWeightArrayWritable;
import edu.ucsb.cs.types.PostingDocWeight;

public class SingleS_Runner
        extends MapRunner<LongWritable, FeatureWeightArrayWritable, DocDocWritable, FloatWritable> {

    protected JobConf conf;
    protected SingleS_Mapper mapper;
    /** Number of vectors actually read into this map task's in-memory set S */
    public int S_size = 0;
    /** Total number of features read, counting duplicates across vectors */
    protected long totalTerms = 0;
    /** Enables dynamic skipping based on per-document max weights (TODO: the author notes this should be removed later) */
    boolean googleDynSkip;
    ArrayList<Float> dynamicSmaxw = null;

    RecordReader<LongWritable, FeatureWeightArrayWritable> input;

    /**
     * Instantiates and configures the map class according to the user's
     * choice of similarity method.
     */
    @Override
    public void configure(JobConf job) {
        conf = job;
        instantMapper(job);
    }

    public void instantMapper(JobConf job) {
        ClassLoader myClassLoader = ClassLoader.getSystemClassLoader();
        try {
            Class<?> mapperClass = myClassLoader.loadClass(job.getMapperClass().getName());
            mapper = (SingleS_Mapper) mapperClass.newInstance();
            mapper.configure(job);
        } catch (Exception e) {
            // Fail fast: a map task without a mapper instance cannot proceed.
            throw new RuntimeException("Could not instantiate mapper class "
                    + job.getMapperClass().getName(), e);
        }
    }

    /**
     * Called automatically by the Hadoop framework to control the flow of
     * function calls in the map task. It reads the task's whole split of s
     * assigned vectors into the in-memory set S. Because the size of the
     * split is not known in advance, the vectors are first collected into
     * dynamic data structures, which are converted into static ones before
     * the all-pairs similarity search (APSS) begins.
     */
    @Override
    public void run(RecordReader<LongWritable, FeatureWeightArrayWritable> input,
            OutputCollector<DocDocWritable, FloatWritable> output, Reporter reporter) throws IOException {
        /* Setup configurations */
        this.input = input;
        boolean log = conf.getBoolean(Config.LOG_PROPERTY, Config.LOG_VALUE);
        boolean origIdComp;
        boolean idComparison = !conf.getBoolean(Config.CIRCULAR_PROPERTY, Config.CIRCULAR_VALUE);

        /* Phase 1: Build Index */
        initMapper(buildInvertedIndex(log), log, idComparison);

        /* Phase 2: Similarity Computations */
        Reader reader = getReader(conf);

        for (int currentFile = 0; currentFile < reader.nFiles; currentFile++) {
            if (!reader.setReader(currentFile))
                continue;
            if (reader.readMyPartition) {
                /*
                 * Comparing S against the task's own partition requires ID
                 * comparison so that each pair is emitted exactly once; the
                 * mapper's flag is set directly, since toggling only the
                 * local variable would have no effect after initMapper().
                 */
                origIdComp = idComparison;
                mapper.idComparison = true;
                comparePartition(reader, output, reporter);
                mapper.idComparison = origIdComp;
            } else {
                comparePartition(reader, output, reporter);
            }
        }
        closeMapper();
    }

    /** Compares the in-memory set S with one partition of the input. */
    public void comparePartition(Reader reader, OutputCollector<DocDocWritable, FloatWritable> output,
            Reporter reporter) throws IOException {
        mapper.compareWith(reader, output, reporter);
    }

    protected void closeMapper() throws IOException {
        mapper.close();
    }

    @Override
    protected Mapper getMapper() {
        return mapper;
    }

    @SuppressWarnings("unchecked")
    protected void initMapper(Object split, boolean logV, boolean idComparisonV) {
        mapper.initialize((HashMap<Long, PostingDocWeight[]>) split, logV, idComparisonV, S_size);
    }

    /**
     * Extension hook: subclasses may bound how many vectors are read into S.
     * This base implementation never stops early.
     */
    public boolean splitLimitReached(int nVectors) {
        return false;
    }
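
    /*
     * Sketch (not from the original code): a subclass could bound the number
     * of vectors held in memory by overriding the hook above, e.g.
     *
     *     @Override
     *     public boolean splitLimitReached(int nVectors) {
     *         return nVectors >= maxVectorsInS; // hypothetical limit field
     *     }
     */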

    public Object buildInvertedIndex(boolean log) throws IOException {
        HashMap<Long, ArrayList<PostingDocWeight>> dynSIndex = new HashMap<Long, ArrayList<PostingDocWeight>>();
        ArrayList<Long> dynamicIdMap = new ArrayList<Long>();
        LongWritable doc = new LongWritable();
        FeatureWeightArrayWritable vector = new FeatureWeightArrayWritable();
        totalTerms = 0;
        int nVectors = 0;
        googleDynSkip = conf.getBoolean(Config.BAYADRO_SKIP_PROPERTY, Config.BAYADRO_SKIP_VALUE);
        if (googleDynSkip)
            dynamicSmaxw = new ArrayList<Float>();

        long startTime = System.currentTimeMillis();
        while (!splitLimitReached(nVectors) && input.next(doc, vector)) {
            /* Map document IDs to serial numbers 0, 1, 2, ..., S-1 */
            dynamicIdMap.add(doc.get());
            if (googleDynSkip)
                dynamicSmaxw.add(vector.getMaxWeight());
            int numTerms = vector.getSize();
            totalTerms += numTerms;
            /* Adding one document into S inverted index */
            for (int i = 0; i < numTerms; i++) {
                ArrayList<PostingDocWeight> posting = getPosting(dynSIndex, vector.getFeature(i));
                PostingDocWeight item = new PostingDocWeight(nVectors, vector.getWeight(i));
                posting.add(item);
            }
            nVectors++;
        }
        S_size += nVectors;
        if (log) {
            long distinctTerms = dynSIndex.size();
            System.out.println("LOG: Build inverted index time in millisec:"
                    + (System.currentTimeMillis() - startTime) + "\nLOG: Number of distinct features:"
                    + distinctTerms + "\nLOG: Reduction in features storage via uniqueness:"
                    + ((totalTerms - distinctTerms) / (float) totalTerms * 100)
                    + "\nLOG: Avg posting length per feature:" + totalTerms / (float) distinctTerms);
        }
        if (S_size == 0)
            return null;
        else
            return convertDynamicStatic(dynSIndex, dynamicIdMap);
    }
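
    /*
     * Illustration (added for this listing, not part of the original code):
     * given two input vectors
     *     doc 7  : {(feature 3, weight 0.5), (feature 9, weight 0.2)}
     *     doc 12 : {(feature 3, weight 0.1)}
     * buildInvertedIndex produces the dynamic index
     *     3 -> [(0, 0.5), (1, 0.1)]   // postings store serial IDs 0, 1, ...
     *     9 -> [(0, 0.2)]
     * while dynamicIdMap = [7, 12] maps serial IDs back to original doc IDs.
     */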

    /**
     * Flushes the accumulator of the dynamically compared inverted index
     * after one document has been added to S.
     * @param accumulator per-document similarity scores accumulated so far
     * @param dynamicIdMap maps serial document numbers to original IDs
     * @param output collector receiving the results of the one-to-many comparisons
     * @param threshold minimum similarity for a pair to be emitted
     * @throws IOException
     */
    void flushAccumulator(float[] accumulator, ArrayList<Long> dynamicIdMap,
            OutputCollector<DocDocWritable, FloatWritable> output, float threshold) throws IOException {
        /* Exclude the document that was just added */
        int lastIndex = dynamicIdMap.size() - 1;
        long accID, thisId = dynamicIdMap.get(lastIndex);
        for (int i = 0; i < lastIndex; i++) {
            float sim = accumulator[i];
            if ((sim > threshold) && ((accID = dynamicIdMap.get(i)) != thisId))
                output.collect(new DocDocWritable(dynamicIdMap.get(i), thisId), new FloatWritable(sim));
        }
    }
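
    /*
     * Illustration (added for this listing): with dynamicIdMap = [7, 12, 4]
     * and threshold 0.8, a call made right after doc 4 was added scans
     * accumulator slots 0 and 1 only; a score of 0.9 in slot 0 emits the
     * pair (7, 4) with similarity 0.9.
     */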

    /**
     * Returns the posting list associated with feature t from the dynamically
     * built inverted index, creating an empty list if the feature is new.
     * @param dynSIndex dynamic inverted index built from the S vectors assigned to this mapper
     * @param t the feature whose posting list is requested
     * @return the posting list of feature t
     */
    public ArrayList<PostingDocWeight> getPosting(HashMap<Long, ArrayList<PostingDocWeight>> dynSIndex, long t) {
        if (!dynSIndex.containsKey(t)) {
            dynSIndex.put(t, new ArrayList<PostingDocWeight>());
        }
        return dynSIndex.get(t);
    }

    /**
     * Chooses the partition reader: a OneMapReader when the job is configured
     * to run a single, non-splittable map; otherwise the block-based Reader.
     */
    public static Reader getReader(JobConf conf) throws IOException {
        boolean oneMap = conf.getBoolean(Config.SINGLE_MAP_PROPERTY, Config.SINGLE_MAP_VALUE);
        boolean splittable = conf.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE);

        if (!oneMap || splittable)
            return new Reader(conf, new Path(conf.get("map.input.file")),
                    conf.getInt(Config.COMP_BLOCK_PROPERTY, Config.COMP_BLOCK_VALUE));
        else
            return new OneMapReader(conf, new Path(conf.get("map.input.file")),
                    conf.getInt(Config.COMP_BLOCK_PROPERTY, Config.COMP_BLOCK_VALUE));
    }

    /**
     * Converts the dynamic inverted index into a static, array-based index
     * (with postings sorted by original document ID) and converts the dynamic
     * ID map into a static array, to reduce the memory occupied.
     */
    public HashMap<Long, PostingDocWeight[]> convertDynamicStatic(
            HashMap<Long, ArrayList<PostingDocWeight>> dynSIndex, ArrayList<Long> dynamicIdMap) {
        /* PART 1: collect the features of the dynamic inverted index */
        int i, s;
        PostingDocWeight item;
        long term;
        ArrayList<PostingDocWeight> array;
        HashMap<Long, PostingDocWeight[]> splitIndex = new HashMap<Long, PostingDocWeight[]>();
        Iterator<Long> terms = dynSIndex.keySet().iterator();

        long[] terms_new = new long[dynSIndex.keySet().size()];
        for (i = 0; i < terms_new.length; i++)
            terms_new[i] = terms.next();

        /* PART 2: convert the dynamic list of document IDs */
        convertIdMap(dynamicIdMap);
        if (googleDynSkip) {
            convertSmaxw(dynamicSmaxw);
        }
        System.gc();

        /* PART 3: convert each posting list into a sorted static array */
        for (int h = 0; h < terms_new.length; h++) {
            term = terms_new[h];
            array = dynSIndex.remove(term);
            s = array.size();

            PostingDocWeight[] toStaticArrayII = new PostingDocWeight[s];
            for (i = 0; i < s; i++) {
                item = array.get(i);
                toStaticArrayII[i] = new PostingDocWeight(item.doc, item.weight);
            }
            array.clear();
            Arrays.sort(toStaticArrayII, new IndexComparator());
            splitIndex.put(term, toStaticArrayII);
        }
        dynSIndex.clear();
        System.gc();
        return splitIndex;
    }

    /**
     * Converts the dynamic list of document IDs into a static array to reduce
     * the memory occupied.
     * @param dynamicIdMap
     */
    public void convertIdMap(ArrayList<Long> dynamicIdMap) {
        int s = dynamicIdMap.size();
        long[] idMap = initIdMap(s);
        for (int i = 0; i < s; i++)
            idMap[i] = dynamicIdMap.get(i);
        dynamicIdMap.clear();
    }

    /** Same conversion as {@link #convertIdMap} for the per-document max weights. */
    public void convertSmaxw(ArrayList<Float> dynamicSmaxw) {
        int s = dynamicSmaxw.size();
        float[] maxwS = initSmaxw(s);
        for (int i = 0; i < s; i++)
            maxwS[i] = dynamicSmaxw.get(i);
        dynamicSmaxw.clear();
    }

    public float[] initSmaxw(int s) {
        mapper.maxwS = new float[s];
        return getSmaxw();
    }

    public float[] getSmaxw() {
        return mapper.maxwS;
    }

    public long[] initIdMap(int s) {
        mapper.IdMap = new long[s];
        return getIdMap();
    }

    public long[] getIdMap() {
        return mapper.IdMap;
    }

    /** Orders postings by the original document ID of each entry. */
    class IndexComparator implements Comparator<PostingDocWeight> {
        public int compare(PostingDocWeight o1, PostingDocWeight o2) {
            if (getIdMap()[o1.doc] < getIdMap()[o2.doc])
                return -1;
            else if (getIdMap()[o1.doc] > getIdMap()[o2.doc])
                return 1;
            else
                return 0;
        }
    }

    /** Debugging aid: prints the in-memory inverted index to stdout. */
    public void printSInvertedIndex(HashMap<Long, ArrayList<PostingDocWeight>> termsTree) {
        System.out.println("========= In-memory II =========");
        ArrayList<PostingDocWeight> array;
        long term;
        int postLen, h;
        Iterator<Long> terms = termsTree.keySet().iterator();
        while (terms.hasNext()) {
            term = terms.next();
            System.out.println(term + "-->");
            array = termsTree.get(term);
            postLen = array.size();
            for (h = 0; h < postLen; h++) {
                System.out.println("(" + array.get(h).doc + "," + array.get(h).weight + ")");
            }
        }
    }
}
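
For context, here is a minimal driver sketch showing how a custom MapRunner such as SingleS_Runner is typically wired into an old-API Hadoop job. This driver is not part of the project: the class name, job name, input/output paths, and the mapper registration are placeholders; only the use of JobConf.setMapRunnerClass and the map output key/value types follow from the code above.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import edu.ucsb.cs.hybrid.mappers.SingleS_Runner;
import edu.ucsb.cs.types.DocDocWritable;

public class HybridDriverSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(HybridDriverSketch.class);
        job.setJobName("hybrid-apss"); // placeholder job name
        // Install the custom runner; its run() replaces the default map loop.
        job.setMapRunnerClass(SingleS_Runner.class);
        // instantMapper() reflectively instantiates whatever class is set
        // here; it must be a concrete SingleS_Mapper subclass, e.g.:
        // job.setMapperClass(MySingleSMapper.class); // hypothetical subclass
        job.setMapOutputKeyClass(DocDocWritable.class);
        job.setMapOutputValueClass(FloatWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);
    }
}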