edu.ub.ahstfg.io.index.ArrayIndex.java Source code

Introduction

Here is the source code for edu.ub.ahstfg.io.index.ArrayIndex.java, part of the HHCluster project. A short usage example follows the listing.

Source

/*
 * ArrayIndex.java is part of HHCluster.
 *
 * HHCluster is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HHCluster is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HHCluster.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ub.ahstfg.io.index;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Reporter;

import edu.ub.ahstfg.io.WritableConverter;

/**
 * Index class. Groups all terms and keywords and builds the feature matrices.
 * Document vectors are represented as linked lists, and the feature matrices
 * are laid out with documents as rows and features as columns.
 * This class has been replaced by Index.java, whose memory and time costs are
 * much lower.
 * @author Alberto Huelamo Segura
 */
@Deprecated
public class ArrayIndex implements Writable {

    private LinkedList<String> terms;
    private HashMap<String, LinkedList<Short>> termFreq;

    private HashMap<String, LinkedList<String>> termAppearance;

    private LinkedList<String> keywords;
    private HashMap<String, LinkedList<Short>> keywordFreq;

    private Reporter reporter;

    /**
     * Sole constructor.
     * @param reporter Hadoop job reporter.
     */
    public ArrayIndex(Reporter reporter) {
        terms = new LinkedList<String>();
        termFreq = new HashMap<String, LinkedList<Short>>();

        termAppearance = new HashMap<String, LinkedList<String>>();

        keywords = new LinkedList<String>();
        keywordFreq = new HashMap<String, LinkedList<Short>>();

        this.reporter = reporter;
        // Initialize the counters to 0 so they appear in the job report even
        // if no term is ever filtered out.
        reporter.incrCounter("Index report", "Remove term request", 0);
        reporter.incrCounter("Index report", "Removed terms", 0);
    }

    /**
     * Adds a term from a document to the index.
     * If the term already exists, its frequency is increased.
     * @param term Term to add.
     * @param url URL of the document where the term appears.
     * @param freq Frequency increment.
     */
    public void addTerm(final String term, final String url, final short freq) {
        addTermAppearance(url, term);
        if (!termFreq.containsKey(url)) {
            // New document: create zero-filled frequency lists sized to the
            // current term and keyword vocabularies.
            LinkedList<Short> l = new LinkedList<Short>();
            for (int i = 0; i < terms.size(); i++) {
                l.add((short) 0);
            }
            termFreq.put(url, l);

            l = new LinkedList<Short>();
            for (int i = 0; i < keywords.size(); i++) {
                l.add((short) 0);
            }
            keywordFreq.put(url, l);
        }
        if (terms.contains(term)) {
            final int index = terms.indexOf(term);
            LinkedList<Short> freqs = termFreq.get(url);
            freqs.set(index, (short) (freqs.get(index) + freq));
        } else {
            // New term: append a new column to every document's frequency list,
            // setting it to freq for this document and to 0 for the others.
            terms.add(term);
            final int index = terms.indexOf(term);
            LinkedList<Short> freqs;
            for (String storedUrl : termFreq.keySet()) {
                freqs = termFreq.get(storedUrl);
                if (storedUrl.equals(url)) {
                    freqs.add((short) 0);
                    freqs.set(index, freq);
                } else {
                    freqs.add((short) 0);
                }
            }
        }
    }

    /**
     * Records that a term appears in a document. Used for filtering.
     * @param url URL of the document to add.
     * @param term Term which appears in the document.
     */
    private void addTermAppearance(final String url, final String term) {
        if (!termAppearance.containsKey(term)) {
            termAppearance.put(term, new LinkedList<String>());
        }
        LinkedList<String> docs = termAppearance.get(term);
        if (!docs.contains(url)) {
            docs.add(url);
        }
    }

    /**
     * Gets the term vector.
     * @return An array of terms.
     */
    public String[] getTermVector() {
        return terms.toArray(new String[terms.size()]);
    }

    /**
     * Gets the URLs of the documents that have indexed terms.
     * @return An array of URLs.
     */
    public String[] getDocumentTermVector() {
        String[] ret = new String[termFreq.size()];
        int i = 0;
        for (String url : termFreq.keySet()) {
            ret[i] = url;
            i++;
        }
        return ret;
    }

    /**
     * Gets the term frequency matrix.
     * @return A matrix of frequencies where rows represent documents and
     *         columns represent terms.
     */
    public short[][] getTermFreqMatrix() {
        short[][] ret = new short[termFreq.size()][terms.size()];
        int i = 0, j = 0;
        LinkedList<Short> freqs;
        for (String url : termFreq.keySet()) {
            freqs = termFreq.get(url);
            for (int k = 0; k < terms.size(); k++) {
                ret[i][j] = freqs.get(j);
                j++;
            }
            i++;
            j = 0;
        }
        return ret;
    }

    /**
     * Removes a term from all documents.
     * @param term The term to remove.
     */
    public void removeTerm(String term) {
        int idx = terms.indexOf(term);
        if (idx >= 0) { // indexOf returns -1 when the term is absent; 0 is a valid index
            terms.remove(idx);
            LinkedList<Short> tf;
            for (String url : termFreq.keySet()) {
                tf = termFreq.get(url);
                tf.remove(idx);
            }
            reporter.incrCounter("Index report", "Removed terms", 1);
        }
    }

    /**
     * Filters the index. Removes terms that appear in a smaller fraction of
     * documents than infCote or in a larger fraction than supCote.
     * @param infCote Terms appearing in a smaller fraction of documents than this are removed.
     * @param supCote Terms appearing in a larger fraction of documents than this are removed.
     */
    public void filter(double infCote, double supCote) {
        int nDocs, totalDocs = termFreq.size();
        double rate;
        for (String term : termAppearance.keySet()) {
            nDocs = termAppearance.get(term).size();
            rate = (double) nDocs / (double) totalDocs;
            if (rate < infCote || rate > supCote) {
                removeTerm(term);
                reporter.incrCounter("Index report", "Remove term request", 1);
            }
        }
    }

    /**
     * Adds a keyword from a document to the index.
     * If the keyword already exists, its frequency is increased.
     * @param keyword Keyword to add.
     * @param url URL of the document where the keyword appears.
     * @param freq Frequency increment.
     */
    public void addKeyword(final String keyword, final String url, final Short freq) {
        if (!keywordFreq.containsKey(url)) {
            LinkedList<Short> l = new LinkedList<Short>();
            for (int i = 0; i < keywords.size(); i++) {
                l.add((short) 0);
            }
            keywordFreq.put(url, l);
        }
        if (keywords.contains(keyword)) {
            final int index = keywords.indexOf(keyword);
            LinkedList<Short> freqs = keywordFreq.get(url);
            freqs.set(index, (short) (freqs.get(index) + freq));
        } else {
            // New keyword: append a new column to every document's keyword frequency
            // list, setting it to freq for this document and to 0 for the others.
            keywords.add(keyword);
            final int index = keywords.indexOf(keyword);
            LinkedList<Short> freqs;
            for (String storedUrl : keywordFreq.keySet()) {
                freqs = keywordFreq.get(storedUrl);
                if (storedUrl.equals(url)) {
                    freqs.add((short) 0);
                    freqs.set(index, freq);
                } else {
                    freqs.add((short) 0);
                }
            }
        }
    }

    /**
     * Gets the vector of the added keywords.
     * @return An array with the keywords.
     */
    public String[] getKeywordVector() {
        return keywords.toArray(new String[keywords.size()]);
    }

    /**
     * Gets the keyword frequency matrix.
     * @return A matrix of frequencies where rows represent documents and
     *         columns represent keywords.
     */
    public short[][] getKeywordFreqMatrix() {
        short[][] ret = new short[keywordFreq.size()][keywords.size()];
        int i = 0, j = 0;
        LinkedList<Short> freqs;
        for (String url : keywordFreq.keySet()) {
            freqs = keywordFreq.get(url);
            if (freqs != null) {
                for (int k = 0; k < keywords.size(); k++) {
                    ret[i][j] = freqs.get(j);
                    j++;
                }
            } else {
                // No frequency list stored for this document: mark the row with a single -1.
                ret[i] = new short[] { (short) -1 };
            }
            i++;
            j = 0;
        }
        return ret;
    }

    @Override
    public void readFields(DataInput input) throws IOException {
        ArrayWritable wTerms = new ArrayWritable(Text.class);
        wTerms.readFields(input);
        terms = WritableConverter.arrayWritable2LinkedListString(wTerms);

        MapWritable wTermFreq = new MapWritable();
        wTermFreq.readFields(input);
        termFreq = WritableConverter.mapWritable2HashMapStringLinkedListShort(wTermFreq);

        ArrayWritable wKeywords = new ArrayWritable(Text.class);
        wKeywords.readFields(input);
        keywords = WritableConverter.arrayWritable2LinkedListString(wKeywords);

        MapWritable wKeywordFreq = new MapWritable();
        wKeywordFreq.readFields(input);
        keywordFreq = WritableConverter.mapWritable2HashMapStringLinkedListShort(wKeywordFreq);
    }

    @Override
    public void write(DataOutput output) throws IOException {
        WritableConverter.LinkedListString2ArrayWritable(terms).write(output);
        WritableConverter.hashMapStringLinkedListShort2MapWritable(termFreq).write(output);
        WritableConverter.LinkedListString2ArrayWritable(keywords).write(output);
        WritableConverter.hashMapStringLinkedListShort2MapWritable(keywordFreq).write(output);
    }

}
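
Usage example

The following is a minimal usage sketch, not part of the original listing. The class name ArrayIndexExample, the document URLs, and the frequency values are invented for illustration; Reporter.NULL, the no-op reporter from the old org.apache.hadoop.mapred API, stands in for a real Hadoop job reporter.

import org.apache.hadoop.mapred.Reporter;

import edu.ub.ahstfg.io.index.ArrayIndex;

public class ArrayIndexExample {

    public static void main(String[] args) {
        // Reporter.NULL is a no-op reporter; inside a real job the Reporter
        // handed over by the MapReduce framework would be used instead.
        ArrayIndex index = new ArrayIndex(Reporter.NULL);

        // Index two hypothetical documents.
        index.addTerm("cluster", "http://example.org/doc-a", (short) 3);
        index.addTerm("hadoop", "http://example.org/doc-a", (short) 1);
        index.addTerm("cluster", "http://example.org/doc-b", (short) 2);
        index.addKeyword("distributed", "http://example.org/doc-a", (short) 1);

        // Drop terms that appear in fewer than 10% or more than 90% of the documents.
        index.filter(0.1, 0.9);

        // Rows follow getDocumentTermVector(); columns follow getTermVector().
        String[] docs = index.getDocumentTermVector();
        String[] terms = index.getTermVector();
        short[][] freqs = index.getTermFreqMatrix();
        for (int i = 0; i < docs.length; i++) {
            StringBuilder row = new StringBuilder(docs[i] + ":");
            for (int j = 0; j < terms.length; j++) {
                row.append(' ').append(terms[j]).append('=').append(freqs[i][j]);
            }
            System.out.println(row);
        }

        // ArrayIndex implements Writable, so it can also be serialized with
        // write(DataOutput) and restored with readFields(DataInput).
    }
}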