// DocFrequencySelector.java :  » Natural-Language-Processing » Dragon-Toolkit » dragon » ir » classification » featureselection » Java Open Source
//
// Java Open Source » Natural Language Processing » Dragon Toolkit
// Dragon Toolkit » dragon » ir » classification » featureselection » DocFrequencySelector.java
package dragon.ir.classification.featureselection;

import dragon.ir.classification.DocClassSet;
import dragon.ir.index.*;
import dragon.matrix.*;
import java.util.ArrayList;

/**
 * <p>Unsupervised feature selector that excludes features whose document frequency is less than a given threshold.</p>
 * <p>Please refer to the paper below for details of the algorithm.<br>
 * Yang, Y. and Pedersen, J.O., "A comparative study on feature selection in text categorization,"
 * In Proceedings of International Conference on Machine Learning, 1997, pp. 412-420.
 * </p>
 * <p>Copyright: Copyright (c) 2005</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public class DocFrequencySelector extends AbstractFeatureSelector implements java.io.Serializable {
    private static final long serialVersionUID = 1L;

    /** Inclusive lower bound: a feature is kept when its document frequency is at least this value. */
    private int minDocFrequency;

    /**
     * Creates a selector with the given threshold.
     *
     * @param minDocFrequency the minimum document frequency (inclusive) a feature must reach to be selected
     */
    public DocFrequencySelector(int minDocFrequency) {
        this.minDocFrequency = minDocFrequency;
    }

    /**
     * Selects features from the array returned by
     * <code>getTermDocFrequency(doctermMatrix, trainingSet)</code>: every position whose value is
     * greater than or equal to the threshold is selected.
     *
     * @param doctermMatrix the document-term matrix, passed through to <code>getTermDocFrequency</code>
     * @param trainingSet the training set, passed through to <code>getTermDocFrequency</code>
     * @return the selected positions in ascending order; an empty array if nothing reaches the threshold
     */
    protected int[] getSelectedFeatures(SparseMatrix doctermMatrix, DocClassSet trainingSet) {
        int[] docFrequency;
        int[] buffer;
        int[] featureMap;
        int i, termNum, count;

        docFrequency = getTermDocFrequency(doctermMatrix, trainingSet);
        termNum = docFrequency.length;

        // Worst case every feature is kept, so a buffer of termNum entries is always large enough.
        // Writing indices straight into an int[] avoids boxing each index into an Integer.
        buffer = new int[termNum];
        count = 0;
        for (i = 0; i < termNum; i++) {
            if (docFrequency[i] >= minDocFrequency)
                buffer[count++] = i;
        }

        // Trim the buffer down to the number of features actually selected.
        featureMap = new int[count];
        System.arraycopy(buffer, 0, featureMap, 0, count);
        return featureMap;
    }

    /**
     * Selects features using an index reader. Columns of the matrix returned by
     * <code>getTermDistribution(indexReader, trainingSet)</code> whose sum is not positive are skipped.
     * For each remaining column <code>i</code>, the term <code>indexReader.getIRTerm(i)</code> is
     * selected when its <code>getDocFrequency()</code> is greater than or equal to the threshold.
     *
     * @param indexReader the reader used to look up each candidate term
     * @param trainingSet the training set, passed through to <code>getTermDistribution</code>
     * @return the <code>getIndex()</code> values of the selected terms, in the order the columns were
     *         visited; an empty array if nothing is selected
     */
    protected int[] getSelectedFeatures(IndexReader indexReader, DocClassSet trainingSet) {
        IntDenseMatrix termDistri;
        IRTerm curTerm;
        int[] buffer;
        int[] featureMap;
        int i, termNum, count;

        termDistri = getTermDistribution(indexReader, trainingSet);
        termNum = termDistri.columns();

        // At most one entry per column can be selected.
        buffer = new int[termNum];
        count = 0;
        for (i = 0; i < termNum; i++) {
            if (termDistri.getColumnSum(i) <= 0)
                continue;
            curTerm = indexReader.getIRTerm(i);
            // NOTE(review): getDocFrequency() presumably reports the frequency stored in the index,
            // not one restricted to the training set -- confirm this is the intended measure.
            if (curTerm.getDocFrequency() >= minDocFrequency) {
                // Read the index right away instead of retaining the IRTerm object in a list.
                buffer[count++] = curTerm.getIndex();
            }
        }

        // Trim the buffer down to the number of features actually selected.
        featureMap = new int[count];
        System.arraycopy(buffer, 0, featureMap, 0, count);
        return featureMap;
    }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.