TermGenerativeSum.java :  » Natural-Language-Processing » Dragon-Toolkit » dragon » ir » summarize » Java Open Source

Java Open Source » Natural Language Processing » Dragon Toolkit 
Dragon Toolkit » dragon » ir » summarize » TermGenerativeSum.java
package dragon.ir.summarize;

import dragon.ir.index.*;
import dragon.nlp.Token;
import dragon.nlp.compare.*;
import dragon.util.SortedArray;
import java.util.ArrayList;
/**
 * <p>Generative term based summarization</p>
 * <p></p>
 * <p>Copyright: Copyright (c) 2005</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public class TermGenerativeSum extends AbstractStructureSum implements StructureSummarizer{
    private double bkgCoeffi;

    public TermGenerativeSum(IndexReader indexReader,double bkgCoefficient) {
        super(indexReader);
        this.bkgCoeffi =bkgCoefficient;
    }

    public TopicSummary summarize(ArrayList docSet, int maxLength){
        SortedArray termList;
        TopicSummary summary;
        Token curToken;
        IRDoc curDoc;
        TextUnit curTerm;
        int[] arrIndex, arrFreq;
        int docNum, iterationNum, termNum, i, j;
        double[] arrProb, arrCollectionProb;
        double weightSum, collectionTermCount;

        //prepare data for EM
        docNum=docSet.size();
        termList=new SortedArray(new IndexComparator());
        for (i = 0; i <docNum; i++) {
            curDoc=(IRDoc)docSet.get(i);
            arrIndex = indexReader.getTermIndexList(curDoc.getIndex());
            arrFreq=indexReader.getTermFrequencyList(curDoc.getIndex());
            for (j = 0; j < arrIndex.length; j++){
                curToken=new Token(null);
                curToken.setIndex(arrIndex[j]);
                curToken.setFrequency(arrFreq[j]);
                if(!termList.add(curToken)){
                    ((Token)termList.get(termList.insertedPos())).addFrequency(curToken.getFrequency());
                }
            }
        }

        //initialization
        iterationNum=15;
        arrProb=new double[termList.size()];
        arrCollectionProb=new double[termList.size()];
        collectionTermCount=indexReader.getCollection().getTermCount();
        for(i=0;i<termList.size();i++){
            curToken=(Token)termList.get(i);
            curToken.setWeight(1.0/termList.size());
            arrCollectionProb[i]=bkgCoeffi*indexReader.getIRTerm(curToken.getIndex()).getFrequency()/collectionTermCount;
        }

        //iteration
        for(i=0;i<iterationNum;i++){
            weightSum=0;
            for(j=0;j<termList.size();j++){
                curToken=(Token)termList.get(j);
                arrProb[j]=(1-bkgCoeffi)*curToken.getWeight()/((1-bkgCoeffi)*curToken.getWeight()+arrCollectionProb[j])*curToken.getFrequency();
                weightSum+=arrProb[j];
            }
            for(j=0;j<termList.size();j++)
                ((Token)termList.get(j)).setWeight(arrProb[j]/weightSum);
        }

        summary=new TopicSummary(TextUnit.UNIT_TERM);
        termNum=Math.min(termList.size(),maxLength);
        for(i=0;i<termNum;i++){
            curToken=(Token)termList.get(i);
            curTerm=new TextUnit(indexReader.getTermKey(curToken.getIndex()), curToken.getIndex(),curToken.getWeight());
            summary.addText(curTerm);
        }
        summary.sortByWegiht();
        return summary;
    }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.