AbstractSentenceSum.java :  » Natural-Language-Processing » Dragon-Toolkit » dragon » ir » summarize » Java Open Source

Java Open Source » Natural Language Processing » Dragon Toolkit 
Dragon Toolkit » dragon » ir » summarize » AbstractSentenceSum.java
package dragon.ir.summarize;

import dragon.ir.clustering.*;
import dragon.ir.index.*;
import dragon.matrix.vector.DoubleVector;
import dragon.nlp.compare.*;
import dragon.onlinedb.Article;
import dragon.util.SortedArray;
import java.util.*;

/**
 * <p>The class implement basic function of building summary given a sentence set. </p>
 * <p></p>
 * <p>Copyright: Copyright (c) 2005</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public abstract class AbstractSentenceSum {

    protected String buildSummary(IndexReader indexReader, ArrayList sentSet, int summaryLength, DoubleVector weightVector){
        ArrayList list;
        IRDoc curDoc;
        Article article;
        TopicSummary summary;
        StringBuffer autoSum;
        String curSentence;
        int i, curLength;

        curLength=0;
        summary=new TopicSummary(TextUnit.UNIT_SENTENCE);
        list=new ArrayList(sentSet.size());
        for(i=0;i<sentSet.size();i++){
            curDoc=(IRDoc)sentSet.get(i);
            curDoc.setWeight(weightVector.get(i));
            list.add(curDoc);
        }
        Collections.sort(list,new WeightComparator(true));

        for(i=0;i<list.size() && curLength<summaryLength; i++){
            curDoc=(IRDoc)list.get(i);
            article=indexReader.getOriginalDoc(curDoc.getIndex());
            if(article==null || (curSentence=article.getTitle())==null)
                continue;
            if(summary.contains(new TextUnit(curSentence)))
                continue;
            if(curLength<summaryLength){
                summary.addText(new TextUnit(curSentence,curDoc.getIndex(),curDoc.getWeight()));
                curLength+=curSentence.length();
            }
        }
        summary.sortByWegiht();
        if(summary.size()==0)
            return null;

        autoSum = new StringBuffer(summary.getTextUnit(0).getText());
        for (i = 1; i < summary.size(); i++) {
            autoSum.append("\n");
            autoSum.append(summary.getTextUnit(i).getText());
        }
        if(autoSum.length()<=summaryLength)
            return autoSum.toString();
        else
            return autoSum.substring(0,summaryLength);
    }

    protected String buildSummary(IndexReader indexReader, ArrayList sentSet, int summaryLength, DoubleVector weightVector,
                                  DocClusterSet clusters){
        SortedArray list;
        IRDoc curDoc;
        Article article;
        TopicSummary summary;
        DocCluster curCluster;
        StringBuffer autoSum;
        String curSentence;
        boolean[] usedDoc, usedCluster;
        int i, j, pos,curLength;


        list=new SortedArray(sentSet.size(), new IndexComparator());
        for(i=0;i<sentSet.size();i++){
            curDoc=(IRDoc)sentSet.get(i);
            curDoc.setWeight(weightVector.get(i));
            list.add(curDoc);
        }

        for(i=0;i<clusters.getClusterNum();i++){
            curCluster=clusters.getDocCluster(i);
            for(j=0;j<curCluster.getDocNum();j++){
                curDoc=curCluster.getDoc(j);
                pos=list.binarySearch(curDoc);
                if(pos<0)
                    continue;
                curDoc=(IRDoc)list.get(pos);
                curDoc.setCategory(i);
            }
        }

        list.setComparator(new WeightComparator(true));
        usedDoc=new boolean[list.size()];
        usedCluster=new boolean[clusters.getClusterNum()];
        curLength=0;
        summary=new TopicSummary(TextUnit.UNIT_SENTENCE);

        //extract one sentence with highest score from each cluster
        for(i=0;i<list.size() && curLength<summaryLength; i++){
            curDoc=(IRDoc)list.get(i);
            if(usedCluster[curDoc.getCategory()])
                continue;
            article=indexReader.getOriginalDoc(curDoc.getIndex());
            if(article==null || (curSentence=article.getTitle())==null)
                continue;
            if(summary.contains(new TextUnit(curSentence)))
                continue;
            if(curLength<summaryLength){
                summary.addText(new TextUnit(curSentence,curDoc.getIndex(),curDoc.getWeight()));
                curLength+=curSentence.length();
                usedCluster[curDoc.getCategory()]=true;
                usedDoc[i]=true;
            }
        }

        //extract remaining sentences
        for(i=0;i<list.size() && curLength<summaryLength; i++){
            if(usedDoc[i])
                continue;
            curDoc=(IRDoc)list.get(i);
            article=indexReader.getOriginalDoc(curDoc.getIndex());
            if(article==null || (curSentence=article.getTitle())==null)
                continue;
            if(summary.contains(new TextUnit(curSentence)))
                continue;
            if(curLength<summaryLength){
                summary.addText(new TextUnit(curSentence,curDoc.getIndex(),curDoc.getWeight()));
                curLength+=curSentence.length();
                usedDoc[i]=true;
            }
        }

        summary.sortByWegiht();
        if(summary.size()==0)
            return null;

        autoSum = new StringBuffer(summary.getTextUnit(0).getText());
        for (i = 1; i < summary.size(); i++) {
            autoSum.append("\n");
            autoSum.append(summary.getTextUnit(i).getText());
        }
        if(autoSum.length()<=summaryLength)
            return autoSum.toString();
        else
            return autoSum.substring(0,summaryLength);
    }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.