cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java Source code

Introduction

Here is the source code for cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
Source

/**
 * BeehiveZ is a business process model and instance management system.
 * Copyright (C) 2011  
 * Institute of Information System and Engineering, School of Software, Tsinghua University,
 * Beijing, China
 *
 * Contact: jintao05@gmail.com 
 *
 * This program is a free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation with the version of 2.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package cn.edu.thss.iise.beehivez.server.index.labelindex;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.wordnet.SynExpand;
import org.apache.lucene.wordnet.SynonymMap;
import org.tartarus.snowball.ext.EnglishStemmer;

import cn.edu.thss.iise.beehivez.server.parameter.GlobalParameter;
import cn.edu.thss.iise.beehivez.server.util.FileUtil;

/**
 * A Lucene-backed index of labels. Every label stored in this index is
 * unique: a label is only added when it is not already contained.
 * 
 * @author Tao Jin
 * 
 */
public class LabelLuceneIndex {

    private String parentDirectory = "processrepository/index";
    private File INDEX_DIR = new File(parentDirectory, "labelIndex");
    private IndexWriter indexWriter = null;
    private Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
            StandardAnalyzer.STOP_WORDS_SET);
    private Directory indexDir = null;
    private EnglishStemmer stemer = new EnglishStemmer();

    /**
     * Builds an index handle whose files live in the {@code labelIndex}
     * sub-directory of the given parent directory.
     * <p>
     * Only the Lucene directory is opened here; no writer exists until
     * {@link #open()} or {@link #create()} is called. A failure to open the
     * directory is printed and otherwise ignored, which leaves the directory
     * reference unset.
     * 
     * @param parentDir
     *            directory under which the {@code labelIndex} folder is
     *            located
     */
    public LabelLuceneIndex(String parentDir) {
        this.parentDirectory = parentDir;
        this.INDEX_DIR = new File(parentDir, "labelIndex");
        try {
            // file-system backed directory; an in-memory variant
            // (new RAMDirectory(FSDirectory.open(INDEX_DIR))) is not used
            this.indexDir = FSDirectory.open(this.INDEX_DIR);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Opens a writer on the index directory without recreating the index
     * (the writer's create flag is {@code false}). A writer that is already
     * held by this object is closed first.
     * 
     * @return {@code true} if the writer was opened, {@code false} if any
     *         exception occurred (the exception is printed)
     */
    public boolean open() {
        final boolean createNewIndex = false;
        boolean opened;
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
            indexWriter = new IndexWriter(this.indexDir, analyzer, createNewIndex,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            opened = true;
        } catch (Exception ex) {
            ex.printStackTrace();
            opened = false;
        }
        return opened;
    }

    /**
     * Opens a writer on the index directory with the create flag set to
     * {@code true}, i.e. asking Lucene for a fresh index. A writer that is
     * already held by this object is closed first.
     * 
     * @return {@code true} if the writer was created, {@code false} if any
     *         exception occurred (the exception is printed)
     */
    public boolean create() {
        final boolean createNewIndex = true;
        boolean created;
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
            indexWriter = new IndexWriter(this.indexDir, analyzer, createNewIndex,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            created = true;
        } catch (Exception ex) {
            ex.printStackTrace();
            created = false;
        }
        return created;
    }

    /**
     * Optimizes the index and closes the writer, if one is open.
     * <p>
     * The writer is closed even when the optimize step fails; otherwise a
     * failed optimize would leave the writer open. Exceptions from either
     * step are printed and not propagated.
     */
    public void close() {
        if (indexWriter == null) {
            return;
        }
        try {
            indexWriter.optimize();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // always attempt the close, independent of the optimize outcome
            try {
                indexWriter.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Closes the writer and then hands the absolute path of the index
     * directory to {@code FileUtil.deleteFile}.
     */
    public void destroy() {
        close();
        final String indexPath = INDEX_DIR.getAbsolutePath();
        FileUtil.deleteFile(indexPath);
    }

    /**
     * Checks whether the label is contained in this index.
     * <p>
     * The label is tokenized with the index analyzer, a boolean query that
     * requires every resulting term is run, and the hits are handed to an
     * {@code ExactLabelQueryResultCollector} that decides whether the query
     * label itself is among them.
     * <p>
     * NOTE(review): a label that yields no terms after analysis produces a
     * query without clauses; whether such a label can ever be reported as
     * contained should be confirmed against the collector.
     * 
     * @param label
     *            the label to look for
     * @return {@code true} if the collector reports the label as present;
     *         {@code false} otherwise, including for a {@code null} label and
     *         when any exception occurs (the exception is printed)
     */
    public boolean contain(String label) {
        if (label == null) {
            // nothing to analyze; previously this ended in a caught
            // NullPointerException with the same result
            return false;
        }
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            reader = IndexReader.open(this.indexDir, true);
            searcher = new IndexSearcher(reader);

            // collect the distinct analyzed terms of the label
            HashSet<String> queryTermSet = new HashSet<String>();
            TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
            try {
                TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    queryTermSet.add(termAtt.term());
                }
                stream.end();
            } finally {
                stream.close();
            }

            // construct the query: every term must occur
            BooleanQuery bq = new BooleanQuery();
            for (String s : queryTermSet) {
                Term term = new Term(LabelDocument.FIELD_LABEL, s);
                bq.add(new TermQuery(term), Occur.MUST);
            }

            ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
            searcher.search(bq, collector);
            return collector.isExistQueryLabel();
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // release searcher and reader on every path, not only on success
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Adds a label to the index unless it is already contained.
     * <p>
     * The label is trimmed first; the trimmed form is what gets checked and
     * stored. The writer is committed after the document is added.
     * 
     * @param label
     *            the label to add
     * @return {@code true} if the label was added; {@code false} if it was
     *         already contained or if any exception occurred (the exception
     *         is printed)
     */
    public boolean addLabel(String label) {
        try {
            final String trimmed = label.trim();
            if (!contain(trimmed)) {
                indexWriter.addDocument(LabelDocument.Document(trimmed));
                indexWriter.commit();
                return true;
            }
            return false;
        } catch (Exception ex) {
            ex.printStackTrace();
            return false;
        }
    }

    /**
     * Placeholder: deleting labels is not implemented. The method only
     * prints a notice to standard output.
     */
    public void deleteLabel() {
        final String notice = "unsupported now";
        System.out.println(notice);
    }

    /**
     * Searches the index for labels similar to the query.
     * <p>
     * The query is tokenized with the index analyzer. Each resulting term,
     * plus the stemmed form of each of its synonyms from the synonym map, is
     * added as an optional clause of a boolean query. Hits are passed to a
     * {@code SimilarLabelQueryResultCollector}, which receives the original
     * (unexpanded) query terms and the similarity value.
     * <p>
     * The reader, searcher and token stream are released on every path,
     * including when an exception interrupts the search.
     * <p>
     * NOTE(review): the stemmer is a shared instance field, so concurrent
     * calls on the same object would interfere with each other.
     * 
     * @param query
     *            the label to search for; for {@code null} a single result
     *            built from {@code (null, 1)} is returned
     * @param similarity
     *            value handed to the collector (presumably the minimum
     *            similarity of a hit; confirm against the collector)
     * @return the collector's result set; an empty set if an exception
     *         occurred before the results were obtained (the exception is
     *         printed)
     */
    public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
        TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
        if (query == null) {
            ret.add(new SimilarLabelQueryResult(null, 1));
            return ret;
        }
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            reader = IndexReader.open(this.indexDir, true);
            searcher = new IndexSearcher(reader);

            // get terms from query
            HashSet<String> queryTermSet = new HashSet<String>();
            TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
            try {
                TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    queryTermSet.add(termAtt.term());
                }
                stream.end();
            } finally {
                stream.close();
            }

            // construct the query
            BooleanQuery bq = new BooleanQuery();
            SynonymMap synMap = SynonymIndex.getSynonymMap();
            // tracks every term already present in the query so that a
            // synonym is not added twice
            HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);

            for (String s : queryTermSet) {
                bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, s)), Occur.SHOULD);
                // expand using synonyms
                for (String syn : synMap.getSynonyms(s)) {
                    stemer.setCurrent(syn);
                    stemer.stem();
                    String stemmedSyn = stemer.getCurrent();
                    if (expandedQueryTermSet.add(stemmedSyn)) {
                        bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, stemmedSyn)), Occur.SHOULD);
                    }
                }
            }

            // search in the label index
            SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                    similarity);
            searcher.search(bq, collector);
            ret = collector.getQueryResult();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // release searcher and reader on every path, not only on success
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return ret;
    }

    /**
     * Reports the storage size of the index directory.
     * 
     * @return the value {@code FileUtil.getFileSizeInMB} yields for the
     *         absolute path of the index directory
     */
    public float getStorageSizeInMB() {
        final String indexPath = INDEX_DIR.getAbsolutePath();
        return FileUtil.getFileSizeInMB(indexPath);
    }

    /**
     * Manual benchmark for {@link LabelLuceneIndex}.
     * <p>
     * For each similarity threshold 0.05, 0.10, ..., 1.00 a fresh index is
     * created under {@code processrepository/index}. Every line of
     * {@code labels.txt} is first used as a similarity query and then added
     * to the index. One CSV line per label (threshold, number of labels read
     * so far, result size, query time in ms) is written to
     * {@code labelsimilar.txt} and echoed to standard output.
     * <p>
     * The threshold is derived from an integer step counter. Repeatedly
     * adding {@code 0.05f} accumulates rounding error (0.35000002, ...), so
     * the final value overshoots 1 and the 1.0 run is skipped.
     * <p>
     * Files and the index are closed in {@code finally} blocks so that a
     * failure in the middle of a run does not leave them open.
     * 
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        // test LabelLuceneIndex
        final int totalSteps = 20;
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new FileWriter("labelsimilar.txt", false));
            bw.write("similarity, nlabels, similarsize, timeconsumed(ms)");
            bw.newLine();

            for (int step = 1; step <= totalSteps; step++) {
                // exact multiples of 0.05 without accumulated float error
                float similarity = step / (float) totalSteps;

                LabelLuceneIndex index = new LabelLuceneIndex("processrepository/index");
                index.create();
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader("labels.txt"));
                    String label = br.readLine();
                    int nLabel = 0;
                    while (label != null) {
                        nLabel++;
                        // bw.write(label);
                        // bw.newLine();
                        long startTime = System.currentTimeMillis();
                        TreeSet<SimilarLabelQueryResult> ret = index.getSimilarLabels(label, similarity);
                        long endTime = System.currentTimeMillis();
                        long timeConsumed = endTime - startTime;
                        // Iterator<LabelQueryResult> it = ret.iterator();
                        // while (it.hasNext()) {
                        // LabelQueryResult t = it.next();
                        // bw.write(t.toString());
                        // bw.newLine();
                        // }
                        bw.write(String.valueOf(similarity) + ", " + String.valueOf(nLabel) + ", "
                                + String.valueOf(ret.size()) + ", " + String.valueOf(timeConsumed));
                        bw.newLine();
                        System.out.println(similarity + ", " + nLabel + ", " + ret.size() + ", " + timeConsumed);
                        // bw.write("#####################");
                        // bw.newLine();
                        index.addLabel(label);
                        label = br.readLine();
                    }
                } finally {
                    index.close();
                    if (br != null) {
                        br.close();
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bw != null) {
                try {
                    // closing the buffered writer also closes the file writer
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // // test SnowballAnalyzer
        // try {
        // Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,
        // "English", StandardAnalyzer.STOP_WORDS_SET); // or any other
        // // analyzer
        // TokenStream stream = analyzer.tokenStream("myfield",
        // new FileReader("labelInfo.txt"));
        // TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        //
        // stream.reset();
        //
        // FileWriter fw = new FileWriter("stemWord.txt", false);
        // BufferedWriter bw = new BufferedWriter(fw);
        //
        // int n = 0;
        // while (stream.incrementToken()) {
        // bw.write(termAtt.term());
        // bw.newLine();
        // n++;
        // }
        // System.out.println("in total: " + n);
        //
        // stream.end();
        // stream.close();
        //
        // bw.flush();
        // bw.close();
        // fw.close();
        // } catch (Exception e) {
        // e.printStackTrace();
        // }
    }
}