ri.trabri.Lucene.java Source code

Introduction

Here is the source code for ri.trabri.Lucene.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package ri.trabri;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.PorterStemmer;

/**
 *
 * @author nicolas
 * classe genrica para representar abordagens distintas de indexao
 */
public abstract class Lucene {

    protected StandardAnalyzer analyzer;
    protected Directory index;
    protected IndexWriterConfig config;
    protected IndexWriter w;

    public Lucene() throws IOException {
        this.analyzer = new StandardAnalyzer(Version.LUCENE_40);
        // create the index
        this.index = new RAMDirectory();
        this.config = new IndexWriterConfig(Version.LUCENE_40, this.analyzer);
        w = new IndexWriter(index, config);
    }

    /*
    retorna palavra stemizada
    */
    protected String stemText(String word) {
        PorterStemmer stem = new PorterStemmer();
        stem.setCurrent(word);
        stem.stem();
        String result = stem.getCurrent();
        return result;
    }

    /*
    Cria Tokens a partir de um texto
    */
    protected ArrayList<String> geraTokens(String text) throws IOException {
        TokenStream stream = this.analyzer.tokenStream(null, new StringReader(text));
        ArrayList<String> words = new ArrayList<>();

        CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            //System.out.println(cattr.toString());
            words.add(cattr.toString());
        }
        stream.end();
        stream.close();
        return words;
    }

    /*
    Pega um texto, gera os tokens e stemiza
    */
    protected String getStemTokens(String text) throws IOException {
        ArrayList<String> words = geraTokens(text);
        String data = "";
        for (String word : words) {
            data += stemText(word) + " ";
        }
        return data;
    }

    // cada classe deve implementar sua prpria forma de indexao
    public abstract void Indexar(ArrayList<Documento> docums) throws IOException, ParseException;

    // sua prpria forma de busca
    public abstract ArrayList<String> search(String querystr) throws IOException, ParseException;

}