at.lux.fotoretrieval.lucene.GraphTokenizer.java Source code

Introduction

Here is the source code for at.lux.fotoretrieval.lucene.GraphTokenizer.java
Source

/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.fotoretrieval.lucene;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;
import java.io.Reader;
import java.util.Locale;

/**
 * Token stream that turns a bracketed character stream such as
 * {@code [Node One] [Node Two]} into one token per bracketed section.
 * <p>
 * Every section is emitted in lower case, with blanks replaced by underscores
 * and with a leading underscore, e.g. {@code [Node One]} becomes
 * {@code _node_one}. Characters outside of brackets are skipped.
 * <p>
 * Date: 25.03.2005
 * Time: 22:13:35
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class GraphTokenizer extends TokenStream {
    /** Character seen by the previous call to {@link #isTokenChar(char)}. */
    private char last = ' ';
    /** Source of the characters to tokenize. */
    private final Reader reader;
    /** Per-call state of {@link #next()}: opening / closing bracket of the current token seen. */
    private boolean tokenstart, tokenend;
    /** Number of characters consumed from {@link #reader} so far; used for token offsets. */
    private int offset = 0;

    /**
     * Creates a tokenizer reading from the given reader.
     *
     * @param in the character stream containing the bracketed sections
     */
    public GraphTokenizer(Reader in) {
        reader = in;
    }

    /**
     * Reports whether {@code c} is a blank that directly follows a closing
     * bracket, and remembers {@code c} for the next invocation.
     * <p>
     * This method is not called by {@link #next()}; it is kept for subclasses.
     *
     * @param c the character to inspect
     * @return {@code true} if {@code c} is {@code ' '} and the previously
     *         inspected character was {@code ']'}
     */
    protected boolean isTokenChar(char c) {
        boolean returnValue = false;
        if (c == ' ' && last == ']') {
            returnValue = true;
        }
        last = c;
        return returnValue;
    }

    /**
     * Reads up to and including the next complete {@code [...]} section and
     * returns it as a token.
     * <p>
     * The token's start offset is the position of the opening bracket and its
     * end offset is the position just behind the closing bracket, both counted
     * in characters read from the underlying reader. A closing bracket that
     * has no preceding opening bracket is ignored, and an unterminated section
     * at the end of the stream is discarded.
     *
     * @return the next token, or {@code null} if the end of the stream has
     *         been reached
     * @throws IOException if reading from the underlying reader fails
     */
    public Token next() throws IOException {
        StringBuilder currenttoken = new StringBuilder(64);
        // reset our states :)
        tokenstart = false;
        tokenend = false;
        int startOffset = 0;

        int read;
        // Reader.read() signals the end of the stream with -1.
        while ((read = reader.read()) != -1) {
            char c = (char) read;
            int position = offset++;

            if (c == '[') { // token starts here ...
                if (!tokenstart) {
                    tokenstart = true;
                    startOffset = position;
                }
            } else if (c == ']') { // token ends here ...
                if (tokenstart) {
                    tokenend = true;
                    // prepend an underscore because lucene does not allow leading wildcards.
                    currenttoken.insert(0, '_');
                    // Lower-case independently of the default locale so that index
                    // terms are the same on every machine. trim() only removes
                    // trailing control characters (tabs, line breaks); blanks have
                    // already been turned into underscores at that point.
                    String tokenString = currenttoken.toString()
                            .toLowerCase(Locale.ENGLISH)
                            .replace(' ', '_')
                            .trim();
                    // the end offset is exclusive: one behind the closing bracket.
                    return new Token(tokenString, startOffset, offset);
                }
                // a ']' without a matching '[' must not end the stream: skip it.
            } else if (tokenstart) { // between start and end ...
                currenttoken.append(c);
            }
        }
        // end of stream reached without another complete token.
        return null;
    }
}