org.karsha.tokenize.SimpleTokenizer.java Source code

Introduction

Here is the source code for org.karsha.tokenize.SimpleTokenizer.java
Source

/**
 *   Copyright (C) 2013, Lanka Software Foundation and University of Maryland.
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU Affero General Public License as
 *   published by the Free Software Foundation, either version 3 of the
 *   License, or (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU Affero General Public License for more details.
 *
 *   You should have received a copy of the GNU Affero General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>
 *
 * Date Author Changes 
 * 2010 Shanchan Wu Created (Lucene 2.9 compatible)
 * June 1 2012 Kasun Perera Modified the program to be compatible with Lucene 3.5
 *
 */

package org.karsha.tokenize;

import java.io.File;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Tokenizer built from a Lucene {@link StandardTokenizer} followed by a
 * {@link StandardFilter} and a {@link LowerCaseFilter}.
 * <p>
 * No stop-word removal and no stemming are applied by this class.
 */
public class SimpleTokenizer implements Tokenizer {

    /** Maximum token length handed to {@link StandardTokenizer#setMaxTokenLength(int)}. */
    private static final int MAX_TOKEN_LENGTH = 255;

    // currently not used
    @SuppressWarnings("unused")
    private static final String[] LUCENE_STOP_WORDS = { "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their",
            "then", "there", "these", "they", "this", "to", "was", "will", "with" };

    public SimpleTokenizer() {
    }

    /**
     * Builds the analysis chain over an in-memory string.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return a token stream over {@code text}; the caller is responsible for
     *         consuming and closing it
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public TokenStream tokenStream(String text) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }
        return tokenStream(new StringReader(text));
    }

    /**
     * Builds the analysis chain over an arbitrary reader:
     * {@code StandardTokenizer -> StandardFilter -> LowerCaseFilter}.
     *
     * @param reader the character source to tokenize
     * @return a token stream over {@code reader}; the caller is responsible for
     *         consuming and closing it
     */
    public TokenStream tokenStream(Reader reader) {
        StandardTokenizer source = new StandardTokenizer(Version.LUCENE_35, reader);
        source.setMaxTokenLength(MAX_TOKEN_LENGTH);

        TokenStream result = new StandardFilter(source);
        result = new LowerCaseFilter(result);
        return result;
    }

    /**
     * Tokenizes {@code text} and joins the resulting terms into one string.
     * <p>
     * Every term is followed by a single space, so a non-empty result always
     * ends with a trailing space. Tokenization is best-effort: if the stream
     * fails part-way through, the failure is printed to standard error and the
     * terms collected so far are returned.
     *
     * @param text the text to process; must not be {@code null}
     * @return the terms of {@code text}, each followed by one space
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public String processText(String text) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }

        StringBuilder str = new StringBuilder();
        TokenStream stream = tokenStream(new StringReader(text));

        try {
            // Look the attribute up once; the same instance is updated in place
            // on every incrementToken() call.
            TermAttribute term = stream.getAttribute(TermAttribute.class);

            // Follow the TokenStream consumer workflow: reset, iterate, end, close.
            stream.reset();
            while (stream.incrementToken()) {
                str.append(term.term());
                str.append(" ");
            }
            stream.end();
        } catch (Exception e) {
            // Deliberately broad: callers of this method have never had to
            // handle tokenization failures, so keep returning the partial result.
            e.printStackTrace();
        } finally {
            try {
                stream.close();
            } catch (java.io.IOException e) {
                // Closing failed after the terms were already collected;
                // report it but still return the result.
                e.printStackTrace();
            }
        }

        return str.toString();
    }

    /**
     * Ad-hoc manual check: tokenizes two hard-coded sample strings and prints
     * the results, then runs some file-existence/delete/rename experiments on
     * {@code data/temp.txt}.
     * <p>
     * NOTE(review): the file manipulation below is unrelated to tokenization
     * and looks like leftover scratch code. It deletes {@code data/temp.txt}
     * if present and renames {@code data/temp.txt_re} onto it, ignoring the
     * boolean results of both calls. Confirm nobody relies on this before
     * removing it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String a = "[0013] [T=/p2] -- - a [/p1] Transfer with M2V5Engine incomplete : face No /p4 not created /p3@@@@[0013] [T=/p2] [/p1]  M2V5 ?";
        String x = "dd3] [T=/p2] -- - a [/p1] Trvvsfer with M2V5Engine incomplete : face No /p4 not created /p3@@@@[0013] [T=/p2] [/p1]  M2V5 ?";

        Tokenizer t = new SimpleTokenizer();
        String b = t.processText(a);
        System.out.println(b);

        System.out.println(t.processText(x));

        //================

        String tempFileName = "data/temp.txt";
        File tempFile = new File(tempFileName);
        System.out.println();
        if (tempFile.exists()) {
            System.out.println(true);
            tempFile.delete();
        }

        if (tempFile.exists()) {
            System.out.println("2222Here again");
        } else {
            System.out.println("NOO Here again");
        }

        if (tempFile.exists()) {
            System.out.println("33333Here again");
        }

        File dest = new File(tempFileName + "_re");
        dest.renameTo(tempFile);
    }
}