Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
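
A minimal, self-contained sketch of the usual calling pattern (an illustration, not taken from the sources below; it assumes Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor, and the field name and input text are placeholders):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // addAttribute registers the attribute with the stream (or returns the instance
        // already registered) and the stream updates it on every incrementToken() call.
        try (TokenStream ts = analyzer.tokenStream("field", new StringReader("Hello Lucene world"))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                        // must be called before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString() + " [" + offsetAtt.startOffset()
                        + "," + offsetAtt.endOffset() + "]");
            }
            ts.end();                          // records the offset of the end of the stream
        }
        analyzer.close();
    }
}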

Usage

From source file: edu.mit.ll.vizlinc.highlight.Highlighter.java

License: Apache License

/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param mergeContiguousFragments
 * @param maxNumFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                //the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                //store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                //check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    //record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            //            if(lastEndOffset>maxDocBytesToAnalyze)
            //            {
            //               break;
            //            }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            //flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            //store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        //Test what remains of the original text beyond the point where we stopped analyzing 
        if (
        //               if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        //               and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            //append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        //sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            //If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
                   if (currentFrag.getScore() >= minScore)
                   {
                      fragQueue.put(currentFrag);
                      if (fragQueue.size() > maxNumFragments)
                      { // if hit queue overfull
                         fragQueue.pop(); // remove lowest in hit queue
                         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                      }
                    
                    
                   }
            */
            //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            //fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        //return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        //merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }

        return frag;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file: edu.mit.ll.vizlinc.highlight.QueryScorer.java

License: Apache License

public TokenStream init(TokenStream tokenStream) throws IOException {
    position = -1;
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    if (!skipInitExtractor) {
        if (fieldWeightedSpanTerms != null) {
            fieldWeightedSpanTerms.clear();
        }
        return initExtractor(tokenStream);
    }
    return null;
}

From source file: edu.mit.ll.vizlinc.highlight.QueryTermScorer.java

License: Apache License

public TokenStream init(TokenStream tokenStream) {
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    return null;
}

From source file: edu.mit.ll.vizlinc.highlight.SimpleFragmenter.java

License: Apache License

public void start(String originalText, TokenStream stream) {
    offsetAtt = stream.addAttribute(OffsetAttribute.class);
    currentNumFrags = 1;
}

From source file: edu.mit.ll.vizlinc.highlight.SimpleSpanFragmenter.java

License: Apache License

public void start(String originalText, TokenStream tokenStream) {
    position = -1;
    currentNumFrags = 1;
    textSize = originalText.length();
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
}

From source file: edu.stanford.rad.naivebayes.ClassifyLines.java

License: Apache License

public static void main(String[] args) throws Exception {
    //      if (args.length < 5) {
    //         System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //         return;
    //      }
    //      String modelPath = args[0];
    //      String labelIndexPath = args[1];
    //      String dictionaryPath = args[2];
    //      String documentFrequencyPath = args[3];
    //      String tweetsPath = args[4];

    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // close the TokenStream (fixes a resource leak)
        ts.end();
        ts.close();
        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}

From source file: filters.indexing.IndexableFilter.java

License: Open Source License

/**
 * Constructor for class IndexableFilter
 * @param input
 */
public IndexableFilter(TokenStream input, boolean set_synset_terms) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = input.addAttribute(PayloadAttribute.class);

    this.set_synset_terms = set_synset_terms;
}
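
Because a TokenFilter shares its AttributeSource with the stream it wraps, the getAttribute and addAttribute calls above return the same underlying attribute instances. For context, a minimal, hypothetical filter (not part of the original source) showing the usual pattern of registering an attribute in the constructor and updating it in incrementToken():

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class UpperCaseExampleFilter extends TokenFilter {
    // Shared with the wrapped stream, so edits are visible to downstream consumers.
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public UpperCaseExampleFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        // Upper-case the current term in place.
        char[] buffer = termAtt.buffer();
        for (int i = 0; i < termAtt.length(); i++) {
            buffer[i] = Character.toUpperCase(buffer[i]);
        }
        return true;
    }
}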

From source file: fr.ericlab.sondy.core.DataManipulation.java

License: Open Source License

public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();

        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        //            statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");

        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;

        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";

        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));
        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class
                        .forName("org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }
        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);

        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
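                    // Note: the documented TokenStream workflow also calls reset() before the first
                    // incrementToken() and end()/close() afterwards; newer Lucene versions require reset().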
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_"
                            + ngram + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                            + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();

            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);

            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException
            | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file: fr.inrialpes.exmo.ontosim.string.CommonWords.java

License: Open Source License

private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
    Token token;
    try {
        while ((token = ts.next()) != null) {
       s.add(token.termText());
        }
    } catch (IOException ex) {
       ex.printStackTrace();
    }
    */
    map.put(e, s);
}

From source file: fr.inrialpes.exmo.ontosim.string.JWNLDistances.java

License: Open Source License

/**
 * Takes a gloss-like string (text) and returns it tokenized.
 * with:
 * - stopwords
 * - lower case
 * - porter stemmer
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true, new LowerCaseTokenizer(new StringReader(s)), stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}
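
The last two examples use the pre-4.0 TermAttribute API. On Lucene 4.0 and later, TermAttribute was removed in favor of CharTermAttribute, so tokenizeGloss could be rewritten along these lines (a sketch only, assuming an Analyzer field named analyzer that performs the lower-casing, stop-word removal and Porter stemming, with the same imports as the sketch in the Document section plus java.util.HashSet):

protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    try (TokenStream ts = analyzer.tokenStream("", new StringReader(s))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                         // required before incrementToken() on 4.0+
        while (ts.incrementToken()) {
            result.add(termAtt.toString()); // term() was replaced by toString()
        }
        ts.end();
    }
    return result;
}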