List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
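Before the per-project examples, note the canonical consumption pattern they all share: addAttribute registers an attribute with the stream (or returns the instance that is already registered), and that single instance is updated in place for each token between reset() and end(). The following snippet is a minimal, self-contained sketch rather than code from any project listed here; it assumes Lucene 4.6 on the classpath, and the class name AddAttributeExample and the sample input string are made up for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class AddAttributeExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("text",
                new StringReader("TokenStream attributes are reused for every token"));
        // addAttribute returns the single attribute instance registered with this stream,
        // creating it on the first call and returning the same instance on later calls
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset();                       // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()
                        + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
            }
            ts.end();                         // records the final offset state
        } finally {
            ts.close();
            analyzer.close();
        }
    }
}

Because the attribute instances are shared and reused, copy the term text (for example with termAtt.toString()) if it must outlive the current call to incrementToken().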
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if (lastEndOffset > maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
            // if there is text beyond the last token considered..
            (lastEndOffset < text.length()) &&
            // and that text is not too large...
            (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
             * if (currentFrag.getScore() >= minScore)
             * {
             *     fragQueue.put(currentFrag);
             *     if (fragQueue.size() > maxNumFragments)
             *     { // if hit queue overfull
             *         fragQueue.pop(); // remove lowest in hit queue
             *         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
             *     }
             * }
             */

            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
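For orientation, a caller-side sketch (not part of the original source): it assumes this package mirrors the stock Lucene contrib Highlighter API, that query, analyzer, and text are already defined by the caller, and that the checked IOException and InvalidTokenOffsetsException are handled or declared by the surrounding method.

QueryScorer scorer = new QueryScorer(query, "text");
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(text));
// mergeContiguousFragments = false, at most 5 fragments
TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, false, 5);
for (TextFragment frag : frags) {
    if (frag != null && frag.getScore() > 0) {
        System.out.println(frag.toString());
    }
}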
From source file:edu.mit.ll.vizlinc.highlight.QueryScorer.java
License:Apache License
public TokenStream init(TokenStream tokenStream) throws IOException {
    position = -1;
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    if (!skipInitExtractor) {
        if (fieldWeightedSpanTerms != null) {
            fieldWeightedSpanTerms.clear();
        }
        return initExtractor(tokenStream);
    }
    return null;
}
From source file:edu.mit.ll.vizlinc.highlight.QueryTermScorer.java
License:Apache License
public TokenStream init(TokenStream tokenStream) {
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    return null;
}
From source file:edu.mit.ll.vizlinc.highlight.SimpleFragmenter.java
License:Apache License
public void start(String originalText, TokenStream stream) {
    offsetAtt = stream.addAttribute(OffsetAttribute.class);
    currentNumFrags = 1;
}
From source file:edu.mit.ll.vizlinc.highlight.SimpleSpanFragmenter.java
License:Apache License
public void start(String originalText, TokenStream tokenStream) {
    position = -1;
    currentNumFrags = 1;
    textSize = originalText.length();
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:filters.indexing.IndexableFilter.java
License:Open Source License
/**
 * Constructor for class IndexableFilter
 * @param input
 */
public IndexableFilter(TokenStream input, boolean set_synset_terms) {
    super(input);
    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);
    // Setting attributes for this token stream
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = input.addAttribute(PayloadAttribute.class);
    this.set_synset_terms = set_synset_terms;
}
From source file:fr.ericlab.sondy.core.DataManipulation.java
License:Open Source License
public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();
        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        // statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");
        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;

        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";
        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));

        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class.forName("org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }

        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);

        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                            + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES" + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();

            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);
            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:fr.inrialpes.exmo.ontosim.string.CommonWords.java
License:Open Source License
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
     * Token token;
     * try {
     *     while ((token = ts.next()) != null) {
     *         s.add(token.termText());
     *     }
     * } catch (IOException ex) {
     *     ex.printStackTrace();
     * }
     */
    map.put(e, s);
}
From source file:fr.inrialpes.exmo.ontosim.string.JWNLDistances.java
License:Open Source License
/**
 * Takes a gloss-like string (text) and returns it tokenized,
 * with:
 * - stopwords removed
 * - lower case
 * - porter stemmer
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true, new LowerCaseTokenizer(new StringReader(s)), stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}