Usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
From source file:com.globalsight.ling.lucene.TbTextIndex.java
License:Apache License
protected Query getQuery(String p_text) throws IOException { PhraseQuery result = new PhraseQuery(); TokenStream tokens = m_analyzer.tokenStream(IndexDocument.TEXT, new StringReader(p_text)); tokens.reset(); Token t;// w w w.ja va2 s. c o m while ((t = LuceneUtil.getNextToken(tokens)) != null) { result.add(new Term(IndexDocument.TEXT, t.toString())); } return result; }
From source file:com.globalsight.ling.lucene.TmFuzzyIndex.java
License:Apache License
protected Query getQuery(String p_text) throws IOException { BooleanQuery result = new BooleanQuery(); TokenStream tokens = m_analyzer.tokenStream(IndexDocument.TEXT, new StringReader(p_text)); tokens.reset(); Token t;//from w w w . j ava2s . c o m while ((t = LuceneUtil.getNextToken(tokens)) != null) { result.add(new BooleanClause(new TermQuery(new Term(IndexDocument.TEXT, t.toString())), Occur.SHOULD)); } return result; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer./*from w w w . jav a2 s .co m*/ * * @param p_text fuzzy match format string * @return List of c.g.l.tm2.index.Tokens */ public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class); //org.apache.lucene.analysis.Token luceneToken = null; List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { // luceneToken = gsAtt.getToken(); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return buildTokenList(tokens); }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer. This method is suitable for use with TM3 * fuzzy indices, and does two things differently than createGsTokens(): * 1) It returns tokens in the order in which they appear * 2) It does not collapse duplicate tokens (and correspondingly does * not return count information)/*from w w w . j av a 2s . c o m*/ * * @param p_text fuzzy match format string * @return List of Strings, each representing one token */ public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
@SuppressWarnings("resource") public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale, false); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString());/* w w w .j a v a 2 s. c o m*/ } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.TuvDocument.java
License:Apache License
private int getTotalTokenCount(String text, Analyzer analyzer) throws Exception { TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(text)); tokenStream.reset(); int tokenCount = 0; while (tokenStream.incrementToken()) { tokenCount++;// w w w . ja v a 2 s . c o m } return tokenCount; }
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException { // Get stemmed question SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER); EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer(); TokenStream ts = ea.tokenStream("field", question); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); List<String> stemmedQuestion = new ArrayList<String>(); while (ts.incrementToken()) stemmedQuestion.add(charTermAttribute.toString()); // get query terms BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses(); SpanQuery[] queries;/*from ww w . j a va 2 s . c o m*/ String term1, term2; List<String> unigrams = new ArrayList<String>(); int numFields = clauses.length / (2 * stemmedQuestion.size() - 1); // test bigrams int bigramidx = 0; for (int idx = 0; idx < clauses.length; idx++) { Query q = clauses[idx].getQuery(); if (q instanceof SpanNearQuery) { queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = bigramidx / numFields; term1 = ((SpanTermQuery) queries[0]).getTerm().text(); term2 = ((SpanTermQuery) queries[1]).getTerm().text(); assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); bigramidx++; } else if (q instanceof TermQuery) { unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text()); } else { assertTrue("Unknown type of query found!", false); } } // test unigrams for (String s : unigrams) assertTrue(stemmedQuestion.contains(s)); for (String s : stemmedQuestion) assertTrue(unigrams.contains(s)); }
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanTrigramQueryGeneratorTest.java
License:Open Source License
private void test_that_generated_trigram_query_match_the_referenced_query() throws IOException { Set<Term> queryTerms = new HashSet<Term>(); // Get stemmed question SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER); EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer(); TokenStream ts = ea.tokenStream("field", question); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); ts.reset(); List<String> stemmedQuestion = new ArrayList<String>(); while (ts.incrementToken()) stemmedQuestion.add(charTermAttribute.toString()); // get query terms spanTrigramQuery.extractTerms(queryTerms); BooleanClause[] clauses = ((BooleanQuery) spanTrigramQuery).getClauses(); SpanQuery[] qs;//from w ww. jav a 2 s . com String term1, term2, term3; int numFields = clauses.length / (3 * stemmedQuestion.size() - 3); List<String> unigrams = new ArrayList<String>(); int idx = 0; // test trigrams int trigramidx = 0; for (idx = clauses.length - numFields * (stemmedQuestion.size() - 2); idx < clauses.length; idx++) { qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = trigramidx / numFields; term1 = ((SpanTermQuery) qs[0]).getTerm().text(); term2 = ((SpanTermQuery) qs[1]).getTerm().text(); term3 = ((SpanTermQuery) qs[2]).getTerm().text(); assertEquals("Extracted first term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); assertEquals("Extracted third term in the trigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 2), term3); trigramidx++; } // test bigrams int bigramidx = 0; for (idx = 0; idx < (2 * stemmedQuestion.size() - 1) * numFields; idx++) { Query q = clauses[idx].getQuery(); if (q instanceof SpanNearQuery) { qs = ((SpanNearQuery) clauses[idx].getQuery()).getClauses(); int termidx = bigramidx / numFields; term1 = 
((SpanTermQuery) qs[0]).getTerm().text(); term2 = ((SpanTermQuery) qs[1]).getTerm().text(); assertEquals("Extracted first term in the bigram doesn't match the stemmed term", stemmedQuestion.get(termidx), term1); assertEquals("Extracted second term in the bigram doesn't match the stemmed term", stemmedQuestion.get(termidx + 1), term2); bigramidx++; } else if (q instanceof TermQuery) { unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text()); } else { assertTrue("Unknown type of query found!", false); } } // test unigrams for (String s : unigrams) assertTrue(stemmedQuestion.contains(s)); for (String s : stemmedQuestion) assertTrue(unigrams.contains(s)); }
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
private void test(String name, Analyzer a, String text) throws IOException { final Reader r = new StringReader(text); final TokenStream s = a.tokenStream(null, r); List<String> list = Lists.newLinkedList(); s.reset(); while (s.incrementToken()) { if (s.hasAttribute(CharTermAttribute.class)) { list.add(s.getAttribute(CharTermAttribute.class).toString()); }// w ww. j a va2s . c om } System.out.printf("[%s] %s => %s\n", name, text, list); }
From source file:com.liferay.events.global.mobile.Utils.java
License:Open Source License
public static String removeStopWords(String words) throws IOException { if (Validator.isNull(EventContactServiceImpl.stopWords)) { EventContactServiceImpl.stopWords = new TreeSet<String>(); BufferedReader r = new BufferedReader(new InputStreamReader( EventContactService.class.getClassLoader().getResourceAsStream("stopwords/words.txt"))); String nextLine;/*from w ww . j av a2 s. c o m*/ while ((nextLine = r.readLine()) != null) { String word = nextLine.trim(); if (Validator.isNotNull(word)) { EventContactServiceImpl.stopWords.add(nextLine.trim()); } } r.close(); } // remove punctuation and stuff final CharArraySet stopSet = new CharArraySet(Version.LUCENE_35, EventContactServiceImpl.stopWords, true); TokenStream tokenStream = new StopFilter(Version.LUCENE_35, new StandardTokenizer(Version.LUCENE_35, new StringReader(words)), stopSet); StringBuilder sb = new StringBuilder(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); sb.append(term).append(" "); } return sb.toString(); }