List of usage examples for org.apache.lucene.analysis Token clone
@Override
public Token clone()
From source file:analysis.StandardTokenizer.java
License:Apache License
@SuppressWarnings("deprecation") public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; int posIncr = 1; Token result = reusableToken;/* ww w .ja va2 s .c o m*/ if (tokenList.size() > 0) return tokenList.remove(); while (true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerImpl.YYEOF) { return null; } if (scanner.yylength() <= maxTokenLength) { reusableToken.clear(); reusableToken.setPositionIncrement(posIncr); scanner.getText(reusableToken); final int start = scanner.yychar(); reusableToken.setStartOffset(start); reusableToken.setEndOffset(start + reusableToken.termLength()); // This 'if' should be removed in the next release. For now, it // converts // invalid acronyms to HOST. When removed, only the 'else' part // should // remain. if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { if (replaceInvalidAcronym) { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); reusableToken.setTermLength(reusableToken.termLength() - 1); // remove // extra // '.' tokenType = StandardTokenizerImpl.HOST; } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); tokenType = StandardTokenizerImpl.ACRONYM; } } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); } if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM || tokenType == StandardTokenizerImpl.ALPHANUM) { Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term())); Token tk = null; int st = reusableToken.startOffset(); final Token token = new Token(); while ((tk = lt.next(token)) != null) { tk.setStartOffset(tk.startOffset() + st); tk.setEndOffset(tk.endOffset() + st); tk.setType(reusableToken.type()); tokenList.add((Token) tk.clone()); } } if (tokenList.size() > 0) result = tokenList.remove(); return result; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } }
From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java
License:Apache License
/** * @throws IOException /*ww w .j av a 2 s. c o m*/ * @param ts the TokenStream to wrap * @param maxTuple If maxTuple>1 then tuples of 1..maxTuples will be return. * Ie if the document is "t1 t2 t3 t4" and maxTuple=2, then the return wil * be "t1 t2 t1_t2 t2_t3" * */ public TupleTokenizer(TokenStream ts, int maxTuples) throws IOException { MAX_INCREMENT = maxTuples; Token tk = new Token(); while ((tk = ts.next(tk)) != null) { tokens.add((Token) tk.clone()); } }
From source file:com.flaptor.hounder.searcher.query.AQuerySuggestor.java
License:Apache License
private List<AQuery> suggestLinear(AQuery query) { List<AQuery> queries = new ArrayList<AQuery>(); if (null == query) { logger.debug("Can't make a suggestion for a null query"); } else if (!(query instanceof LazyParsedQuery)) { // TODO FIXME logger.debug("can not make suggestions for queries of type " + query.getClass()); } else {/* w ww.j av a 2 s . c om*/ String originalString = ((LazyParsedQuery) query).getQueryString(); StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(originalString)); List<String> tokens = new ArrayList<String>(); try { Token token = new Token(); while (true) { token = tokenizer.next(token); if (null == token) { break; } tokens.add(TokenUtil.termText((Token) token.clone())); } // for every word, suggest something for (int i = 0; i < tokens.size(); i++) { StringBuffer sb = new StringBuffer(); // sb.append("\""); for (int j = 0; j < i; j++) { sb.append(tokens.get(j)); sb.append(" "); } String[] suggestions = suggestor.suggestWords(tokens.get(i)); for (String suggestion : suggestions) { // generate final sb StringBuffer sbf = new StringBuffer(sb); sbf.append(suggestion); sbf.append(" "); for (int k = i + 1; k < tokens.size(); k++) { sbf.append(tokens.get(k)); if (k + 1 < tokens.size()) { sbf.append(" "); } } // sbf.append("\""); queries.add(new LazyParsedQuery(sbf.toString())); } } } catch (IOException e) { logger.error("Error while suggesting query", e); return new ArrayList<AQuery>(); } } return queries; }
From source file:org.alfresco.repo.search.impl.lucene.AbstractLuceneQueryParser.java
License:Open Source License
@SuppressWarnings("unchecked") protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode, LuceneFunction luceneFunction) throws ParseException { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or noth // TODO: Untokenised columns with functions require special handling if (luceneFunction != LuceneFunction.FIELD) { throw new UnsupportedOperationException( "Field queries are not supported on lucene functions (UPPER, LOWER, etc)"); }/*from w w w.j a v a 2 s .c o m*/ // if the incoming string already has a language identifier we strip it iff and addit back on again String localePrefix = ""; String toTokenise = queryText; if (queryText.startsWith("{")) { int position = queryText.indexOf("}"); String language = queryText.substring(0, position + 1); Locale locale = new Locale(queryText.substring(1, position)); String token = queryText.substring(position + 1); boolean found = false; if (!locale.toString().isEmpty()) { for (Locale current : Locale.getAvailableLocales()) { if (current.toString().equalsIgnoreCase(locale.toString())) { found = true; break; } } } if (found) { localePrefix = language; toTokenise = token; } else { toTokenise = token; } } String testText = toTokenise; boolean requiresMLTokenDuplication = false; String localeString = null; if (field.startsWith(PROPERTY_FIELD_PREFIX) && (localePrefix.length() == 0)) { if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) { int position = queryText.indexOf("\u0000", 1); testText = queryText.substring(position + 1); requiresMLTokenDuplication = true; localeString = queryText.substring(1, position); } } // find the positions of any escaped * and ? and ignore them Set<Integer> wildcardPoistions = getWildcardPositions(testText); TokenStream source; if ((localePrefix.length() == 0) || (wildcardPoistions.size() > 0) || (analysisMode == AnalysisMode.IDENTIFIER)) { source = getAnalyzer().tokenStream(field, new StringReader(toTokenise), analysisMode); } else { source = getAnalyzer().tokenStream(field, new StringReader( "\u0000" + localePrefix.substring(1, localePrefix.length() - 1) + "\u0000" + toTokenise), analysisMode); localePrefix = ""; } ArrayList<org.apache.lucene.analysis.Token> list = new ArrayList<org.apache.lucene.analysis.Token>(); org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); org.apache.lucene.analysis.Token nextToken; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { nextToken = source.next(reusableToken); } catch (IOException e) { nextToken = null; } if (nextToken == null) break; list.add((org.apache.lucene.analysis.Token) nextToken.clone()); if (nextToken.getPositionIncrement() != 0) positionCount += nextToken.getPositionIncrement(); else severalTokensAtSamePosition = true; } try { source.close(); } catch (IOException e) { // ignore } // add any alpha numeric wildcards that have been missed // Fixes most stop word and wild card issues for (int index = 0; index < testText.length(); index++) { char current = testText.charAt(index); if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) { StringBuilder pre = new StringBuilder(10); if (index == 0) { // "*" and "?" at the start boolean found = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= 0) && (0 < test.endOffset())) { found = true; break; } } if (!found && (testText.length() == 1)) { // Add new token followed by * not given by the tokeniser org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(0, 0); newToken.setTermBuffer(""); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null ? defaultSearchMLAnalysisMode : searchParameters.getMlAnalaysisMode(); MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode); Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } else if (index > 0) { // Add * and ? back into any tokens from which it has been removed boolean tokenFound = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= index) && (index < test.endOffset())) { if (requiresMLTokenDuplication) { String termText = new String(test.termBuffer(), 0, test.termLength()); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); if (index >= test.startOffset() + token.length()) { test.setTermBuffer(language + token + current); } } else { if (index >= test.startOffset() + test.termLength()) { test.setTermBuffer(test.term() + current); } } tokenFound = true; break; } } if (!tokenFound) { for (int i = index - 1; i >= 0; i--) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { pre.insert(0, c); } } else { break; } } if (pre.length() > 0) { // Add new token followed by * not given by the tokeniser org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token( index - pre.length(), index); newToken.setTermBuffer(pre.toString()); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null ? defaultSearchMLAnalysisMode : searchParameters.getMlAnalaysisMode(); MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode); Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } StringBuilder post = new StringBuilder(10); if (index > 0) { for (int i = index + 1; i < testText.length(); i++) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { post.append(c); } } else { break; } } if (post.length() > 0) { // Add new token followed by * not given by the tokeniser org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(index + 1, index + 1 + post.length()); newToken.setTermBuffer(post.toString()); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null ? defaultSearchMLAnalysisMode : searchParameters.getMlAnalaysisMode(); MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode); Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } } Collections.sort(list, new Comparator<org.apache.lucene.analysis.Token>() { public int compare(Token o1, Token o2) { int dif = o1.startOffset() - o2.startOffset(); if (dif != 0) { return dif; } else { return o2.getPositionIncrement() - o1.getPositionIncrement(); } } }); // Combined * and ? based strings - should redo the tokeniser // Build tokens by position LinkedList<LinkedList<org.apache.lucene.analysis.Token>> tokensByPosition = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); LinkedList<org.apache.lucene.analysis.Token> currentList = null; for (org.apache.lucene.analysis.Token c : list) { if (c.getPositionIncrement() == 0) { if (currentList == null) { currentList = new LinkedList<org.apache.lucene.analysis.Token>(); tokensByPosition.add(currentList); } currentList.add(c); } else { currentList = new LinkedList<org.apache.lucene.analysis.Token>(); tokensByPosition.add(currentList); currentList.add(c); } } // Build all the token sequences and see which ones get strung together LinkedList<LinkedList<org.apache.lucene.analysis.Token>> allTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); for (LinkedList<org.apache.lucene.analysis.Token> tokensAtPosition : tokensByPosition) { if (allTokenSequences.size() == 0) { for (org.apache.lucene.analysis.Token t : tokensAtPosition) { LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>(); newEntry.add(t); allTokenSequences.add(newEntry); } } else { LinkedList<LinkedList<org.apache.lucene.analysis.Token>> newAllTokeSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (org.apache.lucene.analysis.Token t : tokensAtPosition) { boolean tokenFoundSequence = false; for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) { LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>(); newEntry.addAll(tokenSequence); if (newEntry.getLast().endOffset() <= t.startOffset()) { newEntry.add(t); tokenFoundSequence = true; } newAllTokeSequences.add(newEntry); } if (false == tokenFoundSequence) { LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>(); newEntry.add(t); newAllTokeSequences.add(newEntry); } // Limit the max number of permutations we consider if (newAllTokeSequences.size() > 64) { break FOR_FIRST_TOKEN_AT_POSITION_ONLY; } } allTokenSequences = newAllTokeSequences; } } // build the uniquie LinkedList<LinkedList<org.apache.lucene.analysis.Token>> fixedTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) { LinkedList<org.apache.lucene.analysis.Token> fixedTokenSequence = new LinkedList<org.apache.lucene.analysis.Token>(); fixedTokenSequences.add(fixedTokenSequence); org.apache.lucene.analysis.Token replace = null; for (org.apache.lucene.analysis.Token c : tokenSequence) { if (replace == null) { StringBuilder prefix = new StringBuilder(); for (int i = c.startOffset() - 1; i >= 0; i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { prefix.insert(0, test); } else { break; } } String pre = prefix.toString(); if (requiresMLTokenDuplication) { String termText = new String(c.termBuffer(), 0, c.termLength()); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(language + pre + token); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String termText = new String(c.termBuffer(), 0, c.termLength()); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(pre + termText); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } else { StringBuilder prefix = new StringBuilder(); StringBuilder postfix = new StringBuilder(); StringBuilder builder = prefix; for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { builder.insert(0, test); } else { builder = postfix; postfix.setLength(0); } } String pre = prefix.toString(); String post = postfix.toString(); // Does it bridge? if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) { String termText = new String(c.termBuffer(), 0, c.termLength()); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); @SuppressWarnings("unused") String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset()); replace.setTermBuffer(replaceTermText + pre + token); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } else { int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset()); replace.setTermBuffer(replaceTermText + pre + termText); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } } else { String termText = new String(c.termBuffer(), 0, c.termLength()); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token( replace.startOffset(), replace.endOffset() + post.length()); last.setTermBuffer(replaceTermText + post); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(language + pre + token); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token( replace.startOffset(), replace.endOffset() + post.length()); last.setTermBuffer(replaceTermText + post); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(pre + termText); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } } } // finish last if (replace != null) { StringBuilder postfix = new StringBuilder(); if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) { for (int i = replace.endOffset(); i < testText.length(); i++) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { postfix.append(test); } else { break; } } } String post = postfix.toString(); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); replace = new org.apache.lucene.analysis.Token(replace.startOffset(), replace.endOffset() + post.length()); replace.setTermBuffer(replaceTermText + post); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); fixedTokenSequence.add(replace); } } // rebuild fixed list ArrayList<org.apache.lucene.analysis.Token> fixed = new ArrayList<org.apache.lucene.analysis.Token>(); for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) { for (org.apache.lucene.analysis.Token token : tokenSequence) { fixed.add(token); } } // reorder by start position and increment Collections.sort(fixed, new Comparator<org.apache.lucene.analysis.Token>() { public int compare(Token o1, Token o2) { int dif = o1.startOffset() - o2.startOffset(); if (dif != 0) { return dif; } else { return o1.getPositionIncrement() - o2.getPositionIncrement(); } } }); // make sure we remove any tokens we have duplicated @SuppressWarnings("rawtypes") OrderedHashSet unique = new OrderedHashSet(); unique.addAll(fixed); fixed = new ArrayList<org.apache.lucene.analysis.Token>(unique); list = fixed; // add any missing locales back to the tokens if (localePrefix.length() > 0) { for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token currentToken = list.get(j); String termText = new String(currentToken.termBuffer(), 0, currentToken.termLength()); currentToken.setTermBuffer(localePrefix + termText); } } if (list.size() == 0) return null; else if (list.size() == 1) { nextToken = list.get(0); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); if (termText.contains("*") || termText.contains("?")) { return newWildcardQuery( new Term(field, getLowercaseExpandedTerms() ? termText.toLowerCase() : termText)); } else { return newTermQuery(new Term(field, termText)); } } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = newBooleanQuery(true); for (int i = 0; i < list.size(); i++) { Query currentQuery; nextToken = list.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); if (termText.contains("*") || termText.contains("?")) { currentQuery = newWildcardQuery(new Term(field, getLowercaseExpandedTerms() ? termText.toLowerCase() : termText)); } else { currentQuery = newTermQuery(new Term(field, termText)); } q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } // Consider if we can use a multi-phrase query (e.g for synonym use rather then WordDelimiterFilterFactory) else if (canUseMultiPhraseQuery(fixedTokenSequences)) { // phrase query: MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(internalSlop); ArrayList<Term> multiTerms = new ArrayList<Term>(); int position = 0; for (int i = 0; i < list.size(); i++) { nextToken = list.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { addWildcardTerms(multiTerms, term); } else { multiTerms.add(term); } if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) { if (getEnablePositionIncrements()) { mpq.add(multiTerms.toArray(new Term[0]), position); } else { mpq.add(multiTerms.toArray(new Term[0])); } checkTermCount(field, queryText, mpq); multiTerms.clear(); } position += nextToken.getPositionIncrement(); } if (getEnablePositionIncrements()) { if (multiTerms.size() > 0) { mpq.add(multiTerms.toArray(new Term[0]), position); } // else // { // mpq.add(new Term[] { new Term(field, "\u0000") }, position); // } } else { if (multiTerms.size() > 0) { mpq.add(multiTerms.toArray(new Term[0])); } // else // { // mpq.add(new Term[] { new Term(field, "\u0000") }); // } } checkTermCount(field, queryText, mpq); return mpq; } // Word delimiter factory and other odd things generate complex token patterns // Smart skip token sequences with small tokens that generate toomany wildcards // Fall back to the larger pattern // e.g Site1* will not do (S ite 1*) or (Site 1*) if 1* matches too much (S ite1*) and (Site1*) will still be OK // If we skip all (for just 1* in the input) this is still an issue. else { boolean skippedTokens = false; BooleanQuery q = newBooleanQuery(true); TOKEN_SEQUENCE: for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) { // phrase query: MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(internalSlop); int position = 0; for (int i = 0; i < tokenSequence.size(); i++) { nextToken = (org.apache.lucene.analysis.Token) tokenSequence.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); Term term = new Term(field, termText); if (getEnablePositionIncrements()) { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { mpq.add(getMatchingTerms(field, term), position); } else { mpq.add(new Term[] { term }, position); } if (exceedsTermCount(mpq)) { // We could duplicate the token sequence without the failing wildcard expansion and try again ?? skippedTokens = true; continue TOKEN_SEQUENCE; } if (nextToken.getPositionIncrement() > 0) { position += nextToken.getPositionIncrement(); } else { position++; } } else { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { mpq.add(getMatchingTerms(field, term)); } else { mpq.add(term); } if (exceedsTermCount(mpq)) { skippedTokens = true; continue TOKEN_SEQUENCE; } } } q.add(mpq, BooleanClause.Occur.SHOULD); } if (skippedTokens && (q.clauses().size() == 0)) { throw new LuceneQueryParserException( "Query skipped all token sequences as wildcards generated too many clauses: " + field + " " + queryText); } return q; } } else { MultiPhraseQuery q = new MultiPhraseQuery(); q.setSlop(internalSlop); int position = 0; for (int i = 0; i < list.size(); i++) { nextToken = list.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); Term term = new Term(field, termText); if (getEnablePositionIncrements()) { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { q.add(getMatchingTerms(field, term), position); } else { q.add(new Term[] { term }, position); } checkTermCount(field, queryText, q); if (nextToken.getPositionIncrement() > 0) { position += nextToken.getPositionIncrement(); } else { position++; } } else { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { q.add(getMatchingTerms(field, term)); } else { q.add(term); } checkTermCount(field, queryText, q); } } return q; } } }
From source file:org.apache.solr.analysis.TestSynonymFilter.java
License:Apache License
public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException { ArrayList<Token> lst = new ArrayList<Token>(); final List toks = tokens(input); TokenStream ts = new TokenStream() { Iterator iter = toks.iterator(); @Override// w w w . ja v a 2s. c o m public Token next() throws IOException { return iter.hasNext() ? (Token) iter.next() : null; } }; SynonymFilter sf = new SynonymFilter(ts, dict); Token target = new Token(); // test with token reuse while (true) { Token t = sf.next(target); if (t == null) return lst; lst.add((Token) t.clone()); } }