List of usage examples for org.apache.lucene.analysis.TokenStream.close()
@Override public void close() throws IOException
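Before the per-project examples below, a minimal sketch of the lifecycle they all follow: reset(), incrementToken() in a loop, end(), then close(). TokenStream implements Closeable, so in current Lucene versions try-with-resources can guarantee the close() call; the analyzer choice, field name, and sample text here are placeholders, not taken from any example below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources calls close() even if incrementToken() throws.
        try (TokenStream ts = analyzer.tokenStream("content", "The quick brown fox")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                      // records the final offset/position state
        }                                  // close() releases the stream's resources here
        analyzer.close();
    }
}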
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
@SuppressWarnings("unchecked") protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode, LuceneFunction luceneFunction) throws ParseException, IOException { // make sure the field exists or return a dummy query so we have no // error ....ACE-3231 SchemaField schemaField = schema.getFieldOrNull(field); boolean isNumeric = false; if (schemaField == null) { return new TermQuery(new Term("_dummy_", "_miss_")); } else {/* ww w . j a va 2 s. co m*/ isNumeric = (schemaField.getType().getNumericType() != null); if (isNumeric) { //Check to see if queryText is numeric or else it will fail. try { Double.valueOf(queryText); } catch (NumberFormatException e) { return new TermQuery(new Term("_dummy_", "_miss_")); } } } // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or noth // TODO: Untokenised columns with functions require special handling if (luceneFunction != LuceneFunction.FIELD) { throw new UnsupportedOperationException( "Field queries are not supported on lucene functions (UPPER, LOWER, etc)"); } // if the incoming string already has a language identifier we strip it // iff and addit back on again String localePrefix = ""; String toTokenise = queryText; if (queryText.startsWith("{")) { int position = queryText.indexOf("}"); if (position > 0) { String language = queryText.substring(0, position + 1); Locale locale = new Locale(queryText.substring(1, position)); String token = queryText.substring(position + 1); boolean found = false; for (Locale current : Locale.getAvailableLocales()) { if (current.toString().equalsIgnoreCase(locale.toString())) { found = true; break; } } if (found) { localePrefix = language; toTokenise = token; } else { // toTokenise = token; } } } String testText = toTokenise; boolean requiresMLTokenDuplication = false; String localeString = null; if (isPropertyField(field) && (localePrefix.length() == 0)) { if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) { int position = queryText.indexOf("\u0000", 1); testText = queryText.substring(position + 1); requiresMLTokenDuplication = true; localeString = queryText.substring(1, position); } } // find the positions of any escaped * and ? 
and ignore them Set<Integer> wildcardPoistions = getWildcardPositions(testText); TokenStream source = null; ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>(); boolean severalTokensAtSamePosition = false; PackedTokenAttributeImpl nextToken; int positionCount = 0; try { source = getAnalyzer().tokenStream(field, new StringReader(toTokenise)); source.reset(); while (source.incrementToken()) { CharTermAttribute cta = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class); TypeAttribute typeAtt = null; if (source.hasAttribute(TypeAttribute.class)) { typeAtt = source.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncAtt = null; if (source.hasAttribute(PositionIncrementAttribute.class)) { posIncAtt = source.getAttribute(PositionIncrementAttribute.class); } nextToken = new PackedTokenAttributeImpl(); nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length()); nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); if (typeAtt != null) { nextToken.setType(typeAtt.type()); } if (posIncAtt != null) { nextToken.setPositionIncrement(posIncAtt.getPositionIncrement()); } list.add(nextToken); if (nextToken.getPositionIncrement() != 0) positionCount += nextToken.getPositionIncrement(); else severalTokensAtSamePosition = true; } } finally { try { if (source != null) { source.close(); } } catch (IOException e) { // ignore } } // add any alpha numeric wildcards that have been missed // Fixes most stop word and wild card issues for (int index = 0; index < testText.length(); index++) { char current = testText.charAt(index); if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) { StringBuilder pre = new StringBuilder(10); if (index == 0) { // "*" and "?" at the start boolean found = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= 0) && (0 < test.endOffset())) { found = true; break; } } if (!found && (list.size() == 0)) { // Add new token followed by * not given by the // tokeniser PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append("", 0, 0); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE); Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } else if (index > 0) { // Add * and ? 
back into any tokens from which it has been // removed boolean tokenFound = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= index) && (index < test.endOffset())) { if (requiresMLTokenDuplication) { String termText = test.toString(); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); if (index >= test.startOffset() + token.length()) { test.setEmpty(); test.append(language + token + current); } } else { if (index >= test.startOffset() + test.length()) { test.setEmpty(); test.append(test.toString() + current); } } tokenFound = true; break; } } if (!tokenFound) { for (int i = index - 1; i >= 0; i--) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { pre.insert(0, c); } } else { break; } } if (pre.length() > 0) { // Add new token followed by * not given by the // tokeniser PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(pre.toString()); newToken.setOffset(index - pre.length(), index); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE); Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } StringBuilder post = new StringBuilder(10); if (index > 0) { for (int i = index + 1; i < testText.length(); i++) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { post.append(c); } } else { break; } } if (post.length() > 0) { // Add new token followed by * not given by the // tokeniser PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(post.toString()); newToken.setOffset(index + 1, index + 1 + post.length()); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE); Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } } // Put in real position increments as we treat them correctly int curentIncrement = -1; for (PackedTokenAttributeImpl c : list) { if (curentIncrement == -1) { curentIncrement = c.getPositionIncrement(); } else if (c.getPositionIncrement() > 0) { curentIncrement = c.getPositionIncrement(); } else { c.setPositionIncrement(curentIncrement); } } // Fix up position increments for in phrase isolated wildcards boolean lastWasWild = false; for (int i = 0; i < list.size() - 1; i++) { for (int j = 
list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) { if (wildcardPoistions.contains(j)) { if (!lastWasWild) { list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1); } lastWasWild = true; } else { lastWasWild = false; } } } Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() { public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) { int dif = o1.startOffset() - o2.startOffset(); return dif; } }); // Combined * and ? based strings - should redo the tokeniser // Build tokens by position LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(); LinkedList<PackedTokenAttributeImpl> currentList = null; int lastStart = 0; for (PackedTokenAttributeImpl c : list) { if (c.startOffset() == lastStart) { if (currentList == null) { currentList = new LinkedList<PackedTokenAttributeImpl>(); tokensByPosition.add(currentList); } currentList.add(c); } else { currentList = new LinkedList<PackedTokenAttributeImpl>(); tokensByPosition.add(currentList); currentList.add(c); } lastStart = c.startOffset(); } // Build all the token sequences and see which ones get strung together OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>(); for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) { OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>(); OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>(); FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) { PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(t); replace.setOffset(t.startOffset(), t.endOffset()); replace.setType(t.type()); replace.setPositionIncrement(t.getPositionIncrement()); boolean tokenFoundSequence = false; for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) { LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>(); newEntry.addAll(tokenSequence); if ((newEntry.getLast().endOffset() == replace.endOffset()) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { if ((newEntry.getLast().startOffset() == replace.startOffset()) && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); newEntry.add(replace); tokenFoundSequence = true; } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) { if (newEntry.toString().endsWith(replace.toString())) { // already in the gram positionalSynonymSequencesSet.add(tokenSequence); tokenFoundSequence = true; } else { // need to replace the synonym in the current // gram tokenFoundSequence = true; StringBuffer old = new StringBuffer(newEntry.getLast().toString()); old.replace(replace.startOffset() - newEntry.getLast().startOffset(), replace.endOffset() - newEntry.getLast().startOffset(), replace.toString()); PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(old.toString()); newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset()); newEntry.removeLast(); newEntry.add(newToken); } } } else if ((newEntry.getLast().startOffset() < replace.startOffset()) && (newEntry.getLast().endOffset() < replace.endOffset())) { if 
(newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); } newEntry.add(replace); tokenFoundSequence = true; } newAllTokenSequencesSet.add(newEntry); } if (false == tokenFoundSequence) { for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) { LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>(); newEntry.addAll(tokenSequence); if ((newEntry.getLast().endOffset() == replace.endOffset()) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { if ((newEntry.getLast().startOffset() == replace.startOffset()) && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); newEntry.add(replace); tokenFoundSequence = true; } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) { if (newEntry.toString().endsWith(replace.toString())) { // already in the gram positionalSynonymSequencesSet.add(tokenSequence); tokenFoundSequence = true; } else { // need to replace the synonym in the // current gram tokenFoundSequence = true; StringBuffer old = new StringBuffer(newEntry.getLast().toString()); old.replace(replace.startOffset() - newEntry.getLast().startOffset(), replace.endOffset() - newEntry.getLast().startOffset(), replace.toString()); PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(old.toString()); newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset()); newEntry.removeLast(); newEntry.add(newToken); positionalSynonymSequencesSet.add(newEntry); } } } else if ((newEntry.getLast().startOffset() < replace.startOffset()) && (newEntry.getLast().endOffset() < replace.endOffset())) { if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); newEntry.add(replace); tokenFoundSequence = true; } } } } if (false == tokenFoundSequence) { LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>(); newEntry.add(replace); newAllTokenSequencesSet.add(newEntry); } // Limit the max number of permutations we consider if (newAllTokenSequencesSet.size() > 64) { break FOR_FIRST_TOKEN_AT_POSITION_ONLY; } } allTokenSequencesSet = newAllTokenSequencesSet; allTokenSequencesSet.addAll(positionalSynonymSequencesSet); } LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>( allTokenSequencesSet); // build the unique LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) { LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>(); fixedTokenSequences.add(fixedTokenSequence); PackedTokenAttributeImpl replace = null; for (PackedTokenAttributeImpl c : tokenSequence) { if (replace == null) { StringBuilder prefix = new StringBuilder(); for (int i = c.startOffset() - 1; i >= 0; i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { prefix.insert(0, test); } else { break; } } String pre = prefix.toString(); if (requiresMLTokenDuplication) { String termText = c.toString(); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); 
String token = termText.substring(position + 1); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(language + pre + token); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String termText = c.toString(); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(pre + termText); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } else { StringBuilder prefix = new StringBuilder(); StringBuilder postfix = new StringBuilder(); StringBuilder builder = prefix; for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { builder.insert(0, test); } else { builder = postfix; postfix.setLength(0); } } String pre = prefix.toString(); String post = postfix.toString(); // Does it bridge? if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) { String termText = c.toString(); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); @SuppressWarnings("unused") String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = replace.toString(); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(replaceTermText + pre + token); replace.setOffset(replace.startOffset(), c.endOffset()); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } else { int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = replace.toString(); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(replaceTermText + pre + termText); replace.setOffset(replace.startOffset(), c.endOffset()); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } } else { String termText = c.toString(); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); String replaceTermText = replace.toString(); PackedTokenAttributeImpl last = new PackedTokenAttributeImpl(); last.setEmpty().append(replaceTermText + post); last.setOffset(replace.startOffset(), replace.endOffset() + post.length()); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(language + pre + token); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String replaceTermText = replace.toString(); PackedTokenAttributeImpl last = new PackedTokenAttributeImpl(); last.setEmpty().append(replaceTermText + post); last.setOffset(replace.startOffset(), replace.endOffset() + post.length()); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(pre + termText); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } } } // finish last if (replace != null) { StringBuilder postfix = new 
StringBuilder(); if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) { for (int i = replace.endOffset(); i < testText.length(); i++) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { postfix.append(test); } else { break; } } } String post = postfix.toString(); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = replace.toString(); PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl(); terminal.setEmpty().append(replaceTermText + post); terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length()); terminal.setType(replace.type()); terminal.setPositionIncrement(oldPositionIncrement); fixedTokenSequence.add(terminal); } } // rebuild fixed list ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) { for (PackedTokenAttributeImpl token : tokenSequence) { fixed.add(token); } } // reorder by start position and increment Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() { public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) { int dif = o1.startOffset() - o2.startOffset(); if (dif != 0) { return dif; } else { return o1.getPositionIncrement() - o2.getPositionIncrement(); } } }); // make sure we remove any tokens we have duplicated @SuppressWarnings("rawtypes") OrderedHashSet unique = new OrderedHashSet(); unique.addAll(fixed); fixed = new ArrayList<PackedTokenAttributeImpl>(unique); list = fixed; // add any missing locales back to the tokens if (localePrefix.length() > 0) { for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl currentToken = list.get(j); String termText = currentToken.toString(); currentToken.setEmpty(); currentToken.append(localePrefix + termText); } } SchemaField sf = schema.getField(field); boolean isShingled = false; @SuppressWarnings("resource") TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain) ? ((TokenizerChain) sf.getType().getQueryAnalyzer()) : null; if (tokenizerChain != null) { for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) { if (factory instanceof ShingleFilterFactory) { isShingled = true; break; } } } @SuppressWarnings("resource") AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType() .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper) ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer()) : null; if (analyzerWrapper != null) { // assume if there are no term positions it is shingled .... 
isShingled = true; } boolean forceConjuncion = rerankPhase == RerankPhase.QUERY_PHASE; if (list.size() == 0) { return null; } else if (list.size() == 1) { nextToken = list.get(0); String termText = nextToken.toString(); if (!isNumeric && (termText.contains("*") || termText.contains("?"))) { return newWildcardQuery(new Term(field, termText)); } else { return newTermQuery(new Term(field, termText)); } } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: Builder q = newBooleanQuery(); for (int i = 0; i < list.size(); i++) { Query currentQuery; nextToken = list.get(i); String termText = nextToken.toString(); if (termText.contains("*") || termText.contains("?")) { currentQuery = newWildcardQuery(new Term(field, termText)); } else { currentQuery = newTermQuery(new Term(field, termText)); } q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q.build(); } else if (forceConjuncion) { BooleanQuery.Builder or = new BooleanQuery.Builder(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) { BooleanQuery.Builder and = new BooleanQuery.Builder(); for (int i = 0; i < tokenSequence.size(); i++) { nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i); String termText = nextToken.toString(); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); and.add(wildQuery, Occur.MUST); } else { TermQuery termQuery = new TermQuery(term); and.add(termQuery, Occur.MUST); } } if (and.build().clauses().size() > 0) { or.add(and.build(), Occur.SHOULD); } } return or.build(); } // shingle else if (sf.omitPositions() && isShingled) { ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list); Query currentQuery; BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder(); for (PackedTokenAttributeImpl shingleToken : nonContained) { String termText = shingleToken.toString(); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { currentQuery = new org.apache.lucene.search.WildcardQuery(term); } else { currentQuery = new TermQuery(term); } weakPhrase.add(currentQuery, Occur.MUST); } return weakPhrase.build(); } // Word delimiter factory and other odd things generate complex // token patterns // Smart skip token sequences with small tokens that generate // toomany wildcards // Fall back to the larger pattern // e.g Site1* will not do (S ite 1*) or (Site 1*) if 1* matches // too much (S ite1*) and (Site1*) will still be OK // If we skip all (for just 1* in the input) this is still an // issue. 
else { return generateSpanOrQuery(field, fixedTokenSequences); } } else { if (forceConjuncion) { BooleanQuery.Builder or = new BooleanQuery.Builder(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) { BooleanQuery.Builder and = new BooleanQuery.Builder(); for (int i = 0; i < tokenSequence.size(); i++) { nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i); String termText = nextToken.toString(); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); and.add(wildQuery, Occur.MUST); } else { TermQuery termQuery = new TermQuery(term); and.add(termQuery, Occur.MUST); } } if (and.build().clauses().size() > 0) { or.add(and.build(), Occur.SHOULD); } } return or.build(); } else { SpanQuery spanQuery = null; ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); int gap = 0; for (int i = 0; i < list.size(); i++) { nextToken = list.get(i); String termText = nextToken.toString(); Term term = new Term(field, termText); if (getEnablePositionIncrements()) { SpanQuery nextSpanQuery; if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>( wildQuery); wrapper.setRewriteMethod( new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit)); nextSpanQuery = wrapper; } else { nextSpanQuery = new SpanTermQuery(term); } if (gap == 0) { atSamePositionSpanOrQueryParts.add(nextSpanQuery); } else { if (atSamePositionSpanOrQueryParts.size() == 0) { if (spanQuery == null) { spanQuery = nextSpanQuery; } else { spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery }, (gap - 1) + internalSlop, internalSlop < 2); } atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); } else if (atSamePositionSpanOrQueryParts.size() == 1) { if (spanQuery == null) { spanQuery = atSamePositionSpanOrQueryParts.get(0); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) }, (gap - 1) + internalSlop, internalSlop < 2); } atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); atSamePositionSpanOrQueryParts.add(nextSpanQuery); } else { if (spanQuery == null) { spanQuery = new SpanOrQuery( atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, new SpanOrQuery(atSamePositionSpanOrQueryParts .toArray(new SpanQuery[] {})) }, (gap - 1) + internalSlop, internalSlop < 2); } atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); atSamePositionSpanOrQueryParts.add(nextSpanQuery); } } gap = nextToken.getPositionIncrement(); } else { SpanQuery nextSpanQuery; if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>( wildQuery); wrapper.setRewriteMethod( new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit)); nextSpanQuery = wrapper; } else { nextSpanQuery = new SpanTermQuery(term); } if (spanQuery == null) { spanQuery = new 
SpanOrQuery(nextSpanQuery); } else { spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery); } } } if (atSamePositionSpanOrQueryParts.size() == 0) { return spanQuery; } else if (atSamePositionSpanOrQueryParts.size() == 1) { if (spanQuery == null) { spanQuery = atSamePositionSpanOrQueryParts.get(0); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) }, (gap - 1) + internalSlop, internalSlop < 2); } return spanQuery; } else { if (spanQuery == null) { spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, new SpanOrQuery( atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) }, (gap - 1) + internalSlop, internalSlop < 2); } return spanQuery; } } } } }
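The three Solr4QueryParser examples in this list share the same token-buffering loop with close() in a finally block. A hedged sketch of that shared core extracted into a helper; the name copyTokens and the extraction itself are ours, not Alfresco's, and the imports are assumed from the surrounding class.

private List<PackedTokenAttributeImpl> copyTokens(Analyzer analyzer, String field, String text)
        throws IOException {
    List<PackedTokenAttributeImpl> tokens = new ArrayList<PackedTokenAttributeImpl>();
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, new StringReader(text));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (source.hasAttribute(TypeAttribute.class)) {
                token.setType(source.getAttribute(TypeAttribute.class).type());
            }
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                token.setPositionIncrement(
                        source.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
            }
            tokens.add(token);
        }
        source.end();
    } finally {
        if (source != null) {
            try {
                source.close();   // close even if reset() or incrementToken() failed
            } catch (IOException e) {
                // ignore, matching the examples' behaviour
            }
        }
    }
    return tokens;
}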
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
/**
 * @param first the text to analyse
 * @param field the field instance to search
 * @return a SpanOrQuery over the analysed tokens
 * @throws IOException
 */
private SpanQuery buildSpanOrQuery(String first, FieldInstance field) throws IOException {
    ArrayList<SpanQuery> spanOrQueryParts = new ArrayList<SpanQuery>();
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(first));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            SpanQuery termQuery = new SpanTermQuery(new Term(field.getField(), nextToken.toString()));
            spanOrQueryParts.add(termQuery);
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    if (spanOrQueryParts.size() == 1) {
        return spanOrQueryParts.get(0);
    } else {
        return new SpanOrQuery(spanOrQueryParts.toArray(new SpanQuery[] {}));
    }
}
From source file:org.alfresco.solr.SolrInformationServer.java
License:Open Source License
private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker.
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME, response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy the content.
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // Release the response only when the content has been read.
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();
            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}
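The min-hash block above calls ts.end() and ts.close() outside any finally block, so an exception from incrementToken() would leak the stream. A hedged rewrite of just that block with try-with-resources; all names are the example's own, only the resource handling changes.

Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
try (TokenStream ts = analyzer.tokenStream("min_hash", textContent)) {
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        StringBuilder tokenBuff = new StringBuilder();
        char[] buff = termAttribute.buffer();
        for (int i = 0; i < termAttribute.length(); i++) {
            tokenBuff.append(Integer.toHexString(buff[i]));
        }
        doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
    }
    ts.end();
}   // close() now runs even if incrementToken() throws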
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
/**
 * @exception org.apache.lucene.queryparser.classic.ParseException thrown in an overridden method to disallow
 */
protected TextPattern newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted)
        throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count.
    TokenStream source;
    try {
        source = analyzer.tokenStream(field, new StringReader(queryText));
        source.reset();
    } catch (IOException e) {
        ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text");
        p.initCause(e);
        throw p;
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    buffer.reset();
    if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
    }
    if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    boolean hasMoreTokens = false;
    if (termAtt != null) {
        try {
            hasMoreTokens = buffer.incrementToken();
            while (hasMoreTokens) {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;
                } else {
                    severalTokensAtSamePosition = true;
                }
                hasMoreTokens = buffer.incrementToken();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    try {
        // Rewind the buffer stream.
        buffer.reset();
        // Close the original stream - all tokens are buffered.
        source.close();
    } catch (IOException e) {
        ParseException p = new ParseException("Cannot close TokenStream analyzing query text");
        p.initCause(e);
        throw p;
    }

    BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();

    if (numTokens == 0) {
        return null;
    } else if (numTokens == 1) {
        try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
        } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
        }
        return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
    } else {
        if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
            if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
                // No phrase query:
                TextPatternBoolean q = newBooleanQuery(positionCount == 1); // BL: BooleanQuery -> TextPatternBoolean
                BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR
                        ? BooleanClause.Occur.MUST
                        : BooleanClause.Occur.SHOULD;
                for (int i = 0; i < numTokens; i++) {
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        termAtt.fillBytesRef();
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    TextPattern currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
                    q.add(currentQuery, occur);
                }
                return q;
            } else {
                // Phrase query:
                TPMultiPhrase mpq = newMultiPhraseQuery(); // BL: MultiPhraseQuery -> TPMultiPhrase
                mpq.setSlop(phraseSlop);
                List<Term> multiTerms = new ArrayList<Term>();
                int position = -1;
                for (int i = 0; i < numTokens; i++) {
                    int positionIncrement = 1;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        termAtt.fillBytesRef();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    if (positionIncrement > 0 && multiTerms.size() > 0) {
                        if (enablePositionIncrements) {
                            mpq.add(multiTerms.toArray(new Term[0]), position);
                        } else {
                            mpq.add(multiTerms.toArray(new Term[0]));
                        }
                        multiTerms.clear();
                    }
                    position += positionIncrement;
                    multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
                }
                if (enablePositionIncrements) {
                    mpq.add(multiTerms.toArray(new Term[0]), position);
                } else {
                    mpq.add(multiTerms.toArray(new Term[0]));
                }
                return mpq;
            }
        } else {
            TPPhrase pq = newPhraseQuery(); // BL: PhraseQuery -> TPPhrase
            pq.setSlop(phraseSlop);
            int position = -1;
            for (int i = 0; i < numTokens; i++) {
                int positionIncrement = 1;
                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    termAtt.fillBytesRef();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }
                if (enablePositionIncrements) {
                    position += positionIncrement;
                    pq.add(new Term(field, BytesRef.deepCopyOf(bytes)), position);
                } else {
                    pq.add(new Term(field, BytesRef.deepCopyOf(bytes)));
                }
            }
            return pq;
        }
    }
}
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    TokenStream source;
    if (analyzerIn == null) {
        analyzerIn = analyzer;
    }
    try {
        source = analyzerIn.tokenStream(field, new StringReader(part));
        source.reset();
    } catch (IOException e) {
        throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
    }
    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();
    try {
        if (!source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
    } catch (IOException e) {
        throw new RuntimeException("error analyzing range part: " + part, e);
    }
    try {
        // end() records the final state and must precede close().
        source.end();
        source.close();
    } catch (IOException e) {
        throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
    }
    return BytesRef.deepCopyOf(bytes);
}
From source file:org.apache.cocoon.bean.query.SimpleLuceneCriterionBean.java
License:Apache License
/**
 * Gets the <code>org.apache.lucene.search.Query</code> from the Criterion.
 * <p>
 * The analyzer specifies which <code>org.apache.lucene.analysis.Analyzer</code> to use for this search.
 * </p>
 *
 * @param analyzer The <code>org.apache.lucene.analysis.Analyzer</code> to use to extract the Terms from this Criterion
 */
public Query getQuery(Analyzer analyzer) {
    String f = this.field;
    Query query = null;
    if (ANY_FIELD.equals(this.field)) {
        f = LuceneXMLIndexer.BODY_FIELD;
    }
    // Extract Terms from the query string.
    TokenStream tokens = analyzer.tokenStream(f, new StringReader(this.term));
    Vector words = new Vector();
    Token token;
    while (true) {
        try {
            token = tokens.next();
        } catch (IOException e) {
            token = null;
        }
        if (token == null) {
            break;
        }
        words.addElement(token.termText());
    }
    try {
        tokens.close();
    } catch (IOException e) {
        // ignore
    }
    // Assemble the different matches.
    if (ANY_MATCH.equals(this.match)) {
        if (words.size() > 1) {
            query = new BooleanQuery();
            for (int i = 0; i < words.size(); i++) {
                ((BooleanQuery) query).add(new TermQuery(new Term(f, (String) words.elementAt(i))), false, false);
            }
        } else if (words.size() == 1) {
            query = new TermQuery(new Term(f, (String) words.elementAt(0)));
        }
    }
    if (ALL_MATCH.equals(this.match)) {
        if (words.size() > 1) {
            query = new BooleanQuery();
            for (int i = 0; i < words.size(); i++) {
                ((BooleanQuery) query).add(new TermQuery(new Term(f, (String) words.elementAt(i))), true, false);
            }
        } else if (words.size() == 1) {
            query = new TermQuery(new Term(f, (String) words.elementAt(0)));
        }
    }
    if (NOT_MATCH.equals(this.match)) {
        if (words.size() > 1) {
            query = new BooleanQuery();
            for (int i = 0; i < words.size(); i++) {
                ((BooleanQuery) query).add(new TermQuery(new Term(f, (String) words.elementAt(i))), true, true);
            }
        } else if (words.size() == 1) {
            query = new TermQuery(new Term(f, (String) words.elementAt(0)));
        }
    }
    if (LIKE_MATCH.equals(this.match)) {
        if (words.size() > 1) {
            query = new BooleanQuery();
            for (int i = 0; i < words.size(); i++) {
                ((BooleanQuery) query).add(new FuzzyQuery(new Term(f, (String) words.elementAt(i))), false, false);
            }
        } else if (words.size() == 1) {
            query = new FuzzyQuery(new Term(f, (String) words.elementAt(0)));
        }
    }
    if (PHRASE_MATCH.equals(this.match)) {
        if (words.size() > 1) {
            query = new PhraseQuery();
            ((PhraseQuery) query).setSlop(0);
            for (int i = 0; i < words.size(); i++) {
                ((PhraseQuery) query).add(new Term(f, (String) words.elementAt(i)));
            }
        } else if (words.size() == 1) {
            query = new TermQuery(new Term(f, (String) words.elementAt(0)));
        }
    }
    return query;
}
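The Cocoon example above targets the pre-2.9 token API (Token next() and Token.termText(), both long removed). A hedged sketch of the same term extraction against the attribute-based API that replaced it; f, this.term, and analyzer are the example's own names, and a modern Lucene version is assumed.

List<String> words = new ArrayList<String>();
try (TokenStream tokens = analyzer.tokenStream(f, new StringReader(this.term))) {
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    tokens.reset();
    while (tokens.incrementToken()) {
        words.add(termAtt.toString());   // replaces token.termText()
    }
    tokens.end();
}   // replaces the explicit tokens.close() and its try/catch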
From source file:org.apache.fuzzydb.queryParser.QueryParser.java
License:Open Source License
/**
 * @exception ParseException thrown in an overridden method to disallow
 */
protected Query getFieldQuery(String field, String queryText) throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count.
    TokenStream source;
    try {
        source = analyzer.reusableTokenStream(field, new StringReader(queryText));
        source.reset();
    } catch (IOException e) {
        source = analyzer.tokenStream(field, new StringReader(queryText));
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    TermAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    boolean success = false;
    try {
        buffer.reset();
        success = true;
    } catch (IOException e) {
        // success == false if we hit an exception
    }
    if (success) {
        if (buffer.hasAttribute(TermAttribute.class)) {
            termAtt = buffer.getAttribute(TermAttribute.class);
        }
        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
            posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
        }
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    boolean hasMoreTokens = false;
    if (termAtt != null) {
        try {
            hasMoreTokens = buffer.incrementToken();
            while (hasMoreTokens) {
                numTokens++;
                int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
                if (positionIncrement != 0) {
                    positionCount += positionIncrement;
                } else {
                    severalTokensAtSamePosition = true;
                }
                hasMoreTokens = buffer.incrementToken();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    try {
        // Rewind the buffer stream.
        buffer.reset();
        // Close the original stream - all tokens are buffered.
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (numTokens == 0) {
        return null;
    } else if (numTokens == 1) {
        String term = null;
        try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
        } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
        }
        return newTermQuery(new Term(field, term));
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // No phrase query:
                BooleanQuery q = newBooleanQuery(true);
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.term();
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    Query currentQuery = newTermQuery(new Term(field, term));
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            } else {
                // Phrase query:
                MultiPhraseQuery mpq = newMultiPhraseQuery();
                mpq.setSlop(phraseSlop);
                List<Term> multiTerms = new ArrayList<Term>();
                int position = -1;
                for (int i = 0; i < numTokens; i++) {
                    String term = null;
                    int positionIncrement = 1;
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        term = termAtt.term();
                        if (posIncrAtt != null) {
                            positionIncrement = posIncrAtt.getPositionIncrement();
                        }
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    if (positionIncrement > 0 && multiTerms.size() > 0) {
                        if (enablePositionIncrements) {
                            mpq.add(multiTerms.toArray(new Term[0]), position);
                        } else {
                            mpq.add(multiTerms.toArray(new Term[0]));
                        }
                        multiTerms.clear();
                    }
                    position += positionIncrement;
                    multiTerms.add(new Term(field, term));
                }
                if (enablePositionIncrements) {
                    mpq.add(multiTerms.toArray(new Term[0]), position);
                } else {
                    mpq.add(multiTerms.toArray(new Term[0]));
                }
                return mpq;
            }
        } else {
            PhraseQuery pq = newPhraseQuery();
            pq.setSlop(phraseSlop);
            int position = -1;
            for (int i = 0; i < numTokens; i++) {
                String term = null;
                int positionIncrement = 1;
                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    term = termAtt.term();
                    if (posIncrAtt != null) {
                        positionIncrement = posIncrAtt.getPositionIncrement();
                    }
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }
                if (enablePositionIncrements) {
                    position += positionIncrement;
                    pq.add(new Term(field, term), position);
                } else {
                    pq.add(new Term(field, term));
                }
            }
            return pq;
        }
    }
}
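The reusableTokenStream-with-fallback dance above is a Lucene 3.x idiom; from Lucene 4 onwards Analyzer.tokenStream reuses components internally and reusableTokenStream is gone. A hedged sketch of the modern acquisition, with try-with-resources replacing the manual close (our restructuring, not this project's code):

try (TokenStream source = analyzer.tokenStream(field, new StringReader(queryText))) {
    source.reset();
    // ... buffer and count tokens as above ...
    source.end();
}   // close() runs here, even on exceptions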
From source file:org.apache.jackrabbit.core.query.lucene.AbstractExcerpt.java
License:Apache License
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
From source file:org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java
License:Apache License
/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // Only create a prefix query when the term is a single word / token.
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}