List of usage examples for org.apache.lucene.analysis TokenStream hasAttribute
public final boolean hasAttribute(Class<? extends Attribute> attClass)
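Before the examples, the common pattern: hasAttribute checks whether the stream's attribute set contains the given attribute class, and is the safe guard before getAttribute, which throws IllegalArgumentException when the attribute is absent. A minimal sketch of that pattern (assumptions: Lucene 5.x or later on the classpath, where StandardAnalyzer has a no-argument constructor; the field name "f" and the sample text are arbitrary):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HasAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("f", "some example text")) {
            ts.reset();
            while (ts.incrementToken()) {
                // Guard with hasAttribute: getAttribute throws
                // IllegalArgumentException if the attribute is absent.
                if (ts.hasAttribute(CharTermAttribute.class)) {
                    System.out.println(ts.getAttribute(CharTermAttribute.class));
                }
            }
            ts.end();
        }
    }
}

Most of the examples below follow this guard-then-get pattern; a few call addAttribute instead, which creates the attribute when missing rather than failing.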
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    s.reset();
    while (s.incrementToken()) {
        if (s.hasAttribute(CharTermAttribute.class)) {
            list.add(s.getAttribute(CharTermAttribute.class).toString());
        }
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}
From source file:com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java
License:Open Source License
@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr) throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
            String term = ts.getAttribute(CharTermAttribute.class).toString();
            if (term != null) {
                return super.getPrefixQuery(field, term);
            }
        }
    } catch (IOException e) {
    }
    return super.getPrefixQuery(field, termStr);
}
From source file:com.qwazr.search.analysis.AnalyzerUtils.java
License:Apache License
final static <T extends Attribute> T getAttribute(TokenStream tokenStream, Class<T> attributeClass) {
    return tokenStream.hasAttribute(attributeClass) ? tokenStream.getAttribute(attributeClass) : null;
}
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();
    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }

    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals("i=" + i, expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }
        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }

    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}
From source file:com.sindicetech.siren.solr.analysis.BaseSirenStreamTestCase.java
License:Open Source License
public void assertTokenStreamContents(final TokenStream stream, final String[] expectedImages)
        throws Exception {
    assertTrue("has TermAttribute", stream.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    for (int i = 0; i < expectedImages.length; i++) {
        stream.clearAttributes();
        assertTrue("token " + i + " does not exists", stream.incrementToken());
        assertEquals(expectedImages[i], termAtt.toString());
    }
    assertFalse("end of stream", stream.incrementToken());
    stream.end();
    stream.close();
}
From source file:filters.indexing.TopicModelInputFilter.java
License:Open Source License
/**
 * Constructor for class IndexableFilter
 *
 * @param input
 */
public TopicModelInputFilter(TokenStream input, TokenListsCollector tokenLists, ReviewId reviewId) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.hasAttribute(PayloadAttribute.class) ? input.getAttribute(PayloadAttribute.class)
            : null;

    // Setting attributes for this token stream
    output_term = this.getAttribute(TermAttribute.class);
    output_type = this.getAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = input.hasAttribute(PayloadAttribute.class) ? this.getAttribute(PayloadAttribute.class)
            : null;

    this.reviewId = reviewId;
    currentDocNumber = new Counter();
    tokenListsCollector = tokenLists;
}
From source file:it.unibz.instasearch.indexing.tokenizers.TermSplitTokenizer.java
License:Open Source License
public TermSplitTokenizer(TokenStream in) {
    super(in);
    assert (in.hasAttribute(TermAttribute.class));
    assert (in.hasAttribute(OffsetAttribute.class));
    assert (in.hasAttribute(PositionIncrementAttribute.class));
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
From source file:jaligner.Sequence.java
License:Open Source License
/**
 * Constructor
 *
 * @param sequence
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());

    Vector<Token> tokenVector = new Vector<Token>();
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        // Token token = new Token();
        // Token token = (Token) stream.getAttribute(CharTermAttribute.class);
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);

        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        // PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        // FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);

        // public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength,
        //         int newStartOffset, int newEndOffset, String newType)
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());
        // token.setPayload(payload.getPayload());
        // token.setFlags(flags.getFlags());

        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }

        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }

        tokenVector.add(token);
    }
    stream.end();
    stream.close();

    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException, IOException {
    // Make sure the field exists or return a dummy query so we have no error .... ACE-3231
    SchemaField schemaField = schema.getFieldOrNull(field);
    boolean isNumeric = false;
    if (schemaField == null) {
        return new TermQuery(new Term("_dummy_", "_miss_"));
    } else {
        isNumeric = (schemaField.getType().getNumericType() != null);
        if (isNumeric) {
            // Check to see if queryText is numeric or else it will fail.
            try {
                Double.valueOf(queryText);
            } catch (NumberFormatException e) {
                return new TermQuery(new Term("_dummy_", "_miss_"));
            }
        }
    }

    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    // TODO: Untokenised columns with functions require special handling
    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }

    // If the incoming string already has a language identifier we strip it off
    // and add it back on again
    String localePrefix = "";
    String toTokenise = queryText;

    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        if (position > 0) {
            String language = queryText.substring(0, position + 1);
            Locale locale = new Locale(queryText.substring(1, position));
            String token = queryText.substring(position + 1);
            boolean found = false;
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
            if (found) {
                localePrefix = language;
                toTokenise = token;
            } else {
                // toTokenise = token;
            }
        }
    }

    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (isPropertyField(field) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);
        }
    }

    // Find the positions of any escaped * and ? and ignore them
    Set<Integer> wildcardPoistions = getWildcardPositions(testText);

    TokenStream source = null;
    ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>();
    boolean severalTokensAtSamePosition = false;
    PackedTokenAttributeImpl nextToken;
    int positionCount = 0;

    try {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }

            list.add(nextToken);

            if (nextToken.getPositionIncrement() != 0)
                positionCount += nextToken.getPositionIncrement();
            else
                severalTokensAtSamePosition = true;
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    // Add any alphanumeric wildcards that have been missed
    // Fixes most stop word and wild card issues
    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start
                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (list.size() == 0)) {
                    // Add new token followed by * not given by the tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append("", 0, 0);
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been removed
                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = test.toString();
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setEmpty();
                                test.append(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.length()) {
                                test.setEmpty();
                                test.append(test.toString() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }
                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                PackedTokenAttributeImpl test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the tokeniser
                        PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                        newToken.setEmpty().append(pre.toString());
                        newToken.setOffset(index - pre.length(), index);
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            @SuppressWarnings("resource")
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                    MLAnalysisMode.EXACT_LANGUAGE);
                            Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            PackedTokenAttributeImpl test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append(post.toString());
                    newToken.setOffset(index + 1, index + 1 + post.length());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }
        }
    }

    // Put in real position increments as we treat them correctly
    int curentIncrement = -1;
    for (PackedTokenAttributeImpl c : list) {
        if (curentIncrement == -1) {
            curentIncrement = c.getPositionIncrement();
        } else if (c.getPositionIncrement() > 0) {
            curentIncrement = c.getPositionIncrement();
        } else {
            c.setPositionIncrement(curentIncrement);
        }
    }

    // Fix up position increments for in phrase isolated wildcards
    boolean lastWasWild = false;
    for (int i = 0; i < list.size() - 1; i++) {
        for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) {
            if (wildcardPoistions.contains(j)) {
                if (!lastWasWild) {
                    list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1);
                }
                lastWasWild = true;
            } else {
                lastWasWild = false;
            }
        }
    }

    Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() {
        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            return dif;
        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position
    LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    LinkedList<PackedTokenAttributeImpl> currentList = null;
    int lastStart = 0;
    for (PackedTokenAttributeImpl c : list) {
        if (c.startOffset() == lastStart) {
            if (currentList == null) {
                currentList = new LinkedList<PackedTokenAttributeImpl>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<PackedTokenAttributeImpl>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
        lastStart = c.startOffset();
    }

    // Build all the token sequences and see which ones get strung together
    OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) {
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();

        FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) {
            PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl();
            replace.setEmpty().append(t);
            replace.setOffset(t.startOffset(), t.endOffset());
            replace.setType(t.type());
            replace.setPositionIncrement(t.getPositionIncrement());

            boolean tokenFoundSequence = false;
            for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.addAll(tokenSequence);
                if ((newEntry.getLast().endOffset() == replace.endOffset())
                        && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                    if ((newEntry.getLast().startOffset() == replace.startOffset())
                            && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                        newEntry.add(replace);
                        tokenFoundSequence = true;
                    } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                        if (newEntry.toString().endsWith(replace.toString())) {
                            // already in the gram
                            positionalSynonymSequencesSet.add(tokenSequence);
                            tokenFoundSequence = true;
                        } else {
                            // need to replace the synonym in the current gram
                            tokenFoundSequence = true;
                            StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                            old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                    replace.endOffset() - newEntry.getLast().startOffset(),
                                    replace.toString());
                            PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                            newToken.setEmpty().append(old.toString());
                            newToken.setOffset(newEntry.getLast().startOffset(),
                                    newEntry.getLast().endOffset());
                            newEntry.removeLast();
                            newEntry.add(newToken);
                        }
                    }
                } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                        && (newEntry.getLast().endOffset() < replace.endOffset())) {
                    if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                    }
                    newEntry.add(replace);
                    tokenFoundSequence = true;
                }
                newAllTokenSequencesSet.add(newEntry);
            }
            if (false == tokenFoundSequence) {
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) {
                    LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                    newEntry.addAll(tokenSequence);
                    if ((newEntry.getLast().endOffset() == replace.endOffset())
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        if ((newEntry.getLast().startOffset() == replace.startOffset())
                                && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                            if (newEntry.toString().endsWith(replace.toString())) {
                                // already in the gram
                                positionalSynonymSequencesSet.add(tokenSequence);
                                tokenFoundSequence = true;
                            } else {
                                // need to replace the synonym in the current gram
                                tokenFoundSequence = true;
                                StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                                old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                        replace.endOffset() - newEntry.getLast().startOffset(),
                                        replace.toString());
                                PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                                newToken.setEmpty().append(old.toString());
                                newToken.setOffset(newEntry.getLast().startOffset(),
                                        newEntry.getLast().endOffset());
                                newEntry.removeLast();
                                newEntry.add(newToken);
                                positionalSynonymSequencesSet.add(newEntry);
                            }
                        }
                    } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                            && (newEntry.getLast().endOffset() < replace.endOffset())) {
                        if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                                && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        }
                    }
                }
            }
            if (false == tokenFoundSequence) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.add(replace);
                newAllTokenSequencesSet.add(newEntry);
            }
            // Limit the max number of permutations we consider
            if (newAllTokenSequencesSet.size() > 64) {
                break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
            }
        }
        allTokenSequencesSet = newAllTokenSequencesSet;
        allTokenSequencesSet.addAll(positionalSynonymSequencesSet);
    }

    LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(
            allTokenSequencesSet);

    // build the unique
    LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) {
        LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>();
        fixedTokenSequences.add(fixedTokenSequence);
        PackedTokenAttributeImpl replace = null;
        for (PackedTokenAttributeImpl c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = c.toString();
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(language + pre + token);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = c.toString();
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(pre + termText);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + token);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + termText);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(language + pre + token);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(pre + termText);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = replace.toString();
            PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl();
            terminal.setEmpty().append(replaceTermText + post);
            terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length());
            terminal.setType(replace.type());
            terminal.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(terminal);
        }
    }

    // rebuild fixed list
    ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
        for (PackedTokenAttributeImpl token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment
    Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() {
        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated
    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<PackedTokenAttributeImpl>(unique);

    list = fixed;

    // add any missing locales back to the tokens
    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            PackedTokenAttributeImpl currentToken = list.get(j);
            String termText = currentToken.toString();
            currentToken.setEmpty();
            currentToken.append(localePrefix + termText);
        }
    }

    SchemaField sf = schema.getField(field);

    boolean isShingled = false;
    @SuppressWarnings("resource")
    TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain)
            ? ((TokenizerChain) sf.getType().getQueryAnalyzer())
            : null;
    if (tokenizerChain != null) {
        for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) {
            if (factory instanceof ShingleFilterFactory) {
                isShingled = true;
                break;
            }
        }
    }
    @SuppressWarnings("resource")
    AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType()
            .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper)
                    ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer())
                    : null;
    if (analyzerWrapper != null) {
        // assume if there are no term positions it is shingled ....
        isShingled = true;
    }

    boolean forceConjuncion = rerankPhase == RerankPhase.QUERY_PHASE;

    if (list.size() == 0) {
        return null;
    } else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = nextToken.toString();
        if (!isNumeric && (termText.contains("*") || termText.contains("?"))) {
            return newWildcardQuery(new Term(field, termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                Builder q = newBooleanQuery();
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field, termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q.build();
            } else if (forceConjuncion) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();
                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            }
            // shingle
            else if (sf.omitPositions() && isShingled) {
                ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list);
                Query currentQuery;
                BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder();
                for (PackedTokenAttributeImpl shingleToken : nonContained) {
                    String termText = shingleToken.toString();
                    Term term = new Term(field, termText);
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        currentQuery = new org.apache.lucene.search.WildcardQuery(term);
                    } else {
                        currentQuery = new TermQuery(term);
                    }
                    weakPhrase.add(currentQuery, Occur.MUST);
                }
                return weakPhrase.build();
            }
            // Word delimiter factory and other odd things generate complex token patterns
            // Smart skip token sequences with small tokens that generate too many wildcards
            // Fall back to the larger pattern
            // e.g. Site1* will not do (S ite 1*) or (Site 1*) if 1* matches too much
            // (S ite1*) and (Site1*) will still be OK
            // If we skip all (for just 1* in the input) this is still an issue.
            else {
                return generateSpanOrQuery(field, fixedTokenSequences);
            }
        } else {
            if (forceConjuncion) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();
                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else {
                SpanQuery spanQuery = null;
                ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                int gap = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    Term term = new Term(field, termText);
                    if (getEnablePositionIncrements()) {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (gap == 0) {
                            atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                        } else {
                            if (atSamePositionSpanOrQueryParts.size() == 0) {
                                if (spanQuery == null) {
                                    spanQuery = nextSpanQuery;
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                            } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                                if (spanQuery == null) {
                                    spanQuery = atSamePositionSpanOrQueryParts.get(0);
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            } else {
                                if (spanQuery == null) {
                                    spanQuery = new SpanOrQuery(
                                            atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery,
                                            new SpanOrQuery(atSamePositionSpanOrQueryParts
                                                    .toArray(new SpanQuery[] {})) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            }
                        }
                        gap = nextToken.getPositionIncrement();
                    } else {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (spanQuery == null) {
                            spanQuery = new SpanOrQuery(nextSpanQuery);
                        } else {
                            spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery);
                        }
                    }
                }
                if (atSamePositionSpanOrQueryParts.size() == 0) {
                    return spanQuery;
                } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                    if (spanQuery == null) {
                        spanQuery = atSamePositionSpanOrQueryParts.get(0);
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                } else {
                    if (spanQuery == null) {
                        spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery, new SpanOrQuery(
                                        atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                }
            }
        }
    }
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}