List of usage examples for org.apache.lucene.analysis.tokenattributes PackedTokenAttributeImpl endOffset
@Override public final int endOffset()
From source file:org.alfresco.repo.search.impl.lucene.analysis.MLTokenDuplicator.java
License:Open Source License
/**
 * Builds one copy of the supplied token for every entry in {@code prefixes}.
 *
 * <p>Each copy has the prefix prepended to the token's term text and carries
 * the same offsets and type as the supplied token. The first copy keeps the
 * supplied token's position increment; every later copy gets an increment
 * of 0.
 *
 * @param token the token to duplicate; may be {@code null}
 * @return an iterator over the prefixed copies, or {@code null} when
 *         {@code token} is {@code null}
 */
public Iterator<PackedTokenAttributeImpl> buildIterator(PackedTokenAttributeImpl token) {
    if (token == null) {
        return null;
    }

    ArrayList<PackedTokenAttributeImpl> duplicates =
            new ArrayList<PackedTokenAttributeImpl>(prefixes.size());
    for (String prefix : prefixes) {
        PackedTokenAttributeImpl duplicate = new PackedTokenAttributeImpl();
        duplicate.setEmpty().append(prefix + termText(token));
        duplicate.setOffset(token.startOffset(), token.endOffset());
        duplicate.setType(token.type());
        // Only the first copy advances the position; the others use increment 0.
        duplicate.setPositionIncrement(duplicates.isEmpty() ? token.getPositionIncrement() : 0);
        duplicates.add(duplicate);
    }
    return duplicates.iterator();
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.MLTokenDuplicator.java
License:Open Source License
@Override public final boolean incrementToken() throws IOException { clearAttributes();//from w ww . j a va 2s . c om PackedTokenAttributeImpl next = next(); if (next == null) { return false; } termAtt.copyBuffer(next.buffer(), 0, next.length()); offsetAtt.setOffset(next.startOffset(), next.endOffset()); typeAtt.setType(next.type()); posIncAtt.setPositionIncrement(next.getPositionIncrement()); return true; }
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilter.java
License:Open Source License
private void buildTokenListAndIterator() throws IOException { NumberFormat nf = new DecimalFormat(INTEGER_FORMAT); // Could optimise to read each path ata time - not just all paths int insertCountAt = 0; int lengthCounter = 0; PackedTokenAttributeImpl t; PackedTokenAttributeImpl pathSplitToken = null; PackedTokenAttributeImpl nameToken = null; PackedTokenAttributeImpl countToken = null; PackedTokenAttributeImpl namespaceToken = null; while ((t = nextToken()) != null) { String text = termText(t); if (text.length() == 0) { continue; // Skip if we find // or /; or ;; etc }/*from w ww.j a v a2s . com*/ if (text.charAt(text.length() - 1) == pathSeparator) { text = text.substring(0, text.length() - 1); pathSplitToken = new PackedTokenAttributeImpl(); pathSplitToken.setEmpty().append(separatorTokenText); pathSplitToken.setOffset(t.startOffset(), t.endOffset()); pathSplitToken.setType(TOKEN_TYPE_PATH_SEP); pathSplitToken.setPositionIncrement(1); } int split = -1; boolean isPrefix = false; if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter)) { split = text.indexOf(nsEndDelimiter); } if (split == -1) { split = text.indexOf(nsPrefixDelimiter); isPrefix = true; } if (split == -1) { namespaceToken = new PackedTokenAttributeImpl(); namespaceToken.setEmpty().append(noNsTokenText); namespaceToken.setOffset(t.startOffset(), t.startOffset()); namespaceToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAMESPACE); nameToken = new PackedTokenAttributeImpl(); nameToken.setEmpty().append(text); nameToken.setOffset(t.startOffset(), t.endOffset()); nameToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAME); } else { if (isPrefix) { namespaceToken = new PackedTokenAttributeImpl(); namespaceToken.setEmpty().append(text.substring(0, split)); namespaceToken.setOffset(t.startOffset(), t.startOffset() + split); namespaceToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX); nameToken = new PackedTokenAttributeImpl(); nameToken.setEmpty().append(text.substring(split + 1)); 
nameToken.setOffset(t.startOffset() + split + 1, t.endOffset()); nameToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAME); } else { namespaceToken = new PackedTokenAttributeImpl(); namespaceToken.setEmpty() .append(text.substring(nsStartDelimiterLength, (split + nsEndDelimiterLength - 1))); namespaceToken.setOffset(t.startOffset(), t.startOffset() + split); namespaceToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAMESPACE); nameToken = new PackedTokenAttributeImpl(); nameToken.setEmpty().append(text.substring(split + nsEndDelimiterLength)); nameToken.setOffset(t.startOffset() + split + nsEndDelimiterLength, t.endOffset()); nameToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAME); } } namespaceToken.setPositionIncrement(1); nameToken.setPositionIncrement(1); if (includeNamespace) { if (termText(namespaceToken).equals("")) { namespaceToken = new PackedTokenAttributeImpl(); namespaceToken.setEmpty().append(noNsTokenText); namespaceToken.setOffset(t.startOffset(), t.startOffset()); namespaceToken.setType(TOKEN_TYPE_PATH_ELEMENT_NAMESPACE); namespaceToken.setPositionIncrement(1); } tokens.add(namespaceToken); } tokens.add(nameToken); lengthCounter++; if (pathSplitToken != null) { String countString = nf.format(lengthCounter); countToken = new PackedTokenAttributeImpl(); countToken.setEmpty().append(countString); countToken.setOffset(t.startOffset(), t.endOffset()); countToken.setType(TOKEN_TYPE_PATH_SEP); countToken.setPositionIncrement(1); tokens.add(insertCountAt, countToken); tokens.add(pathSplitToken); lengthCounter = 0; insertCountAt = tokens.size(); pathSplitToken = null; } } String countString = nf.format(lengthCounter); countToken = new PackedTokenAttributeImpl(); countToken.setEmpty().append(countString); countToken.setOffset(0, 0); countToken.setType(TOKEN_TYPE_PATH_SEP); countToken.setPositionIncrement(1); tokens.add(insertCountAt, countToken); if ((tokens.size() == 0) || !(termText(tokens.get(tokens.size() - 1)).equals(TOKEN_TYPE_PATH_SEP))) { pathSplitToken = new 
PackedTokenAttributeImpl(); pathSplitToken.setEmpty().append(separatorTokenText); pathSplitToken.setOffset(0, 0); pathSplitToken.setType(TOKEN_TYPE_PATH_SEP); pathSplitToken.setPositionIncrement(1); tokens.add(pathSplitToken); } it = tokens.iterator(); }
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilter.java
License:Open Source License
/**
 * Clears the attributes, then copies the token returned by {@code next()}
 * into the term, offset, type and position-increment attributes. Both
 * offsets are passed through {@code correctOffset} before being stored.
 *
 * @return {@code true} if a token was copied, {@code false} when
 *         {@code next()} returned {@code null}
 * @throws IOException declared by the overridden method
 */
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();

    final PackedTokenAttributeImpl pending = next();
    if (pending != null) {
        termAtt.copyBuffer(pending.buffer(), 0, pending.length());
        final int correctedStart = correctOffset(pending.startOffset());
        final int correctedEnd = correctOffset(pending.endOffset());
        offsetAtt.setOffset(correctedStart, correctedEnd);
        typeAtt.setType(pending.type());
        posIncAtt.setPositionIncrement(pending.getPositionIncrement());
        return true;
    }
    return false;
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
@SuppressWarnings("unchecked") protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode, LuceneFunction luceneFunction) throws ParseException, IOException { // make sure the field exists or return a dummy query so we have no // error ....ACE-3231 SchemaField schemaField = schema.getFieldOrNull(field); boolean isNumeric = false; if (schemaField == null) { return new TermQuery(new Term("_dummy_", "_miss_")); } else {//from w w w.ja va2 s . c o m isNumeric = (schemaField.getType().getNumericType() != null); if (isNumeric) { //Check to see if queryText is numeric or else it will fail. try { Double.valueOf(queryText); } catch (NumberFormatException e) { return new TermQuery(new Term("_dummy_", "_miss_")); } } } // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or noth // TODO: Untokenised columns with functions require special handling if (luceneFunction != LuceneFunction.FIELD) { throw new UnsupportedOperationException( "Field queries are not supported on lucene functions (UPPER, LOWER, etc)"); } // if the incoming string already has a language identifier we strip it // iff and addit back on again String localePrefix = ""; String toTokenise = queryText; if (queryText.startsWith("{")) { int position = queryText.indexOf("}"); if (position > 0) { String language = queryText.substring(0, position + 1); Locale locale = new Locale(queryText.substring(1, position)); String token = queryText.substring(position + 1); boolean found = false; for (Locale current : Locale.getAvailableLocales()) { if (current.toString().equalsIgnoreCase(locale.toString())) { found = true; break; } } if (found) { localePrefix = language; toTokenise = token; } else { // toTokenise = token; } } } String testText = toTokenise; boolean requiresMLTokenDuplication = false; String localeString = null; if (isPropertyField(field) && (localePrefix.length() == 0)) { if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) { int 
position = queryText.indexOf("\u0000", 1); testText = queryText.substring(position + 1); requiresMLTokenDuplication = true; localeString = queryText.substring(1, position); } } // find the positions of any escaped * and ? and ignore them Set<Integer> wildcardPoistions = getWildcardPositions(testText); TokenStream source = null; ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>(); boolean severalTokensAtSamePosition = false; PackedTokenAttributeImpl nextToken; int positionCount = 0; try { source = getAnalyzer().tokenStream(field, new StringReader(toTokenise)); source.reset(); while (source.incrementToken()) { CharTermAttribute cta = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class); TypeAttribute typeAtt = null; if (source.hasAttribute(TypeAttribute.class)) { typeAtt = source.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncAtt = null; if (source.hasAttribute(PositionIncrementAttribute.class)) { posIncAtt = source.getAttribute(PositionIncrementAttribute.class); } nextToken = new PackedTokenAttributeImpl(); nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length()); nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); if (typeAtt != null) { nextToken.setType(typeAtt.type()); } if (posIncAtt != null) { nextToken.setPositionIncrement(posIncAtt.getPositionIncrement()); } list.add(nextToken); if (nextToken.getPositionIncrement() != 0) positionCount += nextToken.getPositionIncrement(); else severalTokensAtSamePosition = true; } } finally { try { if (source != null) { source.close(); } } catch (IOException e) { // ignore } } // add any alpha numeric wildcards that have been missed // Fixes most stop word and wild card issues for (int index = 0; index < testText.length(); index++) { char current = testText.charAt(index); if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) { StringBuilder pre = new 
StringBuilder(10); if (index == 0) { // "*" and "?" at the start boolean found = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= 0) && (0 < test.endOffset())) { found = true; break; } } if (!found && (list.size() == 0)) { // Add new token followed by * not given by the // tokeniser PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append("", 0, 0); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE); Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } else if (index > 0) { // Add * and ? 
back into any tokens from which it has been // removed boolean tokenFound = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= index) && (index < test.endOffset())) { if (requiresMLTokenDuplication) { String termText = test.toString(); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); if (index >= test.startOffset() + token.length()) { test.setEmpty(); test.append(language + token + current); } } else { if (index >= test.startOffset() + test.length()) { test.setEmpty(); test.append(test.toString() + current); } } tokenFound = true; break; } } if (!tokenFound) { for (int i = index - 1; i >= 0; i--) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { pre.insert(0, c); } } else { break; } } if (pre.length() > 0) { // Add new token followed by * not given by the // tokeniser PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(pre.toString()); newToken.setOffset(index - pre.length(), index); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE); Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } StringBuilder post = new StringBuilder(10); if (index > 0) { for (int i = index + 1; i < testText.length(); i++) { char c = testText.charAt(i); if 
(Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { post.append(c); } } else { break; } } if (post.length() > 0) { // Add new token followed by * not given by the // tokeniser PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(post.toString()); newToken.setOffset(index + 1, index + 1 + post.length()); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); @SuppressWarnings("resource") MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, MLAnalysisMode.EXACT_LANGUAGE); Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } } // Put in real position increments as we treat them correctly int curentIncrement = -1; for (PackedTokenAttributeImpl c : list) { if (curentIncrement == -1) { curentIncrement = c.getPositionIncrement(); } else if (c.getPositionIncrement() > 0) { curentIncrement = c.getPositionIncrement(); } else { c.setPositionIncrement(curentIncrement); } } // Fix up position increments for in phrase isolated wildcards boolean lastWasWild = false; for (int i = 0; i < list.size() - 1; i++) { for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) { if (wildcardPoistions.contains(j)) { if (!lastWasWild) { list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1); } lastWasWild = true; } else { lastWasWild = false; } } } Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() { public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) { int dif = o1.startOffset() - 
o2.startOffset(); return dif; } }); // Combined * and ? based strings - should redo the tokeniser // Build tokens by position LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(); LinkedList<PackedTokenAttributeImpl> currentList = null; int lastStart = 0; for (PackedTokenAttributeImpl c : list) { if (c.startOffset() == lastStart) { if (currentList == null) { currentList = new LinkedList<PackedTokenAttributeImpl>(); tokensByPosition.add(currentList); } currentList.add(c); } else { currentList = new LinkedList<PackedTokenAttributeImpl>(); tokensByPosition.add(currentList); currentList.add(c); } lastStart = c.startOffset(); } // Build all the token sequences and see which ones get strung together OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>(); for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) { OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>(); OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>(); FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) { PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(t); replace.setOffset(t.startOffset(), t.endOffset()); replace.setType(t.type()); replace.setPositionIncrement(t.getPositionIncrement()); boolean tokenFoundSequence = false; for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) { LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>(); newEntry.addAll(tokenSequence); if ((newEntry.getLast().endOffset() == replace.endOffset()) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { if ((newEntry.getLast().startOffset() == 
replace.startOffset()) && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); newEntry.add(replace); tokenFoundSequence = true; } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) { if (newEntry.toString().endsWith(replace.toString())) { // already in the gram positionalSynonymSequencesSet.add(tokenSequence); tokenFoundSequence = true; } else { // need to replace the synonym in the current // gram tokenFoundSequence = true; StringBuffer old = new StringBuffer(newEntry.getLast().toString()); old.replace(replace.startOffset() - newEntry.getLast().startOffset(), replace.endOffset() - newEntry.getLast().startOffset(), replace.toString()); PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(old.toString()); newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset()); newEntry.removeLast(); newEntry.add(newToken); } } } else if ((newEntry.getLast().startOffset() < replace.startOffset()) && (newEntry.getLast().endOffset() < replace.endOffset())) { if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); } newEntry.add(replace); tokenFoundSequence = true; } newAllTokenSequencesSet.add(newEntry); } if (false == tokenFoundSequence) { for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) { LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>(); newEntry.addAll(tokenSequence); if ((newEntry.getLast().endOffset() == replace.endOffset()) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { if ((newEntry.getLast().startOffset() == replace.startOffset()) && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); newEntry.add(replace); tokenFoundSequence = true; } else if 
(newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) { if (newEntry.toString().endsWith(replace.toString())) { // already in the gram positionalSynonymSequencesSet.add(tokenSequence); tokenFoundSequence = true; } else { // need to replace the synonym in the // current gram tokenFoundSequence = true; StringBuffer old = new StringBuffer(newEntry.getLast().toString()); old.replace(replace.startOffset() - newEntry.getLast().startOffset(), replace.endOffset() - newEntry.getLast().startOffset(), replace.toString()); PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl(); newToken.setEmpty().append(old.toString()); newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset()); newEntry.removeLast(); newEntry.add(newToken); positionalSynonymSequencesSet.add(newEntry); } } } else if ((newEntry.getLast().startOffset() < replace.startOffset()) && (newEntry.getLast().endOffset() < replace.endOffset())) { if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM) && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) { positionalSynonymSequencesSet.add(tokenSequence); newEntry.add(replace); tokenFoundSequence = true; } } } } if (false == tokenFoundSequence) { LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>(); newEntry.add(replace); newAllTokenSequencesSet.add(newEntry); } // Limit the max number of permutations we consider if (newAllTokenSequencesSet.size() > 64) { break FOR_FIRST_TOKEN_AT_POSITION_ONLY; } } allTokenSequencesSet = newAllTokenSequencesSet; allTokenSequencesSet.addAll(positionalSynonymSequencesSet); } LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>( allTokenSequencesSet); // build the unique LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) { 
LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>(); fixedTokenSequences.add(fixedTokenSequence); PackedTokenAttributeImpl replace = null; for (PackedTokenAttributeImpl c : tokenSequence) { if (replace == null) { StringBuilder prefix = new StringBuilder(); for (int i = c.startOffset() - 1; i >= 0; i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { prefix.insert(0, test); } else { break; } } String pre = prefix.toString(); if (requiresMLTokenDuplication) { String termText = c.toString(); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(language + pre + token); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String termText = c.toString(); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(pre + termText); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } else { StringBuilder prefix = new StringBuilder(); StringBuilder postfix = new StringBuilder(); StringBuilder builder = prefix; for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { builder.insert(0, test); } else { builder = postfix; postfix.setLength(0); } } String pre = prefix.toString(); String post = postfix.toString(); // Does it bridge? 
if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) { String termText = c.toString(); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); @SuppressWarnings("unused") String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = replace.toString(); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(replaceTermText + pre + token); replace.setOffset(replace.startOffset(), c.endOffset()); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } else { int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = replace.toString(); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(replaceTermText + pre + termText); replace.setOffset(replace.startOffset(), c.endOffset()); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } } else { String termText = c.toString(); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); String replaceTermText = replace.toString(); PackedTokenAttributeImpl last = new PackedTokenAttributeImpl(); last.setEmpty().append(replaceTermText + post); last.setOffset(replace.startOffset(), replace.endOffset() + post.length()); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(language + pre + token); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String replaceTermText = replace.toString(); PackedTokenAttributeImpl last = new PackedTokenAttributeImpl(); last.setEmpty().append(replaceTermText + 
post); last.setOffset(replace.startOffset(), replace.endOffset() + post.length()); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new PackedTokenAttributeImpl(); replace.setEmpty().append(pre + termText); replace.setOffset(c.startOffset() - pre.length(), c.endOffset()); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } } } // finish last if (replace != null) { StringBuilder postfix = new StringBuilder(); if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) { for (int i = replace.endOffset(); i < testText.length(); i++) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { postfix.append(test); } else { break; } } } String post = postfix.toString(); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = replace.toString(); PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl(); terminal.setEmpty().append(replaceTermText + post); terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length()); terminal.setType(replace.type()); terminal.setPositionIncrement(oldPositionIncrement); fixedTokenSequence.add(terminal); } } // rebuild fixed list ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) { for (PackedTokenAttributeImpl token : tokenSequence) { fixed.add(token); } } // reorder by start position and increment Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() { public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) { int dif = o1.startOffset() - o2.startOffset(); if (dif != 0) { return dif; } else { return o1.getPositionIncrement() - o2.getPositionIncrement(); } } }); // make sure we remove any tokens we have duplicated @SuppressWarnings("rawtypes") OrderedHashSet unique 
= new OrderedHashSet(); unique.addAll(fixed); fixed = new ArrayList<PackedTokenAttributeImpl>(unique); list = fixed; // add any missing locales back to the tokens if (localePrefix.length() > 0) { for (int j = 0; j < list.size(); j++) { PackedTokenAttributeImpl currentToken = list.get(j); String termText = currentToken.toString(); currentToken.setEmpty(); currentToken.append(localePrefix + termText); } } SchemaField sf = schema.getField(field); boolean isShingled = false; @SuppressWarnings("resource") TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain) ? ((TokenizerChain) sf.getType().getQueryAnalyzer()) : null; if (tokenizerChain != null) { for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) { if (factory instanceof ShingleFilterFactory) { isShingled = true; break; } } } @SuppressWarnings("resource") AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType() .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper) ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer()) : null; if (analyzerWrapper != null) { // assume if there are no term positions it is shingled .... 
isShingled = true; } boolean forceConjuncion = rerankPhase == RerankPhase.QUERY_PHASE; if (list.size() == 0) { return null; } else if (list.size() == 1) { nextToken = list.get(0); String termText = nextToken.toString(); if (!isNumeric && (termText.contains("*") || termText.contains("?"))) { return newWildcardQuery(new Term(field, termText)); } else { return newTermQuery(new Term(field, termText)); } } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: Builder q = newBooleanQuery(); for (int i = 0; i < list.size(); i++) { Query currentQuery; nextToken = list.get(i); String termText = nextToken.toString(); if (termText.contains("*") || termText.contains("?")) { currentQuery = newWildcardQuery(new Term(field, termText)); } else { currentQuery = newTermQuery(new Term(field, termText)); } q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q.build(); } else if (forceConjuncion) { BooleanQuery.Builder or = new BooleanQuery.Builder(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) { BooleanQuery.Builder and = new BooleanQuery.Builder(); for (int i = 0; i < tokenSequence.size(); i++) { nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i); String termText = nextToken.toString(); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); and.add(wildQuery, Occur.MUST); } else { TermQuery termQuery = new TermQuery(term); and.add(termQuery, Occur.MUST); } } if (and.build().clauses().size() > 0) { or.add(and.build(), Occur.SHOULD); } } return or.build(); } // shingle else if (sf.omitPositions() && isShingled) { ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list); Query currentQuery; BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder(); for (PackedTokenAttributeImpl shingleToken : nonContained) { String 
termText = shingleToken.toString(); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { currentQuery = new org.apache.lucene.search.WildcardQuery(term); } else { currentQuery = new TermQuery(term); } weakPhrase.add(currentQuery, Occur.MUST); } return weakPhrase.build(); } // Word delimiter factory and other odd things generate complex // token patterns // Smart skip token sequences with small tokens that generate // toomany wildcards // Fall back to the larger pattern // e.g Site1* will not do (S ite 1*) or (Site 1*) if 1* matches // too much (S ite1*) and (Site1*) will still be OK // If we skip all (for just 1* in the input) this is still an // issue. else { return generateSpanOrQuery(field, fixedTokenSequences); } } else { if (forceConjuncion) { BooleanQuery.Builder or = new BooleanQuery.Builder(); for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) { BooleanQuery.Builder and = new BooleanQuery.Builder(); for (int i = 0; i < tokenSequence.size(); i++) { nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i); String termText = nextToken.toString(); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); and.add(wildQuery, Occur.MUST); } else { TermQuery termQuery = new TermQuery(term); and.add(termQuery, Occur.MUST); } } if (and.build().clauses().size() > 0) { or.add(and.build(), Occur.SHOULD); } } return or.build(); } else { SpanQuery spanQuery = null; ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); int gap = 0; for (int i = 0; i < list.size(); i++) { nextToken = list.get(i); String termText = nextToken.toString(); Term term = new Term(field, termText); if (getEnablePositionIncrements()) { SpanQuery nextSpanQuery; if ((termText != null) && (termText.contains("*") || 
termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>( wildQuery); wrapper.setRewriteMethod( new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit)); nextSpanQuery = wrapper; } else { nextSpanQuery = new SpanTermQuery(term); } if (gap == 0) { atSamePositionSpanOrQueryParts.add(nextSpanQuery); } else { if (atSamePositionSpanOrQueryParts.size() == 0) { if (spanQuery == null) { spanQuery = nextSpanQuery; } else { spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery }, (gap - 1) + internalSlop, internalSlop < 2); } atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); } else if (atSamePositionSpanOrQueryParts.size() == 1) { if (spanQuery == null) { spanQuery = atSamePositionSpanOrQueryParts.get(0); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) }, (gap - 1) + internalSlop, internalSlop < 2); } atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); atSamePositionSpanOrQueryParts.add(nextSpanQuery); } else { if (spanQuery == null) { spanQuery = new SpanOrQuery( atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, new SpanOrQuery(atSamePositionSpanOrQueryParts .toArray(new SpanQuery[] {})) }, (gap - 1) + internalSlop, internalSlop < 2); } atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>(); atSamePositionSpanOrQueryParts.add(nextSpanQuery); } } gap = nextToken.getPositionIncrement(); } else { SpanQuery nextSpanQuery; if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery( term); SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new 
SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>( wildQuery); wrapper.setRewriteMethod( new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit)); nextSpanQuery = wrapper; } else { nextSpanQuery = new SpanTermQuery(term); } if (spanQuery == null) { spanQuery = new SpanOrQuery(nextSpanQuery); } else { spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery); } } } if (atSamePositionSpanOrQueryParts.size() == 0) { return spanQuery; } else if (atSamePositionSpanOrQueryParts.size() == 1) { if (spanQuery == null) { spanQuery = atSamePositionSpanOrQueryParts.get(0); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) }, (gap - 1) + internalSlop, internalSlop < 2); } return spanQuery; } else { if (spanQuery == null) { spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})); } else { spanQuery = new SpanNearQuery( new SpanQuery[] { spanQuery, new SpanOrQuery( atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) }, (gap - 1) + internalSlop, internalSlop < 2); } return spanQuery; } } } } }
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
/**
 * Returns the tokens of {@code list} that are not covered by some other token in the same list.
 * <p>
 * A token is dropped when another token (a different object) has offsets that enclose it
 * ({@code startOffset <=} and {@code endOffset >=}) and text that contains its text. Two distinct
 * tokens with identical offsets and identical text do not eliminate each other, so exact
 * duplicates are all retained. Surviving tokens keep their original relative order.
 *
 * @param list
 *            the tokens to filter; it is only read, never modified
 * @return a new list holding the tokens that no other token covers
 */
private ArrayList<PackedTokenAttributeImpl> getNonContained(ArrayList<PackedTokenAttributeImpl> list) {
    ArrayList<PackedTokenAttributeImpl> survivors = new ArrayList<PackedTokenAttributeImpl>();
    for (PackedTokenAttributeImpl candidate : list) {
        boolean covered = false;
        for (PackedTokenAttributeImpl other : list) {
            // A token never competes with itself.
            if (other == candidate) {
                continue;
            }

            // Exact duplicates (same span, same text) must not knock each other out.
            boolean sameSpan = (other.startOffset() == candidate.startOffset())
                    && (candidate.endOffset() == other.endOffset());
            if (sameSpan && other.toString().equals(candidate.toString())) {
                continue;
            }

            // Another token that encloses this one both by offsets and by text makes it redundant.
            boolean enclosed = (other.startOffset() <= candidate.startOffset())
                    && (candidate.endOffset() <= other.endOffset());
            if (enclosed && other.toString().contains(candidate.toString())) {
                covered = true;
                break;
            }
        }
        if (!covered) {
            survivors.add(candidate);
        }
    }
    return survivors;
}