List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
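Every example on this page follows the same consumer contract: obtain the stream, call reset() before the first incrementToken(), drain the tokens, then call end() and close(). A minimal, self-contained sketch of that life cycle (assuming Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor; the field name and input text are arbitrary):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        List<String> tokens = new ArrayList<>();
        // try-with-resources closes the stream even if incrementToken() throws
        try (TokenStream ts = analyzer.tokenStream("field", "Hello TokenStream reset")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                     // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();                       // records the final offset state
        }
        analyzer.close();
        System.out.println(tokens);         // e.g. [hello, tokenstream, reset]
    }
}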
From source file:nl.uva.sne.commons.SemanticUtils.java
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("\u2019", "'"); // normalise curly apostrophes; the original pattern character was lost in extraction
    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    text = text.toLowerCase();
    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }
    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    // Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
    return words;
}
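A hypothetical call (the input string is invented, and the exact output depends on the analyzers behind the tokenStream()/tokenStemStream() helpers defined elsewhere in SemanticUtils):

// digits and clitics such as "'s" are stripped before analysis,
// so with stemming enabled this might yield e.g. [plumber, fix, pipe]
List<String> words = SemanticUtils.tokenize("The plumber's fixing 3 pipes!", true);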
From source file:org.aksw.palmetto.corpus.lucene.SimpleAnalyzerTest.java
License:Open Source License
public void test(boolean lowercase) throws Exception {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(lowercase);
    TokenStream stream = analyzer.tokenStream("test", text);
    CharTermAttribute token;
    int count = 0;
    stream.reset();
    while (stream.incrementToken()) {
        Assert.assertTrue(count < expectedTokens.length);
        token = stream.getAttribute(CharTermAttribute.class);
        if (lowercase) {
            Assert.assertEquals(expectedTokens[count].toLowerCase(), token.toString());
        } else {
            Assert.assertEquals(expectedTokens[count], token.toString());
        }
        ++count;
    }
    Assert.assertEquals(expectedTokens.length, count);
    analyzer.close();
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java
License:Open Source License
/**
 * Check that the TokenStream yields the exact tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 *
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 *
 * @param ts TokenStream to inspect.
 * @param expectedTokens List of tokens expected from the stream (order-insensitive).
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License:Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file:org.alfresco.solr.AlfrescoFieldType.java
License:Open Source License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
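The two incrementToken() calls enforce an exactly-one-term contract, mirroring Solr's TextField.analyzeMultiTerm: an analyzer that erases the input or splits it into several tokens is rejected with BAD_REQUEST. A hypothetical call (field name and analyzer chosen for illustration only):

// KeywordAnalyzer (org.apache.lucene.analysis.core) emits the whole input as one token,
// so the single-term contract holds and the term bytes come back unchanged
BytesRef bytes = AlfrescoFieldType.analyzeMultiTerm("name", "Foo", new KeywordAnalyzer());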
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException, IOException {
    // make sure the field exists or return a dummy query so we have no error .... ACE-3231
    SchemaField schemaField = schema.getFieldOrNull(field);
    boolean isNumeric = false;
    if (schemaField == null) {
        return new TermQuery(new Term("_dummy_", "_miss_"));
    } else {
        isNumeric = (schemaField.getType().getNumericType() != null);
        if (isNumeric) {
            // Check to see if queryText is numeric or else it will fail.
            try {
                Double.valueOf(queryText);
            } catch (NumberFormatException e) {
                return new TermQuery(new Term("_dummy_", "_miss_"));
            }
        }
    }

    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing
    // TODO: Untokenised columns with functions require special handling
    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }

    // if the incoming string already has a language identifier we strip it
    // off and add it back on again
    String localePrefix = "";
    String toTokenise = queryText;
    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        if (position > 0) {
            String language = queryText.substring(0, position + 1);
            Locale locale = new Locale(queryText.substring(1, position));
            String token = queryText.substring(position + 1);
            boolean found = false;
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
            if (found) {
                localePrefix = language;
                toTokenise = token;
            } else {
                // toTokenise = token;
            }
        }
    }
    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (isPropertyField(field) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);
        }
    }

    // find the positions of any escaped * and ? and ignore them
    Set<Integer> wildcardPoistions = getWildcardPositions(testText);

    TokenStream source = null;
    ArrayList<PackedTokenAttributeImpl> list = new ArrayList<PackedTokenAttributeImpl>();
    boolean severalTokensAtSamePosition = false;
    PackedTokenAttributeImpl nextToken;
    int positionCount = 0;
    try {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            list.add(nextToken);
            if (nextToken.getPositionIncrement() != 0)
                positionCount += nextToken.getPositionIncrement();
            else
                severalTokensAtSamePosition = true;
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }

    // add any alpha numeric wildcards that have been missed
    // Fixes most stop word and wild card issues
    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start
                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (list.size() == 0)) {
                    // Add new token followed by * not given by the tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append("", 0, 0);
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been removed
                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    PackedTokenAttributeImpl test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = test.toString();
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setEmpty();
                                test.append(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.length()) {
                                test.setEmpty();
                                test.append(test.toString() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }
                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                PackedTokenAttributeImpl test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the tokeniser
                        PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                        newToken.setEmpty().append(pre.toString());
                        newToken.setOffset(index - pre.length(), index);
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            @SuppressWarnings("resource")
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                    MLAnalysisMode.EXACT_LANGUAGE);
                            Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            PackedTokenAttributeImpl test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the tokeniser
                    PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                    newToken.setEmpty().append(post.toString());
                    newToken.setOffset(index + 1, index + 1 + post.length());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        @SuppressWarnings("resource")
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale,
                                MLAnalysisMode.EXACT_LANGUAGE);
                        Iterator<PackedTokenAttributeImpl> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }
        }
    }

    // Put in real position increments as we treat them correctly
    int curentIncrement = -1;
    for (PackedTokenAttributeImpl c : list) {
        if (curentIncrement == -1) {
            curentIncrement = c.getPositionIncrement();
        } else if (c.getPositionIncrement() > 0) {
            curentIncrement = c.getPositionIncrement();
        } else {
            c.setPositionIncrement(curentIncrement);
        }
    }

    // Fix up position increments for in phrase isolated wildcards
    boolean lastWasWild = false;
    for (int i = 0; i < list.size() - 1; i++) {
        for (int j = list.get(i).endOffset() + 1; j < list.get(i + 1).startOffset() - 1; j++) {
            if (wildcardPoistions.contains(j)) {
                if (!lastWasWild) {
                    list.get(i + 1).setPositionIncrement(list.get(i + 1).getPositionIncrement() + 1);
                }
                lastWasWild = true;
            } else {
                lastWasWild = false;
            }
        }
    }

    Collections.sort(list, new Comparator<PackedTokenAttributeImpl>() {
        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            return dif;
        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position
    LinkedList<LinkedList<PackedTokenAttributeImpl>> tokensByPosition = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    LinkedList<PackedTokenAttributeImpl> currentList = null;
    int lastStart = 0;
    for (PackedTokenAttributeImpl c : list) {
        if (c.startOffset() == lastStart) {
            if (currentList == null) {
                currentList = new LinkedList<PackedTokenAttributeImpl>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<PackedTokenAttributeImpl>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
        lastStart = c.startOffset();
    }

    // Build all the token sequences and see which ones get strung together
    OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> allTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokensAtPosition : tokensByPosition) {
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> positionalSynonymSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
        OrderedHashSet<LinkedList<PackedTokenAttributeImpl>> newAllTokenSequencesSet = new OrderedHashSet<LinkedList<PackedTokenAttributeImpl>>();
        FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (PackedTokenAttributeImpl t : tokensAtPosition) {
            PackedTokenAttributeImpl replace = new PackedTokenAttributeImpl();
            replace.setEmpty().append(t);
            replace.setOffset(t.startOffset(), t.endOffset());
            replace.setType(t.type());
            replace.setPositionIncrement(t.getPositionIncrement());
            boolean tokenFoundSequence = false;
            for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequencesSet) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.addAll(tokenSequence);
                if ((newEntry.getLast().endOffset() == replace.endOffset())
                        && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                    if ((newEntry.getLast().startOffset() == replace.startOffset())
                            && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                        newEntry.add(replace);
                        tokenFoundSequence = true;
                    } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                        if (newEntry.toString().endsWith(replace.toString())) {
                            // already in the gram
                            positionalSynonymSequencesSet.add(tokenSequence);
                            tokenFoundSequence = true;
                        } else {
                            // need to replace the synonym in the current gram
                            tokenFoundSequence = true;
                            StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                            old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                    replace.endOffset() - newEntry.getLast().startOffset(), replace.toString());
                            PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                            newToken.setEmpty().append(old.toString());
                            newToken.setOffset(newEntry.getLast().startOffset(), newEntry.getLast().endOffset());
                            newEntry.removeLast();
                            newEntry.add(newToken);
                        }
                    }
                } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                        && (newEntry.getLast().endOffset() < replace.endOffset())) {
                    if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        positionalSynonymSequencesSet.add(tokenSequence);
                    }
                    newEntry.add(replace);
                    tokenFoundSequence = true;
                }
                newAllTokenSequencesSet.add(newEntry);
            }
            if (false == tokenFoundSequence) {
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : newAllTokenSequencesSet) {
                    LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                    newEntry.addAll(tokenSequence);
                    if ((newEntry.getLast().endOffset() == replace.endOffset())
                            && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                        if ((newEntry.getLast().startOffset() == replace.startOffset())
                                && newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        } else if (newEntry.getLast().type().equals(CommonGramsFilter.GRAM_TYPE)) {
                            if (newEntry.toString().endsWith(replace.toString())) {
                                // already in the gram
                                positionalSynonymSequencesSet.add(tokenSequence);
                                tokenFoundSequence = true;
                            } else {
                                // need to replace the synonym in the current gram
                                tokenFoundSequence = true;
                                StringBuffer old = new StringBuffer(newEntry.getLast().toString());
                                old.replace(replace.startOffset() - newEntry.getLast().startOffset(),
                                        replace.endOffset() - newEntry.getLast().startOffset(),
                                        replace.toString());
                                PackedTokenAttributeImpl newToken = new PackedTokenAttributeImpl();
                                newToken.setEmpty().append(old.toString());
                                newToken.setOffset(newEntry.getLast().startOffset(),
                                        newEntry.getLast().endOffset());
                                newEntry.removeLast();
                                newEntry.add(newToken);
                                positionalSynonymSequencesSet.add(newEntry);
                            }
                        }
                    } else if ((newEntry.getLast().startOffset() < replace.startOffset())
                            && (newEntry.getLast().endOffset() < replace.endOffset())) {
                        if (newEntry.getLast().type().equals(SynonymFilter.TYPE_SYNONYM)
                                && replace.type().equals(SynonymFilter.TYPE_SYNONYM)) {
                            positionalSynonymSequencesSet.add(tokenSequence);
                            newEntry.add(replace);
                            tokenFoundSequence = true;
                        }
                    }
                }
            }
            if (false == tokenFoundSequence) {
                LinkedList<PackedTokenAttributeImpl> newEntry = new LinkedList<PackedTokenAttributeImpl>();
                newEntry.add(replace);
                newAllTokenSequencesSet.add(newEntry);
            }
            // Limit the max number of permutations we consider
            if (newAllTokenSequencesSet.size() > 64) {
                break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
            }
        }
        allTokenSequencesSet = newAllTokenSequencesSet;
        allTokenSequencesSet.addAll(positionalSynonymSequencesSet);
    }
    LinkedList<LinkedList<PackedTokenAttributeImpl>> allTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>(
            allTokenSequencesSet);

    // build the unique
    LinkedList<LinkedList<PackedTokenAttributeImpl>> fixedTokenSequences = new LinkedList<LinkedList<PackedTokenAttributeImpl>>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : allTokenSequences) {
        LinkedList<PackedTokenAttributeImpl> fixedTokenSequence = new LinkedList<PackedTokenAttributeImpl>();
        fixedTokenSequences.add(fixedTokenSequence);
        PackedTokenAttributeImpl replace = null;
        for (PackedTokenAttributeImpl c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = c.toString();
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(language + pre + token);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = c.toString();
                    replace = new PackedTokenAttributeImpl();
                    replace.setEmpty().append(pre + termText);
                    replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + token);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = replace.toString();
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(replaceTermText + pre + termText);
                        replace.setOffset(replace.startOffset(), c.endOffset());
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = c.toString();
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(language + pre + token);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = replace.toString();
                        PackedTokenAttributeImpl last = new PackedTokenAttributeImpl();
                        last.setEmpty().append(replaceTermText + post);
                        last.setOffset(replace.startOffset(), replace.endOffset() + post.length());
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new PackedTokenAttributeImpl();
                        replace.setEmpty().append(pre + termText);
                        replace.setOffset(c.startOffset() - pre.length(), c.endOffset());
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = replace.toString();
            PackedTokenAttributeImpl terminal = new PackedTokenAttributeImpl();
            terminal.setEmpty().append(replaceTermText + post);
            terminal.setOffset(replace.startOffset(), replace.endOffset() + post.length());
            terminal.setType(replace.type());
            terminal.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(terminal);
        }
    }

    // rebuild fixed list
    ArrayList<PackedTokenAttributeImpl> fixed = new ArrayList<PackedTokenAttributeImpl>();
    for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
        for (PackedTokenAttributeImpl token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment
    Collections.sort(fixed, new Comparator<PackedTokenAttributeImpl>() {
        public int compare(PackedTokenAttributeImpl o1, PackedTokenAttributeImpl o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated
    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<PackedTokenAttributeImpl>(unique);
    list = fixed;

    // add any missing locales back to the tokens
    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            PackedTokenAttributeImpl currentToken = list.get(j);
            String termText = currentToken.toString();
            currentToken.setEmpty();
            currentToken.append(localePrefix + termText);
        }
    }

    SchemaField sf = schema.getField(field);
    boolean isShingled = false;
    @SuppressWarnings("resource")
    TokenizerChain tokenizerChain = (sf.getType().getQueryAnalyzer() instanceof TokenizerChain)
            ? ((TokenizerChain) sf.getType().getQueryAnalyzer())
            : null;
    if (tokenizerChain != null) {
        for (TokenFilterFactory factory : tokenizerChain.getTokenFilterFactories()) {
            if (factory instanceof ShingleFilterFactory) {
                isShingled = true;
                break;
            }
        }
    }
    @SuppressWarnings("resource")
    AlfrescoAnalyzerWrapper analyzerWrapper = (sf.getType()
            .getQueryAnalyzer() instanceof AlfrescoAnalyzerWrapper)
                    ? ((AlfrescoAnalyzerWrapper) sf.getType().getQueryAnalyzer())
                    : null;
    if (analyzerWrapper != null) {
        // assume if there are no term positions it is shingled ....
        isShingled = true;
    }
    boolean forceConjuncion = rerankPhase == RerankPhase.QUERY_PHASE;

    if (list.size() == 0) {
        return null;
    } else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = nextToken.toString();
        if (!isNumeric && (termText.contains("*") || termText.contains("?"))) {
            return newWildcardQuery(new Term(field, termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                Builder q = newBooleanQuery();
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field, termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q.build();
            } else if (forceConjuncion) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();
                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            }
            // shingle
            else if (sf.omitPositions() && isShingled) {
                ArrayList<PackedTokenAttributeImpl> nonContained = getNonContained(list);
                Query currentQuery;
                BooleanQuery.Builder weakPhrase = new BooleanQuery.Builder();
                for (PackedTokenAttributeImpl shingleToken : nonContained) {
                    String termText = shingleToken.toString();
                    Term term = new Term(field, termText);
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        currentQuery = new org.apache.lucene.search.WildcardQuery(term);
                    } else {
                        currentQuery = new TermQuery(term);
                    }
                    weakPhrase.add(currentQuery, Occur.MUST);
                }
                return weakPhrase.build();
            }
            // Word delimiter factory and other odd things generate complex token patterns
            // Smart skip token sequences with small tokens that generate too many wildcards
            // Fall back to the larger pattern
            // e.g. Site1* will not do (S ite 1*) or (Site 1*) if 1* matches too much
            // (S ite1*) and (Site1*) will still be OK
            // If we skip all (for just 1* in the input) this is still an issue.
            else {
                return generateSpanOrQuery(field, fixedTokenSequences);
            }
        } else {
            if (forceConjuncion) {
                BooleanQuery.Builder or = new BooleanQuery.Builder();
                for (LinkedList<PackedTokenAttributeImpl> tokenSequence : fixedTokenSequences) {
                    BooleanQuery.Builder and = new BooleanQuery.Builder();
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (PackedTokenAttributeImpl) tokenSequence.get(i);
                        String termText = nextToken.toString();
                        Term term = new Term(field, termText);
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            and.add(wildQuery, Occur.MUST);
                        } else {
                            TermQuery termQuery = new TermQuery(term);
                            and.add(termQuery, Occur.MUST);
                        }
                    }
                    if (and.build().clauses().size() > 0) {
                        or.add(and.build(), Occur.SHOULD);
                    }
                }
                return or.build();
            } else {
                SpanQuery spanQuery = null;
                ArrayList<SpanQuery> atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                int gap = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = nextToken.toString();
                    Term term = new Term(field, termText);
                    if (getEnablePositionIncrements()) {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (gap == 0) {
                            atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                        } else {
                            if (atSamePositionSpanOrQueryParts.size() == 0) {
                                if (spanQuery == null) {
                                    spanQuery = nextSpanQuery;
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery, nextSpanQuery },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                            } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                                if (spanQuery == null) {
                                    spanQuery = atSamePositionSpanOrQueryParts.get(0);
                                } else {
                                    spanQuery = new SpanNearQuery(
                                            new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            } else {
                                if (spanQuery == null) {
                                    spanQuery = new SpanOrQuery(
                                            atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                                } else {
                                    spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery,
                                            new SpanOrQuery(atSamePositionSpanOrQueryParts
                                                    .toArray(new SpanQuery[] {})) },
                                            (gap - 1) + internalSlop, internalSlop < 2);
                                }
                                atSamePositionSpanOrQueryParts = new ArrayList<SpanQuery>();
                                atSamePositionSpanOrQueryParts.add(nextSpanQuery);
                            }
                        }
                        gap = nextToken.getPositionIncrement();
                    } else {
                        SpanQuery nextSpanQuery;
                        if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                            org.apache.lucene.search.WildcardQuery wildQuery = new org.apache.lucene.search.WildcardQuery(
                                    term);
                            SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery> wrapper = new SpanMultiTermQueryWrapper<org.apache.lucene.search.WildcardQuery>(
                                    wildQuery);
                            wrapper.setRewriteMethod(
                                    new TopTermsSpanBooleanQueryRewrite(topTermSpanRewriteLimit));
                            nextSpanQuery = wrapper;
                        } else {
                            nextSpanQuery = new SpanTermQuery(term);
                        }
                        if (spanQuery == null) {
                            spanQuery = new SpanOrQuery(nextSpanQuery);
                        } else {
                            spanQuery = new SpanOrQuery(spanQuery, nextSpanQuery);
                        }
                    }
                }
                if (atSamePositionSpanOrQueryParts.size() == 0) {
                    return spanQuery;
                } else if (atSamePositionSpanOrQueryParts.size() == 1) {
                    if (spanQuery == null) {
                        spanQuery = atSamePositionSpanOrQueryParts.get(0);
                    } else {
                        spanQuery = new SpanNearQuery(
                                new SpanQuery[] { spanQuery, atSamePositionSpanOrQueryParts.get(0) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                } else {
                    if (spanQuery == null) {
                        spanQuery = new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {}));
                    } else {
                        spanQuery = new SpanNearQuery(new SpanQuery[] { spanQuery,
                                new SpanOrQuery(atSamePositionSpanOrQueryParts.toArray(new SpanQuery[] {})) },
                                (gap - 1) + internalSlop, internalSlop < 2);
                    }
                    return spanQuery;
                }
            }
        }
    }
}
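The span-building branches above ultimately chain consecutive tokens into SpanNearQuery clauses, wrapping wildcard terms in SpanMultiTermQueryWrapper so they can participate as span clauses. A minimal sketch of that core idea (the field name, terms, and slop are illustrative, not values from the method; classes are from org.apache.lucene.index, org.apache.lucene.search, and org.apache.lucene.search.spans):

SpanQuery first = new SpanTermQuery(new Term("content", "big"));
SpanQuery second = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("content", "dat*")));
// slop 0, in order: behaves like a phrase query whose last clause is a wildcard
SpanQuery phrase = new SpanNearQuery(new SpanQuery[] { first, second }, 0, true);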
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
/**
 * @param first
 * @param field
 * @return SpanOrQuery
 * @throws IOException
 */
private SpanQuery buildSpanOrQuery(String first, FieldInstance field) throws IOException {
    ArrayList<SpanQuery> spanOrQueryParts = new ArrayList<SpanQuery>();
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(first));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            SpanQuery termQuery = new SpanTermQuery(new Term(field.getField(), nextToken.toString()));
            spanOrQueryParts.add(termQuery);
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    if (spanOrQueryParts.size() == 1) {
        return spanOrQueryParts.get(0);
    } else {
        return new SpanOrQuery(spanOrQueryParts.toArray(new SpanQuery[] {}));
    }
}
From source file:org.alfresco.solr.SolrInformationServer.java
License:Open Source License
private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME, response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy content
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // release the response only when the content has been read
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();
            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}
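The fingerprint loop hex-encodes every char of each min-hash token with Integer.toHexString before storing it in FINGERPRINT_FIELD. A standalone sketch of just that conversion (the token characters are invented for illustration):

char[] buff = { 0x12ab, 0x3cd4 };              // pretend min-hash token characters
StringBuilder tokenBuff = new StringBuilder();
for (char c : buff) {
    tokenBuff.append(Integer.toHexString(c));  // note: no zero-padding, as in the code above
}
System.out.println(tokenBuff);                 // prints 12ab3cd4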