Usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
From source file:org.gridkit.coherence.search.lucene.TokenStreamCheck.java
License:Apache License
/**
 * Smoke test: tokenizes "red black tree" with a WhitespaceAnalyzer, consumes the
 * first token, then wraps the stream in a CapturedTokenStream and consumes from
 * the wrapper as well.
 *
 * @throws IOException if tokenization fails (not expected for in-memory input)
 */
@Test
public void analyze() throws IOException {
    WhitespaceAnalyzer wa = new WhitespaceAnalyzer(Version.LUCENE_42);
    wa.getOffsetGap("xxx");
    TokenStream ts = wa.tokenStream("test", new StringReader("red black tree"));
    try {
        ts.reset();
        ts.incrementToken();
        ts.getAttribute(CharTermAttribute.class).buffer();
        CapturedTokenStream cts = new CapturedTokenStream(ts);
        cts.reset();
        cts.incrementToken();
        cts.getAttribute(CharTermAttribute.class).buffer();
    } finally {
        // Fix: the original never closed the stream, leaking analyzer resources.
        ts.close();
    }
}
From source file:org.grouplens.samantha.modeler.featurizer.FeatureExtractorUtilities.java
License:Open Source License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) { TokenStream ts = analyzer.tokenStream(termField, text); Map<String, Integer> termFreq = new HashMap<>(); try {//from ww w . j a v a 2 s .c o m ts.reset(); while (ts.incrementToken()) { String term = ts.reflectAsString(false); int cnt = termFreq.getOrDefault(term, 0); termFreq.put(term, cnt + 1); } ts.close(); } catch (IOException e) { logger.error("{}", e.getMessage()); throw new BadRequestException(e); } return termFreq; }
From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if there is
 * more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis, or {@code null} if the
 *         analysis produced no token at all
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                }
            }
            // Fix: end() belongs on every path per the TokenStream contract;
            // the original skipped it for the zero-token and multi-token cases.
            stream.end();
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}
From source file:org.hibernate.search.backend.spi.SingularTermDeletionQuery.java
License:LGPL
@Override public Query toLuceneQuery(DocumentBuilderIndexedEntity documentBuilder) { AnalyzerReference analyzerReferenceForEntity = documentBuilder.getAnalyzerReference(); String stringValue = documentBuilder.objectToString(fieldName, this.getValue(), new ContextualExceptionBridgeHelper()); if (this.getType() == Type.STRING) { try {/*from w w w . j av a 2 s .c o m*/ if (analyzerReferenceForEntity.is(RemoteAnalyzerReference.class)) { // no need to take into account the analyzer here as it will be dealt with remotely return new TermQuery(new Term(this.getFieldName(), stringValue)); } ScopedLuceneAnalyzer analyzerForEntity = (ScopedLuceneAnalyzer) analyzerReferenceForEntity .unwrap(LuceneAnalyzerReference.class).getAnalyzer(); TokenStream tokenStream = analyzerForEntity.tokenStream(this.getFieldName(), stringValue); tokenStream.reset(); try { BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); while (tokenStream.incrementToken()) { String term = tokenStream.getAttribute(CharTermAttribute.class).toString(); booleanQueryBuilder.add(new TermQuery(new Term(this.getFieldName(), term)), Occur.FILTER); } return booleanQueryBuilder.build(); } finally { tokenStream.close(); } } catch (IOException e) { throw new AssertionFailure( "No IOException can occur while using a TokenStream that is generated via String"); } } else { FieldBridge fieldBridge = documentBuilder.getBridge(fieldName); if (NumericFieldUtils.isNumericFieldBridge(fieldBridge)) { return NumericFieldUtils.createExactMatchQuery(fieldName, this.getValue()); } else { return new TermQuery(new Term(this.getFieldName(), stringValue)); } } }
From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java
License:Open Source License
/**
 * Builds the per-field query for a phrase search.
 *
 * <p>The sentence is analyzed and terms are grouped by token position. The result is:
 * a TermQuery (single position, single term), a BooleanQuery of SHOULD clauses
 * (single position, several terms), a MultiPhraseQuery (several positions, at least
 * one holding multiple terms — OR semantics per position), or a plain PhraseQuery.
 *
 * @param fieldContext the field to query, including its query customizer
 * @return the phrase query for this field, wrapped by the field customizer
 * @throws SearchException if analysis produces no term at all
 */
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();
    /*
     * Store terms per position and detect if for a given position more than one term is present
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1; //start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }
            if (positionIncrement > 0) {
                // a positive increment starts a new position; a zero increment stacks
                // the term on the current position (synonym-style)
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }
            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }
            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                // more than one term at the same position forces a MultiPhraseQuery
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }
    /*
     * Create the appropriate query depending on the conditions
     * note that a MultiPhraseQuery is needed if several terms share the same position
     * as it will do a OR and not a AND like PhraseQuery
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            // one position, several terms: match any of them
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
From source file:org.hibernate.search.query.dsl.impl.Helper.java
License:Open Source License
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer) throws IOException { List<String> terms = new ArrayList<String>(); // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type. if (localText == null) { throw new SearchException("Search parameter on field " + fieldName + " could not be converted. " + "Are the parameter and the field of the same type?" + "Alternatively, apply the ignoreFieldBridge() option to " + "pass String parameters"); }/*from ww w . jav a 2s. c o m*/ Reader reader = new StringReader(localText); TokenStream stream = analyzer.reusableTokenStream(fieldName, reader); TermAttribute attribute = stream.addAttribute(TermAttribute.class); stream.reset(); while (stream.incrementToken()) { if (attribute.termLength() > 0) { String term = attribute.term(); terms.add(term); } } stream.end(); stream.close(); return terms; }
From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java
License:LGPL
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldContext supplies the field name (used by the analyzer for any special
 *        per-field analysis) and whether analysis applies at all
 * @throws IOException if tokenization fails
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            // cap the amount of text inspected, More-Like-This style
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        // swallows any close-time IOException rather than masking an in-flight one
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.hibernate.search.test.serialization.SerializationTest.java
License:Open Source License
/**
 * Compares two token streams for equality by capturing both into
 * SerializableTokenStream form and comparing token-by-token, attribute-by-attribute.
 *
 * @param original the reference stream; reset before capture. A null original
 *        matches only a null copy.
 * @param copy the stream to compare against the original
 * @return true if both streams produce the same tokens with identically-typed attributes
 */
private boolean compareTokenStreams(TokenStream original, TokenStream copy) {
    if (original == null) {
        return copy == null;
    }
    try {
        // NOTE(review): only `original` is reset here, not `copy` — presumably the
        // copy is freshly built and already positioned at the start; confirm.
        original.reset();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    SerializableTokenStream serOriginal = CopyTokenStream.buildSerializabletokenStream(original);
    SerializableTokenStream serCopy = CopyTokenStream.buildSerializabletokenStream(copy);
    if (serOriginal.getStream().size() != serCopy.getStream().size()) {
        return false;
    }
    for (int i = 0; i < serOriginal.getStream().size(); i++) {
        List<AttributeImpl> origToken = serOriginal.getStream().get(i);
        List<AttributeImpl> copyToken = serCopy.getStream().get(i);
        if (origToken.size() != copyToken.size()) {
            return false;
        }
        for (int j = 0; j < origToken.size(); j++) {
            AttributeImpl origAttr = origToken.get(j);
            AttributeImpl copyAttr = copyToken.get(j);
            // attributes must be pairwise of the same concrete class...
            if (origAttr.getClass() != copyAttr.getClass()) {
                return false;
            }
            // ...and their values are checked via assertions inside this helper
            testAttributeTypes(origAttr, copyAttr);
        }
    }
    return true;
}
From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java
License:LGPL
/** * Returns the first token resulting from the analysis, logging a warning if there are more than one token. * * @param analyzer the Lucene analyzer to use * @param fieldName the name of the field: might affect the analyzer behavior * @param text the value to analyze/* www.j av a 2s. c o m*/ * @return the first token resulting from the analysis * * @throws SearchException if a problem occurs when analyzing the sortable field's value. */ public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) { final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text)); try { try { String firstToken = null; CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); stream.reset(); if (stream.incrementToken()) { firstToken = new String(term.buffer(), 0, term.length()); if (stream.incrementToken()) { log.multipleTermsInAnalyzedSortableField(fieldName); } else { stream.end(); } } return firstToken; } finally { stream.close(); } } catch (SearchException | IOException e) { throw log.couldNotAnalyzeSortableField(fieldName, e); } }
From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java
License:Apache License
/**
 * Builds a prefix query from a search string: all complete terms become MUST
 * TermQuery clauses, and the final term becomes a PrefixQuery — unless the search
 * string ends with a space, in which case every term is treated as complete.
 *
 * @param searchString the user-entered (possibly partial) search text
 * @param field        the indexed field to query
 * @param analyzer     the analyzer used to tokenize the search string
 * @return a BooleanQuery of MUST clauses
 * @throws IOException if tokenization fails
 */
protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    List<String> terms = new ArrayList<>();
    // try-with-resources: the original closed the reader and stream only on the
    // success path, leaking both when incrementToken() threw; end() was also missing.
    try (StringReader textReader = new StringReader(searchString);
            TokenStream tokenStream = analyzer.tokenStream(field, textReader)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    // NOTE(review): closing the caller-supplied analyzer renders it unusable for any
    // further analysis; kept for behavioral compatibility — confirm callers expect this.
    analyzer.close();

    BooleanQuery bq = new BooleanQuery();
    if (terms.size() > 0 && !searchString.endsWith(" ")) {
        // the last typed term is still being typed: match it as a prefix
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery(new Term(field, last)), Occur.MUST);
    }
    terms.stream().forEach((s) -> {
        bq.add(new TermQuery(new Term(field, s)), Occur.MUST);
    });
    return bq;
}