List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
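All of the examples on this page follow the same consumer contract: reset() the stream before the first incrementToken() call, call end() once incrementToken() has returned false, then close() the stream. The following is a minimal, self-contained sketch of that contract, not taken from any of the projects below; it assumes a recent Lucene version (5+, where StandardAnalyzer has a no-argument constructor and TokenStream is AutoCloseable), and the class name, field name, and text literal are placeholders.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() may hand back a reused instance: always reset() before consuming
        try (TokenStream stream = analyzer.tokenStream("field", "some text to analyze")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            // end() performs end-of-stream work, such as setting the final offset
            stream.end();
        } // try-with-resources closes the stream
    }
}

Several of the examples below predate this API and use the older TermAttribute and reusableTokenStream() calls instead.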
From source file:org.genemania.completion.lucene.GeneCompletionProvider.java
License:Open Source License
public Long getNodeId(String symbol) {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term()));
        }
        tokens.end();
        tokens.close();
        final Set<Long> nodes = new HashSet<Long>();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int id) {
                try {
                    Document document = searcher.doc(id);
                    nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD)));
                } catch (IOException e) {
                    log(e);
                }
            }
        });
        if (nodes.size() > 0) {
            return nodes.iterator().next();
        }
    } catch (IOException e) {
        log(e);
    }
    return null;
}
From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java
License:Open Source License
public void classify(final String symbol, final IGeneClassificationHandler handler) throws ApplicationException {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term()));
        }
        tokens.end();
        tokens.close();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int doc) {
                try {
                    Document document = searcher.doc(doc);
                    long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID));
                    handler.handleClassification(symbol, organismId);
                } catch (IOException e) {
                    log(e);
                }
            }
        });
    } catch (IOException e) {
        throw new ApplicationException(e);
    }
}
From source file:org.genemania.mediator.lucene.LuceneMediator.java
License:Open Source License
protected PhraseQuery createPhraseQuery(String field, String phrase) throws IOException {
    TokenStream stream = analyze(phrase);
    stream.reset();
    PhraseQuery query = new PhraseQuery();
    while (stream.incrementToken()) {
        TermAttribute term = stream.getAttribute(TermAttribute.class);
        query.add(new Term(field, term.term()));
    }
    stream.end();
    stream.close();
    return query;
}
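For comparison, here is a hypothetical adaptation of the same helper to newer Lucene versions, where TermAttribute was replaced by CharTermAttribute and PhraseQuery became immutable and is assembled through PhraseQuery.Builder. This is an illustrative sketch, not code from the GeneMANIA project; analyze(...) is assumed to return a TokenStream as in the original.

// Illustrative modern-Lucene sketch (assumption: not part of LuceneMediator)
protected PhraseQuery createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    try (TokenStream stream = analyze(phrase)) { // analyze(...) assumed as in the original
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            builder.add(new Term(field, term.toString()));
        }
        stream.end();
    }
    return builder.build();
}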
From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if there are more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}
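Note how this helper only reaches stream.end() on the single-token path: when the stream yields no token, or more than one, the method falls through to close() without calling end(). The TokenStream contract expects end() after incrementToken() has returned false, so the early exits here trade strict contract compliance for simplicity. The same pattern appears again in InternalAnalyzerUtils.analyzeSortableValue below.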
From source file:org.hibernate.search.indexes.serialization.impl.CopyTokenStream.java
License:Open Source License
private static List<List<AttributeImpl>> fillCache(TokenStream input) throws IOException {
    List<List<AttributeImpl>> results = new ArrayList<List<AttributeImpl>>();
    while (input.incrementToken()) {
        List<AttributeImpl> attrs = new ArrayList<AttributeImpl>();
        results.add(attrs);
        Iterator<AttributeImpl> iter = input.getAttributeImplsIterator();
        while (iter.hasNext()) {
            // we need to clone as AttributeImpl instances can be reused across incrementToken() calls
            attrs.add((AttributeImpl) iter.next().clone());
        }
    }
    input.end();
    return results;
}
From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java
License:Open Source License
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect if for a given position more than one term is present
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1; //start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }
            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }
            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }
            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions
     * note that a MultiPhraseQuery is needed if several terms share the same position
     * as it will do a OR and not a AND like PhraseQuery
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
From source file:org.hibernate.search.query.dsl.impl.Helper.java
License:Open Source License
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();
    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type?"
                + "Alternatively, apply the ignoreFieldBridge() option to "
                + "pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}
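This helper, like ConnectedMultiFieldsPhraseQueryBuilder above, obtains its stream via Analyzer.reusableTokenStream(...), an API from the Lucene 3.x line. In Lucene 4.0 that method was removed and Analyzer.tokenStream(...) took over stream reuse internally, which is the call used by the normalize and addTermFrequencies examples elsewhere on this page.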
From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java
License:LGPL
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if there are more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}
From source file:org.index.Tag.java
private String getBagOfWords(String text) throws Exception {
    StringBuffer buff = new StringBuffer();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
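Unlike the Hibernate Search examples above, this method does not guard end() and close() with try/finally, so a tokenization failure inside the loop would leave the stream unclosed; the earlier examples show the more defensive pattern.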