List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute, collected from open-source projects.
public final <T extends Attribute> T addAttribute(Class<T> attClass)
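addAttribute returns the attribute instance of the given class registered with this stream, creating and adding a new instance first if one is not already present; the returned instance is then updated in place on each call to incrementToken(). All of the examples below follow the same consumption pattern. Here is a minimal sketch of that pattern (the StandardAnalyzer, field name, and printing are illustrative placeholders, not taken from any example below; the no-argument StandardAnalyzer constructor assumes Lucene 5.x or later):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    // TokenStream is Closeable, so try-with-resources takes care of close().
    try (TokenStream stream = analyzer.tokenStream("field", new StringReader(text))) {
        // Register (or fetch) the attribute before consuming the stream.
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(termAttr.toString()); // termAttr is updated in place per token
        }
        stream.end(); // records the final token state before close()
    }
}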
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
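This example tokenizes the text content of a query element and turns each token into a SpanTermQuery for a SpanNearQuery; addAttribute(CharTermAttribute.class) gives access to each token's text.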
private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    int slop = getSlop(node);
    if (slop < 0)
        slop = 0;
    boolean inOrder = true;
    if (node.hasAttribute("ordered"))
        inOrder = node.getAttribute("ordered").equals("yes");
    if (!hasElementContent(node)) {
        String qstr = getText(node);
        List<SpanTermQuery> list = new ArrayList<>(8);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                list.add(new SpanTermQuery(new Term(field, termAttr.toString())));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder);
    }
    SpanQuery[] children = parseSpanChildren(field, node, analyzer);
    return new SpanNearQuery(children, slop, inOrder);
}
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
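This example analyzes a string and returns only the first token it produces, again reading token text through a CharTermAttribute.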
private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        stream.close();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
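This example analyzes range-index content and captures the first token as a BytesRef via TermToBytesRefAttribute, falling back to the raw string when no analyzer is configured.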
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs) throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        return new BytesRef(data);
    }
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(data));
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        try {
            stream.reset();
            if (stream.incrementToken()) {
                termAttr.fillBytesRef();
                token = termAttr.getBytesRef();
            }
            stream.end();
        } finally {
            stream.close();
        }
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}
From source file:org.fao.geonet.kernel.search.LuceneSearcher.java
License:Open Source License
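This example splits a request string into tokens and rejoins them with spaces, re-adding the surrounding quotes for phrase queries; it uses the pre-Lucene 4 TermAttribute API.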
/**
 * Splits text into tokens using the Analyzer that is matched to the field.
 *
 * @param field the field whose analyzer should be used
 * @param requestStr the request string to tokenize
 * @param a the per-field analyzer wrapper
 * @return the analyzed string, re-quoted if the input was a phrase
 */
private static String analyzeText(String field, String requestStr, PerFieldAnalyzerWrapper a) {
    boolean phrase = false;
    if (requestStr.startsWith("\"") && requestStr.endsWith("\"")) {
        phrase = true;
    }
    TokenStream ts = a.tokenStream(field, new StringReader(requestStr));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    List<String> tokenList = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            tokenList.add(termAtt.term());
        }
    } catch (Exception e) {
        // TODO why swallow
        e.printStackTrace();
    }
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < tokenList.size(); i++) {
        if (i > 0) {
            result.append(" ");
        }
        result.append(tokenList.get(i));
    }
    String outStr = result.toString();
    if (phrase) {
        outStr = "\"" + outStr + "\"";
    }
    return outStr;
}
From source file:org.hbasene.index.HBaseIndexWriter.java
License:Apache License
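This example registers both TermAttribute and PositionIncrementAttribute on a stream to build per-term position vectors while indexing a document into HBase.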
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?
    }
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();
    for (Fieldable field : doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);
            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());
                List<Integer> pvec = termPositions.get(term);
                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }
                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);
            }
            tokens.close();
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));
            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}
From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java
License:LGPL
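This example normalizes a value by analysis, returning the first token and logging a warning if the analyzer unexpectedly produces more than one.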
/**
 * Returns the first token resulting from the analysis, logging a warning if the analysis produces more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}
From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java
License:Open Source License
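This example tracks position increments while tokenizing a phrase, so that terms sharing a position can be combined into a MultiPhraseQuery rather than a plain PhraseQuery.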
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect if more than one term is present for a given position.
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1; // start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }
            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }
            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }
            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions;
     * note that a MultiPhraseQuery is needed if several terms share the same position,
     * as it will do an OR and not an AND like PhraseQuery.
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
From source file:org.hibernate.search.query.dsl.impl.Helper.java
License:Open Source License
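This example collects every non-empty term produced by analyzing a text value.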
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();
    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type? "
                + "Alternatively, apply the ignoreFieldBridge() option to pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}
From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java
License:LGPL
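This example counts term frequencies for a more-like-this query, skipping noise words and capping the number of tokens parsed.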
/**
 * Adds the term frequencies found by tokenizing text from the reader into the given map.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldContext provides the field name, used by the analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.hibernate.search.test.util.AnalyzerUtils.java
License:Open Source License
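This test helper materializes all tokens from an analysis run into a Token array.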
public static Token[] tokensFromAnalysis(Analyzer analyzer, String field, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    List<Token> tokenList = new ArrayList<Token>();
    while (stream.incrementToken()) {
        tokenList.add(new Token(term.term(), 0, 0));
    }
    return tokenList.toArray(new Token[tokenList.size()]);
}