List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:org.hibernate.search.util.AnalyzerUtils.java
License:Open Source License
/**
 * Analyzes {@code text} with the given analyzer and returns the produced term values in order.
 *
 * @param analyzer the Lucene analyzer used to tokenize the text
 * @param field the field name; may influence the analyzer's behavior
 * @param text the text to analyze
 * @return the term values produced by the analysis, in token order
 * @throws IOException if the token stream cannot be consumed
 */
public static List<String> tokenizedTermValues(Analyzer analyzer, String field, String text) throws IOException {
    List<String> tokenList = new ArrayList<String>();
    // try-with-resources guarantees the stream is released even if tokenization fails
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // mandatory before incrementToken() since Lucene 4
        while (stream.incrementToken()) {
            tokenList.add(new String(term.buffer(), 0, term.length()));
        }
        stream.end(); // finalize stream state (e.g. final offset)
    }
    return tokenList;
}
From source file:org.hibernate.search.util.AnalyzerUtils.java
License:Open Source License
public static Token[] tokensFromAnalysis(Analyzer analyzer, String field, String text) throws IOException { TokenStream stream = analyzer.tokenStream(field, new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); List<Token> tokenList = new ArrayList<Token>(); while (stream.incrementToken()) { Token token = new Token(); token.copyBuffer(term.buffer(), 0, term.length()); tokenList.add(token);//from w ww . jav a 2 s. c o m } return tokenList.toArray(new Token[tokenList.size()]); }
From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if
 * there is more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis, or {@code null} if the
 *         analysis produced no token at all
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            // reset() is required before the first incrementToken() call
            stream.reset();
            if (stream.incrementToken()) {
                // Copy the term out immediately: the attribute buffer is reused
                // by the next incrementToken() call.
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    // More than one token: warn, but still return only the first one.
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    stream.end();
                }
            }
            // NOTE(review): stream.end() is skipped when multiple tokens are found
            // or when the analysis yields no token at all — confirm this is intended.
            return firstToken;
        } finally {
            // close() releases the stream's resources regardless of outcome
            stream.close();
        }
    } catch (SearchException | IOException e) {
        // Wrap any analysis failure into the project's SearchException via the logger factory
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}
From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java
License:Apache License
/**
 * Tokenizes {@code searchString} with the given analyzer and builds a boolean
 * query in which every token is a required term, except the last one which is
 * turned into a prefix query (unless the search string ends with a space, in
 * which case all tokens are exact terms).
 *
 * @param searchString the raw user search string
 * @param field the indexed field to query
 * @param analyzer the analyzer used to tokenize the search string; owned by
 *        the caller and therefore NOT closed here
 * @return the assembled boolean query
 * @throws IOException if the token stream cannot be consumed
 */
protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    List<String> terms = new ArrayList<>();
    // try-with-resources closes reader and stream even on failure.
    // Fix: the analyzer is no longer closed here — closing a caller-supplied
    // analyzer would break any subsequent use of it.
    try (StringReader textReader = new StringReader(searchString);
            TokenStream tokenStream = analyzer.tokenStream(field, textReader)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    BooleanQuery bq = new BooleanQuery();
    if (terms.size() > 0 && !searchString.endsWith(" ")) {
        // Last token is treated as an in-progress word: match it as a prefix.
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery(new Term(field, last)), Occur.MUST);
    }
    // Every remaining (complete) token must match exactly.
    terms.forEach(s -> bq.add(new TermQuery(new Term(field, s)), Occur.MUST));
    return bq;
}
From source file:org.index.Tag.java
/**
 * Strips markup from {@code text} and returns its analyzed terms joined by
 * single spaces (with a trailing space when at least one term is produced).
 *
 * @param text the raw question text, possibly containing tags
 * @return the space-separated bag of words
 * @throws Exception if tag removal or analysis fails
 */
private String getBagOfWords(String text) throws Exception {
    // StringBuilder: local-only buffer, no need for StringBuffer's synchronization
    StringBuilder buff = new StringBuilder();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    // try-with-resources ensures the stream is closed even if analysis fails
    try (TokenStream stream = analyzer.tokenStream("bow", new StringReader(text))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            buff.append(termAtt.toString()).append(" ");
        }
        stream.end();
    }
    return buff.toString();
}
From source file:org.index.TermScore.java
private List<String> getBagOfWords(String text) throws Exception { List<String> terms = new ArrayList<>(); text = Question.removeTags(text);//from w ww.j a v a2 s . c o m boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true")); String stopFile = prop.getProperty("stopfile"); Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9); /*SOAnalyzer(toStem, stopFile)*/; TokenStream stream = analyzer.tokenStream("bow", new StringReader(text)); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String term = termAtt.toString(); terms.add(term); } stream.end(); stream.close(); return terms; }
From source file:org.languagetool.dev.index.LanguageToolFilterTest.java
License:Open Source License
/**
 * Debug helper: prints every token of the (already reset) stream to stdout,
 * grouped by position, in the form {@code [term:start->end:type]}.
 *
 * @param stream the token stream to drain; consumed but not closed here
 * @throws IOException if the stream cannot be consumed
 */
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute incrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    int pos = 0;
    while (stream.incrementToken()) {
        int delta = incrAtt.getPositionIncrement();
        if (delta > 0) {
            // A positive increment starts a new position: begin a new output line.
            pos += delta;
            System.out.println();
            System.out.print(pos + ": ");
        }
        String details = "[" + termAtt.toString() + ":" + offsetAtt.startOffset() + "->"
                + offsetAtt.endOffset() + ":" + typeAtt.type() + "] ";
        System.out.print(details);
    }
    System.out.println();
}
From source file:org.LexGrid.LexBIG.Impl.Extensions.GenericExtensions.search.SearchExtensionImpl.java
License:Open Source License
public List<String> tokenize(Analyzer analyzer, String field, String keywords) throws IOException { List<String> result = new ArrayList<String>(); StringReader reader = new StringReader(keywords); TokenStream stream = analyzer.tokenStream(field, reader); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); try {//from w ww.j a v a 2s . c om stream.reset(); while (stream.incrementToken()) { result.add(termAtt.toString()); } stream.close(); } finally { stream.close(); } return result; }
From source file:org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License:Open Source License
public static List<String> readTokenStream(TokenStream tok) throws IOException { List<String> terms = new ArrayList<String>(); CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class); try {//from ww w . ja va2 s . c o m tok.reset(); while (tok.incrementToken()) { terms.add(termAtt.toString()); } tok.end(); } finally { tok.close(); } return terms; }
From source file:org.modeshape.jcr.query.lucene.LuceneQueryFactory.java
License:Open Source License
public Query createQuery(final SelectorName selectorName, String fieldName, FullTextSearch.Term term) throws IOException { assert fieldName != null; if (term instanceof FullTextSearch.Conjunction) { FullTextSearch.Conjunction conjunction = (FullTextSearch.Conjunction) term; BooleanQuery query = new BooleanQuery(); for (FullTextSearch.Term nested : conjunction) { if (nested instanceof NegationTerm) { Query subQuery = createQuery(selectorName, fieldName, ((NegationTerm) nested).getNegatedTerm()); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.MUST_NOT); }//from www.j a v a 2 s .c om } else { Query subQuery = createQuery(selectorName, fieldName, nested); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.MUST); } } } return query; } if (term instanceof FullTextSearch.Disjunction) { FullTextSearch.Disjunction disjunction = (FullTextSearch.Disjunction) term; BooleanQuery query = new BooleanQuery(); for (FullTextSearch.Term nested : disjunction) { if (nested instanceof NegationTerm) { Query subQuery = createQuery(selectorName, fieldName, ((NegationTerm) nested).getNegatedTerm()); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.MUST_NOT); } } else { Query subQuery = createQuery(selectorName, fieldName, nested); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.SHOULD); } } } return query; } if (term instanceof FullTextSearch.SimpleTerm) { FullTextSearch.SimpleTerm simple = (FullTextSearch.SimpleTerm) term; Analyzer analyzer = getFullTextSearchAnalyzer(); if (simple.containsWildcards()) { // Use the ComplexPhraseQueryParser, but instead of wildcard queries (which don't work with leading // wildcards) we should use our like queries (which often use RegexQuery where applicable) ... 
QueryParser parser = new QueryParser(version, fieldName, analyzer) { @Override protected org.apache.lucene.search.Query getWildcardQuery(String field, String termStr) { return findNodesLike(selectorName, termStr.toLowerCase(), field, CaseOperations.LOWERCASE); } }; parser.setAllowLeadingWildcard(true); try { String expression = simple.getValue(); // The ComplexPhraseQueryParser only understands the '?' and '*' as being wildcards ... expression = expression.replaceAll("(?<![\\\\])_", "?"); expression = expression.replaceAll("(?<![\\\\])%", "*"); // // Replace any '-' between tokens, except when preceded or followed by a digit, '*', or '?' ... expression = expression.replaceAll("((?<![\\d*?]))[-]((?![\\d*?]))", "$1 $2"); // Then use the parser ... return parser.parse(expression); } catch (ParseException e) { throw new IOException(e); } } PhraseQuery query = new PhraseQuery(); query.setSlop(0); // terms must be adjacent String expression = simple.getValue(); // Run the expression through the Lucene analyzer to extract the terms ... TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(expression)); CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { // The term attribute object has been modified to contain the next term ... String analyzedTerm = termAttribute.toString(); query.add(new Term(fieldName, analyzedTerm)); } return query; } // Should not get here ... assert false; return null; }