List of usage examples for org.apache.lucene.analysis.TokenStream.hasAttribute
public final boolean hasAttribute(Class<? extends Attribute> attClass)
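hasAttribute reports whether the stream's AttributeSource already contains an instance of the given attribute class. It is typically used to probe for optional attributes before calling getAttribute, which throws IllegalArgumentException when the requested attribute is absent. Below is a minimal sketch of that probe-then-get pattern; the field name "body", the sample text, and the use of StandardAnalyzer are illustrative assumptions, and exact imports and constructors vary slightly between Lucene versions:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class HasAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "hasAttribute checks the attribute source")) {
            // Probe before getAttribute(): getAttribute() throws IllegalArgumentException
            // when the requested attribute is not present on the stream.
            if (!ts.hasAttribute(CharTermAttribute.class)) {
                return; // e.g. a binary or empty token stream
            }
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            // TypeAttribute is optional here, so only fetch it if the stream carries it.
            TypeAttribute type = ts.hasAttribute(TypeAttribute.class)
                    ? ts.getAttribute(TypeAttribute.class)
                    : null;
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString() + (type != null ? " [" + type.type() + "]" : ""));
            }
            ts.end();
        }
    }
}

The examples below follow the same pattern against different analyzers and attribute types.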
From source file: org.alfresco.solr.query.Solr4QueryParser.java
License: Open Source License
/**
 * @param first
 * @param field
 * @return SpanOrQuery
 * @throws IOException
 */
private SpanQuery buildSpanOrQuery(String first, FieldInstance field) throws IOException {
    ArrayList<SpanQuery> spanOrQueryParts = new ArrayList<SpanQuery>();
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(first));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            SpanQuery termQuery = new SpanTermQuery(new Term(field.getField(), nextToken.toString()));
            spanOrQueryParts.add(termQuery);
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    if (spanOrQueryParts.size() == 1) {
        return spanOrQueryParts.get(0);
    } else {
        return new SpanOrQuery(spanOrQueryParts.toArray(new SpanQuery[] {}));
    }
}
From source file: org.apache.solr.handler.component.WordCloudComponent.java
License: Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
    SolrQueryRequest req = rb.req;
    SolrParams params = req.getParams();
    if (!params.getBool(COMPONENT_NAME, true)) {
        return;
    }

    String wcFields = null;
    if ((wcFields = params.get("wordcloud.fl", null)) == null) {
        return;
    }

    Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
    DocList ids = rb.getResults().docList;

    SolrIndexSearcher searcher = rb.req.getSearcher();
    IndexSchema schema = rb.req.getCore().getLatestSchema();

    final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
    final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();

    CharTermAttribute termAtt;
    Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

    for (String f : flds) {
        SchemaField field = schema.getFieldOrNull(f);
        if (field == null || !field.stored()) {
            continue; // ignore this field
        }
        fieldsToLoad.put(f, field.getType());
        tokens.put(f, new HashMap<String, Integer>());
    }

    DocIterator iterator = ids.iterator();
    String w;
    Integer v;
    int sz = ids.size();
    for (int i = 0; i < sz; i++) {
        int id = iterator.nextDoc();

        Document doc = searcher.doc(id, fieldsToLoad.keySet());
        for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
            Map<String, Integer> toks = tokens.get(en.getKey());
            String[] vals = doc.getValues(en.getKey());
            FieldType fType = en.getValue();

            if (vals != null) {
                for (String s : vals) {
                    TokenStream buffer = analyzer.tokenStream(en.getKey(),
                            new StringReader(fType.indexedToReadable(s)));

                    if (!buffer.hasAttribute(CharTermAttribute.class)) {
                        continue; // empty stream
                    }

                    termAtt = buffer.getAttribute(CharTermAttribute.class);
                    buffer.reset();

                    while (buffer.incrementToken()) {
                        w = termAtt.toString();
                        v = toks.get(w);
                        if (v == null)
                            v = 0;
                        toks.put(w, ++v);
                    }

                    buffer.close();
                }
            }
        }
    }

    // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)

    AtomicReader reader = searcher.getAtomicReader();
    BytesRef term;
    int df;
    String f;
    Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
    for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
        HashMap<String, Double> idfs = new HashMap<String, Double>();
        f = field.getKey();
        docFreqs.put(f, idfs);
        int N = reader.getDocCount(f);

        for (Entry<String, Integer> token : field.getValue().entrySet()) {
            w = token.getKey();
            df = reader.docFreq(new Term(f, new BytesRef(w)));
            if (df != 0) {
                idfs.put(w, Math.log10(N / df));
            }
        }
    }

    HashMap<String, Object> ret = new HashMap<String, Object>();
    for (String fi : fieldsToLoad.keySet()) {
        HashMap<String, Object> va = new HashMap<String, Object>();
        va.put("tf", tokens.get(fi));
        va.put("idf", docFreqs.get(fi));
        ret.put(fi, va);
    }
    rb.rsp.add("wordcloud", ret);
}
From source file: org.apache.solr.schema.JsonPreAnalyzedParser.java
License: Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<String, Object>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY,
                    Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<String, Object>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
From source file: org.apache.solr.schema.SimplePreAnalyzedParser.java
License: Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e="
                                + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            tok.setLength(tok.length() - 1); // remove the last comma
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
From source file: org.elasticsearch.analysis.common.ShingleTokenFilterTests.java
License: Apache License
public void testPreConfiguredShingleFilterDisableGraphAttribute() throws Exception {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            Settings.builder()
                    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                    .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
                    .build(),
            new CommonAnalysisPlugin());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle");

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("this is a test"));
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
}
From source file: org.elasticsearch.search.highlight.PlainHighlighter.java
License: Apache License
public HighlightField highlight(HighlighterContext highlighterContext) {
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    FieldMapper<?> mapper = highlighterContext.mapper;

    Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML
            : HighlightUtils.Encoders.DEFAULT;

    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter> mappers = Maps.newHashMap();
        hitContext.cache().put(CACHE_KEY, mappers);
    }
    @SuppressWarnings("unchecked")
    Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter> cache =
            (Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY);

    org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
    if (entry == null) {
        Query query = highlighterContext.query.originalQuery();
        QueryScorer queryScorer = new CustomQueryScorer(query,
                field.fieldOptions().requireFieldMatch() ? mapper.names().indexName() : null);
        queryScorer.setExpandMultiTermQuery(true);
        Fragmenter fragmenter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            fragmenter = new NullFragmenter();
        } else if (field.fieldOptions().fragmenter() == null) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else if ("simple".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
        } else if ("span".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else {
            throw new ElasticsearchIllegalArgumentException("unknown fragmenter option ["
                    + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]");
        }
        Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0],
                field.fieldOptions().postTags()[0]);

        entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
        entry.setTextFragmenter(fragmenter);
        // always highlight across all data
        entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        cache.put(mapper, entry);
    }

    // a HACK to make highlighter do highlighting, even though its using the single frag list builder
    int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1
            : field.fieldOptions().numberOfFragments();
    ArrayList<TextFragment> fragsList = new ArrayList<TextFragment>();
    List<Object> textsToHighlight;

    try {
        textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);

        for (Object textToHighlight : textsToHighlight) {
            String text = textToHighlight.toString();
            Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers()
                    .indexAnalyzer();
            TokenStream tokenStream = analyzer.tokenStream(mapper.names().indexName(), text);
            if (!tokenStream.hasAttribute(CharTermAttribute.class)
                    || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                continue;
            }
            TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false,
                    numberOfFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                    fragsList.add(bestTextFragment);
                }
            }
        }
    } catch (Exception e) {
        throw new FetchPhaseExecutionException(context,
                "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    if (field.fieldOptions().scoreOrdered()) {
        CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {
            public int compare(TextFragment o1, TextFragment o2) {
                return Math.round(o2.getScore() - o1.getScore());
            }
        });
    }
    String[] fragments;
    // number_of_fragments is set to 0 but we have a multivalued field
    if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
        fragments = new String[fragsList.size()];
        for (int i = 0; i < fragsList.size(); i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    } else {
        // refine numberOfFragments if needed
        numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
        fragments = new String[numberOfFragments];
        for (int i = 0; i < fragments.length; i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    }

    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, StringText.convertFromStringArray(fragments));
    }

    int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
    if (noMatchSize > 0 && textsToHighlight.size() > 0) {
        // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
        String fieldContents = textsToHighlight.get(0).toString();
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers()
                .indexAnalyzer();
        int end;
        try {
            end = findGoodEndForNoHighlightExcerpt(noMatchSize,
                    analyzer.tokenStream(mapper.names().indexName(), fieldContents));
        } catch (Exception e) {
            throw new FetchPhaseExecutionException(context,
                    "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
        if (end > 0) {
            return new HighlightField(highlighterContext.fieldName,
                    new Text[] { new StringText(fieldContents.substring(0, end)) });
        }
    }
    return null;
}
From source file: org.elasticsearch.search.highlight.PlainHighlighter.java
License: Apache License
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, TokenStream tokenStream) throws IOException {
    try {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    } finally {
        tokenStream.end();
        tokenStream.close();
    }
}
From source file: org.fastcatsearch.ir.index.SearchIndexWriter.java
License: Apache License
private void indexValue(int docNo, int i, Object value, boolean isIgnoreCase, int positionIncrementGap)
        throws IOException, IRException {
    if (value == null) {
        return;
    }

    char[] fieldValue = value.toString().toCharArray();
    TokenStream tokenStream = indexAnalyzerList[i].tokenStream(indexId, new CharArrayReader(fieldValue),
            indexingAnalyzerOption);
    tokenStream.reset();

    CharsRefTermAttribute termAttribute = null;
    PositionIncrementAttribute positionAttribute = null;
    StopwordAttribute stopwordAttribute = null;
    AdditionalTermAttribute additionalTermAttribute = null;
    CharTermAttribute charTermAttribute = null;

    // Look up only the attributes this token stream actually provides.
    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }
    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    }
    if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }
    // Stopword attribute, if the analyzer marks stopwords.
    if (tokenStream.hasAttribute(StopwordAttribute.class)) {
        stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
    }
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }

    int lastPosition = 0;

    while (tokenStream.incrementToken()) {
        CharVector key = null;
        if (termAttribute != null) {
            CharsRef charRef = termAttribute.charsRef();
            char[] buffer = new char[charRef.length()];
            System.arraycopy(charRef.chars, charRef.offset, buffer, 0, charRef.length);
            key = new CharVector(buffer, 0, buffer.length);
        } else {
            key = new CharVector(charTermAttribute.buffer(), 0, charTermAttribute.length());
        }

        int position = -1;
        if (positionAttribute != null) {
            position = positionAttribute.getPositionIncrement() + positionIncrementGap;
            lastPosition = position;
        }
        // logger.debug("FIELD#{}: {} >> {} ({})", indexId, key, docNo, position);

        if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
            // ignore stopwords
        } else {
            memoryPosting.add(key, docNo, position);
        }

        // if(synonymAttribute != null) {
        //     CharVector[] synonym = synonymAttribute.getSynonym();
        //     if(synonym != null) {
        //         for(CharVector token : synonym) {
        //             memoryPosting.add(token, docNo, position);
        //         }
        //     }
        // }

        if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
            Iterator<String> iter = additionalTermAttribute.iterateAdditionalTerms();
            while (iter.hasNext()) {
                CharVector token = new CharVector(iter.next().toCharArray());
                memoryPosting.add(token, docNo, lastPosition);
            }
        }
    }
}
From source file: org.fastcatsearch.plugin.analysis.RunAnalyzer.java
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        printUsage();
        System.exit(0);
    }

    File pluginDir = new File(args[0]);
    String pluginClassName = args[1];
    String analyzerId = args[2];
    RunAnalyzer runAnalyzer = new RunAnalyzer(pluginDir, pluginClassName);
    AnalyzerPool analyzerPool = runAnalyzer.getAnalyzerPool(analyzerId);
    Analyzer analyzer = null;

    try {
        analyzer = analyzerPool.getFromPool();
        // Read input from the console line by line.
        Scanner sc = new Scanner(System.in);
        System.out.println("==================================");
        System.out.println(" Fastcat analyzer");
        System.out.println(" Enter 'quit' for exit program. ");
        System.out.println("==================================");
        System.out.print("Input String: ");

        while (sc.hasNextLine()) {
            String str = sc.nextLine();
            if (str.equalsIgnoreCase("quit")) {
                break;
            }
            try {
                char[] value = str.toCharArray();
                TokenStream tokenStream = analyzer.tokenStream("", new CharArrayReader(value),
                        new AnalyzerOption());
                tokenStream.reset();

                CharsRefTermAttribute termAttribute = null;
                if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                    termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                }
                SynonymAttribute synonymAttribute = null;
                if (tokenStream.hasAttribute(SynonymAttribute.class)) {
                    synonymAttribute = tokenStream.getAttribute(SynonymAttribute.class);
                }
                AdditionalTermAttribute additionalTermAttribute = null;
                if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
                    additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
                }
                StopwordAttribute stopwordAttribute = null;
                if (tokenStream.hasAttribute(StopwordAttribute.class)) {
                    stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
                }
                CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

                while (tokenStream.incrementToken()) {
                    String word = "";
                    // Prefer CharsRefTermAttribute when the analyzer provides it.
                    if (termAttribute != null) {
                        word = termAttribute.toString();
                    } else {
                        // Otherwise fall back to CharTermAttribute.
                        word = charTermAttribute.toString();
                    }

                    // Skip stopwords.
                    if (stopwordAttribute.isStopword()) {
                        continue;
                    }

                    // Print the extracted term.
                    System.out.print(">> ");
                    System.out.println(word);

                    // Print synonyms, if any.
                    if (synonymAttribute != null) {
                        List synonyms = synonymAttribute.getSynonyms();
                        if (synonyms != null) {
                            for (Object synonymObj : synonyms) {
                                if (synonymObj instanceof CharVector) {
                                    CharVector synonym = (CharVector) synonymObj;
                                    System.out.print("S> ");
                                    System.out.println(synonym);
                                } else if (synonymObj instanceof List) {
                                    List synonymList = (List) synonymObj;
                                    for (Object synonym : synonymList) {
                                        System.out.print("S> ");
                                        System.out.println(synonym);
                                    }
                                }
                            }
                        }
                    }

                    // Print additional terms (e.g. decompounded sub-terms), if any.
                    if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
                        Iterator<String> termIter = additionalTermAttribute.iterateAdditionalTerms();
                        while (termIter.hasNext()) {
                            String token = termIter.next();
                            System.out.print("A> ");
                            System.out.println(token);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.print("Input String: ");
        }
    } finally {
        if (analyzer != null) {
            analyzerPool.releaseToPool(analyzer);
        }
    }
    System.out.print("Bye!");
}
From source file: org.sc.probro.lucene.BiothesaurusSearcher.java
License: Apache License
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
        //stream = new LowerCaseFilter(stream);

        stream.reset();
        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                String term = termattr.term();
                tokens.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
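The example above uses the older TermAttribute API, which was deprecated in Lucene 3.x and removed in 4.0. On current Lucene versions the same helper would probe for CharTermAttribute instead. A minimal sketch under that assumption (the class name Tokenizer4x is hypothetical, and the analyzer is passed in rather than held as a field as in the original):

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class Tokenizer4x {
    public static String[] tokenize(Analyzer analyzer, String input) {
        ArrayList<String> tokens = new ArrayList<String>();
        try (TokenStream stream = analyzer.tokenStream(null, new StringReader(input))) {
            // CharTermAttribute replaces TermAttribute on Lucene 4.0 and later.
            if (stream.hasAttribute(CharTermAttribute.class)) {
                CharTermAttribute termattr = stream.getAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    tokens.add(termattr.toString());
                }
                stream.end();
            }
        } catch (IllegalArgumentException | IOException e) {
            System.err.println(String.format("Phrase: \"%s\"", input));
            e.printStackTrace(System.err);
        }
        return tokens.toArray(new String[0]);
    }
}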