List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
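Every example on this page follows the same consumption contract: obtain a TokenStream from an Analyzer, register the attributes you want to read, call reset() before the first incrementToken(), pull tokens until incrementToken() returns false, then call end() and close(). A minimal, self-contained sketch of that loop against a recent Lucene (5+); the field name "body" and the sample text are placeholders, and any stock analyzer works:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamLoop {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello token stream world")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // false once the stream is exhausted
                System.out.println(term.toString());
            }
            ts.end();                     // records final offset state
        }                                 // try-with-resources calls close()
    }
}

Note that several of the snippets below were written against Lucene 3.x and omit reset(); from Lucene 4 onward the TokenStream state machine enforces it, typically failing with an IllegalStateException about a contract violation.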
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        stream.close();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}
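This helper deliberately keeps only the first token: incrementToken() is called once, so if the analyzer splits the text into several tokens, everything after the first is discarded, which fits its role of producing a single query term.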
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs)
        throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        return new BytesRef(data);
    }
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(data));
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        try {
            stream.reset();
            if (stream.incrementToken()) {
                termAttr.fillBytesRef();
                token = termAttr.getBytesRef();
            }
            stream.end();
        } finally {
            stream.close();
        }
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}
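A version note: the fillBytesRef()/getBytesRef() pair is the Lucene 4.x shape of TermToBytesRefAttribute. In Lucene 5.0 fillBytesRef() was removed and getBytesRef() returns the term bytes directly, so on newer versions the body of the if-block reduces to token = BytesRef.deepCopyOf(termAttr.getBytesRef()); (the deep copy matters because the returned BytesRef is reused across tokens).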
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License:Apache License
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            String termText = new String(term.buffer(), 0, term.length());
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
        if (LOG.isTraceEnabled()) {
            LOG.trace("An exception occurred: " + e.getMessage());
        }
    }
    return new TermPositionVector() {
        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
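TermPositionVector and TermVectorOffsetInfo are Lucene 3.x types that were removed in Lucene 4.0, where term vectors are read through Terms and TermsEnum instead, so this snippet compiles only against the older API; the incrementToken() loop that collects per-term offsets is independent of that and carries over unchanged.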
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from the reader into the map of words.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName used by the analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        String word = new String(term.buffer(), 0, term.length());
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:org.fao.geonet.kernel.search.LuceneSearcher.java
License:Open Source License
/**
 * Splits text into tokens using the Analyzer that is matched to the field.
 *
 * @param field the field whose analyzer is used
 * @param requestStr the text to tokenize, possibly quoted as a phrase
 * @param a the per-field analyzer wrapper
 * @return the tokenized text, re-quoted if the input was a phrase
 */
private static String analyzeText(String field, String requestStr, PerFieldAnalyzerWrapper a) {
    boolean phrase = false;
    if (requestStr.startsWith("\"") && requestStr.endsWith("\"")) {
        phrase = true;
    }
    TokenStream ts = a.tokenStream(field, new StringReader(requestStr));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    List<String> tokenList = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            tokenList.add(termAtt.term());
        }
    } catch (Exception e) {
        // TODO why swallow
        e.printStackTrace();
    }
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < tokenList.size(); i++) {
        if (i > 0) {
            result.append(" ");
        }
        result.append(tokenList.get(i));
    }
    String outStr = result.toString();
    if (phrase) {
        outStr = "\"" + outStr + "\"";
    }
    return outStr;
}
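TermAttribute and its term() accessor are the pre-4.0 API; CharTermAttribute replaced them. A hedged sketch of the same collection loop on a current Lucene, assuming the surrounding method declares or handles IOException (field, requestStr, and the analyzer are as above; the reset()/end() calls missing from the original are included, since Lucene 4+ enforces them):

try (TokenStream ts = a.tokenStream(field, new StringReader(requestStr))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // required before the first incrementToken()
    while (ts.incrementToken()) {
        tokenList.add(termAtt.toString());
    }
    ts.end();
}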
From source file:org.fastcatsearch.ir.index.SearchIndexWriter.java
License:Apache License
private void indexValue(int docNo, int i, Object value, boolean isIgnoreCase, int positionIncrementGap)
        throws IOException, IRException {
    if (value == null) {
        return;
    }
    char[] fieldValue = value.toString().toCharArray();
    TokenStream tokenStream = indexAnalyzerList[i].tokenStream(indexId, new CharArrayReader(fieldValue),
            indexingAnalyzerOption);
    tokenStream.reset();

    CharsRefTermAttribute termAttribute = null;
    PositionIncrementAttribute positionAttribute = null;
    StopwordAttribute stopwordAttribute = null;
    AdditionalTermAttribute additionalTermAttribute = null;
    CharTermAttribute charTermAttribute = null;

    // Pick up only the attributes this stream actually provides.
    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }
    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    }
    if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }
    // Tokens flagged as stopwords are skipped below.
    if (tokenStream.hasAttribute(StopwordAttribute.class)) {
        stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
    }
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }

    int lastPosition = 0;
    while (tokenStream.incrementToken()) {
        CharVector key = null;
        if (termAttribute != null) {
            CharsRef charRef = termAttribute.charsRef();
            char[] buffer = new char[charRef.length()];
            System.arraycopy(charRef.chars, charRef.offset, buffer, 0, charRef.length);
            key = new CharVector(buffer, 0, buffer.length);
        } else {
            key = new CharVector(charTermAttribute.buffer(), 0, charTermAttribute.length());
        }
        int position = -1;
        if (positionAttribute != null) {
            position = positionAttribute.getPositionIncrement() + positionIncrementGap;
            lastPosition = position;
        }
        // logger.debug("FIELD#{}: {} >> {} ({})", indexId, key, docNo, position);
        if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
            // ignore stopwords
        } else {
            memoryPosting.add(key, docNo, position);
        }
        // if (synonymAttribute != null) {
        //     CharVector[] synonym = synonymAttribute.getSynonym();
        //     if (synonym != null) {
        //         for (CharVector token : synonym) {
        //             memoryPosting.add(token, docNo, position);
        //         }
        //     }
        // }
        if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
            // Index expanded terms at the position of the last regular token.
            Iterator<String> iter = additionalTermAttribute.iterateAdditionalTerms();
            while (iter.hasNext()) {
                CharVector token = new CharVector(iter.next().toCharArray());
                memoryPosting.add(token, docNo, lastPosition);
            }
        }
    }
}
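Note that CharsRefTermAttribute, AdditionalTermAttribute, StopwordAttribute, and the three-argument tokenStream(indexId, reader, option) overload are fastcatsearch extensions rather than stock Lucene; only CharTermAttribute, PositionIncrementAttribute, and the incrementToken() contract itself come from org.apache.lucene.analysis. The pattern of guarding every optional attribute with hasAttribute() before getAttribute() is the part worth copying, since getAttribute() throws IllegalArgumentException when the attribute is absent.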
From source file:org.fastcatsearch.plugin.analysis.RunAnalyzer.java
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        printUsage();
        System.exit(0);
    }
    File pluginDir = new File(args[0]);
    String pluginClassName = args[1];
    String analyzerId = args[2];
    RunAnalyzer runAnalyzer = new RunAnalyzer(pluginDir, pluginClassName);
    AnalyzerPool analyzerPool = runAnalyzer.getAnalyzerPool(analyzerId);
    Analyzer analyzer = null;
    try {
        // Borrow an analyzer instance from the pool for this session.
        analyzer = analyzerPool.getFromPool();
        Scanner sc = new Scanner(System.in);
        System.out.println("==================================");
        System.out.println(" Fastcat analyzer");
        System.out.println(" Enter 'quit' for exit program. ");
        System.out.println("==================================");
        System.out.print("Input String: ");
        while (sc.hasNextLine()) {
            String str = sc.nextLine();
            if (str.equalsIgnoreCase("quit")) {
                break;
            }
            try {
                char[] value = str.toCharArray();
                TokenStream tokenStream = analyzer.tokenStream("", new CharArrayReader(value),
                        new AnalyzerOption());
                tokenStream.reset();

                CharsRefTermAttribute termAttribute = null;
                if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                    termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                }
                SynonymAttribute synonymAttribute = null;
                if (tokenStream.hasAttribute(SynonymAttribute.class)) {
                    synonymAttribute = tokenStream.getAttribute(SynonymAttribute.class);
                }
                AdditionalTermAttribute additionalTermAttribute = null;
                if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
                    additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
                }
                StopwordAttribute stopwordAttribute = null;
                if (tokenStream.hasAttribute(StopwordAttribute.class)) {
                    stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
                }
                CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

                while (tokenStream.incrementToken()) {
                    String word = "";
                    // Prefer CharsRefTermAttribute when the stream provides it;
                    // otherwise fall back to the standard CharTermAttribute.
                    if (termAttribute != null) {
                        word = termAttribute.toString();
                    } else {
                        word = charTermAttribute.toString();
                    }
                    // Skip tokens flagged as stopwords.
                    if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
                        continue;
                    }
                    // Print the token itself.
                    System.out.print(">> ");
                    System.out.println(word);
                    // Print any synonyms attached to this token.
                    if (synonymAttribute != null) {
                        List synonyms = synonymAttribute.getSynonyms();
                        if (synonyms != null) {
                            for (Object synonymObj : synonyms) {
                                if (synonymObj instanceof CharVector) {
                                    CharVector synonym = (CharVector) synonymObj;
                                    System.out.print("S> ");
                                    System.out.println(synonym);
                                } else if (synonymObj instanceof List) {
                                    List synonymList = (List) synonymObj;
                                    for (Object synonym : synonymList) {
                                        System.out.print("S> ");
                                        System.out.println(synonym);
                                    }
                                }
                            }
                        }
                    }
                    // Print any additional (expanded) terms derived from this token.
                    if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
                        Iterator<String> termIter = additionalTermAttribute.iterateAdditionalTerms();
                        while (termIter.hasNext()) {
                            String token = termIter.next();
                            System.out.print("A> ");
                            System.out.println(token);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.print("Input String: ");
        }
    } finally {
        if (analyzer != null) {
            analyzerPool.releaseToPool(analyzer);
        }
    }
    System.out.print("Bye!");
}
From source file:org.genemania.completion.lucene.GeneCompletionProvider.java
License:Open Source License
public Long getNodeId(String symbol) {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term()));
        }
        tokens.end();
        tokens.close();
        final Set<Long> nodes = new HashSet<Long>();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int id) {
                try {
                    Document document = searcher.doc(id);
                    nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD)));
                } catch (IOException e) {
                    log(e);
                }
            }
        });
        if (nodes.size() > 0) {
            return nodes.iterator().next();
        }
    } catch (IOException e) {
        log(e);
    }
    return null;
}
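Here incrementToken() drives query construction rather than indexing: each token of the analyzed gene symbol becomes one term of a PhraseQuery. Newer Lucene versions build the query through PhraseQuery.Builder (introduced in 5.3; the mutable add() was removed when PhraseQuery became immutable in 6.0). A hedged sketch of the same loop on a newer version, with the project's analyze() helper and field constant assumed unchanged and CharTermAttribute standing in for the removed TermAttribute:

PhraseQuery.Builder builder = new PhraseQuery.Builder();
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
tokens.reset();
while (tokens.incrementToken()) {
    builder.add(new Term(GeneIndexBuilder.GENE_FIELD, termAtt.toString()));
}
tokens.end();
tokens.close();
PhraseQuery query = builder.build();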
From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java
License:Open Source License
public void classify(final String symbol, final IGeneClassificationHandler handler)
        throws ApplicationException {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term()));
        }
        tokens.end();
        tokens.close();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int doc) {
                try {
                    Document document = searcher.doc(doc);
                    long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID));
                    handler.handleClassification(symbol, organismId);
                } catch (IOException e) {
                    log(e);
                }
            }
        });
    } catch (IOException e) {
        throw new ApplicationException(e);
    }
}