List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
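All of the examples below follow the same consumer contract: obtain a TokenStream, register the attributes you want to read, call reset(), loop on incrementToken() until it returns false, then call end() and close(). The following minimal sketch illustrates that contract; it assumes a recent Lucene (4.8+, where StandardAnalyzer needs no Version argument), and the field name "body" and the sample text are arbitrary.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try {
            TokenStream ts = analyzer.tokenStream("body", new StringReader("The quick brown fox"));
            // Register the attribute before consuming; the same instance is
            // updated in place on every call to incrementToken().
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {    // returns false at end of stream
                System.out.println(termAtt.toString());
            }
            ts.end();                        // records end-of-stream state (e.g. final offset)
            ts.close();                      // releases resources
        } finally {
            analyzer.close();
        }
    }
}

Note that some of the older snippets below (Lucene 2.x/3.x-era code using TermAttribute) predate the strict reset()/end()/close() contract that later versions enforce, which is why they omit some of those calls.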
From source file: me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java
License: Apache License

public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = termAtt.toString();
            // Skip single-character tokens.
            if (word.length() <= 1) {
                continue;
            }
            words.add(word);
        }
        tokenStream.end();
        tokenStream.close();
        return words;
    } finally {
        analyzer.close();
    }
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License

public void tokenise() throws IOException {
    // Delete full stops and dashes (typically not used).
    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}";
    if (ignoredElements != null && ignoredElements.length() > 0)
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    if (!tagIndexing)
        ignregexp = ignregexp + "|<.*?>";
    Pattern p = Pattern.compile(ignregexp);
    Matcher igns = p.matcher(originalText);
    StringBuffer tx = new StringBuffer(originalText);
    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose)
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        // Blank out the excluded span with spaces so that offsets into the
        // original text remain valid.
        char[] sp = new char[e - s];
        for (int j = 0; j < sp.length; j++) {
            sp[j] = ' ';
        }
        tx.replace(s, e, new String(sp));
    }
    if (verbose)
        PrintUtil.donePrinting();
    String text = new String(tx);
    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    //stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    //stream = new LowerCaseFilter(matchVersion, stream);
    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        tokenMap.putPos(charTermAttribute.toString(), offsetAttribute.startOffset());
    }
    stream.end();
    stream.close();
    if (verbose)
        PrintUtil.donePrinting();
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License

public List<String> split(String s) {
    ArrayList<String> ret = new ArrayList<String>();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        // Same filter chain as tokenise() above.
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(charTermAttribute.toString());
        }
        stream.end();
        stream.close();
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License

public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        // Same filter chain as tokenise() above.
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);
        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(offsetAttribute.startOffset(), offsetAttribute.endOffset());
        }
        stream.end();
        stream.close();
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}
From source file: mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java
License: Apache License

@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            set.add(termAtt.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // Not actually thrown, because we are reading from a StringReader.
        throw new RuntimeException(e);
    }
    return set;
}
From source file: net.mad.ads.server.utils.http.KeywordUtils.java
License: Open Source License

public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);
        TokenStream ts = a.tokenStream("", new StringReader(queryString));
        List<String> tokens = new ArrayList<String>();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}
From source file: net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License: Open Source License

private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as a term.
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
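Note the position bookkeeping in this snippet: because pos advances by getPositionIncrement() rather than by one, gaps left by the analyzer (for example, removed stopwords) are preserved, and pos - 1 converts the running 1-based count into the 0-based position that PhraseQuery.add(Term, int) expects.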
From source file: net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java
License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public Term createTerm(String field, Object value) {
    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    String normalizedValue;
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    boolean isString = (value instanceof String);
    try {
        switch (fieldType) {
        case TEXT:
            try {
                TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader((String) value));
                TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
                // Normalize to the first token produced by the analyzer.
                if (tokenStream.incrementToken()) {
                    normalizedValue = termAttribute.term();
                } else {
                    normalizedValue = "";
                }
            } catch (IOException e) {
                throw new RuntimeIoException(e, IoMode.READ);
            }
            break;
        case STRING:
            normalizedValue = (String) value;
            break;
        case INTEGER:
            int i;
            if (isString) {
                i = Integer.parseInt((String) value);
            } else {
                i = ((Integer) value).intValue();
            }
            normalizedValue = NumericUtils.intToPrefixCoded(i);
            break;
        case LONG:
            long l;
            if (isString) {
                l = Long.parseLong((String) value);
            } else {
                l = ((Long) value).longValue();
            }
            normalizedValue = NumericUtils.longToPrefixCoded(l);
            break;
        case FLOAT:
            float f;
            if (isString) {
                f = Float.parseFloat((String) value);
            } else {
                f = ((Float) value).floatValue();
            }
            normalizedValue = NumericUtils.floatToPrefixCoded(f);
            break;
        case DOUBLE:
            double d;
            if (isString) {
                d = Double.parseDouble((String) value);
            } else {
                d = ((Double) value).doubleValue();
            }
            normalizedValue = NumericUtils.doubleToPrefixCoded(d);
            break;
        case DATE:
            Date date;
            if (isString) {
                date = this.iso8601Util.parseDate((String) value);
            } else {
                date = (Date) value;
            }
            normalizedValue = NumericUtils.longToPrefixCoded(date.getTime());
            break;
        default:
            throw new IllegalCaseException(SearchFieldType.class, fieldType);
        }
    } catch (ClassCastException e) {
        throw new NlsClassCastException(e, value, fieldType.getFieldClass());
    }
    return new Term(field, normalizedValue);
}
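In the TEXT case above, only the first analyzed token is used as the normalized term. For the numeric and date cases, the NumericUtils.*ToPrefixCoded(...) calls (a Lucene 2.9/3.x API) encode the value the same way Lucene's numeric fields index it at full precision, so the resulting Term can match the indexed token exactly.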
From source file: net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java
License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public Query createPhraseQuery(String field, String value) {
    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    Query result;
    if (fieldType == SearchFieldType.TEXT) {
        PhraseQuery phraseQuery = new PhraseQuery();
        result = phraseQuery;
        try {
            TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader(value));
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            while (tokenStream.incrementToken()) {
                phraseQuery.add(new Term(field, termAttribute.term()));
            }
        } catch (IOException e) {
            throw new RuntimeIoException(e, IoMode.READ);
        }
    } else {
        result = new TermQuery(createTerm(field, value));
    }
    return result;
}
From source file: net.sf.okapi.lib.tmdb.lucene.Seeker.java
License: Open Source License

public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    // Clamp the threshold to the range [0, 100].
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;
    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // Create a basic n-gram analyzer to tokenize the query.
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage().equals(Locale.ENGLISH.getLanguage())) {
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the CharTermAttribute from the TokenStream.
    CharTermAttribute termAtt = queryTokenStream.addAttribute(CharTermAttribute.class);

    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);
    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            fQuery.add(new Term(gtextFName, termAtt.toString()));
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }

    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}