Usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
From source file:sh.isaac.provider.query.lucene.LuceneIndexer.java
License:Apache License
/** * Builds the prefix query./*ww w.java 2 s.c o m*/ * * @param searchString the search string * @param field the field * @param analyzer the analyzer * @return the query * @throws IOException Signals that an I/O exception has occurred. */ protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException { final TokenStream tokenStream; final List<String> terms; try (StringReader textReader = new StringReader(searchString)) { tokenStream = analyzer.tokenStream(field, textReader); tokenStream.reset(); terms = new ArrayList<>(); final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { terms.add(charTermAttribute.toString()); } } tokenStream.close(); analyzer.close(); final BooleanQuery.Builder bq = new BooleanQuery.Builder(); if ((terms.size() > 0) && !searchString.endsWith(" ")) { final String last = terms.remove(terms.size() - 1); bq.add(new PrefixQuery((new Term(field, last))), Occur.MUST); } terms.stream().forEach((s) -> { bq.add(new TermQuery(new Term(field, s)), Occur.MUST); }); return bq.build(); }
From source file:snu.controladores.indexador.Parser.java
/** * Realiza a tokenizao de uma string (Pega as palavras com split e extrai * seu radical)/*from ww w . j av a 2 s.c o m*/ * * @param analyzer * @param string * @return * @throws IOException */ private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException { List<String> result = new ArrayList<>(); TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } return result; }
From source file:snu.controladores.indexador.ProcessadorDeConsultas.java
/**
 * Tokenizes a string: splits it into words and reduces each to the form
 * produced by the given analyzer's filter chain (e.g. its stem).
 *
 * @param analyzer the analyzer used to split and normalize the text
 * @param string the text to tokenize
 * @return the list of token texts, in stream order
 * @throws IOException if tokenization fails
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    // Close the stream deterministically (the original leaked it) and honor
    // the TokenStream contract: reset() before, end() after, iteration.
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    }
    return result;
}
From source file:stackoverflow.lucene.modified.MoreLikeThis.java
License:Apache License
/** * Adds term frequencies found by tokenizing text from reader into the Map words * * @param r a source of text to be tokenized * @param termFreqMap a Map of terms and their frequencies * @param fieldName Used by analyzer for any special per-field analysis */// w w w .j ava 2 s .c om private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException { if (analyzer == null) { throw new UnsupportedOperationException( "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer"); } TokenStream ts = analyzer.tokenStream(fieldName, r); int tokenCount = 0; // for every token CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String word = termAtt.toString(); tokenCount++; if (tokenCount > maxNumTokensParsed) { break; } if (isNoiseWord(word)) { continue; } // increment frequency Int cnt = termFreqMap.get(word); if (cnt == null) { termFreqMap.put(word, new Int()); } else { cnt.x++; } } ts.end(); ts.close(); }
From source file:StopWords.StopWords.java
public String removeStopwords(String input) { TokenStream tokenStream = new ClassicTokenizer(Version.LUCENE_35, new StringReader(input)); // remove stop words tokenStream = new StopFilter(Version.LUCENE_35, tokenStream, EnglishAnalyzer.getDefaultStopSet()); // retrieve the remaining tokens Set<String> tokens = new HashSet<String>(); CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); String str = ""; try {//from w w w. j a va 2 s. c o m tokenStream.reset(); } catch (IOException ex) { Logger.getLogger(StopWords.class.getName()).log(Level.SEVERE, null, ex); } try { while (tokenStream.incrementToken()) { tokens.add(token.toString()); str += token.toString() + " "; //System.out.println(token.toString()); } } catch (IOException e) { // log } return str; }
From source file:stroom.search.server.TestStandardAnalyser.java
License:Apache License
private void testAnalyser(final String input, final Analyzer analyzer) throws Exception { System.out.println("Testing analyser: " + analyzer.getClass().getName()); final ReusableStringReader reader = new ReusableStringReader(); reader.init(input);//from ww w. ja v a 2s .co m final TokenStream stream = analyzer.tokenStream("Test", reader); // reset the TokenStream to the first token stream.reset(); boolean hasMoreTokens = stream.incrementToken(); final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); for (;;) { if (!hasMoreTokens) break; // Get the text of this term. final char[] tokenText = termAtt.buffer(); final int tokenTextLen = termAtt.length(); System.out.println(new String(tokenText, 0, tokenTextLen)); hasMoreTokens = stream.incrementToken(); } }
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
/**
 * Reduces a single term to its Porter stem.
 *
 * @param term the word to stem; expected to produce exactly one token
 * @return the stem, or {@code null} when the input yields anything other than
 *         one token, or the stem contains characters outside {@code [\w-]}
 * @throws IOException if tokenization fails
 */
public static String stemmize(String term) throws IOException {
    Set<String> stems = new HashSet<String>();
    // ClassicTokenizer splits the input; PorterStemFilter stems each token.
    // try-with-resources closes the stream (the original leaked it and never
    // called end()).
    try (TokenStream tokenStream = new PorterStemFilter(
            new ClassicTokenizer(LUCENE_VERSION, new StringReader(term)))) {
        CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            stems.add(token.toString());
        }
        tokenStream.end();
    }
    // Zero or multiple distinct tokens means this was not a single stemmable term.
    if (stems.size() != 1) {
        return null;
    }
    String stem = stems.iterator().next();
    // Reject stems with unexpected characters (only word chars and '-' allowed).
    if (!stem.matches("[\\w-]+")) {
        return null;
    }
    return stem;
}
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
public static List<Keyword> guessFromString(String input) throws IOException { input = input.replaceAll("-+", "-0"); input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " "); input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", ""); TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input)); tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream); tokenStream = new ClassicFilter(tokenStream); tokenStream = new ASCIIFoldingFilter(tokenStream); tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet()); List<Keyword> keywords = new LinkedList<Keyword>(); CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String term = token.toString(); String stem = stemmize(term); if (stem != null) { Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-"))); keyword.add(term.replaceAll("-0", "-")); }// ww w . j av a 2 s .com } Collections.sort(keywords); return keywords; }
From source file:test.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException { stream.reset(); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { System.out.print("[" + term + "] "); //B }//from w w w. j a va 2 s . c o m stream.close(); }
From source file:test.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); stream.reset(); int position = 0; while (stream.incrementToken()) { int increment = posIncr.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ": "); }//from www. ja v a2 s. c o m System.out.print("[" + term + "] "); } System.out.println(); stream.close(); }