Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
From source file:de.ingrid.interfaces.csw.tools.LuceneTools.java
License:EUPL
/** * @param term//from w w w . j a v a2 s.co m * @return filtered term * @throws IOException */ public String filterTerm(String term) throws IOException { String result = ""; // always use same analyzer, NOT new instance ! Is called in mapping process ! Analyzer myAnalyzer = getAnalyzer(); TokenStream ts = myAnalyzer.tokenStream(null, new StringReader(term)); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { String t = charTermAttribute.toString(); result = result + " " + t; } return result.trim(); }
From source file:de.jetwick.es.JetwickQuery.java
License:Apache License
/**
 * Applies English Snowball stemming to every token of the given stream.
 * The stream is fully consumed and closed by this method.
 *
 * @param ts the token stream to stem (will be closed)
 * @return the distinct stemmed terms in encounter order
 */
public Set<String> doSnowballStemming(TokenStream ts) {
    Set<String> res = new LinkedHashSet<String>();
    ts = new SnowballFilter(ts, "English");
    try {
        // Hoisted out of the loop: the attribute instance is stable across tokens.
        TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            res.add(termAtt.term());
        }
    } catch (IOException ex) {
        logger.error("Exception while stemming to snowball", ex);
    } finally {
        try {
            // FIX: the original never closed the stream (resource leak).
            ts.close();
        } catch (IOException ex) {
            logger.error("Exception while closing token stream", ex);
        }
    }
    return res;
}
From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java
License:Open Source License
/**
 * Feeds the given string through the analyzer for the named field and
 * returns the first emitted token, or {@code null} when the analyzer
 * produces no tokens at all.
 */
private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream stream = analyzer.tokenStream(aFieldName, aString);
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    try {
        return stream.incrementToken() ? termAttr.toString() : null;
    } finally {
        stream.end();
        stream.close();
    }
}
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
/**
 * Adds one MUST clause per analyzed token of the given term to the boolean
 * query: a WildcardQuery when the raw input contains wildcards, a plain
 * TermQuery otherwise.
 *
 * @param aTerm     the raw user input term
 * @param aQuery    the query the clauses are appended to (Occur.MUST)
 * @param aField    the index field to query
 * @param aAnalyzer the analyzer used to tokenize the term
 * @throws IOException if tokenization fails
 */
protected void addWildcardOrTermQueries(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {
    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    // Hoisted out of the loop: the attribute instance is reused across tokens.
    TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);
    // Invariant per call: decided once from the raw input, not per token.
    boolean theIsWildcard = isWildcardTerm(aTerm);
    try {
        theTokenStream.reset();
        while (theTokenStream.incrementToken()) {
            Query theTempQuery;
            if (theIsWildcard) {
                // NOTE(review): one identical wildcard clause is added per token,
                // so multi-token input yields duplicate clauses — preserved as-is,
                // but worth confirming against the original intent.
                theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
            } else {
                theTempQuery = new TermQuery(new Term(aField, theTermAttribute.term()));
            }
            aQuery.add(theTempQuery, Occur.MUST);
        }
        theTokenStream.end();
    } finally {
        // FIX: the original never closed the stream (resource leak).
        theTokenStream.close();
    }
}
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer) throws IOException { MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery(); TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm)); while (theTokenStream.incrementToken()) { TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class); String theTokenText = theTermAttribute.term(); Term theTerm = new Term(aField, theTokenText); if (!isWildcardTerm(theTokenText)) { thePhraseQuery.add(theTerm); } else {//from w ww . ja v a2 s. com Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text())); WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm); try { List<Term> theTerms = new ArrayList<Term>(); do { theTerms.add(theEnum.term()); } while (theEnum.next()); thePhraseQuery.add(theTerms.toArray(new Term[0])); } finally { theEnum.close(); } } } aQuery.add(thePhraseQuery, Occur.MUST); }
From source file:de.twitterlivesearch.analysis.Tokenizer.java
License:Apache License
/**
 * Tokenizes the given string with the supplied analyzer.
 *
 * @param stringToAnalyze the string to be tokenized
 * @param analyzer        the {@link org.apache.lucene.analysis.Analyzer Analyzer}
 *                        to be used for analysis
 * @return the list of tokens, in stream order
 * @throws RuntimeException wrapping any IOException from the token stream
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        try {
            stream.reset();
            // Hoisted out of the loop: the attribute instance is stable across tokens.
            CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                tokens.add(termAtt.toString());
            }
            stream.end();
        } finally {
            // FIX: close in finally so the stream is released even when
            // incrementToken() throws (the original leaked it on failure).
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}
From source file:de.unidue.inf.is.ezdl.dlcore.data.extractor.TermExtractor.java
License:Open Source License
/** * Split the information cause in sense of term it is a standalone word. * TODO this method removes stopwords but don't detect any phrases. * /* w w w . ja v a 2 s . c om*/ * @param result * the list we will append the items * @param item * the item itself. */ private void add(ExtractionResultImpl result, String item) { if (item != null) { inferLanguage(item); List<String> terms = new ArrayList<String>(); TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(item)); // OffsetAttribute offsetAttribute = // tokenStream.getAttribute(OffsetAttribute.class); TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); try { while (tokenStream.incrementToken()) { // int startOffset = offsetAttribute.startOffset(); // int endOffset = offsetAttribute.endOffset(); String term = termAttribute.term(); terms.add(term); } } catch (IOException e) { logger.error(e.getMessage(), e); } terms = filter.filter(terms, locale); for (String t : terms) { if (!StringUtils.isEmpty((t))) { Entry e = new EntryImpl(t.toLowerCase(locale)); result.add(e); } } } }
From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java
License:Apache License
public static String tokenizeString(Analyzer analyzer, String string) { // Inspired by stackoverflow: // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string StringBuilder builder = new StringBuilder(); try {/* ww w. j a v a2 s .com*/ TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { builder.append(stream.getAttribute(CharTermAttribute.class).toString()); builder.append(" "); } stream.close(); } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return builder.toString().trim(); }
From source file:dependencies.ReviewDependencyAnalyzer.java
License:Open Source License
/**
 * Reads all text from the reader through the analyzer and groups the
 * resulting tokens into sentences.
 *
 * @param reader the character source to analyze
 * @return the list of sentences, each a list of tokens; {@code null} when
 *         reading from the reader fails
 */
public ArrayList<ArrayList<Token>> getSentences(Reader reader) {
    try {
        // Send reader data through the analyzer.
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);
        // Split the tokenstream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens.
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();
        while (tokstr.incrementToken()) {
            // Rebuild a project Token from the stream's term/type/flags/payload attributes.
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);
            // End of sentence reached. Add current sentence to the sentence list.
            // NOTE(review): isDelim(true) presumably means "strict sentence delimiter" —
            // confirm against the Token class.
            if (current_token.isDelim(true)) {
                // Single-token "sentences" (the delimiter alone) are dropped.
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }
        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list. This case could occur when the last sentence of a given passage does
        // not end with a period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }
        return sentence_list;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}
From source file:di.uniba.it.tri.occ.BuildOccurrence.java
License:Open Source License
private List<String> getTokens(Reader reader) throws IOException { List<String> tokens = new ArrayList<>(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); TokenStream tokenStream = analyzer.tokenStream("text", reader); tokenStream.reset();/*from ww w . ja va 2s . co m*/ CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { String token = cattr.toString(); String[] split = token.split("'"); if (split.length == 1) { tokens.add(token); } else { int max = 0; int index = 0; for (int i = 0; i < split.length; i++) { if (split[i].length() > max) { max = split[i].length(); index = i; } } tokens.add(split[index]); } } tokenStream.end(); return tokens; }