List of usage examples for org.apache.lucene.analysis.TokenStream.getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the stream's instance of that attribute; the same instance is updated in place on every call to incrementToken().
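As a minimal, self-contained sketch of the consumer pattern shared by the examples below (assuming Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor and CharTermAttribute has replaced the older TermAttribute that some of the snippets still use):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("contents", "The quick brown fox")) {
            // getAttribute() returns the stream's single CharTermAttribute
            // instance; incrementToken() refills that same instance, so it
            // must be read (or copied) before the next call.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            stream.reset();                 // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                   // records the final offset state
        }
    }
}

Note that addAttribute(Class) is generally safer in consumers, since it creates the attribute when the stream does not already provide it, whereas getAttribute(Class) assumes it is present; the OmegaT examples below call addAttribute before getAttribute for exactly this reason.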
From source file: org.karsha.base.DocIndexer.java
License: Open Source License
/**
 * Handles lemmatization of the given text using EnglishLemmaAnalyzer.
 *
 * @param text the text to lemmatize
 * @param tagger a Stanford parser MaxentTagger object
 * @return the lemmatized text
 * @throws IOException
 * @throws ClassNotFoundException
 */
public String analyze(String text, MaxentTagger tagger) throws IOException, ClassNotFoundException {
    Analyzer analyzer = new EnglishLemmaAnalyzer(tagger);
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    StringBuilder lemmatized = new StringBuilder();
    // One incrementToken() per iteration: the original called it twice per
    // loop, which silently dropped every other token, and concatenated onto
    // a null String, which prefixed the result with "null".
    while (stream.incrementToken()) {
        if (lemmatized.length() > 0) {
            lemmatized.append(' ');
        }
        lemmatized.append(termAttribute.term());
    }
    stream.close();
    return lemmatized.toString();
}
From source file: org.karsha.tokenize.DefaultTokenizer.java
License: Open Source License
public String processText(String text) {
    StringBuilder str = new StringBuilder();
    TokenStream stream = tokenStream(new StringReader(text));
    try {
        while (stream.incrementToken()) {
            str.append(stream.getAttribute(TermAttribute.class).term());
            str.append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return str.toString();
}
From source file: org.karsha.tokenize.SimpleTokenizer.java
License: Open Source License
public String processText(String text) {
    StringBuilder str = new StringBuilder();
    TokenStream stream = tokenStream(new StringReader(text));
    try {
        while (stream.incrementToken()) {
            str.append(stream.getAttribute(TermAttribute.class).term());
            str.append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return str.toString();
}
From source file: org.meresco.lucene.suggestion.SuggestionIndex.java
License: Open Source License
public List<String> shingles(String s) throws IOException {
    List<String> shingles = new ArrayList<String>();
    TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        shingles.add(termAttribute.toString());
    }
    stream.close();
    return shingles;
}
From source file: org.meresco.lucene.suggestion.SuggestionNGramIndex.java
License: Open Source License
public static List<String> ngrams(String s, Boolean trigram) throws IOException {
    List<String> ngram = new ArrayList<String>();
    Analyzer ngramAnalyzer = trigram ? TRIGRAM_ANALYZER : BIGRAM_ANALYZER;
    TokenStream stream = ngramAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        ngram.add(termAttribute.toString());
    }
    stream.close();
    return ngram;
}
From source file: org.neo4j.index.lucene.LuceneFulltextIndexService.java
License: Open Source License
@Override
protected Query formQuery(String key, Object value, Object matching) {
    if (matching == MatchingType.EXACT) {
        return new TermQuery(new Term(DOC_INDEX_SOURCE_KEY, value.toString()));
    }
    TokenStream stream = LuceneFulltextDataSource.LOWER_CASE_WHITESPACE_ANALYZER.tokenStream(DOC_INDEX_KEY,
            new StringReader(value.toString().toLowerCase()));
    BooleanQuery booleanQuery = new BooleanQuery();
    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            booleanQuery.add(new TermQuery(new Term(DOC_INDEX_KEY, term)), Occur.MUST);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return booleanQuery;
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
From source file: org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
License: Apache License
/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();
            // try to increment again; there should be only one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // Nothing returned by the analyzer, most likely because the chunk
            // was a stop word and the analyzer removes stop words; ignore the
            // chunk instead of failing the parse.
            stream.end();
            return null;
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
From source file: org.sc.probro.lucene.BiothesaurusSearcher.java
License: Apache License
public Query createQuery(String phrase) {
    phrase = phrase.trim();
    TermQuery idQuery = new TermQuery(new Term("protein-id", phrase.toUpperCase()));
    TermQuery accQuery = new TermQuery(new Term("accession", phrase.toUpperCase()));
    ArrayList<TermQuery> descQueries = new ArrayList<TermQuery>();
    ArrayList<Term> descTerms = new ArrayList<Term>();
    TokenStream stream = analyzer.tokenStream("description", new StringReader(phrase));
    TermAttribute attr = stream.getAttribute(TermAttribute.class);
    try {
        stream.reset();
        Term lastTerm = null;
        while (stream.incrementToken()) {
            Term t = new Term("description", attr.term());
            descQueries.add(new TermQuery(t));
            descTerms.add(t);
            if (lastTerm != null) {
                // also query the hyphenated form of each adjacent pair of terms
                Term hyph = new Term("description", lastTerm.text() + "-" + t.text());
                descQueries.add(new TermQuery(hyph));
            }
            lastTerm = t;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return createDisjunction(2.0f, createMust(idQuery), createMust(accQuery),
            createDescendingQuery(descTerms.toArray(new Term[0])));
}