List of usage examples for the org.apache.lucene.analysis.TokenStream method addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:com.mathworks.xzheng.analysis.positional.PositionalStopFilter.java
License:Apache License
/**
 * Creates a filter that wraps {@code in} and keeps a reference to the given
 * stop-word set.
 *
 * @param in        the token stream to wrap; it is passed to the superclass
 *                  constructor and the attributes below are registered on it
 * @param stopWords the stop-word set stored on this filter
 */
public PositionalStopFilter(TokenStream in, CharArraySet stopWords) {
    super(in);

    // Obtain (registering if necessary) the attributes this filter reads;
    // both are requested from the wrapped stream.
    posIncrAttr = in.addAttribute(PositionIncrementAttribute.class);
    termAttr = in.addAttribute(CharTermAttribute.class);

    this.stopWords = stopWords;
}
From source file:com.mathworks.xzheng.analysis.synonym.SynonymAnalyzerTest.java
License:Apache License
public void testJumps() throws Exception { TokenStream stream = synonymAnalyzer.tokenStream("contents", // #A new StringReader("jumps")); // #A CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); int i = 0;/*from w ww . jav a 2 s.c o m*/ String[] expected = new String[] { "jumps", // #B "hops", // #B "leaps" }; // #B while (stream.incrementToken()) { assertEquals(expected[i], term.buffer()); int expectedPos; // #C if (i == 0) { // #C expectedPos = 1; // #C } else { // #C expectedPos = 0; // #C } // #C assertEquals(expectedPos, // #C posIncr.getPositionIncrement()); // #C i++; } assertEquals(3, i); }
From source file:com.memonews.mahout.sentiment.SentimentModelHelper.java
License:Apache License
/**
 * Tokenizes {@code in} with {@code analyzer} (field name "text"), appends the
 * text of every token to {@code words}, and then adds the whole content of
 * {@code words} to {@code overallCounts}.
 *
 * <p>Note that all elements of {@code words} are added to
 * {@code overallCounts}, including any that were already in the collection
 * before this call.
 *
 * <p>The token stream is closed before returning, which also releases the
 * underlying reader.
 *
 * @param analyzer      analyzer used to obtain the token stream
 * @param words         collection receiving one entry per token
 * @param in            source text
 * @param overallCounts multiset that receives every element of {@code words}
 * @throws IOException if reading or tokenizing the input fails
 */
private static void countWords(final Analyzer analyzer, final Collection<String> words, final Reader in,
        final Multiset<String> overallCounts) throws IOException {
    final TokenStream ts = analyzer.reusableTokenStream("text", in);
    // Look the attribute up once; the same instance is updated for each token.
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            words.add(termAtt.toString());
        }
        ts.end();
    } finally {
        ts.close();
    }
    overallCounts.addAll(words);
}
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query.//from w w w . java 2s .c om * * @param token * @return */ protected Query tokenToQuery(String token) { if (logger.isLoggable(Level.FINE)) { // logger.fine("Query tokenToQuery(String token) : token:" + token); } if (logger.isLoggable(Level.FINE)) { logger.fine("Query tokenToQuery(String token) : token:" + token); } token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { return null; } else if (v.size() == 1) { t = v.get(0); SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); stq.setBoost(this.boost); return stq; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: SpanQuery[] spanQueries = new SpanQuery[v.size()]; StringBuilder regex = new StringBuilder(); for (int i = 0; i < v.size(); i++) { spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString())); } return new SpanOrQuery(spanQueries); } else { // All the Tokens in each sub-list are positioned at the the 
same location. ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length()))); } SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/** * Converts a token, as defined in the qsol.jtb JavaCC file, into an * appropriate query.// ww w.j av a 2s . com * * @param token * @return */ protected Query tokenToQuery(String token) { token = removeEscapeChars(token); TokenStream source = analyzer.tokenStream(field, new StringReader(token)); ArrayList<Token> v = new ArrayList<Token>(); Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class); while (true) { try { if (!source.incrementToken()) { break; } t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(), offsetAtrib.endOffset()); t.setPositionIncrement(posIncAtt.getPositionIncrement()); } catch (IOException e) { t = null; } if (t == null) { break; } v.add(t); if (t.getPositionIncrement() != 0) { positionCount += t.getPositionIncrement(); } else { severalTokensAtSamePosition = true; } } try { source.close(); } catch (IOException e) { // ignore } if (v.size() == 0) { // null's will get cleaned up in visitBooleanOp return null; } else if (v.size() == 1) { t = v.get(0); TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length()))); termQuery.setBoost(this.boost); return termQuery; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = new BooleanQuery(true); for (int i = 0; i < v.size(); i++) { t = v.get(i); TermQuery currentQuery = new TermQuery( new Term(field, new String(t.buffer(), 0, t.length()))); currentQuery.setBoost(this.boost); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } else { // All the Tokens in each sub-list are positioned at the the same location. 
ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>(); for (int i = 0; i < v.size(); i++) { if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) { identicallyPositionedTokenLists.add(new ArrayList<Token>()); } ArrayList<Token> curList = identicallyPositionedTokenLists .get(identicallyPositionedTokenLists.size() - 1); curList.add(v.get(i)); } ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>(); for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) { ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum); ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>(); for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) { SpanTermQuery termQuery = new SpanTermQuery( new Term(field, curTokens.get(tokenNum).term())); termQuery.setBoost(this.boost); curTermQueries.add(termQuery); } int size = curTermQueries.size(); if (size <= 0) continue; else if (size == 1) spanNearSubclauses.add(curTermQueries.get(0)); else spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0]))); } SpanNearQuery query = new SpanNearQuery( (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true); return query; } } else { SpanTermQuery[] clauses = new SpanTermQuery[v.size()]; for (int i = 0; i < v.size(); i++) { Token t2 = v.get(i); SpanTermQuery spanQuery = new SpanTermQuery( new Term(field, new String(t2.buffer(), 0, t2.length()))); spanQuery.setBoost(boost); clauses[i] = spanQuery; } // Note: There's a bug here (not by me) that where term offsets are not respected. SpanNearQuery query = new SpanNearQuery(clauses, slop, true); return query; } } }
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString())); stream.reset();/*from w ww . ja v a 2s. co m*/ CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); StringTuple document = new StringTuple(); while (stream.incrementToken()) { if (termAtt.length() > 0) { document.add(new String(termAtt.buffer(), 0, termAtt.length())); } } stream.end(); Closeables.close(stream, true); //drop stop words document = StopWordsHandler.dropStopWords(document); context.write(key, document); }
From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
/**
 * Small manual driver: loads a stop-word dictionary from a fixed local path,
 * builds an {@code NGramEnglishAnalyzer} with it, analyzes one sample
 * sentence and prints every non-empty term on its own line.
 *
 * @param args ignored
 * @throws IOException if the dictionary cannot be loaded or analysis fails
 */
public static void main(String[] args) throws IOException {
    Path dictionaryPath = new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt");
    Set<String> stopwords = Dictionary.loadDictionary(dictionaryPath);

    NGramEnglishAnalyzer analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(
            Version.LUCENE_31, stopwords, false, true);

    StringReader sample = new StringReader("When I was growing up this was so much fun.");
    TokenStream tokens = analyzer.tokenStream("", sample);
    CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);

    while (tokens.incrementToken()) {
        if (term.length() <= 0) {
            continue;
        }
        String text = term.toString();
        System.out.println(text);
        // Clear the attribute once its content has been printed.
        term.setEmpty();
    }
}
From source file:com.mozilla.grouperfish.pig.eval.text.NGramTokenize.java
License:Apache License
/**
 * Tokenizes the text in field 0 of {@code input} and returns one
 * single-field tuple per non-empty term.
 *
 * <p>On the first call the analyzer is created from the optional arguments:
 * <ol>
 *   <li>field 1 - passed to {@code loadDictionary} (presumably a stop-word
 *       dictionary location; confirm against that method)</li>
 *   <li>field 2 - stem flag, parsed as boolean (default {@code false})</li>
 *   <li>field 3 - output-unigrams flag, parsed as boolean (default
 *       {@code false})</li>
 *   <li>field 4 - minimum n-gram size (default 2)</li>
 *   <li>field 5 - maximum n-gram size (default 3)</li>
 * </ol>
 * A seventh field (index 6) may be supplied but is not used by this
 * function. When {@code stopwords} is null or empty,
 * {@code StandardAnalyzer.STOP_WORDS_SET} is used instead.
 *
 * @param input the UDF arguments; field 0 is the text to tokenize
 * @return a bag of term tuples, or {@code null} when {@code input} is null or
 *         empty or the text field is null
 * @throws IOException if reading the tuple or tokenizing fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }

        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    stopwords, stem, outputUnigrams, minNGram, maxNGram);
        } else {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram);
        }
    }

    // A null text field would otherwise fail inside StringReader.
    String text = (String) input.get(0);
    if (text == null) {
        return null;
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader(text));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        // Consumer workflow required by TokenStream: reset, iterate, end, close.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                Tuple t = tupleFactory.newTuple(termAttr.toString());
                output.add(t);
                termAttr.setEmpty();
            }
        }
        stream.end();
    } finally {
        stream.close();
    }

    return output;
}
From source file:com.mozilla.grouperfish.pig.eval.text.Tokenize.java
License:Apache License
/**
 * Tokenizes the text in field 0 of {@code input} and returns one
 * single-field tuple per non-empty term.
 *
 * <p>On the first call the analyzer is created from the optional arguments:
 * <ol>
 *   <li>field 1 - passed to {@code loadDictionary} (presumably a stop-word
 *       dictionary location; confirm against that method)</li>
 *   <li>field 2 - stem flag, parsed as boolean (default {@code false})</li>
 *   <li>field 3 - language code (default {@code "en"})</li>
 * </ol>
 * Language codes starting with "zh" or "ja" select the CJK analyzer, "de"
 * the German analyzer, "es" the Spanish analyzer; anything else selects the
 * English analyzer, built with {@code stopwords} when that set is non-empty.
 *
 * @param input the UDF arguments; field 0 is the text to tokenize
 * @return a bag of term tuples, or {@code null} when {@code input} is null or
 *         empty or the text field is null
 * @throws IOException if reading the tuple or tokenizing fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        if (input.size() > 3) {
            langCode = (String) input.get(3);
        }

        if (langCode.startsWith("zh") || langCode.startsWith("ja")) {
            analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("de")) {
            analyzer = new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("es")) {
            analyzer = new org.apache.lucene.analysis.es.SpanishAnalyzer(Version.LUCENE_31);
        } else {
            if (stopwords != null && stopwords.size() > 0) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stopwords, stem);
            } else {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stem);
            }
        }
    }

    // A null text field would otherwise fail inside StringReader.
    String text = (String) input.get(0);
    if (text == null) {
        return null;
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader(text));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        // Consumer workflow required by TokenStream: reset, iterate, end, close.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                Tuple t = tupleFactory.newTuple(termAttr.toString());
                output.add(t);
                termAttr.setEmpty();
            }
        }
        stream.end();
    } finally {
        stream.close();
    }

    return output;
}
From source file:com.mozilla.grouperfish.transforms.coclustering.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { // TODO: SMELLY: de-system-ify Set<String> stopwords = Dictionary .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt")); NGramEnglishAnalyzer analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, false, true); TokenStream stream = analyzer.tokenStream("", new StringReader("When I was growing up this was so much fun.")); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { if (termAttr.length() > 0) { System.out.println(termAttr.toString()); termAttr.setEmpty();//from ww w.ja v a 2s. c o m } } }