List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
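A minimal sketch of the canonical consumption loop around addAttribute, assuming a Lucene 4.x-style Analyzer (the field name, analyzer, and input text below are illustrative):

TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); // register before consuming
stream.reset();                             // required before the first incrementToken() in Lucene 4.x+
while (stream.incrementToken()) {
    System.out.println(termAtt.toString()); // termAtt reflects the current token
}
stream.end();   // record the final offset state
stream.close(); // release resources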
From source file:aos.lucene.search.advanced.SpanQueryTest.java
License:Apache License
private void dumpSpans(SpanQuery query) throws IOException {
    Spans spans = query.getSpans(reader);
    LOGGER.info(query + ":");
    int numSpans = 0;

    TopDocs hits = searcher.search(query, 10);
    float[] scores = new float[2]; // the test index holds two documents, so doc ids fit this array
    for (ScoreDoc sd : hits.scoreDocs) {
        scores[sd.doc] = sd.score;
    }

    while (spans.next()) {
        numSpans++;
        int id = spans.doc();
        Document doc = reader.document(id);

        // Re-analyze the stored field so token positions can be matched against the span.
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.get("f")));
        TermAttribute term = stream.addAttribute(TermAttribute.class);

        StringBuilder buffer = new StringBuilder();
        buffer.append(" ");
        int i = 0;
        while (stream.incrementToken()) {
            if (i == spans.start()) {
                buffer.append("<");
            }
            buffer.append(term.term());
            if (i + 1 == spans.end()) {
                buffer.append(">");
            }
            buffer.append(" ");
            i++;
        }
        buffer.append("(").append(scores[id]).append(") ");
        LOGGER.info(buffer);
    }

    if (numSpans == 0) {
        LOGGER.info(" No spans");
    }
    LOGGER.info(""); // blank separator line
}
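Note that TermAttribute and its term() method were deprecated in Lucene 3.1 and later removed; with a post-3.1 API the same two lines would presumably read:

CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // replaces TermAttribute
// ... and inside the loop, buffer.append(term.toString()) replaces buffer.append(term.term())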
From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}
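A hypothetical call site for this helper, reusing a single CharsRef across calls to avoid per-call allocation (the analyzer and input text are placeholders):

CharsRef scratch = new CharsRef();
CharsRef analyzed = AbstractMeSHFilter.analyze(analyzer, "myocardial infarction", scratch);
System.out.println(analyzed.toString()); // the space-joined analyzed tokens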
From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    if (phraseTerm) {
        // Multi-token result: shift everything right by one and wrap it in quotes.
        reuse.grow(reuse.length + 2);
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    // Assumes the caller has already reset() the stream where the Lucene version requires it.
    while (stream.incrementToken()) {
        System.out.println("[" + term.toString() + "] ");
    }
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }
        System.out.print("[" + term.toString() + "] ");
    }
    System.out.println();
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }
        Payload pl = payload.getPayload();
        if (pl != null) {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");
        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");
        }
    }
    System.out.println();
}
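Payload here is the pre-4.0 class; from Lucene 4.0 on, PayloadAttribute#getPayload() returns a BytesRef instead. Under that assumption, the payload branch would become something like:

BytesRef pl = payload.getPayload();
String payloadText = (pl == null) ? null
        : new String(pl.bytes, pl.offset, pl.length); // mirrors new String(pl.getData()), default charset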
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java
License:Apache License
@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);
        // Detect the tweet language so a language-specific analyzer can be chosen.
        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));
        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        reader.close();
        tokenStream.close();
        // Emit extracted names separately, skipping the analyzer's stopwords.
        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}
From source file:br.bireme.ngrams.Tools.java
public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        final String term = charTermAttribute.toString();
        System.out.println(term + " [" + startOffset + "," + endOffset + "]");
    }
}
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/**
 * Filters the string with StandardAnalyzer.
 *
 * @param str the string to process
 * @param removeStopWords indicates whether stop words should be removed
 * @return the analyzed terms, joined by single spaces
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();

    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);

        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close(); // close the stream before closing the analyzer
        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return strBuf.toString().trim();
}