Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. The method first checks whether an instance of that attribute class is already present in the stream's AttributeSource and, if so, returns the existing instance; otherwise a new instance is created, registered with the stream, and returned.
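
Before the project examples below, here is a minimal, self-contained sketch of the usual call pattern, assuming a recent Lucene version; the field name "body" and the sample text are illustrative placeholders. Note that addAttribute is called before reset(), and the returned attribute instance is updated in place on every incrementToken():

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello token streams")) {
            // Returns the existing CharTermAttribute if one was already added,
            // otherwise creates it and registers it with the stream.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // required before incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                        // records end-of-stream state
        }
        analyzer.close();
    }
}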

Usage

From source file:aos.lucene.search.advanced.SpanQueryTest.java

License:Apache License

private void dumpSpans(SpanQuery query) throws IOException {
      Spans spans = query.getSpans(reader);
      LOGGER.info(query + ":");
      int numSpans = 0;

      TopDocs hits = searcher.search(query, 10);
      float[] scores = new float[2];
      for (ScoreDoc sd : hits.scoreDocs) {
          scores[sd.doc] = sd.score;
      }

      while (spans.next()) {
          numSpans++;

          int id = spans.doc();
          Document doc = reader.document(id);

          TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.get("f")));
          TermAttribute term = stream.addAttribute(TermAttribute.class);

          StringBuilder buffer = new StringBuilder();
          buffer.append("   ");
          int i = 0;
          while (stream.incrementToken()) {
              if (i == spans.start()) {
                  buffer.append("<");
              }
              buffer.append(term.term());
              if (i + 1 == spans.end()) {
                  buffer.append(">");
              }
              buffer.append(" ");
              i++;
          }
          buffer.append("(").append(scores[id]).append(") ");
          LOGGER.info(buffer);
      }

      if (numSpans == 0) {
          LOGGER.info("   No spans");
      }
      LOGGER.info("");
  }
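
Note that TermAttribute, used in the example above, was deprecated in Lucene 3.1 and removed in 4.0; in current Lucene versions CharTermAttribute is the replacement, as the remaining examples on this page show.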

From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java

License:Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}

From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java

License:Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }

    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* make room for the two quote characters */
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}

From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License:Apache License

public static void displayTokens(TokenStream stream) throws IOException {

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("[" + term.toString() + "] ");
    }

}

From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        System.out.print("[" + term.toString() + "] ");

    }
    System.out.println();

}

From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        Payload pl = payload.getPayload();

        if (pl != null) {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");

        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");

        }

    }
    System.out.println();
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java

License:Apache License

@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);

        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));

        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        reader.close();
        tokenStream.close();

        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}
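
One detail worth noting: the TokenStream contract expects end() to be called after the final incrementToken() and before close(), as the analyze() examples above do. This example closes the stream without calling end(), so consumers relying on final offset state would not see it.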

From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java

License:Apache License

public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:br.bireme.ngrams.Tools.java

public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        final String term = charTermAttribute.toString();

        System.out.println(term + " [" + startOffset + "," + endOffset + "]");
    }
}
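
This example passes the text to tokenStream() directly as a String rather than wrapping it in a StringReader; newer Lucene versions provide this convenience overload, which handles the reader internally.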

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * Filter the string with StandardAnalyzer.
 * @param str               The string to be analyzed.
 * @param removeStopWords   Indicates whether stop words should be removed.
 * @return The analyzed string, with terms separated by single spaces.
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();

    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);

        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }

        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return strBuf.toString().trim();
}