List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
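A minimal sketch of the canonical consumption loop around addAttribute, assuming a Lucene 4.x-style Analyzer (the field name, analyzer, and input text below are illustrative):

TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); // register before consuming
stream.reset();                             // required before the first incrementToken() in Lucene 4.x+
while (stream.incrementToken()) {
    System.out.println(termAtt.toString()); // termAtt reflects the current token
}
stream.end();   // record the final offset state
stream.close(); // release resources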
From source file:aos.lucene.search.advanced.SpanQueryTest.java
License:Apache License
private void dumpSpans(SpanQuery query) throws IOException {
    Spans spans = query.getSpans(reader);
    LOGGER.info(query + ":");
    int numSpans = 0;

    TopDocs hits = searcher.search(query, 10);
    float[] scores = new float[2]; // the test index holds two documents, so doc ids fit this array
    for (ScoreDoc sd : hits.scoreDocs) {
        scores[sd.doc] = sd.score;
    }

    while (spans.next()) {
        numSpans++;
        int id = spans.doc();
        Document doc = reader.document(id);

        // Re-analyze the stored field so token positions can be matched against the span.
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.get("f")));
        TermAttribute term = stream.addAttribute(TermAttribute.class);

        StringBuilder buffer = new StringBuilder();
        buffer.append(" ");
        int i = 0;
        while (stream.incrementToken()) {
            if (i == spans.start()) {
                buffer.append("<");
            }
            buffer.append(term.term());
            if (i + 1 == spans.end()) {
                buffer.append(">");
            }
            buffer.append(" ");
            i++;
        }
        buffer.append("(").append(scores[id]).append(") ");
        LOGGER.info(buffer);
    }

    if (numSpans == 0) {
        LOGGER.info(" No spans");
    }
    LOGGER.info(""); // blank separator line
}
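Note that TermAttribute and its term() method were deprecated in Lucene 3.1 and later removed; with a post-3.1 API the same two lines would presumably read:

CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // replaces TermAttribute
// ... and inside the loop, buffer.append(term.toString()) replaces buffer.append(term.term())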
From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}
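A hypothetical call site for this helper, reusing a single CharsRef across calls to avoid per-call allocation (the analyzer and input text are placeholders):

CharsRef scratch = new CharsRef();
CharsRef analyzed = AbstractMeSHFilter.analyze(analyzer, "myocardial infarction", scratch);
System.out.println(analyzed.toString()); // the space-joined analyzed tokens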
From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    if (phraseTerm) {
        // Multi-token result: shift everything right by one and wrap it in quotes.
        reuse.grow(reuse.length + 2);
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    // Assumes the caller has already reset() the stream where the Lucene version requires it.
    while (stream.incrementToken()) {
        System.out.println("[" + term.toString() + "] ");
    }
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }
        System.out.print("[" + term.toString() + "] ");
    }
    System.out.println();
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }
        Payload pl = payload.getPayload();
        if (pl != null) {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");
        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");
        }
    }
    System.out.println();
}
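Payload here is the pre-4.0 class; from Lucene 4.0 on, PayloadAttribute#getPayload() returns a BytesRef instead. Under that assumption, the payload branch would become something like:

BytesRef pl = payload.getPayload();
String payloadText = (pl == null) ? null
        : new String(pl.bytes, pl.offset, pl.length); // mirrors new String(pl.getData()), default charset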
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java
License:Apache License
@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);
        // Detect the tweet language so a language-specific analyzer can be chosen.
        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));
        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        reader.close();
        tokenStream.close();
        // Emit extracted names separately, skipping the analyzer's stopwords.
        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}
From source file:br.bireme.ngrams.Tools.java
public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        final String term = charTermAttribute.toString();
        System.out.println(term + " [" + startOffset + "," + endOffset + "]");
    }
}
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/**
 * Filters the string with StandardAnalyzer.
 *
 * @param str the string to process
 * @param removeStopWords indicates whether stop words should be removed
 * @return the analyzed terms, joined by single spaces
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();

    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);

        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close(); // close the stream before closing the analyzer
        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return strBuf.toString().trim();
}