Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#incrementToken, drawn from open-source projects.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.

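For reference, the canonical consumer workflow around incrementToken is: obtain the stream, register attributes, reset(), loop while incrementToken() returns true, then end() and close(). The sketch below is a minimal, self-contained illustration assuming a recent Lucene release (5.x or later, where StandardAnalyzer has a no-argument constructor and tokenStream accepts a String); the field name "contents" and the sample text are placeholders:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // "contents" is an arbitrary field name; most analyzers ignore it
        try (TokenStream stream = analyzer.tokenStream("contents", "The quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println("[" + term.toString() + "]");
            }
            stream.end();                     // records final token state (e.g. end offset)
        }                                     // try-with-resources closes the stream
        analyzer.close();
    }
}

The examples below all follow some variant of this loop; the older ones (Lucene 2.9/3.x) use the since-removed TermAttribute and sometimes omit the reset()/end()/close() calls that later Lucene versions enforce.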
Usage

From source file: aos.lucene.analysis.i18n.ChineseDemo.java

License: Apache License

private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    TermAttribute term = stream.addAttribute(TermAttribute.class);

    stream.reset(); // the consumer must reset the stream before the first incrementToken()
    while (stream.incrementToken()) { // advance token by token
        buffer.append("[");
        buffer.append(term.term());
        buffer.append("] ");
    }
    stream.close();

    String output = buffer.toString();

    Frame f = new Frame();
    f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
    f.setResizable(true);

    Font font = new Font(null, Font.PLAIN, 36);
    int width = getWidth(f.getFontMetrics(font), output);

    f.setSize((width < 250) ? 250 : width + 50, 75);

    // NOTE: if Label doesn't render the Chinese characters
    // properly, try using javax.swing.JLabel instead
    Label label = new Label(output); // render the bracketed terms in an AWT Label
    label.setSize(width, 75);
    label.setAlignment(Label.CENTER);
    label.setFont(font);
    f.add(label);

    f.setVisible(true);
}

From source file: aos.lucene.search.advanced.SpanQueryTest.java

License: Apache License

private void dumpSpans(SpanQuery query) throws IOException {
      Spans spans = query.getSpans(reader);
      LOGGER.info(query + ":");
      int numSpans = 0;

      TopDocs hits = searcher.search(query, 10);
      float[] scores = new float[2]; // the test index holds only two documents, so doc IDs 0 and 1 suffice
      for (ScoreDoc sd : hits.scoreDocs) {
          scores[sd.doc] = sd.score;
      }

      while (spans.next()) {
          numSpans++;

          int id = spans.doc();
          Document doc = reader.document(id);

          TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.get("f")));
          TermAttribute term = stream.addAttribute(TermAttribute.class);
          stream.reset(); // reset before the first incrementToken()

          StringBuilder buffer = new StringBuilder();
          buffer.append("   ");
          int i = 0;
          while (stream.incrementToken()) {
              if (i == spans.start()) {
                  buffer.append("<");
              }
              buffer.append(term.term());
              if (i + 1 == spans.end()) {
                  buffer.append(">");
              }
              buffer.append(" ");
              i++;
          }
          stream.close();
          buffer.append("(").append(scores[id]).append(") ");
          LOGGER.info(buffer);
      }

      if (numSpans == 0) {
          LOGGER.info("   No spans");
      }
      LOGGER.info("");
  }

From source file: at.ac.tuwien.ifs.myluceneanalyzers.fa.algorithm.PersianDictionaryCountCompoundWord.java

@SuppressWarnings({ "resource", "deprecation" })
private String stem(String input) throws IOException {
    String output = "";
    Reader reader = new StringReader(input);
    Tokenizer source = new StandardTokenizer(Version.LUCENE_4_10_3, reader);
    TokenStream tokenStream = new PersianStemFilter(source);

    CharTermAttribute charTermAttributeGreedy = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        output = output + " " + charTermAttributeGreedy.toString();
    }
    return output.trim();
}

From source file: at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java

License: Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}

From source file: at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java

License: Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }

    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* current + word + separator */
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}

From source file: at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

public static void displayTokens(TokenStream stream) throws IOException {

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("[" + term.toString() + "] ");
    }

}

From source file: at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        System.out.print("[" + term.toString() + "] ");

    }
    System.out.println();

}

From source file: at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        Payload pl = payload.getPayload();

        if (pl != null) {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");

        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");

        }

    }
    System.out.println();
}

From source file: at.itbh.bev.apibeans.FinderImpl.java

License: Open Source License

public FullTextQuery constructQuery(EntityManager em, String postalCode, String place, String addressLine,
        String houseId) throws InvalidApiUsageException {
    FullTextEntityManager fullTextEm = Search.getFullTextEntityManager(em);

    if ((Objects.toString(postalCode, "") + Objects.toString(place, "") + Objects.toString(addressLine, "")
            + Objects.toString(houseId, "")).length() == 0) {
        throw new InvalidApiUsageException(
                "At least one parameter must be provided. Coordinates don't count as parameters.");
    }

    if (addressLine != null && addressLine.length() < 2 && addressLine.length() > 0) {
        throw new InvalidApiUsageException("The parameter addressLine must consist of at least 2 characters.");
    }

    QueryBuilder b = fullTextEm.getSearchFactory().buildQueryBuilder().forEntity(AdresseDenormalized.class)
            .get();
    List<Query> queries = new ArrayList<>();

    if (postalCode != null && postalCode.length() > 0) {
        queries.add(b.keyword().onField("postalCode").boostedTo(20).matching(postalCode).createQuery());
    }

    if (addressLine != null && addressLine.length() > 0) {
        queries.add(b.keyword().onField("addressLine").matching(addressLine + addressLine + addressLine)
                .createQuery());
        // triple addressLine since in the data source it is also tripled if
        // there is no building or address name
        queries.add(b.keyword().onField("addressLineExact").boostedTo(10)
                .matching(addressLine + addressLine + addressLine).createQuery());
    }

    if (houseId != null && houseId.length() > 0) {
        // if search string contains a number, take the first number in the
        // search string and match with the house number

        Matcher matcher = housenumberPattern.matcher(houseId);
        if (matcher.find()) {
            queries.add(
                    b.keyword().onField("hausnrzahl").boostedTo(50).matching(matcher.group(1)).createQuery());
        }

        if (houseId.matches(".*\\D.*")) {
            queries.add(b.keyword().onField("houseIdExact").matching(houseId).createQuery());
        }

        queries.add(b.keyword().onField("houseId").boostedTo(20).matching(houseId).createQuery());

        TextAnalyzer analyzer = new TextAnalyzer();
        TokenStream stream;
        try {
            stream = analyzer.tokenStream(null, new StringReader(houseId));
            // CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                // if analyzer does not remove everything check hofname and hausnrgebaeudebez
                queries.add(b.keyword().onField("hofname").matching(houseId).createQuery());
                queries.add(b.keyword().onField("hausnrgebaeudebez").matching(houseId).createQuery());
                // System.out.println(cattr.toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        analyzer.close();
    }

    if (place != null && place.length() > 0) {
        queries.add(b.keyword().onField("place").matching(place).createQuery());

        queries.add(b.keyword().onField("municipalityExact").boostedTo(20).matching(place).createQuery());
        queries.add(b.keyword().onField("placeExact").boostedTo(5).matching(place).createQuery());
    }

    @SuppressWarnings("rawtypes")
    BooleanJunction bq = b.bool();
    for (Query item : queries) {
        bq = bq.should(item);
    }

    FullTextQuery fullTextQuery = fullTextEm.createFullTextQuery(bq.createQuery(), AdresseDenormalized.class);
    return fullTextQuery;
}

From source file: at.newmedialab.lmf.util.solr.suggestion.service.FieldAnalyzerService.java

License: Apache License

/**
 * Analyzes a string with the query analyzer of the given default field.
 * @param core the SolrCore whose schema supplies the analyzer
 * @param df the name of the default field
 * @param s the string to analyze
 * @return the analyzed string, or the unmodified input if analysis fails
 */
public static String analyzeString(SolrCore core, String df, String s) {
    try {
        TokenStream ts = core.getSchema().getFieldType(df).getQueryAnalyzer().tokenStream(df,
                new StringReader(s));
        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class); // register the attribute once, before reset()
        StringBuilder b = new StringBuilder();
        ts.reset();
        while (ts.incrementToken()) {
            b.append(" ");
            b.append(attr);
        }
        ts.end();
        ts.close(); // release the analysis stream
        return b.toString().trim();
    } catch (IOException e) {
        e.printStackTrace();
        return s;
    }
}