Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#incrementToken, drawn from open-source projects.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.

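For reference, the canonical consumer workflow around incrementToken is: obtain the stream, register attributes, reset(), loop while incrementToken() returns true, then end() and close(). The sketch below is a minimal, self-contained illustration assuming a recent Lucene release (5.x or later, where StandardAnalyzer has a no-argument constructor and tokenStream accepts a String); the field name "contents" and the sample text are placeholders:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // "contents" is an arbitrary field name; most analyzers ignore it
        try (TokenStream stream = analyzer.tokenStream("contents", "The quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println("[" + term.toString() + "]");
            }
            stream.end();                     // records final token state (e.g. end offset)
        }                                     // try-with-resources closes the stream
        analyzer.close();
    }
}

The examples below all follow some variant of this loop; the older ones (Lucene 2.9/3.x) use the since-removed TermAttribute and sometimes omit the reset()/end()/close() calls that later Lucene versions enforce.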
Usage

From source file: aos.lucene.analysis.i18n.ChineseDemo.java

License: Apache License

private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    TermAttribute term = stream.addAttribute(TermAttribute.class);

    stream.reset(); // the consumer must reset the stream before the first incrementToken()
    while (stream.incrementToken()) { // advance token by token
        buffer.append("[");
        buffer.append(term.term());
        buffer.append("] ");
    }
    stream.close();

    String output = buffer.toString();

    Frame f = new Frame();
    f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
    f.setResizable(true);

    Font font = new Font(null, Font.PLAIN, 36);
    int width = getWidth(f.getFontMetrics(font), output);

    f.setSize((width < 250) ? 250 : width + 50, 75);

    // NOTE: if Label doesn't render the Chinese characters
    // properly, try using javax.swing.JLabel instead
    Label label = new Label(output); // render the bracketed terms in an AWT Label
    label.setSize(width, 75);
    label.setAlignment(Label.CENTER);
    label.setFont(font);
    f.add(label);

    f.setVisible(true);
}

From source file: aos.lucene.search.advanced.SpanQueryTest.java

License: Apache License

private void dumpSpans(SpanQuery query) throws IOException {
      Spans spans = query.getSpans(reader);
      LOGGER.info(query + ":");
      int numSpans = 0;

      TopDocs hits = searcher.search(query, 10);
      float[] scores = new float[2]; // the test index holds only two documents, so doc IDs 0 and 1 suffice
      for (ScoreDoc sd : hits.scoreDocs) {
          scores[sd.doc] = sd.score;
      }

      while (spans.next()) {
          numSpans++;

          int id = spans.doc();
          Document doc = reader.document(id);

          TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.get("f")));
          TermAttribute term = stream.addAttribute(TermAttribute.class);
          stream.reset(); // reset before the first incrementToken()

          StringBuilder buffer = new StringBuilder();
          buffer.append("   ");
          int i = 0;
          while (stream.incrementToken()) {
              if (i == spans.start()) {
                  buffer.append("<");
              }
              buffer.append(term.term());
              if (i + 1 == spans.end()) {
                  buffer.append(">");
              }
              buffer.append(" ");
              i++;
          }
          stream.close();
          buffer.append("(").append(scores[id]).append(") ");
          LOGGER.info(buffer);
      }

      if (numSpans == 0) {
          LOGGER.info("   No spans");
      }
      LOGGER.info("");
  }

From source file: at.ac.tuwien.ifs.myluceneanalyzers.fa.algorithm.PersianDictionaryCountCompoundWord.java

@SuppressWarnings({ "resource", "deprecation" })
private String stem(String input) throws IOException {
    String output = "";
    Reader reader = new StringReader(input);
    Tokenizer source = new StandardTokenizer(Version.LUCENE_4_10_3, reader);
    TokenStream tokenStream = new PersianStemFilter(source);

    CharTermAttribute charTermAttributeGreedy = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        output = output + " " + charTermAttributeGreedy.toString();
    }
    return output.trim();
}

From source file: at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java

License: Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}

From source file: at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java

License: Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }

    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* current + word + separator */
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}

From source file: at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

public static void displayTokens(TokenStream stream) throws IOException {

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("[" + term.toString() + "] ");
    }

}

From source file: at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        System.out.print("[" + term.toString() + "] ");

    }
    System.out.println();

}

From source file: at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        Payload pl = payload.getPayload();

        if (pl != null) {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");

        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");

        }

    }
    System.out.println();
}

From source file: at.itbh.bev.apibeans.FinderImpl.java

License: Open Source License

public FullTextQuery constructQuery(EntityManager em, String postalCode, String place, String addressLine,
        String houseId) throws InvalidApiUsageException {
    FullTextEntityManager fullTextEm = Search.getFullTextEntityManager(em);

    if ((Objects.toString(postalCode, "") + Objects.toString(place, "") + Objects.toString(addressLine, "")
            + Objects.toString(houseId, "")).length() == 0) {
        throw new InvalidApiUsageException(
                "At least one parameter must be provided. Coordinates don't count as parameters.");
    }

    if (addressLine != null && addressLine.length() < 2 && addressLine.length() > 0) {
        throw new InvalidApiUsageException("The parameter addressLine must consist of at least 2 characters.");
    }

    QueryBuilder b = fullTextEm.getSearchFactory().buildQueryBuilder().forEntity(AdresseDenormalized.class)
            .get();
    List<Query> queries = new ArrayList<>();

    if (postalCode != null && postalCode.length() > 0) {
        queries.add(b.keyword().onField("postalCode").boostedTo(20).matching(postalCode).createQuery());
    }

    if (addressLine != null && addressLine.length() > 0) {
        queries.add(b.keyword().onField("addressLine").matching(addressLine + addressLine + addressLine)
                .createQuery());
        // triple addressLine since in the data source it is also tripled if
        // there is no building or address name
        queries.add(b.keyword().onField("addressLineExact").boostedTo(10)
                .matching(addressLine + addressLine + addressLine).createQuery());
    }

    if (houseId != null && houseId.length() > 0) {
        // if search string contains a number, take the first number in the
        // search string and match with the house number

        Matcher matcher = housenumberPattern.matcher(houseId);
        if (matcher.find()) {
            queries.add(
                    b.keyword().onField("hausnrzahl").boostedTo(50).matching(matcher.group(1)).createQuery());
        }

        if (houseId.matches(".*\\D.*")) {
            queries.add(b.keyword().onField("houseIdExact").matching(houseId).createQuery());
        }

        queries.add(b.keyword().onField("houseId").boostedTo(20).matching(houseId).createQuery());

        TextAnalyzer analyzer = new TextAnalyzer();
        TokenStream stream;
        try {
            stream = analyzer.tokenStream(null, new StringReader(houseId));
            // CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                // if analyzer does not remove everything check hofname and hausnrgebaeudebez
                queries.add(b.keyword().onField("hofname").matching(houseId).createQuery());
                queries.add(b.keyword().onField("hausnrgebaeudebez").matching(houseId).createQuery());
                // System.out.println(cattr.toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        analyzer.close();
    }

    if (place != null && place.length() > 0) {
        queries.add(b.keyword().onField("place").matching(place).createQuery());

        queries.add(b.keyword().onField("municipalityExact").boostedTo(20).matching(place).createQuery());
        queries.add(b.keyword().onField("placeExact").boostedTo(5).matching(place).createQuery());
    }

    @SuppressWarnings("rawtypes")
    BooleanJunction bq = b.bool();
    for (Query item : queries) {
        bq = bq.should(item);
    }

    FullTextQuery fullTextQuery = fullTextEm.createFullTextQuery(bq.createQuery(), AdresseDenormalized.class);
    return fullTextQuery;
}

From source file: at.newmedialab.lmf.util.solr.suggestion.service.FieldAnalyzerService.java

License: Apache License

/**
 * Analyzes a string with the query analyzer of the given default field.
 * @param core the SolrCore whose schema supplies the analyzer
 * @param df the name of the default field
 * @param s the string to analyze
 * @return the analyzed string, or the unmodified input if analysis fails
 */
public static String analyzeString(SolrCore core, String df, String s) {
    try {
        TokenStream ts = core.getSchema().getFieldType(df).getQueryAnalyzer().tokenStream(df,
                new StringReader(s));
        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class); // register the attribute once, before reset()
        StringBuilder b = new StringBuilder();
        ts.reset();
        while (ts.incrementToken()) {
            b.append(" ");
            b.append(attr);
        }
        ts.end();
        ts.close(); // release the analysis stream
        return b.toString().trim();
    } catch (IOException e) {
        e.printStackTrace();
        return s;
    }
}