Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
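
Below is a minimal sketch of the typical consumer loop, assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor; the field name "content" and the printTokens method are illustrative only. The workflow is: reset() the stream, call incrementToken() in a loop and read the attributes while it returns true, then call end() and close().

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void printTokens(String text) throws IOException {
    // try-with-resources closes both the Analyzer and the TokenStream
    try (Analyzer analyzer = new StandardAnalyzer();
            TokenStream ts = analyzer.tokenStream("content", new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                   // must be called before the first incrementToken()
        while (ts.incrementToken()) { // advances to the next token; returns false at end of stream
            System.out.println(termAtt.toString());
        }
        ts.end();                     // records end-of-stream state (e.g. the final offset)
    }
}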

Usage

From source file:it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java

License:Apache License

public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {
        // keep the original spot if it cannot be URL-decoded
    }

    analyzer.lowercase(spot.length() > 4);

    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            return "";
        }
    }
    ts.end();
    ts.close(); // release the stream once it has been fully consumed
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1);
    // System.out.println(spot + " -> " + "[" + sb.toString() + "]");
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}

From source file:it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java

License:Apache License

private void initBloomFilter(Iterator<String> spotIterator) {
    String spot = spotIterator.next();
    analyzer.setShingles(false);

    ProgressLogger pl = new ProgressLogger("added {} spots to the bloom filter", 100000);
    pl.up();
    while (spotIterator.hasNext()) {
        String next = spotIterator.next();
        if (next.equals(spot))
            continue;
        pl.up();
        spot = next;
        try {
            TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken()) {
                // add only the first token of the spot to the bloom filter
                spot = termAtt.toString();
                bf.add(spot);
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

From source file:it.unibz.instasearch.indexing.StorageIndexer.java

License:Open Source License

/**
 * Extracts terms from text.
 * 
 * @param text
 * @return a map of terms to their offsets in text
 * @throws IOException
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));

    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);

    tokenStream.reset(); // required before the first call to incrementToken()
    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase();
        int offset = offsetAtt.startOffset();

        List<Integer> offsets = terms.get(termText);

        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }

        offsets.add(offset);
    }
    tokenStream.close();

    return terms;
}

From source file:ivory.core.tokenize.Tokenizer.java

License:Apache License

/**
 * Converts a tokenStream object into a string.
 * 
 * @param tokenStream
 *    object returned by Lucene tokenizer
 * @return
 *    String corresponding to the tokens output by tokenStream
 */
protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder tokenized = new StringBuilder();
    try {
        while (tokenStream.incrementToken()) {
            tokenized.append(termAtt.toString() + " ");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokenized.toString().trim();
}

From source file:jaligner.Sequence.java

License:Open Source License

/**
 * Constructor
 * 
 * @param sequence
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());

    Vector<Token> tokenVector = new Vector<Token>();

    stream.reset(); // required before the first call to incrementToken()
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        //            Token token = new Token();
        //            Token token = (Token) stream.getAttribute(CharTermAttribute.class);
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);

        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        //            PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        //            FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);

        //        public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());

        //            token.setPayload(payload.getPayload());
        //            token.setFlags(flags.getFlags());

        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }

        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }

        tokenVector.add(token);
    }

    stream.end();
    stream.close();

    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}

From source file:jobs.LoadOntologyJob.java

private int getTotalLength(String label) throws IOException {
    // Analyzer that does not remove stop words
    Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47);
    List<String> resultStop = new ArrayList<String>();
    TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label));
    customstream.reset();
    while (customstream.incrementToken()) {
        resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString());
    }
    return resultStop.size();
}

From source file:jobs.LoadOntologyJob.java

private int getLengthWithoutStopWords(String label) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    List<String> result = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(null, new StringReader(label));
    stream.reset();
    while (stream.incrementToken()) {
        result.add(stream.getAttribute(CharTermAttribute.class).toString());
    }
    return result.size();
}

From source file:jp.co.atware.solr.analizers.cjk.CJKBigramFilterTest.java

License:Apache License

@Theory
public void testIncrementToken(Fixture testData) throws Exception {
    TokenStream tokenStream = getTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}

From source file:jp.co.atware.solr.analizers.cjk.CranioCaudalFilterTest.java

License:Apache License

@Theory
public void testIncrementToken(TestData testData) throws Exception {
    TokenStream tokenStream = createTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}

From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java

License:Apache License

@Theory
public void testMultiMappingAndOffset(TestData testData) throws Exception {
    Reader reader = charFilterFactory.create(new StringReader(testData.input));
    TokenStream tokenStream = tokenizerFactory.create(reader);
    OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);

    tokenStream.reset();

    assertThat(tokenStream.incrementToken(), is(true));
    assertThat(termAtt.toString(), is(testData.expected));
    assertThat(actualOffset.startOffset(), is(testData.start));
    assertThat(actualOffset.endOffset(), is(testData.end));
    assertThat(tokenStream.incrementToken(), is(false));
}