Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
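
Below is a minimal sketch of the typical consumer loop, assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor; the field name "content" and the printTokens method are illustrative only. The workflow is: reset() the stream, call incrementToken() in a loop and read the attributes while it returns true, then call end() and close().

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void printTokens(String text) throws IOException {
    // try-with-resources closes both the Analyzer and the TokenStream
    try (Analyzer analyzer = new StandardAnalyzer();
            TokenStream ts = analyzer.tokenStream("content", new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                   // must be called before the first incrementToken()
        while (ts.incrementToken()) { // advances to the next token; returns false at end of stream
            System.out.println(termAtt.toString());
        }
        ts.end();                     // records end-of-stream state (e.g. the final offset)
    }
}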

Usage

From source file:it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java

License:Apache License

public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {
        // keep the original spot if it cannot be URL-decoded
    }

    analyzer.lowercase(spot.length() > 4);

    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            return "";
        }
    }
    ts.end();
    ts.close(); // release the stream once it has been fully consumed
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1);
    // System.out.println(spot + " -> " + "[" + sb.toString() + "]");
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}

From source file:it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java

License:Apache License

private void initBloomFilter(Iterator<String> spotIterator) {
    String spot = spotIterator.next();
    analyzer.setShingles(false);

    ProgressLogger pl = new ProgressLogger("added {} spots to the bloom filter", 100000);
    pl.up();
    while (spotIterator.hasNext()) {
        String next = spotIterator.next();
        if (next.equals(spot))
            continue;
        pl.up();
        spot = next;
        try {
            TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken()) {
                // add only the first token of the spot to the bloom filter
                spot = termAtt.toString();
                bf.add(spot);
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

From source file:it.unibz.instasearch.indexing.StorageIndexer.java

License:Open Source License

/**
 * Extracts terms from text.
 * 
 * @param text
 * @return a map of terms to their offsets in text
 * @throws IOException
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));

    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);

    tokenStream.reset(); // required before the first call to incrementToken()
    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase();
        int offset = offsetAtt.startOffset();

        List<Integer> offsets = terms.get(termText);

        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }

        offsets.add(offset);
    }
    tokenStream.close();

    return terms;
}

From source file:ivory.core.tokenize.Tokenizer.java

License:Apache License

/**
 * Converts a tokenStream object into a string.
 * 
 * @param tokenStream
 *    object returned by Lucene tokenizer
 * @return
 *    String corresponding to the tokens output by tokenStream
 */
protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder tokenized = new StringBuilder();
    try {
        while (tokenStream.incrementToken()) {
            tokenized.append(termAtt.toString() + " ");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokenized.toString().trim();
}

From source file:jaligner.Sequence.java

License:Open Source License

/**
 * Constructor
 * 
 * @param sequence
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());

    Vector<Token> tokenVector = new Vector<Token>();

    stream.reset(); // required before the first call to incrementToken()
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        //            Token token = new Token();
        //            Token token = (Token) stream.getAttribute(CharTermAttribute.class);
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);

        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        //            PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        //            FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);

        //        public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());

        //            token.setPayload(payload.getPayload());
        //            token.setFlags(flags.getFlags());

        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }

        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }

        tokenVector.add(token);
    }

    stream.end();
    stream.close();

    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}

From source file:jobs.LoadOntologyJob.java

private int getTotalLength(String label) throws IOException {
    // Analyzer that does not remove stop words
    Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47);
    List<String> resultStop = new ArrayList<String>();
    TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label));
    customstream.reset();
    while (customstream.incrementToken()) {
        resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString());
    }
    return resultStop.size();
}

From source file:jobs.LoadOntologyJob.java

private int getLengthWithoutStopWords(String label) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    List<String> result = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream(null, new StringReader(label));
    stream.reset();
    while (stream.incrementToken()) {
        result.add(stream.getAttribute(CharTermAttribute.class).toString());
    }
    return result.size();
}

From source file:jp.co.atware.solr.analizers.cjk.CJKBigramFilterTest.java

License:Apache License

@Theory
public void testIncrementToken(Fixture testData) throws Exception {
    TokenStream tokenStream = getTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}

From source file:jp.co.atware.solr.analizers.cjk.CranioCaudalFilterTest.java

License:Apache License

@Theory
public void testIncrementToken(TestData testData) throws Exception {
    TokenStream tokenStream = createTokenStream(testData.input);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    List<String> actual = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        actual.add(termAtt.toString());
    }
    assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected));
}

From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java

License:Apache License

@Theory
public void testMultiMappingAndOffset(TestData testData) throws Exception {
    Reader reader = charFilterFactory.create(new StringReader(testData.input));
    TokenStream tokenStream = tokenizerFactory.create(reader);
    OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);

    tokenStream.reset();

    assertThat(tokenStream.incrementToken(), is(true));
    assertThat(termAtt.toString(), is(testData.expected));
    assertThat(actualOffset.startOffset(), is(testData.start));
    assertThat(actualOffset.endOffset(), is(testData.end));
    assertThat(tokenStream.incrementToken(), is(false));
}