Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class&lt;? extends Attribute&gt; value identifying the attribute to retrieve.

Usage

From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java

License:Open Source License

/**
 * Tokenizes the given value with the analyzer and adds each resulting term to the
 * phrase query at its analyzed position (honoring position increments, e.g. for
 * stop-word gaps).
 *
 * @param phrase the phrase query to fill
 * @param analyzer the analyzer used to tokenize the value
 * @param fld the field the terms belong to
 * @param val the raw value to tokenize
 * @throws IOException if consuming the token stream fails
 */
private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Attribute instances are per-stream singletons, so fetch them once
        // instead of on every loop iteration.
        CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
        // Iterate over tokens and treat each token as term
        int pos = 0;
        while (ts.incrementToken()) {
            pos += p.getPositionIncrement();
            // PhraseQuery positions are 0-based while increments start at 1.
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}

From source file:net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java

License:Apache License

/**
 * {@inheritDoc}
 */
@Override
public Term createTerm(String field, Object value) {

    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    String normalizedValue;
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    boolean isString = (value instanceof String);
    try {
        switch (fieldType) {
        case TEXT:
            // Normalize free text by running it through the analyzer and using
            // the first produced token (empty string if the value analyzes to nothing).
            try {
                TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader((String) value));
                try {
                    // reset() is part of the TokenStream contract and must be
                    // called before the first incrementToken().
                    tokenStream.reset();
                    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
                    if (tokenStream.incrementToken()) {
                        normalizedValue = termAttribute.term();
                    } else {
                        normalizedValue = "";
                    }
                    tokenStream.end();
                } finally {
                    // Previously leaked: always release the stream.
                    tokenStream.close();
                }
            } catch (IOException e) {
                throw new RuntimeIoException(e, IoMode.READ);
            }
            break;
        case STRING:
            // Keyword fields are stored verbatim.
            normalizedValue = (String) value;
            break;
        case INTEGER:
            int i;
            if (isString) {
                i = Integer.parseInt((String) value);
            } else {
                i = ((Integer) value).intValue();
            }
            // Numeric fields are indexed prefix-coded, so the term text must match.
            normalizedValue = NumericUtils.intToPrefixCoded(i);
            break;
        case LONG:
            long l;
            if (isString) {
                l = Long.parseLong((String) value);
            } else {
                l = ((Long) value).longValue();
            }
            normalizedValue = NumericUtils.longToPrefixCoded(l);
            break;
        case FLOAT:
            float f;
            if (isString) {
                f = Float.parseFloat((String) value);
            } else {
                f = ((Float) value).floatValue();
            }
            normalizedValue = NumericUtils.floatToPrefixCoded(f);
            break;
        case DOUBLE:
            double d;
            if (isString) {
                d = Double.parseDouble((String) value);
            } else {
                d = ((Double) value).doubleValue();
            }
            normalizedValue = NumericUtils.doubleToPrefixCoded(d);
            break;
        case DATE:
            Date date;
            if (isString) {
                date = this.iso8601Util.parseDate((String) value);
            } else {
                date = (Date) value;
            }
            // Dates are indexed as prefix-coded epoch millis.
            normalizedValue = NumericUtils.longToPrefixCoded(date.getTime());
            break;
        default:
            throw new IllegalCaseException(SearchFieldType.class, fieldType);
        }
    } catch (ClassCastException e) {
        // The value's runtime type did not match the configured field type.
        throw new NlsClassCastException(e, value, fieldType.getFieldClass());
    }
    return new Term(field, normalizedValue);
}

From source file:net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java

License:Apache License

/**
 * {@inheritDoc}/*  ww w . java 2s  . c  om*/
 */
@Override
public Query createPhraseQuery(String field, String value) {

    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    Query result;
    if (fieldType == SearchFieldType.TEXT) {
        PhraseQuery phraseQuery = new PhraseQuery();
        result = phraseQuery;
        try {
            TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader(value));
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            while (tokenStream.incrementToken()) {
                phraseQuery.add(new Term(field, termAttribute.term()));
            }
        } catch (IOException e) {
            throw new RuntimeIoException(e, IoMode.READ);
        }
    } else {
        result = new TermQuery(createTerm(field, value));
    }
    return result;
}

From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java

/**
 * Verifies that the Zekr analyzer produces the same token texts for the original
 * Arabic string as a plain whitespace tokenizer does for its pre-normalized form.
 */
public void testNextToken1() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR1));
    try {
        // Attribute instances are bound to the stream, so fetching them once is
        // sufficient; getAttribute is generic, so no cast is needed.
        TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
        TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
        // Compare token pairs for as long as the analyzer stream yields tokens.
        while (ts1.incrementToken()) {
            ts2.incrementToken();
            assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                    new String(t2.termBuffer(), 0, t2.termLength()));
        }
    } finally {
        // Previously leaked: release both streams even if an assertion fails.
        ts1.close();
        ts2.close();
    }
}

From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java

/**
 * Same check as {@code testNextToken1} for the second Arabic test fixture:
 * analyzer output must equal whitespace-tokenized, pre-normalized text.
 */
public void testNextToken2() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG2));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR2));
    try {
        // Attribute instances are bound to the stream, so fetching them once is
        // sufficient; getAttribute is generic, so no cast is needed.
        TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
        TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
        // Compare token pairs for as long as the analyzer stream yields tokens.
        while (ts1.incrementToken()) {
            ts2.incrementToken();
            assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                    new String(t2.termBuffer(), 0, t2.termLength()));
        }
    } finally {
        // Previously leaked: release both streams even if an assertion fails.
        ts1.close();
        ts2.close();
    }
}

From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java

License:Apache License

/**
 * Tokenizes the query string with the default analyzer and returns the resulting
 * terms. Tokens of length 0 or 1 are dropped; if nothing survives, the raw query
 * string itself is returned as the single token.
 *
 * @param queryString the raw query text to tokenize
 * @return the analyzed tokens (never empty)
 */
@Override
public String[] getQueryTokens(final String queryString) {
    // TokenStream is Closeable; try-with-resources replaces the manual
    // finally block that silently swallowed close failures.
    try (TokenStream tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS",
            new StringReader(queryString))) {
        tokenStream.reset();
        // The attribute instance is a per-stream singleton; fetch it once.
        final CharTermAttribute termAttr = tokenStream.getAttribute(CharTermAttribute.class);
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = termAttr.toString();
            // Single-character tokens carry little query value; skip them.
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        // Finalize end-of-stream state before close, per the TokenStream contract.
        tokenStream.end();
        if (al.isEmpty()) {
            al.add(queryString);
        }
        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    }
}

From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java

/**
 * Smoke test: segments a fixed sentence with the Ansj analyzer, prints every
 * token, and reports the elapsed wall-clock time.
 */
@Test
public void test() throws IOException {
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????"
                    + ""
                    + "? ?????????????  ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    try {
        // reset() must precede the first incrementToken() per the TokenStream contract.
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ts.getAttribute(CharTermAttribute.class));
        }
        ts.end();
    } finally {
        // Previously leaked if incrementToken() threw: always release the stream.
        ts.close();
    }
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}

From source file:net.skyatlas.icd.test.AnsegTest.java

/**
 * Demo driver: segments a fixed sentence with the Ansj analyzer and times it,
 * then builds an in-memory Lucene index from a few hard-coded documents and
 * runs several searches against it.
 *
 * NOTE(review): many string literals and comments in this method are mojibake
 * (presumably Chinese text that lost its encoding) — the original intent of
 * the fixture data cannot be recovered from here.
 */
static public void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    // NOTE(review): 'nt' is never used below.
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????"
                    + ""
                    + "? ?????????????  ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    // Tokenize the fixture sentence, printing each token and the elapsed time.
    // NOTE(review): incrementToken() is called without a prior reset(), and the
    // stream is not closed on the exception path — acceptable for a demo,
    // required by the contract on Lucene 4+.
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    // Load the stop-word library referenced by the "library" resource bundle.
    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"),
            "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }
    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    // Read a local text file into 'text'... (hard-coded absolute path)
    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();

    // ...which is then immediately overwritten — the file contents are discarded.
    text = "????????????  ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    // Index the hard-coded documents into an in-memory directory.
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);
    // BufferedReader reader =
    // IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt",
    // "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    // addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "?   ?()   (?)");
    inst.addContent(iwriter, "   ?()   (?)");
    inst.addContent(iwriter, "?   ?   (?)");
    inst.addContent(iwriter, "   ??NEC   ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();

    System.out.println("");

    // Run a few searches against the freshly built index.
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
     KeyWordComputer kwc = new KeyWordComputer(5);
     String title = "??";
     String content = "9??"
     + "?????????"
     + "????"
     + "??"
     + "?????"
     + "???"
     + "??????"
     + "???"
     + "????20??"
     + "????"
     + "?"
     + "???]??"
     + "???";
     Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
     System.out.println(result);
            
     AnsegTest t = new AnsegTest();
     List<Term> parse = ToAnalysis.parse("?");
     System.out.println(parse);
     System.out.println("*********** ? ************");
     //        UserDefineLibrary.insertWord("", "userDefine", 1000);
     //        UserDefineLibrary.insertWord("?", "userDefine", 1000);
     UserDefineLibrary.insertWord("?", "userDefine", 1000);
     parse = ToAnalysis.parse("???");
     System.out.println(parse);
     */
}

From source file:nl.uva.lucenefacility.LuceneUtil.java

/**
 * Tokenizes the given string with the supplied analyzer and returns the token
 * texts in order. Returns an empty list for input that analyzes to nothing.
 *
 * @param analyzer the analyzer to tokenize with
 * @param string the text to tokenize
 * @return the token texts, in stream order
 * @throws RuntimeException wrapping any {@link IOException} from the stream
 */
public static List<String> tokenizeString(Analyzer analyzer, String string) {
    List<String> result = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        // The attribute instance is a per-stream singleton; fetch it once.
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            result.add(term.toString());
        }
        // Finalize end-of-stream state per the TokenStream contract.
        stream.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        // Previously leaked: always release the stream.
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException ignored) {
                // best effort — the tokens were already collected
            }
        }
    }
    return result;
}

From source file:org.aksw.palmetto.corpus.lucene.SimpleAnalyzerTest.java

License:Open Source License

/**
 * Tokenizes the fixture text and asserts that the produced tokens match
 * {@code expectedTokens} exactly — lowercased when the analyzer is configured
 * to lowercase — and that the token count is exact.
 *
 * @param lowercase whether the analyzer should lowercase its output
 */
public void test(boolean lowercase) throws Exception {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(lowercase);
    TokenStream stream = analyzer.tokenStream("test", text);
    try {
        stream.reset();
        // The attribute instance is a per-stream singleton; fetch it once.
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        int count = 0;
        while (stream.incrementToken()) {
            Assert.assertTrue(count < expectedTokens.length);
            if (lowercase) {
                Assert.assertEquals(expectedTokens[count].toLowerCase(), token.toString());
            } else {
                Assert.assertEquals(expectedTokens[count], token.toString());
            }
            ++count;
        }
        // Finalize end-of-stream state per the TokenStream contract.
        stream.end();
        Assert.assertEquals(expectedTokens.length, count);
    } finally {
        // Previously leaked, and the analyzer was only closed on success:
        // release both even when an assertion fails.
        stream.close();
        analyzer.close();
    }
}