List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
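Before the per-project examples, a minimal self-contained sketch of the consumption contract: reset() before the first incrementToken(), end() after the last, then close(). It assumes a recent Lucene (5+) where StandardAnalyzer has a no-argument constructor; the field name "body" and the input text are illustrative.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class IncrementTokenSketch {
        public static void main(String[] args) throws IOException {
            try (StandardAnalyzer analyzer = new StandardAnalyzer();
                    TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token streams"))) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                   // mandatory before the first incrementToken()
                while (ts.incrementToken()) { // false signals the end of the stream
                    System.out.println(term.toString());
                }
                ts.end();                     // records end-of-stream offset state
            }                                 // try-with-resources closes the stream and analyzer
        }
    }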
From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java
public void testNextToken1() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR1));
    // Attribute instances are reused across incrementToken() calls, so fetch them once.
    TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
    TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
    }
}
From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java
public void testNextToken2() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG2));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR2));
    // Attribute instances are reused across incrementToken() calls, so fetch them once.
    TermAttribute t1 = ts1.getAttribute(TermAttribute.class);
    TermAttribute t2 = ts2.getAttribute(TermAttribute.class);
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
    }
}
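Both Zekr tests use the TermAttribute API that was removed in Lucene 4.0. A minimal sketch of the same two-stream comparison against Lucene 5+, where CharTermAttribute replaces TermAttribute and the reset()/end()/close() lifecycle is mandatory; ZekrLuceneAnalyzer and the test strings are taken from the example, everything else follows the standard API:

    // Hedged sketch, assuming Lucene 5+ and JUnit assertions.
    Tokenizer ws = new WhitespaceTokenizer();
    ws.setReader(new StringReader(ARABIC_STR1));
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    CharTermAttribute t1 = ts1.addAttribute(CharTermAttribute.class);
    CharTermAttribute t2 = ws.addAttribute(CharTermAttribute.class);
    ts1.reset();
    ws.reset();
    while (ts1.incrementToken()) {
        assertTrue(ws.incrementToken());      // streams must stay in lockstep
        assertEquals(t2.toString(), t1.toString());
    }
    assertFalse(ws.incrementToken());         // and end together
    ts1.end(); ts1.close();
    ws.end(); ws.close();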
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        tokenStream.end(); // per the TokenStream contract, end() follows the last incrementToken()
        if (al.isEmpty()) {
            al.add(queryString);
        }
        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
                // ignore close failures
            }
        }
    }
}
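Since TokenStream implements Closeable, the close-in-finally plumbing above can be collapsed with try-with-resources. A minimal sketch, assuming the same getDefaultAnalyzer() and ADOException helpers as the example:

    @Override
    public String[] getQueryTokens(final String queryString) {
        try (TokenStream ts = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString))) {
            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            final List<String> terms = new ArrayList<>();
            ts.reset();
            while (ts.incrementToken()) {
                final String term = termAtt.toString();
                if (term.length() > 1) {
                    terms.add(term);
                }
            }
            ts.end();
            // Fall back to the raw query string when analysis yields nothing.
            return terms.isEmpty() ? new String[] { queryString } : terms.toArray(new String[0]);
        } catch (final IOException e) {
            throw ADOException.of(e);
        }
    }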
From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java
@Test
public void test() throws IOException {
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader("\n\n\n\n\n\n\n????, ????????????????????????????"
            + "???????????????????"
            + "??????????? ??????????????2????"
            + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    ts.reset(); // required on Lucene 4+; a no-op default on the 3.x line this test was written for
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}
From source file:net.skyatlas.icd.test.AnsegTest.java
public static void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader("\n\n\n\n\n\n\n????, ????????????????????????????"
            + "???????????????????"
            + "??????????? ??????????????2????"
            + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);
    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    ts.reset(); // required on Lucene 4+; a no-op default on the 3.x line this code targets
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    // Build the stop-word set from the configured stop library.
    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"), "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }

    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();
    text = "???????????? ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    directory = new RAMDirectory(); // in-memory index; without this the IndexWriter below gets a null Directory
    iwriter = new IndexWriter(directory, ic);
    // BufferedReader reader = IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt", "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    //     addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "? ?() (?)");
    inst.addContent(iwriter, " ?() (?)");
    inst.addContent(iwriter, "? ? (?)");
    inst.addContent(iwriter, " ??NEC ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();

    System.out.println("");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
    KeyWordComputer kwc = new KeyWordComputer(5);
    String title = "??";
    String content = "9??" + "?????????" + "????" + "??" + "?????" + "???"
            + "??????" + "???" + "????20??" + "????" + "?" + "???]??" + "???";
    Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
    System.out.println(result);

    AnsegTest t = new AnsegTest();
    List<Term> parse = ToAnalysis.parse("?");
    System.out.println(parse);
    System.out.println("*********** ? ************");
    // UserDefineLibrary.insertWord("", "userDefine", 1000);
    // UserDefineLibrary.insertWord("?", "userDefine", 1000);
    UserDefineLibrary.insertWord("?", "userDefine", 1000);
    parse = ToAnalysis.parse("???");
    System.out.println(parse);
    */
}
From source file:NewsIR_search.TRECQuery.java
/**
 * Returns the analyzed content of the 'queryField' from the query text.
 * @param analyzer the analyzer to run over the field text
 * @param queryField the raw field text to analyze
 * @return (String) the analyzed, space-separated content of the field
 * @throws Exception
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:ngram.NGramExtractor.java
License:Open Source License
/**
 * Extracts NGrams from a String of text.
 * Can handle ngrams of any length and also perform stop word removal before extraction.
 * @param text the text that the ngrams should be extracted from
 * @param length the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {
    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;
    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();
    /* If length is 1, we want unigrams.
     * Use a stop-word-aware analyzer when stop words should be removed,
     * and one with an empty stop set when they should be kept. */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            // Changed from SimpleAnalyzer to StandardAnalyzer to keep apostrophe-s tokens
            analyzer = new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET);
        }
    } else {
        // Longer than unigrams, so wrap with ShingleAnalyzerWrapper; again the analyzer depends on stop word removal
        if (this.stopWords) {
            // Hack: StopAnalyzer pinned to LUCENE_24 because 2.4 did not preserve position increments;
            // later versions put underscores (_) in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ",
                    false, false);
        } else {
            analyzer = new ShingleAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET),
                    length, length, " ", false, false);
        }
    }

    // Process the text and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String termToken = charTermAttribute.toString(); // the actual token term
        nGrams.add(termToken); // collect every ngram
        // If n-grams may not overlap, skip past the overlapping positions
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }
    }

    // Store unique nGrams and their frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }
}
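On current Lucene the version-pinned analyzers and the LUCENE_24 workaround above are gone. A minimal sketch of the same shingle-based n-gram extraction on Lucene 5+, assuming a custom Analyzer built from StandardTokenizer plus ShingleFilter (imports from org.apache.lucene.analysis.* are assumed; the input text is illustrative):

    // Hedged sketch: bigram (shingle) extraction with ShingleFilter on a modern Lucene.
    Analyzer shingles = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new StandardTokenizer();
            ShingleFilter filter = new ShingleFilter(source, 2, 2); // bigrams only
            filter.setOutputUnigrams(false);                        // drop the single-word tokens
            return new TokenStreamComponents(source, filter);
        }
    };
    try (TokenStream ts = shingles.tokenStream("text", new StringReader("to be or not to be"))) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // "to be", "be or", "or not", ...
        }
        ts.end();
    }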
From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java
private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.close();
    // Keep only non-numeric terms at or above the threshold, emitting each as term^frequency
    String q = "";
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold && !Functions.isNumeric(k)) {
            q += k + "^" + m.get(k) + " ";
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q;
}
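The string this method returns is a space-separated list of term^frequency clauses. A hedged sketch of how such a string could be fed to the classic QueryParser on Lucene 5+ (the parser usage, the analyzer, the field name, and the patentText variable are illustrative assumptions, not shown in the source):

    // Hedged usage sketch: parse the boosted "term^freq" string produced above.
    Analyzer analyzer = new StandardAnalyzer();
    QueryParser parser = new QueryParser("claims", analyzer);
    String boosted = transformation(
            analyzer.tokenStream("claims", new StringReader(patentText)), 2, "claims");
    Query query = parser.parse(boosted); // each surviving term carries its frequency as a boost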
From source file:nicta.com.au.failureanalysis.query.QueryGneration.java
private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    Map<String, Integer> qterm_freq = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.close();
    // Keep only non-numeric terms at or above the threshold
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold && !Functions.isNumeric(k)) {
            qterm_freq.put(k, m.get(k));
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return qterm_freq;
}
From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java
private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
    Map<String, Double> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int i = 0;
    while (ts.incrementToken()) {
        i++;
        String term = charTermAttribute.toString();
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1.0);
        }
    }
    // Weight each raw term frequency by its inverse document frequency
    for (String key : m.keySet()) {
        Term t = new Term(field, key);
        int totalTF = ir.docFreq(t);
        int docs = ir.getDocCount("claims");
        double idf = Math.log10((double) docs / (totalTF + 1));
        m.put(key, (m.get(key) / i) * idf);
    }
    return m;
}
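Given two such TF-IDF maps, the cosine similarity the class name refers to could be computed as in this minimal sketch; the helper itself is an assumption, since the source only shows the vector construction:

    // Hedged sketch: cosine similarity between two TF-IDF vectors as built by getVector().
    private static double cosine(Map<String, Double> a, Map<String, Double> b) {
        double dot = 0, normA = 0, normB = 0;
        for (Map.Entry<String, Double> e : a.entrySet()) {
            Double w = b.get(e.getKey());
            if (w != null) {
                dot += e.getValue() * w; // only shared terms contribute to the dot product
            }
            normA += e.getValue() * e.getValue();
        }
        for (double w : b.values()) {
            normB += w * w;
        }
        return (normA == 0 || normB == 0) ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }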