Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token. It returns false once the end of the stream is reached; each call updates the stream's attributes (term, offsets, type, and so on) as a side effect.
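
A minimal end-to-end consumer, sketched here for illustration. It assumes a recent Lucene where StandardAnalyzer has a no-argument constructor and reset() is mandatory before the first incrementToken() call; the field name "body" is arbitrary:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                   // prepare the stream for consumption
            while (ts.incrementToken()) { // false signals end of stream
                terms.add(termAtt.toString());
            }
            ts.end();                     // record final offset state
        } finally {
            ts.close();
        }
        return terms;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokenize(new StandardAnalyzer(), "Hello Lucene token streams"));
        // prints: [hello, lucene, token, streams]
    }
}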

Usage

From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java

public void testNextToken1() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG1));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR1));
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    TermAttribute t1 = (TermAttribute) ts1
            .getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    TermAttribute t2 = (TermAttribute) ts2
            .getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
        t1 = (TermAttribute) ts1.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
        t2 = (TermAttribute) ts2.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    }
}

From source file:net.sf.zekr.engine.search.lucene.ZekrLuceneAnalyzerTest.java

public void testNextToken2() throws Exception {
    ZekrLuceneAnalyzer zla = new ZekrLuceneAnalyzer(ZekrLuceneAnalyzer.QURAN_LANG_CODE, null);
    TokenStream ts1 = zla.tokenStream(null, new StringReader(ARABIC_STR_ORIG2));
    TokenStream ts2 = new WhitespaceTokenizer(new StringReader(ARABIC_STR2));
    boolean hasMore = ts1.incrementToken();
    ts2.incrementToken();
    TermAttribute t1 = (TermAttribute) ts1
            .getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    TermAttribute t2 = (TermAttribute) ts2
            .getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    while (hasMore) {
        assertEquals(new String(t1.termBuffer(), 0, t1.termLength()),
                new String(t2.termBuffer(), 0, t2.termLength()));
        hasMore = ts1.incrementToken();
        ts2.incrementToken();
        t1 = (TermAttribute) ts1.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
        t2 = (TermAttribute) ts2.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
    }
}
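
On Lucene 4.x and later the TermAttribute class used in the two tests above no longer exists; the equivalent comparison is usually written with CharTermAttribute, fetched once before the loop, since attribute instances are reused across incrementToken() calls and never need re-fetching. A sketch of the equivalent loop, assuming ts1 and ts2 are constructed as above:

CharTermAttribute t1 = ts1.addAttribute(CharTermAttribute.class);
CharTermAttribute t2 = ts2.addAttribute(CharTermAttribute.class);
ts1.reset();
ts2.reset();
while (ts1.incrementToken() && ts2.incrementToken()) {
    assertEquals(t1.toString(), t2.toString());
}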

From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java

License:Apache License

@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        if (al.size() == 0) {
            al.add(queryString);
        }

        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
                // ignored: a failure while closing the stream is non-fatal here
            }
        }
    }
}
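
Since TokenStream is Closeable, on Java 7+ the same method can be written with try-with-resources, dropping the explicit finally block. A sketch under the same assumptions as the original (getDefaultAnalyzer and ADOException come from the surrounding class):

@Override
public String[] getQueryTokens(final String queryString) {
    try (TokenStream tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS",
            new StringReader(queryString))) {
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term.length() > 1) {
                al.add(term);
            }
        }
        tokenStream.end();
        if (al.isEmpty()) {
            al.add(queryString);
        }
        return al.toArray(new String[0]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    }
}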

From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java

@Test
public void test() throws IOException {
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????"
                    + ""
                    + "? ?????????????  ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.end();
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}

From source file:net.skyatlas.icd.test.AnsegTest.java

static public void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????"
                    + ""
                    + "? ?????????????  ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.end();
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"),
            "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }
    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();

    text = "????????????  ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    // 
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);
    // BufferedReader reader =
    // IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt",
    // "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    // addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "?   ?()   (?)");
    inst.addContent(iwriter, "   ?()   (?)");
    inst.addContent(iwriter, "?   ?   (?)");
    inst.addContent(iwriter, "   ??NEC   ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();

    System.out.println("");

    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
     KeyWordComputer kwc = new KeyWordComputer(5);
     String title = "??";
     String content = "9??"
     + "?????????"
     + "????"
     + "??"
     + "?????"
     + "???"
     + "??????"
     + "???"
     + "????20??"
     + "????"
     + "?"
     + "???]??"
     + "???";
     Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
     System.out.println(result);
            
     AnsegTest t = new AnsegTest();
     List<Term> parse = ToAnalysis.parse("?");
     System.out.println(parse);
     System.out.println("*********** ? ************");
     //        UserDefineLibrary.insertWord("", "userDefine", 1000);
     //        UserDefineLibrary.insertWord("?", "userDefine", 1000);
     UserDefineLibrary.insertWord("?", "userDefine", 1000);
     parse = ToAnalysis.parse("???");
     System.out.println(parse);
     */
}

From source file:NewsIR_search.TRECQuery.java

/**
 * Analyzes the 'queryField' text with the given analyzer and returns the resulting terms.
 * @param analyzer the analyzer to apply to the query text
 * @param queryField the raw text of the query field
 * @return the analyzed terms of the field, joined by single spaces
 * @throws Exception 
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
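
A hypothetical call, for illustration only (EnglishAnalyzer is an assumption, not taken from the original project):

String analyzed = queryFieldAnalyze(new EnglishAnalyzer(), "Rising Sea Levels");
// analyzed is roughly "rise sea level " -- stemmed, lowercased, joined by spaces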

From source file:ngram.NGramExtractor.java

License:Open Source License

/**
 * Extracts NGrams from a String of text.
 * Can handle ngrams of any length and also perform stop word removal before extraction
 * @param text the text that the ngrams should be extracted from
 * @param length the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /* If length is 1, we want unigrams.
     * Use a StandardAnalyzer with its default stop set when stop words should be removed,
     * and one with an empty stop set when they should be kept (see the inline notes below).
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            analyzer = new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET); //Changed from simple to standard to include apostrophe/s
        }
    } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ",
                    false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words.
        } else {
            analyzer = new ShingleAnalyzerWrapper(
                    new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET), length, length, " ", false,
                    false); //Changed from simple to standard to include apostrophe/s
        }
    }

    //Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    int tokenCount = 0;
    tokenStream.reset(); // required before the first incrementToken() on recent Lucene versions
    while (tokenStream.incrementToken()) {

        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); //The actual token term
        nGrams.add(termToken); //Add all ngrams to the ngram LinkedList

        //If n-grams are not allowed to overlap, then increment to point of no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                if (!tokenStream.incrementToken()) {
                    break; // guard: stop if the stream is exhausted mid-skip
                }
            }
        }

    }

    //Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}
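
A hypothetical driver, for illustration (the no-argument constructor and direct field access are assumptions; the real class may expose getters):

NGramExtractor extractor = new NGramExtractor();
extractor.extract("the quick brown fox jumps", 2, false, true);
// length=2, stop words kept, overlapping shingles -- nGrams now holds:
// ["the quick", "quick brown", "brown fox", "fox jumps"]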

From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java

private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    String q = "";
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        q += term + " ";
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.close();
    //        return q;
    q = "";
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold) {
            if (!Functions.isNumeric(k)) {
                q += k + "^" + m.get(k) + " ";
                //                    System.out.println(k);
            }
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q;
}
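
For illustration: if the stream yields the terms lucene, index, lucene and treshold is 2, only lucene clears the cut-off and the method returns the boosted clause "lucene^2 ".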

From source file:nicta.com.au.failureanalysis.query.QueryGneration.java

private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    Map<String, Integer> qterm_freq = new HashMap<>();

    String q = "";
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        q += term + " ";
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.close();
    //        return q;
    q = "";
    //        int count = 0;
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold) {
            if (!Functions.isNumeric(k)) {
                q += k + "^" + m.get(k) + " ";
                qterm_freq.put(k, m.get(k));
                //                    count++;
                //                    System.out.println(count + " " + k + " " + m.get(k));
            }
        }
    }
    //        System.out.println("-------------------");
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    //        return q;
    return qterm_freq;
}

From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java

private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
    Map<String, Double> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int i = 0;
    while (ts.incrementToken()) {
        i++;
        String term = charTermAttribute.toString();
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1.0);
        }
    }
    for (String key : m.keySet()) {
        Term t = new Term(field, key);
        int totalTF = ir.docFreq(t); // document frequency: number of docs containing the term
        int docs = ir.getDocCount("claims");
        double idf = Math.log10((double) docs / (totalTF + 1));
        m.put(key, (m.get(key) / i) * idf); // normalized tf * idf
    }

    return m;
}
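
For illustration: a term occurring twice in a 100-token stream (i = 100) that appears in 9 of 1000 "claims" documents gets the weight (2/100) * log10(1000 / (9 + 1)) = 0.02 * 2 = 0.04.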