List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
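Before the per-project examples below, here is a minimal sketch of the standard consumption loop around incrementToken(): register the attributes you need, call reset() once, loop until incrementToken() returns false, then call end() and close(). The field name, sample text, and the use of StandardAnalyzer are illustrative only, and the no-argument StandardAnalyzer constructor assumes a reasonably recent Lucene release; note that several of the older Lucene 3.x examples below skip reset(), which newer releases generally reject.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream; the field name and text are arbitrary
        try (TokenStream stream = analyzer.tokenStream("body", "Hello token streams in Lucene")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // required before the first incrementToken()
            while (stream.incrementToken()) {  // advances to the next token; false at end of input
                System.out.println(term.toString());
            }
            stream.end();                      // records final offset/position state
        }
    }
}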
From source file:com.mozilla.grouperfish.pig.eval.text.NGramTokenize.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }
        if (input.size() > 6) {
            langCode = (String) input.get(6);
        }
        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    stopwords, stem, outputUnigrams, minNGram, maxNGram);
        } else {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram);
        }
    }
    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    return output;
}
From source file:com.mozilla.grouperfish.pig.eval.text.Tokenize.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        if (input.size() > 3) {
            langCode = (String) input.get(3);
        }
        if (langCode.startsWith("zh") || langCode.startsWith("ja")) {
            analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("de")) {
            analyzer = new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("es")) {
            analyzer = new org.apache.lucene.analysis.es.SpanishAnalyzer(Version.LUCENE_31);
        } else {
            if (stopwords != null && stopwords.size() > 0) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stopwords, stem);
            } else {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stem);
            }
        }
    }
    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    return output;
}
From source file:com.mozilla.grouperfish.transforms.coclustering.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    // TODO: SMELLY: de-system-ify
    Set<String> stopwords = Dictionary
            .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt"));
    NGramEnglishAnalyzer analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, false, true);
    TokenStream stream = analyzer.tokenStream("",
            new StringReader("When I was growing up this was so much fun."));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            System.out.println(termAttr.toString());
            termAttr.setEmpty();
        }
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.NGramTokenize.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }
        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, stem, outputUnigrams, minNGram,
                    maxNGram);
        } else {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, StandardAnalyzer.STOP_WORDS_SET, stem,
                    outputUnigrams, minNGram, maxNGram);
        }
    }
    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    return output;
}
From source file:com.plug.Version_8_5_2.gs.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Create GlobalSight TM tokens from a provided segment string using
 * GsAnalyzer.
 *
 * @param p_text
 *            fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();
    // GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    // org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();
    return buildTokenList(tokens);
}
From source file:com.ProcessText.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove a few symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        // unigram process
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else
                    unigram.put(item, 1);
            }
        }
        // bigram process
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = bigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            // trigram process
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = trigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }
    }
    // Compute the probabilities of the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below has been revised
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        // unigram.size() is the number of distinct unigram vocabulary entries
        unigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua unigram = " + totalUniFreq);
    // Compute the probabilities of the bigrams
    HashMap<String, Double> bigramProb = new HashMap<>();
    // Stores the total frequency of all bigrams A* that start with unigram A,
    // needed for the conditional probability formula
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();
    int totalBiFreq = 0; // total frequency over all bigrams A* of unigram A
    // Store A*
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // not strictly necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    // The formula has been revised
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        // uniSize = V is the size of the unigram vocabulary
        bigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua bigram = " + totalBiFreq);
    // Compute the probabilities of the trigrams
    HashMap<String, Double> trigramProb = new HashMap<>();
    // Stores the total frequency of all trigrams AB* that start with bigram AB
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        //try {
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
        //} catch(NullPointerException ex) {
        //}
    }
    System.out.println("Tong tan suat cua trigram = " + totalTriFreq);
    // Write the results out to files
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }
    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove a few symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    //br = new BufferedReader(new FileReader("D:/phrases10"));
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;
        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
                //f3.append(line + " = " + probOfLine + "\n");
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }
        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.printf("PERPLEXITY = " + df.format(perplexity));
}
From source file:com.qubit.elasticsearch.analysis.url.helper.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException {
    Token term = stream.addAttribute(Token.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.toString() + "] ");
    }
}
From source file:com.quest.agent.weibomonitor.weiboMonitorAgentImpl.java
License:Open Source License
private ModelRoot collect(long collectionFreqInMs, String groupName, String sqlQuery, int groupID) {
    Weibo agentRoot = new Weibo(groupName);
    //TODO: collect data and populate the data collected to model(topology)
    //List<UrlList> urlList = mWrapper.getUrlList();
    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
    try {
        SQLProcess sql = new SQLProcess();
        ResultSet res = sql.executeQuery(sqlQuery);
        MyTokenFilter tkFilter = new MyTokenFilter();
        while (res.next()) {
            Reader sentence = new StringReader(res.getString("status").toString());
            String weiboID = res.getObject("weiboId").toString();
            if (groupID == 0)
                sql.execute("update status set status.read=1 where weiboId=" + weiboID + ";");
            else
                sql.execute("update status2 set status2.read=1 where weiboId=" + weiboID + ";");
            TokenStream ts = ca.tokenStream("", sentence);
            try {
                while (ts.incrementToken()) {
                    String ss[] = ts.toString().split(",");
                    ss[0] = ss[0].replace("(", "");
                    if (tkFilter.doFilter(ss[0])) {
                        if (!map[groupID].containsKey(ss[0]))
                            map[groupID].put(ss[0], new Word(1, ss[0]));
                        else
                            map[groupID].get(ss[0]).plusNum();
                    }
                }
            } catch (IOException e) {
                mLogger.debug2("error occurred while incrementToken", e);
            }
        }
    } catch (SQLException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    Word[] wordList = tfidf.doProcess(map[groupID]);
    int mapsize = map[groupID].size();
    for (int i = 0; i < Math.min(mapsize, ItemNumShow); i++) {
        collectWeibo(wordList[i].getWord(), wordList[i].getNum(), wordList[i].getTfIdf(), wordList[i].getIdf(),
                agentRoot);
    }
    return agentRoot;
}
From source file:com.qwazr.search.analysis.AnalyzerUtils.java
License:Apache License
final static public void forEachTerm(Analyzer analyzer, String field, String text, TermConsumer consumer)
        throws IOException {
    Objects.requireNonNull(analyzer, "The analyzer cannot be null");
    Objects.requireNonNull(field, "The field cannot be null");
    Objects.requireNonNull(text, "The text cannot be null");
    final TokenStream tokenStream = analyzer.tokenStream(field, text);
    try {
        final CharTermAttribute charTermAttr = getAttribute(tokenStream, CharTermAttribute.class);
        final FlagsAttribute flagsAttr = getAttribute(tokenStream, FlagsAttribute.class);
        final OffsetAttribute offsetAttr = getAttribute(tokenStream, OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = getAttribute(tokenStream, PositionIncrementAttribute.class);
        final PositionLengthAttribute posLengthAttr = getAttribute(tokenStream, PositionLengthAttribute.class);
        final TypeAttribute typeAttr = getAttribute(tokenStream, TypeAttribute.class);
        final KeywordAttribute keywordAttr = getAttribute(tokenStream, KeywordAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken())
            if (!consumer.apply(charTermAttr, flagsAttr, offsetAttr, posIncAttr, posLengthAttr, typeAttr,
                    keywordAttr))
                break;
    } finally {
        tokenStream.close();
    }
}
From source file:com.qwazr.search.query.SpanPositionsQuery.java
License:Apache License
@Override
final public Query getQuery(QueryContext queryContext) throws IOException {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    TokenStream tokenStream = queryContext.analyzer.tokenStream(field, queryContext.queryString);
    try {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute pocincrAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        int pos = 0;
        while (tokenStream.incrementToken()) {
            final String charTerm = charTermAttribute.toString();
            int start = pos - distance;
            if (start < 0)
                start = 0;
            final int end = pos + distance + 1;
            for (int i = start; i < end; i++) {
                final float dist = Math.abs(i - pos) + 1;
                final float boost = 1 / dist;
                final SpanTermQuery spanTermQuery = new SpanTermQuery(new Term(field, charTerm));
                Query query = new BoostQuery(new SpanPositionRangeQuery(spanTermQuery, i, i + 1), boost);
                builder.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
            }
            pos += pocincrAttribute.getPositionIncrement();
        }
        return builder.build();
    } finally {
        IOUtils.closeQuietly(tokenStream);
    }
}