Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
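
The typical consumption pattern is: obtain the stream from an analyzer, register the attributes you want to read, call reset(), loop on incrementToken() until it returns false, then call end() and close(). A minimal sketch of that workflow (the analyzer, field name, and input text are illustrative, using the Lucene 3.x-era API that the examples below target):

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    TokenStream stream = analyzer.tokenStream("content", new StringReader("some text to tokenize"));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                   // required before the first call to incrementToken()
    while (stream.incrementToken()) { // false once the stream is exhausted
        System.out.println(termAttr.toString());
    }
    stream.end();   // records end-of-stream state, e.g. the final offset
    stream.close(); // releases the stream's resources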

Usage

From source file:com.mozilla.grouperfish.pig.eval.text.NGramTokenize.java

License:Apache License

@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }
        if (input.size() > 6) {
            langCode = (String) input.get(6);
        }

        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    stopwords, stem, outputUnigrams, minNGram, maxNGram);
        } else {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram);
        }
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // the stream must be reset before consuming
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    stream.end();
    stream.close();

    return output;
}

From source file:com.mozilla.grouperfish.pig.eval.text.Tokenize.java

License:Apache License

@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        if (input.size() > 3) {
            langCode = (String) input.get(3);
        }

        if (langCode.startsWith("zh") || langCode.startsWith("ja")) {
            analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("de")) {
            analyzer = new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("es")) {
            analyzer = new org.apache.lucene.analysis.es.SpanishAnalyzer(Version.LUCENE_31);
        } else {
            if (stopwords != null && stopwords.size() > 0) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stopwords, stem);
            } else {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stem);
            }
        }
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // the stream must be reset before consuming
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    stream.end();
    stream.close();

    return output;
}

From source file:com.mozilla.grouperfish.transforms.coclustering.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {
    // TODO: SMELLY: de-system-ify
    Set<String> stopwords = Dictionary
            .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt"));
    NGramEnglishAnalyzer analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, false, true);
    TokenStream stream = analyzer.tokenStream("",
            new StringReader("When I was growing up this was so much fun."));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // the stream must be reset before consuming
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            System.out.println(termAttr.toString());
            termAttr.setEmpty();
        }
    }
    stream.end();
    stream.close();
}

From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.NGramTokenize.java

License:Apache License

@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }

        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, stem, outputUnigrams, minNGram,
                    maxNGram);
        } else {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, StandardAnalyzer.STOP_WORDS_SET, stem,
                    outputUnigrams, minNGram, maxNGram);
        }
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // the stream must be reset before consuming
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    stream.end();
    stream.close();

    return output;
}

From source file:com.plug.Version_8_5_2.gs.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string using
 * GsAnalyzer.
 * 
 * @param p_text
 *            fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));

    tokenStream.reset();
    // GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    // org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();

        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());

    }
    tokenStream.close();
    return buildTokenList(tokens);
}

From source file:com.ProcessText.java

public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        //unigram process
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else
                    unigram.put(item, 1);
            }
        }
        //bigram process
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = bigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            //trigram process
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = trigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }

    }
    // Compute probabilities for the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below has been corrected
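    // (add-one / Laplace smoothing: P(w) = (count(w) + 1) / (N + V),
    //  where N is the total token count and V the vocabulary size)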
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        // unigram.size() is the number of distinct unigram vocabulary entries
        unigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua unigram = " + totalUniFreq);
    //Tinh xac suat cho cac bigram
    HashMap<String, Double> bigramProb = new HashMap<>();
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>(); // total frequency of bigrams A* starting with unigram A
    // needed for the conditional-probability formula below
    int totalBiFreq = 0; // total frequency over all bigrams A* of each unigram A
    // accumulate the A* totals
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // not really necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional-probability formula
    // (the formula has been corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        // uniSize = V is the size of the unigram vocabulary
        bigramProb.put(item, prob);
    }

    System.out.println("Tong tan suat cua bigram = " + totalBiFreq);
    //Tinh xac suat cho cac trigram
    HashMap<String, Double> trigramProb = new HashMap<>();
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>(); // total frequency of trigrams AB* starting with bigram AB
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional-probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        //try {
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
        //} catch(NullPointerException ex) {

        //}
    }
    System.out.println("Tong tan suat cua trigram = " + totalTriFreq);
    //In ra file
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }

    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    //br = new BufferedReader(new FileReader("D:/phrases10"));
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;

        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
                //f3.append(line + " = " + probOfLine + "\n");
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }

        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
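    // somu is the average negative log2 probability per token (cross-entropy);
    // perplexity = 2^somu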
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.printf("PERPLEXITY = " + df.format(perplexity));
}

From source file:com.qubit.elasticsearch.analysis.url.helper.AnalyzerUtils.java

License:Apache License

public static void displayTokens(TokenStream stream) throws IOException {

    // Token is a concrete class, while addAttribute() accepts only attribute
    // interfaces, so the term text is read through CharTermAttribute instead.
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.print("[" + term.toString() + "] ");
    }
    stream.end();
    stream.close();
}

From source file:com.quest.agent.weibomonitor.weiboMonitorAgentImpl.java

License:Open Source License

private ModelRoot collect(long collectionFreqInMs, String groupName, String sqlQuery, int groupID) {
    Weibo agentRoot = new Weibo(groupName);
    //TODO: collect data and populate the data collected to model(topology) 

    //List<UrlList> urlList = mWrapper.getUrlList();

    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);

    try {

        SQLProcess sql = new SQLProcess();

        ResultSet res = sql.executeQuery(sqlQuery);

        MyTokenFilter tkFilter = new MyTokenFilter();

        while (res.next()) {
            Reader sentence = new StringReader(res.getString("status").toString());

            String weiboID = res.getObject("weiboId").toString();

            if (groupID == 0)
                sql.execute("update status set status.read=1 where weiboId=" + weiboID + ";");
            else
                sql.execute("update status2 set status2.read=1 where weiboId=" + weiboID + ";");

            TokenStream ts = ca.tokenStream("", sentence);
            try {
                ts.reset(); // the stream must be reset before consuming
                while (ts.incrementToken()) {
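                    // Parses the stream's reflective toString() output,
                    // e.g. "(term,start,end,type)", rather than reading a
                    // CharTermAttribute directly.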
                    String ss[] = ts.toString().split(",");
                    ss[0] = ss[0].replace("(", "");
                    if (tkFilter.doFilter(ss[0])) {
                        if (!map[groupID].containsKey(ss[0]))
                            map[groupID].put(ss[0], new Word(1, ss[0]));
                        else
                            map[groupID].get(ss[0]).plusNum();
                    }
                }
            } catch (IOException e) {
                mLogger.debug2("error occurred while incrementToken", e);
            }
        }
    } catch (SQLException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    Word[] wordList = tfidf.doProcess(map[groupID]);

    int mapsize = map[groupID].size();
    for (int i = 0; i < Math.min(mapsize, ItemNumShow); i++) {

        collectWeibo(wordList[i].getWord(), wordList[i].getNum(), wordList[i].getTfIdf(), wordList[i].getIdf(),
                agentRoot);

    }

    return agentRoot;
}

From source file:com.qwazr.search.analysis.AnalyzerUtils.java

License:Apache License

final static public void forEachTerm(Analyzer analyzer, String field, String text, TermConsumer consumer)
        throws IOException {
    Objects.requireNonNull(analyzer, "The analyzer cannot be null");
    Objects.requireNonNull(field, "The field cannot be null");
    Objects.requireNonNull(text, "The text cannot be null");
    final TokenStream tokenStream = analyzer.tokenStream(field, text);
    try {
        final CharTermAttribute charTermAttr = getAttribute(tokenStream, CharTermAttribute.class);
        final FlagsAttribute flagsAttr = getAttribute(tokenStream, FlagsAttribute.class);
        final OffsetAttribute offsetAttr = getAttribute(tokenStream, OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = getAttribute(tokenStream,
                PositionIncrementAttribute.class);
        final PositionLengthAttribute posLengthAttr = getAttribute(tokenStream, PositionLengthAttribute.class);
        final TypeAttribute typeAttr = getAttribute(tokenStream, TypeAttribute.class);
        final KeywordAttribute keywordAttr = getAttribute(tokenStream, KeywordAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken())
            if (!consumer.apply(charTermAttr, flagsAttr, offsetAttr, posIncAttr, posLengthAttr, typeAttr,
                    keywordAttr))
                break;

    } finally {
        tokenStream.close();
    }
}

From source file:com.qwazr.search.query.SpanPositionsQuery.java

License:Apache License

@Override
final public Query getQuery(QueryContext queryContext) throws IOException {

    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    TokenStream tokenStream = queryContext.analyzer.tokenStream(field, queryContext.queryString);
    try {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute pocincrAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        int pos = 0;
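        // For each token, add SHOULD clauses matching the term over the position
        // window [pos - distance, pos + distance], boosted inversely to the distance.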
        while (tokenStream.incrementToken()) {
            final String charTerm = charTermAttribute.toString();
            int start = pos - distance;
            if (start < 0)
                start = 0;
            final int end = pos + distance + 1;
            for (int i = start; i < end; i++) {
                final float dist = Math.abs(i - pos) + 1;
                final float boost = 1 / dist;
                final SpanTermQuery spanTermQuery = new SpanTermQuery(new Term(field, charTerm));
                Query query = new BoostQuery(new SpanPositionRangeQuery(spanTermQuery, i, i + 1), boost);
                builder.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
            }
            pos += pocincrAttribute.getPositionIncrement();
        }
        return builder.build();
    } finally {
        IOUtils.closeQuietly(tokenStream);
    }
}