List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
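For reference, here is a minimal sketch of the canonical consume pattern that the examples below follow: reset(), an incrementToken() loop, end(), then close(). The helper name tokenize, the field name "field" and the inputs are illustrative placeholders, not taken from any of the listed source files.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenStreamEndSketch {
    // Hypothetical helper: collects the term text of every token produced by the analyzer.
    static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // must be called before the first incrementToken()
            while (ts.incrementToken()) {
                tokens.add(termAtt.toString());
            }
            ts.end();                      // end-of-stream operations, e.g. set the final offset
        }                                  // try-with-resources closes the stream
        return tokens;
    }
}

Using try-with-resources guarantees that close() runs even if reset(), incrementToken() or end() throws, which several of the examples below handle manually in a finally block instead.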
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:com.jaeksoft.searchlib.request.SearchField.java
License:Open Source License
final private List<TermQueryItem> getTermQueryFilter(final PerFieldAnalyzer perFieldAnalyzer,
        CompiledAnalyzer compiledAnalyzer, final String queryString) throws IOException {
    TokenStream ts = null;
    TokenQueryFilter.TermQueryFilter tqf = null;
    Analyzer analyzer = compiledAnalyzer != null ? compiledAnalyzer : perFieldAnalyzer.getKeywordAnalyzer();
    try {
        ts = analyzer.tokenStream(field, new StringReader(queryString));
        tqf = new TermQueryFilter(compiledAnalyzer, field, (float) termBoost, ts);
        while (tqf.incrementToken())
            ;
        ts.end();
        ts.close();
        tqf.sortByOffset();
        TermQueryFilter.includeChildrenBrothers(tqf.termQueryItems);
        for (TermQueryItem termQueryItem : tqf.termQueryItems)
            termQueryItem.includeChildrenBrothers();
        return tqf.termQueryItems;
    } finally {
        IOUtils.close(tqf, ts, analyzer);
    }
}
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
private void addLiteralField(String literal) throws IOException {
    _luceneDocument
            .add(new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);
    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    int length = 0;
    StringBuffer sb = new StringBuffer();
    while (ts.incrementToken()) {
        sb.append("_" + termAttribute.term());
        length++;
    }
    sb.insert(0, length);
    _resourceLength = length;
    ts.end();
    ts.close();
    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag) + s.getValue());
                        } else {
                            items.put(tag, s.getValue());
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Float> getWeightedTagsFreq(String content, String lng) {
    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag)
                                    + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        } else {
                            items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Set<String> getTokens(String content, String lng) {
    Set<String> tokens = new HashSet<String>();
    TokensArray tokArray = new TokensArray(15);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    tokens.add(s.getKey());
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return tokens;
}
From source file:com.lorelib.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer; the 'true' argument enables smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // get the offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (re-binds the StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // end the TokenStream (and the underlying StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    // drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}
From source file:com.nec.scg.senseRanking.CountTextSimilarity.java
public Map<String, Float> CountTF_IDF(String str, Analyzer a) {
    Map<String, Float> termVector = new TreeMap<String, Float>();
    try {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        PorterStemFilter filter = new PorterStemFilter(stream);
        CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        String strcat = null;
        int wordCount = 0;
        while (filter.incrementToken()) {
            strcat = cta.toString();
            // System.out.print("["+strcat+"]");
            if (!termVector.containsKey(strcat)) {
                termVector.put(strcat, 1f);
                wordCount++;
            } else {
                termVector.put(strcat, termVector.get(strcat) + 1);
                wordCount++;
            }
        }
        for (String ter : termVector.keySet()) {
            int hits = searchIndexforIDF(ter) + 1;
            float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0);
            float tf = termVector.get(ter) / wordCount;
            termVector.put(ter, tf * idf);
        }
        filter.end();
        stream.end();
        filter.close();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return termVector;
}
From source file:com.ProcessText.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        // unigram process
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else
                    unigram.put(item, 1);
            }
        }
        // bigram process
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = bigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            // trigram process
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = trigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }
    }
    // Compute probabilities for the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below has been corrected (add-one smoothing)
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        // unigram.size() is the number of distinct unigram vocabulary entries
        unigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua unigram = " + totalUniFreq);
    // Compute probabilities for the bigrams
    HashMap<String, Double> bigramProb = new HashMap<>();
    // Stores the total frequency of bigrams A* that start with unigram A,
    // needed for the conditional probability formula
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();
    int totalBiFreq = 0; // total frequency of all bigrams A* of unigram A
    // Store A*
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // not really necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula (corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        // uniSize = V is the size of the unigram vocabulary
        bigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua bigram = " + totalBiFreq);
    // Compute probabilities for the trigrams
    HashMap<String, Double> trigramProb = new HashMap<>();
    // Stores the total frequency of trigrams AB* that start with bigram AB
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        //try {
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
        //} catch(NullPointerException ex) {
        //}
    }
    System.out.println("Tong tan suat cua trigram = " + totalTriFreq);
    // Write the results out to files
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }
    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    //br = new BufferedReader(new FileReader("D:/phrases10"));
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;
        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
                //f3.append(line + " = " + probOfLine + "\n");
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }
        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.printf("PERPLEXITY = " + df.format(perplexity));
}