Example usage for org.apache.lucene.analysis TokenStream close

Introduction

On this page you can find example usage for org.apache.lucene.analysis.TokenStream.close().

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
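
A correct consumer of a TokenStream follows a fixed lifecycle: reset() before the first incrementToken(), end() after the last one, and close() to release resources, even when an earlier call throws. Because TokenStream implements Closeable, try-with-resources is the simplest way to guarantee the close() call. Below is a minimal sketch of that lifecycle; the analyzer, field name, and input text are placeholders, not taken from the examples that follow.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources invokes close() even if reset()/incrementToken() throws
        try (TokenStream stream = analyzer.tokenStream("body", "some text to tokenize")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                    // records final offset/position state
        }
        analyzer.close();
    }
}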

Usage

From source file:com.pongasoft.kiwidoc.index.impl.keyword.impl.KeywordIndexImpl.java

License:Apache License

/**
 * Generates a simple query: a boolean query made of TermQuery clauses
 * joined with AND.
 *
 * @param keyword the keyword to tokenize
 * @param field the field to build term queries against
 * @return <code>null</code> if there are no terms
 * @throws ParseException
 */
private Query generateSimpleQuery(String keyword, String field) throws ParseException {
    int termCount = 0;
    TokenStream source = _analyzer.tokenStream(field, new StringReader(keyword));

    BooleanQuery q = new BooleanQuery();
    org.apache.lucene.analysis.Token t = new org.apache.lucene.analysis.Token();

    while (true) {
        try {
            t = source.next(t);
        } catch (IOException e) {
            if (log.isDebugEnabled())
                log.debug("ingnored exception", e);

            t = null;
        }

        if (t == null)
            break;

        termCount++;
        q.add(new TermQuery(new Term(field, t.term())), BooleanClause.Occur.MUST);
    }
    try {
        source.close();
    } catch (IOException e) {
        if (log.isDebugEnabled())
            log.debug("ingnored exception", e);
    }

    if (termCount == 0) {
        return null;
    }

    BooleanClause[] clauses = q.getClauses();

    if (clauses != null && clauses.length == 1)
        return clauses[0].getQuery();

    return q;
}
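
This example targets the pre-2.9 Lucene token API, where next(Token) returns null at end of stream and close() is the only lifecycle call. On Lucene 5.x and later the same loop would use the attribute API, the reset()/end()/close() contract, and BooleanQuery.Builder. A hedged sketch of an equivalent, assuming the same _analyzer field as above and omitting the single-clause simplification:

// Sketch of the same query construction on the modern attribute API (Lucene 5.x+).
// _analyzer is assumed to be the same analyzer field as in the example above.
private Query generateSimpleQuery(String keyword, String field) {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    int termCount = 0;
    try (TokenStream source = _analyzer.tokenStream(field, keyword)) {
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();
        while (source.incrementToken()) {
            termCount++;
            builder.add(new TermQuery(new Term(field, termAtt.toString())), BooleanClause.Occur.MUST);
        }
        source.end();
    } catch (IOException e) {
        // tokenizing an in-memory string should not throw; ignored, as in the original
    }
    return termCount == 0 ? null : builder.build();
}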

From source file:com.ProcessText.java

public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        //unigram process
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else
                    unigram.put(item, 1);
            }
        }
        //bigram process
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = bigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            //trigram process
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                //item = item.replaceAll("$[\\s]","");
                Integer count = trigram.get(item);
                int fcount = 0;
                if (count == null)
                    fcount = 1;
                else
                    fcount = count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }

    }
    // Compute probabilities for the unigrams
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        totalUniFreq += freq;
    }
    // The probability formula below has been corrected (add-one smoothing)
    for (String item : unigram.keySet()) {
        item = item.toLowerCase();
        int freq = unigram.get(item);
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        // unigram.size() is the number of distinct unigram terms
        unigramProb.put(item, prob);
    }
    System.out.println("Tong tan suat cua unigram = " + totalUniFreq);
    //Tinh xac suat cho cac bigram
    HashMap<String, Double> bigramProb = new HashMap<>();
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();// Luu tong tan suat cua A* bat dau boi unigram A
    //De phuc vu cong thuc xac suat co dieu kien
    int totalBiFreq = 0;//Tinh tong tan suat cua toan bo bigram A* cua unigram A
    //Luu A*
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();// not strictly necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    // (the formula has been corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        // uniSize = V is the size of the unigram vocabulary
        bigramProb.put(item, prob);
    }

    System.out.println("Tong tan suat cua bigram = " + totalBiFreq);
    //Tinh xac suat cho cac trigram
    HashMap<String, Double> trigramProb = new HashMap<>();
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();// Luu tong tan suat cua AB* bat dau boi bigram AB
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        //try {
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
        //} catch(NullPointerException ex) {

        //}
    }
    System.out.println("Tong tan suat cua trigram = " + totalTriFreq);
    //In ra file
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        double freq = unigramProb.get(item);
        f0.append(item + " = " + freq + "\n");
    }

    f0.close();
    for (String item : bigramProb.keySet()) {
        double freq = bigramProb.get(item);
        f1.append(item + " = " + freq + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        double freq = trigramProb.get(item);
        f2.append(item + " = " + freq + "\n");
    }
    f2.close();
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Remove some emoticons and symbols
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    //br = new BufferedReader(new FileReader("D:/phrases10"));
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;

        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                if (length > 3) {
                    for (int i = 3; i < length; i++) {
                        prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                        prefixTotal = prefix3Gram.get(prefix);
                        prefixProb = trigramProb.get(prefix);
                        probOfLine *= (1d / prefixTotal) * prefixProb;
                    }
                }
                //f3.append(line + " = " + probOfLine + "\n");
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }

        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    double somu = totalProb / countItem; // cross-entropy: average -log2 probability per token
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.println("PERPLEXITY = " + df.format(perplexity));
}
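
The estimates above are add-one (Laplace) smoothed, P(w) = (count(w) + 1) / (N + V), where N is the total token count and V the vocabulary size, and the final score is perplexity: 2 raised to the average negative log2 probability per token. A standalone sketch of that last computation (the inputs are hypothetical, not read from the files above):

// Minimal sketch: perplexity = 2^H, where H is the cross-entropy in bits per token,
// i.e. the average of -log2(p) over all scored tokens. Inputs are hypothetical.
static double perplexity(double[] phraseProbs, int totalTokens) {
    double negLog2Sum = 0;
    for (double p : phraseProbs) {
        negLog2Sum -= Math.log(p) / Math.log(2);  // natural log converted to log2
    }
    return Math.pow(2, negLog2Sum / totalTokens);
}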

From source file:com.qwazr.search.analysis.AnalyzerUtils.java

License:Apache License

final static public void forEachTerm(Analyzer analyzer, String field, String text, TermConsumer consumer)
        throws IOException {
    Objects.requireNonNull(analyzer, "The analyzer cannot be null");
    Objects.requireNonNull(field, "The field cannot be null");
    Objects.requireNonNull(text, "The text cannot be null");
    final TokenStream tokenStream = analyzer.tokenStream(field, text);
    try {
        final CharTermAttribute charTermAttr = getAttribute(tokenStream, CharTermAttribute.class);
        final FlagsAttribute flagsAttr = getAttribute(tokenStream, FlagsAttribute.class);
        final OffsetAttribute offsetAttr = getAttribute(tokenStream, OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = getAttribute(tokenStream,
                PositionIncrementAttribute.class);
        final PositionLengthAttribute posLengthAttr = getAttribute(tokenStream, PositionLengthAttribute.class);
        final TypeAttribute typeAttr = getAttribute(tokenStream, TypeAttribute.class);
        final KeywordAttribute keywordAttr = getAttribute(tokenStream, KeywordAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken())
            if (!consumer.apply(charTermAttr, flagsAttr, offsetAttr, posIncAttr, posLengthAttr, typeAttr,
                    keywordAttr))
                break;

    } finally {
        tokenStream.close();
    }
}
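
A hedged usage sketch for forEachTerm follows; TermConsumer's shape (seven attribute parameters, a boolean return where false stops iteration) is inferred from the call site above and assumed to be a functional interface. The analyzer and input text are placeholders.

// Hypothetical call; TermConsumer's signature is inferred from forEachTerm's body above.
public static void printOffsets() throws IOException {
    AnalyzerUtils.forEachTerm(new StandardAnalyzer(), "content", "some sample text",
            (term, flags, offset, posInc, posLen, type, keyword) -> {
                System.out.println(term + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
                return true; // returning false would stop the iteration early
            });
}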

From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java

License:Open Source License

public List<String> parseTerms(final String text, final Analyzer analyzer) {
    final List<String> terms = new LinkedList<String>();
    try {
        final TokenStream tokenStream = analyzer.tokenStream(null, text);
        tokenStream.reset();
        final Attribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            final String term = attribute.toString();
            terms.add(term);
        }
        tokenStream.end();
        tokenStream.close();
    } catch (final IOException e) {
        // Cannot be thrown due to usage of a StringReader
    }
    return terms;
}

From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java

License:Apache License

/**
 * @param contentText input text to be parsed into terms
 * @return salient terms in order of appearance
 * (or null if this content should be ignored)
 */
public List<String> getTermList(String contentText) {
    init();
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                // Here we skip runs of position increment markers created
                // by the ShingleFilter for stop words because they skew
                // the clustering/liblinear analysis.
                if (!term.matches("(_ )*_")) {
                    result.add(term);
                }
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:com.searchcode.app.util.CodeAnalyzer.java

License:Open Source License

public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the TokenStream API";

    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();

        // print all tokens until stream is exhausted
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }

        stream.end();
    } finally {
        stream.close();
    }
}

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}

From source file:com.shaie.SynonymFilterExample.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymGraphFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}

From source file:com.sindicetech.siren.analysis.filter.TestURINormalisationFilter.java

License:Open Source License

public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes) throws Exception {

    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    t.setReader(new StringReader(input));
    t.reset();

    final TokenStream filter = new URINormalisationFilter(t);

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", filter.incrementToken());

        assertEquals(expectedImages[i], termAtt.toString());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

    }

    assertFalse("end of stream", filter.incrementToken());
    filter.end();
    filter.close();
}

From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java

License:Open Source License

public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();

    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }

    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", t.incrementToken());

        assertEquals("i=" + i, expectedImages[i], termAtt.toString());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }

        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }

    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}