List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
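addAttribute returns the stream's single shared instance of the requested attribute interface, creating and registering it if it does not exist yet; that same object is then updated in place by every call to incrementToken(). The canonical consume loop looks like the sketch below (a minimal illustration, assuming Lucene 5.x or later; StandardAnalyzer and the field name "field" are placeholders, not part of any example on this page):

Analyzer analyzer = new StandardAnalyzer();
try (TokenStream stream = analyzer.tokenStream("field", new StringReader("some text to tokenize"))) {
    // Request the shared CharTermAttribute before consuming the stream.
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                   // required before the first incrementToken()
    while (stream.incrementToken()) { // termAtt now holds the current term's characters
        System.out.println(termAtt.toString());
    }
    stream.end();                     // record end-of-stream state (e.g. final offset)
}                                     // try-with-resources closes the stream

Most of the examples below repeat some variant of this reset/incrementToken/end/close sequence.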
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.NGramTokenize.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }
        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, stem, outputUnigrams,
                    minNGram, maxNGram);
        } else {
            analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, StandardAnalyzer.STOP_WORDS_SET, stem,
                    outputUnigrams, minNGram, maxNGram);
        }
    }
    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // not enforced by Lucene 3.1, but required by the TokenStream contract
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            Tuple t = tupleFactory.newTuple(termAttr.toString());
            output.add(t);
            termAttr.setEmpty();
        }
    }
    return output;
}
From source file:com.ProcessText.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    HashMap<String, Integer> unigram = new HashMap<>();
    HashMap<String, Integer> bigram = new HashMap<>();
    HashMap<String, Integer> trigram = new HashMap<>();
    BufferedReader br = new BufferedReader(new FileReader("D:/phrases90"));
    String line;
    Analyzer biAnalyzer = new NGramTokenBaseAnalyzer(2, 2, false);
    Analyzer triAnalyzer = new NGramTokenBaseAnalyzer(3, 3, false);
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Strip a few emoticons and markers
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        // Unigram counts
        String[] arr = line.split("\\s");
        for (String item : arr) {
            item = item.replaceAll("\\s", "");
            if (item.length() > 0) {
                item = item.toLowerCase();
                Integer freq = unigram.get(item);
                if (freq != null) {
                    unigram.put(item, freq + 1);
                } else {
                    unigram.put(item, 1);
                }
            }
        }
        // Bigram counts
        if (line.length() > 0) {
            TokenStream stream = biAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String item = cattr.toString();
                Integer count = bigram.get(item);
                int fcount = (count == null) ? 1 : count + 1;
                if (item.length() > 3)
                    bigram.put(item, fcount);
            }
            stream.end();
            stream.close();
            // Trigram counts
            TokenStream stream1 = triAnalyzer.tokenStream(null, new StringReader(line));
            CharTermAttribute cattr1 = stream1.addAttribute(CharTermAttribute.class);
            stream1.reset();
            while (stream1.incrementToken()) {
                String item = cattr1.toString();
                Integer count = trigram.get(item);
                int fcount = (count == null) ? 1 : count + 1;
                if (item.length() > 5)
                    trigram.put(item, fcount);
            }
            stream1.end();
            stream1.close();
        }
    }
    // Unigram probabilities (Laplace-smoothed; this formula was corrected)
    HashMap<String, Double> unigramProb = new HashMap<>();
    int totalUniFreq = 0;
    int uniSize = unigram.size();
    for (String item : unigram.keySet()) {
        totalUniFreq += unigram.get(item);
    }
    for (String item : unigram.keySet()) {
        int freq = unigram.get(item);
        // uniSize is the number of distinct unigram types in the vocabulary
        double prob = ((double) freq + 1) / (totalUniFreq + uniSize);
        unigramProb.put(item, prob);
    }
    System.out.println("Total unigram frequency = " + totalUniFreq);
    // Bigram probabilities
    HashMap<String, Double> bigramProb = new HashMap<>();
    // Total frequency of the bigrams A* that start with unigram A,
    // needed for the conditional probability formula
    HashMap<String, Integer> startUnigramOfBigram = new HashMap<>();
    int totalBiFreq = 0;
    for (String item : bigram.keySet()) {
        item = item.toLowerCase();
        int freq = bigram.get(item);
        totalBiFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase(); // not strictly necessary
        Integer startFreq = startUnigramOfBigram.get(key);
        if (startFreq == null)
            startUnigramOfBigram.put(key, freq);
        else
            startUnigramOfBigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula (corrected)
    for (String item : bigram.keySet()) {
        int freq = bigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0].toLowerCase();
        int startUniFreq = startUnigramOfBigram.get(key);
        double startUniProb;
        try {
            startUniProb = unigramProb.get(key);
        } catch (NullPointerException ex) {
            startUniProb = 1d / (1 + uniSize);
        }
        // uniSize = V, the size of the unigram vocabulary
        double prob = (((double) freq + 1) / (startUniFreq + uniSize)) * startUniProb;
        bigramProb.put(item, prob);
    }
    System.out.println("Total bigram frequency = " + totalBiFreq);
    // Trigram probabilities
    HashMap<String, Double> trigramProb = new HashMap<>();
    // Total frequency of the trigrams AB* that start with bigram AB
    HashMap<String, Integer> startBigramOfTrigram = new HashMap<>();
    int totalTriFreq = 0;
    for (String item : trigram.keySet()) {
        int freq = trigram.get(item);
        totalTriFreq += freq;
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        Integer startFreq = startBigramOfTrigram.get(key);
        if (startFreq == null)
            startBigramOfTrigram.put(key, freq);
        else
            startBigramOfTrigram.put(key, freq + startFreq);
    }
    // Apply the conditional probability formula
    for (String item : trigram.keySet()) {
        double startBiProb;
        int freq = trigram.get(item);
        String[] arr = item.split("\\s");
        String key = arr[0] + " " + arr[1];
        int startBiFreq = startBigramOfTrigram.get(key);
        try {
            startBiProb = bigramProb.get(key);
        } catch (NullPointerException ex) {
            startBiProb = 1d / (878592 + uniSize);
        }
        double prob = (((double) freq + 1) / (startBiFreq + uniSize)) * startBiProb;
        trigramProb.put(item, prob);
    }
    System.out.println("Total trigram frequency = " + totalTriFreq);
    // Write the probabilities to files
    PrintWriter f0 = new PrintWriter(new FileWriter("D:/App/unigramProb.txt"));
    PrintWriter f1 = new PrintWriter(new FileWriter("D:/App/bigramProb.txt"));
    PrintWriter f2 = new PrintWriter(new FileWriter("D:/App/trigramProb.txt"));
    for (String item : unigramProb.keySet()) {
        f0.append(item + " = " + unigramProb.get(item) + "\n");
    }
    f0.close();
    for (String item : bigramProb.keySet()) {
        f1.append(item + " = " + bigramProb.get(item) + "\n");
    }
    f1.close();
    for (String item : trigramProb.keySet()) {
        f2.append(item + " = " + trigramProb.get(item) + "\n");
    }
    f2.close();
    // Score a held-out file and compute perplexity
    PrintWriter f3 = new PrintWriter(new FileWriter("D:/App/stringProb.txt"));
    br = new BufferedReader(new FileReader("D:/phrases10"));
    HashMap<String, Integer> prefix3Gram = new HashMap<>();
    HashMap<String, Integer> phrases = new HashMap<>();
    while ((line = br.readLine()) != null) {
        line = line.replaceAll("\\s+", " ").trim();
        // Strip the same emoticons and markers
        line = line.replaceAll("<3", "");
        line = line.replaceAll(":3", "");
        line = line.replaceAll(":v", "");
        line = line.replaceAll(":d", "");
        line = line.replaceAll(":D", "");
        line = line.replaceAll("p/s:", "");
        line = line.replaceAll(":\\)", "");
        String[] arr = line.split("\\s");
        if (arr.length > 2) {
            String prefix = arr[0] + " " + arr[1] + " " + arr[2];
            Integer prefixFreq = prefix3Gram.get(prefix);
            if (prefixFreq == null)
                prefix3Gram.put(prefix, 1);
            else
                prefix3Gram.put(prefix, 1 + prefixFreq);
        }
        Integer freq = phrases.get(line);
        if (freq == null)
            phrases.put(line, 1);
        else
            phrases.put(line, freq + 1);
    }
    double totalProb = 0;
    int countItem = 0;
    for (String item : phrases.keySet()) {
        line = item;
        Integer lineFreq = phrases.get(item);
        if (lineFreq == null)
            lineFreq = 1;
        String[] arr = line.split("\\s");
        String prefix = line;
        double probOfLine = 1d * lineFreq / (uniSize + totalTriFreq / uniSize);
        int length = arr.length;
        if (length >= 3) {
            prefix = arr[0] + " " + arr[1] + " " + arr[2];
            int prefixTotal = prefix3Gram.get(prefix);
            try {
                double prefixProb = trigramProb.get(prefix);
                probOfLine = prefixProb;
                // Chain the remaining trigrams of the phrase
                for (int i = 3; i < length; i++) {
                    prefix = arr[i - 2] + " " + arr[i - 1] + " " + arr[i];
                    prefixTotal = prefix3Gram.get(prefix);
                    prefixProb = trigramProb.get(prefix);
                    probOfLine *= (1d / prefixTotal) * prefixProb;
                }
            } catch (NullPointerException ex) {
                probOfLine = 1d * lineFreq / (prefixTotal + uniSize);
            }
        }
        f3.append(line + " = " + probOfLine + "\n");
        countItem += arr.length;
        totalProb -= (Math.log(probOfLine) / Math.log(2));
    }
    // somu is the average negative log2 probability per token
    double somu = totalProb / countItem;
    double perplexity = Math.pow(2, somu);
    f3.close();
    DecimalFormat df = new DecimalFormat("#");
    df.setMaximumFractionDigits(4);
    System.out.println(somu);
    System.out.printf("PERPLEXITY = " + df.format(perplexity));
}
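For reference, the last few lines of this example derive per-token perplexity from the accumulated log probabilities. With N = countItem tokens and P(s_i) the probability assigned to phrase s_i, the code evaluates somu = -(1/N) * sum_i log2 P(s_i), the average negative log2 probability (a cross-entropy), and then PERPLEXITY = 2^somu. (This restates what the code above computes; it is not part of the original source.)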
From source file:com.qubit.elasticsearch.analysis.url.helper.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException {
    // Note: Token is an attribute implementation class, not an attribute interface.
    // On modern Lucene versions addAttribute() accepts only interfaces such as
    // CharTermAttribute and rejects Token.class at runtime.
    Token term = stream.addAttribute(Token.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.toString() + "] ");
    }
}
From source file:com.rubenlaguna.en4j.searchlucene.AnalyzerUtils.java
License:Open Source License
public static void displayTokens(TokenStream stream) throws IOException {
    // TermAttribute is the term API of Lucene 3.x and earlier; it was deprecated
    // in 3.1 and replaced by CharTermAttribute in 4.0. The cast is redundant.
    TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] ");
    }
}
From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java
License:Apache License
/**
 * @param contentText input text to be parsed into terms
 * @return salient terms in order of appearance
 *         (or null if this content should be ignored)
 */
public List<String> getTermList(String contentText) {
    init();
    List<String> result = new ArrayList<String>(contentText.length() / 10);
    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                // Here we skip runs of position increment markers created
                // by the ShingleFilter for stop words because they skew
                // the clustering/liblinear analysis.
                if (!term.matches("(_ )*_")) {
                    result.add(term);
                }
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }
    return result;
}
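The "(_ )*_" pattern above exists because ShingleFilter fills the position gaps left by removed stop words with its default filler token "_". A standalone sketch of that behavior (my own illustration, assuming Lucene 5.x+; this analyzer chain is hypothetical and not the one LuceneAnalyzer actually uses):

Analyzer shingleAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // Remove English stop words, then build 2-word shingles over the result.
        TokenStream sink = new StopFilter(source, EnglishAnalyzer.getDefaultStopSet());
        sink = new ShingleFilter(sink, 2, 2);
        return new TokenStreamComponents(source, sink);
    }
};
try (TokenStream ts = shingleAnalyzer.tokenStream("content", "quick brown the fox")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // Where "the" was removed, the shingles contain the "_" filler,
        // e.g. "brown _" and "_ fox" alongside "quick brown".
        System.out.println(term.toString());
    }
    ts.end();
}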
From source file:com.searchbox.SuggeterDataStructureBuilder.java
License:Apache License
private String[] getTokens(String fulltext) {
    LinkedList<String> tokens = new LinkedList<String>();
    try {
        TokenStream tokenStream = analyzer.tokenStream(fields[0], new StringReader(fulltext));
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            String token = charTermAttribute.toString();
            tokens.add(token);
        }
        tokenStream.end();
        tokenStream.close(); // the original leaked the stream; release it once consumed
    } catch (IOException ex) {
        LOGGER.error("Failure reading tokens from stream", ex);
    }
    return tokens.toArray(new String[0]);
}
From source file:com.searchcode.app.util.CodeAnalyzer.java
License:Open Source License
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of the TokenStream API";
    CodeAnalyzer analyzer = new CodeAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        // print all tokens until the stream is exhausted
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close();
    }
}
From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java
License:Apache License
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        assertThat(new TokenInfo(term.toString(), pos)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}
From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java
License:Apache License
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}
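A note on the payload handling (my reading of the test, not documented in the snippet itself): the filter under test evidently stores each annotation's span length as a variable-length int in the token payload, so the assertion decodes it with ByteArrayDataInput.readVInt() and compares it against TokenInfo.len, with -1 meaning no payload is expected on that token.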
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));
    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymGraphFilter(tok, synMap, true);
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
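A usage note on the output (my reading of the example, not part of the original source): SynonymGraphFilter emits injected synonyms as graph tokens, so a term such as "color" that matches the multi-word rule "dark sea green" starts at the same position as "dark" (position increment 0) and reports a position length of 3, spanning the three replaced words, while a synonym for a single-token match like "green" reports posLen=1. This is why the example tracks PositionLengthAttribute alongside PositionIncrementAttribute.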