Example usage for the org.apache.lucene.analysis.cjk.CJKWidthFilter constructor

List of usage examples for the org.apache.lucene.analysis.cjk.CJKWidthFilter constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.cjk.CJKWidthFilter constructor.

Prototype

public CJKWidthFilter(TokenStream input) 

Source Link

Usage

From source file:com.github.buzztaiki.lucene.lastuni.CJKLastUniGramAnalyzer.java

License:Apache License

/**
 * Assembles the analysis chain for a field.
 *
 * <p>The chain is, in order: {@code StandardTokenizer} -> {@code CJKWidthFilter} ->
 * {@code LowerCaseFilter} -> {@code CJKBigramFilter} -> {@code CJKLastUniGramFilter} ->
 * {@code StopFilter}.
 *
 * @param fieldName name of the field being analysed (not used by this implementation)
 * @param reader    source of the text to tokenize
 * @return the tokenizer paired with the last filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

    // Each stage wraps the previous one; the order matches the chain listed above.
    final TokenStream widthFiltered = new CJKWidthFilter(tokenizer);
    final TokenStream lowerCased = new LowerCaseFilter(matchVersion, widthFiltered);
    final TokenStream bigrams = new CJKBigramFilter(lowerCased);
    final TokenStream withLastUniGram = new CJKLastUniGramFilter(bigrams, tokenizeLastUni);
    final TokenStream withoutStopwords = new StopFilter(matchVersion, withLastUniGram, stopwords);

    return new TokenStreamComponents(tokenizer, withoutStopwords);
}

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.JapaneseAnalyzer.java

License:Apache License

/**
 * Assembles the Japanese analysis chain for a field.
 *
 * <p>The chain is, in order: {@code JapaneseTokenizer} -> {@code JapaneseBaseFormFilter} ->
 * {@code JapanesePartOfSpeechStopFilter} -> {@code CJKWidthFilter} -> {@code StopFilter} ->
 * {@code JapaneseKatakanaStemFilter} -> {@code LowerCaseFilter}.
 *
 * @param fieldName name of the field being analysed (not used by this implementation)
 * @return the tokenizer paired with the last filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new JapaneseTokenizer(userDict, true, mode);

    // Each stage wraps the previous one; the order matches the chain listed above.
    final TokenStream baseForms = new JapaneseBaseFormFilter(source);
    final TokenStream posFiltered = new JapanesePartOfSpeechStopFilter(baseForms, stoptags);
    final TokenStream widthFiltered = new CJKWidthFilter(posFiltered);
    final TokenStream withoutStopwords = new StopFilter(widthFiltered, stopwords);
    final TokenStream katakanaStemmed = new JapaneseKatakanaStemFilter(withoutStopwords);
    final TokenStream lowerCased = new LowerCaseFilter(katakanaStemmed);

    return new TokenStreamComponents(source, lowerCased);
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Tokenises {@code originalText} with a Japanese analysis chain and records the start
 * offset of every token in {@code tokenMap}.
 *
 * <p>Before tokenising, every span matching the exclusion pattern is overwritten with the
 * same number of spaces, so token offsets still refer to positions in {@code originalText}.
 * The exclusion pattern always covers runs of two or more dashes, runs of two or more full
 * stops, and full stops followed by whitespace. When {@code ignoredElements} is non-empty it
 * also covers self-closing and open/close markup elements of that name, and when
 * {@code tagIndexing} is false it covers any {@code <...>} tag.
 *
 * <p>The token stream is consumed with the {@code reset()} / {@code incrementToken()} /
 * {@code end()} / {@code close()} sequence and is always closed, even on failure.
 *
 * @throws IOException if the token stream fails while reading or closing
 */
public void tokenise() throws IOException {
    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used).
    if (ignoredElements != null && ignoredElements.length() > 0) {
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    }
    if (!tagIndexing) {
        ignregexp = ignregexp + "|<.*?>";
    }

    // The matcher runs over the untouched original text; the blanking below is
    // length-preserving, so match positions stay valid for the working copy.
    Matcher igns = Pattern.compile(ignregexp).matcher(originalText);
    StringBuilder tx = new StringBuilder(originalText);
    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose) {
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        }
        for (int j = s; j < e; j++) {
            tx.setCharAt(j, ' ');
        }
    }
    if (verbose) {
        PrintUtil.donePrinting();
    }

    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(tx.toString()), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    // NOTE: JapanesePartOfSpeechStopFilter, StopFilter and LowerCaseFilter are not applied here.
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    stream = new CJKWidthFilter(stream);
    stream = new JapaneseKatakanaStemFilter(stream);

    try {
        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        // The TokenStream contract requires reset() before the first incrementToken().
        stream.reset();
        while (stream.incrementToken()) {
            tokenMap.putPos(charTermAttribute.toString(), offsetAttribute.startOffset());
        }
        stream.end();
    } finally {
        // Closing the outermost filter also closes the tokenizer and its reader.
        stream.close();
    }

    if (verbose) {
        PrintUtil.donePrinting();
    }
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Splits {@code s} into tokens using a Japanese analysis chain
 * ({@code JapaneseTokenizer} in SEARCH mode -> {@code JapaneseBaseFormFilter} ->
 * {@code CJKWidthFilter} -> {@code JapaneseKatakanaStemFilter}).
 *
 * <p>The token stream is consumed with the {@code reset()} / {@code incrementToken()} /
 * {@code end()} / {@code close()} sequence and is always closed. An {@code IOException}
 * is reported on {@code System.err} and the tokens collected so far are returned.
 *
 * @param s the text to split
 * @return the token strings in stream order; possibly incomplete if an I/O error occurred
 */
public List<String> split(String s) {
    List<String> ret = new ArrayList<String>();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        // NOTE: JapanesePartOfSpeechStopFilter, StopFilter and LowerCaseFilter are not applied here.
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);

        try {
            CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

            // The TokenStream contract requires reset() before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(charTermAttribute.toString());
            }
            stream.end();
        } finally {
            // Closing the outermost filter also closes the tokenizer and its reader.
            stream.close();
        }
    } catch (java.io.IOException e) {
        // Best effort: report the failure and return whatever was tokenised so far.
        System.err.println(e);
    }
    return ret;
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Builds a {@code TokenIndex} holding the start and end offset of every token produced
 * from {@code str} by a Japanese analysis chain ({@code JapaneseTokenizer} in SEARCH mode
 * -> {@code JapaneseBaseFormFilter} -> {@code CJKWidthFilter} ->
 * {@code JapaneseKatakanaStemFilter}).
 *
 * <p>The token stream is consumed with the {@code reset()} / {@code incrementToken()} /
 * {@code end()} / {@code close()} sequence and is always closed. An {@code IOException}
 * is reported on {@code System.err} and the offsets collected so far are returned.
 *
 * @param str the text to tokenise
 * @return the index of token offsets; possibly incomplete if an I/O error occurred
 */
public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        // NOTE: JapanesePartOfSpeechStopFilter, StopFilter and LowerCaseFilter are not applied here.
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);

        try {
            OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);

            // The TokenStream contract requires reset() before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(offsetAttribute.startOffset(), offsetAttribute.endOffset());
            }
            stream.end();
        } finally {
            // Closing the outermost filter also closes the tokenizer and its reader.
            stream.close();
        }
    } catch (java.io.IOException e) {
        // Best effort: report the failure and return whatever was indexed so far.
        System.err.println(e);
    }
    return ret;
}

From source file:org.apache.solr.analysis.CJKWidthFilterFactory.java

License:Apache License

/**
 * Wraps the given token stream in a new {@code CJKWidthFilter}.
 *
 * @param input the token stream to wrap
 * @return a {@code CJKWidthFilter} reading from {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
    final TokenStream widthFiltered = new CJKWidthFilter(input);
    return widthFiltered;
}

From source file:org.elasticsearch.analysis.common.CJKWidthFilterFactory.java

License:Apache License

/**
 * Wraps the given token stream in a new {@code CJKWidthFilter}.
 *
 * @param tokenStream the token stream to wrap
 * @return a {@code CJKWidthFilter} reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final TokenStream widthFiltered = new CJKWidthFilter(tokenStream);
    return widthFiltered;
}