Example usage of the org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter constructor

List of usage examples for the org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter constructor.

Prototype

public JapaneseKatakanaStemFilter(TokenStream input) 

Source Link

Usage

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Tokenises {@code originalText} with the Lucene Japanese analysis chain
 * (JapaneseTokenizer in SEARCH mode, then base-form, CJK-width and
 * katakana-stem filters) and records each token with its start offset in
 * {@code tokenMap}.
 *
 * <p>Before tokenising, every span matching the exclusion pattern (runs of
 * dashes, runs of full stops, full stops followed by whitespace, the
 * configured ignored elements and, unless {@code tagIndexing} is set, any
 * markup tag) is overwritten with the same number of spaces, so the offsets
 * reported by the tokenizer still refer to positions in the original text.
 *
 * @throws IOException if the token stream fails while being read or closed
 */
public void tokenise() throws IOException {
    // Delete full stops and dashes (typically not used).
    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}";
    if (ignoredElements != null && ignoredElements.length() > 0) {
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    }
    if (!tagIndexing) {
        ignregexp = ignregexp + "|<.*?>";
    }

    // The matcher scans the untouched original; only the copy is modified.
    Matcher igns = Pattern.compile(ignregexp).matcher(originalText);
    StringBuilder tx = new StringBuilder(originalText);
    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose) {
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        }
        // Blank the excluded span in place; the length is unchanged, so offsets stay aligned.
        for (int i = s; i < e; i++) {
            tx.setCharAt(i, ' ');
        }
    }
    if (verbose) {
        PrintUtil.donePrinting();
    }

    String text = tx.toString();
    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    stream = new CJKWidthFilter(stream);
    stream = new JapaneseKatakanaStemFilter(stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    try {
        // The TokenStream consumer workflow requires reset() before the first incrementToken().
        stream.reset();
        while (stream.incrementToken()) {
            tokenMap.putPos(charTermAttribute.toString(), offsetAttribute.startOffset());
        }
        stream.end();
    } finally {
        // Always release the stream (and the underlying reader), even on failure.
        stream.close();
    }
    if (verbose) {
        PrintUtil.donePrinting();
    }
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Splits {@code s} into tokens using the Lucene Japanese analysis chain
 * (JapaneseTokenizer in SEARCH mode, then base-form, CJK-width and
 * katakana-stem filters).
 *
 * @param s the text to tokenise
 * @return the token strings in the order the stream produced them; if an
 *         {@link java.io.IOException} occurs it is printed to standard
 *         error and the tokens collected so far are returned
 */
public List<String> split(String s) {
    List<String> ret = new ArrayList<String>();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);

        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        try {
            // The TokenStream consumer workflow requires reset() before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(charTermAttribute.toString());
            }
            stream.end();
        } finally {
            // Always release the stream (and the underlying reader), even on failure.
            stream.close();
        }
    } catch (java.io.IOException e) {
        // Best effort: report the failure and return whatever was tokenised.
        System.err.println(e);
    }
    return ret;
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Builds a {@code TokenIndex} holding the start and end offsets of every
 * token the Lucene Japanese analysis chain (JapaneseTokenizer in SEARCH
 * mode, then base-form, CJK-width and katakana-stem filters) finds in
 * {@code str}.
 *
 * @param str the text to tokenise
 * @return the index of token offsets; if an {@link java.io.IOException}
 *         occurs it is printed to standard error and the offsets collected
 *         so far are returned
 */
public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);

        try {
            // The TokenStream consumer workflow requires reset() before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(offsetAttribute.startOffset(), offsetAttribute.endOffset());
            }
            stream.end();
        } finally {
            // Always release the stream (and the underlying reader), even on failure.
            stream.close();
        }
    } catch (java.io.IOException e) {
        // Best effort: report the failure and return whatever was indexed.
        System.err.println(e);
    }
    return ret;
}

From source file:org.elasticsearch.indices.analysis.KuromojiIndicesAnalysis.java

License:Apache License

/**
 * Registers the pre-built Kuromoji (Japanese) analysis components with the
 * given {@code IndicesAnalysisService}: the "kuromoji" analyzer, the
 * "kuromoji_iteration_mark" char filter, the "kuromoji_tokenizer" tokenizer,
 * and the "kuromoji_baseform", "kuromoji_part_of_speech",
 * "kuromoji_readingform" and "kuromoji_stemmer" token filters.
 *
 * @param settings               settings passed through to the superclass
 * @param indicesAnalysisService service whose factory maps receive the components
 */
@Inject
public KuromojiIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Analyzer: a JapaneseAnalyzer built with its no-argument constructor.
    indicesAnalysisService.analyzerProviderFactories().put("kuromoji",
            new PreBuiltAnalyzerProviderFactory("kuromoji", AnalyzerScope.INDICES, new JapaneseAnalyzer()));

    // Char filter: iteration-mark normalisation with the library defaults.
    CharFilterFactory iterationMarkFactory = new CharFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_iteration_mark";
        }

        @Override
        public Reader create(Reader reader) {
            return new JapaneseIterationMarkCharFilter(reader,
                    JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
                    JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
        }
    };
    indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
            new PreBuiltCharFilterFactoryFactory(iterationMarkFactory));

    // Tokenizer: SEARCH mode, constructed with a null first argument and true as the second.
    TokenizerFactory tokenizerFactory = new TokenizerFactory() {
        @Override
        public String name() {
            return "kuromoji_tokenizer";
        }

        @Override
        public Tokenizer create() {
            return new JapaneseTokenizer(null, true, Mode.SEARCH);
        }
    };
    indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
            new PreBuiltTokenizerFactoryFactory(tokenizerFactory));

    // Token filter: base form.
    TokenFilterFactory baseFormFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_baseform";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapaneseBaseFormFilter(tokenStream);
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform",
            new PreBuiltTokenFilterFactoryFactory(baseFormFactory));

    // Token filter: part-of-speech stop filter using the analyzer's default stop tags.
    TokenFilterFactory partOfSpeechFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_part_of_speech";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapanesePartOfSpeechStopFilter(tokenStream,
                    JapaneseAnalyzer.getDefaultStopTags());
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_part_of_speech",
            new PreBuiltTokenFilterFactoryFactory(partOfSpeechFactory));

    // Token filter: reading form, constructed with true as the second argument.
    TokenFilterFactory readingFormFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_readingform";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapaneseReadingFormFilter(tokenStream, true);
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_readingform",
            new PreBuiltTokenFilterFactoryFactory(readingFormFactory));

    // Token filter: katakana stemmer.
    TokenFilterFactory stemmerFactory = new TokenFilterFactory() {
        @Override
        public String name() {
            return "kuromoji_stemmer";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new JapaneseKatakanaStemFilter(tokenStream);
        }
    };
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer",
            new PreBuiltTokenFilterFactoryFactory(stemmerFactory));
}