Example usage for org.apache.lucene.analysis.ja JapaneseTokenizer JapaneseTokenizer

List of usage examples for org.apache.lucene.analysis.ja JapaneseTokenizer JapaneseTokenizer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.ja JapaneseTokenizer JapaneseTokenizer.

Prototype

public JapaneseTokenizer(AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation,
        Mode mode) 

Source Link

Document

Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.

Usage

From source file:aak.as.preProcess.japanese.JaSegmenter.java

License:Open Source License

/**
 * Segments Japanese text into surface-form tokens.
 *
 * <p>Side effect: clears the shared {@code JaStemmer.lemma} map and refills it
 * with surface-form -&gt; base-form (lemma) pairs for every token that has a
 * base form. NOTE(review): {@code JaStemmer.lemma} appears to be static shared
 * state, so this method is not thread-safe — confirm against callers.
 *
 * @param text the Japanese text to segment
 * @return the list of surface tokens, in order of appearance; empty on I/O error
 */
@Override
public List<String> segmentWords(String text) {

    List<String> ret = new ArrayList<String>();

    JaStemmer.lemma.clear();
    // try-with-resources guarantees the tokenizer is closed even when
    // incrementToken() throws (the original leaked it on that path).
    try (JapaneseTokenizer segmenter = new JapaneseTokenizer(new StringReader(text), null, true,
            JapaneseTokenizer.Mode.SEARCH)) {
        // addAttribute() creates the attribute if missing; getAttribute()
        // would throw IllegalArgumentException in that case.
        CharTermAttribute termAtt = segmenter.addAttribute(CharTermAttribute.class);
        BaseFormAttribute baseAtt = segmenter.addAttribute(BaseFormAttribute.class);

        segmenter.reset(); // required before the first incrementToken()
        while (segmenter.incrementToken()) {
            String surface = termAtt.toString();
            ret.add(surface);
            if (baseAtt.getBaseForm() != null)
                JaStemmer.lemma.put(surface, baseAtt.getBaseForm());
        }
        segmenter.end(); // finalize offsets per the TokenStream contract
    } catch (IOException e) {
        // Best-effort: return whatever was tokenised before the failure.
        e.printStackTrace();
    }

    return ret;
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Tokenises {@code originalText} with the Lucene Japanese analysis chain and
 * records each token's start offset into {@code tokenMap}.
 *
 * <p>Spans matching the exclusion pattern (dash runs, full-stop runs, ignored
 * XML elements and — unless {@code tagIndexing} is set — all tags) are
 * overwritten with spaces of equal length first, so that token offsets still
 * line up with positions in the original text.
 *
 * @throws IOException if the tokenizer fails while reading the text
 */
public void tokenise() throws IOException {
    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used).
    if (ignoredElements != null && ignoredElements.length() > 0)
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    if (!tagIndexing)
        ignregexp = ignregexp + "|<.*?>";

    Pattern p = Pattern.compile(ignregexp);
    Matcher igns = p.matcher(originalText);

    // StringBuilder: no synchronization needed for this local buffer.
    StringBuilder tx = new StringBuilder(originalText);
    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose)
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        // Replace the excluded span with spaces of equal length so that
        // offsets reported by the tokenizer remain valid for originalText.
        char[] sp = new char[e - s];
        for (int j = 0; j < sp.length; j++) {
            sp[j] = ' ';
        }
        tx.replace(s, e, new String(sp));
    }
    if (verbose)
        PrintUtil.donePrinting();
    String text = tx.toString();

    // Analysis chain: tokenize -> base form -> half/full-width folding
    // -> katakana stemming. try-with-resources closes the whole chain.
    try (TokenStream stream = new JapaneseKatakanaStemFilter(new CJKWidthFilter(new JapaneseBaseFormFilter(
            new JapaneseTokenizer(new StringReader(text), null, true,
                    org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH))))) {

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        stream.reset(); // TokenStream contract: reset() before incrementToken()
        while (stream.incrementToken()) {
            tokenMap.putPos(charTermAttribute.toString(), offsetAttribute.startOffset());
        }
        stream.end();
    }
    if (verbose)
        PrintUtil.donePrinting();
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Splits a string into normalised Japanese tokens.
 *
 * <p>Pipeline: JapaneseTokenizer (SEARCH mode, punctuation discarded)
 * -&gt; base-form replacement -&gt; CJK width folding -&gt; katakana stemming.
 *
 * @param s the text to tokenise
 * @return the token strings in order; partial (possibly empty) on I/O error
 */
public List<String> split(String s) {
    ArrayList<String> ret = new ArrayList<String>();
    // try-with-resources closes the filter chain even on failure
    // (the original never closed the stream at all).
    try (TokenStream stream = new JapaneseKatakanaStemFilter(new CJKWidthFilter(new JapaneseBaseFormFilter(
            new JapaneseTokenizer(new StringReader(s), null, true,
                    org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH))))) {

        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        stream.reset(); // TokenStream contract: reset() before incrementToken()
        while (stream.incrementToken()) {
            ret.add(charTermAttribute.toString());
        }
        stream.end();
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Builds a {@link TokenIndex} of (startOffset, endOffset) pairs for every
 * token produced by the Japanese analysis chain over {@code str}.
 *
 * @param str the text to tokenise
 * @return the populated index; partial (possibly empty) on I/O error
 */
public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    // try-with-resources closes the filter chain even on failure
    // (the original never closed the stream at all).
    try (TokenStream stream = new JapaneseKatakanaStemFilter(new CJKWidthFilter(new JapaneseBaseFormFilter(
            new JapaneseTokenizer(new StringReader(str), null, true,
                    org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH))))) {

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);

        stream.reset(); // TokenStream contract: reset() before incrementToken()
        while (stream.incrementToken()) {
            ret.add(offsetAttribute.startOffset(), offsetAttribute.endOffset());
        }
        stream.end();
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}

From source file:org.apache.solr.analysis.JapaneseTokenizerFactory.java

License:Apache License

/**
 * Creates a {@link JapaneseTokenizer} over the given reader, configured with
 * the factory's user dictionary and segmentation mode.
 *
 * @param input the character stream to tokenise
 * @return a new tokenizer instance; punctuation tokens are discarded
 */
@Override
public Tokenizer create(Reader input) {
    // Punctuation is always dropped; dictionary and mode come from factory config.
    final boolean discardPunctuation = true;
    return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
}

From source file:org.omegat.tokenizer.LuceneJapaneseTokenizer.java

License:Open Source License

/**
 * Returns a token stream for the given text.
 *
 * <p>With stemming enabled, tags are blanked out and the full
 * {@link JapaneseAnalyzer} is used (optionally with its default stop words and
 * stop tags). Without stemming, a NORMAL-mode tokenizer that keeps punctuation
 * feeds a {@code TagJoiningFilter} so tags survive tokenisation intact.
 *
 * @param strOrig          the text to tokenise
 * @param stemsAllowed     whether to apply the stemming analyzer
 * @param stopWordsAllowed whether to filter default stop words/tags (stemming only)
 * @return the configured token stream
 */
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) {
    if (stemsAllowed) {
        // Blank out tags when stemming only
        strOrig = blankOutTags(strOrig);
        CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet()
                : new CharArraySet(getBehavior(), 0, false);
        // Collections.<String>emptySet() is typed; the raw EMPTY_SET constant
        // caused an unchecked-conversion warning here.
        Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags()
                : Collections.<String>emptySet();
        return new JapaneseAnalyzer(getBehavior(), null, Mode.SEARCH, stopWords, stopTags).tokenStream("",
                new StringReader(strOrig));
    } else {
        // NORMAL mode with punctuation retained (discardPunctuation=false).
        return new TagJoiningFilter(new JapaneseTokenizer(new StringReader(strOrig), null, false, Mode.NORMAL));
    }
}