Example usage for org.apache.lucene.analysis.ja JapaneseAnalyzer tokenStream

List of usage examples for org.apache.lucene.analysis.ja JapaneseAnalyzer tokenStream

Introduction

On this page you can find an example usage of org.apache.lucene.analysis.ja JapaneseAnalyzer tokenStream.

Prototype

public final TokenStream tokenStream(final String fieldName, final Reader reader) 

Source Link

Document

Returns a TokenStream suitable for fieldName, tokenizing the contents of reader.

Usage

From source file:de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java

License: Open Source License

/**
 * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer,
 * depending on whether the text is Japanese or not.
 *
 * @param fieldName
 *            name of the Lucene Indexfield.
 * @param reader
 *            reader with field-value
 *
 * @return TokenStream tokenStream
 *
 * @sb
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    if (log.isDebugEnabled()) {
        log.debug("tokenizing with EscidocJapaneseAnalyzer");
    }
    // checkJapanese: re-tokenize the input on whitespace, rebuild the text,
    // and scan it for characters with code points in (12287, 13328), i.e. the
    // CJK-symbols/kana Unicode area, to decide whether the input is Japanese.
    boolean isJapanese = false;
    TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader);
    Reader reader1 = null;
    try {
        // Local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
        StringBuilder tokenBuffer = new StringBuilder();
        CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class);
        whitespaceTokens.reset();
        while (whitespaceTokens.incrementToken()) {
            if (tokenBuffer.length() > 0) {
                tokenBuffer.append(' ');
            }
            tokenBuffer.append(termAtt.toString());
        }
        for (int i = 0; i < tokenBuffer.length(); i++) {
            int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16);
            if (hexInt > 12287 && hexInt < 13328) {
                isJapanese = true;
                break;
            }
        }
        reader1 = new StringReader(tokenBuffer.toString());
    } catch (Exception e) {
        log.error(e);
    } finally {
        // The tokenizer holds the caller's reader; release it in all cases.
        try {
            whitespaceTokens.close();
        } catch (Exception e) {
            log.error(e);
        }
    }
    // If tokenization failed above, reader1 would still be null and the
    // tokenizers below would throw a NullPointerException; fall back to
    // an empty stream instead.
    if (reader1 == null) {
        reader1 = new StringReader("");
    }

    //No Japanese, so return whitespace-tokens
    if (!isJapanese) {
        TokenStream result = new XmlWhitespaceTokenizer(reader1);
        result = new JunkFilter(result);
        result = new LowerCaseFilter(Constants.LUCENE_VERSION, result);
        return result;
    }

    //Get Japanese Tokens
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION);
    TokenStream japaneseTokens = analyzer.tokenStream("", reader1);
    // NOTE(review): the analyzer is closed before the returned stream is
    // consumed; this preserves the original behavior, but some Lucene
    // versions invalidate streams of a closed analyzer — confirm against
    // the Lucene version in use.
    try {
        analyzer.close();
    } catch (Exception e) {
        log.error(e);
    }
    return japaneseTokens;
}

From source file:hivemall.nlp.tokenizer.KuromojiUDF.java

License: Apache License

/**
 * Tokenizes the first argument with a lazily-created, cached
 * {@link JapaneseAnalyzer} and returns the tokens as a list of {@link Text}.
 *
 * @param arguments UDF arguments; only {@code arguments[0]} is read
 * @return the token list, or {@code null} when the input value is null
 * @throws HiveException if tokenization fails with an I/O error
 */
@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
    // Lazily build the analyzer on first call and cache it for reuse.
    JapaneseAnalyzer analyzer = _analyzer;
    if (analyzer == null) {
        CharArraySet stopwords = stopWords(_stopWordsArray);
        analyzer = new JapaneseAnalyzer(null, _mode, stopwords, _stoptags);
        this._analyzer = analyzer;
    }

    Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }
    String line = arg0.toString();

    final List<Text> results = new ArrayList<Text>(32);
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", line);
        if (stream != null) {
            analyzeTokens(stream, results);
        }
    } catch (IOException e) {
        // Drop the cached reference before closing: otherwise a later call
        // would reuse the closed analyzer held in _analyzer.
        this._analyzer = null;
        IOUtils.closeQuietly(analyzer);
        throw new HiveException(e);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    return results;
}