List of usage examples for org.apache.lucene.analysis.ja JapaneseAnalyzer tokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader)
Returns a token stream for fieldName, tokenizing the contents of reader. From source file: de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java
License: Open Source License
/** * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer * depending if text is japanese or not. * //ww w . j av a2 s . c om * @param fieldName * name of the Lucene Indexfield. * @param reader * reader with field-value * * @return TokenStream tokenStream * * @sb */ @Override public TokenStream tokenStream(final String fieldName, final Reader reader) { if (log.isDebugEnabled()) { log.debug("tokenizing with EscidocJapaneseAnalyzer"); } //checkJapanese /////////////////////////////////////////////////////// boolean isJapanese = false; TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader); Reader reader1 = null; try { StringBuffer tokenBuffer = new StringBuffer(""); CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class); whitespaceTokens.reset(); while (whitespaceTokens.incrementToken()) { if (tokenBuffer.length() > 0) { tokenBuffer.append(" "); } tokenBuffer.append(termAtt.toString()); } for (int i = 0; i < tokenBuffer.length(); i++) { int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16); if (hexInt > 12287 && hexInt < 13328) { isJapanese = true; break; } } reader1 = new StringReader(tokenBuffer.toString()); } catch (Exception e) { log.error(e); } /////////////////////////////////////////////////////////////////////// //No Japanese, so return whitespace-tokens if (!isJapanese) { TokenStream result = new XmlWhitespaceTokenizer(reader1); result = new JunkFilter(result); result = new LowerCaseFilter(Constants.LUCENE_VERSION, result); return result; } //Get Japanese Tokens JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION); TokenStream japaneseTokens = analyzer.tokenStream("", reader1); if (analyzer != null) { try { analyzer.close(); } catch (Exception e) { } } return japaneseTokens; }
From source file:hivemall.nlp.tokenizer.KuromojiUDF.java
License: Apache License
/**
 * Tokenizes the first argument with a lazily created {@code JapaneseAnalyzer}
 * and returns the tokens as a list of {@code Text} values.
 *
 * @param arguments UDF arguments; only {@code arguments[0]} is tokenized.
 * @return the token list, or {@code null} when the input value is null.
 * @throws HiveException if tokenization fails with an I/O error.
 */
@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
    // Build the analyzer on first invocation and cache it for later calls.
    JapaneseAnalyzer ja = _analyzer;
    if (ja == null) {
        final CharArraySet stopwords = stopWords(_stopWordsArray);
        ja = new JapaneseAnalyzer(null, _mode, stopwords, _stoptags);
        this._analyzer = ja;
    }

    final Object value = arguments[0].get();
    if (value == null) {
        // NULL in, NULL out.
        return null;
    }
    final String text = value.toString();

    final List<Text> tokens = new ArrayList<Text>(32);
    TokenStream ts = null;
    try {
        ts = ja.tokenStream("", text);
        if (ts != null) {
            analyzeTokens(ts, tokens);
        }
    } catch (IOException e) {
        // The cached analyzer is suspect after a failure; release it eagerly.
        IOUtils.closeQuietly(ja);
        throw new HiveException(e);
    } finally {
        IOUtils.closeQuietly(ts);
    }
    return tokens;
}