List of usage examples for org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter
public JapaneseKatakanaStemFilter(TokenStream input)
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public void tokenise() throws IOException { String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used). if (ignoredElements != null && ignoredElements.length() > 0) ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</" + ignoredElements + " *>"; if (!tagIndexing) ignregexp = ignregexp + "|<.*?>"; //ignregexp = ignregexp+"|\\W\\W+"; Pattern p = Pattern.compile(ignregexp); Matcher igns = p.matcher(originalText); StringBuffer tx = new StringBuffer(originalText); int ct = 1;//w w w .ja v a 2 s. c o m while (igns.find()) { int s = igns.start(); int e = igns.end(); if (verbose) PrintUtil.printNoMove("Processing exclusions ...", ct++); //System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------"); char sp[] = new char[e - s]; for (int j = 0; j < sp.length; j++) { sp[j] = ' '; } tx.replace(s, e, new String(sp)); } if (verbose) PrintUtil.donePrinting(); ct = 1; //verbose = false; String text = new String(tx); //System.out.println("-->"+text+"<--"); Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); tokenMap.putPos(token, startOffset); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } if 
(verbose) PrintUtil.donePrinting(); ct = 1; }
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public List<String> split(String s) { ArrayList<String> ret = new ArrayList<String>(); try {//from w w w. j ava 2 s. c o m Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); ret.add(token); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } } catch (java.io.IOException e) { System.err.println(e); } return ret; }
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public TokenIndex getTokenIndex(String str) { TokenIndex ret = new TokenIndex(); try {//from www.j av a 2s . c om Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); ret.add(startOffset, endOffset); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } } catch (java.io.IOException e) { System.err.println(e); } return ret; }
From source file:org.elasticsearch.indices.analysis.KuromojiIndicesAnalysis.java
License:Apache License
/**
 * Registers the pre-built Kuromoji (Japanese) analysis components with the
 * shared {@code IndicesAnalysisService} so they are available to every index:
 * the "kuromoji" analyzer, the "kuromoji_iteration_mark" char filter, the
 * "kuromoji_tokenizer" tokenizer, and the "kuromoji_baseform",
 * "kuromoji_part_of_speech", "kuromoji_readingform" and "kuromoji_stemmer"
 * token filters.
 */
@Inject
public KuromojiIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Whole analyzer: default Kuromoji analysis chain.
    indicesAnalysisService.analyzerProviderFactories().put("kuromoji",
            new PreBuiltAnalyzerProviderFactory("kuromoji", AnalyzerScope.INDICES,
                    new JapaneseAnalyzer()));

    // Char filter: normalises iteration marks with the library defaults.
    indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
            new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_iteration_mark";
                }

                @Override
                public Reader create(Reader reader) {
                    return new JapaneseIterationMarkCharFilter(reader,
                            JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
                            JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
                }
            }));

    // Tokenizer: Kuromoji in SEARCH mode with the default dictionary.
    indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "kuromoji_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    return new JapaneseTokenizer(null, true, Mode.SEARCH);
                }
            }));

    // Token filter: replaces each term with its dictionary base form.
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_baseform";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseBaseFormFilter(tokenStream);
                }
            }));

    // Token filter: drops terms whose part of speech is in the default stop set.
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_part_of_speech",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_part_of_speech";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapanesePartOfSpeechStopFilter(tokenStream,
                            JapaneseAnalyzer.getDefaultStopTags());
                }
            }));

    // Token filter: replaces terms with their reading form (romanized: true).
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_readingform",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_readingform";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseReadingFormFilter(tokenStream, true);
                }
            }));

    // Token filter: stems long katakana terms (prolonged sound mark removal).
    indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "kuromoji_stemmer";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new JapaneseKatakanaStemFilter(tokenStream);
                }
            }));
}