List of usage examples for the org.apache.lucene.analysis.ja.JapaneseTokenizer constructor
public JapaneseTokenizer(AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode)
From source file:aak.as.preProcess.japanese.JaSegmenter.java
License:Open Source License
@Override public List<String> segmentWords(String text) { List<String> ret = new ArrayList<String>(); StringReader textreader = new StringReader(text); JapaneseTokenizer segmenter = new JapaneseTokenizer(textreader, null, true, JapaneseTokenizer.Mode.SEARCH); JaStemmer.lemma.clear();/*from ww w . java 2s .c om*/ CharTermAttribute termAtt = segmenter.getAttribute(CharTermAttribute.class); BaseFormAttribute baseAtt = segmenter.getAttribute(BaseFormAttribute.class); try { segmenter.reset(); while (segmenter.incrementToken()) { //segmenter.clearAttributes(); ret.add(termAtt.toString()); if (baseAtt.getBaseForm() != null) JaStemmer.lemma.put(termAtt.toString(), baseAtt.getBaseForm()); } segmenter.close(); } catch (IOException e) { // TODO Auto-generated catch block. e.printStackTrace(); } return ret; }
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public void tokenise() throws IOException { String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used). if (ignoredElements != null && ignoredElements.length() > 0) ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</" + ignoredElements + " *>"; if (!tagIndexing) ignregexp = ignregexp + "|<.*?>"; //ignregexp = ignregexp+"|\\W\\W+"; Pattern p = Pattern.compile(ignregexp); Matcher igns = p.matcher(originalText); StringBuffer tx = new StringBuffer(originalText); int ct = 1;/* www . java2 s . c om*/ while (igns.find()) { int s = igns.start(); int e = igns.end(); if (verbose) PrintUtil.printNoMove("Processing exclusions ...", ct++); //System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------"); char sp[] = new char[e - s]; for (int j = 0; j < sp.length; j++) { sp[j] = ' '; } tx.replace(s, e, new String(sp)); } if (verbose) PrintUtil.donePrinting(); ct = 1; //verbose = false; String text = new String(tx); //System.out.println("-->"+text+"<--"); Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); tokenMap.putPos(token, startOffset); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } if 
(verbose) PrintUtil.donePrinting(); ct = 1; }
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public List<String> split(String s) { ArrayList<String> ret = new ArrayList<String>(); try {/*from w ww .j ava 2 s. c o m*/ Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); ret.add(token); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } } catch (java.io.IOException e) { System.err.println(e); } return ret; }
From source file:modnlp.idx.inverted.TokeniserJPLucene.java
License:Open Source License
public TokenIndex getTokenIndex(String str) { TokenIndex ret = new TokenIndex(); try {/*ww w . j a va2s. c o m*/ Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true, org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH); TokenStream stream = new JapaneseBaseFormFilter(tokenizer); //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); //stream = new StopFilter(matchVersion, stream, stopwords); stream = new JapaneseKatakanaStemFilter(stream); //stream = new LowerCaseFilter(matchVersion, stream); OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String token = charTermAttribute.toString(); ret.add(startOffset, endOffset); //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end); } } catch (java.io.IOException e) { System.err.println(e); } return ret; }
From source file:org.apache.solr.analysis.JapaneseTokenizerFactory.java
License:Apache License
/**
 * Creates a {@link JapaneseTokenizer} over the given reader, configured with the
 * factory's user dictionary and segmentation mode. Punctuation tokens are always
 * discarded.
 *
 * @param input the reader supplying the text to tokenize
 * @return a new tokenizer instance
 */
@Override
public Tokenizer create(Reader input) {
    final boolean discardPunctuation = true;
    return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
}
From source file:org.omegat.tokenizer.LuceneJapaneseTokenizer.java
License:Open Source License
@Override protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) { if (stemsAllowed) { // Blank out tags when stemming only strOrig = blankOutTags(strOrig); CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet() : new CharArraySet(getBehavior(), 0, false); Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags() : Collections.EMPTY_SET; return new JapaneseAnalyzer(getBehavior(), null, Mode.SEARCH, stopWords, stopTags).tokenStream("", new StringReader(strOrig)); } else {/* w ww.j a v a 2 s . com*/ return new TagJoiningFilter(new JapaneseTokenizer(new StringReader(strOrig), null, false, Mode.NORMAL)); } }