List of usage examples for org.apache.lucene.analysis Token type
@Override public final String type()
From source file:analysis.StandardFilter.java
License:Apache License
/** * Returns the next token in the stream, or null at EOS. * <p>//from ww w .j a va 2 s . co m * Removes <tt>'s</tt> from the end of words. * <p> * Removes dots from acronyms. */ public final Token next(final Token reusableToken) throws java.io.IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); if (nextToken == null) return null; char[] buffer = nextToken.termBuffer(); final int bufferLength = nextToken.termLength(); final String type = nextToken.type(); if (type == APOSTROPHE_TYPE && // remove 's bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) { // Strip last 2 characters off nextToken.setTermLength(bufferLength - 2); } else if (type == ACRONYM_TYPE) { // remove dots int upto = 0; for (int i = 0; i < bufferLength; i++) { char c = buffer[i]; if (c != '.') buffer[upto++] = c; } nextToken.setTermLength(upto); } return nextToken; }
From source file:analysis.StandardTokenizer.java
License:Apache License
@SuppressWarnings("deprecation") public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; int posIncr = 1; Token result = reusableToken;//from www . j a va2 s . c o m if (tokenList.size() > 0) return tokenList.remove(); while (true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerImpl.YYEOF) { return null; } if (scanner.yylength() <= maxTokenLength) { reusableToken.clear(); reusableToken.setPositionIncrement(posIncr); scanner.getText(reusableToken); final int start = scanner.yychar(); reusableToken.setStartOffset(start); reusableToken.setEndOffset(start + reusableToken.termLength()); // This 'if' should be removed in the next release. For now, it // converts // invalid acronyms to HOST. When removed, only the 'else' part // should // remain. if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { if (replaceInvalidAcronym) { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); reusableToken.setTermLength(reusableToken.termLength() - 1); // remove // extra // '.' 
tokenType = StandardTokenizerImpl.HOST; } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); tokenType = StandardTokenizerImpl.ACRONYM; } } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); } if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM || tokenType == StandardTokenizerImpl.ALPHANUM) { Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term())); Token tk = null; int st = reusableToken.startOffset(); final Token token = new Token(); while ((tk = lt.next(token)) != null) { tk.setStartOffset(tk.startOffset() + st); tk.setEndOffset(tk.endOffset() + st); tk.setType(reusableToken.type()); tokenList.add((Token) tk.clone()); } } if (tokenList.size() > 0) result = tokenList.remove(); return result; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } }
From source file:ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SeparatorSplitterTokenFilter.java
License:Apache License
private static boolean isSplittableToken(Token token) { String type = token.type(); if (type.equals(ALPHANUM_TOKEN_TYPE) || type.equals(HOST_TOKEN_TYPE)) { return true; }/*from w w w .j av a 2s.c o m*/ if (type.equals(NUM_TOKEN_TYPE)) { // sometimes the original tokenizer lies to us and reports terms like 'version_3' to be // numbers. This is a heuristic to correct those lies. return Character.isLetter(token.term().charAt(0)); } return false; }
From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java
License:Open Source License
/** * Returns the next token in the stream, or <code>null</code> at EOF. *//*from w w w . j a v a 2s. c o m*/ public Token next() throws IOException { if (encoder == null) { return input.next(); } else if (encoder.length == 1) { // optimize, if one encoder only final Token t = input.next(); if (t == null) return null; return (encoder[0] != null) ? new Token(encoder[0].generateKey(t.termText()), t.startOffset(), t.endOffset(), t.type()) : t; } else { ++actualIndex; // get next token, if necessary if (actualIndex >= encoder.length) { actualToken = input.next(); if (actualToken == null) { actualIndex = encoder.length; return null; } actualIndex = 0; } if (encoder[actualIndex] == null) return actualToken; else return new Token( encoder[actualIndex].toString() + ":" + encoder[actualIndex].generateKey(actualToken.termText()), actualToken.startOffset(), actualToken.endOffset(), actualToken.type()); } }
From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java
License:Apache License
/** Returns the next input Token, after being stemmed */ public final Token next() throws IOException { Token token = getNextToken(); if (token == null) { return null; } else {// w w w . java 2 s.c o m String s = stemmer.stem(token.toString(), true); if (!s.equals(token.toString())) { // reconstruct the input token. This is silly... Token res = new Token(s, token.startOffset(), token.endOffset(), token.type()); res.setPositionIncrement(token.getPositionIncrement()); return res; } return token; } }
From source file:com.globalsight.ling.lucene.analysis.snowball.SnowballFilter.java
License:Apache License
/**
 * Returns the next input Token, after being stemmed by the Snowball
 * stemmer. The stemmed term is wrapped in a new token that keeps the
 * original token's offsets and type.
 *
 * @return the stemmed token, or null at end of stream
 * @throws IOException if the underlying stream fails
 */
public final Token next() throws IOException {
    final Token token = getNextToken();
    if (token == null) {
        return null;
    }

    // Feed the term through the stemmer, then read back the result.
    stemmer.setCurrent(token.toString());
    stemmer.stem();
    final String stemmedTerm = stemmer.getCurrent();

    return new Token(stemmedTerm, token.startOffset(), token.endOffset(), token.type());
}
From source file:com.globalsight.ling.lucene.analysis.th.BreakIteratorTokenTokenizer.java
License:Apache License
/**
 * Builds a BreakIterator-backed sub-stream for tokens whose type matches
 * the configured {@code type}; every other token yields null (no split).
 *
 * @param t the candidate token
 * @return a BreakIteratorAdaptor over the token's text, or null
 */
protected TokenStream createSubStream(Token t) {
    if (!t.type().equals(type)) {
        return null;
    }
    final String text = t.toString();
    bi.setText(text);
    return new BreakIteratorAdaptor(text, bi, t.type(), t.startOffset());
}
From source file:com.globalsight.ling.tm2.lucene.GsStemFilter.java
License:Apache License
/** * Stems the next input Token and returns it. *///from w w w. java 2 s . c om public final Token next() throws IOException { Token token = getNextToken(); if (token != null) { String stemmed = m_stemmer.stem(token.toString()); if (!stemmed.equals(token.toString())) { token = new Token(stemmed, token.startOffset(), token.endOffset(), token.type()); } } return token; }
From source file:com.ideabase.repository.core.index.filter.TermUsageFilter.java
License:Open Source License
@Override public Token next() throws IOException { final Token token = input.next(); if (token != null) { if (token.type().equals(TYPE_ALPHANUM)) { storeAndIncrementCount(String.valueOf(token.termBuffer())); }//from w w w. j ava 2s . co m return token; } else { return null; } }
From source file:com.mathworks.xzheng.analysis.nutch.NutchExample.java
License:Apache License
public static void main(String[] args) throws IOException { Configuration conf = Configuration.getConfiguration(); conf.addResource("nutch-default.xml"); NutchDocumentAnalyzer analyzer = new NutchDocumentAnalyzer(conf); //1 TokenStream ts = analyzer.tokenStream("content", new StringReader("The quick brown fox...")); int position = 0; Token token; while (ts.incrementToken()) { // 2 token = ts.getAttribute(org.apache.lucene.analysis.Token.class); if (token == null) { break; }/*from w w w. jav a 2 s. c om*/ int increment = token.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ": "); } System.out.print("[" + token.termBuffer().toString() + ":" + token.startOffset() + "->" + token.endOffset() + ":" + token.type() + "] "); } System.out.println(); Query nutchQuery = Query.parse("\"the quick brown\"", conf); // 3 org.apache.lucene.search.Query luceneQuery; luceneQuery = new QueryFilters(conf).filter(nutchQuery); // A System.out.println("Translated: " + luceneQuery); }