List of usage examples for org.apache.lucene.analysis Tokenizer subclass-usage
From source file analysis.StandardTokenizer.java
/**
* A grammar-based tokenizer constructed with JFlex
* <p>
* This should be a good tokenizer for most European-language documents:
* <ul>
* <li>Splits words at punctuation characters, removing punctuation. However, a
From source file au.edu.unimelb.csse.analyser.FastStringPerThreadTokenizer.java
public class FastStringPerThreadTokenizer extends Tokenizer { private final FastStringParser actualParser; private final char[] ioBuffer = new char[512]; char[] sent = new char[512];
From source file au.edu.unimelb.csse.analyser.NodeTreebankSentenceTokenizer.java
public class NodeTreebankSentenceTokenizer extends Tokenizer { protected static final int BUFFER_SIZE = 512; private final char[] ioBuffer = new char[BUFFER_SIZE]; private JsonSentenceParser elementTokenizer; private int read = -1; private static final String NONE = "";
From source file au.edu.unimelb.csse.analyser.TreebankSentenceTokenizer.java
public class TreebankSentenceTokenizer extends Tokenizer { protected static final int BUFFER_SIZE = 512; private final char[] ioBuffer = new char[BUFFER_SIZE]; private Tknzr tokenizer; private int read = -1;
From source file au.edu.unimelb.csse.analyser.TreeTokenizer.java
public class TreeTokenizer extends Tokenizer { private String sentence; private TreeLexer lexer = new TreeLexer(); private int numTokens; // total number of tokens in the sentence private int tokenPos; // current position of the returned token
From source file biospectra.lucene.KmerSequenceTokenizer.java
/** * * @author iychoi */ public class KmerSequenceTokenizer extends Tokenizer {
From source file br.bireme.ngrams.NGTokenizer.java
/** * * @author Heitor Barbieri * date: 20151216 */ public class NGTokenizer extends Tokenizer {
From source file br.bireme.ngrams.OneTokenTokenizer.java
/** * Tokenizer that generates only one token, the whole input. * @author Heitor Barbieri * date: 20170327 */ public class OneTokenTokenizer extends Tokenizer {
From source file byrne.mitre.NGramTokenizer.java
/** * Tokenizes the input into n-grams of the given size(s). */ public final class NGramTokenizer extends Tokenizer { public static final int DEFAULT_MIN_NGRAM_SIZE = 2;
From source file cc.pp.analyzer.ik.lucene.IKTokenizer.java
/** * IK Analyzer tokenizer — Lucene Tokenizer adapter class (compatible with Lucene 4.6); original comment was mis-encoded non-ASCII text */ public final class IKTokenizer extends Tokenizer {