List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer
public StandardTokenizer()
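Every example below overrides Analyzer#createComponents(String) and starts its chain with this no-argument StandardTokenizer constructor; the analyzer attaches the Reader later, so callers never construct the tokenizer directly. As a point of reference, here is a minimal sketch of how any such analyzer is typically consumed, using the standard Lucene TokenStream API (tokenStream, CharTermAttribute, reset/incrementToken/end). The field name "body" and the sample text are placeholders, and StandardAnalyzer stands in for whichever analyzer you actually build.

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenDump {
        public static void main(String[] args) throws IOException {
            // any of the analyzers in the examples below is consumed the same way
            Analyzer analyzer = new StandardAnalyzer();
            try (TokenStream ts = analyzer.tokenStream("body", "The Quick Brown Fox")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                       // mandatory before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(term.toString());
                }
                ts.end();                         // signal end of stream before close
            }
            analyzer.close();
        }
    }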
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.MinimalAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new StopFilter(result, stopwords);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:edu.umass.cs.ciir.IndexFromGalago.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        System.out.println(getUsage());
        return;
    }
    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();
    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;

    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: transform all tokens into lowercase
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(), new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: whether to remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents(ts.getTokenizer(), new StopwordsFilter(ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET));
            // Step 4: whether to apply stemming
            // Uncomment one of the following lines to apply the Krovetz or Porter stemmer
            // ts = new TokenStreamComponents(ts.getTokenizer(), new KStemFilter(ts.getTokenStream()));
            // ts = new TokenStreamComponents(ts.getTokenizer(), new PorterStemFilter(ts.getTokenStream()));
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);
                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);
                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));
                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);
                    // tokens of the document
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));
                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }
                } catch (Exception e) {
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });
        }
    }
    System.out.println("Indexing Done. ");
}
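The example above only builds the index; for completeness, a hedged sketch of querying the resulting "tokens" field follows. The query text, index path, and top-k value are placeholders, and StandardAnalyzer is used only to keep the sketch self-contained; in practice you would reuse the same anonymous analyzer defined above so that query-time and index-time analysis match.

    import java.nio.file.Paths;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;

    public class SearchSketch {
        public static void main(String[] args) throws Exception {
            // open the directory written by the indexing example (path is a placeholder)
            try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")))) {
                IndexSearcher searcher = new IndexSearcher(reader);
                // parse the query against the same field that was tokenized at index time
                QueryParser parser = new QueryParser("tokens", new StandardAnalyzer());
                Query query = parser.parse("information retrieval");
                TopDocs top = searcher.search(query, 10);
                for (ScoreDoc sd : top.scoreDocs) {
                    System.out.println(searcher.doc(sd.doc).get("id") + "\t" + sd.score);
                }
            }
        }
    }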
From source file:fastcampus.lucene.example.search.SpellCheckerExample.java
License:Apache License
public static void main(String[] args) throws Exception {
    Directory directory = FSDirectory.open(Paths.get("./index/spell/"));
    SpellChecker spellChecker = new SpellChecker(directory);
    //Analyzer analyzer = new StandardAnalyzer();
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String s) {
            Reader reader = new StringReader(s);
            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            String name = "nfc_cf";
            Normalizer2 normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE);
            TokenFilter filter = new ICUNormalizer2Filter(tokenizer, normalizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
    Path path = Paths.get("./data/spell/dic.txt");
    spellChecker.setSpellIndex(directory);
    spellChecker.clearIndex();
    spellChecker.indexDictionary(new PlainTextDictionary(path), indexWriterConfig, true);

    String wordForSuggestions = "?";
    //spellChecker.setStringDistance(new LevensteinDistance()); // Levenstein distance
    spellChecker.setStringDistance(new JaroWinklerDistance());  // Jaro-Winkler distance
    int suggestionsNumber = 1;
    String[] suggestions = spellChecker.suggestSimilar(wordForSuggestions, suggestionsNumber);
    if (suggestions != null && suggestions.length > 0) {
        for (String word : suggestions) {
            System.out.println("Did you mean:" + word);
        }
    } else {
        System.out.println("No suggestions found for word:" + wordForSuggestions);
    }
}
From source file:indexer.CustomAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String string) {
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream tokenStream = new StandardFilter(tokenizer);
    tokenStream = new LowerCaseFilter(tokenStream);
    try {
        tokenStream = this.filterStopWords(tokenStream);
    } catch (IOException ex) {
        Logger.getLogger(CustomAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
    }
    tokenStream = new ASCIIFoldingFilter(tokenStream);
    tokenStream = new SnowballFilter(tokenStream, new RomanianStemmer());
    return new TokenStreamComponents(tokenizer, tokenStream);
}
From source file:internal.analyzers.indexing.PDFAnalyzer.java
License:Open Source License
@Override
protected Analyzer.TokenStreamComponents createComponents(String s) {
    StringReader reader = new StringReader(s);
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(reader);
    } catch (IOException e) {
        log.error("Could not set reader on tokenizer. Threw IO exception");
    }
    TokenStream filter = new StandardFilter(tokenizer);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    filter = new StopFilter(filter, stop_set);
    filter = new NumberFilter(filter);
    filter = new AlphaNumericFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
From source file:io.vertigo.dynamo.plugins.collections.lucene.DefaultAnalyzer.java
License:Apache License
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* initialize the tokenizer */
    final Tokenizer source = new StandardTokenizer();
    //-----
    /* strip elisions */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove stop words (articles, adjectives) */
    filter = new StopFilter(filter, stopWords);
    /* strip accents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
From source file:luceneprueba.CustomAnalyzers.ReviewAnalyzer.java
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tokenizer = new StandardFilter(src);
    tokenizer = new LowerCaseFilter(tokenizer);
    tokenizer = new StopFilter(tokenizer, stopwords);
    return new TokenStreamComponents(src, tokenizer) {
        @Override
        protected void setReader(final Reader reader) {
            src.setMaxTokenLength(ReviewAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
From source file:lucene_parameters.StandardAnalyzerWithNumberFilter.java
@Override
protected TokenStreamComponents createComponents(String string) {
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream filter = new StandardFilter(tokenizer);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, STOP_WORDS_SET);
    filter = new NumberFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
From source file:my_code.MyRomanianAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
}