List of usage examples for the org.apache.lucene.analysis.standard.StandardTokenizer constructor
public StandardTokenizer(AttributeFactory factory)
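The examples below come from older Lucene versions, where StandardTokenizer was constructed directly from a Reader. For the AttributeFactory constructor named above, a minimal sketch for Lucene 5.x or later might look like the following; the class name and sample text are only assumptions for illustration:

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerFactoryDemo {
    public static void main(String[] args) throws Exception {
        // The factory controls how attribute instances are created; the default is usually sufficient.
        StandardTokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        // Unlike the Reader-based constructor used in the examples below, the input is set separately.
        tokenizer.setReader(new StringReader("The quick brown fox"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}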
From source file:analyzers.FormalAnalyzer.java
License:Apache License
/**
 * Define how tokens are processed.
 *
 * @param fieldName required input
 * @param reader reader for document
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;
    if (!tokenOpts.disableAllFilters) {
        // the chain of token filters...
        chain = new StandardFilter(chain);
        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);
        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);
        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);
        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);
        // remove 's
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);
        // spelling correction
        if (!spellingHashtable.isEmpty()) {
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);
        }
        if (!tokenOpts.disableStemming) {
            // Krovetz stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}
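For context, an Analyzer built this way is normally consumed through tokenStream(), which runs the whole filter chain over the input. The sketch below assumes an already-constructed FormalAnalyzer instance named analyzer and a hypothetical field name "body":

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Exercise the full chain (standard filter, lowercasing, ASCII folding, stop words, stemming, ...).
TokenStream ts = analyzer.tokenStream("body", new StringReader("The U.S.A.'s economy, naïvely speaking"));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(termAtt.toString());
}
ts.end();
ts.close();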
From source file:brazilianStemmer.BrazilianAnalyzer.java
License:Apache License
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         LowerCaseFilter, StopFilter, BrazilianAccentsFilter and BrazilianStemFilter.
 */
public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    // Convert to lowercase before the stop-word, accent and stemming filters.
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, englishStopWords);
    result = new BrazilianAccentsFilter(result);
    result = new StopFilter(result, stopWords);
    result = new BrazilianStemFilter(result, stopWords);
    return result;
}
From source file:com.appeligo.lucene.PorterStemAnalyzer.java
License:Apache License
/** Filters a StandardTokenizer with StandardFilter, LowerCaseFilter, StopFilter and PorterStemFilter. */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopWords);
    result = new PorterStemFilter(result);
    return result;
}
From source file:com.devb.search.IndicIndexer.java
License:Apache License
@Override
public void makeIndex() {
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    boolean create = true;
    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }
    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");
        org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new HindiAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
        if (create) {
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }
        IndexWriter writer = new IndexWriter(dir, iwc);
        if (docDir.canRead() && docDir.isDirectory()) {
            String[] files = docDir.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    File file = new File(docDir, files[i]);
                    FileInputStream fileInputStream = new FileInputStream(file);
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(fileInputStream, "UTF-8"));
                    Tokenizer tokenizer = new StandardTokenizer(reader);
                    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
                    tokenizer.reset();
                    int lineNumber = 0;
                    try {
                        // Index each token of the file as its own document.
                        while (tokenizer.incrementToken()) {
                            Document doc = new Document();
                            doc.add(new StringField("path", file.getName(), Field.Store.YES));
                            doc.add(new TextField("linenumber", Integer.toString(++lineNumber), Store.YES));
                            doc.add(new TextField("contents", termAtt.toString(), Store.YES));
                            writer.addDocument(doc);
                        }
                        System.out.println("Adding " + file + "\n");
                    } catch (Exception e) {
                        e.printStackTrace();
                    } finally {
                        tokenizer.close();
                        reader.close();
                        fileInputStream.close();
                    }
                }
            }
        }
        writer.close();
        Date end = new Date();
        System.out.println((end.getTime() - start.getTime()) + " total milliseconds\n");
    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
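A hedged sketch of reading that index back, assuming the same indexPath and Lucene 4.x-era APIs. The query term is only an example, and since "contents" is analyzed with HindiAnalyzer, a real application would normally build the query through the same analyzer (for instance via a query parser) rather than a raw TermQuery:

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

// Open the index written by makeIndex() and look a token up in the "contents" field.
DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(new TermQuery(new Term("contents", "नमस्ते")), 10);
for (ScoreDoc sd : hits.scoreDocs) {
    Document hit = searcher.doc(sd.doc);
    System.out.println(hit.get("path") + " #" + hit.get("linenumber"));
}
reader.close();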
From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java
License:Apache License
/**
 * Retrieve the tokens in a String. Behaves like getTokens, but operates on
 * a string instead of a tweet object.
 *
 * @param text The text to tokenize.
 * @return The tokens in the text.
 */
// Version 1
/*public LinkedList<String> getTokens (String text) {
    LinkedList<String> tokens = new LinkedList();
    String[] words = text.split(" ");
    tokens.addAll(Arrays.asList(words));
    return tokens;
}*/
// Version 2
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
        //System.out.print(termAtt.term());
    }
    ts.close();
    return tokens;
}
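TermAttribute was removed in Lucene 4.0. A hedged sketch of the same helper against the newer CharTermAttribute API, assuming Lucene 5.x or later (where StandardTokenizer no longer takes a Reader in its constructor), might look like this:

import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedList;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    Tokenizer ts = new StandardTokenizer();
    ts.setReader(new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    ts.end();
    ts.close();
    return tokens;
}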
From source file:com.duroty.lucene.analysis.DefaultAnalyzer.java
License:Open Source License
/**
 * Builds the token stream for the given field: a StandardTokenizer filtered by
 * StandardFilter, LowerCaseFilter and, if a stop table is configured, StopFilter.
 *
 * @param fieldName the field being analyzed
 * @param reader the reader supplying the field text
 *
 * @return the filtered TokenStream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (stopTable != null) {
        return new StopFilter(result, stopTable);
    } else {
        return result;
    }
}
From source file:com.duroty.lucene.analysis.EmptyAnalyzer.java
License:Apache License
/** Constructs a {@link StandardTokenizer} filtered only by a {@link StandardFilter}. */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    return result;
}
From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticAnalyzer.java
License:Open Source License
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link PhoneticFilter}.
 */
public TokenStream tokenStream(String fieldname, final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PhoneticFilter(result, encoder);
    return result;
}
From source file:com.flaptor.hounder.searcher.query.AQuerySuggestor.java
License:Apache License
private List<AQuery> suggestLinear(AQuery query) {
    List<AQuery> queries = new ArrayList<AQuery>();
    if (null == query) {
        logger.debug("Can't make a suggestion for a null query");
    } else if (!(query instanceof LazyParsedQuery)) { // TODO FIXME
        logger.debug("can not make suggestions for queries of type " + query.getClass());
    } else {
        String originalString = ((LazyParsedQuery) query).getQueryString();
        StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(originalString));
        List<String> tokens = new ArrayList<String>();
        try {
            Token token = new Token();
            while (true) {
                token = tokenizer.next(token);
                if (null == token) {
                    break;
                }
                tokens.add(TokenUtil.termText((Token) token.clone()));
            }
            // for every word, suggest something
            for (int i = 0; i < tokens.size(); i++) {
                StringBuffer sb = new StringBuffer();
                // sb.append("\"");
                for (int j = 0; j < i; j++) {
                    sb.append(tokens.get(j));
                    sb.append(" ");
                }
                String[] suggestions = suggestor.suggestWords(tokens.get(i));
                for (String suggestion : suggestions) {
                    // generate final sb
                    StringBuffer sbf = new StringBuffer(sb);
                    sbf.append(suggestion);
                    sbf.append(" ");
                    for (int k = i + 1; k < tokens.size(); k++) {
                        sbf.append(tokens.get(k));
                        if (k + 1 < tokens.size()) {
                            sbf.append(" ");
                        }
                    }
                    // sbf.append("\"");
                    queries.add(new LazyParsedQuery(sbf.toString()));
                }
            }
        } catch (IOException e) {
            logger.error("Error while suggesting query", e);
            return new ArrayList<AQuery>();
        }
    }
    return queries;
}
From source file:com.google.ie.common.search.analyzer.IdeaExchangeQueryAnalyzer.java
License:Apache License
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
        // The Snowball stemmer is always configured for the default language,
        // regardless of which field is being analyzed.
        fieldName = DEFAULT_LANGUAGE;
        streams.filteredTokenStream = new SnowballFilter(streams.filteredTokenStream, fieldName);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);
    return streams.filteredTokenStream;
}
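reusableTokenStream and the SavedStreams pattern were retired with the Analyzer overhaul around Lucene 3.1; the equivalent idea today is to override createComponents and let Lucene reuse the components per thread. A hedged sketch, assuming Lucene 5.x-style constructors and that stopSet is a CharArraySet, might be:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer source = new StandardTokenizer();
    source.setMaxTokenLength(maxTokenLength);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, stopSet);              // stopSet assumed to be a CharArraySet here
    filter = new SnowballFilter(filter, DEFAULT_LANGUAGE); // Snowball language name, e.g. "English"
    // Lucene caches and reuses these components per thread, replacing SavedStreams.
    return new TokenStreamComponents(source, filter);
}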