List of usage examples for the org.apache.lucene.analysis.standard.StandardTokenizer constructor
public StandardTokenizer(AttributeFactory factory)
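The examples below all pass a Reader straight to the constructor (the pre-5.0 API). A minimal sketch of driving the AttributeFactory constructor shown above on a newer Lucene, where the input is attached afterwards via setReader; the class name and sample text are only illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Build the tokenizer with the default attribute factory; the input
        // Reader is supplied separately (assumes Lucene 5+).
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(new StringReader("The Quick Brown Fox"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // one line per token
        }
        tokenizer.end();
        tokenizer.close();
    }
}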
From source file:com.NGramTokenBaseAnalyzer.java
@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer src = new StandardTokenizer(reader);
    TokenStream tok = new LowerCaseFilter(src);
    tok = filter(tok, this.unigramOutput);
    return new TokenStreamComponents(src, tok);
}
From source file:com.redhat.satellite.search.index.ngram.NGramAnalyzer.java
License:Open Source License
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(new StandardFilter(new StandardTokenizer(reader))),
            min_ngram, max_ngram);
}
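For reference, a sketch of the same n-gram chain against a recent Lucene release; it assumes the four-argument NGramTokenFilter constructor (which adds a preserveOriginal flag) and drops StandardFilter, which later versions removed as a no-op. The values 2 and 3 stand in for min_ngram and max_ngram:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class NGramSketchAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream result = new LowerCaseFilter(source);
        // Emit 2- and 3-character grams of each lower-cased token, e.g. "red" -> re, ed, red
        result = new NGramTokenFilter(result, 2, 3, false);
        return new TokenStreamComponents(source, result);
    }
}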
From source file:com.stimulus.archiva.search.ArchivaAnalyzer.java
License:Open Source License
@Override
public final TokenStream tokenStream(String fieldName, final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PorterStemFilter(result);
    return result;
}
From source file:com.stimulus.archiva.search.FilterAnalyzer.java
License:Open Source License
@Override
public final TokenStream tokenStream(String fieldName, final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}
From source file:com.talis.lucene.analysis.normen.NormaliseStandardAnalyzer.java
License:Apache License
@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new ISOLatin1AccentFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}
From source file:com.talis.lucene.analysis.nostopen.NoStopwordStandardAnalyzer.java
License:Apache License
@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}
From source file:com.tangentum.phonetix.lucene.PhoneticAnalyzer.java
License:Open Source License
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link PhoneticFilter}.
 */
public TokenStream tokenStream(final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PhoneticFilter(result, encoder);
    return result;
}
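A comparable chain can be sketched with current Lucene classes. The example below is an assumption-laden rewrite, not the phonetix code above: it uses the PhoneticFilter from the lucene-analyzers-phonetic module (whose constructor takes an extra inject flag) together with commons-codec's DoubleMetaphone encoder, and it omits the stop filter for brevity:

import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class PhoneticSketchAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream result = new LowerCaseFilter(source);
        // inject=true keeps each original token alongside its phonetic encoding
        result = new PhoneticFilter(result, new DoubleMetaphone(), true);
        return new TokenStreamComponents(source, result);
    }
}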
From source file:com.xiaomi.linden.lucene.analyzer.LindenWordDelimiterAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String s, Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream ts = factoryDefault.create(source);
    if (this.toLowerCase) {
        ts = new LowerCaseFilter(ts);
    }
    if (this.setStopWords) {
        ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }
    return new TokenStreamComponents(source, ts);
}
From source file:de.joergjahnke.jdesktopsearch.abstractionlayer.SQLServerIndexWriter.java
License:Open Source License
public synchronized void addDocument(final Document doc) throws IOException {
    SQLException ex = null;
    try {
        this.con.setAutoCommit(false);

        // get ID for the new file
        Statement stmt = con.createStatement();
        int fileId = 1;
        ResultSet rs = stmt.executeQuery("SELECT MAX( ID ) AS \"MaxID\" FROM File_");
        if (null != rs && rs.next()) {
            fileId = rs.getInt("MaxID") + 1;
        }
        rs.close();
        rs = null;

        // insert new file record
        PreparedStatement insertFileStmt = con
                .prepareStatement("INSERT INTO File_ ( ID, Name ) VALUES ( ?, ? )");
        insertFileStmt.setInt(1, fileId);
        insertFileStmt.setString(2, doc.getField("path").stringValue());
        int inserted = insertFileStmt.executeUpdate();

        // prepare SQL statements for the different data to insert
        PreparedStatement insertFieldStmt = con.prepareStatement(
                "INSERT INTO Field_ ( ID, FileID, Name, Value, FullValue, IsIndexable, IsCompressed ) VALUES ( ?, ?, ?, ?, ?, ?, ? )");
        PreparedStatement insertTokenStmt = con
                .prepareStatement("INSERT INTO Token_ ( FieldID, Word_, Occurrences ) VALUES ( ?, ?, ? )");

        // get ID for the next field
        int fieldId = 1;
        rs = stmt.executeQuery("SELECT MAX( ID ) AS \"MaxID\" FROM Field_");
        if (null != rs && rs.next()) {
            fieldId = rs.getInt("MaxID") + 1;
        }
        rs.close();
        rs = null;

        // insert all elements into the database
        for (Enumeration elements = doc.fields(); elements.hasMoreElements();) {
            final Field field = (Field) elements.nextElement();

            // don't save the value if not marked for storage
            String value = field.stringValue();
            if (!field.isStored()) {
                value = null;
            }

            // compress data if it does not fit into the 255 char value field and
            // if this field is also tokenized so that a search can still take place
            byte[] fullValue = null == value || value.length() <= 255 ? null : value.getBytes();
            boolean isCompressed = false;
            if (null != fullValue && field.isTokenized()) {
                // compress data
                final ByteArrayOutputStream newFullValue = new ByteArrayOutputStream();
                final byte[] buf = new byte[4000];
                final Deflater compresser = new Deflater();
                compresser.setInput(fullValue);
                compresser.finish();
                while (!compresser.finished()) {
                    final int read = compresser.deflate(buf);
                    newFullValue.write(buf, 0, read);
                }
                compresser.end();
                // only use compressed version if it is shorter than the original one
                if (newFullValue.size() < fullValue.length) {
                    fullValue = newFullValue.toByteArray();
                    isCompressed = true;
                }
            }

            // insert new field record
            insertFieldStmt.setInt(1, fieldId);
            insertFieldStmt.setInt(2, fileId);
            insertFieldStmt.setString(3, field.name());
            insertFieldStmt.setString(4, isCompressed || null == value ? null : StringUtils.left(value, 255));
            if (isCompressed || (null != fullValue && fullValue.length > 255)) {
                insertFieldStmt.setBytes(5, fullValue);
            } else {
                insertFieldStmt.setObject(5, null);
            }
            insertFieldStmt.setBoolean(6, field.isIndexed());
            insertFieldStmt.setBoolean(7, isCompressed);
            inserted = insertFieldStmt.executeUpdate();

            // tokenize string if necessary and store all tokens
            if (null != field.stringValue()) {
                final Map<String, Integer> tokenMap = new HashMap<String, Integer>(
                        field.isTokenized() ? field.stringValue().length() / this.AVG_WORDLENGTH : 1);
                if (field.isTokenized()) {
                    if (!this.USE_LUCENE_TOKENIZER) {
                        for (StringTokenizer tokenizer = new StringTokenizer(field.stringValue(),
                                " |&.:?,;!()[]/\t\n\r\f\240"); tokenizer.hasMoreTokens();) {
                            final String token = tokenizer.nextToken().toLowerCase();
                            if (tokenMap.containsKey(token)) {
                                tokenMap.put(token, tokenMap.get(token) + 1);
                            } else {
                                tokenMap.put(token, 1);
                            }
                        }
                    } else {
                        final Tokenizer tokenizer = new StandardTokenizer(new StringReader(field.stringValue()));
                        Token token = null;
                        while ((token = tokenizer.next()) != null) {
                            final String tokenText = token.termText().toLowerCase();
                            if (tokenMap.containsKey(tokenText)) {
                                tokenMap.put(tokenText, tokenMap.get(tokenText) + 1);
                            } else {
                                tokenMap.put(tokenText, 1);
                            }
                        }
                    }
                } else {
                    tokenMap.put(field.stringValue().toLowerCase(), 1);
                }

                // store tokens in database
                for (String token : tokenMap.keySet()) {
                    insertTokenStmt.setInt(1, fieldId);
                    insertTokenStmt.setString(2, token.substring(0, Math.min(token.length(), 255)));
                    insertTokenStmt.setInt(3, tokenMap.get(token));
                    try {
                        insertTokenStmt.executeUpdate();
                    } catch (SQLException e) {
                        // the SQL server may reject the row because it treats as duplicates
                        // two entries that are distinct in Java; we ignore this
                    }
                }
            }

            ++fieldId;
        }

        // end the transaction
        this.con.commit();

        // cleanup
        insertTokenStmt.close();
        insertTokenStmt = null;
        insertFieldStmt.close();
        insertFieldStmt = null;
        insertFileStmt.close();
        insertFileStmt = null;
        stmt.close();
        stmt = null;
    } catch (SQLException e) {
        e.printStackTrace();
        ex = e;
        try {
            this.con.rollback();
        } catch (SQLException e2) {
        }
    } finally {
        try {
            this.con.setAutoCommit(true);
        } catch (SQLException e2) {
        }
    }

    // an error occurred
    if (null != ex)
        throw new IOException(ex.getMessage());
}
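The tokenizer branch in the method above relies on the long-removed Token/next()/termText() API. A sketch of the same token-counting logic with the attribute-based API, assuming Lucene 5+ where the tokenizer gets its input through setReader; the class and method names are hypothetical, and the caller would invoke it as countTokens(field.stringValue()):

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenCounter {
    /** Counts lower-cased StandardTokenizer tokens, mirroring the tokenMap logic above. */
    public static Map<String, Integer> countTokens(String text) throws IOException {
        Map<String, Integer> tokenMap = new HashMap<>();
        try (Tokenizer tokenizer = new StandardTokenizer()) {
            tokenizer.setReader(new StringReader(text));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                // one counter per distinct lower-cased token
                tokenMap.merge(term.toString().toLowerCase(), 1, Integer::sum);
            }
            tokenizer.end();
        }
        return tokenMap;
    }
}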
From source file:de.walware.statet.r.internal.core.rhelp.index.DefaultAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(final String fieldName, Reader reader) {
    if (this.charFilterFactory != null) {
        reader = this.charFilterFactory.create(reader);
    }
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream result = source;
    result = new EnglishPossessiveFilter(getVersion(), result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, this.stopwords);
    result = new KeywordRepeatFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    result = new RemoveDuplicatesTokenFilter(result);
    return new TokenStreamComponents(source, result);
}
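The KeywordRepeatFilter/SnowballFilter/RemoveDuplicatesTokenFilter tail of this chain emits each word both as typed and as its stem, then collapses exact duplicates. A self-contained sketch of that behaviour with an inline analyzer; it assumes a recent Lucene where createComponents takes only the field name, leaves out the char filter and stop words of the original, and uses an illustrative sample text:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.tartarus.snowball.ext.EnglishStemmer;

public class StemmingDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new StandardTokenizer();
                TokenStream result = new EnglishPossessiveFilter(source);
                result = new LowerCaseFilter(result);
                result = new KeywordRepeatFilter(result);                   // duplicate each token...
                result = new SnowballFilter(result, new EnglishStemmer());  // ...stem the non-keyword copy...
                result = new RemoveDuplicatesTokenFilter(result);           // ...drop copies the stemmer left unchanged
                return new TokenStreamComponents(source, result);
            }
        };
        try (TokenStream ts = analyzer.tokenStream("doc", "plotting functions")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // plotting, plot, functions, function
            }
            ts.end();
        }
    }
}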