Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.standard.StandardTokenizer constructor.

Prototype

public StandardTokenizer(AttributeFactory factory) 

Document

Creates a new StandardTokenizer with a given org.apache.lucene.util.AttributeFactory.
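
The prototype above is the newer form of the constructor (Lucene 5.x and later), where the input is supplied afterwards via Tokenizer.setReader rather than through the constructor; most of the examples below target older releases whose constructors accept a Reader directly. A minimal, self-contained sketch of the factory-based form, assuming a Lucene 5.x+ classpath and hypothetical sample text:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerSketch {
    public static void main(String[] args) throws Exception {
        // Create the tokenizer with an explicit AttributeFactory (the prototype shown above).
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(new StringReader("Lucene in Action"));

        // Consume tokens through the attribute-based API.
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}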

Usage

From source file: com.NGramTokenBaseAnalyzer.java

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {

    Tokenizer src = new StandardTokenizer(reader);

    TokenStream tok = new LowerCaseFilter(src);
    tok = filter(tok, this.unigramOutput);
    return new TokenStreamComponents(src, tok);
}
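
For context, here is a minimal sketch (not part of the original source) of how an Analyzer built from such TokenStreamComponents might be driven. It assumes a Lucene 4.x+ attribute API, an already constructed Analyzer instance named analyzer, and placeholder field name and text; the required imports are java.io.StringReader, org.apache.lucene.analysis.TokenStream, and org.apache.lucene.analysis.tokenattributes.CharTermAttribute.

// Hypothetical driver: feed sample text through the analyzer and print each token.
try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Some sample text"))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();
}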

From source file: com.redhat.satellite.search.index.ngram.NGramAnalyzer.java

License: Open Source License

/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(new LowerCaseFilter(new StandardFilter(new StandardTokenizer(reader))),
            min_ngram, max_ngram);
}

From source file: com.stimulus.archiva.search.ArchivaAnalyzer.java

License: Open Source License

@Override
public final TokenStream tokenStream(String fieldName, final Reader reader) {

    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PorterStemFilter(result);
    return result;
}

From source file: com.stimulus.archiva.search.FilterAnalyzer.java

License: Open Source License

@Override
public final TokenStream tokenStream(String fieldName, final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}

From source file: com.talis.lucene.analysis.normen.NormaliseStandardAnalyzer.java

License: Apache License

@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new ISOLatin1AccentFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}

From source file: com.talis.lucene.analysis.nostopen.NoStopwordStandardAnalyzer.java

License: Apache License

@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}

From source file: com.tangentum.phonetix.lucene.PhoneticAnalyzer.java

License: Open Source License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link
 * StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
 * and a {@link PhoneticFilter}.
 */
public TokenStream tokenStream(final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PhoneticFilter(result, encoder);
    return result;
}

From source file: com.xiaomi.linden.lucene.analyzer.LindenWordDelimiterAnalyzer.java

License: Apache License

@Override
protected TokenStreamComponents createComponents(String s, Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);

    TokenStream ts = factoryDefault.create(source);
    if (this.toLowerCase) {
        ts = new LowerCaseFilter(ts);
    }
    if (this.setStopWords) {
        ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }
    return new TokenStreamComponents(source, ts);
}

From source file: de.joergjahnke.jdesktopsearch.abstractionlayer.SQLServerIndexWriter.java

License: Open Source License

public synchronized void addDocument(final Document doc) throws IOException {
    SQLException ex = null;

    try {
        this.con.setAutoCommit(false);

        // get ID for the new file
        Statement stmt = con.createStatement();
        int fileId = 1;
        ResultSet rs = stmt.executeQuery("SELECT MAX( ID ) AS \"MaxID\" FROM File_");

        if (null != rs && rs.next()) {
            fileId = rs.getInt("MaxID") + 1;
        }

        rs.close();
        rs = null;

        // insert new file record
        PreparedStatement insertFileStmt = con
                .prepareStatement("INSERT INTO File_ ( ID, Name ) VALUES ( ?, ? )");

        insertFileStmt.setInt(1, fileId);
        insertFileStmt.setString(2, doc.getField("path").stringValue());
        int inserted = insertFileStmt.executeUpdate();

        // prepare SQL statements for the different data to insert
        PreparedStatement insertFieldStmt = con.prepareStatement(
                "INSERT INTO Field_ ( ID, FileID, Name, Value, FullValue, IsIndexable, IsCompressed ) VALUES ( ?, ?, ?, ?, ?, ?, ? )");
        PreparedStatement insertTokenStmt = con
                .prepareStatement("INSERT INTO Token_ ( FieldID, Word_, Occurrences ) VALUES ( ?, ?, ? )");

        // get ID for the next field
        int fieldId = 1;

        rs = stmt.executeQuery("SELECT MAX( ID ) AS \"MaxID\" FROM Field_");
        if (null != rs && rs.next()) {
            fieldId = rs.getInt("MaxID") + 1;
        }

        rs.close();
        rs = null;

        // insert all elements into the database
        for (Enumeration elements = doc.fields(); elements.hasMoreElements();) {
            final Field field = (Field) elements.nextElement();

            // don't save the value if not marked for storage
            String value = field.stringValue();

            if (!field.isStored()) {
                value = null;
            }

            // compress data if it does not fit into the 255 char value field and
            // if this field is also tokenized so that a search can still take place
            byte[] fullValue = null == value || value.length() <= 255 ? null : value.getBytes();
            boolean isCompressed = false;

            if (null != fullValue && field.isTokenized()) {
                // compress data
                final ByteArrayOutputStream newFullValue = new ByteArrayOutputStream();
                final byte[] buf = new byte[4000];
                final Deflater compresser = new Deflater();

                compresser.setInput(fullValue);
                compresser.finish();

                while (!compresser.finished()) {
                    final int read = compresser.deflate(buf);

                    newFullValue.write(buf, 0, read);
                }
                compresser.end();

                // only use compressed version if it is shorter than the original one
                if (newFullValue.size() < fullValue.length) {
                    fullValue = newFullValue.toByteArray();
                    isCompressed = true;
                }

            }

            // insert new field record
            insertFieldStmt.setInt(1, fieldId);
            insertFieldStmt.setInt(2, fileId);
            insertFieldStmt.setString(3, field.name());
            insertFieldStmt.setString(4, isCompressed || null == value ? null : StringUtils.left(value, 255));
            if (isCompressed || (null != fullValue && fullValue.length > 255)) {
                insertFieldStmt.setBytes(5, fullValue);
            } else {
                insertFieldStmt.setObject(5, null);
            }
            insertFieldStmt.setBoolean(6, field.isIndexed());
            insertFieldStmt.setBoolean(7, isCompressed);
            inserted = insertFieldStmt.executeUpdate();

            // tokenize string if necessary and store all tokens
            if (null != field.stringValue()) {
                final Map<String, Integer> tokenMap = new HashMap<String, Integer>(
                        field.isTokenized() ? field.stringValue().length() / this.AVG_WORDLENGTH : 1);

                if (field.isTokenized()) {
                    if (!this.USE_LUCENE_TOKENIZER) {
                        for (StringTokenizer tokenizer = new StringTokenizer(field.stringValue(),
                                " |&.:?,;!()[]/\t\n\r\f\240"); tokenizer.hasMoreTokens();) {
                            final String token = tokenizer.nextToken().toLowerCase();

                            if (tokenMap.containsKey(token)) {
                                tokenMap.put(token, tokenMap.get(token) + 1);
                            } else {
                                tokenMap.put(token, 1);
                            }
                        }
                    } else {
                        final Tokenizer tokenizer = new StandardTokenizer(
                                new StringReader(field.stringValue()));
                        Token token = null;

                        while ((token = tokenizer.next()) != null) {
                            final String tokenText = token.termText().toLowerCase();

                            if (tokenMap.containsKey(tokenText)) {
                                tokenMap.put(tokenText, tokenMap.get(tokenText) + 1);
                            } else {
                                tokenMap.put(tokenText, 1);
                            }
                        }
                    }
                } else {
                    tokenMap.put(field.stringValue().toLowerCase(), 1);
                }

                // store tokens in database
                for (String token : tokenMap.keySet()) {
                    insertTokenStmt.setInt(1, fieldId);
                    insertTokenStmt.setString(2, token.substring(0, Math.min(token.length(), 255)));
                    insertTokenStmt.setInt(3, tokenMap.get(token));
                    try {
                        insertTokenStmt.executeUpdate();
                    } catch (SQLException e) {
                        // The SQL server may reject the row because it treats as duplicates two entries
                        // that Java considers distinct; we deliberately ignore this.
                    }
                }
            }

            ++fieldId;
        }

        // end the transaction
        this.con.commit();

        // cleanup
        insertTokenStmt.close();
        insertTokenStmt = null;
        insertFieldStmt.close();
        insertFieldStmt = null;
        insertFileStmt.close();
        insertFileStmt = null;
        stmt.close();
        stmt = null;
    } catch (SQLException e) {
        e.printStackTrace();
        ex = e;
        try {
            this.con.rollback();
        } catch (SQLException e2) {
        }
    } finally {
        try {
            this.con.setAutoCommit(true);
        } catch (SQLException e2) {
        }
    }

    // an error occurred
    if (null != ex)
        throw new IOException(ex.getMessage());
}
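
Note that the token loop in the method above relies on the legacy Lucene token API (Token, Tokenizer.next(), Token.termText()), which was removed in later releases. A hedged sketch of the attribute-based equivalent, assuming Lucene 4.x (where StandardTokenizer still accepts a Reader) and the same tokenMap as in the original loop:

final Tokenizer tokenizer = new StandardTokenizer(new StringReader(field.stringValue()));
final CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);

tokenizer.reset();
while (tokenizer.incrementToken()) {
    final String tokenText = termAtt.toString().toLowerCase();

    // count occurrences exactly as in the original loop
    if (tokenMap.containsKey(tokenText)) {
        tokenMap.put(tokenText, tokenMap.get(tokenText) + 1);
    } else {
        tokenMap.put(tokenText, 1);
    }
}
tokenizer.end();
tokenizer.close();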

From source file: de.walware.statet.r.internal.core.rhelp.index.DefaultAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(final String fieldName, Reader reader) {
    if (this.charFilterFactory != null) {
        reader = this.charFilterFactory.create(reader);
    }
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream result = source;
    result = new EnglishPossessiveFilter(getVersion(), result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, this.stopwords);
    result = new KeywordRepeatFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    result = new RemoveDuplicatesTokenFilter(result);
    return new TokenStreamComponents(source, result);
}