Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.standard.StandardTokenizer constructor.

Prototype

public StandardTokenizer(AttributeFactory factory) 

Document

Creates a new StandardTokenizer with a given org.apache.lucene.util.AttributeFactory.
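
Note that the usage examples below come from older Lucene releases, where StandardTokenizer could also be constructed directly from a Reader. For reference, a minimal sketch of the documented AttributeFactory constructor (assuming Lucene 5.x or later; the sample text is a placeholder) might look like this:

Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
tokenizer.setReader(new StringReader("some sample text"));
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
tokenizer.reset();                          // required before the first incrementToken()
while (tokenizer.incrementToken()) {
    System.out.println(termAtt.toString()); // print each token's text
}
tokenizer.end();
tokenizer.close();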

Usage

From source file: analyzers.FormalAnalyzer.java

License: Apache License

/**
 * Define how tokens are processed.
 *
 * @param fieldName the field being analyzed
 * @param reader    the reader for the document
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;

    if (!tokenOpts.disableAllFilters) {
        // the chain of token filters...

        chain = new StandardFilter(chain);

        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);

        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);

        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);

        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);

        // remove 's
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);

        // spelling correction            
        if (!spellingHashtable.isEmpty())
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);

        if (!tokenOpts.disableStemming) {
            // Krovets stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }

    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}
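
The two-argument createComponents above targets Lucene 4.x, where the analyzer still received the Reader. From Lucene 5 onward the framework supplies the reader itself, so a rough sketch of the same idea against the newer API (an illustration, not the original project's code; only a subset of the filters is shown) could be:

@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer tokenizer = new StandardTokenizer();   // no Reader argument in Lucene 5+
    TokenStream chain = new LowerCaseFilter(tokenizer);
    chain = new StopFilter(chain, stopWordSet);      // stopWordSet as in the example above
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}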

From source file: brazilianStemmer.BrazilianAnalyzer.java

License: Apache License

/**
 * Creates a TokenStream which tokenizes all the text in the provided
 * Reader.
 * 
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         LowerCaseFilter, StopFilter, BrazilianAccentsFilter and
 *         BrazilianStemFilter.
 */
public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);

    // Convert to lowercase before stop-word removal and stemming.
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, englishStopWords);

    result = new BrazilianAccentsFilter(result);
    result = new StopFilter(result, stopWords);

    result = new BrazilianStemFilter(result, stopWords);

    return result;
}

From source file: com.appeligo.lucene.PorterStemAnalyzer.java

License: Apache License

/** Filters a StandardTokenizer with StandardFilter, LowerCaseFilter, StopFilter and PorterStemFilter. */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopWords);
    result = new PorterStemFilter(result);
    return result;
}
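
To see what this chain produces, the analyzer can be driven directly. A hypothetical consumption snippet for this era of the API (the no-argument constructor, field name, and sample text are all assumptions for illustration) might be:

Analyzer analyzer = new PorterStemAnalyzer();
TokenStream ts = analyzer.tokenStream("body", new StringReader("The quick foxes jumped"));
TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(termAtt.term()); // e.g. "quick", "fox", "jump" with default English stop words
}
ts.close();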

From source file: com.devb.search.IndicIndexer.java

License: Apache License

@Override
public void makeIndex() {
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    boolean create = true;

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");

        org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new HindiAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);

        if (create) {
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        IndexWriter writer = new IndexWriter(dir, iwc);
        if (docDir.canRead()) {
            if (docDir.isDirectory()) {
                String[] files = docDir.list();
                if (files != null) {
                    for (int i = 0; i < files.length; i++) {
                        File file = new File(docDir, files[i]);
                        FileInputStream fileInputStream = new FileInputStream(file);
                        BufferedReader reader = new BufferedReader(
                                new InputStreamReader(fileInputStream, "UTF-8"));
                        Tokenizer tokenizer = new StandardTokenizer(reader);
                        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
                        tokenizer.reset();
                        int lineNumber = 0; // note: increments per token, not per physical line
                        try {
                            while (tokenizer.incrementToken()) {
                                Document doc = new Document();
                                Field pathField = new StringField("path", file.getName(), Field.Store.YES);
                                doc.add(pathField);
                                TextField nField = new TextField("linenumber",
                                        Integer.toString(++lineNumber), Store.YES);
                                doc.add(nField);
                                TextField field = new TextField("contents", termAtt.toString(), Store.YES);
                                doc.add(field);
                                writer.addDocument(doc);
                            }
                            System.out.println("Adding " + file + "\n");
                        } catch (Exception e) {
                            e.printStackTrace();
                        } finally {
                            tokenizer.close();
                            reader.close();
                            fileInputStream.close();
                        }
                    }
                }
            }
        }

        writer.close();

        Date end = new Date();
        System.out.println((end.getTime() - start.getTime()) + " total milliseconds\n");

    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file: com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java

License: Apache License

/**
 *  Retrieve the tokens in a String. Behaves like getTokens, but operates on
 *  a string instead of a tweet object.
 * 
 *  @param  text    The text to tokenize.
 *  @return         The tokens in the text.
 */

// Version 1
/*public LinkedList<String> getTokens (String text) {
    LinkedList<String> tokens = new LinkedList<String>();
    String[] words = text.split(" ");
    tokens.addAll(Arrays.asList(words));
    return tokens;
}*/

// Version 2
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
        //System.out.print(termAtt.term());
    }
    ts.close();
    return tokens;
}
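
TermAttribute and its term() method were removed in Lucene 4. A sketch of the same helper against a current API (assuming Lucene 5.x or later, where CharTermAttribute is used and reset()/end()/close() are part of the TokenStream contract) could be:

public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(text));
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    tokenizer.end();
    tokenizer.close();
    return tokens;
}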

From source file: com.duroty.lucene.analysis.DefaultAnalyzer.java

License: Open Source License

/**
 * Builds the analysis chain for the given field.
 *
 * @param fieldName the name of the field being analyzed
 * @param reader the reader supplying the field's text
 *
 * @return a StandardTokenizer filtered by StandardFilter and LowerCaseFilter,
 *         plus StopFilter when a stop table is configured
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);

    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);

    if (stopTable != null) {
        return new StopFilter(result, stopTable);
    } else {
        return result;
    }
}

From source file: com.duroty.lucene.analysis.EmptyAnalyzer.java

License: Apache License

/** Constructs a {@link StandardTokenizer} filtered only by a
{@link StandardFilter}. */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);

    return result;
}

From source file: com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticAnalyzer.java

License: Open Source License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link
 * StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
 * and a {@link PhoneticFilter}.
 */
public TokenStream tokenStream(String fieldname, final Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PhoneticFilter(result, encoder);
    return result;
}

From source file: com.flaptor.hounder.searcher.query.AQuerySuggestor.java

License: Apache License

private List<AQuery> suggestLinear(AQuery query) {
    List<AQuery> queries = new ArrayList<AQuery>();
    if (null == query) {
        logger.debug("Can't make a suggestion for a null query");
    } else if (!(query instanceof LazyParsedQuery)) {
        // TODO FIXME
        logger.debug("can not make suggestions for queries of type " + query.getClass());
    } else {
        String originalString = ((LazyParsedQuery) query).getQueryString();
        StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(originalString));
        List<String> tokens = new ArrayList<String>();
        try {
            Token token = new Token();
            while (true) {
                token = tokenizer.next(token);
                if (null == token) {
                    break;
                }
                tokens.add(TokenUtil.termText((Token) token.clone()));
            }

            // for every word, suggest something
            for (int i = 0; i < tokens.size(); i++) {
                StringBuffer sb = new StringBuffer();
                //                    sb.append("\"");
                for (int j = 0; j < i; j++) {
                    sb.append(tokens.get(j));
                    sb.append(" ");
                }
                String[] suggestions = suggestor.suggestWords(tokens.get(i));
                for (String suggestion : suggestions) {
                    // generate final sb
                    StringBuffer sbf = new StringBuffer(sb);
                    sbf.append(suggestion);
                    sbf.append(" ");
                    for (int k = i + 1; k < tokens.size(); k++) {
                        sbf.append(tokens.get(k));
                        if (k + 1 < tokens.size()) {
                            sbf.append(" ");
                        }
                    }
                    //                        sbf.append("\"");
                    queries.add(new LazyParsedQuery(sbf.toString()));
                }
            }

        } catch (IOException e) {
            logger.error("Error while suggesting query", e);
            return new ArrayList<AQuery>();
        }
    }
    return queries;
}

From source file: com.google.ie.common.search.analyzer.IdeaExchangeQueryAnalyzer.java

License: Apache License

@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
        fieldName = DEFAULT_LANGUAGE; // reuse the parameter as the Snowball language name
        streams.filteredTokenStream = new SnowballFilter(streams.filteredTokenStream, fieldName);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);
    return streams.filteredTokenStream;
}
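
The SavedStreams/reusableTokenStream pattern shown above dates from Lucene 2.9-3.x; Lucene 4 removed it, and the base Analyzer now caches and reuses TokenStreamComponents per thread automatically. A rough modern equivalent of this analyzer (a sketch assuming Lucene 5.x or later; StandardFilter became a no-op and is omitted, and the Snowball language name is an assumption) would be:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream sink = new LowerCaseFilter(source);
    sink = new StopFilter(sink, stopSet);       // stopSet as in the example above
    sink = new SnowballFilter(sink, "English"); // language name assumed
    return new TokenStreamComponents(source, sink);
}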