Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

This page collects example usages of the org.apache.lucene.analysis.standard.StandardTokenizer no-argument constructor.

Prototype

public StandardTokenizer() 

Document

Creates a new instance of org.apache.lucene.analysis.standard.StandardTokenizer.
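
For orientation, here is a minimal, self-contained sketch (not taken from the sources below) that tokenizes a string with the no-argument constructor. Since this constructor takes no Reader, the input is supplied afterwards via setReader; the sample text is illustrative only:

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Lucene in Action, 2nd Edition"));
        // The term attribute exposes the text of the current token.
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset(); // required before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // Lucene, in, Action, 2nd, Edition
        }
        tokenizer.end();
        tokenizer.close();
    }
}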

Usage

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
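
A hypothetical snippet (not part of the original source) showing how such an analyzer is typically consumed; the field name "body" and the sample text are placeholders:

Analyzer analyzer = new ASCIIEnglishAnalyzer();
try (TokenStream ts = analyzer.tokenStream("body", "The dogs are running")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // expect roughly: dog, run (stop words removed, lowercased, Porter-stemmed)
        System.out.println(term.toString());
    }
    ts.end();
}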

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.MinimalAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new StopFilter(result, stopwords);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}

From source file:edu.umass.cs.ciir.IndexFromGalago.java

License:Open Source License

public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        System.out.println(getUsage());
        return;
    }

    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();

    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;
    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text-retrieval tasks)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: lowercase all tokens
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(),
                    new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: optionally remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) );
            // Step 4: optionally apply stemming
            // Uncomment the following line to apply Krovetz or Porter stemmer
            // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) );
            // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) );
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);

                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);
                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));
                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);

                    // tokens of the document
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));

                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }

                } catch (Exception e) {
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });

        }
    }

    System.out.println("Indexing Done. ");
}
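
As a hypothetical follow-up (not part of the original source), the index built above could be sanity-checked by querying the "tokens" field with the same analyzer; this assumes the usual org.apache.lucene.search and org.apache.lucene.queryparser.classic imports, and reuses the analyzer and luceneIndexPath names from the example:

try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(luceneIndexPath)))) {
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser("tokens", analyzer);
    TopDocs top = searcher.search(parser.parse("information retrieval"), 10);
    for (ScoreDoc sd : top.scoreDocs) {
        // "id" was stored above, so it can be read back from each hit.
        System.out.println(searcher.doc(sd.doc).get("id") + "\t" + sd.score);
    }
}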

From source file:fastcampus.lucene.example.search.SpellCheckerExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Directory directory = FSDirectory.open(Paths.get("./index/spell/"));
    SpellChecker spellChecker = new SpellChecker(directory);

    //Analyzer analyzer = new StandardAnalyzer();
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String s) {
            Reader reader = new StringReader(s);
            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            String name = "nfc_cf";
            Normalizer2 normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE);
            TokenFilter filter = new ICUNormalizer2Filter(tokenizer, normalizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);

    Path path = Paths.get("./data/spell/dic.txt");

    spellChecker.setSpellIndex(directory);
    spellChecker.clearIndex();
    spellChecker.indexDictionary(new PlainTextDictionary(path), indexWriterConfig, true);
    String wordForSuggestions = "?";
    //spellChecker.setStringDistance(new LevensteinDistance()); // Levenshtein distance (alternative)
    spellChecker.setStringDistance(new JaroWinklerDistance()); // Jaro-Winkler distance

    int suggestionsNumber = 1;
    String[] suggestions = spellChecker.suggestSimilar(wordForSuggestions, suggestionsNumber);
    if (suggestions != null && suggestions.length > 0) {
        for (String word : suggestions) {
            System.out.println("Did you mean:" + word);
        }
    } else {
        System.out.println("No suggestions found for word:" + wordForSuggestions);
    }

}

From source file:indexer.CustomAnalyzer.java

@Override
protected TokenStreamComponents createComponents(String string) {
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream tokenStream = new StandardFilter(tokenizer);

    tokenStream = new LowerCaseFilter(tokenStream);
    try {
        tokenStream = this.filterStopWords(tokenStream);
    } catch (IOException ex) {
        Logger.getLogger(CustomAnalyzer.class.getName()).log(Level.SEVERE, null, ex);
    }
    tokenStream = new ASCIIFoldingFilter(tokenStream);
    tokenStream = new SnowballFilter(tokenStream, new RomanianStemmer());

    return new TokenStreamComponents(tokenizer, tokenStream);
}

From source file:internal.analyzers.indexing.PDFAnalyzer.java

License:Open Source License

@Override
protected Analyzer.TokenStreamComponents createComponents(String s) {
    StringReader reader = new StringReader(s);
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(reader);
    } catch (IOException e) {
        log.error("Could not set reader on tokenizer. Threw IO exception");
    }

    TokenStream filter = new StandardFilter(tokenizer);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    filter = new StopFilter(filter, stop_set);
    filter = new NumberFilter(filter);
    filter = new AlphaNumericFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}

From source file:io.vertigo.dynamo.plugins.collections.lucene.DefaultAnalyzer.java

License:Apache License

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* set up the tokenizer */
    final Tokenizer source = new StandardTokenizer();
    //-----
    /* strip elisions (l', d', ...) */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove stop words (articles, adjectives) */
    filter = new StopFilter(filter, stopWords);
    /* fold accented characters to ASCII */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}

From source file:luceneprueba.CustomAnalyzers.ReviewAnalyzer.java

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tokenizer = new StandardFilter(src);
    tokenizer = new LowerCaseFilter(tokenizer);
    tokenizer = new StopFilter(tokenizer, stopwords);
    return new TokenStreamComponents(src, tokenizer) {
        @Override
        protected void setReader(final Reader reader) {
            src.setMaxTokenLength(ReviewAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}

From source file:lucene_parameters.StandardAnalyzerWithNumberFilter.java

@Override
protected TokenStreamComponents createComponents(String string) {
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream filter = new StandardFilter(tokenizer);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, STOP_WORDS_SET);
    filter = new NumberFilter(filter);

    return new TokenStreamComponents(tokenizer, filter);
}

From source file:my_code.MyRomanianAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();

    TokenStream result = source;
    result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
}