Example usage for the org.apache.lucene.search.spell.DirectSpellChecker constructor DirectSpellChecker()

List of usage examples for the org.apache.lucene.search.spell.DirectSpellChecker constructor DirectSpellChecker()

Introduction

On this page you can find example usages of the org.apache.lucene.search.spell.DirectSpellChecker constructor DirectSpellChecker().

Prototype

public DirectSpellChecker() 

Source Link

Document

Creates a DirectSpellChecker with default configuration values

Usage

From source file:com.codeReading.core.opengrok.SearchHelper.java

License:Open Source License

/**
 * Create the searcher to use wrt. to currently set parameters and the given
 * projects. Does not produce any {@link #redirect} link. It also does
 * nothing if {@link #redirect} or {@link #errorMsg} have a
 * none-{@code null} value. <p> Parameters which should be populated/set at
 * this time: <ul> <li>{@link #builder}</li> <li>{@link #dataRoot}</li>
 * <li>{@link #order} (falls back to relevance if unset)</li>
 * <li>{@link #parallel} (default: false)</li> </ul> Populates/sets: <ul>
 * <li>{@link #query}</li> <li>{@link #searcher}</li> <li>{@link #sort}</li>
 * <li>{@link #projects}</li> <li>{@link #errorMsg} if an error occurs</li>
 * </ul>/*from   ww  w  .  jav  a 2  s . c  om*/
 *
 * @param projects project to use query. If empty, a none-project opengrok
 * setup is assumed (i.e. DATA_ROOT/index will be used instead of possible
 * multiple DATA_ROOT/$project/index).
 * @return this instance
 */
public SearchHelper prepareExec(SortedSet<String> projects) {
    if (redirect != null || errorMsg != null) {
        return this;
    }
    // the Query created by the QueryBuilder
    try {
        indexDir = new File(dataRoot, "index");
        query = builder.build();
        if (projects == null) {
            errorMsg = "No project selected!";
            return this;
        }
        this.projects = projects;
        if (projects.isEmpty()) {
            //no project setup
            FSDirectory dir = FSDirectory.open(indexDir);
            searcher = new IndexSearcher(DirectoryReader.open(dir));
        } else if (projects.size() == 1) {
            // just 1 project selected
            FSDirectory dir = FSDirectory.open(new File(indexDir, projects.first()));
            searcher = new IndexSearcher(DirectoryReader.open(dir));
        } else {
            //more projects                                
            IndexReader[] subreaders = new IndexReader[projects.size()];
            int ii = 0;
            //TODO might need to rewrite to Project instead of
            // String , need changes in projects.jspf too
            for (String proj : projects) {
                FSDirectory dir = FSDirectory.open(new File(indexDir, proj));
                subreaders[ii++] = DirectoryReader.open(dir);
            }
            MultiReader searchables = new MultiReader(subreaders, true);
            if (parallel) {
                int noThreads = 2 + (2 * Runtime.getRuntime().availableProcessors()); //TODO there might be a better way for counting this
                executor = Executors.newFixedThreadPool(noThreads);
            }
            searcher = parallel ? new IndexSearcher(searchables, executor) : new IndexSearcher(searchables);
        }
        // TODO check if below is somehow reusing sessions so we don't
        // requery again and again, I guess 2min timeout sessions could be
        // usefull, since you click on the next page within 2mins, if not,
        // then wait ;)
        switch (order) {
        case LASTMODIFIED:
            sort = new Sort(new SortField("date", SortField.Type.STRING, true));
            break;
        case BY_PATH:
            sort = new Sort(new SortField("fullpath", SortField.Type.STRING));
            break;
        default:
            sort = Sort.RELEVANCE;
            break;
        }
        checker = new DirectSpellChecker();
    } catch (ParseException e) {
        errorMsg = PARSE_ERROR_MSG + e.getMessage();
    } catch (FileNotFoundException e) {
        //          errorMsg = "Index database(s) not found: " + e.getMessage();
        errorMsg = "Index database(s) not found.";
    } catch (Exception e) {
        errorMsg = e.getMessage();
    }
    return this;
}

From source file:de.blizzy.documentr.search.PageFinder.java

License:Open Source License

/**
 * Builds a spelling suggestion for the given search text.
 * <p>
 * The text is tokenized with {@code analyzer} on the
 * {@code PageIndex.ALL_TEXT_SUGGESTIONS} field; for every non-blank token a
 * {@link DirectSpellChecker} is asked for one more-popular alternative, which
 * then replaces the token in the text. If at least one token was replaced
 * and searching for the corrected text yields hits, the suggestion is
 * returned.
 *
 * @param searchText the text entered by the user
 * @param authentication passed through to {@code findPages}
 * @param searcher searcher whose index reader supplies the suggestions and
 *        which is used to count hits for the corrected text
 * @return the suggestion (plain text, HTML with replaced words emphasized,
 *         and hit count), or {@code null} if nothing was replaced or the
 *         corrected text has no hits
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        // addAttribute returns the attribute instance, so look it up once
        CharTermAttribute charTerm = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                words.add(new WordPosition(text, offset.startOffset(), offset.endOffset()));
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    // Replace from the last word to the first so that the offsets of the
    // words still to be processed stay valid after each replacement.
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            // Insert the word unescaped: the whole buffer is HTML-escaped once
            // below, so escaping here as well would double-escape the word.
            suggestedSearchTextHtml.replace(start, end, startMarker + suggestedWord + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // The markers contain no characters touched by HTML escaping, so
            // they survive it and are turned into emphasis tags afterwards.
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestion, html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.blizzy.documentr.search.PageIndex.java

License:Open Source License

/**
 * Builds a spelling suggestion for the given search text.
 * <p>
 * The text is tokenized with {@code analyzer} on the
 * {@code ALL_TEXT_SUGGESTIONS} field; for every non-blank token a
 * {@link DirectSpellChecker} is asked for one more-popular alternative, which
 * then replaces the token in the text. If at least one token was replaced
 * and searching for the corrected text yields hits, the suggestion is
 * returned.
 *
 * @param searchText the text entered by the user
 * @param authentication passed through to {@code findPages}
 * @param searcher searcher whose index reader supplies the suggestions and
 *        which is used to count hits for the corrected text
 * @return the suggestion (plain text, HTML with replaced words emphasized,
 *         and hit count), or {@code null} if nothing was replaced or the
 *         corrected text has no hits
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        // addAttribute returns the attribute instance, so look it up once
        CharTermAttribute charTerm = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                words.add(new WordPosition(text, offset.startOffset(), offset.endOffset()));
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    // Replace from the last word to the first so that the offsets of the
    // words still to be processed stay valid after each replacement.
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            // Insert the word unescaped: the whole buffer is HTML-escaped once
            // below, so escaping here as well would double-escape the word.
            suggestedSearchTextHtml.replace(start, end, startMarker + suggestedWord + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // The markers contain no characters touched by HTML escaping, so
            // they survive it and are turned into emphasis tags afterwards.
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestion, html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.java

License:Apache License

/**
 * Creates a new {@link DirectSpellChecker} and configures it from this
 * object's accessors (accuracy, sort, string distance, edit/inspection
 * limits, frequency thresholds, prefix and word length). Term lower-casing
 * is switched off.
 *
 * @return the configured spell checker
 * @throws IllegalArgumentException if {@code sort()} is neither
 *         {@code SCORE} nor {@code FREQUENCY}
 */
public DirectSpellChecker createDirectSpellChecker() {
    final DirectSpellChecker checker = new DirectSpellChecker();
    checker.setAccuracy(accuracy());

    // Map the configured sort mode onto the comparator used to rank suggestions.
    final Comparator<SuggestWord> ordering;
    switch (sort()) {
    case FREQUENCY:
        ordering = LUCENE_FREQUENCY;
        break;
    case SCORE:
        ordering = SCORE_COMPARATOR;
        break;
    default:
        throw new IllegalArgumentException("Illegal suggest sort: " + sort());
    }
    checker.setComparator(ordering);

    checker.setDistance(stringDistance());
    checker.setMaxEdits(maxEdits());
    checker.setMaxInspections(maxInspections());
    checker.setMaxQueryFrequency(maxTermFreq());
    checker.setMinPrefix(prefixLength());
    checker.setMinQueryLength(minWordLength());
    checker.setThresholdFrequency(minDocFreq());
    checker.setLowerCaseTerms(false);
    return checker;
}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Indexes every line of {@code /config/names.txt} into an in-memory index
 * (fields {@code body} and {@code body_ngram}) and checks the corrections
 * returned by {@link NoisyChannelSpellChecker} when scored with a
 * {@link LaplaceScorer}: plain corrections, highlighted output, and
 * corrections produced through a synonym-expanding analyzer.
 * <p>
 * The resource reader is closed right after indexing; the index reader,
 * writer and directory are closed once all assertions have passed.
 */
@Test
public void testMarvelHeros() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    // body_ngram: lower-cased shingles of 2..3 tokens, no unigrams
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    // body: lower-cased single tokens
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
    } finally {
        // the names file is only needed while indexing
        reader.close();
    }

    DirectoryReader ir = DirectoryReader.open(writer, false);
    WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
            new BytesRef(" "), 0.5f);

    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 1, 2);
    Correction[] corrections = result.corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
    assertThat(result.cutoffScore, greaterThan(0d));

    result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 0, 1);
    corrections = result.corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
    assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));

    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorr</em> the <em>god</em> jewel"));
    assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(),
            equalTo("xor the <em>god</em> jewel"));
    assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorn</em> the <em>god</em> jewel"));
    assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorr</em> the got jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));

    // Test some of the highlighting corner cases
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir,
            "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorr the god</em> jewel"));
    assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(),
            equalTo("xor <em>the god</em> jewel"));
    assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorn the god</em> jewel"));
    assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(),
            equalTo("xor teh <em>god</em> jewel"));

    // test synonyms

    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false,
                        new WhitespaceAnalyzer(Version.LUCENE_41));
                parser.parse(
                        new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };

    // The generator created above holds this same spellchecker instance, so
    // these settings are changed on the object that generator was given.
    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));

    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
            10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));

    // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
            10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));

    // release the index resources: reader first, then writer, then directory
    ir.close();
    writer.close();
    dir.close();
}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Indexes every line of {@code /config/names.txt} into an in-memory index
 * (fields {@code body}, {@code body_reverse} and {@code body_ngram}) and
 * checks the corrections returned by {@link NoisyChannelSpellChecker} when
 * candidates come from a {@link MultiCandidateGeneratorWrapper} combining a
 * generator on {@code body} with one on the reversed-token field.
 * <p>
 * The resource reader is closed right after indexing; the index reader,
 * writer and directory are closed once all assertions have passed.
 */
@Test
public void testMarvelHerosMultiGenerator() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    // body_ngram: lower-cased shingles of 2..3 tokens, no unigrams
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    // body: lower-cased single tokens
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    // body_reverse: lower-cased single tokens with their characters reversed
    mapping.put("body_reverse", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t,
                    new ReverseStringFilter(Version.LUCENE_41, new LowerCaseFilter(Version.LUCENE_41, t)));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
    } finally {
        // the names file is only needed while indexing
        reader.close();
    }

    DirectoryReader ir = DirectoryReader.open(writer, false);
    LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram",
            0.95d, new BytesRef(" "), 0.5f);
    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
    DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse",
            SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper,
            MultiFields.getTerms(ir, "body_reverse"));
    CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);

    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1,
            ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix

    corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("varr the god jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir,
            "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
            "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

    // release the index resources: reader first, then writer, then directory
    ir.close();
    writer.close();
    dir.close();
}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Indexes every line of {@code /config/names.txt} into an in-memory index
 * (fields {@code body} and {@code body_ngram}) and checks the corrections
 * returned by {@link NoisyChannelSpellChecker} when scored with a
 * {@link LinearInterpoatingScorer} and, at the end, a
 * {@link StupidBackoffScorer}, including corrections produced through a
 * synonym-expanding analyzer.
 * <p>
 * The resource reader is closed right after indexing; the index reader,
 * writer and directory are closed once all assertions have passed.
 */
@Test
public void testMarvelHerosTrigram() throws IOException {

    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    // body_ngram: lower-cased shingles of 2..3 tokens, no unigrams
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    // body: lower-cased single tokens
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
    } finally {
        // the names file is only needed while indexing
        reader.close();
    }

    DirectoryReader ir = DirectoryReader.open(writer, false);
    WordScorer wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"),
            "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);

    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
            ir, "body", wordScorer, 1, 3).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 1, 1).corrections;
    assertThat(corrections.length, equalTo(0));
    //        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));

    wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 3).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 1, 3).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
            "body", wordScorer, 100, 3).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

    // test synonyms

    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false,
                        new WhitespaceAnalyzer(Version.LUCENE_41));
                parser.parse(
                        new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };

    // The generator created above holds this same spellchecker instance, so
    // these settings are changed on the object that generator was given.
    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.95);
    wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
            new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
            wordScorer, 1, 3).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95,
            10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 3).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.4);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir,
            "body", wordScorer, 0, 3).corrections;
    assertThat(corrections.length, equalTo(2));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));

    // release the index resources: reader first, then writer, then directory
    ir.close();
    writer.close();
    dir.close();
}

From source file:org.elasticsearch.search.suggest.SuggestUtils.java

License:Apache License

/**
 * Builds a {@link DirectSpellChecker} configured from the given settings.
 * <p>
 * The accuracy, sort order, string distance, maximum edits, maximum inspections,
 * maximum query frequency, minimum prefix, minimum query length and threshold
 * frequency are all copied from {@code suggestion}. Lower-casing of terms is
 * switched off on the returned instance.
 *
 * @param suggestion the settings to copy onto the new spell checker
 * @return a newly created and configured spell checker
 * @throws ElasticsearchIllegalArgumentException if the configured sort is neither
 *         {@code SCORE} nor {@code FREQUENCY}
 */
public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
    final DirectSpellChecker checker = new DirectSpellChecker();
    checker.setAccuracy(suggestion.accuracy());

    // Pick the comparator that matches the requested ordering of suggestions.
    switch (suggestion.sort()) {
    case SCORE:
        checker.setComparator(SCORE_COMPARATOR);
        break;
    case FREQUENCY:
        checker.setComparator(LUCENE_FREQUENCY);
        break;
    default:
        throw new ElasticsearchIllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
    }

    checker.setDistance(suggestion.stringDistance());
    checker.setMaxEdits(suggestion.maxEdits());
    checker.setMaxInspections(suggestion.maxInspections());
    checker.setMaxQueryFrequency(suggestion.maxTermFreq());
    checker.setMinPrefix(suggestion.prefixLength());
    checker.setMinQueryLength(suggestion.minWordLength());
    checker.setThresholdFrequency(suggestion.minDocFreq());
    checker.setLowerCaseTerms(false);
    return checker;
}

From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Checks the phrase corrections produced by {@link NoisyChannelSpellChecker} when
 * candidates come from a {@link DirectCandidateGenerator} and are scored with a
 * {@link LaplaceScorer}.
 * <p>
 * Every line of {@code /config/names.txt} is indexed into the {@code body} field
 * (lower-cased standard tokens) and the {@code body_ngram} field (lower-cased
 * shingles of size 2 to 3 without unigrams). The test then asserts the corrections
 * returned for several misspelled phrases, first with the per-field analyzer and
 * then with an analyzer that expands the synonyms {@code usa} and {@code ursa}.
 *
 * @throws IOException if reading the fixture or accessing the index fails
 */
@Test
public void testMarvelHeros() throws IOException {

    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = null;
    DirectoryReader ir = null;
    try {
        Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
        mapping.put("body_ngram", new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
                ShingleFilter tf = new ShingleFilter(t, 2, 3);
                tf.setOutputUnigrams(false);
                return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
            }

        });

        mapping.put("body", new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
                return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
            }

        });
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
                mapping);

        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
        writer = new IndexWriter(dir, conf);

        // Decode the fixture with an explicit charset so the indexed terms do not depend on
        // the platform default encoding. NOTE(review): names.txt is assumed to be UTF-8.
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"),
                java.nio.charset.Charset.forName("UTF-8")));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Document doc = new Document();
                doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
                doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
                writer.addDocument(doc);
            }
        } finally {
            // The fixture is fully consumed at this point; release the stream right away.
            reader.close();
        }

        ir = DirectoryReader.open(writer, false);
        WordScorer wordScorer = new LaplaceScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5f);

        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
        DirectSpellChecker spellchecker = new DirectSpellChecker();
        spellchecker.setMinQueryLength(1);
        DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
                SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
                ir, "body", wordScorer, 1, 2);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
                wordScorer, 0, 1);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ame"));

        suggester = new NoisyChannelSpellChecker(0.85);
        wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
                "body", wordScorer, 0, 2);
        assertThat(corrections.length, equalTo(4));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
        assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
        assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
                "body", wordScorer, 1, 2);
        assertThat(corrections.length, equalTo(4));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
        assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
        assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

        // test synonyms

        Analyzer analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
                TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
                try {
                    SolrSynonymParser parser = new SolrSynonymParser(true, false,
                            new WhitespaceAnalyzer(Version.LUCENE_41));
                    parser.add(new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                    filter = new SynonymFilter(filter, parser.build(), true);
                } catch (Exception e) {
                    // createComponents declares no checked exceptions, so rethrow unchecked with the cause.
                    throw new RuntimeException(e);
                }
                return new TokenStreamComponents(t, filter);
            }
        };

        spellchecker.setAccuracy(0.0f);
        spellchecker.setMinPrefix(1);
        spellchecker.setMinQueryLength(1);
        suggester = new NoisyChannelSpellChecker(0.85);
        wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
                wordScorer, 1, 2);
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
                10, null, analyzer);
        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
                wordScorer, 1, 2);
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    } finally {
        // Release the index resources even when an assertion above fails.
        if (ir != null) {
            ir.close();
        }
        if (writer != null) {
            writer.close();
        }
        dir.close();
    }
}

From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Checks the phrase corrections produced by {@link NoisyChannelSpellChecker} when
 * candidates come from a {@link MultiCandidateGeneratorWrapper} that combines a
 * generator on the {@code body} field with one on the {@code body_reverse} field.
 * <p>
 * Every line of {@code /config/names.txt} is indexed into three fields:
 * {@code body} (lower-cased standard tokens), {@code body_reverse} (the same tokens
 * passed through a {@link ReverseStringFilter}) and {@code body_ngram} (lower-cased
 * shingles of size 2 to 3 without unigrams). Scoring uses a {@link LaplaceScorer}
 * over {@code body_ngram}.
 *
 * @throws IOException if reading the fixture or accessing the index fails
 */
@Test
public void testMarvelHerosMultiGenerator() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = null;
    DirectoryReader ir = null;
    try {
        Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
        mapping.put("body_ngram", new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
                ShingleFilter tf = new ShingleFilter(t, 2, 3);
                tf.setOutputUnigrams(false);
                return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
            }

        });

        mapping.put("body", new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
                return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
            }

        });
        mapping.put("body_reverse", new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
                return new TokenStreamComponents(t,
                        new ReverseStringFilter(Version.LUCENE_41, new LowerCaseFilter(Version.LUCENE_41, t)));
            }

        });
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
                mapping);

        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
        writer = new IndexWriter(dir, conf);

        // Decode the fixture with an explicit charset so the indexed terms do not depend on
        // the platform default encoding. NOTE(review): names.txt is assumed to be UTF-8.
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"),
                java.nio.charset.Charset.forName("UTF-8")));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Document doc = new Document();
                doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
                doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
                doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
                writer.addDocument(doc);
            }
        } finally {
            // The fixture is fully consumed at this point; release the stream right away.
            reader.close();
        }

        ir = DirectoryReader.open(writer, false);
        LaplaceScorer wordScorer = new LaplaceScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
        DirectSpellChecker spellchecker = new DirectSpellChecker();
        spellchecker.setMinQueryLength(1);
        DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body",
                SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
        DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse",
                SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper);
        CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);

        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1,
                ir, "body", wordScorer, 1, 2);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

        generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
                wordScorer, 1, 2);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

        corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body",
                wordScorer, 1, 2);
        assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix

        corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body",
                wordScorer, 1, 2);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir,
                "body", wordScorer, 0, 2);
        assertThat(corrections.length, equalTo(4));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
        assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
        assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("tarr the god jewel"));

        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir,
                "body", wordScorer, 1.5f, 2);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
                "body", wordScorer, 1.5f, 2);
        assertThat(corrections.length, equalTo(1));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    } finally {
        // Release the index resources even when an assertion above fails.
        if (ir != null) {
            ir.close();
        }
        if (writer != null) {
            writer.close();
        }
        dir.close();
    }
}