Example usage for org.apache.lucene.analysis.synonym SolrSynonymParser SolrSynonymParser

List of usage examples for org.apache.lucene.analysis.synonym SolrSynonymParser SolrSynonymParser

Introduction

On this page you can find example usage for org.apache.lucene.analysis.synonym SolrSynonymParser SolrSynonymParser.

Prototype

public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) 

Source Link

Usage

From source file:brightsolid.solr.plugins.TestTargetPositionQuerySynonyms.java

License:Apache License

/**
 * Builds the test fixture: an in-memory index holding the three rotations of
 * "one two three", analyzed with a whitespace tokenizer followed by a synonym
 * filter whose rules map each number word to its alternates (uno/un, dos/too,
 * free/tres). Norms are omitted on the indexed field.
 */
@Override
public void setUp() throws Exception {
    super.setUp();

    String synonymRules = "one, uno, un\n" + "two, dos, too\n" + "three, free, tres";

    // dedup = true, expand = true; rule entries are analyzed by MockAnalyzer.
    SolrSynonymParser synonymParser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
    synonymParser.parse(new StringReader(synonymRules));
    final SynonymMap synonyms = synonymParser.build();

    Analyzer synonymAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
            return new TokenStreamComponents(source, new SynonymFilter(source, synonyms, false));
        }
    };

    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, synonymAnalyzer);

    FieldType fieldType = new FieldType(org.apache.lucene.document.TextField.TYPE_STORED);
    fieldType.setOmitNorms(true);
    Field bodyField = newField("field", "", fieldType);
    bodyField.fieldType().setOmitNorms(true);

    Document document = new Document();
    document.add(bodyField);

    // Index every rotation of the same three tokens; the field instance is
    // reused, so each addDocument call picks up the latest string value.
    for (String text : new String[] { "one two three", "two three one", "three one two" }) {
        bodyField.setStringValue(text);
        writer.addDocument(document);
    }

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
}

From source file:com.github.le11.nls.solr.TypeAwareSynonymFilterFactory.java

License:Apache License

/**
 * Load synonyms from the solr format, "format=solr".
 *///from  w  w w .  j  a v  a 2  s.  co  m
/**
 * Load synonyms from the Solr format, "format=solr".
 *
 * <p>The required "synonyms" init argument names either a single resource or a
 * comma-separated list of resources. Every resource must be valid UTF-8:
 * malformed or unmappable bytes are reported as errors rather than silently
 * replaced.
 *
 * @param loader   resource loader used to open the synonym resource(s)
 * @param dedup    whether duplicate synonym rules are removed
 * @param analyzer analyzer used to normalize rule entries while parsing
 * @return the built {@link SynonymMap}
 * @throws IOException    if a resource cannot be read
 * @throws ParseException if a synonym rule is malformed
 */
private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer)
        throws IOException, ParseException {
    final boolean expand = getBoolean("expand", true);
    String synonyms = args.get("synonyms");
    if (synonyms == null)
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Missing required argument 'synonyms'.");

    // Fail fast on broken UTF-8 instead of inserting replacement characters.
    CharsetDecoder decoder = java.nio.charset.StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer);
    File synonymFile = new File(synonyms);
    if (synonymFile.exists()) {
        decoder.reset();
        InputStreamReader in = new InputStreamReader(loader.openResource(synonyms), decoder);
        try {
            parser.add(in);
        } finally {
            in.close();
        }
    } else {
        // Not a plain file on disk: treat the argument as a comma-separated
        // list of resource names and feed each one to the parser.
        List<String> files = StrUtils.splitFileNames(synonyms);
        for (String file : files) {
            decoder.reset();
            InputStreamReader in = new InputStreamReader(loader.openResource(file), decoder);
            try {
                parser.add(in);
            } finally {
                in.close();
            }
        }
    }
    return parser.build();
}

From source file:de.berlinbuzzwords.FrenchSynonymAnalyzer.java

License:Apache License

/**
 * Parses Solr-format synonym rules from the given reader and caches the
 * resulting {@link SynonymMap} in {@code synonymMap}.
 */
private void initSynonyms(Reader synonyms) throws IOException, ParseException {
    // dedup = true, expand = true; rule entries are split on whitespace.
    final SolrSynonymParser ruleParser = new SolrSynonymParser(true, true, new WhitespaceAnalyzer(matchVersion));
    ruleParser.add(synonyms);
    synonymMap = ruleParser.build();
}

From source file:org.elasticsearch.index.analysis.SynonymTokenFilterFactory.java

License:Apache License

/**
 * Builds a {@link SynonymMap} for this filter from either an inline
 * {@code synonyms} word list or a {@code synonyms_path} file.
 *
 * <p>Honored settings: {@code ignore_case} (default false), {@code expand}
 * (default true), {@code tokenizer} (default "whitespace") and {@code format}
 * ("wordnet" selects the WordNet rule parser; anything else is parsed as
 * Solr-format rules).
 *
 * @throws ElasticsearchIllegalArgumentException if neither synonym source is
 *         configured, the tokenizer cannot be resolved, or the rules fail to
 *         parse
 */
@Inject
public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env,
        IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
        @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);

    Reader rulesReader = null;
    if (settings.getAsArray("synonyms", null) != null) {
        // Inline rules: re-join the configured lines into one rules document.
        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
        StringBuilder sb = new StringBuilder();
        for (String line : rules) {
            sb.append(line).append(System.getProperty("line.separator"));
        }
        rulesReader = new FastStringReader(sb.toString());
    } else if (settings.get("synonyms_path") != null) {
        rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
    } else {
        throw new ElasticsearchIllegalArgumentException(
                "synonym requires either `synonyms` or `synonyms_path` to be configured");
    }

    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    boolean expand = settings.getAsBoolean("expand", true);

    String tokenizerName = settings.get("tokenizer", "whitespace");

    // Prefer an index-local tokenizer factory, falling back to the global registry.
    TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
    if (tokenizerFactoryFactory == null) {
        tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
    }
    if (tokenizerFactoryFactory == null) {
        throw new ElasticsearchIllegalArgumentException(
                "failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
    }
    final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings);

    // Analyzer used to normalize the rule entries themselves while parsing;
    // mirrors the filter's tokenizer and ignore_case settings.
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = tokenizerFactory == null
                    ? new WhitespaceTokenizer(Lucene.ANALYZER_VERSION, reader)
                    : tokenizerFactory.create(reader);
            TokenStream stream = ignoreCase ? new LowerCaseFilter(Lucene.ANALYZER_VERSION, tokenizer)
                    : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };

    try {
        SynonymMap.Builder parser;

        if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
            parser = new WordnetSynonymParser(true, expand, analyzer);
            ((WordnetSynonymParser) parser).parse(rulesReader);
        } else {
            parser = new SolrSynonymParser(true, expand, analyzer);
            ((SolrSynonymParser) parser).parse(rulesReader);
        }

        synonymMap = parser.build();
    } catch (Exception e) {
        throw new ElasticsearchIllegalArgumentException("failed to build synonyms", e);
    } finally {
        // The rules reader may hold an open file handle (synonyms_path);
        // always release it once parsing has finished or failed.
        try {
            rulesReader.close();
        } catch (Exception ignored) {
            // best effort — the rules were already consumed (or parsing failed above)
        }
    }
}

From source file:org.elasticsearch.plugin.analysis.SynonymWithPayloadsTokenFilterFactory.java

License:Apache License

/**
 * Builds a payload-aware {@link SynonymMap} for this filter from either an
 * inline {@code synonyms} word list or a {@code synonyms_path} file.
 *
 * <p>Honored settings: {@code ignore_case} (default false), {@code expand}
 * (default true), {@code tokenizer} (default "whitespace") and {@code format}
 * ("wordnet" selects the WordNet rule parser; anything else is parsed as
 * Solr-format rules).
 *
 * @throws ElasticsearchIllegalArgumentException if neither synonym source is
 *         configured, the tokenizer cannot be resolved, or the rules fail to
 *         parse
 */
@Inject
public SynonymWithPayloadsTokenFilterFactory(Index index, @IndexSettings Settings indexSettings,
        Environment env, IndicesAnalysisService indicesAnalysisService,
        Map<String, TokenizerFactoryFactory> tokenizerFactories, @Assisted String name,
        @Assisted Settings settings) {
    super(index, indexSettings, name, settings);
    Reader rulesReader = null;
    if (settings.getAsArray("synonyms", null) != null) {
        // Inline rules: re-join the configured lines into one rules document.
        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
        StringBuilder sb = new StringBuilder();
        for (String line : rules) {
            sb.append(line).append(System.getProperty("line.separator"));
        }
        rulesReader = new FastStringReader(sb.toString());
    } else if (settings.get("synonyms_path") != null) {
        rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
    } else {
        throw new ElasticsearchIllegalArgumentException(
                "synonym requires either `synonyms` or `synonyms_path` to be configured");
    }
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    boolean expand = settings.getAsBoolean("expand", true);
    String tokenizerName = settings.get("tokenizer", "whitespace");
    // Prefer an index-local tokenizer factory, falling back to the global registry.
    TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
    if (tokenizerFactoryFactory == null) {
        tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
    }
    if (tokenizerFactoryFactory == null) {
        throw new ElasticsearchIllegalArgumentException(
                "failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
    }
    // NOTE(review): this passes indexSettings to create() while the sibling
    // SynonymTokenFilterFactory passes the filter-local settings — confirm
    // which is intended before unifying the two factories.
    final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, indexSettings);
    // Analyzer used to normalize the rule entries themselves while parsing.
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = tokenizerFactory == null
                    ? new WhitespaceTokenizer(Lucene.ANALYZER_VERSION, reader)
                    : tokenizerFactory.create(reader);
            TokenStream stream = ignoreCase ? new LowerCaseFilter(Lucene.ANALYZER_VERSION, tokenizer)
                    : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    try {
        SynonymMap.Builder parser;
        if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
            parser = new WordnetSynonymParser(true, expand, analyzer);
            ((WordnetSynonymParser) parser).parse(rulesReader);
        } else {
            parser = new SolrSynonymParser(true, expand, analyzer);
            ((SolrSynonymParser) parser).parse(rulesReader);
        }
        synonymMap = parser.build();
    } catch (Exception e) {
        throw new ElasticsearchIllegalArgumentException("failed to build synonyms", e);
    } finally {
        // The rules reader may hold an open file handle (synonyms_path);
        // always release it once parsing has finished or failed.
        try {
            rulesReader.close();
        } catch (Exception ignored) {
            // best effort — the rules were already consumed (or parsing failed above)
        }
    }
}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Exercises NoisyChannelSpellChecker end-to-end over the Marvel-hero name
 * fixture: a lower-cased 2-3 shingle field ("body_ngram") feeds the word-level
 * language model while a lower-cased unigram field ("body") feeds the
 * candidate generator. Covers basic corrections, confidence cutoffs,
 * highlighting corner cases, and a synonym-expanding query analyzer.
 */
@Test
public void testMarvelHeros() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    // Bigram/trigram shingles, lower-cased, no unigrams: the n-gram LM field.
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    // Plain lower-cased unigrams: the field spelling candidates are drawn from.
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    // Index one document per line of the UTF-8 names fixture.
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8));
    String line = null;
    while ((line = reader.readLine()) != null) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }

    DirectoryReader ir = DirectoryReader.open(writer, false);
    WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
            new BytesRef(" "), 0.5f);

    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    // With at least one error requested, "ame" is corrected to "ace" and the
    // changed token is highlighted.
    Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 1, 2);
    Correction[] corrections = result.corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
    assertThat(result.cutoffScore, greaterThan(0d));

    // With zero errors allowed, the input comes back unchanged and unhighlighted.
    result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 0, 1);
    corrections = result.corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
    assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));

    // Lower real-word likelihood: multi-token corrections ranked by the
    // bigram language model, changed tokens individually highlighted.
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorr</em> the <em>god</em> jewel"));
    assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(),
            equalTo("xor the <em>god</em> jewel"));
    assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorn</em> the <em>god</em> jewel"));
    assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorr</em> the got jewel"));

    // Same query with at least one error required: same ranking.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));

    // Test some of the highlighting corner cases: adjacent changed tokens
    // should be merged into one highlight span.
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir,
            "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorr the god</em> jewel"));
    assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(),
            equalTo("xor <em>the god</em> jewel"));
    assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(),
            equalTo("<em>xorn the god</em> jewel"));
    assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(),
            equalTo("xor teh <em>god</em> jewel"));

    // test synonyms

    // Query-side analyzer that expands "usa"/"ursa" into usa/america/american
    // so "captian usa" can reach "captain america".
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false,
                        new WhitespaceAnalyzer(Version.LUCENE_41));
                ((SolrSynonymParser) parser).parse(
                        new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                // parse/build declare checked exceptions; inside an anonymous
                // Analyzer they can only be rethrown unchecked
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };

    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));

    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
            10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));

    // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
            10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
}

From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Same Marvel-hero fixture as testMarvelHeros, but scoring with trigram-aware
 * word scorers (linear interpolation and stupid backoff) and gram size 3.
 * Also covers maxErrors thresholds, a very large maxErrors value, and the
 * synonym-expanding query analyzer.
 */
@Test
public void testMarvelHerosTrigram() throws IOException {

    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    // Bigram/trigram shingles, lower-cased, no unigrams: the n-gram LM field.
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    // Plain lower-cased unigrams: the field spelling candidates are drawn from.
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    // Index one document per line of the UTF-8 names fixture.
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8));
    String line = null;
    while ((line = reader.readLine()) != null) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }

    DirectoryReader ir = DirectoryReader.open(writer, false);
    // Trigram/bigram/unigram weights 0.5/0.4/0.1.
    WordScorer wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"),
            "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);

    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    // gramSize 3: "ame" is corrected to "ace".
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
            ir, "body", wordScorer, 1, 3).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    // gramSize 1 with at least one error required yields no acceptable correction.
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 1, 1).corrections;
    assertThat(corrections.length, equalTo(0));
    //        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));

    // Multi-token corrections ranked by the interpolated trigram model.
    wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 3).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    // Same query with at least one error required: same ranking.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 1, 3).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    // A huge maxErrors budget with numCorrections=1 still returns the single best.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
            "body", wordScorer, 100, 3).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

    // test synonyms

    // Query-side analyzer that expands "usa"/"ursa" into usa/america/american.
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false,
                        new WhitespaceAnalyzer(Version.LUCENE_41));
                ((SolrSynonymParser) parser).parse(
                        new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                // parse/build declare checked exceptions; inside an anonymous
                // Analyzer they can only be rethrown unchecked
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };

    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.95);
    wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
            new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
            wordScorer, 1, 3).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95,
            10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 3).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    // Stupid-backoff scoring (discount 0.4) for the same misspelled query.
    wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
            new BytesRef(" "), 0.4);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir,
            "body", wordScorer, 0, 3).corrections;
    assertThat(corrections.length, equalTo(2));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
}

From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

/**
 * Older variant of the Marvel-hero spell-checker test: same shingle/unigram
 * field setup, but built against an earlier API — LaplaceScorer and
 * DirectCandidateGenerator take no Terms argument, getCorrections returns the
 * Correction[] directly, and the synonym analyzer loads rules via add().
 */
@Test
public void testMarvelHeros() throws IOException {

    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    // Bigram/trigram shingles, lower-cased, no unigrams: the n-gram LM field.
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    // Plain lower-cased unigrams: the field spelling candidates are drawn from.
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    // Index one document per line of the names fixture.
    // NOTE(review): no explicit charset here — this InputStreamReader uses the
    // platform default, unlike the newer variant which passes Charsets.UTF_8.
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt")));
    String line = null;
    while ((line = reader.readLine()) != null) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }

    DirectoryReader ir = DirectoryReader.open(writer, false);
    WordScorer wordScorer = new LaplaceScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5f);

    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    // With at least one error requested, "ame" is corrected to "ace".
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
            ir, "body", wordScorer, 1, 2);
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    // With zero errors allowed, the input comes back unchanged.
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 0, 1);
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ame"));

    // Lower real-word likelihood: multi-token corrections ranked by the bigram LM.
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 2);
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    // Same query with at least one error required: same ranking.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 1, 2);
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    // test synonyms

    // Query-side analyzer that expands "usa"/"ursa" into usa/america/american
    // so "captian usa" can reach "captain america".
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false,
                        new WhitespaceAnalyzer(Version.LUCENE_41));
                ((SolrSynonymParser) parser)
                        .add(new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                // add/build declare checked exceptions; inside an anonymous
                // Analyzer they can only be rethrown unchecked
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };

    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2);
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
            10, null, analyzer);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 2);
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
}

From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java

License:Apache License

@Test
public void testMarvelHerosTrigram() throws IOException {

    // Build an in-memory index with two views of each line: the raw lowercased
    // body and a 2-3 shingle field used by the n-gram language-model scorers.
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false); // only bi/trigrams feed the word scorer
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf));
        }

    });

    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t));
        }

    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41),
            mapping);

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    // Read the fixture with an explicit charset and close it deterministically;
    // the original leaked the reader and decoded with the platform default.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"),
            java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Document doc = new Document();
            doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
            doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
            writer.addDocument(doc);
        }
    }

    // NRT reader: the writer must stay open while 'ir' is in use.
    DirectoryReader ir = DirectoryReader.open(writer, false);
    WordScorer wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4,
            0.1);

    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body",
            SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
            ir, "body", wordScorer, 1, 3);
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

    // With maxErrors 1 and numCandidates 1 no correction should survive.
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body",
            wordScorer, 1, 1);
    assertThat(corrections.length, equalTo(0));
    //        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));

    wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 0, 3);
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir,
            "body", wordScorer, 1, 3);
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));

    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
            "body", wordScorer, 100, 3);
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

    // test synonyms: "usa"/"ursa" expand to america/american at query time so
    // "captian usa" can be corrected to "captain america".

    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false,
                        new WhitespaceAnalyzer(Version.LUCENE_41));
                parser.add(new StringReader("usa => usa, america, american\nursa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };

    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.95);
    wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body",
            wordScorer, 1, 3);
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95,
            10, null, analyzer);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body",
            wordScorer, 1, 3);
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

    wordScorer = new StupidBackoffScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.4);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir,
            "body", wordScorer, 0, 3);
    assertThat(corrections.length, equalTo(2));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));

    // Release index resources (reader before its NRT writer, then the directory).
    ir.close();
    writer.close();
    dir.close();
}

From source file:pl.litwiniuk.rowicki.collocations.CollocationAnalyzer.java

License:Apache License

/**
 * Loads a synonym map from a Solr-format synonym file ("format=solr").
 * <p>
 * Entries are tokenized on whitespace and lowercased before being added to the
 * map; decoding is strict UTF-8 (malformed or unmappable bytes raise an error
 * instead of being silently replaced). A missing file yields an empty map.
 *
 * @return the synonym map built from {@code ./Parsers/thesaurus.txt}, empty if
 *         the file does not exist
 * @throws IOException    if the file cannot be read or decoded
 * @throws ParseException if the file is not valid Solr synonym syntax
 */
private SynonymMap loadSolrSynonyms() throws IOException, ParseException {
    final boolean dedup = true; // drop duplicate rules while parsing

    // Analyzer used by the parser to normalize rule tokens the same way the
    // query/index side does: whitespace-split, then lowercase.
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, reader);
            TokenStream stream = new LowerCaseFilter(Version.LUCENE_43, tokenizer);
            return new TokenStreamComponents(tokenizer, stream);
        }
    };

    // Strict decoder: REPORT turns bad input into an exception rather than '?'.
    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder().onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    SolrSynonymParser parser = new SolrSynonymParser(dedup, true, analyzer);
    File synonymFile = new File("./Parsers/thesaurus.txt");
    if (synonymFile.exists()) {
        decoder.reset();
        // try-with-resources: the original leaked this reader.
        try (Reader synonymReader = new InputStreamReader(new FileInputStream(synonymFile), decoder)) {
            parser.add(synonymReader);
        }
    }
    return parser.build();
}