Example usage for org.apache.lucene.analysis MockTokenFilter ENGLISH_STOPSET

List of usage examples for org.apache.lucene.analysis MockTokenFilter ENGLISH_STOPSET

Introduction

In this page you can find the example usage for org.apache.lucene.analysis MockTokenFilter ENGLISH_STOPSET.

Prototype

CharacterRunAutomaton ENGLISH_STOPSET

You can view the source code for org.apache.lucene.analysis MockTokenFilter ENGLISH_STOPSET via the Source Link below.

Click Source Link

Document

Set of common English stopwords.

Usage

From source file:org.apache.solr.analysis.MockTokenFilterFactory.java

License:Apache License

/**
 * Creates a new MockTokenFilterFactory.
 *
 * <p>Exactly one of the {@code stopset} parameter (value {@code "english"} or
 * {@code "empty"}) or the {@code stopregex} parameter must be supplied;
 * supplying both, or neither, throws an {@link IllegalArgumentException}.
 *
 * @param args factory init parameters; recognized entries are removed as they
 *             are read, and any leftover entry is reported as unknown
 * @throws IllegalArgumentException on conflicting, missing, or unknown parameters
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    // NOTE(review): get(...) presumably removes consumed entries from args —
    // the final isEmpty() check below depends on that; confirm in the base class.
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (null != stopset) {
        if (null != stopregex) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        if ("english".equalsIgnoreCase(stopset)) {
            filter = MockTokenFilter.ENGLISH_STOPSET;
        } else { // must be "empty" — the allowed-values list above restricts stopset to these two
            filter = MockTokenFilter.EMPTY_STOPSET;
        }
    } else if (null != stopregex) {
        // Build a stopword matcher from the user-supplied regular expression.
        RegExp regex = new RegExp(stopregex);
        filter = new CharacterRunAutomaton(regex.toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    // Optional flag; defaults to true when absent.
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

From source file:org.apache.solr.core.MockTokenFilterFactory.java

License:Apache License

/**
 * Creates a new MockTokenFilterFactory.
 *
 * <p>Exactly one of the {@code stopset} parameter (value {@code "english"} or
 * {@code "empty"}) or the {@code stopregex} parameter must be supplied;
 * supplying both, or neither, throws an {@link IllegalArgumentException}.
 * Unlike the org.apache.solr.analysis variant, this version has no
 * {@code enablePositionIncrements} parameter.
 *
 * @param args factory init parameters; recognized entries are removed as they
 *             are read, and any leftover entry is reported as unknown
 * @throws IllegalArgumentException on conflicting, missing, or unknown parameters
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    // NOTE(review): get(...) presumably removes consumed entries from args —
    // the final isEmpty() check below depends on that; confirm in the base class.
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (null != stopset) {
        if (null != stopregex) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        if ("english".equalsIgnoreCase(stopset)) {
            filter = MockTokenFilter.ENGLISH_STOPSET;
        } else { // must be "empty" — the allowed-values list above restricts stopset to these two
            filter = MockTokenFilter.EMPTY_STOPSET;
        }
    } else if (null != stopregex) {
        // Build a stopword matcher from the user-supplied regular expression.
        RegExp regex = new RegExp(stopregex);
        filter = new CharacterRunAutomaton(regex.toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

From source file:org.apache.solr.search.TestSolrCoreParser.java

License:Apache License

/**
 * Lazily constructs and caches the SolrCoreParser under test, registering
 * the custom query-builder classes these tests exercise.
 *
 * @return the shared, initialized parser instance
 */
private CoreParser solrCoreParser() {
    if (solrCoreParser != null) {
        return solrCoreParser;
    }
    final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
            MockTokenFilter.ENGLISH_STOPSET);
    final SolrQueryRequest req = null;
    solrCoreParser = new SolrCoreParser("contents", analyzer, req);

    // Register each test query builder under the element name used in queries.
    final NamedList<String> builders = new NamedList<>();
    builders.add("HelloQuery", HelloQueryBuilder.class.getCanonicalName());
    builders.add("GoodbyeQuery", GoodbyeQueryBuilder.class.getCanonicalName());
    builders.add("HandyQuery", HandyQueryBuilder.class.getCanonicalName());
    builders.add("ApacheLuceneSolr", ApacheLuceneSolrNearQueryBuilder.class.getCanonicalName());
    builders.add("ChooseOneWord", ChooseOneWordQueryBuilder.class.getCanonicalName());
    solrCoreParser.init(builders);
    return solrCoreParser;
}

From source file:org.easynet.resource.queryparser.QueryParserTestBase.java

License:Apache License

/**
 * Verifies boost handling during parsing: explicit boosts survive on real
 * terms, and a boosted stopword parses to an empty query whose boost is the
 * default 1.0.
 */
public void testBoost() throws Exception {
    CharacterRunAutomaton onOnly = new CharacterRunAutomaton(Automata.makeString("on"));
    Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, onOnly);
    QueryParser parser = getParserConfig(oneStopAnalyzer);

    Query result = getQuery("on^1.0", parser);
    assertNotNull(result);

    result = getQuery("\"hello\"^2.0", parser);
    assertNotNull(result);
    assertEquals(result.getBoost(), (float) 2.0, (float) 0.5);

    result = getQuery("hello^2.0", parser);
    assertNotNull(result);
    assertEquals(result.getBoost(), (float) 2.0, (float) 0.5);

    result = getQuery("\"on\"^1.0", parser);
    assertNotNull(result);

    // With the full English stopset, a stopword-only query collapses entirely.
    Analyzer englishStops = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
            MockTokenFilter.ENGLISH_STOPSET);
    QueryParser stopParser = getParserConfig(englishStops);
    result = getQuery("the^3", stopParser);
    // "the" is a stop word so the result is an empty query:
    assertNotNull(result);
    assertEquals("", result.toString());
    assertEquals(1.0f, result.getBoost(), 0.01f);
}

From source file:org.easynet.resource.queryparser.QueryParserTestBase.java

License:Apache License

/**
 * With position increments enabled, stopwords removed by the analyzer leave
 * gaps: the surviving phrase terms must keep their original token positions
 * instead of being renumbered consecutively.
 */
public void testPositionIncrement() throws Exception {
    QueryParser parser = getParserConfig(
            new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
    parser.setEnablePositionIncrements(true);
    String phrase = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
    // Stopwords occupy positions 0, 2, 5, 7 and 8; the remaining terms keep theirs.
    int[] wantedPositions = { 1, 3, 4, 6, 9 };
    PhraseQuery parsed = (PhraseQuery) getQuery(phrase, parser);
    Term[] terms = parsed.getTerms();
    int[] positions = parsed.getPositions();
    for (int i = 0; i < terms.length; i++) {
        assertEquals("term " + i + " = " + terms[i] + " has wrong term-position!", wantedPositions[i],
                positions[i]);
    }
}

From source file:org.easynet.resource.queryparser.QueryParserTestBase.java

License:Apache License

/**
 * Stopwords dropped from a phrase appear as "?" placeholders in the parsed
 * query's toString when position increments are enabled.
 */
public void testPhraseQueryToString() throws Exception {
    Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
            MockTokenFilter.ENGLISH_STOPSET);
    QueryParser parser = getParserConfig(stopAnalyzer);
    parser.setEnablePositionIncrements(true);
    PhraseQuery parsed = (PhraseQuery) getQuery("\"this hi this is a test is\"", parser);
    assertEquals("field:\"? hi ? ? ? test\"", parsed.toString());
}

From source file:org.elasticsearch.action.admin.indices.TransportAnalyzeActionTests.java

License:Apache License

/**
 * Builds the analysis registry used by the tests: an index with a custom
 * analyzer ("standard" tokenizer + the "mock" filter registered below) and a
 * custom normalizer, plus an in-line AnalysisPlugin contributing a mock
 * stopword token filter and an appending char filter.
 *
 * @throws Exception if the superclass setup or registry construction fails
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();

    // Index-level analysis config: a custom analyzer wired to the "mock"
    // filter provided by the plugin below, and a lowercase normalizer.
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.custom_analyzer.filter", "mock")
            .put("index.analysis.normalizer.my_normalizer.type", "custom")
            .putList("index.analysis.normalizer.my_normalizer.filter", "lowercase").build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    environment = new Environment(settings);
    AnalysisPlugin plugin = new AnalysisPlugin() {
        // Token filter factory producing a MockTokenFilter with the English stopset.
        class MockFactory extends AbstractTokenFilterFactory {
            MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
            }
        }

        // Char filter factory that appends "bar" to the input stream.
        class AppendCharFilterFactory extends AbstractCharFilterFactory {
            AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name,
                    Settings settings) {
                super(indexSettings, name);
            }

            @Override
            public Reader create(Reader reader) {
                return new AppendCharFilter(reader, "bar");
            }
        }

        @Override
        public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
            return singletonMap("append", AppendCharFilterFactory::new);
        }

        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            return singletonMap("mock", MockFactory::new);
        }

        @Override
        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
            // Pre-configured variant that appends "foo" instead of "bar".
            return singletonList(PreConfiguredCharFilter.singleton("append_foo", false,
                    reader -> new AppendCharFilter(reader, "foo")));
        }
    };
    registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
    indexAnalyzers = registry.build(idxSettings);
}

From source file:org.tallison.lucene.queryparser.spans.TestOverallSpanQueryParser.java

License:Apache License

/**
 * Exercises SpanQueryParser behavior when the analyzer removes English
 * stopwords: boolean operators, minimum-should-match groups, and phrase slop
 * all have to cope with dropped "the"/"a" tokens.
 */
public void testStops() throws Exception {
    Analyzer stopsAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
            MockTokenFilter.ENGLISH_STOPSET);
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(stopsAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "ab the the cd the the the ef the gh", "ab cd", "ab the ef" };

    for (String content : docs) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, content, Field.Store.YES));
        writer.addDocument(doc);
    }
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    SpanQueryParser parser = new SpanQueryParser(FIELD1, stopsAnalyzer, MULTITERM_ANALYZER);
    // Boolean combinations involving stopwords.
    assertHits("-ab +the +cd", parser, searcher, 0);
    assertHits("+ab +the +cd", parser, searcher, 2);
    assertHits("+the", parser, searcher, 0);
    assertHits("ab AND CD", parser, searcher, 2);
    assertHits("ab AND the", parser, searcher, 3);
    assertHits("ab OR the", parser, searcher, 3);
    assertHits("(ab the cd)~2", parser, searcher, 2);
    assertHits("(ab the cd)~3", parser, searcher, 0);
    assertHits("ab AND (the OR cd)", parser, searcher, 2);
    assertHits("ab AND (the AND cd)", parser, searcher, 2);
    assertHits("cd OR (the OR ef)", parser, searcher, 3);
    assertHits("cd AND (the AND ef)", parser, searcher, 1);
    //do we want this behavior?
    assertHits("-the", parser, searcher, 0);

    // Phrases containing stopword "holes", with and without slop.
    assertHits("\"ab cd\"", parser, searcher, 1);
    assertHits("\"ab a a cd\"", parser, searcher, 2);
    assertHits("\"ab a cd\"~1", parser, searcher, 2);
    assertHits("\"ab a cd\"~>1", parser, searcher, 2);
    assertHits("\"cd a a ab\"", parser, searcher, 0);
    assertHits("\"cd a ab\"~1", parser, searcher, 2);

    reader.close();
    dir.close();
}

From source file:org.tallison.lucene.queryparser.spans.TestQPTestBaseSpanQuery.java

License:Apache License

/**
 * Checks that stopwords are dropped from a parsed span phrase query: only
 * the five non-stopword terms survive as clauses.
 */
@Override
public void testPositionIncrement() throws Exception {
    //For SQP, this only tests whether stop words have been dropped.
    //PositionIncrements are not available in SpanQueries yet.
    CommonQueryParserConfiguration qp = getParserConfig(
            new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
    //qp.setEnablePositionIncrements(true);
    String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
    //               0         2                      5           7  8
    SpanNearQuery pq = (SpanNearQuery) getQuery(qtxt, qp);
    SpanQuery[] clauses = pq.getClauses();
    assertEquals(clauses.length, 5);
    // NOTE(review): the `expected` set below is populated but never compared
    // against the parsed clauses — the final assertion appears to be missing.
    Set<Term> expected = new HashSet<Term>();
    expected.add(new Term("field", "words"));
    expected.add(new Term("field", "poisitions"));
    expected.add(new Term("field", "pos"));
    expected.add(new Term("field", "stopped"));
    expected.add(new Term("field", "phrasequery"));
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

/**
 * Verifies concordance windows around a hit when the analyzer removes
 * English stopwords: pre/post context text keeps the stopwords, but window
 * extents are measured in surviving token positions.
 */
@Test
public void testWithStops() throws Exception {
    String[] docs = new String[] { "a b the d e the f", "g h the d the j" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder windowBuilder = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));

    ConcordanceSearcher searcher = new ConcordanceSearcher(windowBuilder);
    SpanQuery query = new SpanTermQuery(new Term(FIELD, "d"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD, query, null, analyzer, collector);

    List<ConcordanceWindow> windows = collector.getSortedWindows();
    assertEquals(2, windows.size());

    // The second word after the target is a stop word, so the post-component
    // of the first window only reaches the first word after the target.
    assertEquals("b the", windows.get(0).getPre());
    assertEquals("d", windows.get(0).getTarget());
    assertEquals(" e", windows.get(0).getPost());

    assertEquals("h the", windows.get(1).getPre());
    assertEquals("d", windows.get(1).getTarget());
    assertEquals(" the j", windows.get(1).getPost());

    reader.close();
    directory.close();
}