List of usage examples for org.apache.lucene.analysis.MockTokenFilter.ENGLISH_STOPSET
CharacterRunAutomaton ENGLISH_STOPSET
Click the Source Link below to view the source code for org.apache.lucene.analysis.MockTokenFilter.ENGLISH_STOPSET.
From source file:org.apache.solr.analysis.MockTokenFilterFactory.java
License:Apache License
/**
 * Creates a new MockTokenFilterFactory.
 *
 * Exactly one of the "stopset" ("english" or "empty") or "stopregex"
 * parameters must be supplied; any leftover parameters are rejected.
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (stopset != null) {
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        if ("english".equalsIgnoreCase(stopset)) {
            filter = MockTokenFilter.ENGLISH_STOPSET;
        } else {
            // The allowed-values list above restricts stopset to "english" or "empty",
            // so this branch can only be "empty".
            filter = MockTokenFilter.EMPTY_STOPSET;
        }
    } else if (stopregex != null) {
        // Build a run automaton that matches tokens to be stopped.
        filter = new CharacterRunAutomaton(new RegExp(stopregex).toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
From source file:org.apache.solr.core.MockTokenFilterFactory.java
License:Apache License
/**
 * Creates a new MockTokenFilterFactory.
 *
 * Requires exactly one of "stopset" ("english" or "empty") or "stopregex";
 * throws IllegalArgumentException on any other combination or on unknown
 * leftover parameters.
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (stopset == null && stopregex == null) {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    if (stopset != null) {
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        // stopset is restricted to "english" or "empty" by the allowed-values list.
        filter = "english".equalsIgnoreCase(stopset)
                ? MockTokenFilter.ENGLISH_STOPSET
                : MockTokenFilter.EMPTY_STOPSET;
    } else {
        RegExp regex = new RegExp(stopregex);
        filter = new CharacterRunAutomaton(regex.toAutomaton());
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
From source file:org.apache.solr.search.TestSolrCoreParser.java
License:Apache License
/**
 * Lazily builds and caches the SolrCoreParser under test, registering the
 * test query-builder classes by name before first use.
 */
private CoreParser solrCoreParser() {
    if (solrCoreParser == null) {
        final String defaultField = "contents";
        final Analyzer analyzer =
                new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
        final SolrQueryRequest req = null;
        solrCoreParser = new SolrCoreParser(defaultField, analyzer, req);
        // Register the query builders this test suite exercises.
        final NamedList<String> builders = new NamedList<>();
        builders.add("HelloQuery", HelloQueryBuilder.class.getCanonicalName());
        builders.add("GoodbyeQuery", GoodbyeQueryBuilder.class.getCanonicalName());
        builders.add("HandyQuery", HandyQueryBuilder.class.getCanonicalName());
        builders.add("ApacheLuceneSolr", ApacheLuceneSolrNearQueryBuilder.class.getCanonicalName());
        builders.add("ChooseOneWord", ChooseOneWordQueryBuilder.class.getCanonicalName());
        solrCoreParser.init(builders);
    }
    return solrCoreParser;
}
From source file:org.easynet.resource.queryparser.QueryParserTestBase.java
License:Apache License
public void testBoost() throws Exception { CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on")); Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); QueryParser qp = getParserConfig(oneStopAnalyzer); Query q = getQuery("on^1.0", qp); assertNotNull(q);/* w ww.j a v a2 s .co m*/ q = getQuery("\"hello\"^2.0", qp); assertNotNull(q); assertEquals(q.getBoost(), (float) 2.0, (float) 0.5); q = getQuery("hello^2.0", qp); assertNotNull(q); assertEquals(q.getBoost(), (float) 2.0, (float) 0.5); q = getQuery("\"on\"^1.0", qp); assertNotNull(q); Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); QueryParser qp2 = getParserConfig(a2); q = getQuery("the^3", qp2); // "the" is a stop word so the result is an empty query: assertNotNull(q); assertEquals("", q.toString()); assertEquals(1.0f, q.getBoost(), 0.01f); }
From source file:org.easynet.resource.queryparser.QueryParserTestBase.java
License:Apache License
public void testPositionIncrement() throws Exception { QueryParser qp = getParserConfig(//from w ww . j a v a 2s . co m new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); qp.setEnablePositionIncrements(true); String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; // 0 2 5 7 8 int expectedPositions[] = { 1, 3, 4, 6, 9 }; PhraseQuery pq = (PhraseQuery) getQuery(qtxt, qp); // System.out.println("Query text: "+qtxt); // System.out.println("Result: "+pq); Term t[] = pq.getTerms(); int pos[] = pq.getPositions(); for (int i = 0; i < t.length; i++) { // System.out.println(i+". "+t[i]+" pos: "+pos[i]); assertEquals("term " + i + " = " + t[i] + " has wrong term-position!", expectedPositions[i], pos[i]); } }
From source file:org.easynet.resource.queryparser.QueryParserTestBase.java
License:Apache License
/**
 * Stop words removed from a phrase render as '?' placeholders in the
 * query's toString() when position increments are enabled.
 */
public void testPhraseQueryToString() throws Exception {
    Analyzer analyzer =
            new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    QueryParser parser = getParserConfig(analyzer);
    parser.setEnablePositionIncrements(true);
    PhraseQuery query = (PhraseQuery) getQuery("\"this hi this is a test is\"", parser);
    assertEquals("field:\"? hi ? ? ? test\"", query.toString());
}
From source file:org.elasticsearch.action.admin.indices.TransportAnalyzeActionTests.java
License:Apache License
@Override public void setUp() throws Exception { super.setUp(); Settings settings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()) .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard") .put("index.analysis.analyzer.custom_analyzer.filter", "mock") .put("index.analysis.normalizer.my_normalizer.type", "custom") .putList("index.analysis.normalizer.my_normalizer.filter", "lowercase").build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); environment = new Environment(settings); AnalysisPlugin plugin = new AnalysisPlugin() { class MockFactory extends AbstractTokenFilterFactory { MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); }//from w w w . 
j a v a 2s .c om @Override public TokenStream create(TokenStream tokenStream) { return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET); } } class AppendCharFilterFactory extends AbstractCharFilterFactory { AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name); } @Override public Reader create(Reader reader) { return new AppendCharFilter(reader, "bar"); } } @Override public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() { return singletonMap("append", AppendCharFilterFactory::new); } @Override public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { return singletonMap("mock", MockFactory::new); } @Override public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() { return singletonList(PreConfiguredCharFilter.singleton("append_foo", false, reader -> new AppendCharFilter(reader, "foo"))); } }; registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); indexAnalyzers = registry.build(idxSettings); }
From source file:org.tallison.lucene.queryparser.spans.TestOverallSpanQueryParser.java
License:Apache License
public void testStops() throws Exception { Analyzer stopsAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); Directory dir = newDirectory();//from w w w . j av a2s .co m RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(stopsAnalyzer) .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); String[] docs = new String[] { "ab the the cd the the the ef the gh", "ab cd", "ab the ef" }; for (int i = 0; i < docs.length; i++) { Document doc = new Document(); doc.add(newTextField(FIELD1, docs[i], Field.Store.YES)); w.addDocument(doc); } IndexReader r = w.getReader(); IndexSearcher s = newSearcher(r); w.close(); SpanQueryParser p = new SpanQueryParser(FIELD1, stopsAnalyzer, MULTITERM_ANALYZER); assertHits("-ab +the +cd", p, s, 0); assertHits("+ab +the +cd", p, s, 2); assertHits("+the", p, s, 0); assertHits("ab AND CD", p, s, 2); assertHits("ab AND the", p, s, 3); assertHits("ab OR the", p, s, 3); assertHits("(ab the cd)~2", p, s, 2); assertHits("(ab the cd)~3", p, s, 0); assertHits("ab AND (the OR cd)", p, s, 2); assertHits("ab AND (the AND cd)", p, s, 2); assertHits("cd OR (the OR ef)", p, s, 3); assertHits("cd AND (the AND ef)", p, s, 1); //do we want this behavior? assertHits("-the", p, s, 0); assertHits("\"ab cd\"", p, s, 1); assertHits("\"ab a a cd\"", p, s, 2); assertHits("\"ab a cd\"~1", p, s, 2); assertHits("\"ab a cd\"~>1", p, s, 2); assertHits("\"cd a a ab\"", p, s, 0); assertHits("\"cd a ab\"~1", p, s, 2); r.close(); dir.close(); }
From source file:org.tallison.lucene.queryparser.spans.TestQPTestBaseSpanQuery.java
License:Apache License
@Override public void testPositionIncrement() throws Exception { //For SQP, this only tests whether stop words have been dropped. //PositionIncrements are not available in SpanQueries yet. CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); //qp.setEnablePositionIncrements(true); String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; // 0 2 5 7 8 SpanNearQuery pq = (SpanNearQuery) getQuery(qtxt, qp); SpanQuery[] clauses = pq.getClauses(); assertEquals(clauses.length, 5);/* w ww . j av a 2 s . c om*/ Set<Term> expected = new HashSet<Term>(); expected.add(new Term("field", "words")); expected.add(new Term("field", "poisitions")); expected.add(new Term("field", "pos")); expected.add(new Term("field", "stopped")); expected.add(new Term("field", "phrasequery")); }
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test public void testWithStops() throws Exception { String[] docs = new String[] { "a b the d e the f", "g h the d the j" }; Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET); Directory directory = getDirectory(analyzer, docs); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)); ConcordanceSearcher searcher = new ConcordanceSearcher(wb); SpanQuery q = new SpanTermQuery(new Term(FIELD, "d")); ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3); searcher.search(indexSearcher, FIELD, q, null, analyzer, collector); List<ConcordanceWindow> windows = collector.getSortedWindows(); assertEquals(2, windows.size());/* w ww. ja v a 2 s .co m*/ // the second word after the target is a stop word // this post-component of this window should only go to the first word after // the target assertEquals("b the", windows.get(0).getPre()); assertEquals("d", windows.get(0).getTarget()); assertEquals(" e", windows.get(0).getPost()); assertEquals("h the", windows.get(1).getPre()); assertEquals("d", windows.get(1).getTarget()); assertEquals(" the j", windows.get(1).getPost()); reader.close(); directory.close(); }