List of usage examples for org.apache.lucene.analysis MockTokenFilter EMPTY_STOPSET
CharacterRunAutomaton EMPTY_STOPSET
To view the source code for org.apache.lucene.analysis MockTokenFilter EMPTY_STOPSET, click the Source Link below.
From source file:org.apache.solr.analysis.MockTokenFilterFactory.java
License:Apache License
/**
 * Creates a new MockTokenFilterFactory.
 *
 * <p>Exactly one of the {@code stopset} or {@code stopregex} parameters must be
 * supplied: {@code stopset} selects a canned automaton ("english" or "empty"),
 * while {@code stopregex} builds one from a regular expression.
 *
 * @param args factory configuration; consumed entries are removed, and any
 *             leftover keys cause an IllegalArgumentException
 * @throws IllegalArgumentException if both or neither stop parameter is given,
 *                                  or if unknown parameters remain
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    // "stopset" is validated against the allowed values; null means "not specified"
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (null != stopset) {
        // the two configuration styles are mutually exclusive
        if (null != stopregex) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        if ("english".equalsIgnoreCase(stopset)) {
            filter = MockTokenFilter.ENGLISH_STOPSET;
        } else { // must be "empty"
            filter = MockTokenFilter.EMPTY_STOPSET;
        }
    } else if (null != stopregex) {
        // compile the user-supplied regex into a run automaton for the filter
        RegExp regex = new RegExp(stopregex);
        filter = new CharacterRunAutomaton(regex.toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    // defaults to true when the parameter is absent
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    // any keys not consumed above are configuration errors
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
From source file:org.apache.solr.core.MockTokenFilterFactory.java
License:Apache License
/**
 * Creates a new MockTokenFilterFactory.
 *
 * <p>Exactly one of the {@code stopset} or {@code stopregex} parameters must be
 * supplied: {@code stopset} selects a canned automaton ("english" or "empty"),
 * while {@code stopregex} builds one from a regular expression.
 *
 * @param args factory configuration; consumed entries are removed, and any
 *             leftover keys cause an IllegalArgumentException
 * @throws IllegalArgumentException if both or neither stop parameter is given,
 *                                  or if unknown parameters remain
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    // "stopset" is validated against the allowed values; null means "not specified"
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (null != stopset) {
        // the two configuration styles are mutually exclusive
        if (null != stopregex) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        if ("english".equalsIgnoreCase(stopset)) {
            filter = MockTokenFilter.ENGLISH_STOPSET;
        } else { // must be "empty"
            filter = MockTokenFilter.EMPTY_STOPSET;
        }
    } else if (null != stopregex) {
        // compile the user-supplied regex into a run automaton for the filter
        RegExp regex = new RegExp(stopregex);
        filter = new CharacterRunAutomaton(regex.toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    // any keys not consumed above are configuration errors
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
@Test public void testBasicNoUnigrams() throws Exception { Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false); String s = "a b c d e f g"; TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s); tokenStream.reset();//from ww w. j a va2s.c om CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class); List<String> expected = Arrays.asList(new String[] { "a_b", "b_c", "c_d", "d_e", "e_f", "f_g", }); List<String> returned = new ArrayList<>(); while (tokenStream.incrementToken()) { String token = charTermAttribute.toString(); assertEquals(1, posIncAttribute.getPositionIncrement()); returned.add(token); } tokenStream.end(); tokenStream.close(); assertEquals(expected, returned); }
From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java
License:Apache License
/**
 * Unigram + bigram mode: the stream alternates unigram / bigram, with each
 * bigram stacked at the position of its leading unigram (position increment 0).
 */
@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList("a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f", "f_g",
            "g");
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);
    TokenStream stream = analyzer.tokenStream("f", "a b c d e f g");
    stream.reset();
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.getAttribute(PositionIncrementAttribute.class);
    List<String> actual = new ArrayList<>();
    int tokenIndex = 0;
    while (stream.incrementToken()) {
        // even slots are unigrams (advance one position); odd slots are
        // bigrams stacked on the previous position (increment of zero)
        int expectedIncrement = (tokenIndex++ % 2 == 0) ? 1 : 0;
        assertEquals(expectedIncrement, posIncAtt.getPositionIncrement());
        actual.add(termAtt.toString());
    }
    stream.end();
    stream.close();
    assertEquals(expected, actual);
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
/**
 * Basic end-to-end check: index two docs, search for "a", and verify the
 * collector cap, the total window count, and the pre/post context strings
 * under both PRE and POST sort orders.
 */
@Test
public void testSimple() throws Exception {
    String[] docs = new String[] { "a b c a b c", "c b a c b a" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
    // a capped collector stops at its limit even though 4 hits exist
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(3, collector.size());
    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    // test result size
    assertEquals(4, collector.size());
    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[] { "", "c b", "c b a c b", "a b c" };
    String[] posts = new String[] { " b c a b c", " c b a", "", " b c" };
    for (int i = 0; i < windows.size(); i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(pres[i], w.getPre());
        assertEquals(posts[i], w.getPost());
    }
    // test sort order post
    // sort key is built at search time, so must re-search
    wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);
    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    windows = collector.getSortedWindows();
    posts = new String[] { "", " b c", " b c a b c", " c b a", };
    for (int i = 0; i < windows.size(); i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
/**
 * Same content as the simple test, but indexed as one document with a
 * multivalued field; window counts and contexts should come out the same.
 */
@Test
public void testSimpleMultiValuedField() throws Exception {
    // one document whose field carries two values
    String[] doc = new String[] { "a b c a b c", "c b a c b a" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    // test result size
    assertEquals(4, collector.size());
    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[] { "", "c b", "c b a c b", "a b c" };
    String[] posts = new String[] { " b c a b c", " c b a", "", " b c" };
    for (int i = 0; i < pres.length; i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals("pres: " + i, pres[i], w.getPre());
        assertEquals("posts: " + i, posts[i], w.getPost());
    }
    // test sort order post
    // sort key is built at search time, so must re-search
    WindowBuilder wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);
    collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    windows = collector.getSortedWindows();
    posts = new String[] { "", " b c", " b c a b c", " c b a", };
    for (int i = 0; i < posts.length; i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test public void testWindowLengths() throws Exception { String[] doc = new String[] { "a b c d e f g" }; List<String[]> docs = new ArrayList<>(); docs.add(doc);//from w ww .j av a 2s.com Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); Directory directory = getDirectory(analyzer, docs); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); SpanQuery q = new SpanTermQuery(new Term(FIELD, "d")); String[] pres = { "", "c", "b c", "a b c", "a b c", "a b c" }; String[] posts = { "", " e", " e f", " e f g", " e f g", " e f g" }; for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) { for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) { WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter, analyzer.getOffsetGap(FIELD)); ConcordanceSearcher searcher = new ConcordanceSearcher(wb); ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100); searcher.search(indexSearcher, FIELD, q, null, analyzer, collector); ConcordanceWindow w = collector.getSortedWindows().get(0); assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre()); assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost()); } } reader.close(); directory.close(); }
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
/**
 * Exercises SpanNearQuery matches that straddle values of a multivalued
 * field: a hit within the configured slop, a hit across a large inter-value
 * position gap when the slop is large enough, and a miss when it is not.
 */
@Test
public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
    /*
     * test handling of target match (or not) over different indices into multivalued
     * field array
     */
    String[] doc = new String[] { "a b c a b the", "clockwork", "orange b a c b a" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    // small position gap between field values: the phrase can match across them
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q1 = new SpanTermQuery(new Term(FIELD, "the"));
    SpanQuery q2 = new SpanTermQuery(new Term(FIELD, "clockwork"));
    SpanQuery q3 = new SpanTermQuery(new Term(FIELD, "orange"));
    SpanQuery q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 3, true);
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(1, collector.size());
    ConcordanceWindow w = collector.getSortedWindows().iterator().next();
    // "|" marks the boundaries between field values inside the target
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());
    reader.close();
    directory.close();
    // test hit even over long inter-field gap
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);
    wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
    searcher = new ConcordanceSearcher(wb);
    // slop of 120 is wide enough to span the larger configured gaps
    q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 120, true);
    collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(1, collector.size());
    w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());
    reader.close();
    directory.close();
    // test miss
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);
    wb = new WindowBuilder();
    searcher = new ConcordanceSearcher(wb);
    // slop of 5 cannot bridge gaps of 100 positions, so no match is expected
    q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 5, true);
    collector = new ConcordanceWindowCollector(100);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(0, collector.size());
    reader.close();
    directory.close();
}
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test public void testBasicStandardQueryConversion() throws Exception { String[] docs = new String[] { "a b c a b c", "c b a c b a d e a", "c b a c b a e a b c a" }; Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); Directory directory = getDirectory(analyzer, docs); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); ConcordanceSearcher searcher = new ConcordanceSearcher( new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); BooleanQuery q = new BooleanQuery.Builder().add(new TermQuery(new Term(FIELD, "a")), Occur.MUST) .add(new TermQuery(new Term(FIELD, "d")), Occur.MUST_NOT).build(); ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); searcher.search(indexSearcher, FIELD, q, null, analyzer, collector); // shouldn't include document with "d" assertEquals(6, collector.size());/* w w w . j av a 2 s . co m*/ // should only include document with "e" and not "d" Query filter = new TermQuery(new Term(FIELD, "e")); collector = new ConcordanceWindowCollector(10); searcher.search(indexSearcher, FIELD, (Query) q, filter, analyzer, collector); assertEquals(4, collector.size()); reader.close(); directory.close(); }
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test public void testMismatchingFieldsInStandardQueryConversion() throws Exception { // tests what happens if a Query doesn't contain a term in the "span" field // in the searcher...should be no exception and zero documents returned. String[] docs = new String[] { "a b c a b c", }; Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); Directory directory = getDirectory(analyzer, docs); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); ConcordanceSearcher searcher = new ConcordanceSearcher( new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); Query q = new TermQuery(new Term("_" + FIELD, "a")); int windowCount = -1; ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); searcher.search(indexSearcher, FIELD, q, null, analyzer, collector); windowCount = collector.size();/*from ww w.ja va 2 s. c o m*/ assertEquals(0, windowCount); reader.close(); directory.close(); }