Example usage for org.apache.lucene.analysis MockTokenFilter EMPTY_STOPSET

List of usage examples for org.apache.lucene.analysis MockTokenFilter EMPTY_STOPSET

Introduction

In this page you can find the example usage for org.apache.lucene.analysis MockTokenFilter EMPTY_STOPSET.

Prototype

CharacterRunAutomaton EMPTY_STOPSET

To view the source code for org.apache.lucene.analysis MockTokenFilter EMPTY_STOPSET, click the Source Link below.

Click Source Link

Document

Empty set of stopwords

Usage

From source file:org.apache.solr.analysis.MockTokenFilterFactory.java

License:Apache License

/**
 * Creates a new MockTokenFilterFactory from the given init args.
 *
 * Exactly one of {@code stopset} ("english" or "empty") or {@code stopregex}
 * must be supplied; any leftover args are rejected.
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (stopset != null) {
        // the two parameters are mutually exclusive
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        // get() already restricted the value to "english" or "empty"
        filter = "english".equalsIgnoreCase(stopset) ? MockTokenFilter.ENGLISH_STOPSET
                : MockTokenFilter.EMPTY_STOPSET;
    } else if (stopregex != null) {
        // build the stopword automaton directly from the regular expression
        filter = new CharacterRunAutomaton(new RegExp(stopregex).toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

From source file:org.apache.solr.core.MockTokenFilterFactory.java

License:Apache License

/**
 * Creates a new MockTokenFilterFactory from the given init args.
 *
 * Exactly one of {@code stopset} ("english" or "empty") or {@code stopregex}
 * must be supplied; any leftover args are rejected.
 */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    String stopregex = get(args, "stopregex");
    if (stopset != null) {
        // the two parameters are mutually exclusive
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        // get() already restricted the value to "english" or "empty"
        filter = "english".equalsIgnoreCase(stopset) ? MockTokenFilter.ENGLISH_STOPSET
                : MockTokenFilter.EMPTY_STOPSET;
    } else if (stopregex != null) {
        // build the stopword automaton directly from the regular expression
        filter = new CharacterRunAutomaton(new RegExp(stopregex).toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

/**
 * Verifies that the bigram analyzer with unigrams disabled emits only the
 * joined bigram tokens, each advancing the position by exactly one.
 */
@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);

    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    // Arrays.asList is varargs; no need to wrap an explicit String[]
    List<String> expected = Arrays.asList("a_b", "b_c", "c_d", "d_e", "e_f", "f_g");

    List<String> returned = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        // with no unigrams, every bigram occupies its own position
        assertEquals(1, posIncAttribute.getPositionIncrement());
        returned.add(charTermAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

/**
 * Verifies that with unigrams enabled the analyzer interleaves unigrams and
 * bigrams: each unigram advances the position (increment 1) and the bigram
 * that follows it stacks on the same position (increment 0).
 */
@Test
public void testIncludeUnigrams() throws Exception {
    // Arrays.asList is varargs; no need to wrap an explicit String[]
    List<String> expected = Arrays.asList("a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f",
            "f_g", "g");
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);

    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream("f", s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    List<String> returned = new ArrayList<>();
    int i = 0;
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        // tokens alternate: even index = unigram (inc 1), odd index = bigram (inc 0)
        if (i++ % 2 == 0) {
            assertEquals(1, posIncAttribute.getPositionIncrement());
        } else {
            assertEquals(0, posIncAttribute.getPositionIncrement());
        }
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testSimple() throws Exception {
    // Two single-valued docs; search for "a", then verify window counts,
    // pre/post context strings, and both PRE and POST sort orders.
    String[] docs = new String[] { "a b c a b c", "c b a c b a" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);

    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    // 10 tokens of context on each side; sort key built from the PRE context
    WindowBuilder wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));

    // collector capped at 3: only 3 of the 4 total hits should be kept
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    assertEquals(3, collector.size());

    // uncapped collector sees every hit
    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    // test result size
    assertEquals(4, collector.size());

    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[] { "", "c b", "c b a c b", "a b c" };
    String[] posts = new String[] { " b c a b c", " c b a", "", " b c" };

    for (int i = 0; i < windows.size(); i++) {
        ConcordanceWindow w = windows.get(i);

        assertEquals(pres[i], w.getPre());
        assertEquals(posts[i], w.getPost());
    }

    // test sort order post
    // sort key is built at search time, so must re-search
    wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);

    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    windows = collector.getSortedWindows();

    // same windows, now ordered by their POST context
    posts = new String[] { "", " b c", " b c a b c", " c b a", };
    for (int i = 0; i < windows.size(); i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testSimpleMultiValuedField() throws Exception {
    // Same content as testSimple, but both values live in ONE document's
    // multivalued field; results should match the single-valued case.
    String[] doc = new String[] { "a b c a b c", "c b a c b a" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    // test result size
    assertEquals(4, collector.size());

    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[] { "", "c b", "c b a c b", "a b c" };
    String[] posts = new String[] { " b c a b c", " c b a", "", " b c" };

    for (int i = 0; i < pres.length; i++) {
        ConcordanceWindow w = windows.get(i);

        assertEquals("pres: " + i, pres[i], w.getPre());

        assertEquals("posts: " + i, posts[i], w.getPost());
    }

    // test sort order post
    // sort key is built at search time, so must re-search
    WindowBuilder wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);

    collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    windows = collector.getSortedWindows();

    // same windows, now ordered by their POST context
    posts = new String[] { "", " b c", " b c a b c", " c b a", };
    for (int i = 0; i < posts.length; i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

/**
 * Exercises every combination of pre/post window sizes (0-5 on each side)
 * for a hit on "d" and checks that the window text is truncated accordingly.
 */
@Test
public void testWindowLengths() throws Exception {
    String[] fieldValues = new String[] { "a b c d e f g" };
    List<String[]> docs = new ArrayList<>();
    docs.add(fieldValues);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));

    // expected context at each window size; only 3 tokens exist on each side,
    // so sizes > 3 plateau at the full context
    String[] pres = { "", "c", "b c", "a b c", "a b c", "a b c" };
    String[] posts = { "", " e", " e f", " e f g", " e f g", " e f g" };

    for (int before = 0; before < pres.length; before++) {
        for (int after = 0; after < posts.length; after++) {
            WindowBuilder wb = new WindowBuilder(before, after, analyzer.getOffsetGap(FIELD));
            ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
            ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
            searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
            ConcordanceWindow w = collector.getSortedWindows().get(0);
            assertEquals(before + " : " + after, pres[before], w.getPre());
            assertEquals(before + " : " + after, posts[after], w.getPost());
        }
    }

    reader.close();
    directory.close();

}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
    /*
     * test handling of target match (or not) over different indices into multivalued
     * field array
     */
    String[] doc = new String[] { "a b c a b the", "clockwork", "orange b a c b a" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    // phase 1: small position/offset gaps between field values
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));

    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    // the target phrase spans all three field values
    SpanQuery q1 = new SpanTermQuery(new Term(FIELD, "the"));
    SpanQuery q2 = new SpanTermQuery(new Term(FIELD, "clockwork"));
    SpanQuery q3 = new SpanTermQuery(new Term(FIELD, "orange"));
    SpanQuery q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 3, true);
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(1, collector.size());

    ConcordanceWindow w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());

    reader.close();
    directory.close();

    // test hit even over long inter-field gap
    // phase 2: bigger gaps, so the near-query slop must be widened to 120
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);

    wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));

    searcher = new ConcordanceSearcher(wb);
    q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 120, true);
    collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    assertEquals(1, collector.size());
    w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());

    reader.close();
    directory.close();
    // test miss
    // phase 3: gaps (100) far exceed the slop (5), so the phrase cannot match
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);

    wb = new WindowBuilder();
    searcher = new ConcordanceSearcher(wb);
    q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 5, true);
    collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    assertEquals(0, collector.size());

    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

/**
 * Verifies that a non-span BooleanQuery is converted for concordance search:
 * MUST_NOT terms exclude documents, and a separate filter query further
 * restricts which documents contribute windows.
 */
@Test
public void testBasicStandardQueryConversion() throws Exception {
    String[] docs = new String[] { "a b c a b c", "c b a c b a d e a", "c b a c b a e a b c a" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    // "a" required, documents containing "d" excluded
    BooleanQuery q = new BooleanQuery.Builder().add(new TermQuery(new Term(FIELD, "a")), Occur.MUST)
            .add(new TermQuery(new Term(FIELD, "d")), Occur.MUST_NOT).build();

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    // shouldn't include document with "d"
    assertEquals(6, collector.size());

    // should only include document with "e" and not "d"
    Query filter = new TermQuery(new Term(FIELD, "e"));
    collector = new ConcordanceWindowCollector(10);

    // no cast needed: BooleanQuery is already a Query
    searcher.search(indexSearcher, FIELD, q, filter, analyzer, collector);
    assertEquals(4, collector.size());

    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

/**
 * Verifies that a query on a field other than the concordance "span" field
 * produces zero windows and no exception.
 */
@Test
public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
    // tests what happens if a Query doesn't contain a term in the "span" field
    // in the searcher...should be no exception and zero documents returned.

    String[] docs = new String[] { "a b c a b c", };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));

    // query targets "_" + FIELD, a field that was never indexed
    Query q = new TermQuery(new Term("_" + FIELD, "a"));

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    // assign directly; the old "-1 then overwrite" dance was dead code
    int windowCount = collector.size();
    assertEquals(0, windowCount);
    reader.close();
    directory.close();
}