Example usage for org.apache.lucene.analysis Analyzer getOffsetGap

List of usage examples for org.apache.lucene.analysis Analyzer getOffsetGap

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Analyzer getOffsetGap.

Prototype

public int getOffsetGap(String fieldName) 

Source Link

Document

Just like #getPositionIncrementGap , except for Token offsets instead.

Usage

From source file:com.liferay.portal.search.lucene.PerFieldAnalyzerWrapper.java

License:Open Source License

@Override
public int getOffsetGap(Fieldable field) {
    Analyzer analyzer = _getAnalyzer(field.name());

    return analyzer.getOffsetGap(field);
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testSimple() throws Exception {
    String[] docs = new String[] { "a b c a b c", "c b a c b a" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);

    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    WindowBuilder wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    assertEquals(3, collector.size());//from   ww  w . j  a  v  a2s.co  m

    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    // test result size
    assertEquals(4, collector.size());

    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[] { "", "c b", "c b a c b", "a b c" };
    String[] posts = new String[] { " b c a b c", " c b a", "", " b c" };

    for (int i = 0; i < windows.size(); i++) {
        ConcordanceWindow w = windows.get(i);

        assertEquals(pres[i], w.getPre());
        assertEquals(posts[i], w.getPost());
    }

    // test sort order post
    // sort key is built at search time, so must re-search
    wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);

    collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    windows = collector.getSortedWindows();

    posts = new String[] { "", " b c", " b c a b c", " c b a", };
    for (int i = 0; i < windows.size(); i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testSimpleMultiValuedField() throws Exception {
    String[] doc = new String[] { "a b c a b c", "c b a c b a" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);/*from w w  w.  j a  va2 s .  c  o m*/
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    // test result size
    assertEquals(4, collector.size());

    // test result with sort order = pre
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    String[] pres = new String[] { "", "c b", "c b a c b", "a b c" };
    String[] posts = new String[] { " b c a b c", " c b a", "", " b c" };

    for (int i = 0; i < pres.length; i++) {
        ConcordanceWindow w = windows.get(i);

        assertEquals("pres: " + i, pres[i], w.getPre());

        assertEquals("posts: " + i, posts[i], w.getPost());
    }

    // test sort order post
    // sort key is built at search time, so must re-search
    WindowBuilder wb = new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD),
            new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
    searcher = new ConcordanceSearcher(wb);

    collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    windows = collector.getSortedWindows();

    posts = new String[] { "", " b c", " b c a b c", " c b a", };
    for (int i = 0; i < posts.length; i++) {
        ConcordanceWindow w = windows.get(i);
        assertEquals(posts[i], w.getPost());
    }
    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testWindowLengths() throws Exception {
    String[] doc = new String[] { "a b c d e f g" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);// w  ww.j ava2 s .  com
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));

    String[] pres = { "", "c", "b c", "a b c", "a b c", "a b c" };
    String[] posts = { "", " e", " e f", " e f g", " e f g", " e f g" };

    for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) {
        for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) {
            WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter, analyzer.getOffsetGap(FIELD));
            ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
            ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
            searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
            ConcordanceWindow w = collector.getSortedWindows().get(0);
            assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre());
            assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost());
        }
    }

    reader.close();
    directory.close();

}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
    /*//from   ww  w  .  j ava  2 s.  co m
     * test handling of target match (or not) over different indices into multivalued
     * field array
     */
    String[] doc = new String[] { "a b c a b the", "clockwork", "orange b a c b a" };
    List<String[]> docs = new ArrayList<>();
    docs.add(doc);
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));

    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q1 = new SpanTermQuery(new Term(FIELD, "the"));
    SpanQuery q2 = new SpanTermQuery(new Term(FIELD, "clockwork"));
    SpanQuery q3 = new SpanTermQuery(new Term(FIELD, "orange"));
    SpanQuery q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 3, true);
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    assertEquals(1, collector.size());

    ConcordanceWindow w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());

    reader.close();
    directory.close();

    // test hit even over long inter-field gap
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);

    wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));

    searcher = new ConcordanceSearcher(wb);
    q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 120, true);
    collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    assertEquals(1, collector.size());
    w = collector.getSortedWindows().iterator().next();
    assertEquals("target", "the | clockwork | orange", w.getTarget());
    assertEquals("pre", "c a b", w.getPre());
    assertEquals("post", " b a c", w.getPost());

    reader.close();
    directory.close();
    // test miss
    analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
    directory = getDirectory(analyzer, docs);
    reader = DirectoryReader.open(directory);
    indexSearcher = new IndexSearcher(reader);

    wb = new WindowBuilder();
    searcher = new ConcordanceSearcher(wb);
    q = new SpanNearQuery(new SpanQuery[] { q1, q2, q3 }, 5, true);
    collector = new ConcordanceWindowCollector(100);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);

    assertEquals(0, collector.size());

    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testWithStops() throws Exception {
    String[] docs = new String[] { "a b the d e the f", "g h the d the j" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));

    ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    List<ConcordanceWindow> windows = collector.getSortedWindows();
    assertEquals(2, windows.size());//from  ww  w  .  j a  v a  2  s  . com

    // the second word after the target is a stop word
    // this post-component of this window should only go to the first word after
    // the target
    assertEquals("b the", windows.get(0).getPre());
    assertEquals("d", windows.get(0).getTarget());
    assertEquals(" e", windows.get(0).getPost());

    assertEquals("h the", windows.get(1).getPre());
    assertEquals("d", windows.get(1).getTarget());
    assertEquals(" the j", windows.get(1).getPost());

    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testBasicStandardQueryConversion() throws Exception {
    String[] docs = new String[] { "a b c a b c", "c b a c b a d e a", "c b a c b a e a b c a" };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    BooleanQuery q = new BooleanQuery.Builder().add(new TermQuery(new Term(FIELD, "a")), Occur.MUST)
            .add(new TermQuery(new Term(FIELD, "d")), Occur.MUST_NOT).build();

    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    // shouldn't include document with "d"
    assertEquals(6, collector.size());/* w w  w  .j  a v a 2  s.  c o m*/

    // should only include document with "e" and not "d"
    Query filter = new TermQuery(new Term(FIELD, "e"));
    collector = new ConcordanceWindowCollector(10);

    searcher.search(indexSearcher, FIELD, (Query) q, filter, analyzer, collector);
    assertEquals(4, collector.size());

    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
    // tests what happens if a Query doesn't contain a term in the "span" field
    // in the searcher...should be no exception and zero documents returned.

    String[] docs = new String[] { "a b c a b c", };
    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));

    Query q = new TermQuery(new Term("_" + FIELD, "a"));

    int windowCount = -1;
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);

    searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
    windowCount = collector.size();//from w w w  .  j  a  v  a2 s  . c  om
    assertEquals(0, windowCount);
    reader.close();
    directory.close();
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testUniqueCollector() throws Exception {
    String[] docs = new String[] { "a b c d c b a", "a B C d c b a", "a b C d C B a", "a b c d C B A",
            "e f g d g f e", "h i j d j i h" };

    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));

    DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2);
    searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
    assertEquals(2, collector.size());/*from  ww  w .  jav a 2s .c  om*/

    collector = new DedupingConcordanceWindowCollector(AbstractConcordanceWindowCollector.COLLECT_ALL);
    searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
    assertEquals(3, collector.size());

    reader.close();
    directory.close();

}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testUniqueCollectorWithSameWindowOverflow() throws Exception {
    String[] docs = new String[] { "a b c d c b a", "a b c d c b a", "a b c d c b a", "a b c d c b a",
            "e f g d g f e", "h i j d j i h" };

    Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));

    SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));

    DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3);
    searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
    assertEquals(3, collector.size());/* w w  w . j ava2 s .  c  om*/
    assertEquals(4, collector.getSortedWindows().get(0).getCount());
    reader.close();
    directory.close();
}