org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java Source code

Introduction

Here is the source code for org.tallison.lucene.search.concordance.charoffsets.SpansCrawler.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.search.concordance.charoffsets;

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.Spans;

public class SpansCrawler {

    public static void crawl(SpanQuery query, Query filter, IndexSearcher searcher, DocTokenOffsetsVisitor visitor)
            throws IOException, TargetTokenNotFoundException {

        query = (SpanQuery) query.rewrite(searcher.getIndexReader());

        SpanWeight w = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
        if (filter == null) {
            for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {

                Spans spans = w.getSpans(ctx, SpanWeight.Postings.POSITIONS);
                if (spans == null) {
                    continue;
                }
                boolean cont = visitLeafReader(ctx, spans, visitor);
                if (!cont) {
                    break;
                }
            }
        } else {
            filter = searcher.rewrite(filter);
            Weight searcherWeight = searcher.createWeight(filter, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
            for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
                Scorer leafReaderContextScorer = searcherWeight.scorer(ctx);
                if (leafReaderContextScorer == null) {
                    continue;
                }
                //Can we tell from the scorer that there were no hits?
                //in <= 5.x we could stop here if the filter query had no hits.

                Spans spans = w.getSpans(ctx, SpanWeight.Postings.POSITIONS);
                if (spans == null) {
                    continue;
                }
                DocIdSetIterator filterItr = leafReaderContextScorer.iterator();

                if (filterItr == null || filterItr.equals(DocIdSetIterator.empty())) {
                    continue;
                }
                boolean cont = visitLeafReader(ctx, spans, filterItr, visitor);
                if (!cont) {
                    break;
                }
            }
        }
    }

    static boolean visitLeafReader(LeafReaderContext leafCtx, Spans spans, DocIdSetIterator filterItr,
            DocTokenOffsetsVisitor visitor) throws IOException, TargetTokenNotFoundException {
        int filterDoc = -1;
        int spansDoc = spans.nextDoc();
        while (true) {
            if (spansDoc == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            }
            filterDoc = filterItr.advance(spansDoc);
            if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) {
                break;
            } else if (filterDoc > spansDoc) {
                while (spansDoc <= filterDoc) {
                    spansDoc = spans.nextDoc();
                    if (spansDoc == filterDoc) {
                        boolean cont = visit(leafCtx, spans, visitor);
                        if (!cont) {
                            return false;
                        }

                    } else {
                        continue;
                    }
                }
            } else if (filterDoc == spansDoc) {
                boolean cont = visit(leafCtx, spans, visitor);
                if (!cont) {
                    return false;
                }
                //then iterate spans
                spansDoc = spans.nextDoc();
            } else if (filterDoc < spansDoc) {
                throw new IllegalArgumentException("FILTER doc is < spansdoc!!!");
            } else {
                throw new IllegalArgumentException("Something horrible happened");
            }
        }
        return true;
    }

    static boolean visitLeafReader(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
            throws IOException, TargetTokenNotFoundException {
        while (spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            boolean cont = visit(leafCtx, spans, visitor);
            if (!cont) {
                return false;
            }
        }
        return true;
    }

    static boolean visit(LeafReaderContext leafCtx, Spans spans, DocTokenOffsetsVisitor visitor)
            throws IOException, TargetTokenNotFoundException {
        Document document = leafCtx.reader().document(spans.docID(), visitor.getFields());
        DocTokenOffsets offsets = visitor.getDocTokenOffsets();
        offsets.reset(leafCtx.docBase, spans.docID(), document);
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
            offsets.addOffset(spans.startPosition(), spans.endPosition());
        }
        return visitor.visit(offsets);
    }

}