Example usage for org.apache.lucene.search DocIdSetIterator NO_MORE_DOCS

Introduction

On this page you can find example usage of org.apache.lucene.search.DocIdSetIterator#NO_MORE_DOCS.

Prototype

int NO_MORE_DOCS
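
For reference, NO_MORE_DOCS is a sentinel document id; in the Lucene sources it is declared as Integer.MAX_VALUE (shown below as it appears in recent releases, so double-check against the version you use):

public static final int NO_MORE_DOCS = Integer.MAX_VALUE;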

Document

When returned by #nextDoc(), #advance(int), and #docID(), it means there are no more docs in the iterator.
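
Because every DocIdSetIterator subtype (DocsEnum/PostingsEnum, Scorer, and so on) shares this sentinel, the typical consumption pattern is a loop that keeps advancing until NO_MORE_DOCS comes back, which is exactly what most of the examples below do. The snippet that follows is a minimal self-contained sketch rather than code from any of the listed projects; the class and method names are only for illustration:

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

public class NoMoreDocsSketch {
    /** Visits every document id produced by an arbitrary DocIdSetIterator. */
    static void consume(DocIdSetIterator it) throws IOException {
        int doc;
        while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // Process the current match; here we simply print its id.
            System.out.println("matched doc " + doc);
        }
        // Once the iterator is exhausted, docID() also keeps reporting NO_MORE_DOCS.
        assert it.docID() == DocIdSetIterator.NO_MORE_DOCS;
    }
}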

Usage

From source file: edu.cmu.lti.oaqa.annographix.solr.StructScorerVer3.java

License: Apache License

/**
 * Advance to the next doc after {@link #docID()}.
 * <p>
 * In Solr's ExactPhraseScorer, this function is highly optimized.
 * This is necessary, because the {@link #advance(int)} function is relatively costly,
 * while checking for an exact phrase occurrence can be implemented very
 * efficiently.
 * </p>
 * <p>  
 * In contrast, structured queries are expensive to evaluate. Thus, a clever 
 * optimization likely wouldn't make a lot of sense here. 
 * In fact, in a SloppyPhraseScorer (Solr 4.6) a simple advancement 
 * algorithm is used, which is not based on the galloping intersection.
 * </p>
 * <p>If an optimized implementation of the {@link #nextDoc()} function does
 * not make sense for the sloppy phrase checker, it should be even
 * less useful for the (mostly) more expensive structured query scorer.</p>
 */
@Override
public int nextDoc() throws IOException {
    if (mCurrDocId == DocIdSetIterator.NO_MORE_DOCS) {
        return mCurrDocId;
    }
    return advance(mCurrDocId + 1);
}

From source file: edu.cmu.lti.oaqa.annographix.solr.StructScorerVer3.java

License: Apache License

/**
 * Move to the first document with id &gt;= target.
 *
 * @param target      find a document at least this large.
 */
@Override
public int advance(int target) throws IOException {
    // first (least-costly, i.e., rarest) term
    int doc = mAllPostsSortedByCost[0].advance(target);

    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
        return mCurrDocId = doc;
    }

    while (true) {
        // second, etc terms 
        int i = 1;
        while (i < mAllPostsSortedByCost.length) {
            OnePostStateBase td = mAllPostsSortedByCost[i];
            int doc2 = td.getDocID();

            if (doc2 < doc) {
                doc2 = td.advance(doc);
            }

            if (doc2 > doc) {
                target = doc2;
                break;
            }
            i++;
        }

        if (i == mAllPostsSortedByCost.length) {
            /*
             *  found all query elements in a document, 
             *  let's compute a number of matches inside
             *  this document
             */
            mCurrDocId = doc;
            /*
             *  ... but first we need to read positional information  
             *      and payload data. 
             */
            for (OnePostStateBase st : mAllPostsSortedByCost)
                st.readDocElements();
            mNumMatches = computeFreq();

            if (mNumMatches != 0) {
                return mCurrDocId;
            }
        }

        if (target <= doc)
            doc = mAllPostsSortedByCost[0].nextDoc();
        else
            doc = mAllPostsSortedByCost[0].advance(target);

        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            return mCurrDocId = doc;
        }
    }
}

From source file: edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java

License: Open Source License

/**
 *
 * @param reader   the index reader whose segments are scanned
 * @param field    the field that contains the term
 * @param termText the term whose total frequency is computed
 * @return the total term frequency of termText in field, counting only live documents
 * @throws Exception
 */
public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText)
        throws Exception {
    long totalTF = 0L;
    for (final AtomicReaderContext ctx : reader.getTopReaderContext().leaves()) {
        AtomicReader r = ctx.reader();
        Bits liveDocs = r.getLiveDocs();
        if (liveDocs == null) {
            // TODO: we could do this up front, during the scan
            // (next()), instead of after-the-fact here w/ seek,
            // if the codec supports it and there are no del
            // docs...
            final long totTF = r.totalTermFreq(field, termText);
            if (totTF != -1) {
                totalTF += totTF;
                continue;
            } // otherwise we fall-through
        }
        // note: what should we do if field omits freqs? currently it counts as 1...
        DocsEnum de = r.termDocsEnum(liveDocs, field, termText);
        if (de != null) {
            while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                totalTF += de.freq();
            }
        }
    }

    return totalTF;
}

From source file: edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java

License: Open Source License

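/**
 * Sums the total frequency of termText in the given field across all index segments,
 * counting only live (non-deleted) documents; identical to the helper in
 * CorpusHighFreqTerms above.
 */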
public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText)
        throws Exception {
    long totalTF = 0L;
    for (final AtomicReaderContext ctx : reader.getTopReaderContext().leaves()) {
        AtomicReader r = ctx.reader();
        Bits liveDocs = r.getLiveDocs();
        if (liveDocs == null) {
            // TODO: we could do this up front, during the scan
            // (next()), instead of after-the-fact here w/ seek,
            // if the codec supports it and there are no del
            // docs...
            final long totTF = r.totalTermFreq(field, termText);
            if (totTF != -1) {
                totalTF += totTF;
                continue;
            } // otherwise we fall-through
        }
        // note: what should we do if field omits freqs? currently it counts as 1...
        DocsEnum de = r.termDocsEnum(liveDocs, field, termText);
        if (de != null) {
            while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                totalTF += de.freq();
            }
        }
    }
    return totalTF;
}

From source file: edu.upenn.library.solrplugins.ProofOfConceptPayloadHandler.java

License: Apache License

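/**
 * Builds a NamedList containing the overall count and, for every live document in the
 * postings, the payload string found at each of its positions.
 */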
private NamedList<Object> buildEntryValue(long count, PostingsEnum postings, Bits liveDocs) throws IOException {
    NamedList<Object> entry = new NamedList<>();
    entry.add("count", count);
    int i = -1;
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // liveDocs may be null when the segment has no deletions; in that case every doc is live.
        if (liveDocs != null && !liveDocs.get(postings.docID())) {
            continue;
        }
        i++;
        NamedList<Object> documentEntry = new NamedList<>();
        entry.add("doc" + i, documentEntry);
        for (int j = 0; j < postings.freq(); j++) {
            postings.nextPosition();
            String extra = postings.getPayload().utf8ToString();
            documentEntry.add("position" + j, extra);
        }
    }
    return entry;
}

From source file: experiments.collective.entdoccentric.LTR.ConjunctionTermScorer.java

License: Apache License

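/**
 * Conjunction ("leapfrog") step: starting from doc, advances every non-lead DocsEnum
 * until all of them sit on the same document, advancing the lead whenever one of them
 * overshoots. Returns that document, or NO_MORE_DOCS once the lead is exhausted.
 */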
private int doNext(int doc) throws IOException {
    do {
        if (lead.doc == DocIdSetIterator.NO_MORE_DOCS) {
            return NO_MORE_DOCS;
        }
        advanceHead: do {
            for (int i = 1; i < docsAndFreqs.length; i++) {
                if (docsAndFreqs[i].doc < doc) {
                    docsAndFreqs[i].doc = docsAndFreqs[i].docs.advance(doc);
                }
                if (docsAndFreqs[i].doc > doc) {
                    // DocsEnum beyond the current doc - break and advance
                    // lead
                    break advanceHead;
                }
            }
            // success - all DocsEnums are on the same doc
            return doc;
        } while (true);
        // advance head for next iteration
        doc = lead.doc = lead.docs.nextDoc();
    } while (true);
}

From source file: game.TermFreq.java

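/**
 * Loads the term vector of the document to guess and turns it into a length-normalized,
 * idf-weighted term frequency vector (tfvec), which is sorted at the end.
 */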
void loadTfVec() throws Exception {

    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);

    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);

            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}

From source file: indexer.Cell.java

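/** Collects a DocVector for every document indexed under the term that names this cell. */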
List<DocVector> getVectors(IndexReader reader, Terms terms, int numDimensions) throws Exception {
    List<DocVector> containedPoints = new ArrayList<>();

    TermsEnum termsEnum = terms.iterator();
    // seek to a specific term
    boolean found = termsEnum.seekExact(new BytesRef(this.toString()));

    if (found) {
        // enumerate through documents
        DocsEnum docsEnum = termsEnum.docs(null, null);
        int docid;
        while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            Document d = reader.document(docid);
            DocVector dvec = new DocVector(d, numDimensions, DocVector.numIntervals, null);
            containedPoints.add(dvec);
        }
    }

    return containedPoints;
}

From source file: indexer.Retriever.java

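/**
 * Computes the ClassicSimilarity term frequency of the given word in the document with
 * the given docID and returns it as a formatted report string.
 */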
private String getTF(IndexReader reader, int docID, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int postingsFreq = 0;
    float wordFreq = 0;

    Term term = new Term(documentField, word);
    BytesRef bytesRef = term.bytes();
    PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, documentField, bytesRef);
    int currentDocID;
    while ((currentDocID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (currentDocID == docID) {
            int _postingsFreq = docsEnum.freq();
            wordFreq += similarity.tf(_postingsFreq);
            postingsFreq += _postingsFreq;
        }
    }

    String printString = "\t" + word + ": TF = " + wordFreq + " (" + postingsFreq + " times in this document)";
    return printString;
}

From source file: info.boytsov.lucene.DumpIndex.java

License: Open Source License

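/**
 * Dumps the posting lists of frequent terms into a flat binary file, optionally
 * re-numbering documents in URL order first.
 */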
public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;

    String srcDirName = args[1];
    String dstFileName = args[2];

    int minTermFreq = MIN_TERM_FREQ;

    if (args.length >= 4)
        minTermFreq = Integer.parseInt(args[3]);

    int maxTermQty = MAX_TERM_QTY;

    if (args.length >= 5)
        maxTermQty = Integer.parseInt(args[4]);

    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];

        Arrays.fill(sortTable, -1);

        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");

            URL2DocID remap[] = new URL2DocID[docQty];

            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }

            Arrays.sort(remap);

            System.out.println("Collected and sorted all URLs for re-sorting, " + "filling out the sort table.");

            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
                //System.out.println(remap[newDocID].url);
            }

            System.out.println("Sort table is filled up!");

            for (int i = 0; i < docQty; ++i)
                remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory

            /*
             *  Paranoid check: did we change all the -1s to non-negative numbers?
             *  It turned out this wasn't that paranoid: you may have repeating URLs,
             *  in which case some elements of sortTable remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");

            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // Identity transformation
            }
        }

        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

        File dstFile = new File(dstFileName);

        FileOutputStream outData = new FileOutputStream(dstFile);

        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

        long totalWritten = 0;
        long totalInts = 0;

        int termId = 0;

        int batchWriteSize = 1024 * 1024 * 16;

        /*
         *  We are trying to re-use as many objects as possible,
         *  in order to reduce the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;

        ByteBuffer buffer = null;

        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();

            TermDesc ts = e.getKey();
            DocsEnum docIter = dict.getDocIterator(ts.text);

            int postQty = ts.freq;

            int qty = 0, prevDocID = -1;

            /*
             * If posting lists appear in order of descending term frequency,
             * this will actually be only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty)
                tmpDocId = new int[postQty];

            bufferArray.add(postQty);

            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= docIter.docID()) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }
            /*
             *  Now let's re-sort the docIds and write them.
             *  REMEMBER that tmpDocId is a buffer that may contain
             *  MORE than postQty elements! Some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);

            for (int i = 0; i < postQty; ++i)
                bufferArray.add(tmpDocId[i]);

            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;

            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty + " overall written: "
                        + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6
                        + " Millions postings");
            }

            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer, let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }

            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size: "
                + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6 + " Millions postings");

        // WriteArray may produce a new buffer, let's reuse it      
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}