List of usage examples for org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS
int NO_MORE_DOCS
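All of the examples below follow the same idiom: a DocIdSetIterator (or a subclass such as DocsEnum or PostingsEnum) is stepped forward with nextDoc() or advance(int) until it returns the sentinel NO_MORE_DOCS, which in Lucene is defined as Integer.MAX_VALUE. A minimal sketch of that loop is shown here for orientation; the class and method names are illustrative, not taken from any of the source files below.

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;

public final class NoMoreDocsExample {
    /**
     * Counts the documents an iterator visits. The loop terminates when
     * nextDoc() returns DocIdSetIterator.NO_MORE_DOCS (Integer.MAX_VALUE),
     * which every exhausted DocIdSetIterator (DocsEnum, PostingsEnum, ...)
     * returns from then on.
     */
    static int countDocs(DocIdSetIterator it) throws IOException {
        int count = 0;
        while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            count++; // it.docID() holds the current document id here
        }
        return count;
    }
}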
From source file:edu.cmu.lti.oaqa.annographix.solr.StructScorerVer3.java
License:Apache License
/**
 * Advance to the next doc after {@link #docID()}.
 * <p>
 * In the ExactPhraseScorer in Solr, this function is highly optimized.
 * It is necessary, b/c the {@link #advance(int)} function is relatively costly
 * while checking for an exact phrase occurrence can be implemented very
 * efficiently.
 * </p>
 * <p>
 * In contrast, structured queries are expensive to evaluate. Thus, a clever
 * optimization likely wouldn't make a lot of sense here.
 * In fact, in a SloppyPhraseScorer (Solr 4.6) a simple advancement
 * algorithm is used, which is not based on the galloping intersection.
 * </p>
 * <p>If an optimized implementation of the {@link #nextDoc()} function does
 * not make sense for the sloppy phrase checker, it should be even
 * less useful for the (mostly) more expensive structured query scorer.
 */
@Override
public int nextDoc() throws IOException {
    if (mCurrDocId == DocIdSetIterator.NO_MORE_DOCS) {
        return mCurrDocId;
    }
    return advance(mCurrDocId + 1);
}
From source file:edu.cmu.lti.oaqa.annographix.solr.StructScorerVer3.java
License:Apache License
/**
 * Move to the first document with id >= target.
 *
 * @param target find a document at least this large.
 */
@Override
public int advance(int target) throws IOException {
    // first (least-costly, i.e., rarest) term
    int doc = mAllPostsSortedByCost[0].advance(target);
    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
        return mCurrDocId = doc;
    }
    while (true) {
        // second, etc terms
        int i = 1;
        while (i < mAllPostsSortedByCost.length) {
            OnePostStateBase td = mAllPostsSortedByCost[i];
            int doc2 = td.getDocID();
            if (doc2 < doc) {
                doc2 = td.advance(doc);
            }
            if (doc2 > doc) {
                target = doc2;
                break;
            }
            i++;
        }
        if (i == mAllPostsSortedByCost.length) {
            /*
             * found all query elements in a document,
             * let's compute a number of matches inside
             * this document
             */
            mCurrDocId = doc;
            /*
             * ... but first we need to read positional information
             * and payload data.
             */
            for (OnePostStateBase st : mAllPostsSortedByCost)
                st.readDocElements();
            mNumMatches = computeFreq();
            if (mNumMatches != 0) {
                return mCurrDocId;
            }
        }
        if (target <= doc)
            doc = mAllPostsSortedByCost[0].nextDoc();
        else
            doc = mAllPostsSortedByCost[0].advance(target);
        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            return mCurrDocId = doc;
        }
    }
}
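The method above relies on the DocIdSetIterator.advance(target) contract: it positions the iterator on the first document with id >= target, or returns NO_MORE_DOCS if no such document exists. That is what makes the leapfrogging intersection over mAllPostsSortedByCost terminate. As a hedged, self-contained sketch of the same pattern for just two iterators (the class, method, and parameter names here are illustrative and not part of the example above):

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;

final class TwoWayConjunction {
    /**
     * Leapfrogs two iterators to their next common document with id >= target.
     * Returns that doc id, or DocIdSetIterator.NO_MORE_DOCS when no common
     * document remains. Only the lagging iterator is advanced, and always to a
     * target beyond its current doc, which respects the advance() contract.
     */
    static int nextCommonDoc(DocIdSetIterator a, DocIdSetIterator b, int target) throws IOException {
        int docA = a.advance(target);
        int docB = b.advance(target);
        while (docA != docB) {
            if (docA == DocIdSetIterator.NO_MORE_DOCS || docB == DocIdSetIterator.NO_MORE_DOCS) {
                return DocIdSetIterator.NO_MORE_DOCS; // one list is exhausted
            }
            if (docA < docB) {
                docA = a.advance(docB); // catch a up to at least b's doc
            } else {
                docB = b.advance(docA); // catch b up to at least a's doc
            }
        }
        return docA; // both iterators agree (NO_MORE_DOCS only if both are exhausted)
    }
}

Because NO_MORE_DOCS is Integer.MAX_VALUE, the docA < docB comparison still picks the lagging side when one list runs out, but the early return avoids advancing an already exhausted iterator.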
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java
License:Open Source License
/**
 * @param reader
 * @param field
 * @param termText
 * @return
 * @throws Exception
 */
public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText)
        throws Exception {
    long totalTF = 0L;
    for (final AtomicReaderContext ctx : reader.getTopReaderContext().leaves()) {
        AtomicReader r = ctx.reader();
        Bits liveDocs = r.getLiveDocs();
        if (liveDocs == null) {
            // TODO: we could do this up front, during the scan
            // (next()), instead of after-the-fact here w/ seek,
            // if the codec supports it and there are no del
            // docs...
            final long totTF = r.totalTermFreq(field, termText);
            if (totTF != -1) {
                totalTF += totTF;
                continue;
            }
            // otherwise we fall-through
        }
        // note: what should we do if field omits freqs? currently it counts as 1...
        DocsEnum de = r.termDocsEnum(liveDocs, field, termText);
        if (de != null) {
            while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                totalTF += de.freq();
            }
        }
    }
    return totalTF;
}
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java
License:Open Source License
public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText)
        throws Exception {
    long totalTF = 0L;
    for (final AtomicReaderContext ctx : reader.getTopReaderContext().leaves()) {
        AtomicReader r = ctx.reader();
        Bits liveDocs = r.getLiveDocs();
        if (liveDocs == null) {
            // TODO: we could do this up front, during the scan
            // (next()), instead of after-the-fact here w/ seek,
            // if the codec supports it and there are no del
            // docs...
            final long totTF = r.totalTermFreq(field, termText);
            if (totTF != -1) {
                totalTF += totTF;
                continue;
            }
            // otherwise we fall-through
        }
        // note: what should we do if field omits freqs? currently it counts as 1...
        DocsEnum de = r.termDocsEnum(liveDocs, field, termText);
        if (de != null) {
            while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                totalTF += de.freq();
            }
        }
    }
    return totalTF;
}
From source file:edu.upenn.library.solrplugins.ProofOfConceptPayloadHandler.java
License:Apache License
private NamedList<Object> buildEntryValue(long count, PostingsEnum postings, Bits liveDocs) throws IOException {
    NamedList<Object> entry = new NamedList<>();
    entry.add("count", count);
    int i = -1;
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        if (!liveDocs.get(postings.docID())) {
            continue;
        }
        i++;
        NamedList<Object> documentEntry = new NamedList<>();
        entry.add("doc" + i, documentEntry);
        for (int j = 0; j < postings.freq(); j++) {
            postings.nextPosition();
            String extra = postings.getPayload().utf8ToString();
            documentEntry.add("position" + j, extra);
        }
    }
    return entry;
}
From source file:experiments.collective.entdoccentric.LTR.ConjunctionTermScorer.java
License:Apache License
private int doNext(int doc) throws IOException {
    do {
        if (lead.doc == DocIdSetIterator.NO_MORE_DOCS) {
            return NO_MORE_DOCS;
        }
        advanceHead: do {
            for (int i = 1; i < docsAndFreqs.length; i++) {
                if (docsAndFreqs[i].doc < doc) {
                    docsAndFreqs[i].doc = docsAndFreqs[i].docs.advance(doc);
                }
                if (docsAndFreqs[i].doc > doc) {
                    // DocsEnum beyond the current doc - break and advance lead
                    break advanceHead;
                }
            }
            // success - all DocsEnums are on the same doc
            return doc;
        } while (true);
        // advance head for next iteration
        doc = lead.doc = lead.docs.nextDoc();
    } while (true);
}
From source file:game.TermFreq.java
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;
    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();
    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null);
        // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }
    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }
    Collections.sort(tfvec);
}
From source file:indexer.Cell.java
List<DocVector> getVectors(IndexReader reader, Terms terms, int numDimensions) throws Exception {
    List<DocVector> containedPoints = new ArrayList<>();
    TermsEnum termsEnum = terms.iterator();
    // seek to a specific term
    boolean found = termsEnum.seekExact(new BytesRef(this.toString()));
    if (found) {
        // enumerate through documents
        DocsEnum docsEnum = termsEnum.docs(null, null);
        int docid;
        while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            Document d = reader.document(docid);
            DocVector dvec = new DocVector(d, numDimensions, DocVector.numIntervals, null);
            containedPoints.add(dvec);
        }
    }
    return containedPoints;
}
From source file:indexer.Retriever.java
private String getTF(IndexReader reader, int docID, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int postingsFreq = 0;
    float wordFreq = 0;
    Term term = new Term(documentField, word);
    BytesRef bytesRef = term.bytes();
    PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, documentField, bytesRef);
    int currentDocID;
    while ((currentDocID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (currentDocID == docID) {
            int _postingsFreq = docsEnum.freq();
            wordFreq += similarity.tf(_postingsFreq);
            postingsFreq += _postingsFreq;
        }
    }
    String printString = "\t" + word + ": TF = " + wordFreq + " (" + postingsFreq + " times in this document)";
    return printString;
}
From source file:info.boytsov.lucene.DumpIndex.java
License:Open Source License
public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;
    String srcDirName = args[1];
    String dstFileName = args[2];
    int minTermFreq = MIN_TERM_FREQ;
    if (args.length >= 4)
        minTermFreq = Integer.parseInt(args[3]);
    int maxTermQty = MAX_TERM_QTY;
    if (args.length >= 5)
        maxTermQty = Integer.parseInt(args[4]);
    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));
        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];
        Arrays.fill(sortTable, -1);
        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");
            URL2DocID remap[] = new URL2DocID[docQty];
            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }
            Arrays.sort(remap);
            System.out.println("Collected and sorted all URLs for re-sorting, " + "filling out the sort table.");
            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
                //System.out.println(remap[newDocID].url);
            }
            System.out.println("Sort table is filled up!");
            for (int i = 0; i < docQty; ++i)
                remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory
            /*
             * Paranoid check: did we change all the -1 to non-negative numbers?
             * Turned out, it wasn't that paranoid. You may have repeating URLs.
             * Then, some elements in sortTable remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");
            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // Identity transformation
            }
        }
        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);
        File dstFile = new File(dstFileName);
        FileOutputStream outData = new FileOutputStream(dstFile);
        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();
        long totalWritten = 0;
        long totalInts = 0;
        int termId = 0;
        int batchWriteSize = 1024 * 1024 * 16;
        /*
         * We are trying to re-use as many objects as possible,
         * in order to reduce the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;
        ByteBuffer buffer = null;
        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();
            TermDesc ts = e.getKey();
            DocsEnum docIter = dict.getDocIterator(ts.text);
            int postQty = ts.freq;
            int qty = 0, prevDocID = -1;
            /*
             * If posting lists appear in the order of descending term frequencies,
             * this will be actually only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty)
                tmpDocId = new int[postQty];
            bufferArray.add(postQty);
            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= docIter.docID()) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }
            /*
             * Now let's resort docIds and write them.
             * REMEMBER that tmpDocId is a buffer that may contain
             * MORE than postQty elements!!!
             * Some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);
            for (int i = 0; i < postQty; ++i)
                bufferArray.add(tmpDocId[i]);
            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;
            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty
                        + " overall written: " + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                        + totalInts / 1e6 + " Millions postings");
            }
            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer, let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }
            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size: "
                + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                + totalInts / 1e6 + " Millions postings");
        // WriteArray may produce a new buffer, let's reuse it
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}