List of usage examples for org.apache.lucene.index.LeafReader#maxDoc()
public abstract int maxDoc();
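maxDoc() returns one greater than the largest document number in the reader, so it is the usual upper bound for per-segment arrays and doc-ID loops (deleted documents still count towards it). Before the collected examples below, here is a minimal, self-contained sketch of that pattern; the index path "/path/to/index" and the counting logic are illustrative only and not taken from any of the source files listed here:

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class MaxDocExample {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder path
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            for (LeafReaderContext ctx : reader.leaves()) {
                LeafReader leaf = ctx.reader();
                int maxDoc = leaf.maxDoc();              // doc IDs run from 0 (inclusive) to maxDoc (exclusive)
                Bits liveDocs = leaf.getLiveDocs();      // null means the segment has no deletions
                int live = 0;
                for (int docId = 0; docId < maxDoc; docId++) {
                    if (liveDocs == null || liveDocs.get(docId)) {
                        live++;                          // count only documents that are still live
                    }
                }
                System.out.println("segment maxDoc=" + maxDoc + " numDocs=" + leaf.numDocs() + " counted=" + live);
            }
        }
    }
}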
From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java
License:Apache License
/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {

    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));

    try {
        int size = originalIndex.maxDoc();

        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);

        int b = 0;

        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {

            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    doc.add(new Field(fieldName,
                            originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
                }
            } else {
                for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }

            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
        // close IWs
        testWriter.close();
        cvWriter.close();
        trainingWriter.close();
    }
}
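A short usage sketch for the splitter above, assuming a Lucene 5.x setup: the Directory instances, the 0.2/0.1 ratios and the "body"/"title" field names are placeholders, and wrapping the composite reader with SlowCompositeReaderWrapper is just one way to obtain the LeafReader that split() expects.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.store.Directory;

public class SplitExample {
    // sourceDir, trainingDir, testDir and cvDir are caller-supplied Directory instances (placeholders here)
    static void splitIndex(Directory sourceDir, Directory trainingDir, Directory testDir, Directory cvDir)
            throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(sourceDir)) {
            LeafReader originalIndex = SlowCompositeReaderWrapper.wrap(reader); // single-leaf view of the index
            DatasetSplitter splitter = new DatasetSplitter(0.2d, 0.1d);         // 20% test, 10% cross validation
            splitter.split(originalIndex, trainingDir, testDir, cvDir, new StandardAnalyzer(), "body", "title");
        }
    }
}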
From source file:com.meizu.nlp.classification.utils.DataSplitterTest.java
License:Apache License
public static void assertSplit(LeafReader originalIndex, double testRatio, double crossValidationRatio,
        String... fieldNames) throws Exception {

    BaseDirectoryWrapper trainingIndex = newDirectory();
    BaseDirectoryWrapper testIndex = newDirectory();
    BaseDirectoryWrapper crossValidationIndex = newDirectory();

    try {
        DatasetSplitter datasetSplitter = new DatasetSplitter(testRatio, crossValidationRatio);
        datasetSplitter.split(originalIndex, trainingIndex, testIndex, crossValidationIndex,
                new MockAnalyzer(random()), fieldNames);

        assertNotNull(trainingIndex);
        assertNotNull(testIndex);
        assertNotNull(crossValidationIndex);

        DirectoryReader trainingReader = DirectoryReader.open(trainingIndex);
        assertTrue((int) (originalIndex.maxDoc() * (1d - testRatio - crossValidationRatio)) == trainingReader.maxDoc());
        DirectoryReader testReader = DirectoryReader.open(testIndex);
        assertTrue((int) (originalIndex.maxDoc() * testRatio) == testReader.maxDoc());
        DirectoryReader cvReader = DirectoryReader.open(crossValidationIndex);
        assertTrue((int) (originalIndex.maxDoc() * crossValidationRatio) == cvReader.maxDoc());

        trainingReader.close();
        testReader.close();
        cvReader.close();
        closeQuietly(trainingReader);
        closeQuietly(testReader);
        closeQuietly(cvReader);
    } finally {
        if (trainingIndex != null) {
            trainingIndex.close();
        }
        if (testIndex != null) {
            testIndex.close();
        }
        if (crossValidationIndex != null) {
            crossValidationIndex.close();
        }
    }
}
From source file:de.unihildesheim.iw.lucene.search.EmptyFieldFilter.java
License:Open Source License
@Override
public DocIdSet getDocIdSet(@NotNull final LeafReaderContext context, @Nullable final Bits acceptDocs)
        throws IOException {
    FixedBitSet checkBits;
    final LeafReader reader = context.reader();
    final int maxDoc = reader.maxDoc();
    BitSet finalBits = new SparseFixedBitSet(maxDoc);

    if (acceptDocs == null) {
        checkBits = BitsUtils.bits2FixedBitSet(reader.getLiveDocs());
        if (checkBits == null) {
            // all live
            checkBits = new FixedBitSet(maxDoc);
            checkBits.set(0, checkBits.length());
        }
    } else {
        checkBits = BitsUtils.bits2FixedBitSet(acceptDocs);
    }

    @Nullable
    final Terms terms = reader.terms(this.field);
    if (terms != null) {
        final int termsDocCount = terms.getDocCount();
        if (termsDocCount != 0) {
            if (termsDocCount == maxDoc) {
                // all matching
                finalBits = checkBits;
            } else {
                @Nullable
                final Terms t = reader.terms(this.field);
                if (t != null) {
                    PostingsEnum pe = null;
                    final TermsEnum te = t.iterator(null);
                    int docId;
                    while (te.next() != null) {
                        pe = te.postings(checkBits, pe, (int) PostingsEnum.NONE);
                        while ((docId = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            if (checkBits.getAndClear(docId)) {
                                finalBits.set(docId);
                            }
                        }
                    }
                }
            }
        }
    }
    return new BitDocIdSet(finalBits);
}
From source file:de.unihildesheim.iw.lucene.search.IPCFieldFilter.java
License:Open Source License
@Override
public DocIdSet getDocIdSet(@NotNull final LeafReaderContext context, @Nullable final Bits acceptDocs)
        throws IOException {
    final LeafReader reader = context.reader();
    final int maxDoc = reader.maxDoc();
    final BitSet finalBits = new SparseFixedBitSet(maxDoc);

    if (acceptDocs == null) {
        // check all
        for (int i = 0; i < maxDoc; i++) {
            if (this.filterFunc.isAccepted(reader, i, this.ipcParser)) {
                finalBits.set(i);
            }
        }
    } else {
        final BitSet checkBits = BitsUtils.bits2BitSet(acceptDocs);
        final DocIdSetIterator disi = new BitDocIdSet(checkBits).iterator();
        int docId;
        while ((docId = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            if (this.filterFunc.isAccepted(reader, docId, this.ipcParser)) {
                finalBits.set(docId);
            }
        }
    }
    return new BitDocIdSet(finalBits);
}
From source file:main.BM25VASimilarity.java
License:Apache License
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
    BM25Stats bm25stats = (BM25Stats) stats;
    LeafReader reader = context.reader();
    //int docCount = reader.getDocCount(bm25stats.field);

    // BVA calculated for each document
    float[] BVA = new float[reader.maxDoc()];
    float sumOfAverageTermFrequencies = 0.0f;
    // length of each doc
    float[] Ld = new float[reader.maxDoc()];
    // the number of unique terms in the doc.
    float[] Td = new float[reader.maxDoc()];
    NumericDocValues norms = reader.getNormValues(bm25stats.field);
    // int nulldocs = 0;
    for (int i = 0; i < reader.maxDoc(); i++) {
        Terms terms = reader.getTermVector(i, bm25stats.field);
        // norm should be the decoded length of doc d, Ld.
        float norm = norms == null ? k1 : bm25stats.cache[(byte) norms.get(i) & 0xFF];
        Ld[i] = norm;
        // using terms.size() returns Td, the number of unique terms in the doc.
        Td[i] = terms.size();
        // if (terms == null) {
        //     nulldocs++;
        //     continue;
        // }
        float averageTermFrequency = Ld[i] / Td[i];
        sumOfAverageTermFrequencies += averageTermFrequency;
    }

    // calculate mean average term frequency of all documents
    float mavgtf = sumOfAverageTermFrequencies / reader.maxDoc();

    // calculate B_VA for each document
    for (int i = 0; i < reader.maxDoc(); i++) {
        BVA[i] = 1 / (mavgtf * mavgtf) * Ld[i] / Td[i] + (1 - 1 / mavgtf) * Ld[i] / bm25stats.avgdl;
    }
    // System.out.println("Null docs: " + nulldocs);
    // System.out.println("Max docs: " + reader.maxDoc());
    // System.out.println("Doc count: " + reader.getDocCount(bm25stats.field));
    // System.out.println("max docs minus null docs: " + (reader.maxDoc() - nulldocs));
    return new BM25DocScorer(bm25stats, BVA);
}
From source file:nl.inl.blacklab.search.lucene.SpansNGrams.java
License:Apache License
/**
 * Constructs a SpansNGrams
 *
 * @param ignoreLastToken if true, we assume the last token is always a special closing token and ignore it
 * @param reader the index reader, for getting field lengths
 * @param fieldName the field name, for getting field lengths
 * @param min minimum n-gram length
 * @param max maximum n-gram length
 */
public SpansNGrams(boolean ignoreLastToken, LeafReader reader, String fieldName, int min, int max) {
    maxDoc = reader == null ? -1 : reader.maxDoc();
    liveDocs = reader == null ? null : MultiFields.getLiveDocs(reader);
    subtractFromLength = ignoreLastToken ? 1 : 0;
    this.lengthGetter = new DocFieldLengthGetter(reader, fieldName);
    this.min = min;
    this.max = max;
}
From source file:org.alfresco.solr.query.AbstractAuthoritySetQuery.java
License:Open Source License
protected HybridBitSet getACLSet(String[] auths, String field, SolrIndexSearcher searcher) throws IOException {
    /*
     * Build a query that matches the authorities with a field in the ACL records in the index.
     */
    BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
    for (String current : auths) {
        queryBuilder.add(new TermQuery(new Term(field, current)), BooleanClause.Occur.SHOULD);
    }

    /*
     * Collect a docset containing the ACL records that match the query.
     * This query will be in the filter cache. Ideally it would remain cached throughout the user's session.
     */
    DocSet docSet = searcher.getDocSet(queryBuilder.build());
    DocIterator iterator = docSet.iterator();
    if (!iterator.hasNext()) {
        return new EmptyHybridBitSet();
    }

    //TODO: make this configurable. For some systems this is huge and for others not big enough.
    HybridBitSet hybridBitSet = new HybridBitSet(60000000);

    /*
     * Collect the ACLIDs from the matching acl records.
     * This is done in a separate step so the initial ACL query can be cached in the FilterCache.
     * The initial ACL query may be expensive if the number of authorities is very large.
     */
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    LeafReaderContext context = leaves.get(0);
    NumericDocValues aclValues = DocValuesCache.getNumericDocValues(QueryConstants.FIELD_ACLID, context.reader());
    LeafReader reader = context.reader();
    int ceil = reader.maxDoc();
    int base = 0;
    int ord = 0;
    while (iterator.hasNext()) {
        int doc = iterator.nextDoc();
        if (doc >= ceil) {
            do {
                ++ord;
                context = leaves.get(ord);
                reader = context.reader();
                base = context.docBase;
                ceil = base + reader.maxDoc();
                aclValues = DocValuesCache.getNumericDocValues(QueryConstants.FIELD_ACLID, reader);
            } while (doc >= ceil);
        }

        if (aclValues != null) {
            long aclId = aclValues.get(doc - base);
            hybridBitSet.set(aclId);
        }
    }

    return hybridBitSet;
}
From source file:org.alfresco.solr.query.AbstractAuthoritySetQuery.java
License:Open Source License
protected BitsFilter getACLFilter(String[] auths, String field, SolrIndexSearcher searcher) throws IOException {
    HybridBitSet aclBits = getACLSet(auths, field, searcher);
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    List<FixedBitSet> bitSets = new ArrayList<FixedBitSet>(leaves.size());

    for (LeafReaderContext readerContext : leaves) {
        LeafReader reader = readerContext.reader();
        int maxDoc = reader.maxDoc();
        FixedBitSet bits = new FixedBitSet(maxDoc);
        bitSets.add(bits);

        NumericDocValues fieldValues = DocValuesCache.getNumericDocValues(QueryConstants.FIELD_ACLID, reader);
        if (fieldValues != null) {
            for (int i = 0; i < maxDoc; i++) {
                long aclID = fieldValues.get(i);
                if (aclBits.get(aclID)) {
                    bits.set(i);
                }
            }
        }
    }

    return new BitsFilter(bitSets);
}
From source file:org.alfresco.solr.query.DocValuesCache.java
License:Open Source License
public static synchronized NumericDocValues getNumericDocValues(String field, LeafReader reader)
        throws IOException {
    WeakHashMap<Object, NumericDocValues> fieldCache = cache.get(field);

    if (fieldCache == null) {
        fieldCache = new WeakHashMap<Object, NumericDocValues>();
        cache.put(field, fieldCache);
    }

    Object cacheKey = reader.getCoreCacheKey();
    NumericDocValues cachedValues = fieldCache.get(cacheKey);

    if (cachedValues == null) {
        NumericDocValues fieldValues = reader.getNumericDocValues(field);
        if (fieldValues == null) {
            return null;
        } else {
            int maxDoc = reader.maxDoc();
            boolean longs = false;
            int[] intValues = new int[maxDoc]; // Always start off with an int array.
            SettableDocValues settableValues = new IntValues(intValues);

            for (int i = 0; i < maxDoc; i++) {
                long value = fieldValues.get(i);
                if (value > Integer.MAX_VALUE && !longs) {
                    longs = true;
                    settableValues = new LongValues(intValues);
                }
                settableValues.set(i, value);
            }
            fieldCache.put(cacheKey, settableValues);
            return settableValues;
        }
    } else {
        return cachedValues;
    }
}
From source file:org.apache.solr.uninverting.DocTermOrds.java
License:Apache License
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
        // No terms
        return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
        // No terms match
        return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
        final BytesRef t = te.term();
        if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        visitTerm(te, termNum);

        if ((termNum & indexIntervalMask) == 0) {
            // Index this term
            sizeOfIndexedStrings += t.length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.add(indexedTerm);
        }

        final int df = te.docFreq();
        if (df <= maxTermDocFreq) {

            postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (;;) {
                int doc = postingsEnum.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                //System.out.println("  chunk=" + chunk + " docs");

                actualDF++;
                termInstances++;

                //System.out.println("    docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1) {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val >>> 8;
                    int ilen = vIntSize(delta);
                    byte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.length) {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & 0xfffffffc;  // 4 byte alignment
                        byte[] newarr = new byte[newLen];
                        System.arraycopy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = writeInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1;  // update pointer to end index in byte[]
                } else {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0) {
                        ipos = 0;
                    } else if ((val & 0x0000ff80) == 0) {
                        ipos = 1;
                    } else if ((val & 0x00ff8000) == 0) {
                        ipos = 2;
                    } else if ((val & 0xff800000) == 0) {
                        ipos = 3;
                    } else {
                        ipos = 4;
                    }

                    //System.out.println("      ipos=" + ipos);
                    int endPos = writeInt(delta, tempArr, ipos);
                    //System.out.println("      endpos=" + endPos);
                    if (endPos <= 4) {
                        //System.out.println("      fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++) {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    } else {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++) {
                            tempArr[j] = (byte) val;
                            val >>>= 8;
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new byte[12];
                    }
                }
            }
            setActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.next() == null) {
            break;
        }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
        // we didn't invert anything
        // lower memory consumption.
        tnums = null;
    } else {
        this.index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++) {
            byte[] target = tnums[pass];
            int pos = 0;  // end in target;
            if (target != null) {
                pos = target.length;
            } else {
                target = new byte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
                int lim = Math.min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++) {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1) {
                        int len = val >>> 8;
                        //System.out.println("    ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0) {
                            // we only have 24 bits for the array index
                            throw new IllegalStateException("Too many values for UnInvertedField faceting on field " + field);
                        }
                        byte[] arr = bytes[doc];
                        /*
                        for (byte b : arr) {
                            //System.out.println("      b=" + Integer.toHexString((int) b));
                        }
                        */
                        bytes[doc] = null;  // IMPORTANT: allow GC to avoid OOM
                        if (target.length <= pos + len) {
                            int newlen = target.length;
                            /*** we don't have to worry about the array getting too large
                             * since the "pos" param will overflow first (only 24 bits available)
                            if ((newlen<<1) <= 0) {
                              // overflow...
                              newlen = Integer.MAX_VALUE;
                              if (newlen <= pos + len) {
                                throw new SolrException(400,"Too many terms to uninvert field!");
                              }
                            } else {
                              while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            }
                            ****/
                            while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                            byte[] newtarget = new byte[newlen];
                            System.arraycopy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        System.arraycopy(arr, 0, target, pos, len);
                        pos += len + 1;  // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.length) {
                byte[] newtarget = new byte[pos];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            tnums[pass] = target;

            if ((pass << 16) > maxDoc) break;
        }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}