List of usage examples for org.apache.lucene.index.Fields.terms(String field)
public abstract Terms terms(String field) throws IOException;
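For orientation, here is a minimal sketch of the pattern the examples below share, written against the same Lucene 4.x API they use (MultiFields, terms.iterator(TermsEnum)). The reader argument and the "body" field name are hypothetical placeholders, not part of any example on this page.

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Minimal sketch (assumptions: an open IndexReader 'reader' and an indexed field named "body").
static void printTermStats(IndexReader reader) throws IOException {
    Fields fields = MultiFields.getFields(reader); // merged view over all segments; null if the index has no fields
    if (fields == null) {
        return;
    }
    Terms terms = fields.terms("body"); // per-field terms; null if the field was never indexed
    if (terms == null) {
        return;
    }
    TermsEnum termsEnum = terms.iterator(null); // pass a previous TermsEnum instead of null to allow reuse
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        System.out.println(term.utf8ToString()
                + " df=" + termsEnum.docFreq()
                + " ttf=" + termsEnum.totalTermFreq());
    }
}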
From source file:narock.HighFreqTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        // Fields is Iterable<String>, so iterate every field in the index
        for (String field : fields) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:net.tourbook.search.FTSearchManager.java
License:Open Source License
private static InputIterator createTermIterator() throws IOException {

    final TermFreqIteratorListWrapper inputIterator = new TermFreqIteratorListWrapper();

    final List<AtomicReaderContext> leaves = _indexReader.leaves();

    for (final AtomicReaderContext readerContext : leaves) {

        final AtomicReader reader = readerContext.reader();
        final Fields fields = reader.fields();

        for (final String field : fields) {

            if (field.equals(SEARCH_FIELD_DESCRIPTION) || field.equals(SEARCH_FIELD_TITLE)) {

                final Terms terms = fields.terms(field);
                final TermsEnum termsEnum = terms.iterator(null);

                inputIterator.add(termsEnum);
            }
        }
    }

    return inputIterator;
}
From source file:NewsIR_search.CollectionStatistics.java
/**
 * Initialize collectionStat:
 *   docCount      - total number of docs in the index
 *   colSize       - collection size
 *   uniqTermCount - unique terms in the collection
 *   perTermStat   - cf, df of each term in the collection
 * @return
 * @throws IOException
 */
public CollectionStatistics buildCollectionStat() throws IOException {

    long colSize = 0;
    CollectionStatistics collectionStat = new CollectionStatistics();

    collectionStat.docCount = indexReader.maxDoc(); // total number of documents in the index

    Fields fields = MultiFields.getFields(indexReader);
    Terms terms = fields.terms(field);
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef = null;

    while ((byteRef = iterator.next()) != null) { // for each word in the collection
        String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
        int docFreq = iterator.docFreq();        // df of 'term'
        long colFreq = iterator.totalTermFreq(); // cf of 'term'
        collectionStat.perTermStat.put(term, new PerTermStat(term, colFreq, docFreq));
        colSize += colFreq;
    }

    collectionStat.colSize = colSize; // collection size of the index
    collectionStat.uniqTermCount = collectionStat.perTermStat.size();

    return collectionStat;
}
From source file:nl.inl.blacklab.search.lucene.BLSpanTermQuery.java
License:Apache License
/**
 * Overridden from SpanTermQuery to return a BLSpans instead.
 */
@Override
public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts)
        throws IOException {
    TermContext termContext = termContexts.get(term);
    final TermState state;
    if (termContext == null) {
        // this happens with span-not query, as it doesn't include the NOT
        // side in extractTerms()
        // so we seek to the term now in this segment..., this sucks because
        // its ugly mostly!
        final Fields fields = context.reader().fields();
        if (fields != null) {
            final Terms terms = fields.terms(term.field());
            if (terms != null) {
                final TermsEnum termsEnum = terms.iterator(null);
                if (termsEnum.seekExact(term.bytes(), true)) {
                    state = termsEnum.termState();
                } else {
                    state = null;
                }
            } else {
                state = null;
            }
        } else {
            state = null;
        }
    } else {
        state = termContext.get(context.ord);
    }

    if (state == null) { // term is not present in that reader
        return TermSpans.EMPTY_TERM_SPANS;
    }

    final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(null);
    termsEnum.seekExact(term.bytes(), state);
    final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null,
            DocsAndPositionsEnum.FLAG_PAYLOADS);
    if (postings != null) {
        return new TermSpans(postings, term);
    }
    // term does exist, but has no positions
    throw new IllegalStateException("field \"" + term.field()
            + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
}
From source file:org.apache.blur.lucene.search.PrimeDocCache.java
License:Apache License
/**
 * Because of the way this method is called via warm-up methods, the likelihood of
 * creating multiple bitsets during a race condition is very low; that's why
 * this method is not synced.
 */
public static OpenBitSet getPrimeDocBitSet(Term primeDocTerm, AtomicReader providedReader) throws IOException {
    AtomicReader reader = AtomicReaderUtil.getSegmentReader(providedReader);
    final Object key = reader.getCoreCacheKey();
    final Map<Object, OpenBitSet> primeDocMap = getPrimeDocMap(primeDocTerm);
    OpenBitSet bitSet = primeDocMap.get(key);
    if (bitSet == null) {
        synchronized (reader) {
            reader.addReaderClosedListener(new ReaderClosedListener() {
                @Override
                public void onClose(IndexReader reader) {
                    LOG.debug("Current size [" + primeDocMap.size() + "] Prime Doc BitSet removing for segment ["
                            + reader + "]");
                    OpenBitSet openBitSet = primeDocMap.remove(key);
                    if (openBitSet == null) {
                        LOG.warn("Primedoc was missing for key [{0}]", key);
                    }
                }
            });
            LOG.debug("Prime Doc BitSet missing for segment [" + reader + "] current size ["
                    + primeDocMap.size() + "]");
            final OpenBitSet bs = new OpenBitSet(reader.maxDoc());
            MemoryLeakDetector.record(bs, "PrimeDoc BitSet", key.toString());
            Fields fields = reader.fields();
            if (fields == null) {
                throw new IOException("Missing all fields.");
            }
            Terms terms = fields.terms(primeDocTerm.field());
            if (terms == null) {
                throw new IOException("Missing prime doc field [" + primeDocTerm.field() + "].");
            }
            TermsEnum termsEnum = terms.iterator(null);
            if (!termsEnum.seekExact(primeDocTerm.bytes(), true)) {
                throw new IOException("Missing prime doc term [" + primeDocTerm + "].");
            }
            DocsEnum docsEnum = termsEnum.docs(null, null);
            int docFreq = reader.docFreq(primeDocTerm);
            int doc;
            int count = 0;
            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                bs.fastSet(doc);
                count++;
            }
            if (count == docFreq) {
                primeDocMap.put(key, bs);
            } else {
                LOG.warn("PrimeDoc for reader [{0}] not stored, because count [{1}] and freq [{2}] do not match.",
                        reader, count, docFreq);
            }
            return bs;
        }
    }
    return bitSet;
}
From source file:org.apache.blur.lucene.security.search.DocumentVisibilityFilter.java
License:Apache License
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    AtomicReader reader = context.reader();
    List<DocIdSet> list = new ArrayList<DocIdSet>();

    Fields fields = reader.fields();
    Terms terms = fields.terms(_fieldName);
    if (terms == null) {
        // if field is not present then show nothing.
        return DocIdSet.EMPTY_DOCIDSET;
    }
    TermsEnum iterator = terms.iterator(null);
    BytesRef bytesRef;
    DocumentVisibilityEvaluator visibilityEvaluator = new DocumentVisibilityEvaluator(_authorizations);
    while ((bytesRef = iterator.next()) != null) {
        if (isVisible(visibilityEvaluator, bytesRef)) {
            DocIdSet docIdSet = _filterCacheStrategy.getDocIdSet(_fieldName, bytesRef, reader);
            if (docIdSet != null) {
                list.add(docIdSet);
            } else {
                // Do not use acceptDocs because we want the acl cache to be version
                // agnostic.
                DocsEnum docsEnum = iterator.docs(null, null);
                list.add(buildCache(reader, docsEnum, bytesRef));
            }
        }
    }
    return getLogicalOr(list);
}
From source file:org.apache.blur.lucene.warmup.IndexWarmup.java
License:Apache License
public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (atomicReader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) atomicReader;
        Directory directory = segmentReader.directory();
        if (!(directory instanceof TraceableDirectory)) {
            LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.",
                    directory, context);
            return results;
        }
        IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
        String fileName = getSampleFileName(segmentReader.getSegmentName());
        List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
        if (directory.fileExists(fileName)) {
            IndexInput input = directory.openInput(fileName, IOContext.READONCE);
            segmentTraces = read(input);
            input.close();
        } else {
            Fields fields = atomicReader.fields();
            for (String field : fields) {
                LOG.debug("Context [{1}] sampling field [{0}].", field, context);
                Terms terms = fields.terms(field);
                boolean hasOffsets = terms.hasOffsets();
                boolean hasPayloads = terms.hasPayloads();
                boolean hasPositions = terms.hasPositions();
                tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
                IndexTracerResult result = tracer.runTrace(terms);
                segmentTraces.add(result);
            }
            if (_isClosed.get()) {
                LOG.info("Context [{0}] index closed", context);
                return null;
            }
            IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
            write(segmentTraces, output);
            output.close();
        }
        results.put(segmentReader.getSegmentName(), segmentTraces);
    }
    return results;
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
private void applyDeletes(Directory directory, IndexWriter indexWriter, String shard, boolean emitDeletes)
        throws IOException {
    DirectoryReader reader = DirectoryReader.open(directory);
    try {
        LOG.info("Applying deletes in reader [{0}]", reader);
        CompositeReaderContext compositeReaderContext = reader.getContext();
        List<AtomicReaderContext> leaves = compositeReaderContext.leaves();
        BlurPartitioner blurPartitioner = new BlurPartitioner();
        Text key = new Text();
        int numberOfShards = _shardContext.getTableContext().getDescriptor().getShardCount();
        int shardId = ShardUtil.getShardIndex(shard);
        for (AtomicReaderContext context : leaves) {
            AtomicReader atomicReader = context.reader();
            Fields fields = atomicReader.fields();
            Terms terms = fields.terms(BlurConstants.ROW_ID);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef ref = null;
                while ((ref = termsEnum.next()) != null) {
                    key.set(ref.bytes, ref.offset, ref.length);
                    int partition = blurPartitioner.getPartition(key, null, numberOfShards);
                    if (shardId != partition) {
                        throw new IOException("Index is corrupted, RowIds are found in wrong shard, partition ["
                                + partition + "] does not match shard [" + shardId
                                + "], this can happen when rows are not hashed correctly.");
                    }
                    if (emitDeletes) {
                        indexWriter.deleteDocuments(new Term(BlurConstants.ROW_ID, BytesRef.deepCopyOf(ref)));
                    }
                }
            }
        }
    } finally {
        reader.close();
    }
}
From source file:org.apache.blur.utils.BlurUtil.java
License:Apache License
private static void applyFamily(OpenBitSet bits, String family, AtomicReader atomicReader, int primeDocRowId,
        int numberOfDocsInRow, Bits liveDocs) throws IOException {
    Fields fields = atomicReader.fields();
    Terms terms = fields.terms(BlurConstants.FAMILY);
    TermsEnum iterator = terms.iterator(null);
    BytesRef text = new BytesRef(family);
    int lastDocId = primeDocRowId + numberOfDocsInRow;
    if (iterator.seekExact(text, true)) {
        DocsEnum docs = iterator.docs(liveDocs, null, DocsEnum.FLAG_NONE);
        int doc = primeDocRowId;
        while ((doc = docs.advance(doc)) < lastDocId) {
            bits.set(doc - primeDocRowId);
        }
    }
}
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
@SuppressWarnings("unchecked") private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap) throws IOException { SolrParams params = req.getParams(); final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT); TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in. final CharsRef spare = new CharsRef(); Fields fields = MultiFields.getFields(req.getSearcher().getIndexReader()); if (fields == null) { // No indexed fields return;/*from w ww .j a v a2 s .c o m*/ } Terms terms = fields.terms(field); if (terms == null) { // No terms in the field. return; } TermsEnum termsEnum = terms.iterator(null); BytesRef text; int[] buckets = new int[HIST_ARRAY_SIZE]; while ((text = termsEnum.next()) != null) { ++tiq.distinctTerms; int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to. int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1)); buckets[slot] = buckets[slot] + 1; if (numTerms > 0 && freq > tiq.minFreq) { UnicodeUtil.UTF8toUTF16(text, spare); String t = spare.toString(); tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq())); if (tiq.size() > numTerms) { // if tiq full tiq.pop(); // remove lowest in tiq tiq.minFreq = tiq.getTopTermInfo().docFreq; } } } tiq.histogram.add(buckets); fieldMap.add("distinct", tiq.distinctTerms); // Include top terms fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema())); // Add a histogram fieldMap.add("histogram", tiq.histogram.toNamedList()); }