Example usage for org.apache.lucene.index Fields terms

Introduction

On this page you can find example usage for org.apache.lucene.index Fields.terms.

Prototype

public abstract Terms terms(String field) throws IOException;

Source Link

Document

Get the Terms for this field.
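
As a quick orientation before the full examples below, here is a minimal sketch of the typical call pattern against the Lucene 4.x API used on this page: obtain a Fields view, look up the Terms for one field, null-check it (the field may not be indexed), and walk its TermsEnum. The reader variable and the "title" field name are placeholder assumptions, not taken from any of the sources below.

// Minimal sketch (Lucene 4.x API); "reader" and "title" are placeholders.
Fields fields = MultiFields.getFields(reader); // merged view over all segments; may be null for an empty index
if (fields != null) {
    Terms terms = fields.terms("title"); // null if the field was never indexed
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
        }
    }
}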

Usage

From source file:narock.HighFreqTerms.java

License:Apache License

/**
 * @param reader the index reader to scan
 * @param numTerms the number of top terms to collect
 * @param fieldNames the fields to inspect, or null to inspect all fields
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        // Fields is Iterable<String>, so walk every indexed field in the index
        for (String field : fields) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}

From source file:net.tourbook.search.FTSearchManager.java

License:Open Source License

private static InputIterator createTermIterator() throws IOException {

    final TermFreqIteratorListWrapper inputIterator = new TermFreqIteratorListWrapper();

    final List<AtomicReaderContext> leaves = _indexReader.leaves();

    for (final AtomicReaderContext readerContext : leaves) {

        final AtomicReader reader = readerContext.reader();
        final Fields fields = reader.fields();

        for (final String field : fields) {

            if (field.equals(SEARCH_FIELD_DESCRIPTION) || field.equals(SEARCH_FIELD_TITLE)) {

                final Terms terms = fields.terms(field);
                final TermsEnum termsEnum = terms.iterator(null);

                inputIterator.add(termsEnum);
            }
        }
    }

    return inputIterator;
}

From source file:NewsIR_search.CollectionStatistics.java

/**
 * Initialize collectionStat:
 * docCount      - total number of docs in the index
 * colSize       - collection size (total term occurrences)
 * uniqTermCount - number of unique terms in the collection
 * perTermStat   - cf and df of each term in the collection
 * @return the populated CollectionStatistics
 * @throws IOException 
 */
public CollectionStatistics buildCollectionStat() throws IOException {

    long colSize = 0;

    CollectionStatistics collectionStat = new CollectionStatistics();

    collectionStat.docCount = indexReader.maxDoc(); // total number of documents in the index

    Fields fields = MultiFields.getFields(indexReader);
    Terms terms = fields.terms(field);
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef = null;

    while ((byteRef = iterator.next()) != null) {
        //* for each word in the collection
        String term = byteRef.utf8ToString(); // decode the UTF-8 term bytes
        int docFreq = iterator.docFreq(); // df of 'term'
        long colFreq = iterator.totalTermFreq(); // cf of 'term'
        collectionStat.perTermStat.put(term, new PerTermStat(term, colFreq, docFreq));
        colSize += colFreq;
    }
    collectionStat.colSize = colSize; // collection size of the index
    collectionStat.uniqTermCount = collectionStat.perTermStat.size();

    return collectionStat;
}

From source file:nl.inl.blacklab.search.lucene.BLSpanTermQuery.java

License:Apache License

/**
 * Overridden from SpanTermQuery to return a BLSpans instead.
 */
@Override
public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts)
        throws IOException {
    TermContext termContext = termContexts.get(term);
    final TermState state;
    if (termContext == null) {
        // this happens with span-not query, as it doesn't include the NOT
        // side in extractTerms()
        // so we seek to the term now in this segment..., this sucks because
        // its ugly mostly!
        final Fields fields = context.reader().fields();
        if (fields != null) {
            final Terms terms = fields.terms(term.field());
            if (terms != null) {
                final TermsEnum termsEnum = terms.iterator(null);
                if (termsEnum.seekExact(term.bytes(), true)) {
                    state = termsEnum.termState();
                } else {
                    state = null;
                }
            } else {
                state = null;
            }
        } else {
            state = null;
        }
    } else {
        state = termContext.get(context.ord);
    }

    if (state == null) { // term is not present in that reader
        return TermSpans.EMPTY_TERM_SPANS;
    }

    final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(null);
    termsEnum.seekExact(term.bytes(), state);

    final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null,
            DocsAndPositionsEnum.FLAG_PAYLOADS);

    if (postings != null) {
        return new TermSpans(postings, term);
    }
    // term does exist, but has no positions
    throw new IllegalStateException("field \"" + term.field()
            + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
}

From source file:org.apache.blur.lucene.search.PrimeDocCache.java

License:Apache License

/**
 * Because this method is called via warm-up methods, the likelihood of
 * creating multiple bitsets during a race condition is very low; that's why
 * this method is not synchronized.
 */
public static OpenBitSet getPrimeDocBitSet(Term primeDocTerm, AtomicReader providedReader) throws IOException {
    AtomicReader reader = AtomicReaderUtil.getSegmentReader(providedReader);
    final Object key = reader.getCoreCacheKey();
    final Map<Object, OpenBitSet> primeDocMap = getPrimeDocMap(primeDocTerm);
    OpenBitSet bitSet = primeDocMap.get(key);
    if (bitSet == null) {
        synchronized (reader) {
            reader.addReaderClosedListener(new ReaderClosedListener() {
                @Override
                public void onClose(IndexReader reader) {
                    LOG.debug("Current size [" + primeDocMap.size()
                            + "] Prime Doc BitSet removing for segment [" + reader + "]");
                    OpenBitSet openBitSet = primeDocMap.remove(key);
                    if (openBitSet == null) {
                        LOG.warn("Primedoc was missing for key [{0}]", key);
                    }
                }
            });
            LOG.debug("Prime Doc BitSet missing for segment [" + reader + "] current size ["
                    + primeDocMap.size() + "]");
            final OpenBitSet bs = new OpenBitSet(reader.maxDoc());
            MemoryLeakDetector.record(bs, "PrimeDoc BitSet", key.toString());

            Fields fields = reader.fields();
            if (fields == null) {
                throw new IOException("Missing all fields.");
            }
            Terms terms = fields.terms(primeDocTerm.field());
            if (terms == null) {
                throw new IOException("Missing prime doc field [" + primeDocTerm.field() + "].");
            }
            TermsEnum termsEnum = terms.iterator(null);
            if (!termsEnum.seekExact(primeDocTerm.bytes(), true)) {
                throw new IOException("Missing prime doc term [" + primeDocTerm + "].");
            }

            DocsEnum docsEnum = termsEnum.docs(null, null);
            int docFreq = reader.docFreq(primeDocTerm);
            int doc;
            int count = 0;
            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                bs.fastSet(doc);
                count++;
            }
            if (count == docFreq) {
                primeDocMap.put(key, bs);
            } else {
                LOG.warn(
                        "PrimeDoc for reader [{0}] not stored, because count [{1}] and freq [{2}] do not match.",
                        reader, count, docFreq);
            }
            return bs;
        }
    }
    return bitSet;
}

From source file:org.apache.blur.lucene.security.search.DocumentVisibilityFilter.java

License:Apache License

@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    AtomicReader reader = context.reader();
    List<DocIdSet> list = new ArrayList<DocIdSet>();

    Fields fields = reader.fields();
    Terms terms = fields.terms(_fieldName);
    if (terms == null) {
        // if field is not present then show nothing.
        return DocIdSet.EMPTY_DOCIDSET;
    }
    TermsEnum iterator = terms.iterator(null);
    BytesRef bytesRef;
    DocumentVisibilityEvaluator visibilityEvaluator = new DocumentVisibilityEvaluator(_authorizations);
    while ((bytesRef = iterator.next()) != null) {
        if (isVisible(visibilityEvaluator, bytesRef)) {
            DocIdSet docIdSet = _filterCacheStrategy.getDocIdSet(_fieldName, bytesRef, reader);
            if (docIdSet != null) {
                list.add(docIdSet);
            } else {
                // Do not use acceptDocs because we want the acl cache to be version
                // agnostic.
                DocsEnum docsEnum = iterator.docs(null, null);
                list.add(buildCache(reader, docsEnum, bytesRef));
            }
        }
    }
    return getLogicalOr(list);
}

From source file:org.apache.blur.lucene.warmup.IndexWarmup.java

License:Apache License

public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (atomicReader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) atomicReader;
        Directory directory = segmentReader.directory();
        if (!(directory instanceof TraceableDirectory)) {
            LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.", directory,
                    context);
            return results;
        }
        IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
        String fileName = getSampleFileName(segmentReader.getSegmentName());
        List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
        if (directory.fileExists(fileName)) {
            IndexInput input = directory.openInput(fileName, IOContext.READONCE);
            segmentTraces = read(input);
            input.close();
        } else {
            Fields fields = atomicReader.fields();
            for (String field : fields) {
                LOG.debug("Context [{1}] sampling field [{0}].", field, context);
                Terms terms = fields.terms(field);
                boolean hasOffsets = terms.hasOffsets();
                boolean hasPayloads = terms.hasPayloads();
                boolean hasPositions = terms.hasPositions();

                tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
                IndexTracerResult result = tracer.runTrace(terms);
                segmentTraces.add(result);
            }
            if (_isClosed.get()) {
                LOG.info("Context [{0}] index closed", context);
                return null;
            }
            IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
            write(segmentTraces, output);
            output.close();
        }
        results.put(segmentReader.getSegmentName(), segmentTraces);
    }
    return results;
}

From source file:org.apache.blur.manager.writer.IndexImporter.java

License:Apache License

private void applyDeletes(Directory directory, IndexWriter indexWriter, String shard, boolean emitDeletes)
        throws IOException {
    DirectoryReader reader = DirectoryReader.open(directory);
    try {
        LOG.info("Applying deletes in reader [{0}]", reader);
        CompositeReaderContext compositeReaderContext = reader.getContext();
        List<AtomicReaderContext> leaves = compositeReaderContext.leaves();
        BlurPartitioner blurPartitioner = new BlurPartitioner();
        Text key = new Text();
        int numberOfShards = _shardContext.getTableContext().getDescriptor().getShardCount();
        int shardId = ShardUtil.getShardIndex(shard);
        for (AtomicReaderContext context : leaves) {
            AtomicReader atomicReader = context.reader();
            Fields fields = atomicReader.fields();
            Terms terms = fields.terms(BlurConstants.ROW_ID);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef ref = null;
                while ((ref = termsEnum.next()) != null) {
                    key.set(ref.bytes, ref.offset, ref.length);
                    int partition = blurPartitioner.getPartition(key, null, numberOfShards);
                    if (shardId != partition) {
                        throw new IOException("Index is corrupted, RowIds are found in wrong shard, partition ["
                                + partition + "] does not shard [" + shardId
                                + "], this can happen when rows are not hashed correctly.");
                    }
                    if (emitDeletes) {
                        indexWriter.deleteDocuments(new Term(BlurConstants.ROW_ID, BytesRef.deepCopyOf(ref)));
                    }
                }
            }
        }
    } finally {
        reader.close();
    }
}

From source file:org.apache.blur.utils.BlurUtil.java

License:Apache License

private static void applyFamily(OpenBitSet bits, String family, AtomicReader atomicReader, int primeDocRowId,
        int numberOfDocsInRow, Bits liveDocs) throws IOException {
    Fields fields = atomicReader.fields();
    Terms terms = fields.terms(BlurConstants.FAMILY);
    TermsEnum iterator = terms.iterator(null);
    BytesRef text = new BytesRef(family);
    int lastDocId = primeDocRowId + numberOfDocsInRow;
    if (iterator.seekExact(text, true)) {
        DocsEnum docs = iterator.docs(liveDocs, null, DocsEnum.FLAG_NONE);
        int doc = primeDocRowId;
        while ((doc = docs.advance(doc)) < lastDocId) {
            bits.set(doc - primeDocRowId);
        }
    }
}

From source file:org.apache.solr.handler.admin.LukeRequestHandler.java

License:Apache License

@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
        throws IOException {

    SolrParams params = req.getParams();
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

    TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in.

    final CharsRef spare = new CharsRef();

    Fields fields = MultiFields.getFields(req.getSearcher().getIndexReader());

    if (fields == null) { // No indexed fields
        return;
    }

    Terms terms = fields.terms(field);
    if (terms == null) { // No terms in the field.
        return;
    }
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef text;
    int[] buckets = new int[HIST_ARRAY_SIZE];
    while ((text = termsEnum.next()) != null) {
        ++tiq.distinctTerms;
        int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to.
        int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
        buckets[slot] = buckets[slot] + 1;
        if (numTerms > 0 && freq > tiq.minFreq) {
            UnicodeUtil.UTF8toUTF16(text, spare);
            String t = spare.toString();

            tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
            if (tiq.size() > numTerms) { // if tiq full
                tiq.pop(); // remove lowest in tiq
                tiq.minFreq = tiq.getTopTermInfo().docFreq;
            }
        }
    }
    tiq.histogram.add(buckets);
    fieldMap.add("distinct", tiq.distinctTerms);

    // Include top terms
    fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

    // Add a histogram
    fieldMap.add("histogram", tiq.histogram.toNamedList());
}