List of usage examples for org.apache.lucene.index.Fields.terms(String field)
public abstract Terms terms(String field) throws IOException;
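For orientation, here is a minimal sketch of the pattern the examples below share, written against the same Lucene 4.x API they use (MultiFields, terms.iterator(TermsEnum)). The reader argument and the "body" field name are hypothetical placeholders, not part of any example on this page.

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Minimal sketch (assumptions: an open IndexReader 'reader' and an indexed field named "body").
static void printTermStats(IndexReader reader) throws IOException {
    Fields fields = MultiFields.getFields(reader); // merged view over all segments; null if the index has no fields
    if (fields == null) {
        return;
    }
    Terms terms = fields.terms("body"); // per-field terms; null if the field was never indexed
    if (terms == null) {
        return;
    }
    TermsEnum termsEnum = terms.iterator(null); // pass a previous TermsEnum instead of null to allow reuse
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        System.out.println(term.utf8ToString()
                + " df=" + termsEnum.docFreq()
                + " ttf=" + termsEnum.totalTermFreq());
    }
}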
From source file:narock.HighFreqTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        // Fields is Iterable<String>, so iterate every field in the index
        for (String field : fields) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:net.tourbook.search.FTSearchManager.java
License:Open Source License
private static InputIterator createTermIterator() throws IOException {

    final TermFreqIteratorListWrapper inputIterator = new TermFreqIteratorListWrapper();

    final List<AtomicReaderContext> leaves = _indexReader.leaves();

    for (final AtomicReaderContext readerContext : leaves) {

        final AtomicReader reader = readerContext.reader();
        final Fields fields = reader.fields();

        for (final String field : fields) {

            if (field.equals(SEARCH_FIELD_DESCRIPTION) || field.equals(SEARCH_FIELD_TITLE)) {

                final Terms terms = fields.terms(field);
                final TermsEnum termsEnum = terms.iterator(null);

                inputIterator.add(termsEnum);
            }
        }
    }

    return inputIterator;
}
From source file:NewsIR_search.CollectionStatistics.java
/**
 * Initialize collectionStat:
 *   docCount      - total number of docs in the index
 *   colSize       - collection size
 *   uniqTermCount - unique terms in the collection
 *   perTermStat   - cf, df of each term in the collection
 * @return
 * @throws IOException
 */
public CollectionStatistics buildCollectionStat() throws IOException {

    long colSize = 0;
    CollectionStatistics collectionStat = new CollectionStatistics();

    collectionStat.docCount = indexReader.maxDoc(); // total number of documents in the index

    Fields fields = MultiFields.getFields(indexReader);
    Terms terms = fields.terms(field);
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef = null;

    while ((byteRef = iterator.next()) != null) { // for each word in the collection
        String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
        int docFreq = iterator.docFreq();        // df of 'term'
        long colFreq = iterator.totalTermFreq(); // cf of 'term'
        collectionStat.perTermStat.put(term, new PerTermStat(term, colFreq, docFreq));
        colSize += colFreq;
    }

    collectionStat.colSize = colSize; // collection size of the index
    collectionStat.uniqTermCount = collectionStat.perTermStat.size();

    return collectionStat;
}
From source file:nl.inl.blacklab.search.lucene.BLSpanTermQuery.java
License:Apache License
/**
 * Overridden from SpanTermQuery to return a BLSpans instead.
 */
@Override
public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts)
        throws IOException {
    TermContext termContext = termContexts.get(term);
    final TermState state;
    if (termContext == null) {
        // this happens with span-not query, as it doesn't include the NOT
        // side in extractTerms()
        // so we seek to the term now in this segment..., this sucks because
        // its ugly mostly!
        final Fields fields = context.reader().fields();
        if (fields != null) {
            final Terms terms = fields.terms(term.field());
            if (terms != null) {
                final TermsEnum termsEnum = terms.iterator(null);
                if (termsEnum.seekExact(term.bytes(), true)) {
                    state = termsEnum.termState();
                } else {
                    state = null;
                }
            } else {
                state = null;
            }
        } else {
            state = null;
        }
    } else {
        state = termContext.get(context.ord);
    }

    if (state == null) { // term is not present in that reader
        return TermSpans.EMPTY_TERM_SPANS;
    }

    final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(null);
    termsEnum.seekExact(term.bytes(), state);
    final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null,
            DocsAndPositionsEnum.FLAG_PAYLOADS);
    if (postings != null) {
        return new TermSpans(postings, term);
    }
    // term does exist, but has no positions
    throw new IllegalStateException("field \"" + term.field()
            + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
}
From source file:org.apache.blur.lucene.search.PrimeDocCache.java
License:Apache License
/**
 * Because of the way this method is called via warm-up methods, the likelihood of
 * creating multiple bitsets during a race condition is very low; that's why
 * this method is not synced.
 */
public static OpenBitSet getPrimeDocBitSet(Term primeDocTerm, AtomicReader providedReader) throws IOException {
    AtomicReader reader = AtomicReaderUtil.getSegmentReader(providedReader);
    final Object key = reader.getCoreCacheKey();
    final Map<Object, OpenBitSet> primeDocMap = getPrimeDocMap(primeDocTerm);
    OpenBitSet bitSet = primeDocMap.get(key);
    if (bitSet == null) {
        synchronized (reader) {
            reader.addReaderClosedListener(new ReaderClosedListener() {
                @Override
                public void onClose(IndexReader reader) {
                    LOG.debug("Current size [" + primeDocMap.size() + "] Prime Doc BitSet removing for segment ["
                            + reader + "]");
                    OpenBitSet openBitSet = primeDocMap.remove(key);
                    if (openBitSet == null) {
                        LOG.warn("Primedoc was missing for key [{0}]", key);
                    }
                }
            });
            LOG.debug("Prime Doc BitSet missing for segment [" + reader + "] current size ["
                    + primeDocMap.size() + "]");
            final OpenBitSet bs = new OpenBitSet(reader.maxDoc());
            MemoryLeakDetector.record(bs, "PrimeDoc BitSet", key.toString());
            Fields fields = reader.fields();
            if (fields == null) {
                throw new IOException("Missing all fields.");
            }
            Terms terms = fields.terms(primeDocTerm.field());
            if (terms == null) {
                throw new IOException("Missing prime doc field [" + primeDocTerm.field() + "].");
            }
            TermsEnum termsEnum = terms.iterator(null);
            if (!termsEnum.seekExact(primeDocTerm.bytes(), true)) {
                throw new IOException("Missing prime doc term [" + primeDocTerm + "].");
            }
            DocsEnum docsEnum = termsEnum.docs(null, null);
            int docFreq = reader.docFreq(primeDocTerm);
            int doc;
            int count = 0;
            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                bs.fastSet(doc);
                count++;
            }
            if (count == docFreq) {
                primeDocMap.put(key, bs);
            } else {
                LOG.warn("PrimeDoc for reader [{0}] not stored, because count [{1}] and freq [{2}] do not match.",
                        reader, count, docFreq);
            }
            return bs;
        }
    }
    return bitSet;
}
From source file:org.apache.blur.lucene.security.search.DocumentVisibilityFilter.java
License:Apache License
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    AtomicReader reader = context.reader();
    List<DocIdSet> list = new ArrayList<DocIdSet>();

    Fields fields = reader.fields();
    Terms terms = fields.terms(_fieldName);
    if (terms == null) {
        // if field is not present then show nothing.
        return DocIdSet.EMPTY_DOCIDSET;
    }
    TermsEnum iterator = terms.iterator(null);
    BytesRef bytesRef;
    DocumentVisibilityEvaluator visibilityEvaluator = new DocumentVisibilityEvaluator(_authorizations);
    while ((bytesRef = iterator.next()) != null) {
        if (isVisible(visibilityEvaluator, bytesRef)) {
            DocIdSet docIdSet = _filterCacheStrategy.getDocIdSet(_fieldName, bytesRef, reader);
            if (docIdSet != null) {
                list.add(docIdSet);
            } else {
                // Do not use acceptDocs because we want the acl cache to be version
                // agnostic.
                DocsEnum docsEnum = iterator.docs(null, null);
                list.add(buildCache(reader, docsEnum, bytesRef));
            }
        }
    }
    return getLogicalOr(list);
}
From source file:org.apache.blur.lucene.warmup.IndexWarmup.java
License:Apache License
public Map<String, List<IndexTracerResult>> sampleIndex(AtomicReader atomicReader, String context)
        throws IOException {
    Map<String, List<IndexTracerResult>> results = new HashMap<String, List<IndexTracerResult>>();
    if (atomicReader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) atomicReader;
        Directory directory = segmentReader.directory();
        if (!(directory instanceof TraceableDirectory)) {
            LOG.info("Context [{1}] cannot warmup directory [{0}] needs to be a TraceableDirectory.",
                    directory, context);
            return results;
        }
        IndexTracer tracer = new IndexTracer((TraceableDirectory) directory, _maxSampleSize);
        String fileName = getSampleFileName(segmentReader.getSegmentName());
        List<IndexTracerResult> segmentTraces = new ArrayList<IndexTracerResult>();
        if (directory.fileExists(fileName)) {
            IndexInput input = directory.openInput(fileName, IOContext.READONCE);
            segmentTraces = read(input);
            input.close();
        } else {
            Fields fields = atomicReader.fields();
            for (String field : fields) {
                LOG.debug("Context [{1}] sampling field [{0}].", field, context);
                Terms terms = fields.terms(field);
                boolean hasOffsets = terms.hasOffsets();
                boolean hasPayloads = terms.hasPayloads();
                boolean hasPositions = terms.hasPositions();
                tracer.initTrace(segmentReader, field, hasPositions, hasPayloads, hasOffsets);
                IndexTracerResult result = tracer.runTrace(terms);
                segmentTraces.add(result);
            }
            if (_isClosed.get()) {
                LOG.info("Context [{0}] index closed", context);
                return null;
            }
            IndexOutput output = directory.createOutput(fileName, IOContext.DEFAULT);
            write(segmentTraces, output);
            output.close();
        }
        results.put(segmentReader.getSegmentName(), segmentTraces);
    }
    return results;
}
From source file:org.apache.blur.manager.writer.IndexImporter.java
License:Apache License
private void applyDeletes(Directory directory, IndexWriter indexWriter, String shard, boolean emitDeletes)
        throws IOException {
    DirectoryReader reader = DirectoryReader.open(directory);
    try {
        LOG.info("Applying deletes in reader [{0}]", reader);
        CompositeReaderContext compositeReaderContext = reader.getContext();
        List<AtomicReaderContext> leaves = compositeReaderContext.leaves();
        BlurPartitioner blurPartitioner = new BlurPartitioner();
        Text key = new Text();
        int numberOfShards = _shardContext.getTableContext().getDescriptor().getShardCount();
        int shardId = ShardUtil.getShardIndex(shard);
        for (AtomicReaderContext context : leaves) {
            AtomicReader atomicReader = context.reader();
            Fields fields = atomicReader.fields();
            Terms terms = fields.terms(BlurConstants.ROW_ID);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef ref = null;
                while ((ref = termsEnum.next()) != null) {
                    key.set(ref.bytes, ref.offset, ref.length);
                    int partition = blurPartitioner.getPartition(key, null, numberOfShards);
                    if (shardId != partition) {
                        throw new IOException("Index is corrupted, RowIds are found in wrong shard, partition ["
                                + partition + "] does not match shard [" + shardId
                                + "], this can happen when rows are not hashed correctly.");
                    }
                    if (emitDeletes) {
                        indexWriter.deleteDocuments(new Term(BlurConstants.ROW_ID, BytesRef.deepCopyOf(ref)));
                    }
                }
            }
        }
    } finally {
        reader.close();
    }
}
From source file:org.apache.blur.utils.BlurUtil.java
License:Apache License
private static void applyFamily(OpenBitSet bits, String family, AtomicReader atomicReader, int primeDocRowId,
        int numberOfDocsInRow, Bits liveDocs) throws IOException {
    Fields fields = atomicReader.fields();
    Terms terms = fields.terms(BlurConstants.FAMILY);
    TermsEnum iterator = terms.iterator(null);
    BytesRef text = new BytesRef(family);
    int lastDocId = primeDocRowId + numberOfDocsInRow;
    if (iterator.seekExact(text, true)) {
        DocsEnum docs = iterator.docs(liveDocs, null, DocsEnum.FLAG_NONE);
        int doc = primeDocRowId;
        while ((doc = docs.advance(doc)) < lastDocId) {
            bits.set(doc - primeDocRowId);
        }
    }
}
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
@SuppressWarnings("unchecked") private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap) throws IOException { SolrParams params = req.getParams(); final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT); TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in. final CharsRef spare = new CharsRef(); Fields fields = MultiFields.getFields(req.getSearcher().getIndexReader()); if (fields == null) { // No indexed fields return;/*from w ww .j a v a2 s .c o m*/ } Terms terms = fields.terms(field); if (terms == null) { // No terms in the field. return; } TermsEnum termsEnum = terms.iterator(null); BytesRef text; int[] buckets = new int[HIST_ARRAY_SIZE]; while ((text = termsEnum.next()) != null) { ++tiq.distinctTerms; int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to. int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1)); buckets[slot] = buckets[slot] + 1; if (numTerms > 0 && freq > tiq.minFreq) { UnicodeUtil.UTF8toUTF16(text, spare); String t = spare.toString(); tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq())); if (tiq.size() > numTerms) { // if tiq full tiq.pop(); // remove lowest in tiq tiq.minFreq = tiq.getTopTermInfo().docFreq; } } } tiq.histogram.add(buckets); fieldMap.add("distinct", tiq.distinctTerms); // Include top terms fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema())); // Add a histogram fieldMap.add("histogram", tiq.histogram.toNamedList()); }