Example usage for org.apache.solr.schema FieldType indexedToReadable

List of usage examples for org.apache.solr.schema FieldType indexedToReadable

Introduction

On this page you can find example usages of org.apache.solr.schema FieldType indexedToReadable.

Prototype

public String indexedToReadable(String indexedForm) 

Source Link

Document

Given an indexed term, return the human-readable representation.

Usage

From source file:org.dyndns.andreasbaumann.LuceneAnalyzer.java

License:Open Source License

/**
 * Dumps every term in the index to stdout, one line per term.
 * <p>
 * Each line holds the field name and term text; depending on the flags it is
 * followed by either the document frequency, the list of matching document ids,
 * or the document ids with the term positions inside each document.
 *
 * @param indexReader     reader for the index to dump
 * @param printHeaders    when true, print a "Terms:" banner first
 * @param isSolr          when true, convert the raw indexed term text to its
 *                        human-readable form via the Solr schema's FieldType
 * @param solrSearch      Solr searcher used to look up the schema; only read when isSolr is true
 * @param printDocNumbers when true, append the ids of the documents containing each term
 * @param printPositions  when true, append document ids plus term positions per document
 * @throws IOException on index access errors
 */
private static void printTerms(IndexReader indexReader, boolean printHeaders, boolean isSolr,
        SolrIndexSearcher solrSearch, boolean printDocNumbers, boolean printPositions) throws IOException {
    if (printHeaders) {
        System.out.println("Terms:");
        System.out.println("======");
    }
    TermEnum terms = indexReader.terms();
    try {
        while (terms.next()) {
            Term term = terms.term();
            // the df lives on the enumerator, not on the term itself
            int df = terms.docFreq();
            String field = term.field();
            String text = term.text();
            if (isSolr) {
                // translate the internal indexed form into the human-readable form
                IndexSchema schema = solrSearch.getSchema();
                SchemaField schemaField = schema.getField(field);
                FieldType fieldType = schemaField.getType();
                text = fieldType.indexedToReadable(text);
            }
            if (!printDocNumbers && !printPositions) {
                // plain listing: field, term text and document frequency
                System.out.print(field + "\t" + text + "\t" + df);
            } else {
                System.out.print(field + "\t" + text);
            }

            if (printDocNumbers) {
                TermDocs termDocs = indexReader.termDocs(term);
                try {
                    boolean first = true;
                    while (termDocs.next()) {
                        // tab before the first id, comma between subsequent ids
                        System.out.print((first ? "\t" : ",") + termDocs.doc());
                        first = false;
                    }
                } finally {
                    // close even if printing fails, so the enumerator is not leaked
                    termDocs.close();
                }
            } else if (printPositions) {
                TermPositions termPositions = indexReader.termPositions(term);
                try {
                    boolean first = true;
                    while (termPositions.next()) {
                        System.out.print((first ? "\t" : ",") + termPositions.doc());
                        first = false;

                        // freq() is invariant for the current document; hoist it
                        // instead of re-evaluating it on every iteration
                        int freq = termPositions.freq();
                        for (int i = 0; i < freq; i++) {
                            int position = termPositions.nextPosition();
                            if (i == 0) {
                                System.out.print("[");
                            }
                            System.out.print(position);
                            // comma between positions, closing bracket after the last one
                            System.out.print(i < freq - 1 ? "," : "]");
                        }
                    }
                } finally {
                    termPositions.close();
                }
            }

            System.out.println("");
        }
    } finally {
        // fix: the original never closed this enumerator; always release it
        terms.close();
    }
    System.out.println("");
}

From source file:org.jahia.services.search.facets.SimpleJahiaJcrFacets.java

License:Open Source License

/**
 * Uses the Lucene FieldCache to get counts for each unique field value in <code>docs</code>. The field must have at most one indexed
 * token per document.
 *
 * @param searcher searcher whose reader backs the FieldCache string index
 * @param docs     set of documents (by internal Lucene doc id) to facet over
 * @param fieldName index field whose terms are counted
 * @param offset   number of leading facet entries to skip (paging)
 * @param limit    maximum number of entries to return; negative means unlimited
 * @param mincount minimum per-term document count required for inclusion
 * @param missing  when true, append a null-keyed entry counting documents with no value
 * @param sort     facet sort mode: count-sorted vs. index-ordered output
 * @param prefix   restrict counting to terms starting with this prefix (null or empty = all)
 * @param locale   forwarded to the missing-count computation
 * @param epd      property definition used to resolve the FieldType for readable keys
 * @return term -&gt; count pairs, keys converted via {@code FieldType#indexedToReadable}
 * @throws IOException on index access errors
 */
public NamedList<Object> getFieldCacheCounts(IndexSearcher searcher, OpenBitSet docs, String fieldName,
        int offset, int limit, int mincount, boolean missing, String sort, String prefix, String locale,
        ExtendedPropertyDefinition epd) throws IOException {
    // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
    // we should use an alternate strategy to avoid
    // 1) creating another huge int[] for the counts
    // 2) looping over that huge int[] looking for the rare non-zeros.
    //
    // Yet another variation: if docs.size() is small and termvectors are stored,
    // then use them instead of the FieldCache.
    //

    // TODO: this function is too big and could use some refactoring, but
    // we also need a facet cache, and refactoring of SimpleFacets instead of
    // trying to pass all the various params around.
    FieldType ft = getType(epd);
    NamedList<Object> res = new NamedList<Object>();

    // si.lookup maps term number -> term text; si.order maps doc id -> term number
    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getIndexReader(), fieldName);
    final String[] terms = si.lookup;
    final int[] termNum = si.order;

    // treat an empty prefix the same as no prefix at all
    if (prefix != null && prefix.length() == 0)
        prefix = null;

    int startTermIndex, endTermIndex;
    if (prefix != null) {
        startTermIndex = Arrays.binarySearch(terms, prefix, nullStrComparator);
        if (startTermIndex < 0)
            startTermIndex = -startTermIndex - 1;
        // find the end term. \uffff isn't a legal unicode char, but only compareTo
        // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
        endTermIndex = Arrays.binarySearch(terms, prefix + "\uffff\uffff\uffff\uffff", nullStrComparator);
        endTermIndex = -endTermIndex - 1;
    } else {
        // slot 0 is skipped here, apparently reserved for the "no value" entry of
        // FieldCache.StringIndex — confirm against the FieldCache documentation
        startTermIndex = 1;
        endTermIndex = terms.length;
    }

    final int nTerms = endTermIndex - startTermIndex;

    // if fewer docs than mincount exist, no term can possibly qualify
    if (nTerms > 0 && docs.size() >= mincount) {

        // count collection array only needs to be as big as the number of terms we are
        // going to collect counts for.
        final int[] counts = new int[nTerms];

        // single pass over the doc set: bump the counter of each doc's term
        DocIdSetIterator iter = docs.iterator();
        while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int term = termNum[iter.docID()];
            int arrIdx = term - startTermIndex;
            if (arrIdx >= 0 && arrIdx < nTerms)
                counts[arrIdx]++;
        }

        // IDEA: we could also maintain a count of "other"... everything that fell outside
        // of the top 'N'

        int off = offset;
        int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

        if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
            int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
            maxsize = Math.min(maxsize, nTerms);
            final TreeSet<SimpleFacets.CountPair<String, Integer>> queue = new TreeSet<SimpleFacets.CountPair<String, Integer>>();
            int min = mincount - 1; // the smallest value in the top 'N' values
            for (int i = 0; i < nTerms; i++) {
                int c = counts[i];
                if (c > min) {
                    // NOTE: we use c>min rather than c>=min as an optimization because we are going in
                    // index order, so we already know that the keys are ordered. This can be very
                    // important if a lot of the counts are repeated (like zero counts would be).
                    queue.add(new SimpleFacets.CountPair<String, Integer>(terms[startTermIndex + i], c));
                    // NOTE(review): breaking once the queue is full keeps the FIRST maxsize
                    // qualifying terms in index order, not necessarily the top-N by count —
                    // verify this is the intended behavior
                    if (queue.size() >= maxsize) {
                        break;
                    }
                }
            }
            // now select the right page from the results
            for (SimpleFacets.CountPair<String, Integer> p : queue) {
                // --off burns through the offset entries, --lim enforces the page size
                if (--off >= 0)
                    continue;
                if (--lim < 0)
                    break;
                res.add(ft.indexedToReadable(p.key), p.val);
            }
        } else {
            // add results in index order
            int i = 0;
            if (mincount <= 0) {
                // if mincount<=0, then we won't discard any terms and we know exactly
                // where to start.
                i = off;
                off = 0;
            }

            for (; i < nTerms; i++) {
                int c = counts[i];
                // skip terms below mincount, then burn through the remaining offset
                if (c < mincount || --off >= 0)
                    continue;
                if (--lim < 0)
                    break;
                res.add(ft.indexedToReadable(terms[startTermIndex + i]), c);
            }
        }
    }

    if (missing) {
        res.add(null, getFieldMissingCount(searcher, docs, fieldName, locale));
    }

    return res;
}

From source file:org.jahia.services.search.facets.SimpleJahiaJcrFacets.java

License:Open Source License

/**
 * Returns a list of terms in the specified field along with the corresponding count of documents in the set that match that constraint.
 * This method uses the FilterCache to get the intersection count between <code>docs</code> and the DocSet for each term in the filter.
 * /*from   w w  w .j av a2 s.  c om*/
 * @see FacetParams#FACET_LIMIT
 * @see FacetParams#FACET_ZEROS
 * @see FacetParams#FACET_MISSING
 */
public NamedList<Object> getFacetTermEnumCounts(IndexSearcher searcher, OpenBitSet docs, String field,
        String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix,
        String locale, ExtendedPropertyDefinition epd) throws IOException {

    /*
     * :TODO: potential optimization... cache the Terms with the highest docFreq and try them first don't enum if we get our max from
     * them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = params.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    IndexReader r = searcher.getIndexReader();
    FieldType ft = getType(epd);

    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final TreeSet<SimpleFacets.CountPair<String, Integer>> queue = (sort.equals("count") || sort.equals("true"))
            ? new TreeSet<SimpleFacets.CountPair<String, Integer>>()
            : null;
    final NamedList<Object> res = new NamedList<Object>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    String startTerm = prefix == null ? "" : ft.toInternal(prefix);
    TermEnum te = r.terms(new Term(fieldName, startTerm));
    TermDocs td = r.termDocs();
    SolrIndexSearcher.TermDocsState tdState = new SolrIndexSearcher.TermDocsState();
    tdState.tenum = te;
    tdState.tdocs = td;

    if (docs.size() >= mincount) {
        do {
            Term t = te.term();

            if (null == t || !t.field().equals(fieldName))
                break;

            if (prefix != null && !t.text().startsWith(prefix))
                break;

            int df = te.docFreq();

            // If we are sorting, we can use df>min (rather than >=) since we
            // are going in index order. For certain term distributions this can
            // make a large difference (for example, many terms with df=1).
            if (df > 0 && df > min) {
                int c;

                if (df >= minDfFilterCache) {
                    // use the filter cache
                    // TODO: use the new method ???                        
                    //                        docs.intersectionSize( searcher.getPositiveDocSet(new TermQuery(t), tdState) );
                    c = (int) OpenBitSet.intersectionCount(getDocIdSet(new TermQuery(t), locale), docs);
                } else {
                    // iterate over TermDocs to calculate the intersection
                    td.seek(te);
                    c = 0;
                    while (td.next()) {
                        int doc = td.doc();
                        if (locale != null) {
                            doc = getMainDocIdForTranslations(
                                    searcher.getIndexReader().document(doc, PARENT_AND_TRANSLATION_FIELDS),
                                    locale);
                        }

                        if (docs.fastGet(doc)) {
                            c++;
                        }
                    }
                }

                if (sort.equals("count") || sort.equals("true")) {
                    if (c > min) {
                        queue.add(new SimpleFacets.CountPair<String, Integer>(t.text(), c));
                        if (queue.size() >= maxsize) {
                            break;
                        }
                    }
                } else {
                    if (c >= mincount && --off < 0) {
                        if (--lim < 0)
                            break;
                        res.add(ft.indexedToReadable(t.text()), c);
                    }
                }
            }
        } while (te.next());
    }

    if (sort.equals("count") || sort.equals("true")) {
        for (SimpleFacets.CountPair<String, Integer> p : queue) {
            if (--off >= 0)
                continue;
            if (--lim < 0)
                break;
            res.add(ft.indexedToReadable(p.key), p.val);
        }
    }

    if (missing) {
        res.add(null, getFieldMissingCount(searcher, docs, fieldName, locale));
    }

    te.close();
    td.close();

    return res;
}