List of usage examples for org.apache.solr.schema.FieldType#indexedToReadable
public String indexedToReadable(String indexedForm)
From source file:org.dyndns.andreasbaumann.LuceneAnalyzer.java
License:Open Source License
private static void printTerms(IndexReader indexReader, boolean printHeaders, boolean isSolr, SolrIndexSearcher solrSearch, boolean printDocNumbers, boolean printPositions) throws IOException { if (printHeaders) { System.out.println("Terms:"); System.out.println("======"); }/*from www . j a v a2 s . com*/ TermEnum terms = indexReader.terms(); while (terms.next()) { Term term = terms.term(); // the df is stored in the iterator and not in the term, weird... int df = terms.docFreq(); String field = term.field(); String text = term.text(); if (isSolr) { IndexSchema schema = solrSearch.getSchema(); SchemaField schemaField = schema.getField(field); FieldType fieldType = schemaField.getType(); text = fieldType.indexedToReadable(text); } if (!printDocNumbers && !printPositions) { System.out.print(field + "\t" + text + "\t" + df); } else { System.out.print(field + "\t" + text); } if (printDocNumbers) { TermDocs termDocs = indexReader.termDocs(term); boolean first = true; while (termDocs.next()) { if (first) { System.out.print("\t" + termDocs.doc()); first = false; } else { System.out.print("," + termDocs.doc()); } } termDocs.close(); } else if (printPositions) { TermPositions termPositions = indexReader.termPositions(term); boolean first = true; while (termPositions.next()) { if (first) { System.out.print("\t" + termPositions.doc()); first = false; } else { System.out.print("," + termPositions.doc()); } for (int i = 0; i < termPositions.freq(); i++) { int position = termPositions.nextPosition(); if (i == 0) { System.out.print("["); } System.out.print(position); if (i < termPositions.freq() - 1) { System.out.print(","); } if (i == termPositions.freq() - 1) { System.out.print("]"); } } } termPositions.close(); } System.out.println(""); } System.out.println(""); }
From source file:org.jahia.services.search.facets.SimpleJahiaJcrFacets.java
License:Open Source License
/** * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>. The field must have at most one indexed * token per document./*from w w w .ja v a2s .com*/ */ public NamedList<Object> getFieldCacheCounts(IndexSearcher searcher, OpenBitSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String locale, ExtendedPropertyDefinition epd) throws IOException { // TODO: If the number of terms is high compared to docs.size(), and zeros==false, // we should use an alternate strategy to avoid // 1) creating another huge int[] for the counts // 2) looping over that huge int[] looking for the rare non-zeros. // // Yet another variation: if docs.size() is small and termvectors are stored, // then use them instead of the FieldCache. // // TODO: this function is too big and could use some refactoring, but // we also need a facet cache, and refactoring of SimpleFacets instead of // trying to pass all the various params around. FieldType ft = getType(epd); NamedList<Object> res = new NamedList<Object>(); FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getIndexReader(), fieldName); final String[] terms = si.lookup; final int[] termNum = si.order; if (prefix != null && prefix.length() == 0) prefix = null; int startTermIndex, endTermIndex; if (prefix != null) { startTermIndex = Arrays.binarySearch(terms, prefix, nullStrComparator); if (startTermIndex < 0) startTermIndex = -startTermIndex - 1; // find the end term. \uffff isn't a legal unicode char, but only compareTo // is used, so it should be fine, and is guaranteed to be bigger than legal chars. 
endTermIndex = Arrays.binarySearch(terms, prefix + "\uffff\uffff\uffff\uffff", nullStrComparator); endTermIndex = -endTermIndex - 1; } else { startTermIndex = 1; endTermIndex = terms.length; } final int nTerms = endTermIndex - startTermIndex; if (nTerms > 0 && docs.size() >= mincount) { // count collection array only needs to be as big as the number of terms we are // going to collect counts for. final int[] counts = new int[nTerms]; DocIdSetIterator iter = docs.iterator(); while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int term = termNum[iter.docID()]; int arrIdx = term - startTermIndex; if (arrIdx >= 0 && arrIdx < nTerms) counts[arrIdx]++; } // IDEA: we could also maintain a count of "other"... everything that fell outside // of the top 'N' int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) { int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1; maxsize = Math.min(maxsize, nTerms); final TreeSet<SimpleFacets.CountPair<String, Integer>> queue = new TreeSet<SimpleFacets.CountPair<String, Integer>>(); int min = mincount - 1; // the smallest value in the top 'N' values for (int i = 0; i < nTerms; i++) { int c = counts[i]; if (c > min) { // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). 
queue.add(new SimpleFacets.CountPair<String, Integer>(terms[startTermIndex + i], c)); if (queue.size() >= maxsize) { break; } } } // now select the right page from the results for (SimpleFacets.CountPair<String, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; res.add(ft.indexedToReadable(p.key), p.val); } } else { // add results in index order int i = 0; if (mincount <= 0) { // if mincount<=0, then we won't discard any terms and we know exactly // where to start. i = off; off = 0; } for (; i < nTerms; i++) { int c = counts[i]; if (c < mincount || --off >= 0) continue; if (--lim < 0) break; res.add(ft.indexedToReadable(terms[startTermIndex + i]), c); } } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, fieldName, locale)); } return res; }
From source file:org.jahia.services.search.facets.SimpleJahiaJcrFacets.java
License:Open Source License
/** * Returns a list of terms in the specified field along with the corresponding count of documents in the set that match that constraint. * This method uses the FilterCache to get the intersection count between <code>docs</code> and the DocSet for each term in the filter. * /*from w w w .j av a2 s. c om*/ * @see FacetParams#FACET_LIMIT * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Object> getFacetTermEnumCounts(IndexSearcher searcher, OpenBitSet docs, String field, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String locale, ExtendedPropertyDefinition epd) throws IOException { /* * :TODO: potential optimization... cache the Terms with the highest docFreq and try them first don't enum if we get our max from * them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = params.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); IndexReader r = searcher.getIndexReader(); FieldType ft = getType(epd); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final TreeSet<SimpleFacets.CountPair<String, Integer>> queue = (sort.equals("count") || sort.equals("true")) ? new TreeSet<SimpleFacets.CountPair<String, Integer>>() : null; final NamedList<Object> res = new NamedList<Object>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; String startTerm = prefix == null ? 
"" : ft.toInternal(prefix); TermEnum te = r.terms(new Term(fieldName, startTerm)); TermDocs td = r.termDocs(); SolrIndexSearcher.TermDocsState tdState = new SolrIndexSearcher.TermDocsState(); tdState.tenum = te; tdState.tdocs = td; if (docs.size() >= mincount) { do { Term t = te.term(); if (null == t || !t.field().equals(fieldName)) break; if (prefix != null && !t.text().startsWith(prefix)) break; int df = te.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache // TODO: use the new method ??? // docs.intersectionSize( searcher.getPositiveDocSet(new TermQuery(t), tdState) ); c = (int) OpenBitSet.intersectionCount(getDocIdSet(new TermQuery(t), locale), docs); } else { // iterate over TermDocs to calculate the intersection td.seek(te); c = 0; while (td.next()) { int doc = td.doc(); if (locale != null) { doc = getMainDocIdForTranslations( searcher.getIndexReader().document(doc, PARENT_AND_TRANSLATION_FIELDS), locale); } if (docs.fastGet(doc)) { c++; } } } if (sort.equals("count") || sort.equals("true")) { if (c > min) { queue.add(new SimpleFacets.CountPair<String, Integer>(t.text(), c)); if (queue.size() >= maxsize) { break; } } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; res.add(ft.indexedToReadable(t.text()), c); } } } } while (te.next()); } if (sort.equals("count") || sort.equals("true")) { for (SimpleFacets.CountPair<String, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; res.add(ft.indexedToReadable(p.key), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, fieldName, locale)); } te.close(); td.close(); return res; }