List of usage examples for org.apache.lucene.index Terms getDocCount
public abstract int getDocCount() throws IOException;
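getDocCount() reports how many documents have at least one term for the field, or -1 if the codec does not record that statistic. A minimal sketch of reaching it (the index path and field name below are hypothetical, and a Lucene 5.x-era API with MultiFields is assumed, matching the examples on this page):

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.FSDirectory;

public class GetDocCountExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index path and field name, for illustration only.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            Terms terms = MultiFields.getTerms(reader, "body");
            if (terms != null) {
                // Number of documents with at least one term for this field,
                // or -1 if the statistic is unavailable.
                System.out.println("docs with field 'body': " + terms.getDocCount());
            }
        }
    }
}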
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/**
 * Returns the average number of unique terms per document, multiplied by the
 * number of docs belonging to the input class.
 *
 * @param term the term representing the class
 * @return the average number of unique terms per doc times the number of docs in the class
 * @throws IOException if a low-level I/O problem happens
 */
private double getTextTermFreqForClass(Term term) throws IOException {
    double avgNumberOfUniqueTerms = 0;
    for (String textFieldName : textFieldNames) {
        Terms terms = MultiFields.getTerms(leafReader, textFieldName);
        long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
        avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
    }
    int docsWithC = leafReader.docFreq(term);
    return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}
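Note that the division above assumes every configured text field exists in the index and stores a doc count; a hedged defensive variant (the helper name is hypothetical, not part of the original class) guards against a null Terms and a zero or unknown (-1) doc count:

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;

// Hypothetical helper, for illustration only.
static double avgUniqueTermsPerDoc(LeafReader leafReader, String field) throws IOException {
    Terms terms = MultiFields.getTerms(leafReader, field);
    if (terms == null || terms.getDocCount() <= 0) {
        return 0d; // field absent, empty, or doc count not stored (-1)
    }
    return terms.getSumDocFreq() / (double) terms.getDocCount();
}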
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Returns the average number of unique terms per document, multiplied by the
 * number of docs belonging to the input class.
 *
 * @param term the class term
 * @return the average number of unique terms per doc times the number of docs in the class
 * @throws java.io.IOException if there is a low-level I/O error
 */
private double getTextTermFreqForClass(Term term, String fieldName) throws IOException {
    double avgNumberOfUniqueTerms;
    Terms terms = MultiFields.getTerms(leafReader, fieldName);
    long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
    avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
    int docsWithC = leafReader.docFreq(term);
    return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}
From source file:com.basistech.lucene.tools.LuceneQueryTool.java
License:Apache License
private void countFields() throws IOException {
    for (String field : allFieldNames) {
        List<LeafReaderContext> leaves = indexReader.leaves();
        Map<String, Integer> fieldCounts = new TreeMap<>();
        int count = 0;
        for (LeafReaderContext leaf : leaves) {
            Terms terms = leaf.reader().terms(field);
            if (terms == null) {
                continue;
            }
            count += terms.getDocCount();
        }
        fieldCounts.put(field, count);
        for (Map.Entry<String, Integer> entry : fieldCounts.entrySet()) {
            defaultOut.println(entry.getKey() + ": " + entry.getValue());
        }
    }
}
From source file:com.github.flaxsearch.api.TermsData.java
License:Apache License
public TermsData(Terms terms, List<String> termsList, String encoding) throws IOException {
    this.termCount = terms.size();
    this.docCount = terms.getDocCount();
    this.minTerm = BytesRefUtils.encode(terms.getMin(), encoding);
    this.maxTerm = BytesRefUtils.encode(terms.getMax(), encoding);
    this.terms = termsList;
}
From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java
License:Apache License
/**
 * Builds the frame of the cache. The cache stores word occurrences in memory
 * once they have been searched for the first time. Used properly, this cache
 * can yield a 2-100x speedup, but it can also consume a lot of memory. To lower
 * memory consumption, words with very low occurrence in the index can be
 * filtered out. The other parameter switches the term-searching behavior: if
 * true, only the terms in the skeleton are searched; if false, terms not in
 * the cache are searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache a higher value lowers the cache size.
 * @param justCachedTerms the switch to fully exclude low-occurrence terms.
 * @throws IOException if there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;
    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();
    // build the cache for the words
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null)
                frequency += lastfreq;
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }
    // fill the class list
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }
    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}
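A hedged usage sketch of the cache rebuild. The constructor arguments below assume the Lucene 5.x naive Bayes classifier signature (reader, analyzer, query, class field, text fields); the field names and threshold are illustrative only:

// Assumed already set up: leafReader, analyzer, query. "category" is the class
// field and "body" a text field -- all hypothetical names.
CachingNaiveBayesClassifier classifier =
        new CachingNaiveBayesClassifier(leafReader, analyzer, query, "category", "body");
// Cache only terms that occur in more than 5 documents, and restrict the
// search to cached terms.
classifier.reInitCache(5, true);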
From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java
License:Apache License
private double getTextTermFreqForClass(BytesRef c) throws IOException {
    double avgNumberOfUniqueTerms = 0;
    for (String textFieldName : textFieldNames) {
        Terms terms = MultiFields.getTerms(leafReader, textFieldName);
        long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
        avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
    }
    int docsWithC = leafReader.docFreq(new Term(classFieldName, c));
    return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/**
 * Checks collection-level statistics on Terms.
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
    if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) {
        assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
    }
    if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) {
        assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
    }
    if (leftTerms.getSumTotalTermFreq() != -1 && rightTerms.getSumTotalTermFreq() != -1) {
        assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
    }
    if (leftTerms.size() != -1 && rightTerms.size() != -1) {
        assertEquals(leftTerms.size(), rightTerms.size());
    }
}
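The -1 guards matter because each Terms statistic is documented to return -1 when the codec cannot provide it. A small hypothetical helper distilling that pattern:

import static org.junit.Assert.assertEquals;

// Hypothetical helper in the spirit of the checks above: compare one statistic,
// skipping the assertion when either side reports -1 ("unknown").
static void assertStatEquals(long left, long right) {
    if (left != -1 && right != -1) {
        assertEquals(left, right);
    }
}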
From source file:de.unihildesheim.iw.lucene.search.EmptyFieldFilter.java
License:Open Source License
@Override
public DocIdSet getDocIdSet(@NotNull final LeafReaderContext context, @Nullable final Bits acceptDocs)
        throws IOException {
    FixedBitSet checkBits;
    final LeafReader reader = context.reader();
    final int maxDoc = reader.maxDoc();
    BitSet finalBits = new SparseFixedBitSet(maxDoc);
    if (acceptDocs == null) {
        checkBits = BitsUtils.bits2FixedBitSet(reader.getLiveDocs());
        if (checkBits == null) {
            // all live
            checkBits = new FixedBitSet(maxDoc);
            checkBits.set(0, checkBits.length());
        }
    } else {
        checkBits = BitsUtils.bits2FixedBitSet(acceptDocs);
    }
    @Nullable
    final Terms terms = reader.terms(this.field);
    if (terms != null) {
        final int termsDocCount = terms.getDocCount();
        if (termsDocCount != 0) {
            if (termsDocCount == maxDoc) {
                // all matching
                finalBits = checkBits;
            } else {
                @Nullable
                final Terms t = reader.terms(this.field);
                if (t != null) {
                    PostingsEnum pe = null;
                    final TermsEnum te = t.iterator(null);
                    int docId;
                    while (te.next() != null) {
                        pe = te.postings(checkBits, pe, (int) PostingsEnum.NONE);
                        while ((docId = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            if (checkBits.getAndClear(docId)) {
                                finalBits.set(docId);
                            }
                        }
                    }
                }
            }
        }
    }
    return new BitDocIdSet(finalBits);
}
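The filter above uses getDocCount() as a fast path: 0 means no document has the field, and a value equal to maxDoc() means every document has it, so the postings never need to be walked. A hypothetical helper distilling that shortcut:

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;

// Hypothetical helper: true when every document in the segment has at least one
// term for the field, letting callers skip per-document postings checks.
static boolean allDocsHaveField(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);
    return terms != null && terms.getDocCount() == reader.maxDoc();
}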
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req) throws Exception {
    SolrIndexSearcher searcher = req.getSearcher();
    SolrParams params = req.getParams();

    Set<String> fields = null;
    String fl = params.get(CommonParams.FL);
    if (fl != null) {
        fields = new TreeSet<String>(Arrays.asList(fl.split("[,\\s]+")));
    }

    AtomicReader reader = searcher.getAtomicReader();
    IndexSchema schema = searcher.getSchema();

    // Don't be tempted to put this in the loop below, the whole point here is to alphabetize the fields!
    Set<String> fieldNames = new TreeSet<String>();
    for (FieldInfo fieldInfo : reader.getFieldInfos()) {
        fieldNames.add(fieldInfo.name);
    }

    // Walk the term enum and keep a priority queue for each map in our set
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    for (String fieldName : fieldNames) {
        if (fields != null && !fields.contains(fieldName) && !fields.contains("*")) {
            continue; // we're not interested in this field. Still an issue here
        }

        SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<Object>();

        SchemaField sfield = schema.getFieldOrNull(fieldName);
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        fieldMap.add("type", (ftype == null) ? null : ftype.getTypeName());
        fieldMap.add("schema", getFieldFlags(sfield));
        if (sfield != null && schema.isDynamicField(sfield.getName())
                && schema.getDynamicPattern(sfield.getName()) != null) {
            fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
        }
        Terms terms = reader.fields().terms(fieldName);
        if (terms == null) {
            // Not indexed, so we need to report what we can (it made it through the fl param if specified)
            finfo.add(fieldName, fieldMap);
            continue;
        }

        if (sfield != null && sfield.indexed()) {
            // In the pre-4.0 days, this did a veeeery expensive range query. But we can be much faster now,
            // so just do this all the time.
            Document doc = getFirstLiveDoc(terms, reader);

            if (doc != null) {
                // Found a document with this field
                try {
                    IndexableField fld = doc.getField(fieldName);
                    if (fld != null) {
                        fieldMap.add("index", getFieldFlags(fld));
                    } else {
                        // it is a non-stored field...
                        fieldMap.add("index", "(unstored field)");
                    }
                } catch (Exception ex) {
                    log.warn("error reading field: " + fieldName);
                }
            }
            fieldMap.add("docs", terms.getDocCount());
        }
        if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
            getDetailedFieldInfo(req, fieldName, fieldMap);
        }
        // Add the field
        finfo.add(fieldName, fieldMap);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req) throws Exception {
    SolrIndexSearcher searcher = req.getSearcher();
    SolrParams params = req.getParams();

    Set<String> fields = null;
    String fl = params.get(CommonParams.FL);
    if (fl != null) {
        fields = new TreeSet<>(Arrays.asList(fl.split("[,\\s]+")));
    }

    LeafReader reader = searcher.getSlowAtomicReader();
    IndexSchema schema = searcher.getSchema();

    // Don't be tempted to put this in the loop below, the whole point here is to alphabetize the fields!
    Set<String> fieldNames = new TreeSet<>();
    for (FieldInfo fieldInfo : reader.getFieldInfos()) {
        fieldNames.add(fieldInfo.name);
    }

    // Walk the term enum and keep a priority queue for each map in our set
    SimpleOrderedMap<Object> vInfo = new SimpleOrderedMap<>();
    SimpleOrderedMap<Object> aInfo = new SimpleOrderedMap<>();

    for (String fieldName : fieldNames) {
        if (fields != null && !fields.contains(fieldName) && !fields.contains("*")) {
            continue; // we're not interested in this field. Still an issue here
        }

        SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<>();

        SchemaField sfield = schema.getFieldOrNull(fieldName);
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        fieldMap.add("type", (ftype == null) ? null : ftype.getTypeName());
        fieldMap.add("schema", getFieldFlags(sfield));
        if (sfield != null && schema.isDynamicField(sfield.getName())
                && schema.getDynamicPattern(sfield.getName()) != null) {
            fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
        }
        Terms terms = reader.fields().terms(fieldName);
        if (terms == null) {
            // Not indexed, so we need to report what we can (it made it through the fl param if specified)
            vInfo.add(AlfrescoSolrDataModel.getInstance().getAlfrescoPropertyFromSchemaField(fieldName), fieldMap);
            aInfo.add(fieldName, fieldMap);
            continue;
        }

        if (sfield != null && sfield.indexed()) {
            if (params.getBool(INCLUDE_INDEX_FIELD_FLAGS, true)) {
                Document doc = getFirstLiveDoc(terms, reader);
                if (doc != null) {
                    // Found a document with this field
                    try {
                        IndexableField fld = doc.getField(fieldName);
                        if (fld != null) {
                            fieldMap.add("index", getFieldFlags(fld));
                        } else {
                            // it is a non-stored field...
                            fieldMap.add("index", "(unstored field)");
                        }
                    } catch (Exception ex) {
                        log.warn("error reading field: " + fieldName);
                    }
                }
            }
            fieldMap.add("docs", terms.getDocCount());
        }
        if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
            getDetailedFieldInfo(req, fieldName, fieldMap);
        }
        // Add the field
        vInfo.add(fieldName, fieldMap);
        aInfo.add(AlfrescoSolrDataModel.getInstance().getAlfrescoPropertyFromSchemaField(fieldName), fieldMap);
    }

    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    finfo.addAll(vInfo);
    // finfo.add("mimetype()", finfo.get("cm:content.mimetype"));
    // finfo.add("contentSize()", finfo.get("cm:content.size"));
    finfo.addAll(aInfo);
    return finfo;
}