Usage examples for org.apache.lucene.index.IndexReader#numDocs()
public abstract int numDocs();
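For orientation, a minimal sketch (not taken from the sources below) contrasting numDocs() with maxDoc(): numDocs() counts live documents only, while maxDoc() also counts deleted-but-not-yet-merged document slots. The index path is a placeholder, and the DirectoryReader/FSDirectory entry points assume a Lucene 5.x-era API.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a hypothetical placeholder path.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            System.out.println("live docs:        " + reader.numDocs());
            System.out.println("max doc id bound: " + reader.maxDoc());
            System.out.println("deleted docs:     " + reader.numDeletedDocs());
        }
    }
}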
From source file:org.tallison.lucene.corpus.stats.IDFIndexCalc.java
License:Apache License
public IDFIndexCalc(IndexReader reader) {
    super(reader.numDocs());
    this.reader = reader;
}
From source file:org.tallison.solr.search.concordance.KeywordCooccurRankHandler.java
License:Apache License
public static NamedList doLocalSearch(Query filter, SolrQueryRequest req) throws Exception {
    SolrParams params = req.getParams();
    String field = getField(params);
    String fl = params.get(CommonParams.FL);
    DocMetadataExtractor metadataExtractor = (fl != null && fl.length() > 0)
            ? new SimpleDocMetadataExtractor(fl.split(","))
            : new SimpleDocMetadataExtractor();
    CooccurConfig config = configureParams(field, params);
    IndexSchema schema = req.getSchema();
    SchemaField sf = schema.getField(field);
    Analyzer analyzer = sf.getType().getIndexAnalyzer();
    Filter queryFilter = getFilterQuery(req);
    String q = params.get(CommonParams.Q);
    Query query = QParser.getParser(q, null, req).parse();
    String solrUniqueKeyField = req.getSchema().getUniqueKeyField().getName();
    SolrIndexSearcher solr = req.getSearcher();
    IndexReader reader = solr.getIndexReader();
    boolean allowDuplicates = false;
    boolean allowFieldSeparators = false;
    Grammer grammer = new WGrammer(config.getMinNGram(), config.getMaxNGram(), allowFieldSeparators);
    IDFCalc idfCalc = new IDFCalc(reader);
    CooccurVisitor visitor = new CooccurVisitor(field, config.getTokensBefore(), config.getTokensAfter(),
            grammer, idfCalc, config.getMaxWindows(), allowDuplicates);
    visitor.setMinTermFreq(config.getMinTermFreq());
    try {
        ConcordanceArrayWindowSearcher searcher = new ConcordanceArrayWindowSearcher();
        System.out.println("UNIQUE KEY FIELD: " + solrUniqueKeyField);
        DocIdBuilder docIdBuilder = new FieldBasedDocIdBuilder(solrUniqueKeyField);
        System.out.println("QUERY: " + query.toString());
        searcher.search(reader, field, query, queryFilter, analyzer, visitor, docIdBuilder);
    } catch (IllegalArgumentException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (TargetTokenNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    List<TermIDF> overallResults = visitor.getResults();
    NamedList results = toNamedList(overallResults);
    // needed for cloud computations, merging cores
    results.add("collectionSize", reader.numDocs());
    results.add("numDocsVisited", visitor.getNumDocsVisited());
    results.add("numWindowsVisited", visitor.getNumWindowsVisited());
    results.add("numResults", overallResults.size());
    results.add("minTF", visitor.getMinTermFreq());
    return results;
}
From source file:org.toubassi.femtozip.lucene.IndexAnalyzer.java
License:Apache License
private IndexReader openIndex(String path) throws IOException {
    IndexReader reader = IndexReader.open(path);
    totalIndexSize = FileUtil.computeSize(new File(path));
    totalNumDocs = reader.numDocs();
    return reader;
}
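IndexReader.open(String) here is the old pre-Lucene-3.x entry point, long since removed. A hedged modernization of the same method, assuming a Lucene 5.x-era API (FileUtil.computeSize and the totalIndexSize/totalNumDocs fields come from the surrounding class):

private IndexReader openIndex(String path) throws IOException {
    // DirectoryReader replaces the removed IndexReader.open(...) factory methods.
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(path)));
    totalIndexSize = FileUtil.computeSize(new File(path));
    totalNumDocs = reader.numDocs();
    return reader;
}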
From source file:org.toubassi.femtozip.lucene.IndexDocumentList.java
License:Apache License
public IndexDocumentList(IndexReader reader, int numSamples, int firstDoc, String fieldName) throws IOException {
    this.reader = reader;
    this.fieldName = fieldName;
    numDocs = reader.numDocs();
    float samplingRate = ((float) numSamples) / numDocs;
    ArrayList<Integer> docIdsList = new ArrayList<Integer>();
    ArrayList<Integer> fieldCountList = new ArrayList<Integer>();
    int numDocsScanned = 0, numDocsSampled = 0;
    for (int i = firstDoc, count = reader.maxDoc(); i < count; i++) {
        numDocsScanned++;
        if (reader.isDeleted(i)) {
            continue;
        }
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(i);
        Field fields[] = doc.getFields(fieldName);
        if (fields.length > 0) {
            if (fields[0].isStored()) {
                docIdsList.add(i);
                fieldCountList.add(fields.length);
            }
        }
    }
    docIds = new int[docIdsList.size()];
    for (int i = 0, count = docIdsList.size(); i < count; i++) {
        docIds[i] = docIdsList.get(i);
    }
    fieldCounts = new int[fieldCountList.size()];
    for (int i = 0, count = fieldCountList.size(); i < count; i++) {
        fieldCounts[i] = fieldCountList.get(i);
        if (i > 0) {
            fieldCounts[i] += fieldCounts[i - 1];
        }
    }
}
From source file:org.toubassi.femtozip.lucene.IndexDumper.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    numDocs = reader.numDocs();
    int maxDocId = reader.maxDoc();
    float samplingRate = ((float) numSamples) / numDocs;
    int numDocsScanned = 0;
    int numDocsSampled = 0;
    for (int docId = 0; docId < maxDocId; docId++) {
        if (reader.isDeleted(docId)) {
            continue;
        }
        numDocsScanned++;
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(docId);
        System.out.println("DOCUMENT: " + docId);
        for (String fieldName : fieldNames) {
            if (fieldsToDump != null && fieldsToDump.indexOf(fieldName) == -1) {
                continue;
            }
            Field[] fields = doc.getFields(fieldName);
            for (Field field : fields) {
                if (!field.isStored() || field.isCompressed()) {
                    // TODO if its compressed, uncompress it and benchmark it.
                    continue;
                }
                byte[] bytes;
                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }
                if (bytes.length > 0) {
                    System.out.print(" " + fieldName + " " + bytes.length + " ");
                    System.out.write(bytes);
                    System.out.println();
                }
            }
        }
    }
    reader.close();
}
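The two femtozip snippets above share the same deterministic sampling trick: a document is sampled whenever the running target (documents scanned times the sampling rate) pulls ahead of the count sampled so far. A standalone sketch of just that loop, with illustrative numbers and no Lucene types:

public class SamplingSketch {
    public static void main(String[] args) {
        int numDocs = 100, numSamples = 10;
        float samplingRate = ((float) numSamples) / numDocs;
        int scanned = 0, sampled = 0;
        for (int docId = 0; docId < numDocs; docId++) {
            scanned++;
            // Skip until the target count (scanned * rate) exceeds what we have sampled.
            if (((int) (scanned * samplingRate)) <= sampled) {
                continue;
            }
            sampled++;
            System.out.println("sampled doc " + docId); // docs 9, 19, ..., 99
        }
        System.out.println("total sampled: " + sampled); // 10
    }
}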
From source file:org.zilverline.core.AbstractCollection.java
License:Open Source License
/**
 * Get the number of documents in this collection. The number is not
 * calculated, but stored after the indexing process, so this is a cheap
 * operation.
 *
 * @return number of documents in the collection
 */
public final int getNumberOfDocs() {
    if (isIndexingInProgress()) {
        IndexReader index = null;
        try {
            File thisIndex = getIndexDirWithManagerDefaults();
            index = IndexReader.open(thisIndex);
            if (index != null) {
                return index.numDocs();
            }
        } catch (IOException e) {
            log.warn("Error getting index for collection '" + name + "'", e);
        } finally {
            if (index != null) {
                try {
                    index.close();
                } catch (IOException e1) {
                    log.error("Error closing index for collection " + name, e1);
                }
            }
        }
    }
    return numberOfDocs;
}
From source file:org.zilverline.core.AbstractCollection.java
License:Open Source License
/**
 * Initialize this collection by getting its index. It retrieves the number
 * of documents and the MD5 hash of all documents in the collection.
 *
 * If the index does not exist (this is a new Collection), just return.
 *
 * @throws IndexException
 *             when an existing index of the Collection can not be successfully
 *             opened.
 */
public final void init() throws IndexException {
    log.debug("Initializing collection " + name);
    IndexReader index = null;
    // Determine whether the collection exists on disk
    setExistsOnDisk();
    // check whether this collection has a cache for the MD5 hashes of documents
    if (md5DocumentCache == null) {
        md5DocumentCache = new HashSet();
    }
    // check whether this collection has a cache for the MD5 hashes of indexed archives
    if (archiveCache == null) {
        archiveCache = new HashSet();
    }
    if (!isIndexValid()) {
        log.info("Index does not exist (yet) for collection '" + name + "'. Possibly new collection.");
        numberOfDocs = 0;
        return;
    }
    // Get the index
    File thisIndex = getIndexDirWithManagerDefaults();
    try {
        index = IndexReader.open(thisIndex);
        if (index != null) {
            numberOfDocs = index.numDocs();
            // retrieve all hashes of Documents from the cache
            md5DocumentCache.clear();
            for (int i = 0; i < numberOfDocs; i++) {
                Document d = index.document(i);
                String hashValue = d.get("hash");
                md5DocumentCache.add(hashValue);
            }
            // get some relevant information from the index
            version = IndexReader.getCurrentVersion(thisIndex); // deprecated, but needed
            lastIndexed = new Date(IndexReader.lastModified(thisIndex));
            log.debug("Collection " + name + " has " + numberOfDocs + " documents, index created at: "
                    + lastIndexed);
        } else {
            log.error("Index could not be retrieved for collection " + name);
        }
    } catch (IOException e) {
        throw new IndexException("Error initializing collection '" + name + "'", e);
    } finally {
        if (index != null) {
            try {
                index.close();
            } catch (IOException e1) {
                log.error("Error closing index for collection " + name, e1);
            }
        } else {
            numberOfDocs = 0;
            version = 0;
            lastIndexed = null;
        }
    }
}
From source file:proj.zoie.impl.indexing.internal.DiskSearchIndex.java
License:Apache License
/**
 * Gets the number of docs in the currently loaded index.
 *
 * @return number of docs
 */
public int getNumdocs() {
    IndexReader reader = _dispenser.getIndexReader();
    if (reader != null) {
        return reader.numDocs();
    } else {
        return 0;
    }
}
From source file:project.lucene.RelativeTermWeightQuery.java
License:Apache License
@Override
public Query rewrite(IndexReader reader) throws IOException {
    if (this.terms.isEmpty()) {
        return new BooleanQuery();
    } else if (this.terms.size() == 1) {
        final Query tq = newTermQuery(this.terms.get(0), null);
        tq.setBoost(getBoost());
        return tq;
    }
    final List<AtomicReaderContext> leaves = reader.leaves();
    final int totalDocs = reader.numDocs();
    final TermContext[] contextArray = new TermContext[terms.size()];
    final Term[] queryTerms = this.terms.toArray(new Term[0]);
    collectTermContext(reader, leaves, contextArray, queryTerms);
    return buildQuery(totalDocs, contextArray, queryTerms);
}
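The collectTermContext and buildQuery helpers are not shown in this snippet. As a hedged stand-in, one plausible way to fill the TermContext array in a Lucene 4.x-era API is TermContext.build, which gathers per-leaf term statistics itself (leaving the explicit leaves parameter unused in this simplified form; the original helper may well walk the leaves by hand instead):

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;

// Hypothetical stand-in for the helper used above, not the project's actual code.
static void collectTermContext(IndexReader reader, List<AtomicReaderContext> leaves,
        TermContext[] contextArray, Term[] queryTerms) throws IOException {
    for (int i = 0; i < queryTerms.length; i++) {
        contextArray[i] = TermContext.build(reader.getContext(), queryTerms[i]);
    }
}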
From source file:retriever.TermStats.java
TermStats(String term, int tf, IndexReader reader) throws Exception {
    this.term = term;
    this.tf = tf;
    idf = (float) (Math.log(reader.numDocs()
            / (float) (reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term)))));
}
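Note that this IDF divides numDocs() by docFreq directly, so a term that never occurs in the field (docFreq == 0) yields an infinite float, not an exception. A hedged variant, not part of the original source, that applies standard add-one smoothing to keep the ratio finite:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

public class SmoothedIdf {
    // Add-one smoothing keeps the ratio finite for unseen terms (docFreq == 0).
    public static float idf(IndexReader reader, String field, String term) throws IOException {
        int df = reader.docFreq(new Term(field, term));
        return (float) Math.log((reader.numDocs() + 1.0) / (df + 1.0));
    }
}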