List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
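Before the project examples below, a minimal self-contained sketch of numDocs() in the older IndexReader.open style that most of these examples use; the index path and class name are placeholders for illustration:

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point it at any existing Lucene index.
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // numDocs() counts only live (non-deleted) documents;
            // maxDoc() is one greater than the largest document ID, so
            // maxDoc() - numDocs() is the number of deleted documents
            // still present in the index.
            System.out.println("numDocs: " + reader.numDocs());
            System.out.println("maxDoc:  " + reader.maxDoc());
            System.out.println("deleted: " + reader.numDeletedDocs());
        } finally {
            reader.close();
        }
    }
}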
From source file:org.archive.nutchwax.tools.GetUniqFieldValues.java
License:LGPL
private static void dumpUniqValues(String fieldName, String indexDir) throws Exception {
    IndexReader reader = IndexReader.open(indexDir);

    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();

    Set<String> values = new HashSet<String>();

    for (int i = 0; i < numDocs; i++) {
        values.add(reader.document(i).get(fieldName));
    }

    for (String v : values) {
        System.out.println(v);
    }
}
From source file:org.archive.tnh.FieldCacheLucene.java
License:Apache License
public static void main(String args[]) throws Exception { if (args.length == 0) { System.err.println("FieldCacheLucene: <index...>"); System.exit(1);/* w w w . j a v a 2 s.c om*/ } java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length); for (String arg : args) { try { IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true); readers.add(reader); } catch (IOException ioe) { System.err.println("Error reading: " + arg); } } FieldCacheLucene cache = new FieldCacheLucene("site"); for (IndexReader reader : readers) { int numDocs = reader.numDocs(); System.out.println("Index: " + reader); System.out.println(" numDocs: " + reader.numDocs()); System.out.println(" docBase: -1"); for (int i = 0; i < numDocs; i++) { System.out.println(" doc[" + i + "]: " + cache.getValue(reader, -1, i)); } } }
From source file:org.archive.tnh.FieldCachePreAllocated.java
License:Apache License
public static void main(String args[]) throws Exception { if (args.length == 0) { System.err.println("FieldCachePreAllocated: <index...>"); System.exit(1);//from w w w . j a v a 2 s . com } java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length); int totalNumDocuments = 0; for (String arg : args) { try { IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true); totalNumDocuments += reader.numDocs(); readers.add(reader); } catch (IOException ioe) { System.err.println("Error reading: " + arg); } } FieldCachePreAllocated siteCache = new FieldCachePreAllocated("site", readers.size(), totalNumDocuments); int docBase = 0; for (IndexReader reader : readers) { siteCache.getFieldCache(reader, docBase); docBase += reader.numDocs(); } for (Map.Entry<IndexReader, Integer> e : siteCache.readerDocBases.entrySet()) { IndexReader reader = e.getKey(); docBase = e.getValue(); int numDocs = reader.numDocs(); System.out.println("Index: " + reader); System.out.println(" numDocs: " + numDocs); System.out.println(" docBase: " + docBase); String[] sitePerDoc = siteCache.getFieldCache(reader, docBase); for (int i = 0; i < numDocs; i++) { System.out.println(" doc[" + i + "]: " + sitePerDoc[i + docBase]); } } }
From source file:org.archive.tnh.servlet.DiagnosticServlet.java
License:Apache License
public void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    Document doc = new Document();
    Element root = new Element("info");
    doc.addContent(root);

    Search search = (Search) this.getServletConfig().getServletContext().getAttribute("tnh.search");
    if (search == null) {
        OpenSearchHelper.writeResponse(doc, response);
        return;
    }

    Set<String> indexNames = new HashSet(Arrays.asList(ServletHelper.getParam(request, "i",
            search.searchers.keySet().toArray(QueryParameters.EMPTY_STRINGS))));
    Set<String> fieldNames = new HashSet(
            Arrays.asList(ServletHelper.getParam(request, "f", QueryParameters.EMPTY_STRINGS)));

    for (String indexName : indexNames) {
        Searcher searcher = search.searchers.get(indexName);
        if (searcher == null)
            continue;

        Element e = new Element("searcher");
        root.addContent(e);
        e.setAttribute("name", indexName);
        e.setAttribute("type", searcher.getClass().getCanonicalName());

        try {
            IndexReader ir = ((IndexSearcher) searcher).getIndexReader();

            Element ise = new Element("index");
            e.addContent(ise);
            ise.setAttribute("numDocs", Integer.toString(ir.numDocs()));

            TermDocs termDocs = ir.termDocs();

            for (String fieldName : ir.getFieldNames(IndexReader.FieldOption.ALL)
                    .toArray(QueryParameters.EMPTY_STRINGS)) {
                // If this field is not requested, skip it.
                if (!fieldNames.contains(fieldName)) {
                    continue;
                }

                Element field = new Element("field");
                field.setAttribute("name", fieldName);
                ise.addContent(field);

                // Iterate through the terms and for each term that
                // belongs to this field, count up the number of
                // documents containing that term and add it to the
                // XML Document.
                TermEnum termEnum = ir.terms(new Term(fieldName));
                do {
                    Term term = termEnum.term();

                    if (term == null || !fieldName.equals(term.field()))
                        continue;

                    termDocs.seek(termEnum);

                    int c = 0;
                    for (; termDocs.next(); c++)
                        ;

                    Element value = new Element("term");
                    value.setAttribute("name", term.text());
                    value.setAttribute("count", Integer.toString(c));
                    field.addContent(value);
                } while (termEnum.next());
            }
        } catch (ClassCastException cce) {
        }
    }

    OpenSearchHelper.writeResponse(doc, response);
}
From source file:org.archive.tnh.tools.IndexDumper.java
License:Apache License
private static void dumpIndex(IndexReader reader, List<String> fields, boolean includeDocIds) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    // If no fields were specified, then dump them all.
    if (fields.size() == 0) {
        fields.addAll(fieldNames);
    } else {
        for (String field : fields) {
            if (!fieldNames.contains(field)) {
                System.out.println("Field not in index: " + field);
                System.exit(2);
            }
        }
    }

    int numDocs = reader.numDocs();

    for (int i = 0; i < numDocs; i++) {
        if (includeDocIds) {
            System.out.print(i + "\t");
        }
        for (String field : fields) {
            System.out.print(Arrays.toString(reader.document(i).getValues(field)));
            System.out.print("\t");
        }
        System.out.println();
    }
}
From source file:org.capelin.transaction.utils.TXLuceneRecordImporter.java
License:GNU General Public License
protected int importRecords(IndexReader reader, Session session) throws IOException {
    CapelinRecord data = null;
    int totalDoc = reader.numDocs();

    // Read documents
    for (int i = 0; i < totalDoc; i++) {
        data = buildRecord(reader.document(i));
        if (null != data)
            session.save(data);
        if (i % BATCH_SIZE == 0) {
            session.flush(); // apply changes to indexes
            session.clear(); // free memory since the queue is processed
            log.info(i);
        }
    }
    return totalDoc;
}
From source file:org.deals.lucene.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all term texts of a given Query into an array of WeightedTerms.
 *
 * @param query Query to extract term texts from
 * @param reader used to compute IDF, which can be used to a) score selected fragments better
 *        b) use graded highlights, e.g. changing the intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
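For intuition about the IDF formula above, a quick numeric check with made-up counts: with totalNumDocs = 1000 and docFreq = 9, idf = ln(1000 / (9 + 1)) + 1 = ln(100) + 1 ≈ 5.61, whereas a term appearing in 999 documents gets ln(1000 / 1000) + 1 = 1.0, so rarer terms receive proportionally larger weights.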
From source file:org.deshang.content.indexing.scheduling.ContentIndexingTask.java
License:Apache License
private void calcPersonTermDocFreqInfo(TermDocFreqStatistics statistics, IndexReader reader) throws IOException {
    long docNum = reader.numDocs();
    LOGGER.debug("Total number of documents is " + docNum + ".");

    List<AtomicReaderContext> atomicCtxList = reader.leaves();
    for (AtomicReaderContext ctx : atomicCtxList) {
        FilterAtomicReader far = new FilterAtomicReader(ctx.reader());
        for (String field : far.fields()) {
            Terms terms = far.fields().terms(field);
            LOGGER.debug("Reader [" + far.toString() + "] totally has " + terms.size() + " term(s).");

            TermsEnum termsEnum = terms.iterator(null);
            BytesRef term = null;
            while ((term = termsEnum.next()) != null) {
                String termUtf8String = term.utf8ToString();
                int existPersonDocFreq = statistics.getTermPersonDocFreq(termUtf8String);
                int personDocFreq = far.docFreq(new Term(field, term));
                double personDocFreqPercent = ((double) personDocFreq) / docNum;
                if (existPersonDocFreq < 0) {
                    personDocFreq += statistics.getTermPersonDocFreq(termUtf8String);
                    personDocFreqPercent += statistics.getTermPersonDocFreqPercent(termUtf8String);
                }
                statistics.putTermPersonDocFreqInfo(termUtf8String, personDocFreq, personDocFreqPercent);
            }
        }
        far.close();
    }
}
From source file:org.dspace.search.DSIndexer.java
License:BSD License
/**
 * Iterates over all documents in the Lucene index and verifies they
 * are in the database; if not, they are removed.
 *
 * @param context
 * @throws IOException
 * @throws SQLException
 */
public static void cleanIndex(Context context) throws IOException, SQLException {
    IndexReader reader = DSQuery.getIndexReader();
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < reader.numDocs(); i++) {
        if (!liveDocs.get(i)) {
            // document is deleted...
            log.debug("Encountered deleted doc: " + i);
        } else {
            Document doc = reader.document(i);
            String handle = doc.get("handle");

            if (!StringUtils.isEmpty(handle)) {
                DSpaceObject o = HandleManager.resolveToObject(context, handle);

                if (o == null) {
                    log.info("Deleting: " + handle);
                    /* Use IndexWriter to delete; it's easier to manage write.lock */
                    DSIndexer.unIndexContent(context, handle);
                } else {
                    context.removeCached(o, o.getID());
                    log.debug("Keeping: " + handle);
                }
            }
        }
    }
}
From source file:org.dyndns.andreasbaumann.LuceneAnalyzer.java
License:Open Source License
private static void printGlobalInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr,
        SolrIndexSearcher solrSearch) throws IOException {
    if (printHeaders) {
        System.out.println("Global Information:");
        System.out.println("===================");
    }

    System.out.println("\tnumber of documents: " + indexReader.numDocs());

    // we should get the number of features differently, this is inefficient, but Lucene
    // has no notion of global statistics (because the default weighting schema doesn't
    // make use of it!)
    int nofFeatures = 0;
    int nofTokens = 0;
    TermEnum terms = indexReader.terms();
    while (terms.next()) {
        Term term = terms.term();
        int df = terms.docFreq();
        nofFeatures++;
        nofTokens += df;
    }
    System.out.println("\ttotal number of features: " + nofFeatures);
    System.out.println("\ttotal number of tokens: " + nofTokens);

    System.out.println("\tversion: " + indexReader.getVersion());
    System.out.println("\tstill current: " + indexReader.isCurrent());
    // TODO: we don't get segment information!
    // System.out.println("is optimized:" + segmentInfos.size() == 1 && !indexReader.hasDeletions());
    System.out.println("\tmaximal document number: " + indexReader.maxDoc());
    System.out.println("\thas deletions: " + indexReader.hasDeletions());

    if (isSolr) {
        System.out.println("\tSolr version: " + solrSearch.getVersion());
    }
    System.out.println("");
}