Usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
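Before the project examples below, here is a minimal sketch of typical maxDoc() usage, assuming Lucene 5+ where FSDirectory.open takes a java.nio.file.Path (the examples below use both the older File-based and the newer Path-based APIs); the index path is a placeholder:

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index; the path is a placeholder.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));
        try {
            // maxDoc() is one greater than the largest possible document number and
            // includes deleted documents; numDocs() counts only live documents.
            System.out.println("maxDoc = " + reader.maxDoc() + ", numDocs = " + reader.numDocs());

            // maxDoc() bounds the valid document-id range [0, maxDoc()).
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                Document doc = reader.document(docId); // stored fields; deleted ids are still in this range
            }
        } finally {
            reader.close();
        }
    }
}

As the examples below show, maxDoc() is most often used either as a loop bound over document ids or to size per-document data structures such as arrays and bitsets.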
From source file:info.boytsov.lucene.DumpIndex.java
License:Open Source License
public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;
    String srcDirName = args[1];
    String dstFileName = args[2];

    int minTermFreq = MIN_TERM_FREQ;
    if (args.length >= 4) minTermFreq = Integer.parseInt(args[3]);

    int maxTermQty = MAX_TERM_QTY;
    if (args.length >= 5) maxTermQty = Integer.parseInt(args[4]);

    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];
        Arrays.fill(sortTable, -1);

        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");
            URL2DocID remap[] = new URL2DocID[docQty];
            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }
            Arrays.sort(remap);
            System.out.println("Collected and sorted all URLs for re-sorting, filling out the sort table.");
            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
                //System.out.println(remap[newDocID].url);
            }
            System.out.println("Sort table is filled up!");

            for (int i = 0; i < docQty; ++i) remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory

            /*
             * Paranoid check: did we change all the -1 values to non-negative numbers?
             * It turned out not to be that paranoid: you may have repeating URLs,
             * in which case some elements of sortTable remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");
            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // Identity transformation
            }
        }

        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

        File dstFile = new File(dstFileName);
        FileOutputStream outData = new FileOutputStream(dstFile);

        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

        long totalWritten = 0;
        long totalInts = 0;
        int termId = 0;

        int batchWriteSize = 1024 * 1024 * 16;

        /*
         * We are trying to re-use as many objects as possible,
         * in order to reduce the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;
        ByteBuffer buffer = null;

        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();
            TermDesc ts = e.getKey();

            DocsEnum docIter = dict.getDocIterator(ts.text);

            int postQty = ts.freq;
            int qty = 0, prevDocID = -1;

            /*
             * If posting lists appear in the order of descending term frequencies,
             * this will actually be only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty) tmpDocId = new int[postQty];

            bufferArray.add(postQty);

            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= docIter.docID()) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }

            /*
             * Now let's re-sort doc IDs and write them.
             * REMEMBER that tmpDocId is a buffer that may contain
             * MORE than postQty elements; some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);

            for (int i = 0; i < postQty; ++i) bufferArray.add(tmpDocId[i]);

            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;

            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty
                        + " overall written: " + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                        + totalInts / 1e6 + " million postings");
            }
            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer, let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }
            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size: "
                + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                + totalInts / 1e6 + " million postings");
        // WriteArray may produce a new buffer, let's reuse it
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:info.boytsov.lucene.GetTotPostQty.java
License:Open Source License
public static void main(String[] args) {
    if (args.length != 1) {
        printUsage();
        System.exit(1);
    }
    String srcDirName = args[0];
    System.out.println("Source dir: " + srcDirName);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();

        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(FIELD_NAME);

        long totalInts = 0;
        int termQty = 0;

        for (TermsEnum termIter = terms.iterator(null); termIter.next() != null;) {
            totalInts += termIter.docFreq();
            //System.out.println(termQty + " -> " + termIter.docFreq());
            ++termQty;
            if (termQty % 1000000 == 0) System.out.println("Read " + termQty + " dictionary terms");
        }

        System.out.println("Term qty: " + termQty + " Doc qty: " + docQty + " postings qty: " + totalInts);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:info.extensiblecatalog.OAIToolkit.oai.dataproviders.LuceneFacadeDataProvider.java
License:Open Source License
synchronized static public void initializeCachedFullHarvest() {
    if (cachedFullHarvestIds == null) {
        IndexReader indexReader;
        try {
            indexReader = ApplInfo.luceneSearcher.getIndexReader().clone(true);
        } catch (CorruptIndexException e1) {
            prglog.error("[PRG] " + e1);
            return;
        } catch (IOException e1) {
            prglog.error("[PRG] " + e1);
            return;
        }
        cachedFullHarvestIndexSearcher = new IndexSearcher(indexReader);
        try {
            cachedFullHarvestEarliestDate = TextUtil.luceneToDate(ApplInfo.luceneSearcher.getEarliestDatestamp());
            cachedFullHarvestExpiry = ApplInfo.luceneSearcher.getLatestDatestamp();
        } catch (ParseException pe) {
            prglog.error("[PRG] " + pe);
            return;
        }

        BooleanQuery query = new BooleanQuery();
        // don't include deleted records
        query.add((Query) new TermQuery(new Term("is_deleted", "false")), Occur.MUST);
        // do we need to filter based on orgCode?
        if (ApplInfo.getOrgCodeFilter() != null) {
            query.add((Query) new TermQuery(new Term("repository_code", ApplInfo.getOrgCodeFilter())), Occur.MUST);
        }

        try {
            cachedFullHarvestIds = new BitSet(indexReader.maxDoc());
            cachedFullHarvestIndexSearcher.search(query, new Collector() {
                private int docBase;

                // ignore scorer
                public void setScorer(Scorer scorer) {
                }

                // accept docs out of order (for a BitSet it doesn't matter)
                public boolean acceptsDocsOutOfOrder() {
                    return true;
                }

                public void collect(int doc) {
                    cachedFullHarvestIds.set(doc + docBase);
                }

                public void setNextReader(IndexReader reader, int docBase) {
                    this.docBase = docBase;
                }
            });
            prglog.info("[PRG] Initial Full Harvest Cache created successfully.");
        } catch (IOException e) {
            prglog.error("[PRG] " + e);
            cachedFullHarvestIds = null;
            return;
        }
    }
}
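The anonymous Collector above uses the pre-Lucene-5 API (setNextReader(IndexReader, int), acceptsDocsOutOfOrder()). For comparison, here is a hedged sketch of the same pattern, a BitSet sized by maxDoc() and filled per segment, written against the Lucene 5-7 SimpleCollector API; the class and method names are illustrative, not taken from the project above.

import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;

public class MatchingDocIdCollector {
    /** Collects the ids of all documents matching the query into a BitSet sized by maxDoc(). */
    public static BitSet collectMatchingDocIds(IndexSearcher searcher, Query query) throws IOException {
        IndexReader reader = searcher.getIndexReader();
        final BitSet hits = new BitSet(reader.maxDoc()); // maxDoc() bounds the index-wide doc-id space
        searcher.search(query, new SimpleCollector() {
            private int docBase;

            @Override
            protected void doSetNextReader(LeafReaderContext context) {
                // Per-segment doc ids are rebased to index-wide ids via the segment's docBase.
                this.docBase = context.docBase;
            }

            @Override
            public void collect(int doc) {
                hits.set(docBase + doc);
            }

            @Override
            public boolean needsScores() {
                return false; // Lucene 5-7; Lucene 8+ uses scoreMode() instead
            }
        });
        return hits;
    }
}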
From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java
License:Apache License
private List<Integer> getProcessedDocs(IndexReader idxR) throws IOException {
    List<Integer> docs = new ArrayList<Integer>();
    for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
        if (idxR.hasDeletions() == false) {
            Document doc = idxR.document(i);
            if (eligibleForDocRank(doc.get("doctype"))) {
                docs.add(i);
            }
        }
    }
    return docs;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * If the result is deterministic we can cache all the docids. All queries can share this cache.
 */
private ScoreDoc[] buildInternalDocidsCache(SearchArgs args) throws IOException {
    String index = args.axiom_index == null ? args.index : args.axiom_index;
    Path indexPath = Paths.get(index);
    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        throw new IllegalArgumentException(index + " does not exist or is not a directory.");
    }
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
    IndexSearcher searcher = new IndexSearcher(reader);
    if (args.searchtweets) {
        return searcher.search(new FieldValueQuery(TweetGenerator.StatusField.ID_LONG.name), reader.maxDoc(),
                BREAK_SCORE_TIES_BY_TWEETID).scoreDocs;
    }
    return searcher.search(new FieldValueQuery(LuceneDocumentGenerator.FIELD_ID), reader.maxDoc(),
            BREAK_SCORE_TIES_BY_DOCID).scoreDocs;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 *    is calculated as
 * <pre>
 *   P(both occur)*log{P(both occur)/P(t1 occurs)/P(t2 occurs)}
 * + P(both do not occur)*log{P(both do not occur)/P(t1 does not occur)/P(t2 does not occur)}
 * + P(t1 occurs, t2 does not occur)*log{P(t1 occurs, t2 does not occur)/P(t1 occurs)/P(t2 does not occur)}
 * + P(t1 does not occur, t2 occurs)*log{P(t1 does not occur, t2 occurs)/P(t1 does not occur)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 *    PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of term -> Set<docId> where the Set of docIds is where the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // calculate the Mutual Information between each term and each query term
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }
    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the p-queue itself stores all terms
    // in the reranking pool and their reranking scores with respect to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }
    return resultTermScores;
}
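The helper computeMutualInformation is not shown in the snippet above. As a rough illustration of the formula described in the javadoc, a sketch of a pairwise mutual-information computation over the reranking pool might look like the following; this is an assumption about the helper's behavior for explanatory purposes, not Anserini's actual implementation.

import java.util.Set;

public class MutualInformationSketch {
    /**
     * Mutual information of two binary "term occurs in document" events, estimated from
     * the document sets of the two terms over a pool of totalDocs documents.
     */
    public static double computeMutualInformation(Set<Integer> docsWithT1, Set<Integer> docsWithT2, int totalDocs) {
        long both = docsWithT1.stream().filter(docsWithT2::contains).count();
        long onlyT1 = docsWithT1.size() - both;
        long onlyT2 = docsWithT2.size() - both;
        long neither = totalDocs - both - onlyT1 - onlyT2;

        double pT1 = (double) docsWithT1.size() / totalDocs;
        double pT2 = (double) docsWithT2.size() / totalDocs;

        return miTerm(both, totalDocs, pT1, pT2)
                + miTerm(neither, totalDocs, 1 - pT1, 1 - pT2)
                + miTerm(onlyT1, totalDocs, pT1, 1 - pT2)
                + miTerm(onlyT2, totalDocs, 1 - pT1, pT2);
    }

    // One summand: P(joint) * log(P(joint) / (P(a) * P(b))), taken as 0 when any probability is 0.
    private static double miTerm(long jointCount, int totalDocs, double pA, double pB) {
        double pJoint = (double) jointCount / totalDocs;
        if (pJoint == 0.0 || pA == 0.0 || pB == 0.0) {
            return 0.0;
        }
        return pJoint * Math.log(pJoint / (pA * pB));
    }
}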
From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java
License:Apache License
@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc());
    assertEquals(ids.length, reader.numDocs());
    reader.close();
}
From source file:io.datalayer.lucene.read.LuceneReaderTest.java
License:Apache License
@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(keywords.length, reader.maxDoc());
    assertEquals(keywords.length, reader.numDocs());
    reader.close();
}
From source file:ir.project.TFIDFMatrix.java
private void createTermMap() {
    try {
        IndexReader reader = DirectoryReader.open(this.index);
        this.termMap = new HashMap<>(); // Map used to identify position in matrix
        this.numDocs = reader.maxDoc();
        int count = 0;

        // Set up the termMap
        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");
            if (vector == null) {
                System.err.println("Vector is null!");
                continue;
            }
            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());
                if (!termMap.containsKey(it.term().utf8ToString())) {
                    termMap.put(it.term().utf8ToString(), count);
                    count += 1;
                }
            }
        }
        this.numTerms = count;
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java
License:Open Source License
/**
 * Delete all index entries which do not start with the given prefix.
 *
 * @param prefix
 */
public void deleteAll(String prefix) {
    IndexReader mainReader = null;
    try {
        mainReader = getReader();
        for (int doc = 0; doc < mainReader.maxDoc(); doc++) {
            if (!mainReader.isDeleted(doc)) {
                Document document = mainReader.document(doc);
                String[] ids = document.getValues("ID");
                if ((prefix == null) || nonStartwWith(ids, prefix)) {
                    deletions.add(ids[ids.length - 1]);
                }
            }
        }
    } catch (IOException e) {
        // If anything goes wrong we try and do a roll back
        throw new LuceneIndexException("Failed to delete all entries from the index", e);
    } finally {
        if (mainReader != null) {
            try {
                mainReader.close();
            } catch (IOException e) {
                throw new LuceneIndexException("Failed to close main reader", e);
            }
        }
    }
}