List of usage examples for org.apache.lucene.index IndexReader document

public final Document document(int docID) throws IOException

Returns the stored fields of the nth Document in this index.
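A minimal sketch of the call (Lucene 4.x-era API, matching the examples below; the index path and the "title" stored field are hypothetical, and docID must be a valid internal document ID, i.e. 0 <= docID < reader.maxDoc()):

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

static void printStoredTitle(String indexPath, int docID) throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    if (docID >= 0 && docID < reader.maxDoc()) {
        Document doc = reader.document(docID); // loads the document's stored fields
        System.out.println(doc.get("title")); // "title" is a hypothetical stored field
    }
    reader.close();
}

Note that document(int) only returns stored fields; fields indexed without Field.Store.YES cannot be recovered this way.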
From source file:hr.fer.tel.rovkp.homework03.task01.JokesCollection.java
private static float[][] createMatrix(Map<Integer, String> entries, StandardAnalyzer analyzer, Directory index)
        throws NumberFormatException, IOException, ParseException {
    int hitsNo = entries.size();
    float[][] similarityMatrix = new float[hitsNo][hitsNo];
    int i = 0;
    // Open the reader and searcher once, outside the loop (the original
    // opened a new, never-closed reader on every iteration).
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    for (Entry<Integer, String> entry : entries.entrySet()) {
        Query query = new QueryParser("text", analyzer).parse(QueryParser.escape(entry.getValue()));
        TopDocs docs = searcher.search(query, hitsNo);
        for (ScoreDoc hit : docs.scoreDocs) {
            Document document = reader.document(hit.doc);
            int docId = Integer.parseInt(document.get("ID"));
            int docIndex = docId - 1;
            similarityMatrix[i][docIndex] = hit.score;
        }
        i++;
    }
    reader.close();
    return similarityMatrix;
}
From source file:indexer.Cell.java
List<DocVector> getVectors(IndexReader reader, Terms terms, int numDimensions) throws Exception {
    List<DocVector> containedPoints = new ArrayList<>();
    TermsEnum termsEnum = terms.iterator();
    // Seek to the specific term whose text is this cell's name.
    boolean found = termsEnum.seekExact(new BytesRef(this.toString()));
    if (found) {
        // Enumerate through the documents containing the term.
        DocsEnum docsEnum = termsEnum.docs(null, null);
        int docid;
        while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            Document d = reader.document(docid);
            DocVector dvec = new DocVector(d, numDimensions, DocVector.numIntervals, null);
            containedPoints.add(dvec);
        }
    }
    return containedPoints;
}
From source file:indexer.IndexHtmlToText.java
static String getHTMLFromDocId(String indexDirPath, String docId) throws Exception {
    File indexDir = new File(indexDirPath);
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir));
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new TermQuery(new Term(TrecDocIndexer.FIELD_ID, docId));
    TopScoreDocCollector collector = TopScoreDocCollector.create(1, true);
    searcher.search(query, collector);
    TopDocs topDocs = collector.topDocs();
    ScoreDoc sd = topDocs.scoreDocs[0]; // assumes docId actually exists in the index
    Document doc = reader.document(sd.doc);
    String htmlDecompressed = decompress(doc.getBinaryValue(WTDOC_FIELD_HTML).bytes);
    System.out.println(htmlDecompressed);
    reader.close();
    return htmlDecompressed;
}
From source file:indexer.IndexSplitter.java
public void split() throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir));
    // Doc IDs run from 0 to maxDoc() - 1; this assumes an index without
    // deletions (the original looped to numDocs(), which is smaller than
    // maxDoc() when deleted documents are present). See the deletion-safe
    // sketch below.
    final int maxDoc = reader.maxDoc();
    IndexWriter pWriter; // pointer variable
    for (int i = 0; i < maxDoc; i++) {
        Document d = reader.document(i);
        pWriter = d.get(FIELD_CODEMIXED).equals("1") ? mixedIndexWriter : pureIndexWriter;
        pWriter.addDocument(d);
    }
    reader.close();
    pureIndexWriter.close();
    mixedIndexWriter.close();
}
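This example, like several others on this page, assumes every ID from 0 to maxDoc() - 1 resolves to a live document. When deletions may be present, a liveDocs check is needed before calling document(int). A minimal sketch (Lucene 4.x API; the per-document processing is left as a comment):

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.Bits;

static void forEachLiveDocument(IndexReader reader) throws Exception {
    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
            continue; // skip deleted documents
        }
        Document d = reader.document(i);
        // ... process d ...
    }
}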
From source file:indexer.SplitCells.java
public static SplitCells readFromIndex(IndexReader reader) throws Exception {
    SplitCells splitCells = new SplitCells();
    // The last document in the index holds the split information.
    int numDocs = reader.maxDoc();
    Document splitCellInfoDoc = reader.document(numDocs - 1);
    String splitCellsInfo = splitCellInfoDoc.get(OptimizedRealValuedVecIndexer.SPLIT_CELLS_FIELD);
    if (splitCellsInfo == null)
        return null;
    String[] tokens = splitCellsInfo.split("\\s+");
    for (String token : tokens) {
        Cell cell = new Cell(token);
        splitCells.addSplit(cell);
    }
    return splitCells;
}
From source file:indexing.eval.Eval.java
License:Open Source License
public static void printReviewIds() {
    try {
        IndexReader ir = IndexReader.open(new SimpleFSDirectory(new File(Paths.luceneIndex)), true);
        String field = "reviewid";
        int ndocs = ir.maxDoc();
        for (int i = 0; i < ndocs; i++) {
            Document doc = ir.document(i);
            System.out.println(doc.get(field));
        }
        ir.close();
    } catch (Exception e) {
        // Report the failure instead of silently swallowing it
        // (the original left an empty TODO handler here).
        e.printStackTrace();
    }
}
From source file:info.boytsov.lucene.CheckSort.java
License:Open Source License
public static void main(String[] args) {
    if (args.length != 2) {
        printUsage();
        System.exit(1);
    }
    int dir = 1;
    String srcDirName = args[0];
    System.out.println("Source dir: " + srcDirName);
    if (args[1].equals("forward"))
        dir = 1;
    else if (args[1].equals("backward"))
        dir = -1;
    else {
        System.err.println("Invalid direction: " + args[1]);
        printUsage();
        System.exit(1);
    }
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));
        int docQty = reader.maxDoc();
        int sortedQty = 0;
        double sortedStreak = 0;
        int sortedStreakQty = 0;
        URL2DocID remap[] = new URL2DocID[docQty];
        String prevURL = "";
        int prevSorted = 0;
        for (int docID = 0; docID < docQty; ++docID) {
            Document doc = reader.document(docID);
            String url = doc.get("url");
            if (dir > 0) {
                remap[docID] = new URL2DocID(url, docID);
            } else {
                remap[docQty - 1 - docID] = new URL2DocID(url, docID);
            }
            if (docID % 100000 == 0) {
                System.out.println("Collected " + (docID + 1) + " URLs, sorted so far, direct " + sortedQty
                        + " avg. sorted streak QTY: " + (sortedStreak / sortedStreakQty)
                        + " sortedStreakQty: " + sortedStreakQty);
            }
            // Assuming increasing order
            if (dir * url.compareTo(prevURL) >= 0) {
                ++sortedQty;
            } else {
                sortedStreak += docID - prevSorted - 1;
                sortedStreakQty++;
                prevSorted = docID;
            }
            prevURL = url;
        }
        System.out.println("Collected " + docQty + " URLs, sorted so far, direct " + sortedQty
                + " avg. sorted streak QTY: " + (sortedStreak / sortedStreakQty)
                + " sortedStreakQty: " + sortedStreakQty);
        double invQty = Inversions.count(remap);
        // n*(n-1)/2 is the maximum number of inversions for n elements
        // (the original divided by (docQty + 1) here, contradicting the message).
        System.out.println("A total number of inversions: " + invQty + " relative to n*(n-1)/2: "
                + (invQty * 2.0 / docQty / (docQty - 1.0)));
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:info.boytsov.lucene.DumpIndex.java
License:Open Source License
public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;
    String srcDirName = args[1];
    String dstFileName = args[2];
    int minTermFreq = MIN_TERM_FREQ;
    if (args.length >= 4)
        minTermFreq = Integer.parseInt(args[3]);
    int maxTermQty = MAX_TERM_QTY;
    if (args.length >= 5)
        maxTermQty = Integer.parseInt(args[4]);
    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));
        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];
        Arrays.fill(sortTable, -1);
        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");
            URL2DocID remap[] = new URL2DocID[docQty];
            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }
            Arrays.sort(remap);
            System.out.println("Collected and sorted all URLs for re-sorting, filling out the sort table.");
            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
            }
            System.out.println("Sort table is filled up!");
            for (int i = 0; i < docQty; ++i)
                remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory
            /*
             * Paranoid check: did we change all the -1 entries to non-negative numbers?
             * It turned out this wasn't that paranoid: with repeating URLs,
             * some elements in sortTable may remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");
            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // identity transformation
            }
        }
        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);
        File dstFile = new File(dstFileName);
        FileOutputStream outData = new FileOutputStream(dstFile);
        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();
        long totalWritten = 0;
        long totalInts = 0;
        int termId = 0;
        int batchWriteSize = 1024 * 1024 * 16;
        /*
         * We try to re-use as many objects as possible to reduce
         * the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;
        ByteBuffer buffer = null;
        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();
            TermDesc ts = e.getKey();
            DocsEnum docIter = dict.getDocIterator(ts.text);
            int postQty = ts.freq;
            int qty = 0, prevDocID = -1;
            /*
             * If posting lists appear in order of descending term frequency,
             * this will actually be only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty)
                tmpDocId = new int[postQty];
            bufferArray.add(postQty);
            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= currDocID) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }
            /*
             * Now re-sort the doc IDs and write them. REMEMBER that tmpDocId
             * is a buffer that may contain MORE than postQty elements;
             * some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);
            for (int i = 0; i < postQty; ++i)
                bufferArray.add(tmpDocId[i]);
            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;
            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty
                        + " overall written: " + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                        + totalInts / 1e6 + " Millions postings");
            }
            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer; let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }
            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size: " + totalWritten / 1024.0 / 1024.0 / 1024.0
                + " Gbs, " + totalInts / 1e6 + " Millions postings");
        // WriteArray may produce a new buffer; let's reuse it
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java
License:Apache License
private PageRankMatrixH buildMatrixH(IndexReader idxR) throws IOException {
    // Only consider URLs whose content was fetched and parsed.
    List<Integer> allDocs = getProcessedDocs(idxR);
    PageRankMatrixH docMatrix = new PageRankMatrixH(allDocs.size());
    for (int i = 0, n = allDocs.size(); i < n; i++) {
        for (int j = 0, k = allDocs.size(); j < k; j++) {
            double similarity = 0.0d;
            // Look up the stored document by its Lucene doc ID (the original
            // passed the loop counter i directly, which is only correct when
            // allDocs happens to contain every ID from 0 to n-1).
            Document docX = idxR.document(allDocs.get(i));
            String xURL = docX.get("url");
            if (i == j) {
                // Avoid shameless self-promotion ;-)
                docMatrix.addLink(xURL, xURL, similarity);
            } else {
                TextDocumentTerms xDocumentTerms = new TextDocumentTerms(docX.get("content"));
                Document docY = idxR.document(allDocs.get(j));
                TextDocumentTerms yDocumentTerms = new TextDocumentTerms(docY.get("content"));
                similarity = getImportance(xDocumentTerms, yDocumentTerms);
                // Add a link from docX to docY.
                String yURL = docY.get("url");
                docMatrix.addLink(xURL, yURL, similarity);
            }
        }
    }
    docMatrix.calculate();
    return docMatrix;
}
From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java
License:Apache License
private List<Integer> getProcessedDocs(IndexReader idxR) throws IOException {
    List<Integer> docs = new ArrayList<Integer>();
    // hasDeletions() applies to the whole index, so check it once
    // rather than on every iteration as the original did.
    if (!idxR.hasDeletions()) {
        for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
            Document doc = idxR.document(i);
            if (eligibleForDocRank(doc.get("doctype"))) {
                docs.add(i);
            }
        }
    }
    return docs;
}