Example usage for org.apache.lucene.index IndexReader document

List of usage examples for org.apache.lucene.index IndexReader document

Introduction

On this page you can find example usage of org.apache.lucene.index IndexReader document.

Prototype

public final Document document(int docID) throws IOException

Source Link

Document

Returns the stored fields of the nth Document in this index.
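As a quick reference, here is a minimal sketch that opens a reader and loads the stored fields of every document by its internal ID (Lucene 4.x-style API, matching the examples below; the index path and the "title" field are assumptions for illustration):

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index directory (the path is hypothetical).
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/my-index")));
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            // Load the stored fields of the document with this internal ID.
            Document doc = reader.document(docID);
            System.out.println(doc.get("title")); // "title" is an assumed stored field
        }
        reader.close();
    }
}

Note that this sketch does not skip deleted documents; several of the examples below show how real code handles that.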

Usage

From source file:hr.fer.tel.rovkp.homework03.task01.JokesCollection.java

private static float[][] createMatrix(Map<Integer, String> entries, StandardAnalyzer analyzer, Directory index)
        throws NumberFormatException, IOException, ParseException {
    int hitsNo = entries.size();
    float[][] similarityMatrix = new float[hitsNo][hitsNo];

    int i = 0;
    for (Entry<Integer, String> entry : entries.entrySet()) {
        Query query = new QueryParser("text", analyzer).parse(QueryParser.escape(entry.getValue()));
        IndexReader reader = DirectoryReader.open(index);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(query, hitsNo);
        for (ScoreDoc hit : docs.scoreDocs) {
            Document document = reader.document(hit.doc);
            int docId = Integer.parseInt(document.get("ID"));
            int docIndex = docId - 1;
            similarityMatrix[i][docIndex] = hit.score;
        }
        i++;
    }
    return similarityMatrix;
}

From source file:indexer.Cell.java

List<DocVector> getVectors(IndexReader reader, Terms terms, int numDimensions) throws Exception {
    List<DocVector> containedPoints = new ArrayList<>();

    TermsEnum termsEnum = terms.iterator();
    // seek to a specific term
    boolean found = termsEnum.seekExact(new BytesRef(this.toString()));

    if (found) {
        // enumerate through documents
        DocsEnum docsEnum = termsEnum.docs(null, null);
        int docid;
        while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            Document d = reader.document(docid);
            DocVector dvec = new DocVector(d, numDimensions, DocVector.numIntervals, null);
            containedPoints.add(dvec);
        }
    }

    return containedPoints;
}

From source file:indexer.IndexHtmlToText.java

static String getHTMLFromDocId(String indexDirPath, String docId) throws Exception {
    IndexReader reader;
    IndexSearcher searcher;

    File indexDir = new File(indexDirPath);
    reader = DirectoryReader.open(FSDirectory.open(indexDir));
    searcher = new IndexSearcher(reader);

    TopScoreDocCollector collector;
    TopDocs topDocs;

    Query query = new TermQuery(new Term(TrecDocIndexer.FIELD_ID, docId));
    collector = TopScoreDocCollector.create(1, true);
    searcher.search(query, collector);
    topDocs = collector.topDocs();
    ScoreDoc sd = topDocs.scoreDocs[0];

    Document doc = reader.document(sd.doc);
    String htmlDecompressed = decompress(doc.getBinaryValue(WTDOC_FIELD_HTML).bytes);
    System.out.println(htmlDecompressed);

    reader.close();
    return htmlDecompressed;
}

From source file:indexer.IndexSplitter.java

public void split() throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir));
    final int numDocs = reader.numDocs();
    IndexWriter pWriter; // pointer variable

    for (int i = 0; i < numDocs; i++) {
        Document d = reader.document(i);
        pWriter = d.get(FIELD_CODEMIXED).equals("1") ? mixedIndexWriter : pureIndexWriter;
        pWriter.addDocument(d);
    }

    reader.close();
    pureIndexWriter.close();
    mixedIndexWriter.close();
}

From source file:indexer.SplitCells.java

public static SplitCells readFromIndex(IndexReader reader) throws Exception {
    SplitCells splitCells = new SplitCells();

    // The last document contains the split information.
    int numDocs = reader.maxDoc();
    Document splitCellInfoDoc = reader.document(numDocs - 1);

    String splitCellsInfo = splitCellInfoDoc.get(OptimizedRealValuedVecIndexer.SPLIT_CELLS_FIELD);
    if (splitCellsInfo == null)
        return null;

    String[] tokens = splitCellsInfo.split("\\s+");
    for (String token : tokens) {
        Cell cell = new Cell(token);
        splitCells.addSplit(cell);
    }

    return splitCells;
}

From source file:indexing.eval.Eval.java

License:Open Source License

public static void printReviewIds() {
    try {
        IndexReader ir = IndexReader.open(new SimpleFSDirectory(new File(Paths.luceneIndex)), true);
        String field = "reviewid";

        int ndocs = ir.maxDoc();
        for (int i = 0; i < ndocs; i++) {
            Document doc = ir.document(i);
            System.out.println(doc.get(field));
        }

    } catch (Exception e) {
        // TODO: handle exception
    }

}

From source file:info.boytsov.lucene.CheckSort.java

License:Open Source License

public static void main(String[] args) {
    if (args.length != 2) {
        printUsage();
        System.exit(1);
    }
    int dir = 1;

    String srcDirName = args[0];
    System.out.println("Source dir: " + srcDirName);
    if (args[1].equals("forward"))
        dir = 1;
    else if (args[1].equals("backward"))
        dir = -1;
    else {
        System.err.println("Invalid direction: " + args[1]);
        printUsage();
        System.exit(1);
    }

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];

        Arrays.fill(sortTable, -1);

        int sortedQty = 0;

        double sortedStreak = 0;
        int sortedStreakQty = 0;

        URL2DocID remap[] = new URL2DocID[docQty];

        String prevURL = "";

        int prevSorted = 0;

        for (int docID = 0; docID < docQty; ++docID) {
            Document doc = reader.document(docID);
            String url = doc.get("url");
            if (dir > 0) {
                remap[docID] = new URL2DocID(url, docID);
            } else {
                remap[docQty - 1 - docID] = new URL2DocID(url, docID);
            }
            if (docID % 100000 == 0) {
                System.out.println("Collected " + (docID + 1) + " URLs, sorted so far, direct " + sortedQty
                        + " avg. sorted streak QTY: " + (sortedStreak / sortedStreakQty) + " sortedStreakQty: "
                        + sortedStreakQty);
            }
            // Assuming the increasing order
            if (dir * url.compareTo(prevURL) >= 0) {
                ++sortedQty;
            } else {
                sortedStreak += docID - prevSorted - 1;
                sortedStreakQty++;

                prevSorted = docID;
            }
            prevURL = url;
        }

        System.out.println("Collected " + docQty + " URLs, sorted so far, direct " + sortedQty
                + " avg. sorted streak QTY: " + (sortedStreak / sortedStreakQty) + " sortedStreakQty: "
                + sortedStreakQty);

        double invQty = Inversions.count(remap);
        System.out.println("A total number of inversions: " + invQty + " relative to n*(n-1)/2: "
                + (invQty * 2.0 / docQty / (docQty + 1)));

    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:info.boytsov.lucene.DumpIndex.java

License:Open Source License

public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;

    String srcDirName = args[1];
    String dstFileName = args[2];

    int minTermFreq = MIN_TERM_FREQ;

    if (args.length >= 4)
        minTermFreq = Integer.parseInt(args[3]);

    int maxTermQty = MAX_TERM_QTY;

    if (args.length >= 5)
        maxTermQty = Integer.parseInt(args[4]);

    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];

        Arrays.fill(sortTable, -1);

        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");

            URL2DocID remap[] = new URL2DocID[docQty];

            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }

            Arrays.sort(remap);

            System.out.println("Collected and sorted all URLs for resoring, " + "filling out the sort table.");

            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
                //System.out.println(remap[newDocID].url);
            }

            System.out.println("Sort table is filled up!");

            for (int i = 0; i < docQty; ++i)
                remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory

            /*
             *  Paranoid check: did we change all the -1 entries to non-negative numbers?
             *  It turned out this wasn't so paranoid: if there are repeated URLs,
             *  some elements in sortTable remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");

            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // Identity transformation
            }
        }

        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

        File dstFile = new File(dstFileName);

        FileOutputStream outData = new FileOutputStream(dstFile);

        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

        long totalWritten = 0;
        long totalInts = 0;

        int termId = 0;

        int batchWriteSize = 1024 * 1024 * 16;

        /*
         *  We are trying to re-use as many objects as possible,
         *  in order to reduce the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;

        ByteBuffer buffer = null;

        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();

            TermDesc ts = e.getKey();
            DocsEnum docIter = dict.getDocIterator(ts.text);

            int postQty = ts.freq;

            int qty = 0, prevDocID = -1;

            /*
             * If posting lists appear in the order of descending term frequencies,
             * this will actually require only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty)
                tmpDocId = new int[postQty];

            bufferArray.add(postQty);

            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= docIter.docID()) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }
            /*
             *  Now let's re-sort docIds and write them.
             *  REMEMBER that tmpDocId is a buffer that may contain
             *  MORE than postQty elements!!!
             *  Some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);

            for (int i = 0; i < postQty; ++i)
                bufferArray.add(tmpDocId[i]);

            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;

            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty + " overall written: "
                        + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6
                        + " Millions postings");
            }

            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer, let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }

            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size size : "
                + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6 + " Millions postings");

        // WriteArray may produce a new buffer, let's reuse it      
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java

License:Apache License

private PageRankMatrixH buildMatrixH(IndexReader idxR) throws IOException {

    // only consider URLs with fetched and parsed content
    List<Integer> allDocs = getProcessedDocs(idxR);

    PageRankMatrixH docMatrix = new PageRankMatrixH(allDocs.size());

    for (int i = 0, n = allDocs.size(); i < n; i++) {

        for (int j = 0, k = allDocs.size(); j < k; j++) {

            double similarity = 0.0d;

            Document docX = idxR.document(i);
            String xURL = docX.get("url");

            if (i == j) {

                // Avoid shameless self-promotion ;-)
                docMatrix.addLink(xURL, xURL, similarity);

            } else {

                TextDocumentTerms xDocumentTerms = new TextDocumentTerms(docX.get("content"));

                Document docY = idxR.document(j);
                TextDocumentTerms yDocumentTerms = new TextDocumentTerms(docY.get("content"));

                similarity = getImportance(xDocumentTerms, yDocumentTerms);

                // add link from docX to docY
                String yURL = docY.get("url");

                docMatrix.addLink(xURL, yURL, similarity);
            }
        }
    }

    docMatrix.calculate();

    return docMatrix;
}

From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java

License:Apache License

private List<Integer> getProcessedDocs(IndexReader idxR) throws IOException {
    List<Integer> docs = new ArrayList<Integer>();
    for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
        if (idxR.hasDeletions() == false) {
            Document doc = idxR.document(i);
            if (eligibleForDocRank(doc.get("doctype"))) {
                docs.add(i);
            }
        }
    }
    return docs;

}