Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

This page lists example usages of org.apache.lucene.index.IndexReader.numDocs().

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index, not counting deletions.
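
A minimal sketch of calling numDocs() directly. The index path is a placeholder, and the pre-4.0 IndexReader.open(String) API is assumed here, matching the examples below:

import org.apache.lucene.index.IndexReader;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at an existing Lucene index directory.
        IndexReader reader = IndexReader.open("/path/to/index");
        try {
            // numDocs() counts live documents only; maxDoc() also counts deleted slots.
            System.out.println("Documents in index: " + reader.numDocs());
        } finally {
            reader.close();
        }
    }
}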

Usage

From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngineTest.java

License:Open Source License
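
This test ranks all documents by MCS-based graph similarity, iterating from 0 to ir.numDocs(), and compares that ranking against a Lucene path-query search by plotting precision and recall.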

private void testQuery(IndexReader ir, Graph query, IndexSearcher is) throws IOException, ParseException {
    // create results from mcs:
    LinkedList<ResultHolder> resultsMcs = new LinkedList<ResultHolder>();
    for (int j = 0; j < ir.numDocs(); j++) {
        Graph model = new Graph(ir.document(j).getValues("graph")[0]);
        float mcsSimilarity = query.getMcsSimilarity(model);
        resultsMcs.add(new ResultHolder(j, model.toString(), mcsSimilarity));
    }
    Collections.sort(resultsMcs);
    //            for (Iterator<ResultHolder> iterator = resultsMcs.iterator(); iterator.hasNext();) {
    //                ResultHolder r = iterator.next();
    //                System.out.println(r.getDocumentNumber() + ": " + r.getSimilarity());
    //            }

    // create results from search:

    // set to another similarity if necessary:
    is.setSimilarity(new TermFrequencySimilarity());
    //        is.setSimilarity(new SimpleTfIdfSimilarity());

    LucenePathIndexRetrievalEngine engine = new LucenePathIndexRetrievalEngine(50);
    String gQuery = LucenePathIndexRetrievalEngine.createLucenePathQuery(query);
    //        System.out.println(query);
    QueryParser qParse = new QueryParser("paths", new WhitespaceAnalyzer());
    Query q = qParse.parse(gQuery);
    Hits hits = is.search(q);
    LinkedList<ResultHolder> resultsSearch = new LinkedList<ResultHolder>();
    for (int i = 0; i < hits.length(); i++) {
        String graph = hits.doc(i).getValues("graph")[0];
        int docID = -1;
        for (int j = 0; j < ir.numDocs(); j++) {
            Graph model = new Graph(ir.document(j).getValues("graph")[0]);
            if (model.toString().equals(graph))
                docID = j;
        }
        resultsSearch.add(new ResultHolder(docID, graph, hits.score(i)));
    }
    Collections.sort(resultsSearch);
    printPrecisionRecallPlot(resultsMcs, resultsSearch);
}

From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngineTest.java

License:Open Source License
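
Here numDocs() bounds two loops: one builds a node-id-to-label map from a semantic index, the other scores every document against the query graph by MCS similarity before comparing with a full-text search on the node labels.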

private void testDirectQuery(IndexReader ir, Graph query, IndexSearcher is) throws IOException, ParseException {
    IndexReader reader = IndexReader.open("C:\\Java\\JavaProjects\\CaliphEmir\\testdata\\idx_semantic");
    IndexSearcher searcher = new IndexSearcher("C:\\Java\\JavaProjects\\CaliphEmir\\testdata\\idx_fulltext");

    HashMap<Integer, String> node2label = new HashMap<Integer, String>();
    for (int j = 0; j < reader.numDocs(); j++) {
        String id = reader.document(j).getValues("id")[0];
        String label = reader.document(j).getValues("label")[0];
        node2label.put(Integer.parseInt(id), label);
    }
    // create results from mcs:
    LinkedList<ResultHolder> resultsMcs = new LinkedList<ResultHolder>();
    for (int j = 0; j < ir.numDocs(); j++) {
        Graph model = new Graph(ir.document(j).getValues("graph")[0]);
        float mcsSimilarity = query.getMcsSimilarity(model);
        String[] file = ir.document(j).getValues("file");
        for (int i = 0; i < file.length; i++) {
            String s = file[i];
            resultsMcs.add(new ResultHolder(mcsSimilarity, s));
        }
    }
    Collections.sort(resultsMcs);
    //            for (Iterator<ResultHolder> iterator = resultsMcs.iterator(); iterator.hasNext();) {
    //                ResultHolder r = iterator.next();
    //                System.out.println(r.getDocumentNumber() + ": " + r.getSimilarity());
    //            }

    // create results from search:
    StringBuilder qBuilder = new StringBuilder(64);
    for (Iterator<Node> iterator = query.getNodes().iterator(); iterator.hasNext();) {
        Node node = iterator.next();
        //            qBuilder.append("\"");
        qBuilder.append(node2label.get(node.getNodeID()));
        qBuilder.append(" ");
        //            qBuilder.append("\" ");
    }
    //        System.out.println(query);
    QueryParser qParse = new QueryParser("all", new WhitespaceAnalyzer());
    Query q = qParse.parse(qBuilder.toString());
    Hits hits = searcher.search(q);
    LinkedList<ResultHolder> resultsSearch = new LinkedList<ResultHolder>();
    for (int i = 0; i < hits.length(); i++) {
        String graph = hits.doc(i).getValues("file")[0];
        //            int docID = -1;
        //            for (int j = 0; j < ir.numDocs(); j++) {
        //                Graph model = new Graph(ir.document(j).getValues("graph")[0]);
        //                if (model.toString().equals(graph)) docID = j;
        //            }
        resultsSearch.add(new ResultHolder(hits.score(i), graph));
    }
    Collections.sort(resultsSearch);
    printPrecisionRecallPlotFileBased(resultsMcs, resultsSearch);
}

From source file:at.lux.retrieval.vectorspace.ElementTextVectorSimilarityTest.java

License:Open Source License
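
The loop below walks the whole index via numDocs() and feeds every referenced XML file into the similarity measure's corpus, which the TfIdf and BM25 weightings then draw on.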

public void testSimilarity() throws IOException, JDOMException {
    ElementTextVectorSimilarity sim = new ElementTextVectorSimilarity();
    double distance = sim.getSimilarity(d1, d1);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d1, d2);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d2, d1);
    System.out.println("distance = " + distance);

    IndexReader reader = IndexReader.open("testdata/idx_paths");

    System.out.println("Loading documents and adding them to corpus ...");
    for (int i = 0; i < reader.numDocs(); i++) {
        //            Graph g_idx = new Graph(reader.document(i).getField("graph").stringValue());
        Field[] files = reader.document(i).getFields("file");
        for (Field file : files) {
            Document d = saxBuilder.build(file.stringValue());
            sim.addToCorpus(d);
        }
    }

    System.out.println("");

    distance = sim.getSimilarity(d1, d1, ElementTextVectorSimilarity.WeightType.TfIdf);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d1, d2, ElementTextVectorSimilarity.WeightType.TfIdf);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d2, d1, ElementTextVectorSimilarity.WeightType.TfIdf);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d2, d2, ElementTextVectorSimilarity.WeightType.TfIdf);
    System.out.println("distance = " + distance);

    System.out.println("");

    distance = sim.getSimilarity(d1, d1, ElementTextVectorSimilarity.WeightType.BM25);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d1, d2, ElementTextVectorSimilarity.WeightType.BM25);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d2, d1, ElementTextVectorSimilarity.WeightType.BM25);
    System.out.println("distance = " + distance);
    distance = sim.getSimilarity(d2, d2, ElementTextVectorSimilarity.WeightType.BM25);
    System.out.println("distance = " + distance);

}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.termExtract.LuceneTopTermExtract.java

License:Apache License
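
numDocs() supplies the collection size N for a classic tf-idf score, with tf = 1 + log(f) and idf = 1 + log(N / df).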

private PriorityQueue<TermScorePair> getTermScores(Map<String, Double> termFreqMap,
        Map<String, Integer> docFreqMap, IndexReader reader) {
    int numDocs = reader.numDocs();
    PriorityQueue<TermScorePair> pq = new PriorityQueue<>(termFreqMap.size());

    for (String term : termFreqMap.keySet()) {
        double tf = 1 + Math.log(termFreqMap.get(term));
        int docFreq = docFreqMap.get(term);
        if (docFreq > 0) {
            double idf = 1 + Math.log((double) numDocs / docFreq);
            double score = tf * idf;
            pq.add(new TermScorePair(term, score));
        }
    }
    return pq;
}

From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.LuceneTopTermExtract.java

License:Apache License
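
A variant of the previous extractor: numDocs() again provides N for the idf component, while the tf formula and the debug output differ (the max and avg accumulators are computed but not used in the score).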

private PriorityQueue<TermScorePair> getTermScores(Map<String, Double> termFreqMap,
        Map<String, Integer> docFreqMap, IndexReader reader) {
    int numDocs = reader.numDocs();
    PriorityQueue<TermScorePair> pq = new PriorityQueue<>(termFreqMap.size());

    double max = 0;
    double avg = 0;
    for (String term : termFreqMap.keySet()) {
        max = (max > termFreqMap.get(term) ? max : termFreqMap.get(term));
        avg += termFreqMap.get(term);
    }
    for (String term : termFreqMap.keySet()) {
        double tf = Math.log(termFreqMap.size() / termFreqMap.get(term));
        int docFreq = docFreqMap.get(term);
        if (docFreq > 0) {
            double idf = 1 + Math.log((double) numDocs / docFreqMap.get(term));
            double score = tf * idf;
            pq.add(new TermScorePair(term, score));
            System.out.println(term + ";" + termFreqMap.get(term) + ";" + docFreqMap.get(term));
        }
    }
    return pq;
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License
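
numDocs() sizes an array that maps Lucene's internal document numbers to the application-level DOC_ID stored in each document.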

protected int[] getDocumentIds(IndexReader reader) {
    int[] documentIds = new int[reader.numDocs()];
    for (int i = 0; i < documentIds.length; i++) {
        try {
            Document document = reader.document(i);
            documentIds[i] = Integer.parseInt(document.get(DOC_ID));
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }
    return documentIds;
}

From source file:BlockBuilding.AbstractExtendedSortedNeighborhoodBlocking.java

License:Open Source License
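
After sliding a window over the sorted blocking keys, the method records the entity count of the single dataset as d1Reader.numDocs().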

@Override
protected void parseIndex() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    final Set<String> blockingKeysSet = getTerms(d1Reader);
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    //slide window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIds = Utilities.getDocumentIds(d1Reader);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            entityIds.addAll(getTermEntities(documentIds, d1Reader, sortedTerms[i + j]));
        }

        if (1 < entityIds.size()) {
            int[] idsArray = Converter.convertCollectionToArray(entityIds);
            UnilateralBlock uBlock = new UnilateralBlock(idsArray);
            blocks.add(uBlock);
        }
    }

    noOfEntities = new double[1];
    noOfEntities[0] = d1Reader.numDocs();

    Utilities.closeReader(d1Reader);
}

From source file:BlockBuilding.AbstractExtendedSortedNeighborhoodBlocking.java

License:Open Source License
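
The bilateral variant does the same over two indices, taking numDocs() of each reader as the respective entity count.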

@Override
protected void parseIndices() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);

    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    //slide window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = d1Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, sortedTerms[i + j]));
                }

                docFrequency = d2Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }

        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }

    noOfEntities = new double[2];
    noOfEntities[0] = d1Reader.numDocs();
    noOfEntities[1] = d2Reader.numDocs();

    Utilities.closeReader(d1Reader);
    Utilities.closeReader(d2Reader);
}

From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License:Open Source License
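
Here d1Reader.numDocs() serves as an offset (datasetLimit), so entity ids from the second dataset do not collide with those from the first.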

protected Integer[] getSortedEntities(String[] sortedTerms, IndexReader d1Reader, IndexReader d2Reader) {
    int datasetLimit = d1Reader.numDocs();
    final List<Integer> sortedEntityIds = new ArrayList<>();

    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (String blockingKey : sortedTerms) {
        List<Integer> sortedIds = new ArrayList<>();
        sortedIds.addAll(getTermEntities(documentIdsD1, d1Reader, blockingKey));

        getTermEntities(documentIdsD2, d2Reader, blockingKey).stream().forEach((entityId) -> {
            sortedIds.add(datasetLimit + entityId);
        });

        Collections.shuffle(sortedIds);
        sortedEntityIds.addAll(sortedIds);
    }

    return sortedEntityIds.toArray(new Integer[sortedEntityIds.size()]);
}

From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License:Open Source License
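
As in the extended variant above, numDocs() yields the entity count once the window has been slid over the sorted entity ids.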

protected void parseIndex() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[sourceId]);

    final Set<String> blockingKeysSet = getTerms(d1Reader);
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    Integer[] allEntityIds = getSortedEntities(sortedTerms, d1Reader);

    //slide window over the sorted list of entity ids
    int upperLimit = allEntityIds.length - windowSize;
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            entityIds.add(allEntityIds[i + j]);
        }

        if (1 < entityIds.size()) {
            int[] idsArray = Converter.convertCollectionToArray(entityIds);
            UnilateralBlock uBlock = new UnilateralBlock(idsArray);
            blocks.add(uBlock);
        }
    }

    noOfEntities = new double[1];
    noOfEntities[0] = d1Reader.numDocs();

    Utilities.closeReader(d1Reader);
}