List of usage examples for org.apache.lucene.index IndexReader numDocs
public abstract int numDocs();
From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngineTest.java
License:Open Source License
private void testQuery(IndexReader ir, Graph query, IndexSearcher is) throws IOException, ParseException { // create results from mcs: LinkedList<ResultHolder> resultsMcs = new LinkedList<ResultHolder>(); for (int j = 0; j < ir.numDocs(); j++) { Graph model = new Graph(ir.document(j).getValues("graph")[0]); float mcsSimilarity = query.getMcsSimilarity(model); resultsMcs.add(new ResultHolder(j, model.toString(), mcsSimilarity)); }/*from ww w . jav a 2 s. co m*/ Collections.sort(resultsMcs); // for (Iterator<ResultHolder> iterator = resultsMcs.iterator(); iterator.hasNext();) { // ResultHolder r = iterator.next(); // System.out.println(r.getDocumentNumber() + ": " + r.getSimilarity()); // } // create results from search: // set to another similarity if necessary: is.setSimilarity(new TermFrequencySimilarity()); // is.setSimilarity(new SimpleTfIdfSimilarity()); LucenePathIndexRetrievalEngine engine = new LucenePathIndexRetrievalEngine(50); String gQuery = LucenePathIndexRetrievalEngine.createLucenePathQuery(query); // System.out.println(query); QueryParser qParse = new QueryParser("paths", new WhitespaceAnalyzer()); Query q = qParse.parse(gQuery); Hits hits = is.search(q); LinkedList<ResultHolder> resultsSearch = new LinkedList<ResultHolder>(); for (int i = 0; i < hits.length(); i++) { String graph = hits.doc(i).getValues("graph")[0]; int docID = -1; for (int j = 0; j < ir.numDocs(); j++) { Graph model = new Graph(ir.document(j).getValues("graph")[0]); if (model.toString().equals(graph)) docID = j; } resultsSearch.add(new ResultHolder(docID, graph, hits.score(i))); } Collections.sort(resultsSearch); printPrecisionRecallPlot(resultsMcs, resultsSearch); }
From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngineTest.java
License:Open Source License
private void testDirectQuery(IndexReader ir, Graph query, IndexSearcher is) throws IOException, ParseException { IndexReader reader = IndexReader.open("C:\\Java\\JavaProjects\\CaliphEmir\\testdata\\idx_semantic"); IndexSearcher searcher = new IndexSearcher("C:\\Java\\JavaProjects\\CaliphEmir\\testdata\\idx_fulltext"); HashMap<Integer, String> node2label = new HashMap<Integer, String>(); for (int j = 0; j < reader.numDocs(); j++) { String id = reader.document(j).getValues("id")[0]; String label = reader.document(j).getValues("label")[0]; node2label.put(Integer.parseInt(id), label); }//from w w w . j av a 2 s .c om // create results from mcs: LinkedList<ResultHolder> resultsMcs = new LinkedList<ResultHolder>(); for (int j = 0; j < ir.numDocs(); j++) { Graph model = new Graph(ir.document(j).getValues("graph")[0]); float mcsSimilarity = query.getMcsSimilarity(model); String[] file = ir.document(j).getValues("file"); for (int i = 0; i < file.length; i++) { String s = file[i]; resultsMcs.add(new ResultHolder(mcsSimilarity, s)); } } Collections.sort(resultsMcs); // for (Iterator<ResultHolder> iterator = resultsMcs.iterator(); iterator.hasNext();) { // ResultHolder r = iterator.next(); // System.out.println(r.getDocumentNumber() + ": " + r.getSimilarity()); // } // create results from search: StringBuilder qBuilder = new StringBuilder(64); for (Iterator<Node> iterator = query.getNodes().iterator(); iterator.hasNext();) { Node node = iterator.next(); // qBuilder.append("\""); qBuilder.append(node2label.get(node.getNodeID())); qBuilder.append(" "); // qBuilder.append("\" "); } // System.out.println(query); QueryParser qParse = new QueryParser("all", new WhitespaceAnalyzer()); Query q = qParse.parse(qBuilder.toString()); Hits hits = searcher.search(q); LinkedList<ResultHolder> resultsSearch = new LinkedList<ResultHolder>(); for (int i = 0; i < hits.length(); i++) { String graph = hits.doc(i).getValues("file")[0]; // int docID = -1; // for (int j = 0; j < ir.numDocs(); j++) { 
// Graph model = new Graph(ir.document(j).getValues("graph")[0]); // if (model.toString().equals(graph)) docID = j; // } resultsSearch.add(new ResultHolder(hits.score(i), graph)); } Collections.sort(resultsSearch); printPrecisionRecallPlotFileBased(resultsMcs, resultsSearch); }
From source file:at.lux.retrieval.vectorspace.ElementTextVectorSimilarityTest.java
License:Open Source License
public void testSimilarity() throws IOException, JDOMException { ElementTextVectorSimilarity sim = new ElementTextVectorSimilarity(); double distance = sim.getSimilarity(d1, d1); System.out.println("distance = " + distance); distance = sim.getSimilarity(d1, d2); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d1); System.out.println("distance = " + distance); IndexReader reader = IndexReader.open("testdata/idx_paths"); System.out.println("Loading documents and adding them to corpus ..."); for (int i = 0; i < reader.numDocs(); i++) { // Graph g_idx = new Graph(reader.document(i).getField("graph").stringValue()); Field[] files = reader.document(i).getFields("file"); for (Field file : files) { Document d = saxBuilder.build(file.stringValue()); sim.addToCorpus(d);//from w w w .ja v a 2s . c o m } } System.out.println(""); distance = sim.getSimilarity(d1, d1, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); distance = sim.getSimilarity(d1, d2, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d1, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d2, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); System.out.println(""); distance = sim.getSimilarity(d1, d1, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); distance = sim.getSimilarity(d1, d2, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d1, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d2, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); }
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.termExtract.LuceneTopTermExtract.java
License:Apache License
private PriorityQueue<TermScorePair> getTermScores(Map<String, Double> termFreqMap, Map<String, Integer> docFreqMap, IndexReader reader) { int numDocs = reader.numDocs(); PriorityQueue<TermScorePair> pq = new PriorityQueue<>(termFreqMap.size()); for (String term : termFreqMap.keySet()) { double tf = 1 + Math.log(termFreqMap.get(term)); int docFreq = docFreqMap.get(term); if (docFreq > 0) { double idf = 1 + Math.log((double) numDocs / docFreq); double score = tf * idf; pq.add(new TermScorePair(term, score)); }//from w w w.j a v a2s.c om } return pq; }
From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.LuceneTopTermExtract.java
License:Apache License
private PriorityQueue<TermScorePair> getTermScores(Map<String, Double> termFreqMap, Map<String, Integer> docFreqMap, IndexReader reader) { int numDocs = reader.numDocs(); PriorityQueue<TermScorePair> pq = new PriorityQueue<>(termFreqMap.size()); double max = 0; double avg = 0; for (String term : termFreqMap.keySet()) { max = (max > termFreqMap.get(term) ? max : termFreqMap.get(term)); avg += termFreqMap.get(term);// ww w.j ava 2 s. c o m } for (String term : termFreqMap.keySet()) { double tf = Math.log(termFreqMap.size() / termFreqMap.get(term)); int docFreq = docFreqMap.get(term); if (docFreq > 0) { double idf = 1 + Math.log((double) numDocs / docFreqMap.get(term)); double score = tf * idf; pq.add(new TermScorePair(term, score)); System.out.println(term + ";" + termFreqMap.get(term) + ";" + docFreqMap.get(term)); } } return pq; }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected int[] getDocumentIds(IndexReader reader) { int[] documentIds = new int[reader.numDocs()]; for (int i = 0; i < documentIds.length; i++) { try {//from www. j av a 2 s . c o m Document document = reader.document(i); documentIds[i] = Integer.parseInt(document.get(DOC_ID)); } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } } return documentIds; }
From source file:BlockBuilding.AbstractExtendedSortedNeighborhoodBlocking.java
License:Open Source License
@Override protected void parseIndex() { IndexReader d1Reader = Utilities.openReader(indexDirectory[0]); final Set<String> blockingKeysSet = getTerms(d1Reader); String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]); Arrays.sort(sortedTerms);/* w w w. ja va2 s . c o m*/ //slide window over the sorted list of blocking keys int upperLimit = sortedTerms.length - windowSize; int[] documentIds = Utilities.getDocumentIds(d1Reader); for (int i = 0; i <= upperLimit; i++) { final Set<Integer> entityIds = new HashSet<>(); for (int j = 0; j < windowSize; j++) { entityIds.addAll(getTermEntities(documentIds, d1Reader, sortedTerms[i + j])); } if (1 < entityIds.size()) { int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock uBlock = new UnilateralBlock(idsArray); blocks.add(uBlock); } } noOfEntities = new double[1]; noOfEntities[0] = d1Reader.numDocs(); Utilities.closeReader(d1Reader); }
From source file:BlockBuilding.AbstractExtendedSortedNeighborhoodBlocking.java
License:Open Source License
@Override protected void parseIndices() { IndexReader d1Reader = Utilities.openReader(indexDirectory[0]); IndexReader d2Reader = Utilities.openReader(indexDirectory[1]); final Set<String> blockingKeysSet = getTerms(d1Reader); blockingKeysSet.addAll(getTerms(d2Reader)); String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]); Arrays.sort(sortedTerms);/* www . j ava 2s .co m*/ //slide window over the sorted list of blocking keys int upperLimit = sortedTerms.length - windowSize; int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader); int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader); for (int i = 0; i <= upperLimit; i++) { final Set<Integer> entityIds1 = new HashSet<>(); final Set<Integer> entityIds2 = new HashSet<>(); for (int j = 0; j < windowSize; j++) { try { int docFrequency = d1Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j])); if (0 < docFrequency) { entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, sortedTerms[i + j])); } docFrequency = d2Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j])); if (0 < docFrequency) { entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, sortedTerms[i + j])); } } catch (IOException ex) { ex.printStackTrace(); } } if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) { int[] idsArray1 = Converter.convertCollectionToArray(entityIds1); int[] idsArray2 = Converter.convertCollectionToArray(entityIds2); BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2); blocks.add(bBlock); } } noOfEntities = new double[2]; noOfEntities[0] = d1Reader.numDocs(); noOfEntities[1] = d2Reader.numDocs(); Utilities.closeReader(d1Reader); Utilities.closeReader(d2Reader); }
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
/**
 * Concatenates, per sorted blocking key, the entity ids of both data sets.
 * Ids from the second data set are offset by the size of the first so the
 * two id spaces do not collide; the ids inside each key bucket are shuffled
 * to avoid data-set ordering bias in the sliding window.
 *
 * @param sortedTerms blocking keys in sorted order
 * @param d1Reader    reader over the first data set's index
 * @param d2Reader    reader over the second data set's index
 * @return all entity ids, bucket by bucket in key order
 */
protected Integer[] getSortedEntities(String[] sortedTerms, IndexReader d1Reader, IndexReader d2Reader) {
    final int offset = d1Reader.numDocs();
    final List<Integer> ordered = new ArrayList<>();
    final int[] idsD1 = Utilities.getDocumentIds(d1Reader);
    final int[] idsD2 = Utilities.getDocumentIds(d2Reader);
    for (String key : sortedTerms) {
        final List<Integer> bucket = new ArrayList<>(getTermEntities(idsD1, d1Reader, key));
        for (Integer entityId : getTermEntities(idsD2, d2Reader, key)) {
            bucket.add(offset + entityId);
        }
        Collections.shuffle(bucket);
        ordered.addAll(bucket);
    }
    return ordered.toArray(new Integer[ordered.size()]);
}
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
protected void parseIndex() { IndexReader d1Reader = Utilities.openReader(indexDirectory[sourceId]); final Set<String> blockingKeysSet = getTerms(d1Reader); String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]); Arrays.sort(sortedTerms);//from ww w . j av a2 s .com Integer[] allEntityIds = getSortedEntities(sortedTerms, d1Reader); //slide window over the sorted list of entity ids int upperLimit = allEntityIds.length - windowSize; for (int i = 0; i <= upperLimit; i++) { final Set<Integer> entityIds = new HashSet<>(); for (int j = 0; j < windowSize; j++) { entityIds.add(allEntityIds[i + j]); } if (1 < entityIds.size()) { int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock uBlock = new UnilateralBlock(idsArray); blocks.add(uBlock); } } noOfEntities = new double[1]; noOfEntities[0] = d1Reader.numDocs(); Utilities.closeReader(d1Reader); }