Usage examples for the `org.apache.lucene.index.IndexReader#numDocs()` method.
public abstract int numDocs();
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
protected void parseIndices() { IndexReader d1Reader = Utilities.openReader(indexDirectory[0]); IndexReader d2Reader = Utilities.openReader(indexDirectory[1]); final Set<String> blockingKeysSet = getTerms(d1Reader); blockingKeysSet.addAll(getTerms(d2Reader)); String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]); Arrays.sort(sortedTerms);/*from w w w . j a va 2s . com*/ Integer[] allEntityIds = getSortedEntities(sortedTerms, d1Reader, d2Reader); int datasetLimit = d1Reader.numDocs(); //slide window over the sorted list of entity ids int upperLimit = allEntityIds.length - windowSize; for (int i = 0; i <= upperLimit; i++) { final Set<Integer> entityIds1 = new HashSet<>(); final Set<Integer> entityIds2 = new HashSet<>(); for (int j = 0; j < windowSize; j++) { if (allEntityIds[i + j] < datasetLimit) { entityIds1.add(allEntityIds[i + j]); } else { entityIds2.add(allEntityIds[i + j] - datasetLimit); } } if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) { int[] idsArray1 = Converter.convertCollectionToArray(entityIds1); int[] idsArray2 = Converter.convertCollectionToArray(entityIds2); BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2); blocks.add(bBlock); } } noOfEntities = new double[2]; noOfEntities[0] = d1Reader.numDocs(); noOfEntities[1] = d2Reader.numDocs(); Utilities.closeReader(d1Reader); Utilities.closeReader(d2Reader); }
From source file:BlockBuilding.MemoryBased.SchemaBased.SortedNeighborhood.java
License:Open Source License
protected void parseIndices() { IndexReader d1Reader = Utilities.openReader(indexDirectory[0]); IndexReader d2Reader = Utilities.openReader(indexDirectory[1]); final Set<String> blockingKeysSet = getTerms(d1Reader); blockingKeysSet.addAll(getTerms(d2Reader)); String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]); Arrays.sort(sortedTerms);// w w w. j a va2 s .c o m Integer[] allEntityIds = getTermEntities(sortedTerms, d1Reader, d2Reader); int datasetLimit = d1Reader.numDocs(); //slide window over the sorted list of entity ids int upperLimit = allEntityIds.length - windowSize; for (int i = 0; i <= upperLimit; i++) { final Set<Integer> entityIds1 = new HashSet<>(); final Set<Integer> entityIds2 = new HashSet<>(); for (int j = 0; j < windowSize; j++) { if (allEntityIds[i + j] < datasetLimit) { entityIds1.add(allEntityIds[i + j]); } else { entityIds2.add(allEntityIds[i + j] - datasetLimit); } } if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) { int[] idsArray1 = Converter.convertCollectionToArray(entityIds1); int[] idsArray2 = Converter.convertCollectionToArray(entityIds2); BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2); blocks.add(bBlock); } } Utilities.closeReader(d1Reader); Utilities.closeReader(d2Reader); }
From source file:BlockBuilding.MemoryBased.SchemaBased.SortedNeighborhood.java
License:Open Source License
protected Integer[] getTermEntities(String[] sortedTerms, IndexReader d1Reader, IndexReader d2Reader) { int datasetLimit = d1Reader.numDocs(); final List<Integer> sortedEntityIds = new ArrayList<>(); int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader); int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader); for (String blockingKey : sortedTerms) { List<Integer> sortedIds = new ArrayList<>(); sortedIds.addAll(getTermEntities(documentIdsD1, d1Reader, blockingKey)); for (Integer entityId : getTermEntities(documentIdsD2, d2Reader, blockingKey)) { sortedIds.add(datasetLimit + entityId); }// w w w . j a va2s .c o m Collections.shuffle(sortedIds); sortedEntityIds.addAll(sortedIds); } return sortedEntityIds.toArray(new Integer[sortedEntityIds.size()]); }
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
protected Integer[] getSortedEntities(String[] sortedTerms, IndexReader d1Reader, IndexReader d2Reader) { int datasetLimit = d1Reader.numDocs(); final List<Integer> sortedEntityIds = new ArrayList<>(); int[] documentIdsD1 = getDocumentIds(d1Reader); int[] documentIdsD2 = getDocumentIds(d2Reader); for (String blockingKey : sortedTerms) { List<Integer> sortedIds = new ArrayList<>(); sortedIds.addAll(getTermEntities(documentIdsD1, d1Reader, blockingKey)); getTermEntities(documentIdsD2, d2Reader, blockingKey).stream().forEach((entityId) -> { sortedIds.add(datasetLimit + entityId); });//from w w w . ja v a 2 s . c o m Collections.shuffle(sortedIds); sortedEntityIds.addAll(sortedIds); } return sortedEntityIds.toArray(new Integer[sortedEntityIds.size()]); }
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
protected void parseIndices(IndexReader iReader1, IndexReader iReader2) { final Set<String> blockingKeysSet = getTerms(iReader1); blockingKeysSet.addAll(getTerms(iReader2)); String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]); Arrays.sort(sortedTerms);// w w w . ja v a 2s . c o m Integer[] allEntityIds = getSortedEntities(sortedTerms, iReader1, iReader2); int datasetLimit = iReader1.numDocs(); //slide window over the sorted list of entity ids int upperLimit = allEntityIds.length - windowSize; for (int i = 0; i <= upperLimit; i++) { final Set<Integer> entityIds1 = new HashSet<>(); final Set<Integer> entityIds2 = new HashSet<>(); for (int j = 0; j < windowSize; j++) { if (allEntityIds[i + j] < datasetLimit) { entityIds1.add(allEntityIds[i + j]); } else { entityIds2.add(allEntityIds[i + j] - datasetLimit); } } if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) { int[] idsArray1 = Converter.convertCollectionToArray(entityIds1); int[] idsArray2 = Converter.convertCollectionToArray(entityIds2); BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2); blocks.add(bBlock); } } }
From source file:BlockBuilding.Utilities.java
License:Open Source License
/**
 * Reads the application-level id (the stored {@code DOC_ID} field) of every
 * document in the index, indexed by Lucene document number.
 *
 * NOTE(review): iterates doc numbers 0..numDocs()-1 and assumes they are all
 * live; if the index ever contains deletions, numDocs() < maxDoc() and some
 * of these numbers may not address the intended documents — confirm these
 * indices are written without deletions.
 *
 * @param reader an open reader over the index to scan
 * @return one parsed DOC_ID per document; a slot stays 0 if reading it failed
 */
public static int[] getDocumentIds(IndexReader reader) {
    int[] documentIds = new int[reader.numDocs()];
    for (int i = 0; i < documentIds.length; i++) {
        try {
            Document document = reader.document(i);
            documentIds[i] = Integer.parseInt(document.get(DOC_ID));
        } catch (IOException ex) {
            // Best effort: log and leave the default value 0 in this slot.
            ex.printStackTrace();
        }
    }
    return documentIds;
}
From source file:ca.ualberta.entitylinking.common.indexing.DocumentIndexer.java
License:Open Source License
public void readLuceneIndex(String indexDir, String docName) { IndexReader reader = null; Map<String, Integer> name2id = null; //load index/*ww w. j av a 2s .co m*/ try { reader = IndexReader.open(FSDirectory.open(new File(indexDir))); String[] stringArray = FieldCache.DEFAULT.getStrings(reader, "name"); // build a map from string to its document id. name2id = new HashMap<String, Integer>(); for (int i = 0; i < stringArray.length; i++) name2id.put(stringArray[i], i); } catch (IOException e) { e.printStackTrace(); } //get tf-idf vector of a document. DefaultSimilarity simObj = new DefaultSimilarity(); try { if (!name2id.containsKey(docName)) return; int docId = name2id.get(docName); Document doc = reader.document(docId); TermFreqVector termVector = reader.getTermFreqVector(docId, "contents"); int numDocs = reader.numDocs(); int[] termFreq = termVector.getTermFrequencies(); String[] terms = termVector.getTerms(); for (int i = 0; i < terms.length; i++) { //avoid stop words // if (isStopWord(terms[i])) // continue; int tf = termFreq[i]; int df = reader.docFreq(new Term("contents", terms[i])); float tfidf = simObj.tf(tf) * simObj.idf(df, numDocs); System.out.println(terms[i] + ": " + tfidf); } } catch (Exception e) { e.printStackTrace(); } }
From source file:com.codecrate.shard.search.ObjectIndexer.java
License:Apache License
/**
 * Deletes every document from the Lucene index in {@code directory}.
 *
 * NOTE(review): relies on the legacy IndexReader.delete(int) API and walks
 * doc numbers 0..numDocs()-1; if the index already contains deletions,
 * numDocs() < maxDoc() and the highest doc numbers are never visited —
 * confirm whether maxDoc() is the intended bound here.
 */
public void clean() {
    IndexReader reader = null;
    try {
        reader = IndexReader.open(directory);
        int numberDocuments = reader.numDocs();
        // Delete each document by its Lucene doc number.
        for (int currentDocument = 0; currentDocument < numberDocuments; currentDocument++) {
            reader.delete(currentDocument);
        }
        LOG.debug("Cleaned " + numberDocuments + " documents from index " + directory);
    } catch (IOException e) {
        LOG.error("Error cleaning index " + directory, e);
    } finally {
        // Always release the reader, even after a failure.
        closeReader(reader);
    }
}
From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java
License:Open Source License
/**
 * Reports whether the Lucene index in {@code indexDir} is empty.
 *
 * Fix: the directory and reader are now opened in a try-with-resources, so
 * they are released even when an I/O error occurs mid-check (both previously
 * leaked on exception). Close order (reader first, then directory) matches
 * the original code.
 *
 * @return {@code true} when the index holds at most one document
 * @throws RuntimeException wrapping any {@link IOException}
 */
@Override
public synchronized boolean isEmpty() {
    try (Directory directory = FSDirectory.open(indexDir);
            IndexReader indexReader = DirectoryReader.open(directory)) {
        // NOTE(review): "<= 1" treats a one-document index as empty, presumably
        // because a sentinel/metadata document is always present — confirm.
        return indexReader.numDocs() <= 1;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.doculibre.constellio.services.ImportExportServicesImpl.java
License:Open Source License
/**
 * Imports every document of the given Lucene {@code directory} into the
 * {@code collection} as persisted {@code Record}s: each stored, non-binary
 * field becomes a {@code RecordMeta}, and the field matching the collection's
 * unique-key meta provides the record URL.
 *
 * NOTE(review): if no connector meta matches the unique-key index field,
 * {@code uniqueKeyMetaName} stays null and {@code uniqueKeyMetaName.equals(...)}
 * below throws — confirm a matching meta is guaranteed by the caller.
 * NOTE(review): iterates doc numbers 0..numDocs()-1; with deletions present,
 * numDocs() < maxDoc() and the mapping from loop index to document is off —
 * confirm imported indices never contain deletions.
 *
 * @param directory    Lucene directory to read documents from
 * @param collection   target collection (its first connector instance is used)
 * @param progressInfo optional progress reporting; may be null
 */
@SuppressWarnings("unchecked")
@Override
public void importData(Directory directory, RecordCollection collection, ProgressInfo progressInfo) {
    try {
        ConnectorInstance connectorInstance = collection.getConnectorInstances().iterator().next();
        RecordServices recordServices = ConstellioSpringUtils.getRecordServices();

        // Resolve the connector-level name of the collection's unique-key field.
        String uniqueKeyMetaName = null;
        IndexField uniqueKeyIndexField = collection.getUniqueKeyIndexField();
        for (ConnectorInstanceMeta connectorInstanceMeta : uniqueKeyIndexField.getConnectorInstanceMetas()) {
            if (connectorInstance.equals(connectorInstanceMeta.getConnectorInstance())) {
                uniqueKeyMetaName = connectorInstanceMeta.getName();
                break;
            }
        }

        // Matches ISO-like timestamps missing a trailing time-zone designator.
        Pattern invalidDatePattern = Pattern
                .compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]*)?$");

        IndexReader indexReader = DirectoryReader.open(directory);
        if (progressInfo != null) {
            progressInfo.setTotal(indexReader.numDocs());
        }
        for (int i = 0; i < indexReader.numDocs(); i++) {
            Document document = indexReader.document(i);
            Record record = new Record();
            record.setLastModified(new Date());
            record.setConnectorInstance(connectorInstance);
            for (IndexableField field : document.getFields()) {
                // for (String fieldName : (Collection<String>)
                // indexReader.getFieldNames(FieldOption.ALL)) {
                // Only stored, non-binary fields are imported as metas.
                if (field != null && field.fieldType().stored() && field.binaryValue() == null) {
                    String metaName = field.name();
                    String metaContent = field.stringValue();
                    Matcher invalidDateMatcher = invalidDatePattern.matcher(metaContent);
                    if (invalidDateMatcher.matches()) {
                        // Append "Z" so the zone-less timestamp parses as UTC.
                        metaContent = metaContent + "Z";
                    }
                    if (uniqueKeyMetaName.equals(metaName)) {
                        record.setUrl(metaContent);
                    }
                    RecordMeta meta = new RecordMeta();
                    ConnectorInstanceMeta connectorInstanceMeta = connectorInstance.getOrCreateMeta(metaName);
                    meta.setConnectorInstanceMeta(connectorInstanceMeta);
                    meta.setRecord(record);
                    meta.setContent(metaContent);
                    record.addContentMeta(meta);
                }
            }
            try {
                recordServices.makePersistent(record);
                // if (i % 500 == 0) {
                // EntityManager entityManager =
                // ConstellioPersistenceContext.getCurrentEntityManager();
                // entityManager.getTransaction().commit();
                // entityManager.getTransaction().begin();
                // }
            } catch (Exception e) {
                // Best effort per record: a failed persist does not abort the import.
                e.printStackTrace();
            }
            if (progressInfo != null) {
                progressInfo.setCurrentIndex(i);
            }
        }
        indexReader.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}