Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usages of org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Source Link

Document

Returns the number of documents in this index.

Usage

From source file: BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License: Open Source License

/**
 * Builds bilateral blocks by sliding a window of size {@code windowSize}
 * over the entity ids sorted by their blocking keys, then records the
 * number of entities per data set and releases both readers.
 */
protected void parseIndices() {
    // One Lucene index per input data set.
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);

    // Union of blocking keys from both indices, sorted lexicographically.
    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    Integer[] allEntityIds = getSortedEntities(sortedTerms, d1Reader, d2Reader);

    // Ids below this limit belong to data set 1; larger ids are offset ids from data set 2.
    int datasetLimit = d1Reader.numDocs();
    // Slide the window over the sorted list of entity ids.
    int lastWindowStart = allEntityIds.length - windowSize;
    for (int windowStart = 0; windowStart <= lastWindowStart; windowStart++) {
        final Set<Integer> d1Entities = new HashSet<>();
        final Set<Integer> d2Entities = new HashSet<>();
        for (int offset = 0; offset < windowSize; offset++) {
            int entityId = allEntityIds[windowStart + offset];
            if (entityId < datasetLimit) {
                d1Entities.add(entityId);
            } else {
                d2Entities.add(entityId - datasetLimit);
            }
        }

        // A bilateral block requires at least one entity from each data set.
        if (!d1Entities.isEmpty() && !d2Entities.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(d1Entities);
            int[] idsArray2 = Converter.convertCollectionToArray(d2Entities);
            blocks.add(new BilateralBlock(idsArray1, idsArray2));
        }
    }

    noOfEntities = new double[2];
    noOfEntities[0] = d1Reader.numDocs();
    noOfEntities[1] = d2Reader.numDocs();

    Utilities.closeReader(d1Reader);
    Utilities.closeReader(d2Reader);
}

From source file: BlockBuilding.MemoryBased.SchemaBased.SortedNeighborhood.java

License: Open Source License

/**
 * Creates bilateral blocks from the two indexed data sets by sliding a
 * window of size {@code windowSize} over the key-sorted entity ids.
 */
protected void parseIndices() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);

    // Gather every blocking key appearing in either index and sort them.
    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    Integer[] allEntityIds = getTermEntities(sortedTerms, d1Reader, d2Reader);

    // Entity ids below datasetLimit come from data set 1, the rest from data set 2.
    int datasetLimit = d1Reader.numDocs();
    // Slide the window over the sorted list of entity ids.
    int upperLimit = allEntityIds.length - windowSize;
    for (int windowStart = 0; windowStart <= upperLimit; windowStart++) {
        final Set<Integer> innerIds = new HashSet<>();
        final Set<Integer> outerIds = new HashSet<>();
        for (int pos = 0; pos < windowSize; pos++) {
            int currentId = allEntityIds[windowStart + pos];
            if (currentId < datasetLimit) {
                innerIds.add(currentId);
            } else {
                outerIds.add(currentId - datasetLimit);
            }
        }

        // Only windows mixing both data sets yield a bilateral block.
        if (!innerIds.isEmpty() && !outerIds.isEmpty()) {
            blocks.add(new BilateralBlock(Converter.convertCollectionToArray(innerIds),
                    Converter.convertCollectionToArray(outerIds)));
        }
    }

    Utilities.closeReader(d1Reader);
    Utilities.closeReader(d2Reader);
}

From source file: BlockBuilding.MemoryBased.SchemaBased.SortedNeighborhood.java

License: Open Source License

/**
 * Flattens the sorted blocking keys into one sequence of entity ids.
 * Ids originating from {@code d2Reader} are shifted by the size of the
 * first data set so that both data sets share a single id space; entities
 * under the same key are shuffled before being appended.
 */
protected Integer[] getTermEntities(String[] sortedTerms, IndexReader d1Reader, IndexReader d2Reader) {
    int datasetLimit = d1Reader.numDocs();
    final List<Integer> sortedEntityIds = new ArrayList<>();

    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (String blockingKey : sortedTerms) {
        final List<Integer> keyEntityIds = new ArrayList<>();
        keyEntityIds.addAll(getTermEntities(documentIdsD1, d1Reader, blockingKey));

        // Offset d2 ids into the shared id space.
        for (Integer entityId : getTermEntities(documentIdsD2, d2Reader, blockingKey)) {
            keyEntityIds.add(datasetLimit + entityId);
        }

        // Randomize the relative order of entities sharing this key.
        Collections.shuffle(keyEntityIds);
        sortedEntityIds.addAll(keyEntityIds);
    }

    return sortedEntityIds.toArray(new Integer[sortedEntityIds.size()]);
}

From source file: BlockBuilding.SortedNeighborhoodBlocking.java

License: Apache License

/**
 * Flattens the sorted blocking keys into one sequence of entity ids.
 * Ids originating from {@code d2Reader} are shifted by the document count
 * of the first data set so both data sets share a single id space; entities
 * under the same key are shuffled before being appended.
 */
protected Integer[] getSortedEntities(String[] sortedTerms, IndexReader d1Reader, IndexReader d2Reader) {
    int datasetLimit = d1Reader.numDocs();
    final List<Integer> sortedEntityIds = new ArrayList<>();

    int[] documentIdsD1 = getDocumentIds(d1Reader);
    int[] documentIdsD2 = getDocumentIds(d2Reader);
    for (String blockingKey : sortedTerms) {
        List<Integer> sortedIds = new ArrayList<>();
        sortedIds.addAll(getTermEntities(documentIdsD1, d1Reader, blockingKey));

        // Plain for-each instead of stream().forEach(...) mutating a captured
        // list: building a collection through stream side effects is an
        // anti-pattern; the loop is clearer and behaves identically.
        for (Integer entityId : getTermEntities(documentIdsD2, d2Reader, blockingKey)) {
            sortedIds.add(datasetLimit + entityId);
        }

        // Randomize the relative order of entities sharing this key.
        Collections.shuffle(sortedIds);
        sortedEntityIds.addAll(sortedIds);
    }

    return sortedEntityIds.toArray(new Integer[sortedEntityIds.size()]);
}

From source file: BlockBuilding.SortedNeighborhoodBlocking.java

License: Apache License

/**
 * Builds bilateral blocks from the two given readers by sliding a window
 * of size {@code windowSize} over the key-sorted entity ids.
 */
protected void parseIndices(IndexReader iReader1, IndexReader iReader2) {
    // Collect and sort the union of blocking keys from both readers.
    final Set<String> blockingKeysSet = getTerms(iReader1);
    blockingKeysSet.addAll(getTerms(iReader2));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    Integer[] allEntityIds = getSortedEntities(sortedTerms, iReader1, iReader2);

    // Ids below this limit belong to data set 1; larger ids are offset ids from data set 2.
    int datasetLimit = iReader1.numDocs();
    // Slide the window over the sorted list of entity ids.
    int lastWindowStart = allEntityIds.length - windowSize;
    for (int windowStart = 0; windowStart <= lastWindowStart; windowStart++) {
        final Set<Integer> firstDatasetIds = new HashSet<>();
        final Set<Integer> secondDatasetIds = new HashSet<>();
        for (int offset = 0; offset < windowSize; offset++) {
            int currentId = allEntityIds[windowStart + offset];
            if (currentId < datasetLimit) {
                firstDatasetIds.add(currentId);
            } else {
                secondDatasetIds.add(currentId - datasetLimit);
            }
        }

        // A bilateral block requires entities from both data sets.
        if (!firstDatasetIds.isEmpty() && !secondDatasetIds.isEmpty()) {
            blocks.add(new BilateralBlock(Converter.convertCollectionToArray(firstDatasetIds),
                    Converter.convertCollectionToArray(secondDatasetIds)));
        }
    }
}

From source file: BlockBuilding.Utilities.java

License: Open Source License

/**
 * Reads the stored DOC_ID field of every document in the index and returns
 * the parsed ids, positionally aligned with the Lucene document numbers.
 *
 * NOTE(review): iterates up to numDocs(); if the index ever contains
 * deletions, valid Lucene document numbers can exceed numDocs() - 1 —
 * confirm these indices never have deleted documents, otherwise maxDoc()
 * would be the correct bound.
 */
public static int[] getDocumentIds(IndexReader reader) {
    int[] documentIds = new int[reader.numDocs()];
    for (int i = 0; i < documentIds.length; i++) {
        try {
            Document document = reader.document(i);
            documentIds[i] = Integer.parseInt(document.get(DOC_ID));
        } catch (IOException ex) {
            // On failure documentIds[i] silently stays 0; a missing or
            // malformed DOC_ID field raises NumberFormatException, which is
            // NOT caught here and will propagate to the caller.
            ex.printStackTrace();
        }
    }
    return documentIds;
}

From source file: ca.ualberta.entitylinking.common.indexing.DocumentIndexer.java

License: Open Source License

/**
 * Prints the tf-idf weight of every term in the "contents" field of the
 * document named {@code docName} inside the index at {@code indexDir}.
 * Does nothing if the index cannot be opened or the document is absent.
 */
public void readLuceneIndex(String indexDir, String docName) {
    IndexReader reader = null;
    Map<String, Integer> name2id = null;

    // Load the index and build a name -> Lucene document id lookup table.
    try {
        reader = IndexReader.open(FSDirectory.open(new File(indexDir)));

        String[] stringArray = FieldCache.DEFAULT.getStrings(reader, "name");

        name2id = new HashMap<String, Integer>();
        for (int i = 0; i < stringArray.length; i++)
            name2id.put(stringArray[i], i);
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Bail out if the index could not be opened: proceeding with a null map
    // previously caused a NullPointerException at name2id.containsKey().
    if (reader == null || name2id == null) {
        return;
    }

    // Compute and print the tf-idf vector of the requested document.
    DefaultSimilarity simObj = new DefaultSimilarity();

    try {
        if (!name2id.containsKey(docName))
            return;

        int docId = name2id.get(docName);
        Document doc = reader.document(docId);

        TermFreqVector termVector = reader.getTermFreqVector(docId, "contents");
        int numDocs = reader.numDocs();

        int[] termFreq = termVector.getTermFrequencies();
        String[] terms = termVector.getTerms();
        for (int i = 0; i < terms.length; i++) {
            int tf = termFreq[i];
            // Document frequency across the whole index drives the idf factor.
            int df = reader.docFreq(new Term("contents", terms[i]));
            float tfidf = simObj.tf(tf) * simObj.idf(df, numDocs);
            System.out.println(terms[i] + ": " + tfidf);
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // The reader was previously leaked; always release it.
        try {
            reader.close();
        } catch (IOException ignored) {
            // Best-effort close; nothing more can be done here.
        }
    }
}

From source file: com.codecrate.shard.search.ObjectIndexer.java

License: Apache License

/**
 * Deletes every document from the index under {@code directory}.
 *
 * NOTE(review): documents are removed by document number, iterating up to
 * numDocs(); if the index already contains deletions, valid document
 * numbers can reach beyond numDocs() - 1, leaving some documents
 * undeleted — confirm deletions never pre-exist, otherwise iterate to
 * maxDoc() instead.
 */
public void clean() {
    IndexReader reader = null;
    try {
        reader = IndexReader.open(directory);
        int numberDocuments = reader.numDocs();
        for (int currentDocument = 0; currentDocument < numberDocuments; currentDocument++) {
            // Marks the document as deleted via the legacy reader-side delete API.
            reader.delete(currentDocument);
        }
        LOG.debug("Cleaned " + numberDocuments + " documents from index " + directory);
    } catch (IOException e) {
        LOG.error("Error cleaning index " + directory, e);
    } finally {
        closeReader(reader);
    }
}

From source file: com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java

License: Open Source License

/**
 * Reports whether the index at {@code indexDir} is considered empty.
 *
 * NOTE(review): "empty" tolerates one document (numDocs() <= 1) —
 * presumably a marker/placeholder document is always present; confirm
 * against the writer side before changing the threshold.
 *
 * @throws RuntimeException wrapping any {@link IOException} from the index
 */
@Override
public synchronized boolean isEmpty() {
    // try-with-resources releases the reader and directory even when an
    // exception occurs; the original leaked both on failure between
    // open and close.
    try (Directory directory = FSDirectory.open(indexDir);
            IndexReader indexReader = DirectoryReader.open(directory)) {
        return indexReader.numDocs() <= 1;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file: com.doculibre.constellio.services.ImportExportServicesImpl.java

License: Open Source License

@SuppressWarnings("unchecked")
@Override
/**
 * Imports every document of the given Lucene {@code directory} into the
 * record collection: each stored, non-binary field becomes a record meta,
 * and the field matching the collection's unique-key meta becomes the
 * record URL. Progress is reported through {@code progressInfo} when set.
 *
 * @throws RuntimeException wrapping any {@link IOException} from the index
 */
public void importData(Directory directory, RecordCollection collection, ProgressInfo progressInfo) {
    try {
        ConnectorInstance connectorInstance = collection.getConnectorInstances().iterator().next();
        RecordServices recordServices = ConstellioSpringUtils.getRecordServices();

        // Resolve the meta name that carries the record's unique key for
        // this connector; may legitimately stay null if no meta matches.
        String uniqueKeyMetaName = null;
        IndexField uniqueKeyIndexField = collection.getUniqueKeyIndexField();
        for (ConnectorInstanceMeta connectorInstanceMeta : uniqueKeyIndexField.getConnectorInstanceMetas()) {
            if (connectorInstance.equals(connectorInstanceMeta.getConnectorInstance())) {
                uniqueKeyMetaName = connectorInstanceMeta.getName();
                break;
            }
        }

        // Matches ISO-like timestamps without a timezone; a trailing "Z" is appended below.
        Pattern invalidDatePattern = Pattern
                .compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]*)?$");

        // try-with-resources: the reader was previously leaked when an
        // exception occurred before the explicit close() call.
        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            // Hoisted out of the loop: the original re-queried numDocs() on
            // every iteration of the loop condition.
            int numDocs = indexReader.numDocs();
            if (progressInfo != null) {
                progressInfo.setTotal(numDocs);
            }
            for (int i = 0; i < numDocs; i++) {
                Document document = indexReader.document(i);

                Record record = new Record();
                record.setLastModified(new Date());
                record.setConnectorInstance(connectorInstance);

                for (IndexableField field : document.getFields()) {
                    // Only stored, non-binary fields are imported as metas.
                    if (field != null && field.fieldType().stored() && field.binaryValue() == null) {
                        String metaName = field.name();
                        String metaContent = field.stringValue();

                        Matcher invalidDateMatcher = invalidDatePattern.matcher(metaContent);
                        if (invalidDateMatcher.matches()) {
                            metaContent = metaContent + "Z";
                        }

                        // Reversed comparison: uniqueKeyMetaName may still be
                        // null when no connector meta matched above; the
                        // original order threw a NullPointerException here.
                        if (metaName.equals(uniqueKeyMetaName)) {
                            record.setUrl(metaContent);
                        }

                        RecordMeta meta = new RecordMeta();
                        ConnectorInstanceMeta connectorInstanceMeta = connectorInstance.getOrCreateMeta(metaName);
                        meta.setConnectorInstanceMeta(connectorInstanceMeta);
                        meta.setRecord(record);
                        meta.setContent(metaContent);
                        record.addContentMeta(meta);
                    }
                }

                try {
                    recordServices.makePersistent(record);
                } catch (Exception e) {
                    // Best-effort import: a single failed record does not
                    // abort the whole run.
                    e.printStackTrace();
                }

                if (progressInfo != null) {
                    progressInfo.setCurrentIndex(i);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}