List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the n-th Document in this index.
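Before the per-project examples, here is a minimal, self-contained sketch of the calling pattern most of them share, using the pre-4.0 API they target: document IDs run from 0 to maxDoc(), deleted documents keep their IDs, so each ID is checked with isDeleted(int) before document(int) is called. The index path and the "id" field name are placeholders, not part of any example below.

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentIterationSketch {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                if (reader.isDeleted(docId)) {
                    continue; // maxDoc() counts deleted slots, so skip them
                }
                Document doc = reader.document(docId);
                System.out.println(doc.get("id")); // "id" is a placeholder field name
            }
        } finally {
            reader.close();
        }
    }
}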
From source file:org.toubassi.femtozip.lucene.IndexDocumentList.java
License:Apache License
public IndexDocumentList(IndexReader reader, int numSamples, int firstDoc, String fieldName) throws IOException {
    this.reader = reader;
    this.fieldName = fieldName;
    numDocs = reader.numDocs();
    float samplingRate = ((float) numSamples) / numDocs;
    ArrayList<Integer> docIdsList = new ArrayList<Integer>();
    ArrayList<Integer> fieldCountList = new ArrayList<Integer>();
    int numDocsScanned = 0, numDocsSampled = 0;
    for (int i = firstDoc, count = reader.maxDoc(); i < count; i++) {
        numDocsScanned++;
        if (reader.isDeleted(i)) {
            continue;
        }
        // keep the sampled/scanned ratio close to samplingRate
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(i);
        Field fields[] = doc.getFields(fieldName);
        if (fields.length > 0) {
            if (fields[0].isStored()) {
                docIdsList.add(i);
                fieldCountList.add(fields.length);
            }
        }
    }
    docIds = new int[docIdsList.size()];
    for (int i = 0, count = docIdsList.size(); i < count; i++) {
        docIds[i] = docIdsList.get(i);
    }
    fieldCounts = new int[fieldCountList.size()];
    for (int i = 0, count = fieldCountList.size(); i < count; i++) {
        fieldCounts[i] = fieldCountList.get(i);
        if (i > 0) {
            fieldCounts[i] += fieldCounts[i - 1]; // cumulative field counts
        }
    }
}
From source file:org.toubassi.femtozip.lucene.IndexDumper.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    numDocs = reader.numDocs();
    int maxDocId = reader.maxDoc();
    float samplingRate = ((float) numSamples) / numDocs;
    int numDocsScanned = 0;
    int numDocsSampled = 0;
    for (int docId = 0; docId < maxDocId; docId++) {
        if (reader.isDeleted(docId)) {
            continue;
        }
        numDocsScanned++;
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(docId);
        System.out.println("DOCUMENT: " + docId);
        for (String fieldName : fieldNames) {
            if (fieldsToDump != null && fieldsToDump.indexOf(fieldName) == -1) {
                continue;
            }
            Field[] fields = doc.getFields(fieldName);
            for (Field field : fields) {
                if (!field.isStored() || field.isCompressed()) {
                    // TODO if it's compressed, uncompress it and benchmark it.
                    continue;
                }
                byte[] bytes;
                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }
                if (bytes.length > 0) {
                    System.out.print(" " + fieldName + " " + bytes.length + " ");
                    System.out.write(bytes);
                    System.out.println();
                }
            }
        }
    }
    reader.close();
}
From source file:org.toubassi.femtozip.lucene.StoredFieldDumper.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    Map<String, OutputStream> output = new HashMap<String, OutputStream>();
    long lastStatusTime = 0;
    for (int docId = 0, count = reader.maxDoc(); docId < count; docId++) {
        Document doc = reader.document(docId);
        if (System.currentTimeMillis() - lastStatusTime > 5000) {
            lastStatusTime = System.currentTimeMillis();
            System.out.println("Processing docId " + docId + " of " + count);
        }
        for (String fieldName : fieldNames) {
            Field[] fields = doc.getFields(fieldName);
            for (Field field : fields) {
                if (!field.isStored() || field.isCompressed()) {
                    // TODO if it's compressed, uncompress it and benchmark it.
                    continue;
                }
                byte[] bytes;
                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }
                OutputStream out = output.get(fieldName);
                if (out == null) {
                    FileOutputStream fileOut = new FileOutputStream(outputBasePath + "_" + fieldName);
                    out = new BufferedOutputStream(fileOut);
                    output.put(fieldName, out);
                }
                out.write(bytes);
            }
        }
    }
    reader.close();
    for (Map.Entry<String, OutputStream> entry : output.entrySet()) {
        entry.getValue().close();
    }
}
From source file:org.toubassi.femtozip.lucene.StoredFieldExploder.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    int numProcessed = 0;
    for (int docId = 0, count = reader.maxDoc(); docId < count && numProcessed < numSamples; docId++) {
        if (reader.isDeleted(docId)) {
            continue;
        }
        Document doc = reader.document(docId);
        Field field = doc.getField(fieldName);
        if (field != null) {
            FileOutputStream out = new FileOutputStream(
                    outputBasePath + File.separator + (numProcessed + 1) + "." + fieldName);
            if (field.isBinary()) {
                out.write(field.getBinaryValue(), field.getBinaryOffset(), field.getBinaryLength());
            } else {
                out.write(field.stringValue().getBytes("UTF-8"));
            }
            out.close();
            numProcessed++;
        }
    }
    reader.close();
}
From source file:org.wandora.indexer.AbstractIndexBuilder.java
License:Open Source License
public Set getDependentTopics(String topic, IndexReader reader) throws IOException {
    TermDocs docs = reader.termDocs(new Term("topic", topic));
    while (docs.next()) {
        Document doc = reader.document(docs.doc());
        String type = doc.get("type");
        if (type != null && type.equals("topic")) {
            String d = doc.get("dependent");
            StringTokenizer st = new StringTokenizer(d, "\n");
            Set s = new HashSet();
            while (st.hasMoreTokens()) {
                s.add(st.nextToken());
            }
            return s;
        }
    }
    return null;
}
From source file:org.xcmis.search.lucene.index.PersistedIndex.java
License:Open Source License
private Document getDocument(final String uuid, final IndexReader reader) throws IndexException {
    try {
        final TermDocs termDocs = reader.termDocs(new Term(FieldNames.UUID, uuid));
        if (termDocs.next()) {
            final Document document = reader.document(termDocs.doc());
            if (termDocs.next()) {
                throw new IndexException("More than one document found for uuid:" + uuid);
            }
            return document;
        }
    } catch (final IOException e) {
        throw new IndexException(e.getLocalizedMessage(), e);
    }
    return null;
}
From source file:org.xcmis.search.lucene.LuceneQueryableIndexStorage.java
License:Open Source License
protected Document getDocument(String uuid, IndexReader reader) throws IndexException {
    try {
        if (reader != null) {
            final TermDocs termDocs = reader.termDocs(new Term(FieldNames.UUID, uuid));
            if (termDocs.next()) {
                final Document document = reader.document(termDocs.doc());
                if (termDocs.next()) {
                    throw new IndexException("More than one document found for uuid:" + uuid);
                }
                return document;
            }
        }
    } catch (final IOException e) {
        throw new IndexException(e.getLocalizedMessage(), e);
    }
    return null;
}
From source file:org.zenoss.zep.index.impl.EventIndexDaoImplIT.java
License:Open Source License
private Set<String> getFieldNames(LuceneEventIndexBackend backend, String eventUuid) throws IOException {
    IndexWriter indexWriter = (IndexWriter) ReflectionTestUtils.getField(backend, "writer");
    IndexReader reader = null;
    try {
        reader = IndexReader.open(indexWriter, true);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(new TermQuery(new Term(IndexConstants.FIELD_UUID, eventUuid)), null, 1);
        assertEquals(1, docs.totalHits);
        int docId = docs.scoreDocs[0].doc;
        Document document = reader.document(docId);
        Set<String> fieldNames = Sets.newHashSet();
        for (IndexableField field : document.getFields()) {
            fieldNames.add(field.name());
        }
        return fieldNames;
    } finally {
        ZepUtils.close(reader);
    }
}
From source file:org.zilverline.core.AbstractCollection.java
License:Open Source License
/**
 * Initialize this collection by getting its index. It retrieves the number
 * of documents and the MD5 hash of all documents in the collection.
 *
 * If the index does not exist (this is a new Collection) just return.
 *
 * @throws IndexException
 *             when the existing index of the Collection can not be successfully
 *             opened.
 */
public final void init() throws IndexException {
    log.debug("Initializing collection " + name);
    IndexReader index = null;
    // Determine whether the collection exists on disk
    setExistsOnDisk();
    // check whether this collection has a cache for the MD5 hashes of documents
    if (md5DocumentCache == null) {
        md5DocumentCache = new HashSet();
    }
    // check whether this collection has a cache for the MD5 hashes of indexed archives
    if (archiveCache == null) {
        archiveCache = new HashSet();
    }
    if (!isIndexValid()) {
        log.info("Index does not exist (yet) for collection '" + name + "'. Possibly new collection.");
        numberOfDocs = 0;
        return;
    }
    // Get the index
    File thisIndex = getIndexDirWithManagerDefaults();
    try {
        index = IndexReader.open(thisIndex);
        if (index != null) {
            numberOfDocs = index.numDocs();
            // retrieve all hashes of Documents from the cache
            md5DocumentCache.clear();
            for (int i = 0; i < numberOfDocs; i++) {
                Document d = index.document(i);
                String hashValue = d.get("hash");
                md5DocumentCache.add(hashValue);
            }
            // get some relevant information from the index
            version = IndexReader.getCurrentVersion(thisIndex); // deprecated, but needed
            lastIndexed = new Date(IndexReader.lastModified(thisIndex));
            log.debug("Collection " + name + " has " + numberOfDocs + " documents, index created at: "
                    + lastIndexed);
        } else {
            log.error("Index could not be retrieved for collection " + name);
        }
    } catch (IOException e) {
        throw new IndexException("Error initializing collection '" + name + "'", e);
    } finally {
        if (index != null) {
            try {
                index.close();
            } catch (IOException e1) {
                log.error("Error closing index for collection " + name, e1);
            }
        } else {
            numberOfDocs = 0;
            version = 0;
            lastIndexed = null;
        }
    }
}
From source file:perf.PKLookupPerfTest3X.java
License:Apache License
public static void main(String[] args) throws IOException {
    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);

    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }

    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }

    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);

    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }

    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);

    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println(" Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);

        final AtomicBoolean failed = new AtomicBoolean(false);
        final Term t = new Term("id", "");
        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);

        // cycle 0 is for warming
        //System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");

        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }

        if (DO_DOC_LOOKUP) {
            System.out.println(" Verify...");
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }

    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");
    r.close();
    dir.close();
}