List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the n-th Document in this index.
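Before the per-project examples, here is a minimal, self-contained sketch of the calling pattern most of them share, using the pre-4.0 API they target: document IDs run from 0 to maxDoc(), deleted documents keep their IDs, so each ID is checked with isDeleted(int) before document(int) is called. The index path and the "id" field name are placeholders, not part of any example below.

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentIterationSketch {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                if (reader.isDeleted(docId)) {
                    continue; // maxDoc() counts deleted slots, so skip them
                }
                Document doc = reader.document(docId);
                System.out.println(doc.get("id")); // "id" is a placeholder field name
            }
        } finally {
            reader.close();
        }
    }
}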
From source file:org.toubassi.femtozip.lucene.IndexDocumentList.java
License:Apache License
public IndexDocumentList(IndexReader reader, int numSamples, int firstDoc, String fieldName) throws IOException {
    this.reader = reader;
    this.fieldName = fieldName;
    numDocs = reader.numDocs();
    float samplingRate = ((float) numSamples) / numDocs;
    ArrayList<Integer> docIdsList = new ArrayList<Integer>();
    ArrayList<Integer> fieldCountList = new ArrayList<Integer>();
    int numDocsScanned = 0, numDocsSampled = 0;
    for (int i = firstDoc, count = reader.maxDoc(); i < count; i++) {
        numDocsScanned++;
        if (reader.isDeleted(i)) {
            continue;
        }
        // keep the sampled/scanned ratio close to samplingRate
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(i);
        Field fields[] = doc.getFields(fieldName);
        if (fields.length > 0) {
            if (fields[0].isStored()) {
                docIdsList.add(i);
                fieldCountList.add(fields.length);
            }
        }
    }
    docIds = new int[docIdsList.size()];
    for (int i = 0, count = docIdsList.size(); i < count; i++) {
        docIds[i] = docIdsList.get(i);
    }
    fieldCounts = new int[fieldCountList.size()];
    for (int i = 0, count = fieldCountList.size(); i < count; i++) {
        fieldCounts[i] = fieldCountList.get(i);
        if (i > 0) {
            fieldCounts[i] += fieldCounts[i - 1]; // cumulative field counts
        }
    }
}
From source file:org.toubassi.femtozip.lucene.IndexDumper.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    numDocs = reader.numDocs();
    int maxDocId = reader.maxDoc();
    float samplingRate = ((float) numSamples) / numDocs;
    int numDocsScanned = 0;
    int numDocsSampled = 0;
    for (int docId = 0; docId < maxDocId; docId++) {
        if (reader.isDeleted(docId)) {
            continue;
        }
        numDocsScanned++;
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(docId);
        System.out.println("DOCUMENT: " + docId);
        for (String fieldName : fieldNames) {
            if (fieldsToDump != null && fieldsToDump.indexOf(fieldName) == -1) {
                continue;
            }
            Field[] fields = doc.getFields(fieldName);
            for (Field field : fields) {
                if (!field.isStored() || field.isCompressed()) {
                    // TODO if it's compressed, uncompress it and benchmark it.
                    continue;
                }
                byte[] bytes;
                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }
                if (bytes.length > 0) {
                    System.out.print(" " + fieldName + " " + bytes.length + " ");
                    System.out.write(bytes);
                    System.out.println();
                }
            }
        }
    }
    reader.close();
}
From source file:org.toubassi.femtozip.lucene.StoredFieldDumper.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    Map<String, OutputStream> output = new HashMap<String, OutputStream>();
    long lastStatusTime = 0;
    for (int docId = 0, count = reader.maxDoc(); docId < count; docId++) {
        Document doc = reader.document(docId);
        if (System.currentTimeMillis() - lastStatusTime > 5000) {
            lastStatusTime = System.currentTimeMillis();
            System.out.println("Processing docId " + docId + " of " + count);
        }
        for (String fieldName : fieldNames) {
            Field[] fields = doc.getFields(fieldName);
            for (Field field : fields) {
                if (!field.isStored() || field.isCompressed()) {
                    // TODO if it's compressed, uncompress it and benchmark it.
                    continue;
                }
                byte[] bytes;
                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }
                OutputStream out = output.get(fieldName);
                if (out == null) {
                    FileOutputStream fileOut = new FileOutputStream(outputBasePath + "_" + fieldName);
                    out = new BufferedOutputStream(fileOut);
                    output.put(fieldName, out);
                }
                out.write(bytes);
            }
        }
    }
    reader.close();
    for (Map.Entry<String, OutputStream> entry : output.entrySet()) {
        entry.getValue().close();
    }
}
From source file:org.toubassi.femtozip.lucene.StoredFieldExploder.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    int numProcessed = 0;
    for (int docId = 0, count = reader.maxDoc(); docId < count && numProcessed < numSamples; docId++) {
        if (reader.isDeleted(docId)) {
            continue;
        }
        Document doc = reader.document(docId);
        Field field = doc.getField(fieldName);
        if (field != null) {
            FileOutputStream out = new FileOutputStream(
                    outputBasePath + File.separator + (numProcessed + 1) + "." + fieldName);
            if (field.isBinary()) {
                out.write(field.getBinaryValue(), field.getBinaryOffset(), field.getBinaryLength());
            } else {
                out.write(field.stringValue().getBytes("UTF-8"));
            }
            out.close();
            numProcessed++;
        }
    }
    reader.close();
}
From source file:org.wandora.indexer.AbstractIndexBuilder.java
License:Open Source License
public Set getDependentTopics(String topic, IndexReader reader) throws IOException {
    TermDocs docs = reader.termDocs(new Term("topic", topic));
    while (docs.next()) {
        Document doc = reader.document(docs.doc());
        String type = doc.get("type");
        if (type != null && type.equals("topic")) {
            String d = doc.get("dependent");
            StringTokenizer st = new StringTokenizer(d, "\n");
            Set s = new HashSet();
            while (st.hasMoreTokens()) {
                s.add(st.nextToken());
            }
            return s;
        }
    }
    return null;
}
From source file:org.xcmis.search.lucene.index.PersistedIndex.java
License:Open Source License
private Document getDocument(final String uuid, final IndexReader reader) throws IndexException {
    try {
        final TermDocs termDocs = reader.termDocs(new Term(FieldNames.UUID, uuid));
        if (termDocs.next()) {
            final Document document = reader.document(termDocs.doc());
            if (termDocs.next()) {
                throw new IndexException("More than one document found for uuid:" + uuid);
            }
            return document;
        }
    } catch (final IOException e) {
        throw new IndexException(e.getLocalizedMessage(), e);
    }
    return null;
}
From source file:org.xcmis.search.lucene.LuceneQueryableIndexStorage.java
License:Open Source License
protected Document getDocument(String uuid, IndexReader reader) throws IndexException {
    try {
        if (reader != null) {
            final TermDocs termDocs = reader.termDocs(new Term(FieldNames.UUID, uuid));
            if (termDocs.next()) {
                final Document document = reader.document(termDocs.doc());
                if (termDocs.next()) {
                    throw new IndexException("More than one document found for uuid:" + uuid);
                }
                return document;
            }
        }
    } catch (final IOException e) {
        throw new IndexException(e.getLocalizedMessage(), e);
    }
    return null;
}
From source file:org.zenoss.zep.index.impl.EventIndexDaoImplIT.java
License:Open Source License
private Set<String> getFieldNames(LuceneEventIndexBackend backend, String eventUuid) throws IOException {
    IndexWriter indexWriter = (IndexWriter) ReflectionTestUtils.getField(backend, "writer");
    IndexReader reader = null;
    try {
        reader = IndexReader.open(indexWriter, true);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs docs = searcher.search(new TermQuery(new Term(IndexConstants.FIELD_UUID, eventUuid)), null, 1);
        assertEquals(1, docs.totalHits);
        int docId = docs.scoreDocs[0].doc;
        Document document = reader.document(docId);
        Set<String> fieldNames = Sets.newHashSet();
        for (IndexableField field : document.getFields()) {
            fieldNames.add(field.name());
        }
        return fieldNames;
    } finally {
        ZepUtils.close(reader);
    }
}
From source file:org.zilverline.core.AbstractCollection.java
License:Open Source License
/**
 * Initialize this collection by getting its index. It retrieves the number
 * of documents and the MD5 hash of all documents in the collection.
 *
 * If the index does not exist (this is a new Collection) just return.
 *
 * @throws IndexException
 *             when the existing index of the Collection can not be successfully
 *             opened.
 */
public final void init() throws IndexException {
    log.debug("Initializing collection " + name);
    IndexReader index = null;
    // Determine whether the collection exists on disk
    setExistsOnDisk();
    // check whether this collection has a cache for the MD5 hashes of documents
    if (md5DocumentCache == null) {
        md5DocumentCache = new HashSet();
    }
    // check whether this collection has a cache for the MD5 hashes of indexed archives
    if (archiveCache == null) {
        archiveCache = new HashSet();
    }
    if (!isIndexValid()) {
        log.info("Index does not exist (yet) for collection '" + name + "'. Possibly new collection.");
        numberOfDocs = 0;
        return;
    }
    // Get the index
    File thisIndex = getIndexDirWithManagerDefaults();
    try {
        index = IndexReader.open(thisIndex);
        if (index != null) {
            numberOfDocs = index.numDocs();
            // retrieve all hashes of Documents from the cache
            md5DocumentCache.clear();
            for (int i = 0; i < numberOfDocs; i++) {
                Document d = index.document(i);
                String hashValue = d.get("hash");
                md5DocumentCache.add(hashValue);
            }
            // get some relevant information from the index
            version = IndexReader.getCurrentVersion(thisIndex); // deprecated, but needed
            lastIndexed = new Date(IndexReader.lastModified(thisIndex));
            log.debug("Collection " + name + " has " + numberOfDocs + " documents, index created at: "
                    + lastIndexed);
        } else {
            log.error("Index could not be retrieved for collection " + name);
        }
    } catch (IOException e) {
        throw new IndexException("Error initializing collection '" + name + "'", e);
    } finally {
        if (index != null) {
            try {
                index.close();
            } catch (IOException e1) {
                log.error("Error closing index for collection " + name, e1);
            }
        } else {
            numberOfDocs = 0;
            version = 0;
            lastIndexed = null;
        }
    }
}
From source file:perf.PKLookupPerfTest3X.java
License:Apache License
public static void main(String[] args) throws IOException {
    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);

    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }

    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }

    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);

    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }

    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);

    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println(" Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);

        final AtomicBoolean failed = new AtomicBoolean(false);
        final Term t = new Term("id", "");
        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);

        // cycle 0 is for warming
        //System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");

        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }

        if (DO_DOC_LOOKUP) {
            System.out.println(" Verify...");
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }

    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");
    r.close();
    dir.close();
}