Usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
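maxDoc() returns one greater than the largest document number in the index; deleted documents still count toward this total. Below is a minimal sketch (not taken from the examples that follow) of the typical "iterate all stored documents" pattern, written against the Lucene 4.x API used in the LIRE examples; older versions expose IndexReader.isDeleted(int) instead of live-docs bits, as the jclal and indexer examples show. The index path is a placeholder.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class MaxDocExample {
    public static void main(String[] args) throws IOException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        // null if the index contains no deletions
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (liveDocs != null && !liveDocs.get(i)) {
                continue; // skip deleted documents
            }
            Document doc = reader.document(i);
            System.out.println(doc);
        }
        reader.close();
    }
}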
From source file:net.semanticmetadata.lire.TestImageSearcher.java
License:Open Source License
public void testCachingSearcher() throws IOException {
    IndexReader ir = DirectoryReader.open(FSDirectory.open(new File("C:\\Temp\\test-100k-cedd-idx")));
    GenericFastImageSearcher is = new GenericFastImageSearcher(1, CEDD.class, true, ir);
    SingleNddCeddImageSearcher nis = new SingleNddCeddImageSearcher(ir);
    LinkedList<Document> q = new LinkedList<Document>();
    for (int i = 0; i < Math.min(1000, ir.maxDoc()); i++) {
        q.add(ir.document(i));
    }
    long time = System.currentTimeMillis();
    int count = 0;
    for (Iterator<Document> iterator = q.iterator(); iterator.hasNext();) {
        Document next = iterator.next();
        String id = is.search(next, ir).doc(0).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        CEDD cedd = new CEDD();
        BytesRef binaryValue = next.getBinaryValue(cedd.getFieldName());
        cedd.setByteArrayRepresentation(binaryValue.bytes, binaryValue.offset, binaryValue.length);
        String s = nis.findMostSimilar(cedd).getDocument().getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        String qID = next.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
        System.out.println(s.equals(id) + " " + id.equals(qID) + " " + qID.equals(s));
        count++;
        if (count > 100)
            break;
    }
    long l = System.currentTimeMillis() - time;
    System.out.printf("Tested %d search requests on %d documents: overall time of %d:%02d, %.2f ms per search",
            count, ir.maxDoc(), l / (1000 * 60), (l / 1000) % 60, ((float) l / (float) count));
}
From source file:net.semanticmetadata.lire.TestImageSearcher.java
License:Open Source License
public void testCustomCachingSearcher() throws IOException {
    IndexReader ir = DirectoryReader.open(FSDirectory.open(new File("C:\\Temp\\test-100k-cedd-idx")));
    SingleNddCeddImageSearcher is = new SingleNddCeddImageSearcher(ir);
    LinkedList<LireFeature> q = new LinkedList<LireFeature>();
    for (int i = 0; i < ir.maxDoc(); i++) {
        Document d = ir.document(i);
        CEDD cedd = new CEDD();
        BytesRef binaryValue = d.getBinaryValue(cedd.getFieldName());
        cedd.setByteArrayRepresentation(binaryValue.bytes, binaryValue.offset, binaryValue.length);
        q.add(cedd);
    }
    long time = System.currentTimeMillis();
    int count = 0;
    for (Iterator<LireFeature> iterator = q.iterator(); iterator.hasNext();) {
        LireFeature next = iterator.next();
        is.findMostSimilar(next);
        count++;
        if (count > 100)
            break;
    }
    long l = System.currentTimeMillis() - time;
    System.out.printf("Tested %d search requests on %d documents: overall time of %d:%02d, %.2f ms per search",
            count, ir.maxDoc(), l / (1000 * 60), (l / 1000) % 60, ((float) l / (float) count));
}
From source file:net.semanticmetadata.lire.TestImageSearcher.java
License:Open Source License
public void testCachingSearcherParallel() throws IOException, InterruptedException {
    final IndexReader ir = DirectoryReader.open(FSDirectory.open(new File("C:\\Temp\\test-100k-cedd-idx")));
    SingleNddCeddImageSearcher is = new SingleNddCeddImageSearcher(ir);
    LinkedList<LireFeature> q = new LinkedList<LireFeature>();
    for (int i = 0; i < ir.maxDoc(); i++) {
        Document d = ir.document(i);
        CEDD cedd = new CEDD();
        BytesRef binaryValue = d.getBinaryValue(cedd.getFieldName());
        cedd.setByteArrayRepresentation(binaryValue.bytes, binaryValue.offset, binaryValue.length);
        q.add(cedd);
    }
    int count = 0;
    Thread[] searchers = new Thread[3];
    final LinkedBlockingQueue<LireFeature> queryQueue = new LinkedBlockingQueue<LireFeature>(1000);
    for (int i = 0; i < searchers.length; i++) {
        searchers[i] = new Thread(new Runnable() {
            @Override
            public void run() {
                SingleNddCeddImageSearcher is = new SingleNddCeddImageSearcher(ir);
                LireFeature remove;
                while ((remove = queryQueue.remove()) instanceof CEDD) {
                    try {
                        is.findMostSimilar(remove);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        });
        searchers[i].start();
    }
    long time = System.currentTimeMillis();
    for (Iterator<LireFeature> iterator = q.iterator(); iterator.hasNext() && count < 1000;) {
        LireFeature next = iterator.next();
        try {
            queryQueue.put(next);
            count++;
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    for (int i = 0; i < 8; i++) {
        queryQueue.put(new ScalableColor());
    }
    for (int i = 0; i < searchers.length; i++) {
        searchers[i].join();
    }
    long l = System.currentTimeMillis() - time;
    System.out.printf("Tested %d search requests on %d documents: overall time of %d:%02d, %.2f ms per search",
            count, ir.maxDoc(), l / (1000 * 60), (l / 1000) % 60, ((float) l / (float) count));
}
From source file:net.semanticmetadata.lire.TestImageSearcher.java
License:Open Source License
public void testCachingSearcherParallelWithBundling() throws IOException, InterruptedException {
    final IndexReader ir = DirectoryReader.open(FSDirectory.open(new File("C:\\Temp\\test-100k-cedd-idx")));
    LinkedList<LireFeature> q = new LinkedList<LireFeature>();
    for (int i = 0; i < ir.maxDoc(); i++) {
        Document d = ir.document(i);
        CEDD cedd = new CEDD();
        BytesRef binaryValue = d.getBinaryValue(cedd.getFieldName());
        cedd.setByteArrayRepresentation(binaryValue.bytes, binaryValue.offset, binaryValue.length);
        q.add(cedd);
    }
    int count = 0;
    Thread[] searchers = new Thread[4];
    final LinkedBlockingQueue<WorkItem> queryQueue = new LinkedBlockingQueue<WorkItem>(100);
    for (int i = 0; i < searchers.length; i++) {
        searchers[i] = new Thread(new Runnable() {
            @Override
            public void run() {
                SingleNddCeddImageSearcher is = new SingleNddCeddImageSearcher(ir);
                WorkItem remove;
                while ((remove = queryQueue.remove()).features != null) {
                    try {
                        SimpleResult[] hits = is.findMostSimilar(remove.features);
                        for (int j = 0; j < hits.length; j++) {
                            if (hits[j].getIndexNumber() != remove.id[j])
                                System.err.println("oops");
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        });
        searchers[i].start();
    }
    long time = System.currentTimeMillis();
    LireFeature[] qarr = new LireFeature[10];
    int[] iarr = new int[10];
    int currentIndex = 0;
    int bundleCount = 0;
    Iterator<LireFeature> iterator = q.iterator();
    while (iterator.hasNext() && bundleCount < 200) {
        LireFeature next = iterator.next();
        try {
            iarr[currentIndex] = count;
            qarr[currentIndex++] = next;
            if (currentIndex >= qarr.length) { // do bundled search
                currentIndex = 0;
                queryQueue.put(new WorkItem(qarr.clone(), iarr.clone()));
                bundleCount++;
            }
            count++;
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    for (int i = 0; i < 8; i++) {
        queryQueue.put(new WorkItem(null, null));
    }
    for (int i = 0; i < searchers.length; i++) {
        searchers[i].join();
    }
    long l = System.currentTimeMillis() - time;
    System.out.printf("Tested %d search requests on %d documents: overall time of %d:%02d, %.2f ms per search",
            count, ir.maxDoc(), l / (1000 * 60), (l / 1000) % 60, ((float) l / (float) count));
}
From source file:net.semanticmetadata.lire.TestImageSearcher.java
License:Open Source License
public void testCachingSearcherBundling() throws IOException {
    IndexReader ir = DirectoryReader.open(FSDirectory.open(new File("C:\\Temp\\test-100k-cedd-idx")));
    SingleNddCeddImageSearcher is = new SingleNddCeddImageSearcher(ir);
    LinkedList<LireFeature> q = new LinkedList<LireFeature>();
    for (int i = 0; i < ir.maxDoc(); i++) {
        Document d = ir.document(i);
        CEDD cedd = new CEDD();
        BytesRef binaryValue = d.getBinaryValue(cedd.getFieldName());
        cedd.setByteArrayRepresentation(binaryValue.bytes, binaryValue.offset, binaryValue.length);
        q.add(cedd);
    }
    long time = System.currentTimeMillis();
    int count = 0;
    LireFeature[] qarr = new LireFeature[10];
    int currentIndex = 0;
    for (Iterator<LireFeature> iterator = q.iterator(); iterator.hasNext();) {
        LireFeature next = iterator.next();
        qarr[currentIndex++] = next;
        if (currentIndex >= qarr.length) { // do bundled search
            currentIndex = 0;
            is.findMostSimilar(qarr);
        }
        count++;
        if (count > 999 & currentIndex == 0)
            break;
    }
    long l = System.currentTimeMillis() - time;
    System.out.printf("Tested %d search requests on %d documents: overall time of %d:%02d, %.2f ms per search",
            count, ir.maxDoc(), l / (1000 * 60), (l / 1000) % 60, ((float) l / (float) count));
}
From source file:net.sf.jclal.util.dataset.LuceneIndexToWekaDataSet.java
License:Open Source License
/**
 * Converts a Lucene index file to a Weka file for classification. The Weka
 * class attribute is nominal, so the classifiers will work with a nominal
 * class.
 *
 * @param wekaFileName Path of the Weka file.
 * @param indexFile    Path of the Lucene-based index file. The indexed documents
 *                     must have fields called "class" and "content". WARNING: the
 *                     fields must not contain any punctuation signs.
 *
 * @return Weka instances. The instances are sparse since they hold text information.
 *
 * @throws FileNotFoundException If the file does not exist.
 * @throws IOException           If an error happens while writing the file.
 */
public Instances convertLuceneToWekaClassification(String wekaFileName, String indexFile)
        throws FileNotFoundException, IOException {
    File nuevo = new File(wekaFileName);
    if (!verify(nuevo)) {
        return null;
    }
    FileUtil.writeFile(nuevo, "@RELATION " + nuevo.getName() + doubleLine);

    IndexSearcher searcher = new IndexSearcher(indexFile);
    IndexReader reader = searcher.getIndexReader();

    int total = reader.maxDoc();
    HashMap<String, Integer> terms = new HashMap<String, Integer>(total * 2);
    Set<String> labels = new HashSet<String>(total * 2);
    int i;
    for (int l = 0; l < total; l++) {
        if (!reader.isDeleted(l)) {
            TermFreqVector vector = reader.getTermFreqVector(l, content);
            Document doc = reader.document(l);
            String current = doc.getField(classF).stringValue();
            labels.add(current);
            if (vector != null) {
                String listosI[] = vector.getTerms();
                for (i = 0; i < listosI.length; i++) {
                    if (!terms.containsKey(listosI[i])) {
                        terms.put(listosI[i], terms.size());
                    }
                }
            }
        }
    }

    String[] labelReady = new String[labels.size()];
    int posLabel = 0;
    for (String string : labels) {
        labelReady[posLabel] = string;
        posLabel++;
    }

    Container[] terminos = convertir(terms);
    Arrays.sort(terminos);
    for (int j = 0; j < terminos.length; j++) {
        FileUtil.writeFile(nuevo, "@ATTRIBUTE " + (int) terminos[j].getKey() + " NUMERIC" + "\n");
    }
    FileUtil.writeFile(nuevo, "@ATTRIBUTE class {");
    for (int j = 0; j < labelReady.length - 1; j++) {
        FileUtil.writeFile(nuevo, labelReady[j] + ",");
    }
    FileUtil.writeFile(nuevo, labelReady[labelReady.length - 1] + "}" + doubleLine);
    FileUtil.writeFile(nuevo, "@DATA\n");

    for (int pos = 0; pos < searcher.maxDoc(); pos++) {
        if (!reader.isDeleted(pos)) {
            TermFreqVector vector = reader.getTermFreqVector(pos, content);
            if (vector != null) {
                int[] origen = vector.getTermFrequencies();
                String[] termsI = vector.getTerms();
                int[] positions = new int[origen.length];
                for (int k = 0; k < origen.length; k++) {
                    positions[k] = terms.get(termsI[k]);
                }
                Container[] escribir = convertir(positions, origen);
                Arrays.sort(escribir);
                FileUtil.writeFile(nuevo, "{");
                for (int j = 0; j < escribir.length; j++) {
                    FileUtil.writeFile(nuevo, (int) escribir[j].getKey() + " " + escribir[j].getValue() + ",");
                }
                FileUtil.writeFile(nuevo,
                        terms.size() + " " + searcher.doc(pos).getField(classF).stringValue() + "}\n");
            }
        }
    }

    // close files
    closeReaders(searcher, reader);

    // Test if the weka file works
    Instances test = testWekaFile(wekaFileName);

    return test;
}
From source file:net.sf.jclal.util.dataset.LuceneIndexToWekaDataSet.java
License:Open Source License
/**
 * Converts a Lucene index file to a Weka file for regression. The Weka
 * class attribute is real, so the classifiers used will work with a
 * numeric real class.
 *
 * @param wekaFileName Path of the Weka file.
 * @param indexFile    Path of the Lucene-based index file. The indexed documents
 *                     must have fields called "class" and "content". WARNING: the
 *                     fields must not contain any punctuation signs.
 *
 * @return Weka instances. The instances are sparse since they hold text information.
 *
 * @throws FileNotFoundException If the file does not exist.
 * @throws IOException           If an error happens while writing the file.
 */
public Instances convertLuceneToWekaRegression(String wekaFileName, String indexFile)
        throws FileNotFoundException, IOException {
    File nuevo = new File(wekaFileName);
    if (!verify(nuevo)) {
        return null;
    }
    FileUtil.writeFile(nuevo, "@RELATION " + nuevo.getName() + doubleLine);

    IndexSearcher searcher = new IndexSearcher(indexFile);
    IndexReader reader = searcher.getIndexReader();

    int total = reader.maxDoc();
    HashMap<String, Integer> terms = new HashMap<String, Integer>(total * 2);
    HashMap<String, Integer> labels = new HashMap<String, Integer>(total * 2);
    int i;
    for (int l = 0; l < total; l++) {
        if (!reader.isDeleted(l)) {
            TermFreqVector vector = reader.getTermFreqVector(l, content);
            Document doc = reader.document(l);
            String current = doc.getField(classF).stringValue();
            if (!labels.containsKey(current)) {
                labels.put(current, labels.size());
            }
            if (vector != null) {
                String listosI[] = vector.getTerms();
                for (i = 0; i < listosI.length; i++) {
                    if (!terms.containsKey(listosI[i])) {
                        terms.put(listosI[i], terms.size());
                    }
                }
            }
        }
    }

    Container[] terminos = convertir(terms);
    Arrays.sort(terminos);
    for (int j = 0; j < terminos.length; j++) {
        FileUtil.writeFile(nuevo, "@ATTRIBUTE " + (int) terminos[j].getKey() + " NUMERIC" + "\n");
    }
    FileUtil.writeFile(nuevo, "@ATTRIBUTE class REAL [0.0,");
    FileUtil.writeFile(nuevo, (labels.size() - 1) + ".0]" + doubleLine);
    FileUtil.writeFile(nuevo, "@DATA\n");

    for (int pos = 0; pos < searcher.maxDoc(); pos++) {
        if (!reader.isDeleted(pos)) {
            TermFreqVector vector = reader.getTermFreqVector(pos, content);
            if (vector != null) {
                int[] origen = vector.getTermFrequencies();
                String[] termsI = vector.getTerms();
                int[] positions = new int[origen.length];
                for (int k = 0; k < origen.length; k++) {
                    positions[k] = terms.get(termsI[k]);
                }
                Container[] escribir = convertir(positions, origen);
                Arrays.sort(escribir);
                FileUtil.writeFile(nuevo, "{");
                for (int j = 0; j < escribir.length; j++) {
                    FileUtil.writeFile(nuevo, (int) escribir[j].getKey() + " " + escribir[j].getValue() + ",");
                }
                FileUtil.writeFile(nuevo, terms.size() + " "
                        + labels.get(searcher.doc(pos).getField(classF).stringValue()) + ".0}\n");
            }
        }
    }

    // close files
    closeReaders(searcher, reader);

    // Test if the weka file works
    Instances test = testWekaFile(wekaFileName);

    return test;
}
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private Date getLatestEntryDate(ILogResource log) throws CoreException {
    if (!hasDateComponent(log)) {
        return null;
    }
    ARunWithIndexReader<Date> runnable = new ARunWithIndexReader<Date>() {

        /* (non-Javadoc)
         * @see net.sf.logsaw.index.impl.ARunWithIndexReader#doRunWithIndexReader(org.apache.lucene.index.IndexReader, net.sf.logsaw.core.framework.ILogResource)
         */
        @Override
        protected Date doRunWithIndexReader(IndexReader reader, ILogResource log) throws CoreException {
            if (reader == null) {
                // Index does not exist yet
                return null;
            }
            int i = reader.maxDoc();
            if (i > 0) {
                try {
                    Document doc = reader.document(i - 1);
                    String val = doc.get(log.getDialect().getFieldProvider().getTimestampField().getKey());
                    return log.getDialect().getFieldProvider().getTimestampField().fromIndexedValue(val);
                } catch (IOException e) {
                    // Unexpected exception; wrap with CoreException
                    throw new CoreException(new Status(IStatus.ERROR, IndexPlugin.PLUGIN_ID,
                            NLS.bind(Messages.LuceneIndexService_error_failedToReadIndex,
                                    new Object[] { log.getName(), e.getLocalizedMessage() }), e));
                }
            }
            return null;
        }
    };
    return runnable.runWithIndexReader(log);
}
From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        IQuranLocation loc = new QuranLocation(doc.getField("location").stringValue());
        if (searchScope.includes(loc)) {
            bits.set(i);
        }
    }
    return new DocIdBitSet(bits);
}
From source file:nl.elucidator.maven.analyzer.indexer.IndexSearcher.java
License:Apache License
public Set<ArtifactInfo> getUniqueGAV() throws IOException, ComponentLookupException {
    IndexingContext centralContext = indexUpdater.getIndexContext();
    centralContext.lock();

    Set<ArtifactInfo> artifactInfoSet = new HashSet<ArtifactInfo>();
    try {
        final IndexReader ir = centralContext.getIndexReader();
        for (int i = 0; i < ir.maxDoc(); i++) {
            if (!ir.isDeleted(i)) {
                final Document doc = ir.document(i);
                final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                if (ai != null) {
                    artifactInfoSet.add(ai);
                }
            }
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        centralContext.unlock();
    }
    return artifactInfoSet;
}