List of usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
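maxDoc() returns one greater than the largest possible document number in the index, so valid document IDs run from 0 to maxDoc()-1 and include slots left by deleted documents. Below is a minimal usage sketch, assuming Lucene 8.x; the index path is a placeholder, and skipping deleted documents via MultiBits is illustrative rather than taken from any example on this page:

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at a real Lucene index
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            Bits liveDocs = MultiBits.getLiveDocs(reader); // null when the index has no deletions
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (liveDocs != null && !liveDocs.get(i))
                    continue; // skip deleted document slots
                Document doc = reader.document(i);
                // ... process the stored fields of doc ...
            }
        }
    }
}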
From source file:it.unibz.instasearch.indexing.SearchResultDoc.java
License:Open Source License
private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
    if (vect == null)
        return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;

    for (int i = 0; i < terms.length; i++) {
        String termText = terms[i];
        Term term = new Term(Field.CONTENTS.toString(), termText);

        float termFreq = sim.tf(termFrequencies[i]);
        int docFreq = reader.docFreq(term);
        float idf = sim.idf(docFreq, numDocs);

        float tfIdf = termFreq * idf;
        scores[i] = tfIdf;
    }
    return scores;
}
From source file:lia.chapter2.IndexingTest.java
License:Apache License
@Test
public void testIndexReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc()); // 8
    assertEquals(ids.length, reader.numDocs()); // 8
    reader.close();
}
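Note that maxDoc() counts every allocated document slot, deletions included, while numDocs() counts only live documents; the two assertions agree here only because nothing has been deleted from the test index.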
From source file:lia.chapter5.CategorizerTest.java
License:Apache License
private void buildCategoryVectors() throws IOException {
    IndexSearcher searcher = Utils.getBookIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    int maxDoc = reader.maxDoc();
    System.out.println(maxDoc);

    for (int i = 0; i < maxDoc; i++) {
        Document doc = reader.document(i);
        String category = doc.get("category");
        System.out.println("\n" + doc.get("subject") + "\n");

        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
            vectorMap = new TreeMap();
            categoryMap.put(category, vectorMap);
        }

        Terms termsVector = reader.getTermVector(i, "subject");
        addTermFreqToMap(vectorMap, termsVector);
    }
}
From source file:liredemo.flickr.TestParallelIndexer.java
License:Open Source License
/**
 * Delete all Fields besides the ones needed.
 *
 * @throws IOException
 */
public void testReduceIndex() throws IOException {
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("./index-mirflickr")));
    IndexWriter writer = LuceneUtils.createIndexWriter(FSDirectory.open(new File("./mirflickr-data-vw")),
            true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    // IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // IndexWriter writer = LuceneUtils.createIndexWriter(indexPath + "-reduced", true);
    int maxDocs = reader.maxDoc();
    Document d;
    for (int i = 0; i < maxDocs; i++) {
        /*
        if (!reader.isDeleted(0)) {
            d = reader.document(i);
            Document writeDoc = new Document();
            writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_CEDD));
            // writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_IDENTIFIER));
            writeDoc.add(d.getFieldable("tags"));
            writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_SURF_LOCAL_FEATURE_HISTOGRAM_VISUAL_WORDS));
            writer.addDocument(writeDoc);
        }
        */
    }
    writer.close();
}
From source file:liredemo.flickr.TestParallelIndexer.java
License:Open Source License
public void testSearchTime() throws IOException {
    ImageSearcher ceddImageSearcher = new VisualWordsImageSearcher(100,
            DocumentBuilder.FIELD_NAME_SURF + DocumentBuilder.FIELD_NAME_BOVW);
    // ImageSearcher ceddImageSearcher = ImageSearcherFactory.createCEDDImageSearcher(100);
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // IndexReader reader = IndexReader.open(new RAMDirectory(FSDirectory.open(new File(indexPath + "-reduced"))));
    System.out.println("reader.maxDoc() = " + reader.maxDoc());
    for (int i = 0; i < 10; i++) {
        long ms = System.currentTimeMillis();
        ceddImageSearcher.search(reader.document(0), reader);
        System.out.println("s = " + (double) (System.currentTimeMillis() - ms) / 1000d);
    }
}
From source file:lucandra.LucandraFilter.java
License:Apache License
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet result = new OpenBitSet(reader.maxDoc());

    Map<Integer, String> filterMap = ((lucandra.IndexReader) reader).getDocIndexToDocId();
    List<String> filteredValues = new ArrayList<String>();
    for (Map.Entry<Integer, String> entry : filterMap.entrySet()) {
        filteredValues.add(entry.getValue());
    }

    if (filteredValues.size() == 0)
        return null;

    LucandraTermDocs termDocs = (LucandraTermDocs) reader.termDocs();

    for (Term term : terms) {
        List<ColumnOrSuperColumn> terms = termDocs.filteredSeek(term, filteredValues);

        // This is a conjunction and at least one value must match
        if (terms == null)
            return null;

        while (termDocs.next()) {
            result.set(termDocs.doc());
        }
    }

    termDocs.close();
    return result;
}
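Sizing the OpenBitSet with maxDoc(), as above, is the usual pattern for per-document bitsets: document numbers are bounded by maxDoc(), so every doc ID the filter can encounter fits in the set.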
From source file:lucee.runtime.search.lucene2.LuceneSearchCollection.java
License:Open Source License
/**
 * @param id
 * @param keyColumn
 * @return
 * @throws SearchException
 */
protected IndexResult _deleteCustom(String id, QueryColumn keyColumn) throws SearchException {
    int countBefore = 0;
    int countAfter = 0;
    Map<String, Document> docs = new HashMap<String, Document>();
    Set<String> keys = toSet(keyColumn);
    IndexWriter writer = null;
    String key;
    IndexReader reader = null;
    Document doc;

    synchronized (token) {
        try {
            try {
                reader = _getReader(id, false);
                countBefore = reader.maxDoc();
                for (int i = 0; i < countBefore; i++) {
                    doc = reader.document(i);
                    key = doc.getField("key").stringValue();
                    if (!keys.contains(key))
                        docs.put(key, doc);
                }
            } catch (Exception e) {
            } finally {
                close(reader);
            }
            countAfter = docs.size();

            writer = _getWriter(id, true);
            Iterator<Entry<String, Document>> it = docs.entrySet().iterator();
            while (it.hasNext()) {
                writer.addDocument(it.next().getValue());
            }
            optimizeEL(writer);
        } catch (IOException e) {
            throw new SearchException(e);
        } finally {
            close(writer);
        }
        indexSpellCheck(id);
    }
    int removes = countBefore - countAfter;
    return new IndexResultImpl(removes, 0, 0);
}
From source file:lucee.runtime.search.lucene2.LuceneSearchCollection.java
License:Open Source License
/**
 * @param id
 * @param title
 * @param keyColumn
 * @param bodyColumns
 * @param language
 * @param urlpath
 * @param custom1
 * @param custom2
 * @param custom3
 * @param custom4
 * @return
 * @throws SearchException
 */
protected IndexResult _indexCustom(String id, Object title, QueryColumn keyColumn, QueryColumn[] bodyColumns,
        String language, Object urlpath, Object custom1, Object custom2, Object custom3, Object custom4)
        throws SearchException {
    _checkLanguage(language);
    String t;
    String url;
    String c1;
    String c2;
    String c3;
    String c4;
    int countExisting = 0;
    int countAdd = keyColumn.size();
    int countNew = 0;
    Map<String, Document> docs = new HashMap<String, Document>();
    IndexWriter writer = null;

    synchronized (token) {
        try {
            // read existing reader
            IndexReader reader = null;
            try {
                reader = _getReader(id, false);
                int len = reader.maxDoc();
                Document doc;
                for (int i = 0; i < len; i++) {
                    doc = reader.document(i);
                    docs.put(doc.getField("key").stringValue(), doc);
                }
            } catch (Exception e) {
            } finally {
                close(reader);
            }
            countExisting = docs.size();

            writer = _getWriter(id, true);
            int len = keyColumn.size();
            String key;
            for (int i = 1; i <= len; i++) {
                key = Caster.toString(keyColumn.get(i, null), null);
                if (key == null)
                    continue;

                StringBuilder body = new StringBuilder();
                for (int y = 0; y < bodyColumns.length; y++) {
                    Object tmp = bodyColumns[y].get(i, null);
                    if (tmp != null) {
                        body.append(tmp.toString());
                        body.append(' ');
                    }
                }

                //t=(title==null)?null:Caster.toString(title.get(i,null),null);
                //url=(urlpath==null)?null:Caster.toString(urlpath.get(i,null),null);
                t = getRow(title, i);
                url = getRow(urlpath, i);
                c1 = getRow(custom1, i);
                c2 = getRow(custom2, i);
                c3 = getRow(custom3, i);
                c4 = getRow(custom4, i);

                docs.put(key, CustomDocument.getDocument(t, key, body.toString(), url, c1, c2, c3, c4));
            }
            countNew = docs.size();

            Iterator<Entry<String, Document>> it = docs.entrySet().iterator();
            Entry<String, Document> entry;
            Document doc;
            while (it.hasNext()) {
                entry = it.next();
                doc = entry.getValue();
                writer.addDocument(doc);
            }
            optimizeEL(writer);
            //writer.optimize();
        } catch (IOException ioe) {
            throw new SearchException(ioe);
        } finally {
            close(writer);
        }
        indexSpellCheck(id);
    }
    int inserts = countNew - countExisting;
    return new IndexResultImpl(0, inserts, countAdd - inserts);
}
From source file:lucenesearch.Mallet.java
public void getMalletAllOutput() throws IOException {
    String index = new Searcher().getPostIndexPath();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));

    PrintWriter pw = new PrintWriter("./data/mallet_all.txt");
    StringBuilder sb = new StringBuilder();

    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        System.out.println("Doc " + i);
        ArrayList<String> res = LuceneUtils.getAnalyzedRemoveHtml(doc.get("Body"));
        int id = Integer.parseInt(doc.get("SId"));
        sb = new StringBuilder();
        sb.append(id);
        sb.append("\t");
        for (String re : res) {
            re = re.replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("<.+?>", "")
                    .replaceAll(" +", " ").replaceAll("[^\\x00-\\x7F]", " ").trim();
            sb.append(re).append(" ");
        }
        sb.append("\n");
        pw.print(sb.toString());
    }
    pw.close();
}
From source file:lucenetools.TermData.java
License:Apache License
/**
 * Main application.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) {
    Options opts = new Options();
    CommandLine commandLine = new CommandLine();

    // if no command line options specified, user wants help
    if (0 == args.length) {
        commandLine.showHelp();
        System.exit(0);
    }

    // extract command line args and store in opts
    if (!commandLine.parse(args, opts))
        System.exit(1);

    if (opts.showHelp) {
        commandLine.showHelp();
        System.exit(0);
    }

    // validate all command line options
    if (!commandLine.isValid(opts))
        System.exit(1);

    // report all command line options to the user
    System.out.println("\nLuceneToMtx version " + VERSION + ".");
    commandLine.printOpts(opts);

    long maxMemory = Runtime.getRuntime().maxMemory() / 1024 / 1024;
    System.out.println("Java runtime max memory: " + maxMemory + " MB.");

    // Build a map and assign a dictionary index to each term.
    // Include only those terms that survive the min term freq cutoff.
    Map<String, Integer> dictMap = new TreeMap<>();
    File file = null;

    System.out.println("Processing index...");
    try {
        file = new File(opts.indexDir);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(file));
        TermsEnum te = null;
        int nnz = 0, numCols = 0, maxDocs = reader.maxDoc();
        LinkedList<FeatureVector> matrixData = new LinkedList<>();

        // add other fields
        Collection<String> fields = new ArrayList<>();
        if (opts.fields > 0) {
            fields = MultiFields.getIndexedFields(reader);
            fields.remove(CONTENTSFIELD);
            fields.remove(PATHFIELD);
        }

        if (!extractTerms(reader, dictMap, opts.minTermFreq, maxDocs - 1, opts.maxTermPercentage))
            System.exit(1);

        // set of field names to extract
        Set<String> fieldSet = new HashSet<>();
        fieldSet.add(PATHFIELD);
        for (String s : fields) {
            fieldSet.add(s);
        }

        for (int i = 0; i < maxDocs; ++i) {
            // get term vector for next document
            Terms terms = reader.getTermVector(i, CONTENTSFIELD);
            if (terms == null)
                continue;

            te = terms.iterator(te);
            FeatureVector fv = new FeatureVector(numCols);
            int numEntries = buildFeatureVector(fv, te, dictMap);
            if (numEntries > 0) {
                // extract document path and save with FeatureVector
                Document doc = reader.document(i, fieldSet);
                fv.docPath = doc.get(PATHFIELD);

                // add any additional fields
                for (String s : fields) {
                    fv.fields.put(s, doc.get(s));
                }

                //System.out.println("processing document:" + fv.docPath);
                matrixData.add(fv);
                nnz += numEntries;
                ++numCols;
            }
        }

        // Sort the feature vectors by their document path field. Write
        // the matrix columns in this sorted order.
        Collections.sort(matrixData, new FeatureVectorComparator());

        File outdir = new File(opts.outDir);
        writeMatrixMarketFile(new File(outdir, MATRIXFILE), matrixData, dictMap.size(), numCols, nnz);
        System.out.println("Wrote " + MATRIXFILE + ".");

        writeDictionaryFile(new File(outdir, DICTFILE), dictMap);
        System.out.println("Wrote " + DICTFILE + ".");

        writeDocumentFile(new File(outdir, DOCFILE), matrixData);
        System.out.println("Wrote " + DOCFILE + ".");

        writeFieldFiles(outdir, fields, matrixData);
    } catch (IndexNotFoundException e) {
        if (null != file) {
            System.out.println("Lucene index not found in: " + file.getAbsolutePath());
        }
    } catch (IOException e) {
        System.out.println("LuceneToMtx exception: caught a " + e.getClass()
                + "\nMessage: " + e.getMessage());
    }
}