List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
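numDocs() returns the number of live (non-deleted) documents visible to the reader, in contrast to maxDoc(), which also counts deleted documents still present in the segments. A minimal usage sketch follows; the index path is hypothetical, and a DirectoryReader is assumed as the concrete IndexReader:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // hypothetical index location; replace with a real index directory
        try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
             IndexReader reader = DirectoryReader.open(dir)) {
            // numDocs(): live documents only; maxDoc() would also include deletions
            System.out.println("live docs:    " + reader.numDocs());
            System.out.println("deleted docs: " + reader.numDeletedDocs());
        }
    }
}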
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // add 1 to doc count to ensure even terms in all docs get a very small weight
        docCountLog = Math.log(reader.numDocs() + 1);

        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // since the same terms might be contained in the query multiple times (e.g. once per field)
        // we only consider them once each by using a HashSet
        terms = new HashSet<BytesRef>();
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.main.Searcher.java
public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    // get a reference to the index directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    // IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    // analyzer with the default stop words; removes the stop words
    Analyzer analyzer = new StandardAnalyzer();
    String contents = "contents";
    QueryParser parser = new QueryParser(contents, analyzer);

    // numDocs() gives the number of documents visible to this reader
    int numOfDoc = reader.numDocs();
    for (int i = 0; i < numOfDoc; i++) {
        Document d = reader.document(i);
    }

    Query q1 = parser.parse(s1);
    Query q2 = parser.parse(s2);

    // conjunction, disjunction and negation
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    if (radioBtn.equals("conjunction")) {
        // Occur.MUST: both queries are required in a doc
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST);
    } else if (radioBtn.equals("disjunction")) {
        // Occur.SHOULD: at least one of the queries should be present in a doc
        bq.add(q1, BooleanClause.Occur.SHOULD);
        bq.add(q2, BooleanClause.Occur.SHOULD);
    } else {
        // negation: the first query must be present, the second must not
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST_NOT);
    }

    TopDocs hits = searcher.search(bq.build(), 10);

    Formatter formatter = new SimpleHTMLFormatter();
    // scores content fragments by the number of unique query terms found
    QueryScorer scorer = new QueryScorer(bq.build());
    // used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // breaks content up into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
    highlighter.setTextFragmenter(fragmenter);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);
        String path = doc.get("path");
        bean.setPath(path);
        String cont = doc.get("contents");
        // create a token stream for the stored content
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        // get highlighted content fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);
        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {
            dummy.add(frag);
        }
        bean.setContent(dummy);
        beanList.add(bean);
    }
    dir.close();
    return beanList;
}
From source file:com.main.Searcher.java
public List<Bean> searching(String s1)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    // get a reference to the index directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    // IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);
    // IndexSearcher searches over a single IndexReader
    IndexSearcher searcher = new IndexSearcher(reader);
    // analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer();
    // query parser to be used for creating a TermQuery
    String queries = null;
    String queryString = null;
    // regular search
    String contents = "contents";
    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(contents, analyzer);

    // numDocs() gives the number of documents visible to this reader
    int numOfDoc = reader.numDocs();
    for (int i = 0; i < numOfDoc; i++) {
        Document d = reader.document(i);
    }

    Query q1 = parser.parse(s1);
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(q1, BooleanClause.Occur.MUST);

    // search the Lucene documents
    TopDocs hits = searcher.search(bq.build(), 10);
    // TopScoreDocCollector collector = TopScoreDocCollector.create(5);

    /* Highlighter code start */
    // uses HTML <B></B> tags to highlight the searched terms
    Formatter formatter = new SimpleHTMLFormatter();
    // scores content fragments by the number of unique query terms found
    // (the matching score, in layman's terms)
    QueryScorer scorer = new QueryScorer(bq.build());
    // used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // breaks content up into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
    highlighter.setTextFragmenter(fragmenter);

    // iterate over the found results
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);
        String path = doc.get("path");
        bean.setPath(path);
        String cont = doc.get("contents");
        // create a token stream for the stored content
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        // get highlighted content fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);
        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {
            dummy.add(frag);
        }
        bean.setContent(dummy);
        beanList.add(bean);
    }
    dir.close();
    return beanList;
}
From source file:com.mothsoft.alexis.dao.DocumentDaoImpl.java
License:Apache License
@SuppressWarnings("unchecked") private List<ImportantTerm> getImportantTerms(FullTextQuery fullTextQuery, int count, boolean filterStopWords) { final Long start = System.currentTimeMillis(); final List<Object[]> results = fullTextQuery.list(); final LinkedHashMap<String, Tuple<Integer, Float>> termCountMap = new LinkedHashMap<String, Tuple<Integer, Float>>(); final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate()); final SearchFactory searchFactory = fullTextSession.getSearchFactory(); final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor(); final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class); final IndexSearcher searcher = new IndexSearcher(reader); final List<ImportantTerm> importantTerms; final int numDocs; try {// ww w . j a v a 2s . c o m numDocs = reader.numDocs(); Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME); if (logger.isDebugEnabled()) { logger.debug(String.format("Found %d matching Lucene documents of %d in reader", results.size(), numDocs)); } // loop over all the matching documents for (final Object[] ith : results) { int docId = ((Number) ith[0]).intValue(); final TermFreqVector tfv = reader.getTermFreqVector(docId, CONTENT_TEXT_FIELD_NAME); if (tfv == null) { continue; } final String[] terms = tfv.getTerms(); final int[] freqs = tfv.getTermFrequencies(); // total document size int size = 0; for (int freq : freqs) { size += freq; } if (logger.isDebugEnabled()) { logger.debug( String.format("Lucene document %d has %d terms, to be merged with running count %d", docId, size, termCountMap.size())); } // loop over the terms and aggregate the counts and tf-idf int i = 0; for (final String term : terms) { if (StopWords.ENGLISH.contains(term)) { continue; } luceneTerm = luceneTerm.createTerm(term); final int termCount = freqs[i++]; final Tuple<Integer, Float> countScore; if (termCountMap.containsKey(term)) { countScore = termCountMap.get(term); countScore.t1 += termCount; countScore.t2 += (TFIDF.score(term, termCount, size, numDocs, searcher.docFreq(luceneTerm))); } else { countScore = new Tuple<Integer, Float>(); countScore.t1 = termCount; countScore.t2 = (TFIDF.score(term, termCount, size, numDocs, searcher.docFreq(luceneTerm))); termCountMap.put(term, countScore); } } } if (logger.isDebugEnabled()) { logger.debug("Completed Lucene document processing."); } importantTerms = new ArrayList<ImportantTerm>(termCountMap.size()); // find max TF-IDF float maxTfIdf = 0.0f; for (final Tuple<Integer, Float> ith : termCountMap.values()) { if (ith.t2 > maxTfIdf) { maxTfIdf = ith.t2; } } for (final Map.Entry<String, Tuple<Integer, Float>> entry : termCountMap.entrySet()) { final int ithCount = entry.getValue().t1; final float ithTfIdf = entry.getValue().t2; importantTerms.add(new ImportantTerm(entry.getKey(), ithCount, ithTfIdf, maxTfIdf)); } if (logger.isDebugEnabled()) { logger.debug("Completed term aggregation, will clear term map"); } termCountMap.clear(); } catch (IOException e) { throw new RuntimeException(e); } finally { try { searcher.close(); } catch (IOException e) { logger.warn("Failed to close searcher: " + e, e); } ira.close(reader); } if (logger.isDebugEnabled()) { logger.debug("Sorting terms"); } Collections.sort(importantTerms, new Comparator<ImportantTerm>() { @Override public int compare(ImportantTerm term1, ImportantTerm term2) { return -1 * term1.getTfIdf().compareTo(term2.getTfIdf()); } }); if (logger.isDebugEnabled()) { logger.debug("Term sort complete"); } if 
(importantTerms.isEmpty() || importantTerms.size() < count) { if (logger.isDebugEnabled()) { logger.debug("Will return full list."); } logger.debug("Timer: " + (System.currentTimeMillis() - start)); return importantTerms; } else { if (logger.isDebugEnabled()) { logger.debug( "Will return sublist containing " + count + " of " + importantTerms.size() + " terms."); } logger.debug("Timer: " + (System.currentTimeMillis() - start)); return importantTerms.subList(0, count); } }
From source file:com.mothsoft.alexis.engine.textual.TFIDFCalculatorImpl.java
License:Apache License
@SuppressWarnings("unchecked") @Transactional/*from w w w. ja va2 s.c om*/ public void execute() { final long start = System.currentTimeMillis(); final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate()); final SearchFactory searchFactory = fullTextSession.getSearchFactory(); final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor(); final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class); final Query query = em.createQuery( "select d from Document d join d.documentTerms dt where dt.tfIdf IS NULL ORDER BY d.id ASC"); final List<Document> documents = query.getResultList(); final Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME); int affectedRows = 0; try { for (final Document document : documents) { final Map<String, Float> termTfIdfMap = new HashMap<String, Float>(); // calculate term TF-IDFs for (final DocumentTerm documentTerm : document.getDocumentTerms()) { final Term term = luceneTerm.createTerm(documentTerm.getTerm().getValueLowercase()); Float score = TFIDF.score(documentTerm.getTerm().getValueLowercase(), documentTerm.getCount(), document.getTermCount(), reader.numDocs(), reader.docFreq(term)); documentTerm.setTfIdf(score); termTfIdfMap.put(documentTerm.getTerm().getValueLowercase(), score); affectedRows++; } // update association weights for (final DocumentAssociation documentAssociation : document.getDocumentAssociations()) { final String a = documentAssociation.getA().getValueLowercase(); final String b = documentAssociation.getB().getValueLowercase(); documentAssociation.setAssociationWeight((float) documentAssociation.getAssociationCount() * (termTfIdfMap.get(a) + termTfIdfMap.get(b))); } } } catch (IOException e) { throw new RuntimeException(e); } finally { ira.close(reader); } logger.info("TF-IDF calc took: " + ((System.currentTimeMillis() - start) / 1000.00) + " seconds and affected " + affectedRows + " rows."); }
From source file:com.netspective.sparx.navigate.fts.FullTextSearchPage.java
License:Open Source License
protected void readIndexInfo(File indexDir) throws IOException {
    IndexReader indexReader = IndexReader.open(indexDir);
    totalDocsInIndex = indexReader.numDocs();

    // collect all field names, then only the indexed field names
    List fields = new ArrayList();
    List indexedFields = new ArrayList();

    Iterator fieldIterator = indexReader.getFieldNames().iterator();
    while (fieldIterator.hasNext()) {
        Object field = fieldIterator.next();
        if (field != null && !field.equals(""))
            fields.add(field.toString());
    }

    fieldIterator = indexReader.getFieldNames(true).iterator();
    while (fieldIterator.hasNext()) {
        Object field = fieldIterator.next();
        if (field != null && !field.equals(""))
            indexedFields.add(field.toString());
    }

    indexReader.close();

    allFieldNames = (String[]) fields.toArray(new String[fields.size()]);
    allIndexedFieldNames = (String[]) indexedFields.toArray(new String[indexedFields.size()]);
}
From source file:com.orientechnologies.lucene.engine.OLuceneIndexEngineAbstract.java
License:Apache License
@Override
public long sizeInTx(OLuceneTxChanges changes) {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        searcher = searcher();
        if (searcher != null)
            reader = searcher.getIndexReader();
    } catch (IOException e) {
        OLogManager.instance().error(this, "Error on getting size of Lucene index", e);
    } finally {
        if (searcher != null) {
            release(searcher);
        }
    }
    // documents visible to the reader, plus any pending transaction changes
    return changes == null ? reader.numDocs() : reader.numDocs() + changes.numDocs();
}
From source file:com.orientechnologies.lucene.manager.OLuceneIndexManagerAbstract.java
License:Apache License
public long size(final ValuesTransformer<V> transformer) {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        reader = getSearcher().getIndexReader();
    } catch (IOException e) {
        OLogManager.instance().error(this, "Error on getting size of Lucene index", e);
    } finally {
        if (searcher != null) {
            release(searcher);
        }
    }
    return reader.numDocs();
}
From source file:com.pjaol.search.geo.utils.DistanceFilter.java
License:Apache License
@Override
public BitSet bits(IndexReader reader) throws IOException {
    /* Create a BitSet to store the result */
    int maxdocs = reader.numDocs();
    BitSet bits = new BitSet(maxdocs);

    setPrecision(maxdocs);

    /* create an intermediate cache to avoid recomputing distances for the same point
       TODO: Why is this a WeakHashMap? */
    WeakHashMap<String, Double> cdistance = new WeakHashMap<String, Double>(maxdocs);

    String[] latIndex = FieldCache.DEFAULT.getStrings(reader, latField);
    String[] lngIndex = FieldCache.DEFAULT.getStrings(reader, lngField);

    /* store calculated distances for reuse by other components */
    distances = new HashMap<Integer, Double>(maxdocs);

    for (int i = 0; i < maxdocs; i++) {
        String sx = latIndex[i];
        String sy = lngIndex[i];
        if (sx != null && sy != null) {
            double x = NumberUtils.SortableStr2double(sx);
            double y = NumberUtils.SortableStr2double(sy);

            // round off lat / longs if necessary
            x = DistanceHandler.getPrecision(x, precise);
            y = DistanceHandler.getPrecision(y, precise);

            String ck = new Double(x).toString() + "," + new Double(y).toString();
            Double cachedDistance = cdistance.get(ck);

            double d;
            if (cachedDistance != null) {
                d = cachedDistance.doubleValue();
            } else {
                d = DistanceUtils.getDistanceMi(lat, lng, x, y);
                cdistance.put(ck, d);
            }

            distances.put(i, d);

            if (distance < 0 || d < distance) {
                bits.set(i);
            }
        }
    }
    return bits;
}
From source file:com.qwazr.search.index.IndexStatus.java
License:Apache License
public IndexStatus(IndexReader indexReader, IndexSettingsDefinition settings, Set<String> analyzers,
        Set<String> fields) {
    num_docs = (long) indexReader.numDocs();
    num_deleted_docs = (long) indexReader.numDeletedDocs();
    this.settings = settings;
    this.analyzers = analyzers;
    this.fields = fields;
}