List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term — the term to look up. Returns the number of documents containing the term; deleted documents that have not yet been merged away are still counted.
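Before the collected examples, a minimal self-contained sketch of the call, written against the Lucene 3.x API that most of the examples below use; the index path and the field/term values are placeholders for illustration, not taken from any example on this page:

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at an existing Lucene index.
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            // Number of documents whose "contents" field contains the term "lucene".
            // Field and term are assumptions for this sketch; note that docFreq
            // still counts deleted documents that have not yet been merged away.
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq(contents:lucene) = " + df);
        } finally {
            reader.close();
        }
    }
}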
From source file:com.semantic.util.suggest.Suggestion.java
public static Suggestion create(Lookup.LookupResult result, IndexReader reader, String[] fields) {
    Suggestion ret = new Suggestion(result);
    if (reader != null && fields != null) {
        /* remove bold tag */
        String text = format(result.key.toString());
        try {
            for (String field : fields) {
                int count = reader.docFreq(new Term(field, text));
                if (count > 0) {
                    ret.lucene = field;
                    ret.fieldCount = count;
                }
            }
        } catch (IOException ex) {
        }
    }
    return ret;
}
From source file:com.silverpeas.silvercrawler.model.FileFolder.java
License:Open Source License
public FileFolder(String rootPath, String path, boolean isAdmin, String componentId) {
    files = new ArrayList<FileDetail>(0);
    folders = new ArrayList<FileDetail>(0);
    String childPath = null;
    try {
        SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                "Starting constructor for FileFolder. Path = " + path);
        File f = new File(path);
        File fChild;
        SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                "isExists " + f.exists() + " isFile=" + f.isFile());
        writable = f.canWrite();
        if (f.exists()) {
            this.name = f.getName();
            this.readable = f.canRead();
            String[] children_name = f.list();
            IndexReader reader = null;
            boolean isIndexed = false;
            if (isAdmin) {
                // open the index
                String indexPath = FileRepositoryManager.getAbsoluteIndexPath("", componentId);
                if (IndexReader.indexExists(indexPath))
                    reader = IndexReader.open(indexPath);
            }
            for (int i = 0; children_name != null && i < children_name.length; i++) {
                SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                        "Name = " + children_name[i]);
                fChild = new File(path + File.separator + children_name[i]);
                isIndexed = false;
                if (isAdmin) {
                    // check whether the directory (or file) is indexed
                    String pathIndex = componentId + "|";
                    if (fChild.isDirectory())
                        pathIndex = pathIndex + "LinkedDir" + "|";
                    else
                        pathIndex = pathIndex + "LinkedFile" + "|";
                    pathIndex = pathIndex + fChild.getPath();
                    SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                            "pathIndex = " + pathIndex);
                    Term term = new Term("key", pathIndex);
                    if (reader != null && reader.docFreq(term) == 1)
                        isIndexed = true;
                }
                if (fChild.isDirectory()) {
                    folders.add(new FileDetail(fChild.getName(), fChild.getPath(), fChild.length(), true, isIndexed));
                } else {
                    childPath = fChild.getPath().substring(rootPath.length() + 1);
                    files.add(new FileDetail(fChild.getName(), childPath, fChild.length(), false, isIndexed));
                }
            }
            // close the index
            if (reader != null && isAdmin)
                reader.close();
        }
    } catch (Exception e) {
        throw new SilverCrawlerRuntimeException("FileFolder.FileFolder()", SilverpeasRuntimeException.ERROR,
                "silverCrawler.IMPOSSIBLE_DACCEDER_AU_REPERTOIRE", e);
    }
}
From source file:com.sindicetech.siren.search.node.NodeScoringRewrite.java
License:Open Source License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery(query);
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);

    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos], termStates[pos]);
        }
    }
    return result;
}
From source file:com.sindicetech.siren.search.node.TopNodeTermsRewrite.java
License:Open Source License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {

        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            assert this.compareToLastTerm(null);

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRef lastTerm;

        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();

            // make sure within a single seg we always collect terms in order
            assert this.compareToLastTerm(bytes);

            //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);

            // ignore uncompetitive hits
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }
            return true;
        }
    });

    final Q q = this.getTopLevelQuery(query);
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs "
                + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
From source file:edu.coeia.tasks.CommonKeywordsTask.java
License:Open Source License
public Map<String, Integer> getAllTermFreqFromItems() throws IOException {
    Map<String, Integer> map = new HashMap<String, Integer>();
    String indexDir = this.aCase.getCaseLocation() + File.separator + ApplicationConstants.CASE_INDEX_FOLDER;

    Directory dir = FSDirectory.open(new File(indexDir));
    IndexReader indexReader = IndexReader.open(dir);

    TermEnum terms = indexReader.terms();
    int factor = indexReader.maxDoc() / 100;

    while (terms.next()) {
        if (isCancelledTask())
            break;

        Term term = terms.term();
        if (this.isAllowedFeild(term.field().trim())) {
            String termText = term.text();
            int frequency = indexReader.docFreq(term);

            if (frequency >= factor)
                map.put(termText, frequency);
        }
    }

    System.out.println("map size: " + map.size());
    indexReader.close();
    return map;
}
From source file:edu.mit.ll.vizlinc.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all terms texts of a given Query into an array of WeightedTerms
 *
 * @param query     Query to extract term texts from
 * @param reader    used to compute IDF which can be used to a) score selected fragments better
 *                  b) use graded highlights eg changing intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java
License:Apache License
/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
 * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
 *
 * <p>
 *
 * @param query
 *          that caused hit
 * @param tokenStream
 *          of text to be highlighted
 * @param fieldName
 *          restricts Term's used based on field name
 * @param reader
 *          to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException
 */
public Map<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
        String fieldName, IndexReader reader) throws IOException {
    if (fieldName != null) {
        this.fieldName = StringHelper.intern(fieldName);
    } else {
        this.fieldName = null;
    }
    this.tokenStream = tokenStream;

    Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
    extract(query, terms);

    int totalNumDocs = reader.numDocs();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();

    try {
        while (it.hasNext()) {
            WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
            int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            weightedSpanTerm.weight *= idf;
        }
    } finally {
        closeReaders();
    }
    return terms;
}
From source file:edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java
License:Apache License
@SuppressWarnings("deprecation") public static void search(String system, int documentType, int queryNumber, String indexDirectoryPath, String queryString, String fileOutput, String[] targetClasses, boolean runIndividualTerms, boolean append) throws Exception { String index = indexDirectoryPath; FileWriter f = new FileWriter(index + "../NotFound.txt", true); for (int i = 0; i < targetClasses.length; i++) { String target = targetClasses[i]; boolean found = Indexer.isFileDocInIndex(indexDirectoryPath, target); if (!found) f.append("Target doc " + i + " - " + target + " not found in index!\n"); }//from www. ja va2s .c om f.close(); IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true); int numDocs = reader.numDocs(); System.out.println("The number of documents in the index is: " + numDocs); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35); String[] fields; fields = new String[1]; fields[0] = "contents"; if (!runIndividualTerms) { MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer); int hitsPerPage = numDocs; TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); Query query = parser.parse(queryString); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println("The number of hits is: " + hits.length); // file with the results (score and position) only for the relevant // documents // the file contains entries in the following format: // (queryNumber,relDoc1,posRelDoc1,scoreRelDoc1,relDoc2,posRelDoc2,scoreRelDoc2,...) FileWriter fwRelevant = new FileWriter(fileOutput, append); String path = ""; String docName = ""; String docPathAndName = ""; for (String target : targetClasses) { boolean found = false; for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); path = d.get("path"); float score = hits[i].score; if (documentType == 2) { docName = d.get("docName"); docPathAndName = path.toLowerCase() + "." + docName.toLowerCase(); if (target.equalsIgnoreCase(docPathAndName)) { fwRelevant.write(system + ";" + queryNumber + ";" + target + ";" + (i + 1) + ";" + hits.length + ";" + numDocs + ";" + score + "\n"); found = true; break; } } else if (documentType == 1) { File pathDir = new File(path.trim()); String fileName = pathDir.getName(); docName = fileName.replaceAll(".txt", ""); fwRelevant.write((i + 1) + ". 
doc = " + docName + " score = " + score + "\n"); } } if (found == false) fwRelevant.write(system + ";" + queryNumber + ";" + target + "; NOT_RETRIEVED" + "\n"); } // fw.close(); fwRelevant.close(); reader.close(); } else // runIndividualTerms = true { /** * each query will be divided in its constituent terms and each term * will be run as a separate query **/ /** * this is useful to determine the similarity of each of the terms * in a query to a target document so that we determine which terms * in the query tend to lead to the best results, i.e., to finding * the targets sooner **/ SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, queryString, fileOutput.replaceAll(".txt", "_wholeQuery.txt"), targetClasses, false, append); FileWriter fw = new FileWriter(fileOutput.replaceAll(".txt", "_terms.txt")); fw.write( "\n\n\n------------------------------------------------------------------------------------\n\n"); fw.write(" Results for query " + queryNumber + "\n"); fw.write("------------------------------------------------------------------------------------\n\n"); // file with the results (score and position) only for the relevant // documents // the file contains entries in the following format: // (queryNumber,term1,term1TF,term1DF,relDoc1,posRelDoc1Term1,scoreRelDoc1Term1,relDoc2,posRelDoc2Term1,scoreRelDoc2Term1,...) // (queryNumber,term2,term2TF,term2DF,relDoc1,posRelDoc1Term2,scoreRelDoc1Term2,relDoc2,posRelDoc2Term2,scoreRelDoc2Term2,...) // ... FileWriter fwRelevant = new FileWriter( fileOutput.replaceAll(".txt", "_terms_RelevantDocsPositions.txt")); String[] queryTerms = queryString.split(" "); for (int l = 0; l < queryTerms.length; l++) { MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer); int hitsPerPage = numDocs; TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); String q = queryTerms[l]; Query query = parser.parse(q); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; fw.write("TERM " + (l + 1) + ": " + q + "\n\n"); fwRelevant.write("\n" + queryNumber + "," + q); for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); String path = d.get("path"); float score = hits[i].score; if (documentType == 2) { String docName = d.get("docName"); fw.write((i + 1) + ". doc = " + path + " " + docName + " - score = " + score + "\n"); for (int k = 0; k < targetClasses.length; k++) { if (docName.equalsIgnoreCase(targetClasses[k])) { String contents = d.get("contents"); int frequency = countOccurrences(contents, q);// tf fwRelevant.write("," + frequency); fwRelevant.write("," + reader.docFreq(new Term("contents", q)));// df fwRelevant.write("," + path + "." + docName + "," + (i + 1) + "," + score); break; } } } else if (documentType == 1) { File pathDir = new File(path); String fileName = pathDir.getName(); String docName = fileName.replaceAll(".txt", ""); fw.write((i + 1) + ". doc = " + docName + " score = " + score + "\n"); } } fw.write("\n\n\n"); } fw.close(); f.close(); fwRelevant.close(); reader.close(); } }
From source file:engine.easy.search.EasySearchEngine.java
License:Apache License
/**
 * Computes the results based on the ranking function and other scoring factors.
 *
 * @param query the query whose terms are scored
 * @param ixReader the index reader
 * @param esiReader the custom easy index reader
 * @param relevanceDocMap per-document relevance-feedback boosts (may be null or empty)
 * @return the Results.
 */
public Result[] getResults(Query query, IndexReader ixReader, EasySearchIndexReader esiReader,
        Map<Integer, Float> relevanceDocMap) {

    Map<Integer, Result> results = null;

    try {
        Set<Term> terms = new HashSet<Term>();
        query.extractTerms(terms);

        results = new HashMap<Integer, Result>();
        Iterator<Term> itr = terms.iterator();

        while (itr.hasNext()) {
            Term term = itr.next();
            TermDocs docs = ixReader.termDocs(term);
            // get the document frequency of the term from lucene's index reader
            int docFreq = ixReader.docFreq(term);
            // get the total record count of the field from the lucene extra index
            // (you may think it is also possible to use ixreader.maxDoc() here, but
            // ixreader.maxDoc() only returns the number of documents, while some
            // documents may not have the search field (although every document has
            // the search field in this example))
            int docNum = esiReader.recordCount(AppConstants.CONTENT_FIELD);

            while (docs.next()) {
                Integer id = docs.doc(); // get the internal lucene's id of the document
                int termFreq = docs.freq(); // get the frequency of the term in this document
                // get the length of the document from the lucene extra index
                int docLen = esiReader.docLength(id, AppConstants.CONTENT_FIELD);
                // get the average length of the search field from the lucene extra index
                double avgDocLen = esiReader.avgFieldLength(AppConstants.CONTENT_FIELD);
                Document document = ixReader.document(id); // get the particular document
                String storedField = extractData(document.get(AppConstants.CONTENT_FIELD));

                // Compute the scoring with BM25 ranking and also include other scoring
                // factors such as (relevance feedback based on terms)
                BM25 bm25 = new BM25();
                //System.out.println(bm25.getInfo());

                // Also add the document boost in the ranking score.
                double termWeight = bm25.score(termFreq, docNum, docLen, avgDocLen, 1d, docFreq);

                // Add each document relevance score!
                if (relevanceDocMap != null && !relevanceDocMap.isEmpty() && relevanceDocMap.containsKey(id))
                    termWeight = termWeight * relevanceDocMap.get(id);

                //System.out.println("lucene id" + id + " Doc id " + document.getField("DOCID").stringValue() + "wieght" + termWeight);

                if (results.containsKey(id)) {
                    results.get(id).score = results.get(id).score + termWeight;
                } else {
                    Result result = new Result(new Integer(id), document.getField("DOCID").stringValue(),
                            termWeight, storedField);
                    results.put(id, result);
                }
            }
        }

        return sortArray(results, AppConstants.TOP_RESULTS);
    } catch (Exception e) {
        System.out.println("Exception: getResults " + e.toString());
    }

    return null;
}
From source file:engine.easy.search.RelevanceFeedBackUtil.java
License:Apache License
/**
 * Computes a term frequency map for the overall index at the specified location.
 * Builds a Boolean OR query out of the "most frequent" terms in the index
 * and returns it. "Most Frequent" is defined as the terms whose frequencies
 * are greater than or equal to the topTermCutoff * the frequency of the top
 * term, where the topTermCutoff is a number between 0 and 1.
 *
 * @param ramdir the directory where the index is created.
 * @param numOf unused; AppConstants.TOP_DOCUMENTS is passed to computeTopTermQuery instead.
 * @return a Boolean OR query.
 * @throws Exception if one is thrown.
 */
private static Query computeTopTermQueryFromDataCollection(Directory ramdir, int numOf) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();

    IndexReader reader = IndexReader.open(ramdir);
    TermEnum terms = reader.terms();

    while (terms.next()) {
        Term term = terms.term();
        String termText = term.text();
        int frequency = reader.docFreq(term);
        frequencyMap.put(termText, frequency);
        termlist.add(termText);
    }
    reader.close();

    return computeTopTermQuery(termlist, frequencyMap, AppConstants.TOP_DOCUMENTS);
}