List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term - the term whose document frequency (the number of documents containing it) is returned.
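Before the project examples below, here is a minimal standalone sketch of calling docFreq directly. It is only an illustration and assumes a Lucene 5+ style API (DirectoryReader.open(Directory), FSDirectory.open(Path)); the index path "/tmp/index" and the field/term pair "body"/"lucene" are placeholder values, not taken from the examples.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Open an existing index; the path is a placeholder.
        Directory dir = FSDirectory.open(Paths.get("/tmp/index"));
        try (IndexReader reader = DirectoryReader.open(dir)) {
            // docFreq returns the number of documents that contain the term
            // (0 if the term or field does not exist in the index).
            int df = reader.docFreq(new Term("body", "lucene"));
            System.out.println("docFreq(body:lucene) = " + df);
        }
    }
}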
From source file:org.apache.solr.spelling.WordBreakSolrSpellChecker.java
License:Apache License
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;
    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
    List<Term> termArr = new ArrayList<Term>(options.tokens.size() + 2);
    List<ResultEntry> breakSuggestionList = new ArrayList<ResultEntry>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOneprocedesNewBooleanOp = false;
    for (int i = 0; i < tokenArr.length; i++) {
        boolean prohibited = (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG)
                == QueryConverter.PROHIBITED_TERM_FLAG;
        boolean required = (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG)
                == QueryConverter.REQUIRED_TERM_FLAG;
        boolean procedesNewBooleanOp = (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG)
                == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
        if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) {
            termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
        }
        lastOneProhibited = prohibited;
        lastOneRequired = required;
        lastOneprocedesNewBooleanOp = procedesNewBooleanOp;
        Term thisTerm = new Term(field, tokenArr[i].toString());
        termArr.add(thisTerm);
        if (breakWords) {
            SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir,
                    options.suggestMode, sortMethod);
            for (SuggestWord[] breakSuggestion : breakSuggestions) {
                sb.delete(0, sb.length());
                boolean firstOne = true;
                int freq = 0;
                for (SuggestWord word : breakSuggestion) {
                    if (!firstOne) {
                        sb.append(" ");
                    }
                    firstOne = false;
                    sb.append(word.string);
                    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
                        freq = Math.max(freq, word.freq);
                    } else {
                        freq += word.freq;
                    }
                }
                breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), freq));
            }
        }
    }
    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(
            termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
    if (combineWords) {
        combineSuggestionList = new ArrayList<ResultEntry>(combineSuggestions.length);
        for (CombineSuggestion cs : combineSuggestions) {
            int firstTermIndex = cs.originalTermIndexes[0];
            int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
            sb.delete(0, sb.length());
            for (int i = firstTermIndex; i <= lastTermIndex; i++) {
                if (i > firstTermIndex) {
                    sb.append(" ");
                }
                sb.append(tokenArr[i].toString());
            }
            Token token = new Token(sb.toString(), tokenArr[firstTermIndex].startOffset(),
                    tokenArr[lastTermIndex].endOffset());
            combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
        }
    }
    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
        if (lastBreak == null) {
            result.add(lastCombine.token, lastCombine.suggestion, lastCombine.freq);
            result.addFrequency(lastCombine.token, getCombineFrequency(ir, lastCombine.token));
            lastCombine = null;
        } else if (lastCombine == null) {
            result.add(lastBreak.token, lastBreak.suggestion, lastBreak.freq);
            result.addFrequency(lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())));
            lastBreak = null;
        } else if (lastBreak.freq < lastCombine.freq) {
            result.add(lastCombine.token, lastCombine.suggestion, lastCombine.freq);
            result.addFrequency(lastCombine.token, getCombineFrequency(ir, lastCombine.token));
            lastCombine = null;
        } else if (lastCombine.freq < lastBreak.freq) {
            result.add(lastBreak.token, lastBreak.suggestion, lastBreak.freq);
            result.addFrequency(lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())));
            lastBreak = null;
        } else if (breakCount >= combineCount) {
            result.add(lastCombine.token, lastCombine.suggestion, lastCombine.freq);
            result.addFrequency(lastCombine.token, getCombineFrequency(ir, lastCombine.token));
            lastCombine = null;
        } else {
            result.add(lastBreak.token, lastBreak.suggestion, lastBreak.freq);
            result.addFrequency(lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())));
            lastBreak = null;
        }
        if (result.getSuggestions().size() > numSuggestions) {
            break;
        }
        if (lastBreak == null && breakIter.hasNext()) {
            lastBreak = breakIter.next();
            breakCount++;
        }
        if (lastCombine == null && combineIter.hasNext()) {
            lastCombine = combineIter.next();
            combineCount++;
        }
    }
    return result;
}
From source file:org.apache.solr.spelling.WordBreakSolrSpellChecker.java
License:Apache License
private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
    String[] words = spacePattern.split(token.toString());
    int result = 0;
    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
        for (String word : words) {
            result = Math.max(result, ir.docFreq(new Term(field, word)));
        }
    } else {
        for (String word : words) {
            result += ir.docFreq(new Term(field, word));
        }
    }
    return result;
}
From source file:org.codesearch.searcher.server.util.STAutocompleter.java
License:Open Source License
public void setupIndex(Directory sourceDirectory, String fieldToAutocomplete)
        throws CorruptIndexException, IOException {
    IndexReader sourceReader = IndexReader.open(sourceDirectory);
    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, new STAutocompleteLuceneAnalyzer(),
            IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();
    Iterator<String> iter = dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();
        if (word.length() < 0) {
            continue;
        }
        wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
    }
    LOG.info("SetupIndex: " + GRAMMED_WORDS_FIELD);
    for (String word : wordsMap.keySet()) {
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED));
        LOG.info("source:" + word);
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED));
        LOG.info("grammed:" + word);
        writer.addDocument(doc);
    }
    sourceReader.close();
    writer.optimize();
    writer.close();
    setupReader();
}
From source file:org.dbpedia.spotlight.lucene.search.BaseSearcher.java
License:Apache License
/**
 * Computes a term frequency map for the index at the specified location.
 *
 * @param mReader the index reader to scan
 * @return a list of (term, document frequency) entries, sorted by frequency descending
 * @throws IOException if one is thrown.
 * @author sujitpal (computeTopTermQuery in http://sujitpal.blogspot.com/2009/02/summarization-with-lucene.html)
 * @author pablomendes adapted from sujitpal
 */
public static List<Map.Entry<Term, Integer>> getTopTerms(IndexReader mReader) throws IOException {
    final Map<Term, Integer> frequencyMap = new HashMap<Term, Integer>();
    // TODO check what can we do about fields here. should have only top terms for context field?
    TermEnum terms = mReader.terms();
    while (terms.next()) {
        Term term = terms.term();
        int frequency = mReader.docFreq(term); // DF
        frequencyMap.put(term, frequency);
    }
    // sort the term map by frequency descending
    Ordering descOrder = new Ordering<Map.Entry<Term, Integer>>() {
        public int compare(Map.Entry<Term, Integer> left, Map.Entry<Term, Integer> right) {
            return Ints.compare(right.getValue(), left.getValue());
        }
    };
    List<Map.Entry<Term, Integer>> sorted = descOrder.sortedCopy(frequencyMap.entrySet());
    return sorted;
}
From source file:org.deals.lucene.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all term texts of a given Query into an array of WeightedTerms.
 *
 * @param query Query to extract term texts from
 * @param reader used to compute IDF, which can be used to a) score selected fragments better
 *        b) use graded highlights, e.g. changing the intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
From source file:org.eclipse.smila.search.lucene.index.access.ExistsOperation.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * @see org.eclipse.smila.search.lucene.index.access.ISynchronizedOperation#process(java.lang.Object)
 */
public Boolean process(final IndexReader object) throws IndexException {
    boolean exists;
    try {
        exists = (object.docFreq(_term) > 0);
    } catch (final Exception e) {
        throw new IndexException("Unable to check whether document exists by term [" + _term.text() + "]", e);
    }
    return exists;
}
From source file:org.karsha.base.DocIndexer.java
License:Open Source License
/**
 * Calculates the TF-IDF score for each term in the indexed documents.
 *
 * @param numberOfDocs
 * @return a map from document id to a per-term TF-IDF score map
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {
    int noOfDocs = docNames.length;
    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    try {
        IndexReader re = IndexReader.open(ramMemDir);
        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int freq[];
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId;
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String terms[];
            float score[] = null;
            termsFreq = re.getTermFreqVector(k, "doccontent");
            termsFreqDocId = re.getTermFreqVector(k, "docid");
            int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
            freq = termsFreq.getTermFrequencies();
            terms = termsFreq.getTerms();
            int noOfTerms = terms.length;
            score = new float[noOfTerms];
            DefaultSimilarity simi = new DefaultSimilarity();
            for (i = 0; i < noOfTerms; i++) {
                int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                float tf = simi.tf(freq[i]);
                float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                wordMap.put(terms[i], (tf * idf));
            }
            scoreMap.put(aInt, wordMap);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return scoreMap;
}
From source file:org.karsha.base.DocIndexer.java
License:Open Source License
/**
 * Calculates the TF-IDF score for each term, including marked-up Taxonomy
 * and FIBO terms, in the indexed documents.
 *
 * @param numberOfDocs
 * @param weight a higher weight can be given to FIBO and Taxonomy terms with preference
 * @return a map from document id to a per-term TF-IDF score map
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScoreWithMarkUpTerms(int numberOfDocs, int weight)
        throws CorruptIndexException, ParseException {
    int noOfDocs = docNames.length;
    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    try {
        IndexReader re = IndexReader.open(ramMemDir);
        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int aInt = 0;
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId = null;
            TermFreqVector termsFreqFiboTerm;
            TermFreqVector termsFreqTaxoTerm;
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String termsVec[][];
            int freqVec[];
            int noOfTermsVec[];
            String terms[];
            int freq[];
            int noOfTerms;
            float score[] = null;
            /*
             * getting the fields in the indexed order: doccontent, docid, fiboterms, taxoterms
             */
            DefaultSimilarity simi = new DefaultSimilarity();
            for (int m = 0; m < 4; m++) {
                switch (m) {
                case 0: // doc content
                    termsFreq = re.getTermFreqVector(k, "doccontent");
                    freq = termsFreq.getTermFrequencies();
                    terms = termsFreq.getTerms();
                    noOfTerms = terms.length;
                    score = new float[noOfTerms];
                    for (i = 0; i < noOfTerms; i++) {
                        int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                        float tf = simi.tf(freq[i]);
                        float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                        wordMap.put(terms[i], (tf * idf));
                    }
                    break;
                case 1: // doc id
                    termsFreqDocId = re.getTermFreqVector(k, "docid");
                    aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
                    break;
                case 2: // FIBO terms
                    termsFreqFiboTerm = re.getTermFreqVector(k, "fiboterms");
                    if (termsFreqFiboTerm != null) {
                        freq = termsFreqFiboTerm.getTermFrequencies();
                        terms = termsFreqFiboTerm.getTerms();
                        noOfTerms = terms.length;
                        score = new float[noOfTerms];
                        for (i = 0; i < noOfTerms; i++) {
                            int noofDocsContainTerm = re.docFreq(new Term("fiboterms", terms[i]));
                            float tf = simi.tf(freq[i]);
                            float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                            wordMap.put(terms[i], (tf * idf * weight));
                        }
                    }
                    break;
                case 3: // taxonomy terms
                    termsFreqTaxoTerm = re.getTermFreqVector(k, "taxoterms");
                    if (termsFreqTaxoTerm != null) {
                        freq = termsFreqTaxoTerm.getTermFrequencies();
                        terms = termsFreqTaxoTerm.getTerms();
                        noOfTerms = terms.length;
                        score = new float[noOfTerms];
                        for (i = 0; i < noOfTerms; i++) {
                            int noofDocsContainTerm = re.docFreq(new Term("taxoterms", terms[i]));
                            float tf = simi.tf(freq[i]);
                            float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                            wordMap.put(terms[i], (tf * idf * weight));
                        }
                    }
                    break;
                default:
                    // System.out.println("Invalid Entry!");
                }
            }
            scoreMap.put(aInt, wordMap);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return scoreMap;
}
From source file:org.ninit.models.bm25.BM25TermScorer.java
License:Apache License
public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException {
    super(similarity);
    this.reader = reader;
    this.term = term;
    this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
    this.norm = this.reader.norms(this.term.getTerm().field());
    this.av_length = BM25Parameters.getAverageLength(this.term.getTerm().field());
    this.b = BM25Parameters.getB();
    this.k1 = BM25Parameters.getK1();
    this.termDocs = this.reader.termDocs(this.term.getTerm());
}
From source file:org.opengrok.suggest.SuggesterUtils.java
License:Open Source License
private static double computeNormalizedDocumentFrequency(final IndexReader indexReader, final Term term)
        throws IOException {
    int documentFrequency = indexReader.docFreq(term);
    return ((double) documentFrequency) / indexReader.numDocs();
}