List of usage examples for org.apache.lucene.index.IndexReader.docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term, the term to look up. Returns the number of documents containing the term.
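Before the project examples, a minimal self-contained sketch of the call, assuming a Lucene 4.x index under /tmp/index with an indexed field named "contents" (the path and field name are placeholders):

    import java.io.File;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.FSDirectory;

    public class DocFreqExample {
        public static void main(String[] args) throws Exception {
            // Open a reader on an existing index (hypothetical location).
            IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
            try {
                // docFreq returns the number of documents containing the term;
                // documents that are deleted but not yet merged away are still counted.
                int df = reader.docFreq(new Term("contents", "lucene"));
                System.out.println("docFreq(contents:lucene) = " + df);
            } finally {
                reader.close();
            }
        }
    }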
From source file:ca.ualberta.entitylinking.common.indexing.DocumentIndexer.java
License:Open Source License
public void readLuceneIndex(String indexDir, String docName) {
    IndexReader reader = null;
    Map<String, Integer> name2id = null;

    // Load the index and build a map from document name to Lucene document id.
    try {
        reader = IndexReader.open(FSDirectory.open(new File(indexDir)));
        String[] stringArray = FieldCache.DEFAULT.getStrings(reader, "name");
        name2id = new HashMap<String, Integer>();
        for (int i = 0; i < stringArray.length; i++)
            name2id.put(stringArray[i], i);
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Get the tf-idf vector of a document.
    DefaultSimilarity simObj = new DefaultSimilarity();
    try {
        if (!name2id.containsKey(docName))
            return;

        int docId = name2id.get(docName);
        Document doc = reader.document(docId);
        TermFreqVector termVector = reader.getTermFreqVector(docId, "contents");
        int numDocs = reader.numDocs();
        int[] termFreq = termVector.getTermFrequencies();
        String[] terms = termVector.getTerms();
        for (int i = 0; i < terms.length; i++) {
            // Avoid stop words.
            // if (isStopWord(terms[i]))
            //     continue;

            int tf = termFreq[i];
            int df = reader.docFreq(new Term("contents", terms[i]));
            float tfidf = simObj.tf(tf) * simObj.idf(df, numDocs);
            System.out.println(terms[i] + ": " + tfidf);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
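For reference, with Lucene 3.x's DefaultSimilarity the value printed above works out to tfidf = sqrt(tf) * (1 + ln(numDocs / (df + 1))): simObj.tf(tf) is the square root of the raw term frequency, and simObj.idf(df, numDocs) log-damps the inverse document frequency derived from docFreq, with the +1 in the denominator avoiding division by zero.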
From source file:com.bdaum.zoom.lal.internal.lucene.Lucene.java
License:Open Source License
public List<ScoredString> listTags(File indexPath, int maxItems) throws IOException {
    Object readerToken = null;
    try {
        readerToken = indexPath == null ? null : getIndexReaderToken(indexPath);
        if (readerToken != null) {
            IndexReader indexReader = readerMap.get(readerToken);
            if (indexReader != null) {
                List<ScoredString> result = new ArrayList<ScoredString>(1000);
                Terms terms = MultiFields.getTerms(indexReader, LireActivator.FIELD_NAME_FULL_TEXT);
                if (terms == null)
                    return null;
                TermsEnum termEnum = terms.iterator();
                BytesRef bytesRef;
                while ((bytesRef = termEnum.next()) != null)
                    result.add(new ScoredString(bytesRef.utf8ToString(),
                            indexReader.docFreq(new Term(LireActivator.FIELD_NAME_FULL_TEXT, bytesRef))));
                Collections.sort(result);
                return (result.size() > maxItems) ? result.subList(0, maxItems) : result;
            }
        }
        return null;
    } finally {
        if (readerToken != null)
            releaseIndexReader(indexPath, readerToken);
    }
}
From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java
License:Apache License
/**
 * Suggest similar words (optionally restricted to a field of a user index).
 *
 * @param word the word you want a spell check done on
 * @param num_sug the number of suggested words
 * @param ir the IndexReader of the user index (can be null; see the field parameter)
 * @param field the field of the user index: if field is not null, the suggested
 *        words are restricted to the words present in this field
 * @param morePopular return only suggested words that are more frequent than the searched word
 *        (only in restricted mode, i.e. indexReader != null and field != null)
 * @throws IOException
 * @return the sorted list of suggested words, ranked by two criteria:
 *         first the edit distance, then (only in restricted mode) the popularity
 *         of the suggested word in the field of the user index
 */
public String[] suggestSimilar(String word, int num_sug, IndexReader ir, String field, boolean morePopular)
        throws IOException {
    float minScore = min;
    final TRStringDistance sd = new TRStringDistance(word);
    final int lengthWord = word.length();

    final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
    final int goalFreq = morePopular ? freq : 0;
    if (!morePopular && freq > 0) {
        // The word exists in the index and a more popular alternative was not requested.
        return new String[] { word };
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;
    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
        key = "gram" + ng; // form key
        grams = formGrams(word, ng); // form word into ngrams (allow dups too)
        if (grams.length == 0) {
            continue;
        }
        if (bStart > 0) { // should we boost prefixes?
            add(query, "start" + ng, grams[0], bStart); // matches start of word
        }
        if (bEnd > 0) { // should we boost suffixes?
            add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
        }
        for (int i = 0; i < grams.length; i++) {
            add(query, key, grams[i]);
        }
    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    // Go through more matches than requested in case the distance filter triggers.
    TopDocCollector collector = new TopDocCollector(10 * num_sug);
    searcher.search(query, collector);
    ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;

    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);
    SuggestWord sugword = new SuggestWord();
    for (int i = 0; i < scoreDocs.length; i++) {
        Document doc = searcher.doc(scoreDocs[i].doc); // fetch the stored document by its id
        sugword.string = doc.get(F_WORD); // get the original word
        if (sugword.string.equals(word)) {
            continue; // don't suggest a word for itself, that would be silly
        }

        // Edit distance, normalized by the minimum word length.
        sugword.score = doc.getBoost()
                * (1.0f - ((float) sd.getDistance(sugword.string) / Math.min(sugword.string.length(), lengthWord)));
        if (sugword.score < minScore) {
            continue;
        }
        if (ir != null) { // use the user index
            sugword.freq = ir.docFreq(new Term(field, sugword.string)); // freq in the index
            if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) {
                // Don't suggest a word that is absent from the field, or one that is
                // less popular than the searched word when morePopular was requested.
                continue;
            }
        }
        sugqueue.insert(sugword);
        if (sugqueue.size() == num_sug) {
            // If the queue is full, raise the bar to the minimum score in the queue.
            minScore = ((SuggestWord) sugqueue.top()).score;
        }
        sugword = new SuggestWord();
    }

    // Convert the queue to an array of strings.
    String[] list = new String[sugqueue.size()];
    for (int i = sugqueue.size() - 1; i >= 0; i--) {
        list[i] = ((SuggestWord) sugqueue.pop()).string;
    }
    searcher.close();
    return list;
}
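The retrieval step above relies on the spell index holding one document per dictionary word: the word itself in F_WORD, its n-grams in the gramN fields, and the first and last n-grams duplicated into startN and endN fields for optional prefix/suffix boosting. Candidate corrections are fetched with a BooleanQuery over the n-grams of the misspelled word and re-ranked by edit distance, while IndexReader.docFreq against the user index supplies the popularity filter.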
From source file:com.greplin.lucene.filter.PhraseFilter.java
License:Apache License
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
    PhraseFilterMatchList[] results = new PhraseFilterMatchList[subReaders.size()];
    int matchCount = 0;
    int readerNumber = 0;

    for (IndexReader subReader : subReaders) {
        SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
        for (int i = 0; i < this.terms.length; i++) {
            Term t = this.terms[i];
            termsOrderedByFrequency.add(new TermWithFrequency(t, subReader.docFreq(t), i));
        }

        PhraseFilterMatchList matches = null;
        TermPositions termPositions = subReader.termPositions();
        try {
            for (TermWithFrequency term : termsOrderedByFrequency) {
                if (term.docFreq == 0) {
                    break;
                }
                termPositions.seek(term.term);

                if (matches == null) {
                    // If this is the first term, collect all matches that intersect
                    // with the provided initial document set.
                    Intersection intersection = this.intersectionProvider.get(reader);
                    matches = new PhraseFilterMatchList(term.docFreq);
                    while (intersection.advanceToNextIntersection(termPositions)) {
                        int freq = termPositions.freq();
                        PhraseFilterIntList list = new PhraseFilterIntList(freq);
                        for (int i = 0; i < freq; i++) {
                            list.add(termPositions.nextPosition() - term.offset);
                        }
                        matches.add(termPositions.doc(), list);
                    }
                } else {
                    // Otherwise, intersect with the existing matches.
                    matches.intersect(termPositions, term.offset);
                }

                if (matches.getCount() == 0) {
                    break;
                }
            }
        } finally {
            termPositions.close();
        }

        if (matches != null) {
            results[readerNumber] = matches;
            matchCount += matches.getCount();
        }
        readerNumber++;
    }

    final int bitsPerIntPowerLogTwo = 5; // 2^5 = 32
    if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
        FixedBitSet result = new FixedBitSet(reader.maxDoc());
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result.set(docIds[i] + readerOffset);
                }
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return result;
    } else if (matchCount == 0) {
        return DocIdSets.EMPTY;
    } else {
        int[] result = new int[matchCount];
        int base = 0;
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result[base + i] = docIds[i] + readerOffset;
                }
                base += count;
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return new SortedIntArrayDocIdSet(result);
    }
}
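A design point worth noting above: within each subreader the terms are visited in ascending docFreq order, so the rarest term drives the first intersection. If any term has a document frequency of zero, or the running match list becomes empty, the loop exits early without reading the postings of the more frequent terms. The final match count then decides whether the result is returned as a FixedBitSet or as a more compact sorted int array.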
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // Add 1 to the doc count to ensure that even terms contained in all docs get a very small weight.
        docCountLog = Math.log(reader.numDocs() + 1);
        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // Since the same terms might be contained in the query multiple times (e.g. once per field),
        // we only consider each of them once by using a HashSet.
        terms = new HashSet<BytesRef>();
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    OptionQueue collectionQueue = new OptionQueue(size);
    BytesRef ref;
    while ((ref = prefixTermsEnum.next()) != null) {
        Term term = new Term(text.field(), BytesRef.deepCopyOf(ref));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term)));
    }
    Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry =
            new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
                    new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}
From source file:com.mothsoft.alexis.engine.textual.TFIDFCalculatorImpl.java
License:Apache License
@SuppressWarnings("unchecked")
@Transactional
public void execute() {
    final long start = System.currentTimeMillis();

    final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate());
    final SearchFactory searchFactory = fullTextSession.getSearchFactory();
    final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor();
    final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class);

    final Query query = em.createQuery(
            "select d from Document d join d.documentTerms dt where dt.tfIdf IS NULL ORDER BY d.id ASC");
    final List<Document> documents = query.getResultList();

    final Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME);
    int affectedRows = 0;

    try {
        for (final Document document : documents) {
            final Map<String, Float> termTfIdfMap = new HashMap<String, Float>();

            // Calculate term TF-IDFs.
            for (final DocumentTerm documentTerm : document.getDocumentTerms()) {
                final Term term = luceneTerm.createTerm(documentTerm.getTerm().getValueLowercase());
                Float score = TFIDF.score(documentTerm.getTerm().getValueLowercase(), documentTerm.getCount(),
                        document.getTermCount(), reader.numDocs(), reader.docFreq(term));
                documentTerm.setTfIdf(score);
                termTfIdfMap.put(documentTerm.getTerm().getValueLowercase(), score);
                affectedRows++;
            }

            // Update association weights.
            for (final DocumentAssociation documentAssociation : document.getDocumentAssociations()) {
                final String a = documentAssociation.getA().getValueLowercase();
                final String b = documentAssociation.getB().getValueLowercase();
                documentAssociation.setAssociationWeight((float) documentAssociation.getAssociationCount()
                        * (termTfIdfMap.get(a) + termTfIdfMap.get(b)));
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        ira.close(reader);
    }

    logger.info("TF-IDF calc took: " + ((System.currentTimeMillis() - start) / 1000.00)
            + " seconds and affected " + affectedRows + " rows.");
}
From source file:com.nearinfinity.blur.manager.IndexManager.java
License:Apache License
public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
        throws IOException {
    return reader.docFreq(getTerm(columnFamily, columnName, value));
}
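A hypothetical call site for the helper above (the reader and the column family, column name, and value are invented for illustration):

    long freq = IndexManager.recordFrequency(reader, "person", "name", "smith");

This reduces a Blur record-frequency lookup to a single docFreq call on the term produced by getTerm.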
From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License:Apache License
/**
 * A constructor.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // Just return to make a null snippet if an unmatched fieldName is specified when fieldMatch == true.
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For the weight, see:
        // http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();
        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();
            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }
            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }
    }

    // Sort by position.
    Collections.sort(termList);
}
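The weight assigned to each matched term above is a textbook idf, log(numDocs / (df + 1)) + 1, with df taken from IndexReader.docFreq over the whole index; the +1 in the denominator guards against a zero docFreq, and rarer terms end up highlighted with higher weight.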
From source file:com.redhat.satellite.search.index.IndexManager.java
License:Open Source License
/**
 * @param indexName name of the index
 * @param doc document with data to index
 * @param uniqueField field in doc which identifies this uniquely
 * @param lang language
 * @throws IndexingException
 */
public void addUniqueToIndex(String indexName, Document doc, String uniqueField, String lang)
        throws IndexingException {
    IndexReader reader = null;
    int numFound = 0;
    try {
        reader = getIndexReader(indexName, lang);
        Term term = new Term(uniqueField, doc.get(uniqueField));
        numFound = reader.docFreq(term);
    } catch (FileNotFoundException e) {
        // Index doesn't exist, so this add will be unique;
        // we don't need to do anything.
    } catch (IOException e) {
        throw new IndexingException(e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                // ignore
            }
        }
    }
    if (numFound > 0) {
        log.info("Found " + numFound + " <" + indexName + " docs for " + uniqueField + ":"
                + doc.get(uniqueField) + " will remove them now.");
        removeFromIndex(indexName, uniqueField, doc.get(uniqueField));
    }
    addToIndex(indexName, doc, lang);
}
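Here docFreq doubles as a cheap existence check: a non-zero document frequency for the unique field means a document with the same key is already indexed, so it is removed before the new one is added.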