Usage examples for org.apache.lucene.index.IndexReader.docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term, the term whose document frequency is requested. Returns the number of documents containing the term; deleted documents that have not yet been merged away are still counted.
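Before the project-specific examples below, here is a minimal, self-contained sketch of calling docFreq directly (Lucene 4.x-style APIs, matching most examples on this page; the index path and field name are hypothetical):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location and field name; adjust to a real index.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            Term term = new Term("contents", "lucene");
            // Number of documents containing the exact term "lucene" in field "contents".
            int df = reader.docFreq(term);
            System.out.println(term + " occurs in " + df + " documents");
        } finally {
            reader.close();
            dir.close();
        }
    }
}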
From source file: org.pageseeder.flint.lucene.search.Terms.java
License: Apache License
/**
 * Loads all the fuzzy terms in the list of terms given the reader.
 *
 * @param reader        Index reader to use.
 * @param bucket        Where to store the terms.
 * @param term          The term to use.
 * @param minSimilarity The minimum similarity threshold for fuzzy matching.
 *
 * @throws IOException If an error is thrown by the fuzzy term enumeration.
 */
@Beta
public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term, int minSimilarity)
        throws IOException {
    AttributeSource atts = new AttributeSource();
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, atts, term, minSimilarity, 0, true);
    BytesRef val;
    BytesRef searched = term.bytes();
    while ((val = fuzzy.next()) != null) {
        // Skip the searched term itself; collect every other fuzzy match.
        if (!searched.bytesEquals(val)) {
            Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
            bucket.add(t, reader.docFreq(t));
        }
    }
}
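In this example, docFreq supplies the count stored with each fuzzy variant, so the bucket can presumably rank more frequent spellings higher; the searched term itself is skipped.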
From source file: org.pageseeder.flint.lucene.search.Terms.java
License: Apache License
/**
 * Loads all the prefix terms in the list of terms given the reader.
 *
 * @param reader Index reader to use.
 * @param bucket Where to store the terms.
 * @param term   The term to use.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    TermsEnum prefixes =
            terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes());
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
        bucket.add(t, reader.docFreq(t));
    }
}
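The prefix variant follows the same pattern, but enumerates matching terms by intersecting the field's term dictionary with a compiled prefix automaton; docFreq again supplies the per-term document count.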
From source file: org.silverpeas.components.silvercrawler.model.FileFolder.java
License: Open Source License
public FileFolder(String rootPath, String path, boolean isAdmin, String componentId) {
    this.path = path;
    files = new ArrayList<>(0);
    folders = new ArrayList<>(0);
    try {
        // Security check: cannot browse outside rootPath
        FileUtil.validateFilename(path, rootPath);
        File f = new File(path);
        writable = f.canWrite();
        if (f.exists()) {
            this.name = f.getName();
            this.readable = f.canRead();
            File[] children = f.listFiles();
            IndexReader reader = null;
            boolean isIndexed = false;
            if (isAdmin) {
                // Open the index
                Directory indexPath =
                        FSDirectory.open(new File(IndexFileManager.getAbsoluteIndexPath(componentId)));
                if (IndexReader.indexExists(indexPath)) {
                    reader = IndexReader.open(indexPath);
                }
            }
            if (children != null && children.length > 0) {
                for (File childFile : children) {
                    isIndexed = false;
                    if (isAdmin) {
                        // Check whether the directory (or the file) is indexed
                        String pathIndex = componentId + "|";
                        if (childFile.isDirectory()) {
                            pathIndex = pathIndex + "LinkedDir" + "|";
                        } else {
                            pathIndex = pathIndex + "LinkedFile" + "|";
                        }
                        pathIndex = pathIndex + FilenameUtils.separatorsToUnix(childFile.getPath());
                        Term term = new Term("key", pathIndex);
                        if (reader != null && reader.docFreq(term) == 1) {
                            isIndexed = true;
                        }
                    }
                    if (childFile.isDirectory()) {
                        folders.add(new FileDetail(childFile.getName(), childFile.getPath(), null,
                                childFile.length(), true, isIndexed));
                    } else {
                        String childPath =
                                FileUtils.getFile(childFile.getPath().substring(rootPath.length())).getPath();
                        files.add(new FileDetail(childFile.getName(), childPath, childFile.getPath(),
                                childFile.length(), false, isIndexed));
                    }
                }
            }
            // Close the index
            if (reader != null && isAdmin) {
                reader.close();
            }
        }
    } catch (Exception e) {
        throw new SilverCrawlerRuntimeException("FileFolder.FileFolder()",
                SilverpeasRuntimeException.ERROR, "silverCrawler.IMPOSSIBLE_DACCEDER_AU_REPERTOIRE", e);
    }
}
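Here docFreq serves as an existence test rather than a statistic: the "key" field stores one unique path per indexed entry, so a document frequency of exactly 1 means the file or folder is present in the index.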
From source file: org.sindice.siren.search.node.NodeScoringRewrite.java
License: Apache License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery();
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);
    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos],
                    termStates[pos]);
        }
    }
    return result;
}
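In this rewrite, docFreq appears only inside an assertion: the document frequency cached in each collected TermContext is cross-checked against a fresh reader lookup before the clause is added to the rewritten query.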
From source file: org.sindice.siren.search.node.TopNodeTermsRewrite.java
License: Apache License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
                attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();
        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator;
            assert this.compareToLastTerm(null);
            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRef lastTerm;

        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // make sure within a single seg we always collect terms in order
            assert this.compareToLastTerm(bytes);
            //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
            // ignore uncompetitive hits
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(),
                        termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(),
                        termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }
            return true;
        }
    });

    final Q q = this.getTopLevelQuery();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);
    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term)
                + " vs " + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
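As in the previous example, the reader-level docFreq is used purely as a sanity check: the per-segment frequencies accumulated in each ScoreTerm's TermContext (via termsEnum.docFreq()) must add up to the value the top-level reader reports.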
From source file: perf.CreateQueries.java
License: Apache License
private static void processShingles(IndexReader r, String field, Writer queriesOut) throws IOException {
    System.out.println("\nFind phrase queries...");
    // First pass: get high/medium freq shingles:
    final TermFreq[] topShingles = getTopTermsByDocFreq(r, field, TOP_N, true);
    long topDF = topShingles[0].df;
    int upto = 0;
    int counter = 0;
    while (topShingles[upto].df >= topDF / 10) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        queriesOut.write("HighPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1
                + "|" + df2 + "\n");
        queriesOut.write("HighSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        queriesOut.write("HighSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    counter = 0;
    while (topShingles[upto].df >= topDF / 100) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        queriesOut.write("MedPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1
                + "|" + df2 + "\n");
        queriesOut.write("MedSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        queriesOut.write("MedSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    counter = 0;
    while (topShingles[upto].df >= topDF / 1000) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        queriesOut.write("LowPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1
                + "|" + df2 + "\n");
        queriesOut.write("LowSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        queriesOut.write("LowSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    queriesOut.flush();
}
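This benchmark generator records three document frequencies per output line: the two-word shingle's own frequency (tf.df) plus a docFreq lookup for each of its component terms, so every generated phrase query carries both phrase-level and term-level statistics.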
From source file: perf.PKLookupPerfTest3X.java
License: Apache License
public static void main(String[] args) throws IOException {
    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);
    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }
    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }
    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);
    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }
    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);
    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println(" Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);
        final AtomicBoolean failed = new AtomicBoolean(false);
        final Term t = new Term("id", "");
        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);
        // cycle 0 is for warming
        //System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");
        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }
        if (DO_DOC_LOOKUP) {
            System.out.println(" Verify...");
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }
    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");
    r.close();
    dir.close();
}
From source file: retriever.TermStats.java
TermStats(String term, int tf, IndexReader reader) throws Exception {
    this.term = term;
    this.tf = tf;
    idf = (float) Math.log(reader.numDocs()
            / (float) reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term)));
}
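One caveat with the expression above: reader.docFreq(...) returns 0 for a term that does not occur in the index, so the float division yields positive infinity rather than throwing. A zero-safe variant (a sketch only; the +1 smoothing is a common convention and not part of the original source):

static float safeIdf(IndexReader reader, String term) throws IOException {
    // docFreq is 0 for unseen terms; +1 keeps the quotient finite.
    int df = reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term));
    return (float) Math.log(reader.numDocs() / (float) (df + 1));
}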
From source file: ro.ranking.technique.bm25.BM25FTermScorer.java
License: Apache License
public BM25FTermScorer(IndexReader reader, TermQuery term, String[] fields, float[] boosts,
        float[] bParams, Similarity similarity) {
    super(similarity);
    this.fields = fields;
    this.boosts = boosts;
    this.bParam = bParams;
    len = fields.length;
    this.termDocs = new TermDocs[len];
    this.termDocsNext = new boolean[len];
    this.norms = new byte[len][];
    this.averageLengths = new float[len];
    this.K1 = BM25FParameters.getK1();
    this.termBoost = term.getBoost();
    this.numDocs = reader.numDocs();
    this.termText = term.getTerm().text();
    try {
        // A single docFreq over the dedicated IDF field yields one IDF shared by all fields.
        this.docFreq = reader.docFreq(new Term(BM25FParameters.getIdfField(), termText));
        for (int i = 0; i < len; i++) {
            String field = this.fields[i];
            this.termDocs[i] = reader.termDocs(new Term(field, termText));
            norms[i] = reader.norms(field);
            averageLengths[i] = BM25FParameters.getAverageLength(field);
        }
        this.idf = this.getSimilarity().idf(docFreq, numDocs);
    } catch (IOException e) {
        // Note: the IOException is silently swallowed, leaving the scorer partially initialized.
    }
}
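The BM25F scorer issues a single docFreq call against a dedicated IDF field, so all scored fields share one IDF value while the per-field statistics (norms, average lengths) remain separate.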
From source file: ro.ranking.technique.bm25.BM25TermScorer.java
License: Apache License
public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException {
    super(similarity);
    this.reader = reader;
    this.term = term;
    this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
    this.norm = this.reader.norms(this.term.getTerm().field());
    this.avgLength = BM25Parameters.getAverageLength(this.term.getTerm().field());
    this.b = BM25Parameters.getB();
    this.k1 = BM25Parameters.getK1();
    this.termDocs = this.reader.termDocs(this.term.getTerm());
}
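This single-field BM25 scorer shows the most common use of the method: docFreq and numDocs are passed straight to Similarity.idf to weight the term.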