Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexReader docFreq.

Prototype

public abstract int docFreq(Term term) throws IOException;

Source Link

Document

Returns the number of documents containing the term.

Usage

From source file: org.pageseeder.flint.lucene.search.Terms.java

License: Apache License

/**
 * Loads all the fuzzy terms in the list of terms given the reader.
 *
 * @param reader  Index reader to use./*w w  w  .  j a v  a  2 s  . c  o m*/
 * @param bucket  Where to store the terms.
 * @param term    The term to use.
 *
 * @throws IOException If an error is thrown by the fuzzy term enumeration.
 */
@Beta
public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term, int minSimilarity)
        throws IOException {
    AttributeSource atts = new AttributeSource();
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, atts, term, minSimilarity, 0, true);
    BytesRef val;
    BytesRef searched = term.bytes();
    while ((val = fuzzy.next()) != null) {
        if (!searched.bytesEquals(val)) {
            Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
            bucket.add(t, reader.docFreq(t));
        }
    }
}

From source file: org.pageseeder.flint.lucene.search.Terms.java

License: Apache License

/**
 * Loads all the prefix terms in the list of terms given the reader.
 *
 * @param reader  Index reader to use./*  w w  w .  ja va 2s.c  o  m*/
 * @param bucket  Where to store the terms.
 * @param term    The term to use.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())),
            term.bytes());
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
        bucket.add(t, reader.docFreq(t));
    }
}

From source file: org.silverpeas.components.silvercrawler.model.FileFolder.java

License: Open Source License

/**
 * Builds the folder view for the given path, listing its child files and folders.
 * For administrators, each entry is additionally flagged with its indexing status,
 * looked up in the component's Lucene index by a "componentId|LinkedDir/LinkedFile|path" key.
 *
 * Fix: the IndexReader was only closed on the success path; any exception thrown
 * while listing children leaked the reader. It is now closed in a finally block.
 *
 * @param rootPath    root the user may not browse outside of (security check)
 * @param path        absolute path of the folder to describe
 * @param isAdmin     whether to resolve indexing status (requires opening the index)
 * @param componentId component whose index is consulted
 */
public FileFolder(String rootPath, String path, boolean isAdmin, String componentId) {
    this.path = path;
    files = new ArrayList<>(0);
    folders = new ArrayList<>(0);

    try {
        // Security check: cannot browse outside rootPath
        FileUtil.validateFilename(path, rootPath);
        File f = new File(path);

        writable = f.canWrite();

        if (f.exists()) {
            this.name = f.getName();
            this.readable = f.canRead();
            File[] children = f.listFiles();

            IndexReader reader = null;
            try {
                if (isAdmin) {
                    // open the index: only admins see indexing status
                    Directory indexPath = FSDirectory
                            .open(new File(IndexFileManager.getAbsoluteIndexPath(componentId)));
                    if (IndexReader.indexExists(indexPath)) {
                        reader = IndexReader.open(indexPath);
                    }
                }
                if (children != null && children.length > 0) {
                    for (File childFile : children) {
                        boolean isIndexed = false;
                        if (isAdmin) {
                            // check whether the directory (or file) is indexed
                            String pathIndex = componentId + "|";
                            if (childFile.isDirectory()) {
                                pathIndex = pathIndex + "LinkedDir" + "|";
                            } else {
                                pathIndex = pathIndex + "LinkedFile" + "|";
                            }
                            pathIndex = pathIndex + FilenameUtils.separatorsToUnix(childFile.getPath());
                            Term term = new Term("key", pathIndex);
                            // the key is unique, so docFreq == 1 means "indexed"
                            if (reader != null && reader.docFreq(term) == 1) {
                                isIndexed = true;
                            }
                        }

                        if (childFile.isDirectory()) {
                            folders.add(new FileDetail(childFile.getName(), childFile.getPath(), null,
                                    childFile.length(), true, isIndexed));
                        } else {
                            // expose the path relative to rootPath for files
                            String childPath = FileUtils.getFile(childFile.getPath().substring(rootPath.length()))
                                    .getPath();
                            files.add(new FileDetail(childFile.getName(), childPath, childFile.getPath(),
                                    childFile.length(), false, isIndexed));
                        }
                    }
                }
            } finally {
                // always release the index reader, even if listing fails
                if (reader != null) {
                    reader.close();
                }
            }
        }
    } catch (Exception e) {
        throw new SilverCrawlerRuntimeException("FileFolder.FileFolder()", SilverpeasRuntimeException.ERROR,
                "silverCrawler.IMPOSSIBLE_DACCEDER_AU_REPERTOIRE", e);
    }
}

From source file: org.sindice.siren.search.node.NodeScoringRewrite.java

License: Apache License

/**
 * Rewrites the multi-node term query into the top-level query by adding one clause
 * per collected term, in the order defined by the terms comparator.
 *
 * @param reader the index reader to collect matching terms from
 * @param query  the multi-node term query being rewritten
 * @return the rewritten top-level query
 * @throws IOException if term collection fails
 */
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q topLevel = this.getTopLevelQuery();
    final ParallelArraysTermCollector collector = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, collector);

    final int count = collector.terms.size();
    if (count == 0) {
        return topLevel;
    }
    final int[] order = collector.terms.sort(collector.termsEnum.getComparator());
    final float[] boosts = collector.array.boost;
    final TermContext[] states = collector.array.termState;
    for (int i = 0; i < count; i++) {
        final int pos = order[i];
        final Term term = new Term(query.getField(), collector.terms.get(pos, new BytesRef()));
        // the collected TermContext must agree with the live reader
        assert reader.docFreq(term) == states[pos].docFreq();
        this.addClause(topLevel, term, states[pos].docFreq(), query.getBoost() * boosts[pos], states[pos]);
    }
    return topLevel;
}

From source file: org.sindice.siren.search.node.TopNodeTermsRewrite.java

License: Apache License

/**
 * Rewrites a multi-node term query by keeping only the top (at most maxSize)
 * boosted terms, collected into a bounded priority queue, then adding one clause
 * per surviving term to the top-level query in term order.
 *
 * @param reader the index reader to collect matching terms from
 * @param query  the multi-node term query being rewritten
 * @return the rewritten top-level query
 * @throws IOException if term collection fails
 */
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    // bound the queue by both the requested size and the rewrite's hard maximum
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // maps term bytes -> its queue entry so the same term seen in several
        // segments updates one accumulated TermContext instead of duplicating
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        // spare entry reused for the next insertion (recycled when the queue overflows)
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            // reset the per-segment "terms arrive in order" assertion state
            assert this.compareToLastTerm(null);

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRef lastTerm;

        // assertion helper: verifies each segment delivers terms in strictly
        // increasing order; always returns true so it vanishes without -ea
        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();

            // make sure within a single seg we always collect
            // terms in order
            assert this.compareToLastTerm(bytes);

            //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
            // ignore uncompetitive hits: a full queue whose weakest entry beats
            // this candidate (by boost, then by term order) means we can skip it
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    // queue overflowed: evict the weakest entry and recycle it as the spare
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }

            return true;
        }
    });

    // emit the surviving terms in term order (not queue order)
    final Q q = this.getTopLevelQuery();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        // the accumulated per-segment docFreqs must add up to the reader's docFreq
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs "
                + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}

From source file: perf.CreateQueries.java

License: Apache License

/**
 * Writes phrase, sloppy-phrase and span-near benchmark queries for two-word
 * shingles, bucketed as High / Med / Low frequency relative to the top shingle's
 * document frequency (>= topDF/10, /100, /1000 respectively).
 *
 * Fixes: the three copy-pasted bucket loops are factored into one helper, and the
 * helper bounds-checks {@code upto} so a short shingle list no longer throws
 * ArrayIndexOutOfBoundsException.
 *
 * @param r          reader used to look up per-term document frequencies
 * @param field      field the shingles were extracted from
 * @param queriesOut destination for the generated query lines (flushed at the end)
 * @throws IOException if a docFreq lookup or a write fails
 */
private static void processShingles(IndexReader r, String field, Writer queriesOut) throws IOException {
    System.out.println("\nFind phrase queries...");
    // First pass: get high/medium freq shingles:
    final TermFreq[] topShingles = getTopTermsByDocFreq(r, field, TOP_N, true);

    final long topDF = topShingles[0].df;
    int upto = 0;
    // each bucket continues where the previous one stopped
    upto = writeShingleQueries(r, field, queriesOut, topShingles, upto, topDF / 10, "High");
    upto = writeShingleQueries(r, field, queriesOut, topShingles, upto, topDF / 100, "Med");
    writeShingleQueries(r, field, queriesOut, topShingles, upto, topDF / 1000, "Low");
    queriesOut.flush();
}

/**
 * Writes up to NUM_QUERIES query triples (Phrase / SloppyPhrase / SpanNear) for
 * shingles starting at {@code upto} whose docFreq is at least {@code minDF}.
 *
 * @return the index of the first shingle not consumed by this bucket
 * @throws RuntimeException if a shingle does not split into exactly two terms
 */
private static int writeShingleQueries(IndexReader r, String field, Writer queriesOut, TermFreq[] topShingles,
        int upto, long minDF, String label) throws IOException {
    int counter = 0;
    while (upto < topShingles.length && topShingles[upto].df >= minDF) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        // per-word docFreqs are recorded alongside the shingle's own frequency
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        String phrase = tf.term.utf8ToString();
        String freqSuffix = " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n";
        queriesOut.write(label + "Phrase: \"" + phrase + "\"" + freqSuffix);
        queriesOut.write(label + "SloppyPhrase: \"" + phrase + "\"~4" + freqSuffix);
        queriesOut.write(label + "SpanNear: near//" + phrase + freqSuffix);
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    return upto;
}

From source file: perf.PKLookupPerfTest3X.java

License: Apache License

/**
 * Benchmarks primary-key lookups against a Lucene 3.x index.
 *
 * Args: dirImpl (MMapDirectory|NIOFSDirectory|SimpleFSDirectory), dirPath,
 * numDocs, numLookups, seed. Runs 10 cycles (cycle 0 is warm-up), each doing
 * numLookups random id lookups across all segments, and fails if any id is
 * found more than once or resolves to the wrong document.
 *
 * @throws IOException if the index cannot be opened or read
 */
public static void main(String[] args) throws IOException {

    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);

    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }

    // build the index only on the first run against this path
    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }

    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);

    // one reusable TermDocs per segment to avoid per-lookup allocation
    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }

    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);

    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println("  Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);

        final AtomicBoolean failed = new AtomicBoolean(false);

        final Term t = new Term("id", "");

        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            // 'base' converts a per-segment docID into a top-level docID
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    // docFreq-only mode: just verify the id exists exactly once
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    // full mode: resolve the id to its (single) top-level docID
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);

        // cycle 0 is for warming
        //System.out.println("  " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println("  " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");

        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }

        if (DO_DOC_LOOKUP) {
            System.out.println("  Verify...");
            // re-read each looked-up doc and confirm its stored id matches
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }

    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");

    r.close();
    dir.close();
}

From source file: retriever.TermStats.java

TermStats(String term, int tf, IndexReader reader) throws Exception {
    this.term = term;
    this.tf = tf;
    idf = (float) (Math.log(reader.numDocs()
            / (float) (reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term)))));
}

From source file: ro.ranking.technique.bm25.BM25FTermScorer.java

License: Apache License

/**
 * Creates a BM25F scorer for a single term scored across multiple weighted fields.
 *
 * Fix: the IOException from docFreq/termDocs/norms was silently swallowed, leaving
 * docFreq, termDocs, norms and idf uninitialized and causing obscure failures at
 * scoring time; it is now rethrown as a RuntimeException with the cause attached.
 *
 * @param reader     index reader supplying frequencies and norms
 * @param term       the scored term query (its boost and text are used)
 * @param fields     the fields the term is scored over
 * @param boosts     per-field boost weights (parallel to {@code fields})
 * @param bParams    per-field length-normalization parameters (parallel to {@code fields})
 * @param similarity similarity used to compute the IDF
 */
public BM25FTermScorer(IndexReader reader, TermQuery term, String[] fields, float[] boosts, float[] bParams,
        Similarity similarity) {
    super(similarity);
    this.fields = fields;
    this.boosts = boosts;
    this.bParam = bParams;
    len = fields.length;
    this.termDocs = new TermDocs[len];
    this.termDocsNext = new boolean[len];
    this.norms = new byte[len][];
    this.averageLengths = new float[len];
    this.K1 = BM25FParameters.getK1();
    this.termBoost = term.getBoost();
    this.numDocs = reader.numDocs();
    this.termText = term.getTerm().text();

    try {
        // the IDF is taken from a single dedicated field, not per scored field
        this.docFreq = reader.docFreq(new Term(BM25FParameters.getIdfField(), termText));
        for (int i = 0; i < len; i++) {
            String field = this.fields[i];
            this.termDocs[i] = reader.termDocs(new Term(field, termText));
            norms[i] = reader.norms(field);
            averageLengths[i] = BM25FParameters.getAverageLength(field);
        }
        this.idf = this.getSimilarity().idf(docFreq, numDocs);
    } catch (IOException e) {
        throw new RuntimeException("I/O error initializing BM25F scorer for term '" + termText + "'", e);
    }

}

From source file: ro.ranking.technique.bm25.BM25TermScorer.java

License: Apache License

public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException {
    super(similarity);
    this.reader = reader;
    this.term = term;
    this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
    this.norm = this.reader.norms(this.term.getTerm().field());
    this.avgLength = BM25Parameters.getAverageLength(this.term.getTerm().field());
    this.b = BM25Parameters.getB();
    this.k1 = BM25Parameters.getK1();
    this.termDocs = this.reader.termDocs(this.term.getTerm());
}