Usage examples for org.apache.lucene.index.IndexReader.docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term, the term whose document frequency is requested. Returns the number of documents containing the term; deleted documents that have not yet been merged away are still counted.
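Before the project-specific examples below, here is a minimal, self-contained sketch of calling docFreq directly (Lucene 4.x-style APIs, matching most examples on this page; the index path and field name are hypothetical):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location and field name; adjust to a real index.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            Term term = new Term("contents", "lucene");
            // Number of documents containing the exact term "lucene" in field "contents".
            int df = reader.docFreq(term);
            System.out.println(term + " occurs in " + df + " documents");
        } finally {
            reader.close();
            dir.close();
        }
    }
}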
From source file: org.pageseeder.flint.lucene.search.Terms.java
License: Apache License
/**
 * Loads all the fuzzy terms in the list of terms given the reader.
 *
 * @param reader        Index reader to use.
 * @param bucket        Where to store the terms.
 * @param term          The term to use.
 * @param minSimilarity The minimum similarity threshold for fuzzy matching.
 *
 * @throws IOException If an error is thrown by the fuzzy term enumeration.
 */
@Beta
public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term, int minSimilarity)
        throws IOException {
    AttributeSource atts = new AttributeSource();
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, atts, term, minSimilarity, 0, true);
    BytesRef val;
    BytesRef searched = term.bytes();
    while ((val = fuzzy.next()) != null) {
        // Skip the searched term itself; collect every other fuzzy match.
        if (!searched.bytesEquals(val)) {
            Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
            bucket.add(t, reader.docFreq(t));
        }
    }
}
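In this example, docFreq supplies the count stored with each fuzzy variant, so the bucket can presumably rank more frequent spellings higher; the searched term itself is skipped.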
From source file: org.pageseeder.flint.lucene.search.Terms.java
License: Apache License
/**
 * Loads all the prefix terms in the list of terms given the reader.
 *
 * @param reader Index reader to use.
 * @param bucket Where to store the terms.
 * @param term   The term to use.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    TermsEnum prefixes =
            terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes());
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
        bucket.add(t, reader.docFreq(t));
    }
}
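The prefix variant follows the same pattern, but enumerates matching terms by intersecting the field's term dictionary with a compiled prefix automaton; docFreq again supplies the per-term document count.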
From source file: org.silverpeas.components.silvercrawler.model.FileFolder.java
License: Open Source License
public FileFolder(String rootPath, String path, boolean isAdmin, String componentId) {
    this.path = path;
    files = new ArrayList<>(0);
    folders = new ArrayList<>(0);
    try {
        // Security check: cannot browse outside rootPath
        FileUtil.validateFilename(path, rootPath);
        File f = new File(path);
        writable = f.canWrite();
        if (f.exists()) {
            this.name = f.getName();
            this.readable = f.canRead();
            File[] children = f.listFiles();
            IndexReader reader = null;
            boolean isIndexed = false;
            if (isAdmin) {
                // Open the index
                Directory indexPath =
                        FSDirectory.open(new File(IndexFileManager.getAbsoluteIndexPath(componentId)));
                if (IndexReader.indexExists(indexPath)) {
                    reader = IndexReader.open(indexPath);
                }
            }
            if (children != null && children.length > 0) {
                for (File childFile : children) {
                    isIndexed = false;
                    if (isAdmin) {
                        // Check whether the directory (or the file) is indexed
                        String pathIndex = componentId + "|";
                        if (childFile.isDirectory()) {
                            pathIndex = pathIndex + "LinkedDir" + "|";
                        } else {
                            pathIndex = pathIndex + "LinkedFile" + "|";
                        }
                        pathIndex = pathIndex + FilenameUtils.separatorsToUnix(childFile.getPath());
                        Term term = new Term("key", pathIndex);
                        if (reader != null && reader.docFreq(term) == 1) {
                            isIndexed = true;
                        }
                    }
                    if (childFile.isDirectory()) {
                        folders.add(new FileDetail(childFile.getName(), childFile.getPath(), null,
                                childFile.length(), true, isIndexed));
                    } else {
                        String childPath =
                                FileUtils.getFile(childFile.getPath().substring(rootPath.length())).getPath();
                        files.add(new FileDetail(childFile.getName(), childPath, childFile.getPath(),
                                childFile.length(), false, isIndexed));
                    }
                }
            }
            // Close the index
            if (reader != null && isAdmin) {
                reader.close();
            }
        }
    } catch (Exception e) {
        throw new SilverCrawlerRuntimeException("FileFolder.FileFolder()",
                SilverpeasRuntimeException.ERROR, "silverCrawler.IMPOSSIBLE_DACCEDER_AU_REPERTOIRE", e);
    }
}
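Here docFreq serves as an existence test rather than a statistic: the "key" field stores one unique path per indexed entry, so a document frequency of exactly 1 means the file or folder is present in the index.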
From source file: org.sindice.siren.search.node.NodeScoringRewrite.java
License: Apache License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery();
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);
    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos],
                    termStates[pos]);
        }
    }
    return result;
}
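In this rewrite, docFreq appears only inside an assertion: the document frequency cached in each collected TermContext is cross-checked against a fresh reader lookup before the clause is added to the rewritten query.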
From source file: org.sindice.siren.search.node.TopNodeTermsRewrite.java
License: Apache License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt =
                attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();
        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator;
            assert this.compareToLastTerm(null);
            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRef lastTerm;

        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // make sure within a single seg we always collect terms in order
            assert this.compareToLastTerm(bytes);
            //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
            // ignore uncompetitive hits
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(),
                        termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(),
                        termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }
            return true;
        }
    });

    final Q q = this.getTopLevelQuery();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);
    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term)
                + " vs " + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
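As in the previous example, the reader-level docFreq is used purely as a sanity check: the per-segment frequencies accumulated in each ScoreTerm's TermContext (via termsEnum.docFreq()) must add up to the value the top-level reader reports.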
From source file: perf.CreateQueries.java
License: Apache License
private static void processShingles(IndexReader r, String field, Writer queriesOut) throws IOException {
    System.out.println("\nFind phrase queries...");
    // First pass: get high/medium freq shingles:
    final TermFreq[] topShingles = getTopTermsByDocFreq(r, field, TOP_N, true);
    long topDF = topShingles[0].df;
    int upto = 0;
    int counter = 0;
    while (topShingles[upto].df >= topDF / 10) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        queriesOut.write("HighPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1
                + "|" + df2 + "\n");
        queriesOut.write("HighSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        queriesOut.write("HighSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    counter = 0;
    while (topShingles[upto].df >= topDF / 100) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        queriesOut.write("MedPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1
                + "|" + df2 + "\n");
        queriesOut.write("MedSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        queriesOut.write("MedSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    counter = 0;
    while (topShingles[upto].df >= topDF / 1000) {
        final TermFreq tf = topShingles[upto];
        String[] terms = tf.term.utf8ToString().split(" ");
        if (terms.length != 2) {
            throw new RuntimeException("expected two terms from " + tf.term.utf8ToString());
        }
        int df1 = r.docFreq(new Term(field, terms[0]));
        int df2 = r.docFreq(new Term(field, terms[1]));
        queriesOut.write("LowPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1
                + "|" + df2 + "\n");
        queriesOut.write("LowSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        queriesOut.write("LowSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|"
                + df1 + "|" + df2 + "\n");
        upto++;
        counter++;
        if (counter >= NUM_QUERIES) {
            break;
        }
    }
    queriesOut.flush();
}
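This benchmark generator records three document frequencies per output line: the two-word shingle's own frequency (tf.df) plus a docFreq lookup for each of its component terms, so every generated phrase query carries both phrase-level and term-level statistics.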
From source file: perf.PKLookupPerfTest3X.java
License: Apache License
public static void main(String[] args) throws IOException {
    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);
    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }
    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }
    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);
    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }
    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);
    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println(" Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);
        final AtomicBoolean failed = new AtomicBoolean(false);
        final Term t = new Term("id", "");
        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);
        // cycle 0 is for warming
        //System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println(" " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");
        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }
        if (DO_DOC_LOOKUP) {
            System.out.println(" Verify...");
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }
    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");
    r.close();
    dir.close();
}
From source file: retriever.TermStats.java
TermStats(String term, int tf, IndexReader reader) throws Exception {
    this.term = term;
    this.tf = tf;
    idf = (float) Math.log(reader.numDocs()
            / (float) reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term)));
}
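One caveat with the expression above: reader.docFreq(...) returns 0 for a term that does not occur in the index, so the float division yields positive infinity rather than throwing. A zero-safe variant (a sketch only; the +1 smoothing is a common convention and not part of the original source):

static float safeIdf(IndexReader reader, String term) throws IOException {
    // docFreq is 0 for unseen terms; +1 keeps the quotient finite.
    int df = reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term));
    return (float) Math.log(reader.numDocs() / (float) (df + 1));
}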
From source file: ro.ranking.technique.bm25.BM25FTermScorer.java
License: Apache License
public BM25FTermScorer(IndexReader reader, TermQuery term, String[] fields, float[] boosts,
        float[] bParams, Similarity similarity) {
    super(similarity);
    this.fields = fields;
    this.boosts = boosts;
    this.bParam = bParams;
    len = fields.length;
    this.termDocs = new TermDocs[len];
    this.termDocsNext = new boolean[len];
    this.norms = new byte[len][];
    this.averageLengths = new float[len];
    this.K1 = BM25FParameters.getK1();
    this.termBoost = term.getBoost();
    this.numDocs = reader.numDocs();
    this.termText = term.getTerm().text();
    try {
        // A single docFreq over the dedicated IDF field yields one IDF shared by all fields.
        this.docFreq = reader.docFreq(new Term(BM25FParameters.getIdfField(), termText));
        for (int i = 0; i < len; i++) {
            String field = this.fields[i];
            this.termDocs[i] = reader.termDocs(new Term(field, termText));
            norms[i] = reader.norms(field);
            averageLengths[i] = BM25FParameters.getAverageLength(field);
        }
        this.idf = this.getSimilarity().idf(docFreq, numDocs);
    } catch (IOException e) {
        // Note: the IOException is silently swallowed, leaving the scorer partially initialized.
    }
}
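The BM25F scorer issues a single docFreq call against a dedicated IDF field, so all scored fields share one IDF value while the per-field statistics (norms, average lengths) remain separate.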
From source file: ro.ranking.technique.bm25.BM25TermScorer.java
License: Apache License
public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException {
    super(similarity);
    this.reader = reader;
    this.term = term;
    this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
    this.norm = this.reader.norms(this.term.getTerm().field());
    this.avgLength = BM25Parameters.getAverageLength(this.term.getTerm().field());
    this.b = BM25Parameters.getB();
    this.k1 = BM25Parameters.getK1();
    this.termDocs = this.reader.termDocs(this.term.getTerm());
}
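This single-field BM25 scorer shows the most common use of the method: docFreq and numDocs are passed straight to Similarity.idf to weight the term.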