List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term — the term to look up. Returns the number of documents containing the term; deleted documents that have not yet been merged away are still counted.
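Before the collected examples, a minimal self-contained sketch of the call, written against the Lucene 3.x API that most of the examples below use; the index path and the field/term values are placeholders for illustration, not taken from any example on this page:

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at an existing Lucene index.
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            // Number of documents whose "contents" field contains the term "lucene".
            // Field and term are assumptions for this sketch; note that docFreq
            // still counts deleted documents that have not yet been merged away.
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq(contents:lucene) = " + df);
        } finally {
            reader.close();
        }
    }
}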
From source file:com.semantic.util.suggest.Suggestion.java
public static Suggestion create(Lookup.LookupResult result, IndexReader reader, String[] fields) {
    Suggestion ret = new Suggestion(result);
    if (reader != null && fields != null) {
        /* remove bold tag */
        String text = format(result.key.toString());
        try {
            for (String field : fields) {
                int count = reader.docFreq(new Term(field, text));
                if (count > 0) {
                    ret.lucene = field;
                    ret.fieldCount = count;
                }
            }
        } catch (IOException ex) {
        }
    }
    return ret;
}
From source file:com.silverpeas.silvercrawler.model.FileFolder.java
License:Open Source License
public FileFolder(String rootPath, String path, boolean isAdmin, String componentId) {
    files = new ArrayList<FileDetail>(0);
    folders = new ArrayList<FileDetail>(0);
    String childPath = null;
    try {
        SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                "Starting constructor for FileFolder. Path = " + path);
        File f = new File(path);
        File fChild;
        SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                "isExists " + f.exists() + " isFile=" + f.isFile());
        writable = f.canWrite();
        if (f.exists()) {
            this.name = f.getName();
            this.readable = f.canRead();
            String[] children_name = f.list();
            IndexReader reader = null;
            boolean isIndexed = false;
            if (isAdmin) {
                // open the index
                String indexPath = FileRepositoryManager.getAbsoluteIndexPath("", componentId);
                if (IndexReader.indexExists(indexPath))
                    reader = IndexReader.open(indexPath);
            }
            for (int i = 0; children_name != null && i < children_name.length; i++) {
                SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                        "Name = " + children_name[i]);
                fChild = new File(path + File.separator + children_name[i]);
                isIndexed = false;
                if (isAdmin) {
                    // check whether the directory (or file) is indexed
                    String pathIndex = componentId + "|";
                    if (fChild.isDirectory())
                        pathIndex = pathIndex + "LinkedDir" + "|";
                    else
                        pathIndex = pathIndex + "LinkedFile" + "|";
                    pathIndex = pathIndex + fChild.getPath();
                    SilverTrace.debug("silverCrawler", "FileFolder.FileFolder()", "root.MSG_GEN_PARAM_VALUE",
                            "pathIndex = " + pathIndex);
                    Term term = new Term("key", pathIndex);
                    if (reader != null && reader.docFreq(term) == 1)
                        isIndexed = true;
                }
                if (fChild.isDirectory()) {
                    folders.add(new FileDetail(fChild.getName(), fChild.getPath(), fChild.length(), true, isIndexed));
                } else {
                    childPath = fChild.getPath().substring(rootPath.length() + 1);
                    files.add(new FileDetail(fChild.getName(), childPath, fChild.length(), false, isIndexed));
                }
            }
            // close the index
            if (reader != null && isAdmin)
                reader.close();
        }
    } catch (Exception e) {
        throw new SilverCrawlerRuntimeException("FileFolder.FileFolder()", SilverpeasRuntimeException.ERROR,
                "silverCrawler.IMPOSSIBLE_DACCEDER_AU_REPERTOIRE", e);
    }
}
From source file:com.sindicetech.siren.search.node.NodeScoringRewrite.java
License:Open Source License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery(query);
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);

    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos], termStates[pos]);
        }
    }
    return result;
}
From source file:com.sindicetech.siren.search.node.TopNodeTermsRewrite.java
License:Open Source License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {

        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            assert this.compareToLastTerm(null);

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRef lastTerm;

        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();

            // make sure within a single seg we always collect terms in order
            assert this.compareToLastTerm(bytes);

            //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);

            // ignore uncompetitive hits
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }
            return true;
        }
    });

    final Q q = this.getTopLevelQuery(query);
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs "
                + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
From source file:edu.coeia.tasks.CommonKeywordsTask.java
License:Open Source License
public Map<String, Integer> getAllTermFreqFromItems() throws IOException {
    Map<String, Integer> map = new HashMap<String, Integer>();
    String indexDir = this.aCase.getCaseLocation() + File.separator + ApplicationConstants.CASE_INDEX_FOLDER;

    Directory dir = FSDirectory.open(new File(indexDir));
    IndexReader indexReader = IndexReader.open(dir);

    TermEnum terms = indexReader.terms();
    int factor = indexReader.maxDoc() / 100;

    while (terms.next()) {
        if (isCancelledTask())
            break;

        Term term = terms.term();
        if (this.isAllowedFeild(term.field().trim())) {
            String termText = term.text();
            int frequency = indexReader.docFreq(term);

            if (frequency >= factor)
                map.put(termText, frequency);
        }
    }

    System.out.println("map size: " + map.size());
    indexReader.close();
    return map;
}
From source file:edu.mit.ll.vizlinc.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all terms texts of a given Query into an array of WeightedTerms
 *
 * @param query     Query to extract term texts from
 * @param reader    used to compute IDF which can be used to a) score selected fragments better
 *                  b) use graded highlights eg changing intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java
License:Apache License
/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
 * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
 *
 * <p>
 *
 * @param query
 *          that caused hit
 * @param tokenStream
 *          of text to be highlighted
 * @param fieldName
 *          restricts Term's used based on field name
 * @param reader
 *          to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException
 */
public Map<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
        String fieldName, IndexReader reader) throws IOException {
    if (fieldName != null) {
        this.fieldName = StringHelper.intern(fieldName);
    } else {
        this.fieldName = null;
    }
    this.tokenStream = tokenStream;

    Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
    extract(query, terms);

    int totalNumDocs = reader.numDocs();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();

    try {
        while (it.hasNext()) {
            WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
            int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            weightedSpanTerm.weight *= idf;
        }
    } finally {
        closeReaders();
    }
    return terms;
}
From source file:edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java
License:Apache License
@SuppressWarnings("deprecation") public static void search(String system, int documentType, int queryNumber, String indexDirectoryPath, String queryString, String fileOutput, String[] targetClasses, boolean runIndividualTerms, boolean append) throws Exception { String index = indexDirectoryPath; FileWriter f = new FileWriter(index + "../NotFound.txt", true); for (int i = 0; i < targetClasses.length; i++) { String target = targetClasses[i]; boolean found = Indexer.isFileDocInIndex(indexDirectoryPath, target); if (!found) f.append("Target doc " + i + " - " + target + " not found in index!\n"); }//from www. ja va2s .c om f.close(); IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true); int numDocs = reader.numDocs(); System.out.println("The number of documents in the index is: " + numDocs); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35); String[] fields; fields = new String[1]; fields[0] = "contents"; if (!runIndividualTerms) { MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer); int hitsPerPage = numDocs; TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); Query query = parser.parse(queryString); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println("The number of hits is: " + hits.length); // file with the results (score and position) only for the relevant // documents // the file contains entries in the following format: // (queryNumber,relDoc1,posRelDoc1,scoreRelDoc1,relDoc2,posRelDoc2,scoreRelDoc2,...) FileWriter fwRelevant = new FileWriter(fileOutput, append); String path = ""; String docName = ""; String docPathAndName = ""; for (String target : targetClasses) { boolean found = false; for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); path = d.get("path"); float score = hits[i].score; if (documentType == 2) { docName = d.get("docName"); docPathAndName = path.toLowerCase() + "." + docName.toLowerCase(); if (target.equalsIgnoreCase(docPathAndName)) { fwRelevant.write(system + ";" + queryNumber + ";" + target + ";" + (i + 1) + ";" + hits.length + ";" + numDocs + ";" + score + "\n"); found = true; break; } } else if (documentType == 1) { File pathDir = new File(path.trim()); String fileName = pathDir.getName(); docName = fileName.replaceAll(".txt", ""); fwRelevant.write((i + 1) + ". 
doc = " + docName + " score = " + score + "\n"); } } if (found == false) fwRelevant.write(system + ";" + queryNumber + ";" + target + "; NOT_RETRIEVED" + "\n"); } // fw.close(); fwRelevant.close(); reader.close(); } else // runIndividualTerms = true { /** * each query will be divided in its constituent terms and each term * will be run as a separate query **/ /** * this is useful to determine the similarity of each of the terms * in a query to a target document so that we determine which terms * in the query tend to lead to the best results, i.e., to finding * the targets sooner **/ SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, queryString, fileOutput.replaceAll(".txt", "_wholeQuery.txt"), targetClasses, false, append); FileWriter fw = new FileWriter(fileOutput.replaceAll(".txt", "_terms.txt")); fw.write( "\n\n\n------------------------------------------------------------------------------------\n\n"); fw.write(" Results for query " + queryNumber + "\n"); fw.write("------------------------------------------------------------------------------------\n\n"); // file with the results (score and position) only for the relevant // documents // the file contains entries in the following format: // (queryNumber,term1,term1TF,term1DF,relDoc1,posRelDoc1Term1,scoreRelDoc1Term1,relDoc2,posRelDoc2Term1,scoreRelDoc2Term1,...) // (queryNumber,term2,term2TF,term2DF,relDoc1,posRelDoc1Term2,scoreRelDoc1Term2,relDoc2,posRelDoc2Term2,scoreRelDoc2Term2,...) // ... FileWriter fwRelevant = new FileWriter( fileOutput.replaceAll(".txt", "_terms_RelevantDocsPositions.txt")); String[] queryTerms = queryString.split(" "); for (int l = 0; l < queryTerms.length; l++) { MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer); int hitsPerPage = numDocs; TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); String q = queryTerms[l]; Query query = parser.parse(q); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; fw.write("TERM " + (l + 1) + ": " + q + "\n\n"); fwRelevant.write("\n" + queryNumber + "," + q); for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); String path = d.get("path"); float score = hits[i].score; if (documentType == 2) { String docName = d.get("docName"); fw.write((i + 1) + ". doc = " + path + " " + docName + " - score = " + score + "\n"); for (int k = 0; k < targetClasses.length; k++) { if (docName.equalsIgnoreCase(targetClasses[k])) { String contents = d.get("contents"); int frequency = countOccurrences(contents, q);// tf fwRelevant.write("," + frequency); fwRelevant.write("," + reader.docFreq(new Term("contents", q)));// df fwRelevant.write("," + path + "." + docName + "," + (i + 1) + "," + score); break; } } } else if (documentType == 1) { File pathDir = new File(path); String fileName = pathDir.getName(); String docName = fileName.replaceAll(".txt", ""); fw.write((i + 1) + ". doc = " + docName + " score = " + score + "\n"); } } fw.write("\n\n\n"); } fw.close(); f.close(); fwRelevant.close(); reader.close(); } }
From source file:engine.easy.search.EasySearchEngine.java
License:Apache License
/**
 * Computes the results based on the ranking function and other scoring factors.
 *
 * @param query the query whose terms are scored
 * @param ixReader the index reader
 * @param esiReader the custom easy index reader
 * @param relevanceDocMap per-document relevance-feedback boosts (may be null or empty)
 * @return the Results.
 */
public Result[] getResults(Query query, IndexReader ixReader, EasySearchIndexReader esiReader,
        Map<Integer, Float> relevanceDocMap) {

    Map<Integer, Result> results = null;

    try {
        Set<Term> terms = new HashSet<Term>();
        query.extractTerms(terms);

        results = new HashMap<Integer, Result>();
        Iterator<Term> itr = terms.iterator();

        while (itr.hasNext()) {
            Term term = itr.next();
            TermDocs docs = ixReader.termDocs(term);
            // get the document frequency of the term from lucene's index reader
            int docFreq = ixReader.docFreq(term);
            // get the total record count of the field from the lucene extra index
            // (you may think it is also possible to use ixreader.maxDoc() here, but
            // ixreader.maxDoc() only returns the number of documents, while some
            // documents may not have the search field (although every document has
            // the search field in this example))
            int docNum = esiReader.recordCount(AppConstants.CONTENT_FIELD);

            while (docs.next()) {
                Integer id = docs.doc(); // get the internal lucene's id of the document
                int termFreq = docs.freq(); // get the frequency of the term in this document
                // get the length of the document from the lucene extra index
                int docLen = esiReader.docLength(id, AppConstants.CONTENT_FIELD);
                // get the average length of the search field from the lucene extra index
                double avgDocLen = esiReader.avgFieldLength(AppConstants.CONTENT_FIELD);
                Document document = ixReader.document(id); // get the particular document
                String storedField = extractData(document.get(AppConstants.CONTENT_FIELD));

                // Compute the scoring with BM25 ranking and also include other scoring
                // factors such as (relevance feedback based on terms)
                BM25 bm25 = new BM25();
                //System.out.println(bm25.getInfo());

                // Also add the document boost in the ranking score.
                double termWeight = bm25.score(termFreq, docNum, docLen, avgDocLen, 1d, docFreq);

                // Add each document relevance score!
                if (relevanceDocMap != null && !relevanceDocMap.isEmpty() && relevanceDocMap.containsKey(id))
                    termWeight = termWeight * relevanceDocMap.get(id);

                //System.out.println("lucene id" + id + " Doc id " + document.getField("DOCID").stringValue() + "wieght" + termWeight);

                if (results.containsKey(id)) {
                    results.get(id).score = results.get(id).score + termWeight;
                } else {
                    Result result = new Result(new Integer(id), document.getField("DOCID").stringValue(),
                            termWeight, storedField);
                    results.put(id, result);
                }
            }
        }

        return sortArray(results, AppConstants.TOP_RESULTS);
    } catch (Exception e) {
        System.out.println("Exception: getResults " + e.toString());
    }

    return null;
}
From source file:engine.easy.search.RelevanceFeedBackUtil.java
License:Apache License
/**
 * Computes a term frequency map for the overall index at the specified location.
 * Builds a Boolean OR query out of the "most frequent" terms in the index
 * and returns it. "Most Frequent" is defined as the terms whose frequencies
 * are greater than or equal to the topTermCutoff * the frequency of the top
 * term, where the topTermCutoff is a number between 0 and 1.
 *
 * @param ramdir the directory where the index is created.
 * @param numOf unused; AppConstants.TOP_DOCUMENTS is passed to computeTopTermQuery instead.
 * @return a Boolean OR query.
 * @throws Exception if one is thrown.
 */
private static Query computeTopTermQueryFromDataCollection(Directory ramdir, int numOf) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();

    IndexReader reader = IndexReader.open(ramdir);
    TermEnum terms = reader.terms();

    while (terms.next()) {
        Term term = terms.term();
        String termText = term.text();
        int frequency = reader.docFreq(term);
        frequencyMap.put(termText, frequency);
        termlist.add(termText);
    }
    reader.close();

    return computeTopTermQuery(termlist, frequencyMap, AppConstants.TOP_DOCUMENTS);
}