Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader docFreq.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
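
A minimal, self-contained sketch of the call, independent of the examples below. It assumes a Lucene 5.x-style API plus a hypothetical index path and a hypothetical "contents" field; adjust both to your own index. It also derives a simple IDF value by combining docFreq with getDocCount.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; replace with a real one.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            Term term = new Term("contents", "lucene");    // hypothetical field and term
            int df = reader.docFreq(term);                 // documents containing the term
            int docCount = reader.getDocCount("contents"); // documents with at least one term in the field
            double idf = df == 0 ? 0.0 : Math.log10((double) docCount / df);
            System.out.println("docFreq=" + df + ", idf=" + idf);
        }
    }
}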

Usage

From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java

public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    //        System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //                + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");

    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        //            System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2 || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                .getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //*************************** 
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost();// Term frequency
            double ln_tf = Math.log(1 + tf);// Get log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF));// Inverse document frequency
            }
            double tfidf = ln_tf * idf;// Compute the TFIDF
            int tLength = term.getTerm().text().length();// Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize(); // Query size
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll);// Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq));// Inverse Collection Term Frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs;// QueryScope

            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        //            System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
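
The example above combines docFreq with two other collection statistics from the same IndexReader. As a hedged, standalone sketch (the "claims" field and the term are hypothetical), the distinction between the three calls is:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

final class CollectionStats {
    static void print(IndexReader reader) throws IOException {
        Term t = new Term("claims", "engine");               // hypothetical field and term
        int df = reader.docFreq(t);                          // documents containing the term
        long ttf = reader.totalTermFreq(t);                  // occurrences of the term across the index
        long sumTtf = reader.getSumTotalTermFreq("claims");  // occurrences of all terms in the field
        System.out.println("df=" + df + ", Pcoll=" + (double) ttf / sumTtf); // Pcoll as computed above
    }
}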

From source file:nmsu.cs.TFIDFVector.java

License:Open Source License

/**
 * read in text tfidfvector to docVectorMap.
 * @param docVectorMap
 * @param ir
 * @param datasetID
 * @param indexID
 * @param numDocs
 * @return
 */
public static TFIDFVector getFullTextTFIDFVector(Map<Integer, TFIDFVector> docVectorMap, IndexReader ir,
        int datasetID, int indexID, int numDocs) {
    TFIDFVector vector = null;
    try {
        if ((vector = docVectorMap.get(datasetID)) == null) {
            TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, "fulltext");
            int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
            String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();

            int[] df = new int[tf.length];
            for (int j = 0; j < terms.length; j++)
                df[j] = ir.docFreq(new Term(terms[j]));

            vector = new TFIDFVector(terms, tf, df, numDocs);
            //            docVectorMap.put(datasetID, vector);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return vector;
}

From source file:nmsu.cs.TFIDFVector.java

License:Open Source License

/**
 * read in aspect tfidfvector to docVectorMap.
 * @param docAspectVectorMap
 * @param ir
 * @param datasetID
 * @param indexID
 * @param aspectID
 * @param numDocs
 * @return
 */
public static TFIDFVector getAspectTFIDFVector(Map<Integer, Map<Integer, TFIDFVector>> docAspectVectorMap,
        IndexReader ir, int datasetID, int indexID, int aspectID, int numDocs) {
    TFIDFVector vector = null;
    try {
        Map<Integer, TFIDFVector> aspectVectorMap = docAspectVectorMap.get(datasetID);
        if (aspectVectorMap == null) {
            aspectVectorMap = new HashMap<Integer, TFIDFVector>();
            docAspectVectorMap.put(datasetID, aspectVectorMap);

            TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, rawdata.id2Aspect.get(aspectID));
            int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
            String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();

            int[] df = new int[tf.length];
            for (int j = 0; j < terms.length; j++)
                df[j] = ir.docFreq(new Term(terms[j]));

            vector = new TFIDFVector(terms, tf, df, numDocs);
            //            aspectVectorMap.put(aspectID, vector);
        } else if ((vector = aspectVectorMap.get(aspectID)) == null) {
            TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, rawdata.id2Aspect.get(aspectID));
            int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
            String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();

            int[] df = new int[tf.length];
            for (int j = 0; j < terms.length; j++)
                df[j] = ir.docFreq(new Term(terms[j]));

            vector = new TFIDFVector(terms, tf, df, numDocs);
            //               aspectVectorMap.put(aspectID, vector);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return vector;
}

From source file:org.ala.lucene.Autocompleter.java

License:Open Source License

@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete, boolean createNewIndex)
        throws CorruptIndexException, IOException {
    // build a dictionary (from the spell package)
    IndexReader sourceReader = IndexReader.open(sourceDirectory);

    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);

    // code from
    // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
    // Dictionary)
    IndexWriter.unlock(autoCompleteDirectory);

    // use a custom analyzer so we can do EdgeNGramFiltering
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, new Analyzer() {
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final StandardTokenizer src = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            TokenStream result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, src);
            result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result,
                    new CharArraySet(SolrUtils.BIE_LUCENE_VERSION,
                            new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)), true));
            result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
            return new TokenStreamComponents(src, result) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    super.setReader(reader);
                }

            };
        }
        //            public TokenStream tokenStream(String fieldName, Reader reader) {
        //            TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
        //            
        //            result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //            result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //            //result = new ISOLatin1AccentFilter(result);
        //            result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result, new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)));
        //            result = new EdgeNGramTokenFilter(result, Side.FRONT,1, 20);
        //            
        //            return result;
        //          }
    });
    if (createNewIndex) {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    } else {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }
    indexWriterConfig.setMaxBufferedDocs(150);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, indexWriterConfig);
    //        writer.setMergeFactor(300);

    // go through every word, storing the original word (incl. n-grams)
    // and the number of times it occurs
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();

    Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();

        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }

        if (wordsMap.containsKey(word)) {
            throw new IllegalStateException("This should never happen in Lucene 2.3.2");
            // wordsMap.put(word, wordsMap.get(word) + 1);
        } else {
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
        }
    }

    for (String word : wordsMap.keySet()) {
        // ok index the word
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED)); // grammed
        doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO,
                Field.Index.NOT_ANALYZED)); // count

        writer.addDocument(doc);
    }

    sourceReader.close();

    // close writer
    writer.forceMerge(1);
    writer.close();

    // re-open our reader
    reOpenReader();
}

From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java

License:Apache License

/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {

    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }

    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);

    log.info("# of documents in the index {}", reader.numDocs());

    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }

    int numDocs = reader.numDocs();

    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

    log.info("Populating term infos from the index");

    /**
     * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
     * 
     * Since we have deleted the documents out of the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
     * frequencies in each document. The number of results of this call will be the in-cluster document
     * frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions

    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // count the document when there are no deletions (liveDocs == null) or it is live
            if (liveDocs == null || liveDocs.get(docID)) {
                termBitset.set(docID);
            }
        }
        // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();

        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);

    }

    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();

    int clusterSize = wpvws.size();

    for (TermEntry termEntry : termEntryMap.values()) {

        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF,
                logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }

    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();

    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}

From source file:org.apache.solr.handler.admin.LukeRequestHandler.java

License:Apache License

private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRef spare = new CharsRef();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
                    final TermsEnum termsEnum = v.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        UnicodeUtil.UTF8toUTF16(text, spare);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}

From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java

License:Open Source License

private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this
        // can
        // be 0
        // for
        // non-indexed
        // fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}

From source file:org.apache.solr.handler.component.TermVectorComponent.java

License:Apache License

private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID,
        TermsEnum termsEnum, String field) throws IOException {
    NamedList<Object> fieldNL = new NamedList<Object>();
    docNL.add(field, fieldNL);

    BytesRef text;
    DocsAndPositionsEnum dpEnum = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        NamedList<Object> termInfo = new NamedList<Object>();
        fieldNL.add(term, termInfo);
        final int freq = (int) termsEnum.totalTermFreq();
        if (fieldOptions.termFreq == true) {
            termInfo.add("tf", freq);
        }

        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        boolean useOffsets = false;
        boolean usePositions = false;
        if (dpEnum != null) {
            dpEnum.nextDoc();
            usePositions = fieldOptions.positions;
            useOffsets = fieldOptions.offsets;
        }

        NamedList<Integer> positionsNL = null;
        NamedList<Number> theOffsets = null;

        if (usePositions || useOffsets) {
            for (int i = 0; i < freq; i++) {
                final int pos = dpEnum.nextPosition();
                if (usePositions && pos >= 0) {
                    if (positionsNL == null) {
                        positionsNL = new NamedList<Integer>();
                        termInfo.add("positions", positionsNL);
                    }
                    positionsNL.add("position", pos);
                }

                if (useOffsets && theOffsets == null) {
                    if (dpEnum.startOffset() == -1) {
                        useOffsets = false;
                    } else {
                        theOffsets = new NamedList<Number>();
                        termInfo.add("offsets", theOffsets);
                    }
                }

                if (theOffsets != null) {
                    theOffsets.add("start", dpEnum.startOffset());
                    theOffsets.add("end", dpEnum.endOffset());
                }
            }
        }

        int df = 0;
        if (fieldOptions.docFreq || fieldOptions.tfIdf) {
            df = reader.docFreq(new Term(field, text));
        }

        if (fieldOptions.docFreq) {
            termInfo.add("df", df);
        }

        // TODO: this is not TF/IDF by anyone's definition!
        if (fieldOptions.tfIdf) {
            double tfIdfVal = ((double) freq) / df;
            termInfo.add("tf-idf", tfIdfVal);
        }
    }
}
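
The TODO in this example points out that freq / df is not a conventional tf-idf. For contrast, a hedged sketch (not Solr code) of a textbook weighting built from the same term frequency, document frequency, and the reader's document count:

// Hedged sketch, not part of Solr: a classic tf-idf from the statistics used above.
static double classicTfIdf(int freq, int df, int numDocs) {
    double idf = df == 0 ? 0.0 : Math.log10((double) numDocs / df);
    return freq * idf;
}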

From source file:org.apache.solr.handler.SpellCheckerRequestHandler.java

License:Apache License

/**
 * Processes the following query string parameters: q, extendedResults, cmd rebuild,
 * cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    SolrParams p = req.getParams();
    String words = p.get("q");
    String cmd = p.get("cmd");
    if (cmd != null) {
        cmd = cmd.trim();
        if (cmd.equals("rebuild")) {
            rebuild(req);
            rsp.add("cmdExecuted", "rebuild");
        } else if (cmd.equals("reopen")) {
            reopen();
            rsp.add("cmdExecuted", "reopen");
        } else {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
        }
    }

    // empty query string
    if (null == words || "".equals(words.trim())) {
        return;
    }

    IndexReader indexReader = null;
    String suggestionField = null;
    Float accuracy;
    int numSug;
    boolean onlyMorePopular;
    boolean extendedResults;
    try {
        accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
        spellChecker.setAccuracy(accuracy);
    } catch (NumberFormatException e) {
        throw new RuntimeException("Accuracy must be a valid positive float", e);
    }
    try {
        numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
    } catch (NumberFormatException e) {
        throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
    }
    try {
        onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
    } catch (SolrException e) {
        throw new RuntimeException("'Only more popular' must be a valid boolean", e);
    }
    try {
        extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
    } catch (SolrException e) {
        throw new RuntimeException("'Extended results' must be a valid boolean", e);
    }

    // when searching for more popular, a non null index-reader and
    // restricted-field are required
    if (onlyMorePopular || extendedResults) {
        indexReader = req.getSearcher().getReader();
        suggestionField = termSourceField;
    }

    if (extendedResults) {

        rsp.add("numDocs", indexReader.numDocs());

        SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
        String[] wordz = words.split(" ");
        for (String word : wordz) {
            SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
            nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
            String[] suggestions = spellChecker.suggestSimilar(word, numSug, indexReader, suggestionField,
                    onlyMorePopular);

            // suggestion array
            NamedList<Object> sa = new NamedList<Object>();
            for (int i = 0; i < suggestions.length; i++) {
                // suggestion item
                SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
                si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
                sa.add(suggestions[i], si);
            }
            nl.add("suggestions", sa);
            results.add(word, nl);
        }
        rsp.add("result", results);

    } else {
        rsp.add("words", words);
        if (spellChecker.exist(words)) {
            rsp.add("exist", "true");
        } else {
            rsp.add("exist", "false");
        }
        String[] suggestions = spellChecker.suggestSimilar(words, numSug, indexReader, suggestionField,
                onlyMorePopular);

        rsp.add("suggestions", Arrays.asList(suggestions));
    }
}

From source file:org.apache.solr.spelling.AbstractLuceneSpellChecker.java

License:Apache License

@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    SpellingResult result = new SpellingResult(options.tokens);
    IndexReader reader = determineReader(options.reader);
    Term term = field != null ? new Term(field, "") : null;
    float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;

    int count = Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
    for (Token token : options.tokens) {
        String tokenText = new String(token.buffer(), 0, token.length());
        term = new Term(field, tokenText);
        int docFreq = 0;
        if (reader != null) {
            docFreq = reader.docFreq(term);
        }
        String[] suggestions = spellChecker.suggestSimilar(tokenText,
                ((options.alternativeTermCount == null || docFreq == 0) ? count : options.alternativeTermCount),
                field != null ? reader : null, // workaround LUCENE-1295
                field, options.suggestMode, theAccuracy);
        if (suggestions.length == 1 && suggestions[0].equals(tokenText)
                && options.alternativeTermCount == null) {
            // These are spelled the same, continue on
            continue;
        }
        // If considering alternatives to "correctly-spelled" terms, then add the
        // original as a viable suggestion.
        if (options.alternativeTermCount != null && docFreq > 0) {
            boolean foundOriginal = false;
            String[] suggestionsWithOrig = new String[suggestions.length + 1];
            for (int i = 0; i < suggestions.length; i++) {
                if (suggestions[i].equals(tokenText)) {
                    foundOriginal = true;
                    break;
                }
                suggestionsWithOrig[i + 1] = suggestions[i];
            }
            if (!foundOriginal) {
                suggestionsWithOrig[0] = tokenText;
                suggestions = suggestionsWithOrig;
            }
        }

        if (options.extendedResults == true && reader != null && field != null) {
            result.addFrequency(token, docFreq);
            int countLimit = Math.min(options.count, suggestions.length);
            if (countLimit > 0) {
                for (int i = 0; i < countLimit; i++) {
                    term = new Term(field, suggestions[i]);
                    result.add(token, suggestions[i], reader.docFreq(term));
                }
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        } else {
            if (suggestions.length > 0) {
                List<String> suggList = Arrays.asList(suggestions);
                if (suggestions.length > options.count) {
                    suggList = suggList.subList(0, options.count);
                }
                result.add(token, suggList);
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        }
    }
    return result;
}