List of usage examples for org.apache.lucene.index.IndexReader.docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter "term": the term whose document frequency, i.e. the number of documents containing it, is returned.
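Before the full examples below, here is a minimal, self-contained sketch of calling docFreq directly. It is not taken from any of the source files listed here; it assumes a Lucene 4.x-style API (FSDirectory.open(File), DirectoryReader.open), and the index path, field name, and term text are placeholders.

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Path and field name are placeholders for an existing index.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Number of documents containing the term "lucene" in field "contents".
            // Note: deleted documents that have not yet been merged away are still counted.
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq(contents:lucene) = " + df);
        } finally {
            reader.close();
        }
    }
}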
From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java
public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    // System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //         + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");
    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        // System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2
                || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                .getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //***************************
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost(); // Term frequency
            double ln_tf = Math.log(1 + tf); // Get log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF)); // Inverse document frequency
            }
            double tfidf = ln_tf * idf; // Compute the TFIDF
            int tLength = term.getTerm().text().length(); // Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize(); // Query size
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll); // Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq)); // Inverse Collection Term Frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs; // QueryScope
            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        // System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
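The docFreq-related core of the example above, reduced to a small sketch: a log10 inverse document frequency computed from docFreq and getDocCount. The class and method names here are illustrative only, and the reader and term are supplied by the caller.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

public class IdfFromDocFreq {
    /**
     * Computes a log10-based IDF in the same way as the example above:
     * idf = log10(docCount(field) / docFreq(term)), or 0 if the term does not occur.
     */
    static double idf(IndexReader reader, Term term) throws IOException {
        int df = reader.docFreq(term);               // number of documents containing the term
        int docs = reader.getDocCount(term.field()); // number of documents with a value for the field
        return df == 0 ? 0 : Math.log10((double) docs / df);
    }
}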
From source file:nmsu.cs.TFIDFVector.java
License:Open Source License
/**
 * read in text tfidfvector to docVectorMap.
 * @param docVectorMap
 * @param ir
 * @param datasetID
 * @param indexID
 * @param numDocs
 * @return
 */
public static TFIDFVector getFullTextTFIDFVector(Map<Integer, TFIDFVector> docVectorMap, IndexReader ir,
        int datasetID, int indexID, int numDocs) {
    TFIDFVector vector = null;
    try {
        if ((vector = docVectorMap.get(datasetID)) == null) {
            TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, "fulltext");
            int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
            String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();
            int[] df = new int[tf.length];
            for (int j = 0; j < terms.length; j++)
                df[j] = ir.docFreq(new Term(terms[j]));
            vector = new TFIDFVector(terms, tf, df, numDocs);
            // docVectorMap.put(datasetID, vector);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return vector;
}
From source file:nmsu.cs.TFIDFVector.java
License:Open Source License
/**
 * read in aspect tfidfvector to docVectorMap.
 * @param docAspectVectorMap
 * @param ir
 * @param datasetID
 * @param indexID
 * @param aspectID
 * @param numDocs
 * @return
 */
public static TFIDFVector getAspectTFIDFVector(Map<Integer, Map<Integer, TFIDFVector>> docAspectVectorMap,
        IndexReader ir, int datasetID, int indexID, int aspectID, int numDocs) {
    TFIDFVector vector = null;
    try {
        Map<Integer, TFIDFVector> aspectVectorMap = docAspectVectorMap.get(datasetID);
        if (aspectVectorMap == null) {
            aspectVectorMap = new HashMap<Integer, TFIDFVector>();
            docAspectVectorMap.put(datasetID, aspectVectorMap);
            TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, rawdata.id2Aspect.get(aspectID));
            int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
            String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();
            int[] df = new int[tf.length];
            for (int j = 0; j < terms.length; j++)
                df[j] = ir.docFreq(new Term(terms[j]));
            vector = new TFIDFVector(terms, tf, df, numDocs);
            // aspectVectorMap.put(aspectID, vector);
        } else if ((vector = aspectVectorMap.get(aspectID)) == null) {
            TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, rawdata.id2Aspect.get(aspectID));
            int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
            String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();
            int[] df = new int[tf.length];
            for (int j = 0; j < terms.length; j++)
                df[j] = ir.docFreq(new Term(terms[j]));
            vector = new TFIDFVector(terms, tf, df, numDocs);
            // aspectVectorMap.put(aspectID, vector);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return vector;
}
From source file:org.ala.lucene.Autocompleter.java
License:Open Source License
@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete, boolean createNewIndex)
        throws CorruptIndexException, IOException {
    // build a dictionary (from the spell package)
    IndexReader sourceReader = IndexReader.open(sourceDirectory);

    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);

    // code from
    // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
    // Dictionary)
    IndexWriter.unlock(autoCompleteDirectory);

    // use a custom analyzer so we can do EdgeNGramFiltering
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, new Analyzer() {
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final StandardTokenizer src = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);

            TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result,
                    new CharArraySet(SolrUtils.BIE_LUCENE_VERSION,
                            new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)), true));
            result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);

            return new TokenStreamComponents(src, result) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    super.setReader(reader);
                }
            };
        }

        // public TokenStream tokenStream(String fieldName, Reader reader) {
        //     TokenStream result = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
        //
        //     result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //     result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
        //     //result = new ISOLatin1AccentFilter(result);
        //     result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result, new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)));
        //     result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
        //
        //     return result;
        // }
    });

    if (createNewIndex) {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    } else {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }
    indexWriterConfig.setMaxBufferedDocs(150);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, indexWriterConfig);
    // writer.setMergeFactor(300);

    // go through every word, storing the original word (incl. n-grams)
    // and the number of times it occurs
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();

    Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();

        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }

        if (wordsMap.containsKey(word)) {
            throw new IllegalStateException("This should never happen in Lucene 2.3.2");
            // wordsMap.put(word, wordsMap.get(word) + 1);
        } else {
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
        }
    }

    for (String word : wordsMap.keySet()) {
        // ok index the word
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED)); // grammed
        doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO,
                Field.Index.NOT_ANALYZED)); // count
        writer.addDocument(doc);
    }

    sourceReader.close();

    // close writer
    writer.forceMerge(1);
    writer.close();

    // re-open our reader
    reOpenReader();
}
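The docFreq-specific step of the autocomplete example above, isolated as a small sketch: each candidate word is weighted by the number of source documents that contain it. Class and method names are illustrative only; the reader, field, and word list are supplied by the caller.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

class WordPopularity {
    // Weight each candidate word by how many documents in the source index contain it.
    static Map<String, Integer> weightByDocFreq(IndexReader sourceReader, String field,
            Iterable<String> words) throws IOException {
        Map<String, Integer> wordsMap = new HashMap<String, Integer>();
        for (String word : words) {
            wordsMap.put(word, sourceReader.docFreq(new Term(field, word)));
        }
        return wordsMap;
    }
}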
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {

    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }

    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);

    log.info("# of documents in the index {}", reader.numDocs());

    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }

    int numDocs = reader.numDocs();

    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

    log.info("Populating term infos from the index");

    /**
     * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
     *
     * Since we have deleted the documents out of the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
     * frequencies in each document. The number of results of this call will be the in-cluster document
     * frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions

    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            //check to see if we don't have an deletions (null) or if document is live
            if (liveDocs != null && !liveDocs.get(docID)) {
                // document is deleted...
                termBitset.set(docsEnum.docID());
            }
        }
        // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();

        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);
    }

    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();

    int clusterSize = wpvws.size();

    for (TermEntry termEntry : termEntryMap.values()) {
        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF,
                logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }

    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();

    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRef spare = new CharsRef();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
                    final TermsEnum termsEnum = v.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        UnicodeUtil.UTF8toUTF16(text, spare);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.TermVectorComponent.java
License:Apache License
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID,
        TermsEnum termsEnum, String field) throws IOException {
    NamedList<Object> fieldNL = new NamedList<Object>();
    docNL.add(field, fieldNL);

    BytesRef text;
    DocsAndPositionsEnum dpEnum = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        NamedList<Object> termInfo = new NamedList<Object>();
        fieldNL.add(term, termInfo);
        final int freq = (int) termsEnum.totalTermFreq();
        if (fieldOptions.termFreq == true) {
            termInfo.add("tf", freq);
        }

        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        boolean useOffsets = false;
        boolean usePositions = false;
        if (dpEnum != null) {
            dpEnum.nextDoc();
            usePositions = fieldOptions.positions;
            useOffsets = fieldOptions.offsets;
        }

        NamedList<Integer> positionsNL = null;
        NamedList<Number> theOffsets = null;

        if (usePositions || useOffsets) {
            for (int i = 0; i < freq; i++) {
                final int pos = dpEnum.nextPosition();
                if (usePositions && pos >= 0) {
                    if (positionsNL == null) {
                        positionsNL = new NamedList<Integer>();
                        termInfo.add("positions", positionsNL);
                    }
                    positionsNL.add("position", pos);
                }

                if (useOffsets && theOffsets == null) {
                    if (dpEnum.startOffset() == -1) {
                        useOffsets = false;
                    } else {
                        theOffsets = new NamedList<Number>();
                        termInfo.add("offsets", theOffsets);
                    }
                }

                if (theOffsets != null) {
                    theOffsets.add("start", dpEnum.startOffset());
                    theOffsets.add("end", dpEnum.endOffset());
                }
            }
        }

        int df = 0;
        if (fieldOptions.docFreq || fieldOptions.tfIdf) {
            df = reader.docFreq(new Term(field, text));
        }

        if (fieldOptions.docFreq) {
            termInfo.add("df", df);
        }

        // TODO: this is not TF/IDF by anyone's definition!
        if (fieldOptions.tfIdf) {
            double tfIdfVal = ((double) freq) / df;
            termInfo.add("tf-idf", tfIdfVal);
        }
    }
}
From source file:org.apache.solr.handler.SpellCheckerRequestHandler.java
License:Apache License
/**
 * Processes the following query string parameters: q, extendedResults, cmd rebuild,
 * cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    SolrParams p = req.getParams();
    String words = p.get("q");
    String cmd = p.get("cmd");
    if (cmd != null) {
        cmd = cmd.trim();
        if (cmd.equals("rebuild")) {
            rebuild(req);
            rsp.add("cmdExecuted", "rebuild");
        } else if (cmd.equals("reopen")) {
            reopen();
            rsp.add("cmdExecuted", "reopen");
        } else {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
        }
    }

    // empty query string
    if (null == words || "".equals(words.trim())) {
        return;
    }

    IndexReader indexReader = null;
    String suggestionField = null;
    Float accuracy;
    int numSug;
    boolean onlyMorePopular;
    boolean extendedResults;
    try {
        accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
        spellChecker.setAccuracy(accuracy);
    } catch (NumberFormatException e) {
        throw new RuntimeException("Accuracy must be a valid positive float", e);
    }
    try {
        numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
    } catch (NumberFormatException e) {
        throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
    }
    try {
        onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
    } catch (SolrException e) {
        throw new RuntimeException("'Only more popular' must be a valid boolean", e);
    }
    try {
        extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
    } catch (SolrException e) {
        throw new RuntimeException("'Extended results' must be a valid boolean", e);
    }

    // when searching for more popular, a non null index-reader and
    // restricted-field are required
    if (onlyMorePopular || extendedResults) {
        indexReader = req.getSearcher().getReader();
        suggestionField = termSourceField;
    }

    if (extendedResults) {

        rsp.add("numDocs", indexReader.numDocs());

        SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
        String[] wordz = words.split(" ");
        for (String word : wordz) {
            SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
            nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
            String[] suggestions = spellChecker.suggestSimilar(word, numSug, indexReader, suggestionField,
                    onlyMorePopular);

            // suggestion array
            NamedList<Object> sa = new NamedList<Object>();
            for (int i = 0; i < suggestions.length; i++) {
                // suggestion item
                SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
                si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
                sa.add(suggestions[i], si);
            }
            nl.add("suggestions", sa);
            results.add(word, nl);
        }
        rsp.add("result", results);

    } else {
        rsp.add("words", words);
        if (spellChecker.exist(words)) {
            rsp.add("exist", "true");
        } else {
            rsp.add("exist", "false");
        }
        String[] suggestions = spellChecker.suggestSimilar(words, numSug, indexReader, suggestionField,
                onlyMorePopular);

        rsp.add("suggestions", Arrays.asList(suggestions));
    }
}
From source file:org.apache.solr.spelling.AbstractLuceneSpellChecker.java
License:Apache License
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    SpellingResult result = new SpellingResult(options.tokens);
    IndexReader reader = determineReader(options.reader);
    Term term = field != null ? new Term(field, "") : null;
    float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;

    int count = Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
    for (Token token : options.tokens) {
        String tokenText = new String(token.buffer(), 0, token.length());
        term = new Term(field, tokenText);
        int docFreq = 0;
        if (reader != null) {
            docFreq = reader.docFreq(term);
        }
        String[] suggestions = spellChecker.suggestSimilar(tokenText,
                ((options.alternativeTermCount == null || docFreq == 0) ? count : options.alternativeTermCount),
                field != null ? reader : null, // workaround LUCENE-1295
                field, options.suggestMode, theAccuracy);
        if (suggestions.length == 1 && suggestions[0].equals(tokenText)
                && options.alternativeTermCount == null) {
            // These are spelled the same, continue on
            continue;
        }

        // If considering alternatives to "correctly-spelled" terms, then add the
        // original as a viable suggestion.
        if (options.alternativeTermCount != null && docFreq > 0) {
            boolean foundOriginal = false;
            String[] suggestionsWithOrig = new String[suggestions.length + 1];
            for (int i = 0; i < suggestions.length; i++) {
                if (suggestions[i].equals(tokenText)) {
                    foundOriginal = true;
                    break;
                }
                suggestionsWithOrig[i + 1] = suggestions[i];
            }
            if (!foundOriginal) {
                suggestionsWithOrig[0] = tokenText;
                suggestions = suggestionsWithOrig;
            }
        }

        if (options.extendedResults == true && reader != null && field != null) {
            result.addFrequency(token, docFreq);
            int countLimit = Math.min(options.count, suggestions.length);
            if (countLimit > 0) {
                for (int i = 0; i < countLimit; i++) {
                    term = new Term(field, suggestions[i]);
                    result.add(token, suggestions[i], reader.docFreq(term));
                }
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        } else {
            if (suggestions.length > 0) {
                List<String> suggList = Arrays.asList(suggestions);
                if (suggestions.length > options.count) {
                    suggList = suggList.subList(0, options.count);
                }
                result.add(token, suggList);
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        }
    }
    return result;
}