org.apache.solr.handler.component.WordCloudComponent.java Source code

Introduction

Here is the source code for org.apache.solr.handler.component.WordCloudComponent.java
Source

package org.apache.solr.handler.component;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.transform.DocTransformer;
import org.apache.solr.response.transform.TransformContext;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class WordCloudComponent extends SearchComponent {

    public static final String COMPONENT_NAME = "wordcloud";

    @Override
    public void prepare(ResponseBuilder rb) throws IOException {
        SolrQueryRequest req = rb.req;
        SolrParams params = req.getParams();
        if (!params.getBool(COMPONENT_NAME, true)) {
            return;
        }

        Query query = rb.getQuery();
        if (query == null)
            return;

    }

    @Override
    public void process(ResponseBuilder rb) throws IOException {
        SolrQueryRequest req = rb.req;
        SolrParams params = req.getParams();
        if (!params.getBool(COMPONENT_NAME, true)) {
            return;
        }

        String wcFields = null;
        if ((wcFields = params.get("wordcloud.fl", null)) == null) {
            return;
        }

        Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
        DocList ids = rb.getResults().docList;

        SolrIndexSearcher searcher = rb.req.getSearcher();
        IndexSchema schema = rb.req.getCore().getLatestSchema();

        final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
        final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();

        CharTermAttribute termAtt;
        Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

        for (String f : flds) {
            SchemaField field = schema.getFieldOrNull(f);
            if (field == null || !field.stored()) {
                continue; // ignore this field
            }
            fieldsToLoad.put(f, field.getType());
            tokens.put(f, new HashMap<String, Integer>());
        }

        DocIterator iterator = ids.iterator();
        String w;
        Integer v;
        int sz = ids.size();
        for (int i = 0; i < sz; i++) {
            int id = iterator.nextDoc();
            Document doc = searcher.doc(id, fieldsToLoad.keySet());
            for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
                Map<String, Integer> toks = tokens.get(en.getKey());
                String[] vals = doc.getValues(en.getKey());
                FieldType fType = en.getValue();

                if (vals != null) {
                    for (String s : vals) {
                        TokenStream buffer = analyzer.tokenStream(en.getKey(),
                                new StringReader(fType.indexedToReadable(s)));

                        if (!buffer.hasAttribute(CharTermAttribute.class)) {
                            continue; // empty stream
                        }

                        termAtt = buffer.getAttribute(CharTermAttribute.class);
                        buffer.reset();

                        while (buffer.incrementToken()) {
                            w = termAtt.toString();
                            v = toks.get(w);
                            if (v == null)
                                v = 0;
                            toks.put(w, ++v);
                        }

                        buffer.close();
                    }
                }
            }
        }

        // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)

        AtomicReader reader = searcher.getAtomicReader();
        BytesRef term;
        int df;
        String f;

        Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
        for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
            HashMap<String, Double> idfs = new HashMap<String, Double>();
            f = field.getKey();
            docFreqs.put(f, idfs);
            int N = reader.getDocCount(f);

            for (Entry<String, Integer> token : field.getValue().entrySet()) {
                w = token.getKey();
                df = reader.docFreq(new Term(f, new BytesRef(w)));
                if (df != 0) {
                    idfs.put(w, Math.log10(N / df));
                }
            }
        }

        HashMap<String, Object> ret = new HashMap<String, Object>();
        for (String fi : fieldsToLoad.keySet()) {
            HashMap<String, Object> va = new HashMap<String, Object>();
            va.put("tf", tokens.get(fi));
            va.put("idf", docFreqs.get(fi));
            ret.put(fi, va);
        }
        rb.rsp.add("wordcloud", ret);

    }

    @Override
    public String getDescription() {
        return "return tokens with TF and IDF for wordcloud";
    }

    @Override
    public String getSource() {
        return null;
    }

}