Example usage for org.apache.lucene.search IndexSearcher collectionStatistics

Introduction

On this page you can find example usage for org.apache.lucene.search IndexSearcher collectionStatistics.

Prototype

public CollectionStatistics collectionStatistics(String field) throws IOException 

Document

Returns CollectionStatistics for a field, or null if the field does not exist (has no indexed terms). This can be overridden, for example, to return a field's statistics across a distributed collection.
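
Before the full examples below, here is a minimal, self-contained sketch of the call. The index path and field name are placeholders, and the null check follows the javadoc above, which describes recent Lucene releases (8.x and later); older releases signal a missing field with -1 placeholder values instead of null.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

public class CollectionStatisticsSketch {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" and "body" are placeholders for a real index and field.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            CollectionStatistics stats = searcher.collectionStatistics("body");
            if (stats != null) { // null when the field has no indexed terms
                System.out.println("maxDoc:           " + stats.maxDoc());
                System.out.println("docCount:         " + stats.docCount());
                System.out.println("sumDocFreq:       " + stats.sumDocFreq());
                System.out.println("sumTotalTermFreq: " + stats.sumTotalTermFreq());
            }
        }
    }
}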

Usage

From source file: com.xiaomi.linden.lucene.query.flexiblequery.FlexibleWeight.java

License: Apache License

public FlexibleWeight(FlexibleQuery query, IndexSearcher searcher) throws IOException {
    this.query = query;
    this.similarity = searcher.getSimilarity();
    final IndexReaderContext context = searcher.getTopReaderContext();

    int[] maxDocFreqs = null;
    long[] maxTotalTermFreqs = null;
    Map<Term, TermContext> builtTermMap = new HashMap<>();
    if (query.enableGlobalIDF()) {
        FlexibleQuery.FlexibleTerm[][] globalTerms = query.getGlobalTerms();
        TermContext[][] globalStates = new TermContext[globalTerms.length][];
        for (int i = 0; i < globalTerms.length; ++i) {
            globalStates[i] = new TermContext[globalTerms[i].length];
            for (int j = 0; j < globalTerms[i].length; ++j) {
                Term term = globalTerms[i][j].term;
                TermContext termContext = builtTermMap.get(term);
                if (termContext != null) {
                    globalStates[i][j] = termContext;
                } else {
                    globalStates[i][j] = TermContext.build(context, globalTerms[i][j].term);
                    builtTermMap.put(term, globalStates[i][j]);
                }
            }
        }
        maxDocFreqs = new int[globalTerms[0].length];
        maxTotalTermFreqs = new long[globalTerms[0].length];
        int fieldLength = globalTerms.length;
        int termLength = globalTerms[0].length;
        for (int i = 0; i < termLength; ++i) {
            int maxDocFreq = 0;
            long maxTotalTermFreq = 0;
            for (int j = 0; j < fieldLength; ++j) {
                maxDocFreq = Math.max(globalStates[j][i].docFreq(), maxDocFreq);
                maxTotalTermFreq = Math.max(globalStates[j][i].totalTermFreq(), maxTotalTermFreq);
            }
            maxDocFreqs[i] = maxDocFreq;
            maxTotalTermFreqs[i] = maxTotalTermFreq;
        }
    }

    FlexibleQuery.FlexibleTerm[][] terms = query.getTerms();
    TermContext[][] states = new TermContext[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        states[i] = new TermContext[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            Term term = terms[i][j].term;
            TermContext termContext = builtTermMap.get(term);
            if (termContext != null) {
                states[i][j] = termContext;
            } else {
                states[i][j] = TermContext.build(context, terms[i][j].term);
                builtTermMap.put(term, states[i][j]);
            }
        }
    }
    termStatsMatrix = new TermStats[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        termStatsMatrix[i] = new TermStats[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            FlexibleQuery.FlexibleTerm term = terms[i][j];
            TermContext state = states[i][j];
            TermStatistics termStats;
            if (query.enableGlobalIDF()) {
                termStats = new TermStatistics(term.term.bytes(), maxDocFreqs[j], maxTotalTermFreqs[j]);
            } else {
                termStats = searcher.termStatistics(term.term, state);
            }
            Similarity.SimWeight stats = similarity.computeWeight(term.boost,
                    searcher.collectionStatistics(term.term.field()), termStats);
            TermStats termStatsInfo = new TermStats();
            termStatsInfo.stats = stats;
            termStatsInfo.term = term.term;
            termStatsInfo.termContext = state;
            termStatsMatrix[i][j] = termStatsInfo;
        }
    }
}
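
Stripped of the global-IDF bookkeeping and the TermContext cache, the per-term work above reduces to four calls. A hedged sketch against the same Lucene 4.x/5.x-era API used in this example (the helper name is hypothetical):

import java.io.IOException;

import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;

public class SingleTermWeightSketch {
    // Hypothetical helper, not part of FlexibleWeight: computes a SimWeight for
    // one term the same way the constructor above does for each matrix entry.
    public static Similarity.SimWeight weighTerm(IndexSearcher searcher, Term term, float boost)
            throws IOException {
        IndexReaderContext context = searcher.getTopReaderContext();
        TermContext state = TermContext.build(context, term);            // per-segment postings state
        TermStatistics termStats = searcher.termStatistics(term, state); // docFreq / totalTermFreq
        CollectionStatistics fieldStats = searcher.collectionStatistics(term.field());
        return searcher.getSimilarity().computeWeight(boost, fieldStats, termStats);
    }
}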

From source file: io.anserini.index.UpdateIndex.java

License: Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(UpdateIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);

    LOG.info("index: " + indexPath);

    File file = new File("PittsburghUserTimeline");
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    Status status;
    String s;
    HashMap<Long, String> hm = new HashMap<Long, String>();
    try {
        while ((s = stream.nextRaw()) != null) {
            try {
                status = DataObjectFactory.createStatus(s);

                if (status.getText() == null) {
                    continue;
                }

                // getOrDefault avoids concatenating "null" on a user's first status
                hm.put(status.getUser().getId(), hm.getOrDefault(status.getUser().getId(), "")
                        + status.getText().replaceAll("[\\r\\n]+", " "));

            } catch (Exception e) {
                // skip statuses that fail to parse
            }
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {

        stream.close();
    }

    ArrayList<String> userIDList = new ArrayList<String>();
    try (BufferedReader br = new BufferedReader(new FileReader(new File("userID")))) {
        String line;
        while ((line = br.readLine()) != null) {
            userIDList.add(line.replaceAll("[\\r\\n]+", ""));

            // process the line.
        }
    }

    try {
        reader = DirectoryReader
                .open(FSDirectory.open(new File(cmdline.getOptionValue(INDEX_OPTION)).toPath()));
    } catch (IOException e) {
        // if the index cannot be opened, report it; reader stays null
        e.printStackTrace();
    }

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);

    IndexSearcher searcher = new IndexSearcher(reader);
    System.out.println("The total number of docs indexed "
            + searcher.collectionStatistics(TweetStreamReader.StatusField.TEXT.name).docCount());

    for (int city = 0; city < cityName.length; city++) {

        // Pittsburgh's coordinate -79.976389, 40.439722

        Query q_long = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LONGITUDE.name,
                new Double(longitude[city] - 0.05), new Double(longitude[city] + 0.05), true, true);
        Query q_lat = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LATITUDE.name,
                new Double(latitude[city] - 0.05), new Double(latitude[city] + 0.05), true, true);

        BooleanQuery bqCityName = new BooleanQuery();

        Term t = new Term("place", cityName[city]);
        TermQuery query = new TermQuery(t);
        bqCityName.add(query, BooleanClause.Occur.SHOULD);
        System.out.println(query.toString());

        for (int i = 0; i < cityNameAlias[city].length; i++) {
            t = new Term("place", cityNameAlias[city][i]);
            query = new TermQuery(t);
            bqCityName.add(query, BooleanClause.Occur.SHOULD);
            System.out.println(query.toString());
        }

        BooleanQuery bq = new BooleanQuery();

        BooleanQuery finalQuery = new BooleanQuery();

        // either a coordinate match
        bq.add(q_long, BooleanClause.Occur.MUST);
        bq.add(q_lat, BooleanClause.Occur.MUST);

        finalQuery.add(bq, BooleanClause.Occur.SHOULD);
        // or a place city name match
        finalQuery.add(bqCityName, BooleanClause.Occur.SHOULD);

        TotalHitCountCollector totalHitCollector = new TotalHitCountCollector();

        // Query hasFieldQuery = new ConstantScoreQuery(new
        // FieldValueFilter("timeline"));
        //
        // searcher.search(hasFieldQuery, totalHitCollector);
        //
        // if (totalHitCollector.getTotalHits() > 0) {
        // TopScoreDocCollector collector =
        // TopScoreDocCollector.create(Math.max(0,
        // totalHitCollector.getTotalHits()));
        // searcher.search(finalQuery, collector);
        // ScoreDoc[] hits = collector.topDocs().scoreDocs;
        //
        //
        // HashMap<String, Integer> hasHit = new HashMap<String, Integer>();
        // int dupcount = 0;
        // for (int i = 0; i < hits.length; ++i) {
        // int docId = hits[i].doc;
        // Document d;
        //
        // d = searcher.doc(docId);
        //
        // System.out.println(d.getFields());
        // }
        // }

        // totalHitCollector = new TotalHitCountCollector();
        searcher.search(finalQuery, totalHitCollector);

        if (totalHitCollector.getTotalHits() > 0) {
            TopScoreDocCollector collector = TopScoreDocCollector
                    .create(Math.max(0, totalHitCollector.getTotalHits()));
            searcher.search(finalQuery, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits.");

            HashMap<String, Integer> hasHit = new HashMap<String, Integer>();
            int dupcount = 0;
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d;

                d = searcher.doc(docId);

                if (userIDList.contains(d.get(IndexTweets.StatusField.USER_ID.name))
                        && hm.containsKey(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name)))) {
                    //            System.out.println("Has timeline field?" + (d.get("timeline") != null));
                    //            System.out.println(reader.getDocCount("timeline"));
                    //            d.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))),
                    //                textOptions));
                    System.out.println("Found a user hit");
                    BytesRefBuilder brb = new BytesRefBuilder();
                    NumericUtils.longToPrefixCodedBytes(Long.parseLong(d.get(IndexTweets.StatusField.ID.name)),
                            0, brb);
                    Term term = new Term(IndexTweets.StatusField.ID.name, brb.get());
                    //            System.out.println(reader.getDocCount("timeline"));

                    Document d_new = new Document();
                    //            for (IndexableField field : d.getFields()) {
                    //              d_new.add(field);
                    //            }
                    // System.out.println(d_new.getFields());
                    d_new.add(new StringField("userBackground", d.get(IndexTweets.StatusField.USER_ID.name),
                            Store.YES));
                    d_new.add(new Field("timeline",
                            hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), textOptions));
                    // System.out.println(d_new.get());
                    writer.addDocument(d_new);
                    writer.commit();

                    //            t = new Term("label", "why");
                    //            TermQuery tqnew = new TermQuery(t);
                    //
                    //            totalHitCollector = new TotalHitCountCollector();
                    //
                    //            searcher.search(tqnew, totalHitCollector);
                    //
                    //            if (totalHitCollector.getTotalHits() > 0) {
                    //              collector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
                    //              searcher.search(tqnew, collector);
                    //              hits = collector.topDocs().scoreDocs;
                    //
                    //              System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits.");
                    //
                    //              for (int k = 0; k < hits.length; k++) {
                    //                docId = hits[k].doc;
                    //                d = searcher.doc(docId);
                    //                System.out.println(d.get(IndexTweets.StatusField.ID.name));
                    //                System.out.println(d.get(IndexTweets.StatusField.PLACE.name));
                    //              }
                    //            }

                    // writer.deleteDocuments(term);
                    // writer.commit();
                    // writer.addDocument(d);
                    // writer.commit();

                    //            System.out.println(reader.getDocCount("timeline"));
                    // writer.updateDocument(term, d);
                    // writer.commit();

                }

            }
        }
    }
    reader.close();
    writer.close();

}
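
Note that the docCount() printed above counts documents having at least one indexed term in the TEXT field, which equals the index-wide document count only when every document carries that field. A small sketch contrasting the two (the field name is a placeholder; recent Lucene returns null for a missing field, older releases return -1 placeholders instead):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;

public class FieldDocCountSketch {
    public static void printCounts(IndexReader reader, String field) throws IOException {
        IndexSearcher searcher = new IndexSearcher(reader);
        CollectionStatistics stats = searcher.collectionStatistics(field);
        long docsWithField = (stats == null) ? 0 : stats.docCount(); // per-field count
        System.out.println("docs with '" + field + "' field: " + docsWithField);
        System.out.println("docs in index (numDocs):  " + reader.numDocs());
    }
}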

From source file: org.codelibs.elasticsearch.common.lucene.all.AllTermQuery.java

License: Apache License

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    if (needsScores == false) {
        return new TermQuery(term).createWeight(searcher, needsScores);
    }
    final TermContext termStates = TermContext.build(searcher.getTopReaderContext(), term);
    final CollectionStatistics collectionStats = searcher.collectionStatistics(term.field());
    final TermStatistics termStats = searcher.termStatistics(term, termStates);
    final Similarity similarity = searcher.getSimilarity(needsScores);
    final SimWeight stats = similarity.computeWeight(collectionStats, termStats);
    return new Weight(this) {

        @Override
        public float getValueForNormalization() throws IOException {
            return stats.getValueForNormalization();
        }

        @Override
        public void normalize(float norm, float topLevelBoost) {
            stats.normalize(norm, topLevelBoost);
        }

        @Override
        public void extractTerms(Set<Term> terms) {
            terms.add(term);
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            AllTermScorer scorer = scorer(context);
            if (scorer != null) {
                int newDoc = scorer.iterator().advance(doc);
                if (newDoc == doc) {
                    float score = scorer.score();
                    float freq = scorer.freq();
                    SimScorer docScorer = similarity.simScorer(stats, context);
                    Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
                    Explanation termScoreExplanation = docScorer.explain(doc, freqExplanation);
                    Explanation payloadBoostExplanation = Explanation.match(scorer.payloadBoost(),
                            "payloadBoost=" + scorer.payloadBoost());
                    return Explanation.match(score,
                            "weight(" + getQuery() + " in " + doc + ") ["
                                    + similarity.getClass().getSimpleName() + "], product of:",
                            termScoreExplanation, payloadBoostExplanation);
                }
            }
            return Explanation.noMatch("no matching term");
        }

        @Override
        public AllTermScorer scorer(LeafReaderContext context) throws IOException {
            final Terms terms = context.reader().terms(term.field());
            if (terms == null) {
                return null;
            }
            final TermsEnum termsEnum = terms.iterator();
            if (termsEnum == null) {
                return null;
            }
            final TermState state = termStates.get(context.ord);
            if (state == null) {
                // Term does not exist in this segment
                return null;
            }
            termsEnum.seekExact(term.bytes(), state);
            PostingsEnum docs = termsEnum.postings(null, PostingsEnum.PAYLOADS);
            assert docs != null;
            return new AllTermScorer(this, docs, similarity.simScorer(stats, context));
        }

    };
}
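
As the javadoc at the top notes, collectionStatistics can also be overridden, for example to score against statistics aggregated across a distributed collection rather than the local index. A hedged sketch of that idea (ShardStatsClient and its fetch method are hypothetical; the five-argument CollectionStatistics constructor is the long-standing public one):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;

public class DistributedStatsSearcher extends IndexSearcher {
    private final ShardStatsClient statsClient; // hypothetical cluster-wide stats source

    public DistributedStatsSearcher(IndexReader localReader, ShardStatsClient statsClient) {
        super(localReader);
        this.statsClient = statsClient;
    }

    @Override
    public CollectionStatistics collectionStatistics(String field) throws IOException {
        ShardStatsClient.FieldStats s = statsClient.fetch(field); // hypothetical call
        if (s == null) {
            return super.collectionStatistics(field); // fall back to local statistics
        }
        return new CollectionStatistics(field, s.maxDoc, s.docCount,
                s.sumTotalTermFreq, s.sumDocFreq);
    }
}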