List of usage examples for org.apache.lucene.search IndexSearcher collectionStatistics
public CollectionStatistics collectionStatistics(String field) throws IOException
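Before the full examples below, here is a minimal, self-contained sketch of the call itself. The index path and field name are hypothetical, and it assumes a Lucene 5.x/6.x-style API in which FSDirectory.open takes a Path and CollectionStatistics exposes maxDoc(), docCount(), sumDocFreq() and sumTotalTermFreq():

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

public class CollectionStatisticsDemo {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Collection-wide statistics for a single field, as consumed by Similarity implementations.
            CollectionStatistics stats = searcher.collectionStatistics("text");
            System.out.println("field:            " + stats.field());
            System.out.println("maxDoc:           " + stats.maxDoc());           // documents in the index
            System.out.println("docCount:         " + stats.docCount());         // documents containing this field
            System.out.println("sumDocFreq:       " + stats.sumDocFreq());       // total postings for the field
            System.out.println("sumTotalTermFreq: " + stats.sumTotalTermFreq()); // total tokens for the field
        }
    }
}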
From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleWeight.java
License:Apache License
public FlexibleWeight(FlexibleQuery query, IndexSearcher searcher) throws IOException {
    this.query = query;
    this.similarity = searcher.getSimilarity();
    final IndexReaderContext context = searcher.getTopReaderContext();
    int[] maxDocFreqs = null;
    long[] maxTotalTermFreqs = null;
    // Cache TermContexts so each unique term is built only once across fields.
    Map<Term, TermContext> builtTermMap = new HashMap<>();
    if (query.enableGlobalIDF()) {
        // Global IDF: for each term position, take the max docFreq/totalTermFreq across all fields.
        FlexibleQuery.FlexibleTerm[][] globalTerms = query.getGlobalTerms();
        TermContext[][] globalStates = new TermContext[globalTerms.length][];
        for (int i = 0; i < globalTerms.length; ++i) {
            globalStates[i] = new TermContext[globalTerms[i].length];
            for (int j = 0; j < globalTerms[i].length; ++j) {
                Term term = globalTerms[i][j].term;
                TermContext termContext = builtTermMap.get(term);
                if (termContext != null) {
                    globalStates[i][j] = termContext;
                } else {
                    globalStates[i][j] = TermContext.build(context, globalTerms[i][j].term);
                    builtTermMap.put(term, globalStates[i][j]);
                }
            }
        }
        maxDocFreqs = new int[globalTerms[0].length];
        maxTotalTermFreqs = new long[globalTerms[0].length];
        int fieldLength = globalTerms.length;
        int termLength = globalTerms[0].length;
        for (int i = 0; i < termLength; ++i) {
            int maxDocFreq = 0;
            long maxTotalTermFreq = 0;
            for (int j = 0; j < fieldLength; ++j) {
                maxDocFreq = Math.max(globalStates[j][i].docFreq(), maxDocFreq);
                maxTotalTermFreq = Math.max(globalStates[j][i].totalTermFreq(), maxTotalTermFreq);
            }
            maxDocFreqs[i] = maxDocFreq;
            maxTotalTermFreqs[i] = maxTotalTermFreq;
        }
    }
    FlexibleQuery.FlexibleTerm[][] terms = query.getTerms();
    TermContext[][] states = new TermContext[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        states[i] = new TermContext[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            Term term = terms[i][j].term;
            TermContext termContext = builtTermMap.get(term);
            if (termContext != null) {
                states[i][j] = termContext;
            } else {
                states[i][j] = TermContext.build(context, terms[i][j].term);
                builtTermMap.put(term, states[i][j]);
            }
        }
    }
    termStatsMatrix = new TermStats[terms.length][];
    for (int i = 0; i < terms.length; ++i) {
        termStatsMatrix[i] = new TermStats[terms[i].length];
        for (int j = 0; j < terms[i].length; ++j) {
            FlexibleQuery.FlexibleTerm term = terms[i][j];
            TermContext state = states[i][j];
            TermStatistics termStats;
            if (query.enableGlobalIDF()) {
                termStats = new TermStatistics(term.term.bytes(), maxDocFreqs[j], maxTotalTermFreqs[j]);
            } else {
                termStats = searcher.termStatistics(term.term, state);
            }
            // Field-level collection statistics plus term statistics feed the Similarity.
            Similarity.SimWeight stats = similarity.computeWeight(term.boost,
                    searcher.collectionStatistics(term.term.field()), termStats);
            TermStats termStatsInfo = new TermStats();
            termStatsInfo.stats = stats;
            termStatsInfo.term = term.term;
            termStatsInfo.termContext = state;
            termStatsMatrix[i][j] = termStatsInfo;
        }
    }
}
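Stripped of the matrix bookkeeping, the pattern the constructor above applies to every term can be distilled as follows. This is a sketch, not part of the source above; the class and method names are hypothetical, and it assumes the Lucene 4.x-style Similarity.computeWeight(boost, collectionStats, termStats) signature the example itself uses:

import java.io.IOException;

import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;

final class TermWeightSketch {
    // Per-term scoring setup: segment-level term states, term-level statistics,
    // and field-level collection statistics feed the Similarity.
    static Similarity.SimWeight weightFor(IndexSearcher searcher, Term term, float boost) throws IOException {
        IndexReaderContext context = searcher.getTopReaderContext();
        TermContext state = TermContext.build(context, term);             // per-segment term states
        TermStatistics termStats = searcher.termStatistics(term, state);  // docFreq / totalTermFreq
        CollectionStatistics fieldStats = searcher.collectionStatistics(term.field());
        return searcher.getSimilarity().computeWeight(boost, fieldStats, termStats);
    }
}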
From source file:io.anserini.index.UpdateIndex.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(/*from w ww. j a va2 s . c om*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(UpdateIndex.class.getName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); textOptions.setStoreTermVectors(true); LOG.info("index: " + indexPath); File file = new File("PittsburghUserTimeline"); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } final StatusStream stream = new JsonStatusCorpusReader(file); Status status; String s; HashMap<Long, String> hm = new HashMap<Long, String>(); try { while ((s = stream.nextRaw()) != null) { try { status = DataObjectFactory.createStatus(s); if (status.getText() == null) { continue; } hm.put(status.getUser().getId(), hm.get(status.getUser().getId()) + status.getText().replaceAll("[\\r\\n]+", " ")); } catch (Exception e) { } } } catch (Exception e) { e.printStackTrace(); } finally { stream.close(); } ArrayList<String> userIDList = new ArrayList<String>(); try (BufferedReader br = new BufferedReader(new FileReader(new File("userID")))) { String line; while ((line = br.readLine()) != null) { userIDList.add(line.replaceAll("[\\r\\n]+", "")); // process the line. 
} } try { reader = DirectoryReader .open(FSDirectory.open(new File(cmdline.getOptionValue(INDEX_OPTION)).toPath())); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION))); final IndexWriterConfig config = new IndexWriterConfig(ANALYZER); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); final IndexWriter writer = new IndexWriter(dir, config); IndexSearcher searcher = new IndexSearcher(reader); System.out.println("The total number of docs indexed " + searcher.collectionStatistics(TweetStreamReader.StatusField.TEXT.name).docCount()); for (int city = 0; city < cityName.length; city++) { // Pittsburgh's coordinate -79.976389, 40.439722 Query q_long = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LONGITUDE.name, new Double(longitude[city] - 0.05), new Double(longitude[city] + 0.05), true, true); Query q_lat = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LATITUDE.name, new Double(latitude[city] - 0.05), new Double(latitude[city] + 0.05), true, true); BooleanQuery bqCityName = new BooleanQuery(); Term t = new Term("place", cityName[city]); TermQuery query = new TermQuery(t); bqCityName.add(query, BooleanClause.Occur.SHOULD); System.out.println(query.toString()); for (int i = 0; i < cityNameAlias[city].length; i++) { t = new Term("place", cityNameAlias[city][i]); query = new TermQuery(t); bqCityName.add(query, BooleanClause.Occur.SHOULD); System.out.println(query.toString()); } BooleanQuery bq = new BooleanQuery(); BooleanQuery finalQuery = new BooleanQuery(); // either a coordinate match bq.add(q_long, BooleanClause.Occur.MUST); bq.add(q_lat, BooleanClause.Occur.MUST); finalQuery.add(bq, BooleanClause.Occur.SHOULD); // or a place city name match finalQuery.add(bqCityName, BooleanClause.Occur.SHOULD); TotalHitCountCollector totalHitCollector = new TotalHitCountCollector(); // Query hasFieldQuery = new ConstantScoreQuery(new // FieldValueFilter("timeline")); // // searcher.search(hasFieldQuery, totalHitCollector); // // if (totalHitCollector.getTotalHits() > 0) { // TopScoreDocCollector collector = // TopScoreDocCollector.create(Math.max(0, // totalHitCollector.getTotalHits())); // searcher.search(finalQuery, collector); // ScoreDoc[] hits = collector.topDocs().scoreDocs; // // // HashMap<String, Integer> hasHit = new HashMap<String, Integer>(); // int dupcount = 0; // for (int i = 0; i < hits.length; ++i) { // int docId = hits[i].doc; // Document d; // // d = searcher.doc(docId); // // System.out.println(d.getFields()); // } // } // totalHitCollector = new TotalHitCountCollector(); searcher.search(finalQuery, totalHitCollector); if (totalHitCollector.getTotalHits() > 0) { TopScoreDocCollector collector = TopScoreDocCollector .create(Math.max(0, totalHitCollector.getTotalHits())); searcher.search(finalQuery, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits."); HashMap<String, Integer> hasHit = new HashMap<String, Integer>(); int dupcount = 0; for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d; d = searcher.doc(docId); if (userIDList.contains(d.get(IndexTweets.StatusField.USER_ID.name)) && hm.containsKey(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name)))) { // System.out.println("Has timeline field?" 
+ (d.get("timeline") != null)); // System.out.println(reader.getDocCount("timeline")); // d.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), // textOptions)); System.out.println("Found a user hit"); BytesRefBuilder brb = new BytesRefBuilder(); NumericUtils.longToPrefixCodedBytes(Long.parseLong(d.get(IndexTweets.StatusField.ID.name)), 0, brb); Term term = new Term(IndexTweets.StatusField.ID.name, brb.get()); // System.out.println(reader.getDocCount("timeline")); Document d_new = new Document(); // for (IndexableField field : d.getFields()) { // d_new.add(field); // } // System.out.println(d_new.getFields()); d_new.add(new StringField("userBackground", d.get(IndexTweets.StatusField.USER_ID.name), Store.YES)); d_new.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), textOptions)); // System.out.println(d_new.get()); writer.addDocument(d_new); writer.commit(); // t = new Term("label", "why"); // TermQuery tqnew = new TermQuery(t); // // totalHitCollector = new TotalHitCountCollector(); // // searcher.search(tqnew, totalHitCollector); // // if (totalHitCollector.getTotalHits() > 0) { // collector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits())); // searcher.search(tqnew, collector); // hits = collector.topDocs().scoreDocs; // // System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits."); // // for (int k = 0; k < hits.length; k++) { // docId = hits[k].doc; // d = searcher.doc(docId); // System.out.println(d.get(IndexTweets.StatusField.ID.name)); // System.out.println(d.get(IndexTweets.StatusField.PLACE.name)); // } // } // writer.deleteDocuments(term); // writer.commit(); // writer.addDocument(d); // writer.commit(); // System.out.println(reader.getDocCount("timeline")); // writer.updateDocument(term, d); // writer.commit(); } } } } reader.close(); writer.close(); }
From source file:org.codelibs.elasticsearch.common.lucene.all.AllTermQuery.java
License:Apache License
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    if (needsScores == false) {
        return new TermQuery(term).createWeight(searcher, needsScores);
    }
    final TermContext termStates = TermContext.build(searcher.getTopReaderContext(), term);
    final CollectionStatistics collectionStats = searcher.collectionStatistics(term.field());
    final TermStatistics termStats = searcher.termStatistics(term, termStates);
    final Similarity similarity = searcher.getSimilarity(needsScores);
    final SimWeight stats = similarity.computeWeight(collectionStats, termStats);
    return new Weight(this) {

        @Override
        public float getValueForNormalization() throws IOException {
            return stats.getValueForNormalization();
        }

        @Override
        public void normalize(float norm, float topLevelBoost) {
            stats.normalize(norm, topLevelBoost);
        }

        @Override
        public void extractTerms(Set<Term> terms) {
            terms.add(term);
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            AllTermScorer scorer = scorer(context);
            if (scorer != null) {
                int newDoc = scorer.iterator().advance(doc);
                if (newDoc == doc) {
                    float score = scorer.score();
                    float freq = scorer.freq();
                    SimScorer docScorer = similarity.simScorer(stats, context);
                    Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
                    Explanation termScoreExplanation = docScorer.explain(doc, freqExplanation);
                    Explanation payloadBoostExplanation =
                            Explanation.match(scorer.payloadBoost(), "payloadBoost=" + scorer.payloadBoost());
                    return Explanation.match(score,
                            "weight(" + getQuery() + " in " + doc + ") ["
                                    + similarity.getClass().getSimpleName() + "], product of:",
                            termScoreExplanation, payloadBoostExplanation);
                }
            }
            return Explanation.noMatch("no matching term");
        }

        @Override
        public AllTermScorer scorer(LeafReaderContext context) throws IOException {
            final Terms terms = context.reader().terms(term.field());
            if (terms == null) {
                return null;
            }
            final TermsEnum termsEnum = terms.iterator();
            if (termsEnum == null) {
                return null;
            }
            final TermState state = termStates.get(context.ord);
            if (state == null) {
                // Term does not exist in this segment
                return null;
            }
            termsEnum.seekExact(term.bytes(), state);
            PostingsEnum docs = termsEnum.postings(null, PostingsEnum.PAYLOADS);
            assert docs != null;
            return new AllTermScorer(this, docs, similarity.simScorer(stats, context));
        }
    };
}
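This example uses the Lucene 6.x-style API, where computeWeight no longer takes a boost and the searcher hands out a scoring or non-scoring Similarity via getSimilarity(needsScores). Its statistics gathering reduces to the following sketch, built from the same calls the example itself makes (class and method names hypothetical):

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;

final class TermWeightSketch6x {
    static Similarity.SimWeight weightFor(IndexSearcher searcher, Term term, boolean needsScores)
            throws IOException {
        // Same three inputs as createWeight above: term states, field stats, term stats.
        TermContext termStates = TermContext.build(searcher.getTopReaderContext(), term);
        CollectionStatistics collectionStats = searcher.collectionStatistics(term.field());
        TermStatistics termStats = searcher.termStatistics(term, termStates);
        Similarity similarity = searcher.getSimilarity(needsScores);
        return similarity.computeWeight(collectionStats, termStats);
    }
}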