List of usage examples for the org.apache.lucene.search.AutomatonQuery constructor
public AutomatonQuery(final Term term, Automaton automaton)
From source file: org.apache.solr.parser.SolrQueryParserBase.java
License: Apache License
protected Query getWildcardQuery(String field, String termStr) throws SyntaxError { checkNullField(field);/*from w w w . j ava2 s.c om*/ // *:* -> MatchAllDocsQuery if ("*".equals(termStr)) { if ("*".equals(field) || getExplicitField() == null) { return newMatchAllDocsQuery(); } } FieldType fieldType = schema.getFieldType(field); termStr = analyzeIfMultitermTermText(field, termStr, fieldType); // can we use reversed wildcards in this field? ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType); if (factory != null) { Term term = new Term(field, termStr); // fsa representing the query Automaton automaton = WildcardQuery.toAutomaton(term); // TODO: we should likely use the automaton to calculate shouldReverse, too. if (factory.shouldReverse(termStr)) { automaton = BasicOperations.concatenate(automaton, BasicAutomata.makeChar(factory.getMarkerChar())); SpecialOperations.reverse(automaton); } else { // reverse wildcardfilter is active: remove false positives // fsa representing false positives (markerChar*) Automaton falsePositives = BasicOperations.concatenate( BasicAutomata.makeChar(factory.getMarkerChar()), BasicAutomata.makeAnyString()); // subtract these away automaton = BasicOperations.minus(automaton, falsePositives); } return new AutomatonQuery(term, automaton) { // override toString so its completely transparent @Override public String toString(String field) { StringBuilder buffer = new StringBuilder(); if (!getField().equals(field)) { buffer.append(getField()); buffer.append(":"); } buffer.append(term.text()); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } }; } // Solr has always used constant scoring for wildcard queries. This should return constant scoring by default. return newWildcardQuery(new Term(field, termStr)); }
From source file: perf.TermsQueryPerf.java
License: Apache License
public static void main(String[] args) throws Exception { List<BytesRef> lookupIDs = new ArrayList<>(); Random random = new Random(17); double rate = 1.01 * ((double) NUM_QUERIES * ID_SEARCH_COUNT) / ID_INDEX_COUNT; Path indexPath = Paths.get(args[0]); boolean doIndex = Files.exists(indexPath) == false; Directory dir = FSDirectory.open(indexPath); if (doIndex) { IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer()); iwc.setMergeScheduler(new SerialMergeScheduler()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // So I can walk the files and get the *.tip sizes: iwc.setUseCompoundFile(false);/*from w ww . j ava 2 s . c o m*/ /// 7/7/7 segment structure: iwc.setMaxBufferedDocs(ID_INDEX_COUNT / 777); iwc.setRAMBufferSizeMB(-1); ((TieredMergePolicy) iwc.getMergePolicy()).setFloorSegmentMB(.001); ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(0.0); IndexWriter w = new IndexWriter(dir, iwc); // IDIterator ids = zeroPadSequentialIDs(10); IDIterator ids = randomIDs(10, random); BytesRef idValue = new BytesRef(64); for (int i = 0; i < ID_INDEX_COUNT; i++) { ids.next(idValue); Document doc = new Document(); doc.add(new StringField("id", idValue, Field.Store.NO)); w.addDocument(doc); if (random.nextDouble() <= rate && lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) { lookupIDs.add(BytesRef.deepCopyOf(idValue)); } if (i % 100000 == 0) { System.out.println(i + " docs..."); } } w.close(); } IndexReader r = DirectoryReader.open(dir); if (doIndex == false) { System.out.println("Build lookup ids"); TermsEnum termsEnum = MultiFields.getTerms(r, "id").iterator(); BytesRef idValue; while ((idValue = termsEnum.next()) != null) { if (random.nextDouble() <= rate && lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) { lookupIDs.add(BytesRef.deepCopyOf(idValue)); //System.out.println("add: " + idValue); } } shuffle(random, lookupIDs); System.out.println("Done build lookup ids"); } IndexSearcher s = new IndexSearcher(r); if (lookupIDs.size() < NUM_QUERIES 
* ID_SEARCH_COUNT) { throw new RuntimeException( "didn't get enough lookup ids: " + (NUM_QUERIES * ID_SEARCH_COUNT) + " vs " + lookupIDs.size()); } List<Query> queries = new ArrayList<Query>(); for (int i = 0; i < NUM_QUERIES; i++) { List<BytesRef> sortedTermBytes = new ArrayList<>(); for (BytesRef term : lookupIDs.subList(i * ID_SEARCH_COUNT, (i + 1) * ID_SEARCH_COUNT)) { sortedTermBytes.add(term); } Collections.sort(sortedTermBytes); // nocommit only do this if term count is high enough? // nocommit: we can be more efficient here, go straight to binary: Query query = new AutomatonQuery(new Term("id", "manyterms"), Automata.makeStringUnion(sortedTermBytes)); //((MultiTermQuery) query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); //Query query = new TermsQuery("id", lookupIDs.subList(i*ID_SEARCH_COUNT, (i+1)*ID_SEARCH_COUNT)); queries.add(query); } // TODO: also include construction time of queries long best = Long.MAX_VALUE; for (int iter = 0; iter < 100; iter++) { long t0 = System.nanoTime(); long totCount = 0; for (int i = 0; i < NUM_QUERIES; i++) { //Query query = new TermsQuery("id", lookupIDs.subList(i*ID_SEARCH_COUNT, (i+1)*ID_SEARCH_COUNT)); Query query = queries.get(i); totCount += s.search(query, 10).totalHits; } if (totCount != NUM_QUERIES * ID_SEARCH_COUNT) { throw new RuntimeException( "totCount=" + totCount + " but expected " + (NUM_QUERIES * ID_SEARCH_COUNT)); } long t = System.nanoTime() - t0; System.out.println("ITER: " + iter + ": " + (t / 1000000.) + " msec"); if (t < best) { System.out.println(" **"); best = t; } } IOUtils.close(r, dir); }