Example usage for the org.apache.lucene.search.AutomatonQuery constructor

List of usage examples for org.apache.lucene.search AutomatonQuery AutomatonQuery

Introduction

On this page you can find example usage for the org.apache.lucene.search.AutomatonQuery constructor.

Prototype

public AutomatonQuery(final Term term, Automaton automaton) 

Source Link

Document

Creates a new AutomatonQuery from a Term and an Automaton.

Usage

From source file:org.apache.solr.parser.SolrQueryParserBase.java

License:Apache License

/**
 * Builds the query for a wildcard term on the given field.
 *
 * <p>A term of {@code *} on field {@code *}, or on any field when
 * {@code getExplicitField()} returns {@code null}, yields the match-all-docs query.
 * Otherwise the term text is passed through {@code analyzeIfMultitermTermText}. If the
 * field type has a {@link ReversedWildcardFilterFactory}, an {@link AutomatonQuery} is
 * returned: either the reversed automaton (with the marker char) when the factory says the
 * term should be reversed, or the wildcard automaton minus every string that starts with
 * the marker char. Without such a factory a plain wildcard query is returned.
 *
 * @param field   field to search; checked with {@code checkNullField}
 * @param termStr wildcard term text
 * @return the query for this wildcard term
 * @throws SyntaxError declared by the signature; not thrown directly by the code in this method
 */
protected Query getWildcardQuery(String field, String termStr) throws SyntaxError {
    checkNullField(field);

    // *:* -> MatchAllDocsQuery
    if ("*".equals(termStr) && ("*".equals(field) || getExplicitField() == null)) {
        return newMatchAllDocsQuery();
    }

    final FieldType fieldType = schema.getFieldType(field);
    termStr = analyzeIfMultitermTermText(field, termStr, fieldType);

    // can we use reversed wildcards in this field?
    final ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType);
    if (factory == null) {
        // Solr has always used constant scoring for wildcard queries.  This should return constant scoring by default.
        return newWildcardQuery(new Term(field, termStr));
    }

    // NOTE: this local must stay named "term"; the anonymous subclass below refers to "term".
    final Term term = new Term(field, termStr);
    // fsa representing the query
    Automaton automaton = WildcardQuery.toAutomaton(term);

    // TODO: we should likely use the automaton to calculate shouldReverse, too.
    final boolean reverse = factory.shouldReverse(termStr);
    if (!reverse) {
        // reverse wildcardfilter is active: remove false positives
        // fsa representing false positives (markerChar*)
        final Automaton marker = BasicAutomata.makeChar(factory.getMarkerChar());
        final Automaton falsePositives = BasicOperations.concatenate(marker, BasicAutomata.makeAnyString());
        // subtract these away
        automaton = BasicOperations.minus(automaton, falsePositives);
    } else {
        final Automaton marker = BasicAutomata.makeChar(factory.getMarkerChar());
        automaton = BasicOperations.concatenate(automaton, marker);
        // The return value is ignored, as in the original code, which relies on the
        // argument being modified.
        SpecialOperations.reverse(automaton);
    }

    return new AutomatonQuery(term, automaton) {
        // override toString so its completely transparent
        @Override
        public String toString(String field) {
            final String prefix = getField().equals(field) ? "" : getField() + ":";
            return prefix + term.text() + ToStringUtils.boost(getBoost());
        }
    };
}

From source file:perf.TermsQueryPerf.java

License:Apache License

/**
 * Benchmark entry point: measures how long it takes to run {@code NUM_QUERIES} multi-term
 * queries, each an {@link AutomatonQuery} over the union of {@code ID_SEARCH_COUNT} id terms.
 *
 * <p>If the index path does not exist yet, an index of {@code ID_INDEX_COUNT} random ids is
 * built first and lookup ids are sampled while indexing. Otherwise the lookup ids are sampled
 * from the terms of the existing index and shuffled. The query set is then run 100 times,
 * printing the elapsed time of each iteration and marking every new best time with
 * {@code **}.
 *
 * @param args {@code args[0]} is the index directory path
 * @throws IllegalArgumentException if no index path is given
 * @throws RuntimeException if too few lookup ids were sampled, or an iteration's total hit
 *         count differs from {@code NUM_QUERIES * ID_SEARCH_COUNT}
 * @throws Exception on any indexing or search failure
 */
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        throw new IllegalArgumentException("usage: TermsQueryPerf <indexPath>");
    }

    List<BytesRef> lookupIDs = new ArrayList<>();
    // Fixed seed so id generation and sampling repeat from run to run.
    Random random = new Random(17);
    // Sample slightly (1%) more than strictly needed so we end up with enough lookup ids.
    double rate = 1.01 * ((double) NUM_QUERIES * ID_SEARCH_COUNT) / ID_INDEX_COUNT;

    Path indexPath = Paths.get(args[0]);

    // Must be evaluated before the directory is opened.
    // NOTE(review): FSDirectory.open presumably creates the directory - confirm.
    boolean doIndex = Files.exists(indexPath) == false;

    // try-with-resources so the directory and reader are released on the failure paths too.
    try (Directory dir = FSDirectory.open(indexPath)) {

        if (doIndex) {
            IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
            iwc.setMergeScheduler(new SerialMergeScheduler());
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

            // So I can walk the files and get the *.tip sizes:
            iwc.setUseCompoundFile(false);

            /// 7/7/7 segment structure:
            // NOTE(review): -1 appears to be IndexWriterConfig.DISABLE_AUTO_FLUSH, leaving the
            // doc count as the only flush trigger - confirm. Keep this call order.
            iwc.setMaxBufferedDocs(ID_INDEX_COUNT / 777);
            iwc.setRAMBufferSizeMB(-1);

            ((TieredMergePolicy) iwc.getMergePolicy()).setFloorSegmentMB(.001);
            ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(0.0);

            // NOTE(review): the writer is still closed only on the success path; closing it on
            // failure would presumably commit a partial index that a later run would silently
            // reuse - confirm before switching to try-with-resources.
            IndexWriter w = new IndexWriter(dir, iwc);
            // IDIterator ids = zeroPadSequentialIDs(10);
            IDIterator ids = randomIDs(10, random);

            BytesRef idValue = new BytesRef(64);
            for (int i = 0; i < ID_INDEX_COUNT; i++) {
                ids.next(idValue);
                Document doc = new Document();
                doc.add(new StringField("id", idValue, Field.Store.NO));
                w.addDocument(doc);
                if (random.nextDouble() <= rate && lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) {
                    // idValue is reused on every iteration, so keep a private copy.
                    lookupIDs.add(BytesRef.deepCopyOf(idValue));
                }
                if (i % 100000 == 0) {
                    System.out.println(i + " docs...");
                }
            }
            w.close();
        }

        try (IndexReader r = DirectoryReader.open(dir)) {

            if (doIndex == false) {
                System.out.println("Build lookup ids");
                TermsEnum termsEnum = MultiFields.getTerms(r, "id").iterator();
                BytesRef idValue;
                while ((idValue = termsEnum.next()) != null) {
                    if (random.nextDouble() <= rate && lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) {
                        lookupIDs.add(BytesRef.deepCopyOf(idValue));
                        //System.out.println("add: " + idValue);
                    }
                }
                shuffle(random, lookupIDs);
                System.out.println("Done build lookup ids");
            }

            IndexSearcher s = new IndexSearcher(r);

            if (lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) {
                throw new RuntimeException(
                        "didn't get enough lookup ids: " + (NUM_QUERIES * ID_SEARCH_COUNT) + " vs " + lookupIDs.size());
            }

            List<Query> queries = new ArrayList<>();
            for (int i = 0; i < NUM_QUERIES; i++) {

                // Copy this query's slice of the ids so sorting does not reorder lookupIDs.
                List<BytesRef> sortedTermBytes = new ArrayList<>(
                        lookupIDs.subList(i * ID_SEARCH_COUNT, (i + 1) * ID_SEARCH_COUNT));
                Collections.sort(sortedTermBytes);

                // nocommit only do this if term count is high enough?
                // nocommit: we can be more efficient here, go straight to binary:
                Query query = new AutomatonQuery(new Term("id", "manyterms"),
                        Automata.makeStringUnion(sortedTermBytes));
                //((MultiTermQuery) query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
                //Query query = new TermsQuery("id", lookupIDs.subList(i*ID_SEARCH_COUNT, (i+1)*ID_SEARCH_COUNT));
                queries.add(query);
            }

            // TODO: also include construction time of queries
            long best = Long.MAX_VALUE;
            for (int iter = 0; iter < 100; iter++) {
                long t0 = System.nanoTime();
                long totCount = 0;
                for (int i = 0; i < NUM_QUERIES; i++) {
                    //Query query = new TermsQuery("id", lookupIDs.subList(i*ID_SEARCH_COUNT, (i+1)*ID_SEARCH_COUNT));
                    Query query = queries.get(i);
                    totCount += s.search(query, 10).totalHits;
                }
                // Every lookup id should hit exactly once in total; anything else is an error.
                if (totCount != NUM_QUERIES * ID_SEARCH_COUNT) {
                    throw new RuntimeException(
                            "totCount=" + totCount + " but expected " + (NUM_QUERIES * ID_SEARCH_COUNT));
                }
                long t = System.nanoTime() - t0;
                System.out.println("ITER: " + iter + ": " + (t / 1000000.) + " msec");
                if (t < best) {
                    System.out.println("  **");
                    best = t;
                }
            }
        }
    }
}