Example usage for org.apache.lucene.search IndexSearcher setSimilarity

Introduction

On this page you can find example usages of org.apache.lucene.search.IndexSearcher.setSimilarity.

Prototype

public void setSimilarity(Similarity similarity)

Source Link

Documentation

Expert: Set the Similarity implementation used by this IndexSearcher.

Usage

From source file:io.anserini.search.SimpleSearcher.java

License:Apache License

/**
 * Runs a bag-of-words query against the body field and returns the top hits.
 *
 * @param q the query text, turned into a query with this object's analyzer
 * @param k the maximum number of hits to retrieve
 * @return one {@code Result} per hit, in the order the searcher returned them; the
 *         content of a result is {@code null} when the stored document has no raw field
 * @throws IOException if searching or loading a stored document fails
 */
public Result[] search(String q, int k) throws IOException {
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    indexSearcher.setSimilarity(similarity);

    Query bagOfWords = AnalyzerUtils.buildBagOfWordsQuery(LuceneDocumentGenerator.FIELD_BODY, analyzer, q);
    ScoreDoc[] scoreDocs = indexSearcher.search(bagOfWords, k).scoreDocs;

    Result[] results = new Result[scoreDocs.length];
    int next = 0;
    for (ScoreDoc hit : scoreDocs) {
        Document stored = indexSearcher.doc(hit.doc);
        String externalId = stored.getField(LuceneDocumentGenerator.FIELD_ID).stringValue();

        // The raw field is optional; hits without it carry null content.
        IndexableField rawField = stored.getField(LuceneDocumentGenerator.FIELD_RAW);
        String content = null;
        if (rawField != null) {
            content = rawField.stringValue();
        }

        results[next++] = new Result(externalId, hit.doc, hit.score, content);
    }

    return results;
}

From source file:io.anserini.SearcherCW09B.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator   Default search operator: AND or OR
 * @throws IOException/*from  w  w  w.  j  a  v  a 2 s.  c  o m*/
 * @throws ParseException
 */

public void search(String topicsFile, QueryParser.Operator operator) throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    // PrintWriter out = new PrintWriter(Files.newBufferedWriter(path.resolve(runTag + ".txt"), StandardCharsets.US_ASCII));

    PrintStream out = System.out;

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {

        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }
    // out.flush();
    // out.close();
}

From source file:io.puntanegra.fhir.index.lucene.LuceneService.java

License:Apache License

/**
 * Builds a new {@link FSIndex}./*from w w w .  ja  v  a2 s  .c  om*/
 *
 * @param name
 *            the index name
 * @param mbeanName
 *            the JMX MBean object name
 * @param path
 *            the directory path
 * @param analyzer
 *            the index writer analyzer
 * @param refresh
 *            the index reader refresh frequency in seconds
 * @param ramBufferMB
 *            the index writer RAM buffer size in MB
 * @param maxMergeMB
 *            the directory max merge size in MB
 * @param maxCachedMB
 *            the directory max cache size in MB
 * @param refreshTask
 *            action to be done during refresh
 */
public void init(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB,
        int maxMergeMB, int maxCachedMB, Runnable refreshTask) {
    try {

        this.path = path;
        this.name = name;

        // Open or create directory
        FSDirectory fsDirectory = FSDirectory.open(path);
        this.directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB);

        // Setup index writer
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        indexWriterConfig.setRAMBufferSizeMB(ramBufferMB);
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        indexWriterConfig.setUseCompoundFile(true);
        indexWriterConfig.setMergePolicy(new TieredMergePolicy());
        this.indexWriter = new IndexWriter(this.directory, indexWriterConfig);

        // Setup NRT search
        SearcherFactory searcherFactory = new SearcherFactory() {
            @Override
            public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) {
                if (refreshTask != null) {
                    refreshTask.run();
                }
                IndexSearcher searcher = new IndexSearcher(reader);
                searcher.setSimilarity(new NoIDFSimilarity());
                return searcher;
            }
        };
        TrackingIndexWriter trackingWriter = new TrackingIndexWriter(this.indexWriter);
        this.searcherManager = new SearcherManager(this.indexWriter, true, searcherFactory);
        this.searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, this.searcherManager,
                refresh, refresh);
        this.searcherReopener.start();

        // Register JMX MBean
        // mbean = new ObjectName(mbeanName);
        // ManagementFactory.getPlatformMBeanServer().registerMBean(service,
        // this.mbean);

    } catch (Exception e) {
        throw new FhirIndexException(e, "Error while creating index %s", name);
    }
}

From source file:ir.ac.ut.engine.FeaturedRetriever.java

/**
 * Searches one field of the index with a Dirichlet-smoothed language model and reports
 * the hits through {@code reportInTREC}.
 *
 * <p>The query is lower-cased, escaped and parsed with an analyzer chosen by field. When
 * the query cannot be parsed, the stack trace and the query id are printed and an empty
 * array is returned.
 *
 * @param query the raw query text
 * @param qId   the query identifier, used for reporting
 * @param field the name of the field to search
 * @return the hits sorted by relevance; empty when the query could not be parsed
 * @throws IOException if the search fails
 */
public static ScoreDoc[] search(String query, String qId, String field) throws IOException {
    final float mu = 1000f;
    query = query.toLowerCase();
    // setMaxClauseCount rejects values below 1, so an empty query must not pass 0.
    BooleanQuery.setMaxClauseCount(Math.max(1, query.length()));

    Analyzer analyzer;
    if (field.equals(IndexedDocument.FIELD_REAL_ID)) {
        analyzer = new SimpleAnalyzer(Version.LUCENE_CURRENT);
    } else if (field.equals(IndexedDocument.FIELD_NAMED_ENTITIES)
            || field.equals(IndexedDocument.FIELD_SORTED_BIGRAMS)
            || field.equals(IndexedDocument.FIELD_SORTED_TRIGRAMS)
            || field.equals(IndexedDocument.FIELD_STOPWORDS3Gram)
            || field.equals(IndexedDocument.FIELD_POS3GRAM)) {
        // All of these fields share the n-gram analyzer.
        analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer();
    } else {
        analyzer = (new MyAnalyzer(false)).MyDefaultAnalyzer();
    }

    QueryParser qParser = new QueryParser(Version.LUCENE_47, field, analyzer);
    Query q;
    try {
        // The whole text is escaped so query-syntax characters are treated literally.
        q = qParser.parse(QueryParser.escape(query));
    } catch (org.apache.lucene.queryparser.classic.ParseException e) {
        e.printStackTrace();
        System.out.println("Exceptional Query:" + qId);
        return new ScoreDoc[0];
    }

    Similarity simFunction = new LMDirichletSimilarity(mu);
    IndexSearcher isearcher = new IndexSearcher(ireader);
    isearcher.setSimilarity(simFunction);

    // Collect every document of the index; TopFieldCollector.create rejects a
    // non-positive hit count, which an empty index would otherwise produce.
    int numHits = Math.max(1, ireader.numDocs());
    TopFieldCollector tfc = TopFieldCollector.create(Sort.RELEVANCE, numHits, true, true, true, false);
    isearcher.search(q, tfc);

    ScoreDoc[] hits = tfc.topDocs().scoreDocs;
    reportInTREC(hits, qId);
    return hits;
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval.java

License:Apache License

/**
 * Runs a batch retrieval experiment configured through properties and writes the run
 * file.
 *
 * <p>Properties are loaded from the file named by the {@code properties.path} system
 * property, or from the {@code lucene-clef.properties} classpath resource when that
 * system property is not set. System properties are then copied on top of the loaded
 * ones. Failures during retrieval are printed to standard error.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    String propertiesPath = System.getProperty("properties.path");
    try {
        if (propertiesPath != null) {
            try (InputStream input = new FileInputStream(propertiesPath)) {
                properties.load(input);
            }
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            try (InputStream input = loader.getResourceAsStream("lucene-clef.properties")) {
                // getResourceAsStream returns null when the resource is missing; loading
                // from null would fail with an uncaught NullPointerException.
                if (input == null) {
                    logger.info("Default property file not found on the classpath;"
                            + " relying on system properties only");
                } else {
                    properties.load(input);
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }

    // System properties take precedence over the ones loaded from file.
    properties.putAll(System.getProperties());

    Path indexPath = new File(properties.getProperty("index.path")).toPath();

    Path runPath = new File(properties.getProperty("run.path")).toPath();

    String language = properties.getProperty("language");

    String stemmer = properties.getProperty("stemmer");

    String stopsetType = properties.getProperty("stopset.type");

    String stopsetPath = properties.getProperty("stopset.path");

    // The directory and the reader are closed even when retrieval fails.
    try (Directory directory = new SimpleFSDirectory(indexPath);
            IndexReader reader = DirectoryReader.open(directory)) {

        IndexSearcher searcher = new IndexSearcher(reader);

        String configuredModel = properties.getProperty("run.model");
        if (configuredModel == null) {
            throw new IllegalArgumentException("Missing required property run.model");
        }
        String model = configuredModel.toUpperCase();

        Similarity similarity;

        switch (model) {
        case "BM25":
            similarity = new BM25Similarity(Float.parseFloat(properties.getProperty("bm25.k1", "1.2f")),
                    Float.parseFloat(properties.getProperty("bm25.b", "0.75f")));
            break;
        default:
            throw new UnsupportedOperationException("Model " + model + " not supported yet");

        }

        searcher.setSimilarity(similarity);

        int maxResults = Integer.parseInt(properties.getProperty("maxresults", "1000"));

        // The run file is opened only after the model has been validated, and it is closed
        // (and thereby flushed) even when the benchmark fails.
        try (PrintWriter runWriter = new PrintWriter(
                Files.newBufferedWriter(runPath, StandardCharsets.UTF_8))) {

            // NOTE(review): the report is labelled with the model name; the run.tag
            // property was read but never used by the original code - confirm intent.
            SubmissionReport runfile = new SubmissionReport(runWriter, model);

            String topicPath = properties.getProperty("topics.path");

            String[] topicFields = properties.getProperty("topics.fields").split(";");

            CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

            QualityQuery qqs[] = getQualityQueries(topicPath, topicFields);

            QualityQueryParser qqParser = new ClefQQParser(topicFields, BuildIndex.BODY_FIELD_NAME, language,
                    stemmer, stopset);

            // run the benchmark
            QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, BuildIndex.ID_FIELD_NAME);

            qrun.setMaxResults(maxResults);

            qrun.execute(null, runfile, null);
        }

    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:lucenesearch.RelevantPostFinder.java

/**
 * For every row of the {@code java_test_data} table, searches the post index for
 * questions tagged "java" whose body matches the row's post body, and writes the ids of
 * the matching posts to {@code ./data/rel_posts/<PostId>.txt}.
 *
 * <p>When the row's {@code OriginalPostId} is not among the hits, the post id is appended
 * to {@code ./data/dup_not_exist.txt}. Only the first 600 analyzed body tokens are used
 * to build the query.
 *
 * @throws SQLException   if the database cannot be reached or queried
 * @throws IOException    if the index or an output file cannot be accessed
 * @throws ParseException if a query cannot be parsed
 */
public void saveRelevantPost() throws SQLException, IOException, ParseException {
    // NOTE(review): connection settings, including credentials, are hard-coded here;
    // consider moving them to configuration.
    String url = "jdbc:mysql://localhost:3306/sof17";
    String username = "root";
    String password = "root";
    String folderPath = "./data/rel_posts/";
    String dupNotFound = "./data/dup_not_exist.txt";
    int hitsPerPage = 10000;
    String query = "select PostId,PostBody,OriginalPostId from java_test_data";

    System.out.println("Connecting database...");

    // All JDBC and index resources are closed even when processing fails part-way.
    try (Connection conn = DriverManager.getConnection(url, username, password)) {
        System.out.println("Database connected!");

        try (Statement stmt = conn.createStatement();
                ResultSet rs = stmt.executeQuery(query);
                FSDirectory indexDirectory = FSDirectory.open(Paths.get(new Searcher().getPostIndexPath()));
                IndexReader reader = DirectoryReader.open(indexDirectory)) {

            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity());

            Analyzer analyzer = new StandardAnalyzer();

            int cnt = 0;

            while (rs.next()) {
                System.out.println("Processing post " + (++cnt));

                int postid = rs.getInt("PostId");
                int dupId = rs.getInt("OriginalPostId");
                // ':' is replaced before analysis; presumably so it is not read as
                // field syntax by the query parser - confirm.
                ArrayList<String> bd = LuceneUtils
                        .getAnalyzedRemoveHtml(rs.getString("PostBody").replace(':', ' '));

                // Cap the query at the first 600 tokens.
                StringBuilder sb = new StringBuilder();
                int j = 0;
                for (String b : bd) {
                    if (++j > 600) {
                        break;
                    }
                    sb.append(b);
                    sb.append(" ");
                }
                String body = sb.toString();

                BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
                booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 1), BooleanClause.Occur.MUST);
                booleanQuery.add(new QueryParser("Tags", analyzer).parse("java"), BooleanClause.Occur.MUST);
                booleanQuery.add(new QueryParser("Body", analyzer).parse(body), BooleanClause.Occur.MUST);

                TopDocs results = searcher.search(booleanQuery.build(), hitsPerPage);
                ScoreDoc[] hits = results.scoreDocs;

                int numTotalHits = results.totalHits;
                System.out.println(numTotalHits + " total matching documents");

                boolean isFound = false;

                // Iterate over the hits actually returned rather than a computed bound.
                try (PrintWriter out = new PrintWriter(folderPath + postid + ".txt")) {
                    for (int i = 0; i < hits.length; i++) {
                        Document doc = searcher.doc(hits[i].doc);
                        int id = Integer.parseInt(doc.get("SId"));
                        if (id == dupId) {
                            isFound = true;
                        }
                        out.println(id);
                    }
                }

                if (!isFound) {
                    System.out.println("Duplicate not found");
                    try (PrintWriter out2 = new PrintWriter(
                            new FileOutputStream(new File(dupNotFound), true /* append = true */))) {
                        out2.println(postid);
                    }
                }

            }
        }
    }
}

From source file:main.java.run.SearchFiles.java

License:Apache License

/**
 * Simple command-line based search demo.
 *
 * <p>Parses the options listed in the usage string, opens the index, and then runs
 * queries taken from the {@code -query} option, the {@code -queries} file, or standard
 * input. Reading stops at end of input or at the first blank line; a query given with
 * {@code -query} is run once. Scoring uses BM25.
 *
 * @param args command-line options
 * @throws Exception if the index cannot be opened, a query cannot be parsed, or I/O fails
 */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    BufferedReader in = null;
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

        if (queries != null) {
            in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
        }
        QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer);
        while (true) {
            if (queries == null && queryString == null) { // prompt the user
                System.out.println("Enter query: ");
            }

            String line = queryString != null ? queryString : in.readLine();

            // End of input. (A length can never be -1, so only null needs checking.)
            if (line == null) {
                break;
            }

            line = line.trim();
            if (line.length() == 0) {
                break;
            }

            Query query = parser.parse(line);
            System.out.println("Searching for: " + query.toString(field));

            if (repeat > 0) { // repeat & time as benchmark
                long start = System.currentTimeMillis();
                for (int i = 0; i < repeat; i++) {
                    searcher.search(query, null, 100);
                }
                long end = System.currentTimeMillis();
                System.out.println("Time: " + (end - start) + "ms");
            }

            doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

            if (queryString != null) {
                break;
            }
        }
    } finally {
        try {
            // Close the queries file, but leave System.in open when reading from it.
            if (queries != null && in != null) {
                in.close();
            }
        } finally {
            reader.close();
        }
    }
}

From source file:net.semanticmetadata.lire.impl.BitSamplingImageSearcher.java

License:Open Source License

/**
 * Finds candidates through a text search on the hash field, then re-ranks them by the
 * distance between their stored feature and the query feature.
 *
 * @param hashes       hash values of the query image; each becomes an optional term clause
 * @param queryFeature feature of the query image, used to compute distances
 * @param reader       reader on the index to search
 * @return at most {@code maximumHits} results together with the largest distance tracked
 * @throws IOException if searching or loading a stored document fails
 */
private ImageSearchHits search(String[] hashes, LireFeature queryFeature, IndexReader reader)
        throws IOException {
    // first search by text:
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BaseSimilarity());
    BooleanQuery query = new BooleanQuery();
    for (int i = 0; i < hashes.length; i++) {
        // be aware that the hashFunctionsFileName of the field must match the one you put the hashes in before.
        // With partialHashes enabled each hash is included with probability 0.5;
        // otherwise every hash is included.
        if (!partialHashes || Math.random() < 0.5) {
            query.add(new BooleanClause(new TermQuery(new Term(hashesFieldName, hashes[i] + "")),
                    BooleanClause.Occur.SHOULD));
        }
    }
    TopDocs docs = searcher.search(query, maxResultsHashBased);
    // then re-rank
    TreeSet<SimpleResult> resultScoreDocs = new TreeSet<SimpleResult>();
    float maxDistance = -1f;
    float tmpScore;
    for (int i = 0; i < docs.scoreDocs.length; i++) {
        // Load the stored document and its feature bytes once per hit instead of
        // re-reading the document for every access.
        int docId = docs.scoreDocs[i].doc;
        org.apache.lucene.document.Document candidate = reader.document(docId);
        org.apache.lucene.util.BytesRef featureBytes = candidate.getBinaryValue(featureFieldName);
        feature.setByteArrayRepresentation(featureBytes.bytes, featureBytes.offset, featureBytes.length);
        tmpScore = queryFeature.getDistance(feature);
        assert (tmpScore >= 0);
        if (resultScoreDocs.size() < maximumHits) {
            resultScoreDocs.add(new SimpleResult(tmpScore, candidate, docId));
            maxDistance = Math.max(maxDistance, tmpScore);
        } else if (tmpScore < maxDistance) {
            // if it is nearer to the sample than at least one of the current set:
            // remove the last one ...
            resultScoreDocs.remove(resultScoreDocs.last());
            // add the new one ...
            resultScoreDocs.add(new SimpleResult(tmpScore, candidate, docId));
            // and set our new distance border ...
            maxDistance = resultScoreDocs.last().getDistance();
        }
    }
    assert (resultScoreDocs.size() <= maximumHits);
    return new SimpleImageSearchHits(resultScoreDocs, maxDistance);
}

From source file:net.semanticmetadata.lire.impl.LshImageSearcher.java

License:Open Source License

/**
 * Finds candidates through a text search on the hash field, then re-ranks them by the
 * distance between their stored feature and the query feature.
 *
 * <p>The text search uses a similarity whose factors are all 1.
 *
 * @param hashes       hash values of the query image; each becomes an optional term clause
 * @param queryFeature feature of the query image, used to compute distances
 * @param reader       reader on the index to search
 * @return at most {@code maximumHits} results together with the largest distance tracked
 * @throws IOException if searching or loading a stored document fails
 */
private ImageSearchHits search(String[] hashes, LireFeature queryFeature, IndexReader reader)
        throws IOException {
    // first search by text:
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new DefaultSimilarity() {
        @Override
        public float tf(float freq) {
            return 1;
        }

        @Override
        public float idf(long docFreq, long numDocs) {
            return 1;
        }

        @Override
        public float coord(int overlap, int maxOverlap) {
            return 1;
        }

        @Override
        public float queryNorm(float sumOfSquaredWeights) {
            return 1;
        }

        @Override
        public float sloppyFreq(int distance) {
            return 1;
        }

        @Override
        public float lengthNorm(FieldInvertState state) {
            return 1;
        }
    });
    BooleanQuery query = new BooleanQuery();
    for (int i = 0; i < hashes.length; i++) {
        // be aware that the hashFunctionsFileName of the field must match the one you put the hashes in before.
        query.add(new BooleanClause(new TermQuery(new Term(hashesFieldName, hashes[i] + "")),
                BooleanClause.Occur.SHOULD));
    }
    TopDocs docs = searcher.search(query, maxResultsHashBased);
    // then re-rank
    TreeSet<SimpleResult> resultScoreDocs = new TreeSet<SimpleResult>();
    float maxDistance = 0f;
    float tmpScore = 0f;
    for (int i = 0; i < docs.scoreDocs.length; i++) {
        // Load the stored document and its feature bytes once per hit instead of
        // re-reading the document for every access.
        int docId = docs.scoreDocs[i].doc;
        org.apache.lucene.document.Document candidate = reader.document(docId);
        org.apache.lucene.util.BytesRef featureBytes = candidate.getBinaryValue(featureFieldName);
        feature.setByteArrayRepresentation(featureBytes.bytes, featureBytes.offset, featureBytes.length);
        tmpScore = queryFeature.getDistance(feature);
        if (resultScoreDocs.size() < maximumHits) {
            resultScoreDocs.add(new SimpleResult(tmpScore, candidate, docId));
            maxDistance = Math.max(maxDistance, tmpScore);
        } else if (tmpScore < maxDistance) {
            resultScoreDocs.add(new SimpleResult(tmpScore, candidate, docId));
        }
        // Trim back to the limit and keep maxDistance at the current worst result.
        while (resultScoreDocs.size() > maximumHits) {
            resultScoreDocs.remove(resultScoreDocs.last());
            maxDistance = resultScoreDocs.last().getDistance();
        }
    }
    return new SimpleImageSearchHits(resultScoreDocs, maxDistance);
}

From source file:net.semanticmetadata.lire.impl.searcher.VisualWordsImageSearcher.java

License:Open Source License

/**
 * Searches the index with the first value of the query document's visual-words field
 * and converts each hit's score into a distance of {@code 1 / score}.
 *
 * @param doc    the query document; the first value of its {@code fieldName} field is
 *               parsed as the query
 * @param reader reader on the index to search
 * @return the hits with the largest distance seen, or {@code null} when the query string
 *         cannot be parsed (the string and the stack trace are printed to standard error)
 * @throws IOException if searching or loading a stored document fails
 */
public ImageSearchHits search(Document doc, IndexReader reader) throws IOException {
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    indexSearcher.setSimilarity(similarity);
    String queryString = doc.getValues(fieldName)[0];
    try {
        Query parsed = qp.parse(queryString);
        TopDocs top = indexSearcher.search(parsed, numMaxHits);

        LinkedList<SimpleResult> ranked = new LinkedList<SimpleResult>();
        float maxDistance = 0;
        for (int rank = 0; rank < top.scoreDocs.length; rank++) {
            // Higher scores mean smaller distances.
            float distance = 1f / top.scoreDocs[rank].score;
            maxDistance = Math.max(distance, maxDistance);
            // NOTE(review): the third argument is the rank within the result list; the
            // sibling searchers pass the Lucene doc id here - confirm which is intended.
            ranked.add(new SimpleResult(distance, reader.document(top.scoreDocs[rank].doc), rank));
        }
        return new SimpleImageSearchHits(ranked, maxDistance);
    } catch (ParseException e) {
        System.err.println(queryString);
        e.printStackTrace();
    }
    // NOTE(review): callers receive null when parsing fails - confirm they handle it.
    return null;
}