Example usage for org.apache.lucene.search IndexSearcher setSimilarity

List of usage examples for org.apache.lucene.search IndexSearcher setSimilarity

Introduction

On this page you can find example usages of the method org.apache.lucene.search.IndexSearcher.setSimilarity.

Prototype

public void setSimilarity(Similarity similarity) 

Source Link

Document

Expert: Set the Similarity implementation used by this IndexSearcher.

Usage

From source file:io.anserini.search.SimpleSearcher.java

License:Apache License

/**
 * Runs a bag-of-words query against the body field and returns the best {@code k} hits.
 *
 * @param q the query text, turned into a query by {@code AnalyzerUtils.buildBagOfWordsQuery}
 * @param k the maximum number of hits requested from the searcher
 * @return one {@code Result} per hit, in the order the searcher returned them
 * @throws IOException if searching or loading a stored document fails
 */
public Result[] search(String q, int k) throws IOException {
    final IndexSearcher indexSearcher = new IndexSearcher(reader);
    indexSearcher.setSimilarity(similarity);
    final Query bagOfWords = AnalyzerUtils.buildBagOfWordsQuery(LuceneDocumentGenerator.FIELD_BODY, analyzer,
            q);

    final ScoreDoc[] scoreDocs = indexSearcher.search(bagOfWords, k).scoreDocs;
    final Result[] out = new Result[scoreDocs.length];

    int slot = 0;
    for (ScoreDoc hit : scoreDocs) {
        final Document stored = indexSearcher.doc(hit.doc);
        final String docid = stored.getField(LuceneDocumentGenerator.FIELD_ID).stringValue();

        // The raw field may be absent from the stored document; the content is then null.
        final IndexableField raw = stored.getField(LuceneDocumentGenerator.FIELD_RAW);
        final String content;
        if (raw != null) {
            content = raw.stringValue();
        } else {
            content = null;
        }

        out[slot++] = new Result(docid, hit.doc, hit.score, content);
    }

    return out;
}

From source file:io.anserini.SearcherCW09B.java

License:Apache License

/**
 * Prints TREC submission file to the standard output stream.
 *
 * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt
 * @param operator   Default search operator: AND or OR
 * @throws IOException/*from  w  w  w.  j  a  v  a 2 s.  c  o m*/
 * @throws ParseException
 */

public void search(String topicsFile, QueryParser.Operator operator) throws IOException, ParseException {

    Path topicsPath = Paths.get(topicsFile);

    if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) {
        throw new IllegalArgumentException(
                "Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString();

    // PrintWriter out = new PrintWriter(Files.newBufferedWriter(path.resolve(runTag + ".txt"), StandardCharsets.US_ASCII));

    PrintStream out = System.out;

    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer());
    queryParser.setDefaultOperator(operator);

    SortedMap<Integer, String> topics = readQueries(topicsPath);

    for (Map.Entry<Integer, String> entry : topics.entrySet()) {

        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = queryParser.parse(queryString);

        /**
         * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query.
         */
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

        /**
         * the first column is the topic number.
         * the second column is currently unused and should always be "Q0".
         * the third column is the official document identifier of the retrieved document.
         * the fourth column is the rank the document is retrieved.
         * the fifth column shows the score (integer or floating point) that generated the ranking.
         * the sixth column is called the "run tag" and should be a unique identifier for your
         */
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document doc = searcher.doc(docId);
            out.print(qID);
            out.print("\tQ0\t");
            out.print(doc.get(FIELD_ID));
            out.print("\t");
            out.print(i);
            out.print("\t");
            out.print(hits[i].score);
            out.print("\t");
            out.print(runTag);
            out.println();
        }
    }
    // out.flush();
    // out.close();
}

From source file:io.puntanegra.fhir.index.lucene.LuceneService.java

License:Apache License

/**
 * Builds a new {@link FSIndex}./*from w w w .  ja  v  a2 s  .c  om*/
 *
 * @param name
 *            the index name
 * @param mbeanName
 *            the JMX MBean object name
 * @param path
 *            the directory path
 * @param analyzer
 *            the index writer analyzer
 * @param refresh
 *            the index reader refresh frequency in seconds
 * @param ramBufferMB
 *            the index writer RAM buffer size in MB
 * @param maxMergeMB
 *            the directory max merge size in MB
 * @param maxCachedMB
 *            the directory max cache size in MB
 * @param refreshTask
 *            action to be done during refresh
 */
public void init(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB,
        int maxMergeMB, int maxCachedMB, Runnable refreshTask) {
    try {

        this.path = path;
        this.name = name;

        // Open or create directory
        FSDirectory fsDirectory = FSDirectory.open(path);
        this.directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB);

        // Setup index writer
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        indexWriterConfig.setRAMBufferSizeMB(ramBufferMB);
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        indexWriterConfig.setUseCompoundFile(true);
        indexWriterConfig.setMergePolicy(new TieredMergePolicy());
        this.indexWriter = new IndexWriter(this.directory, indexWriterConfig);

        // Setup NRT search
        SearcherFactory searcherFactory = new SearcherFactory() {
            @Override
            public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) {
                if (refreshTask != null) {
                    refreshTask.run();
                }
                IndexSearcher searcher = new IndexSearcher(reader);
                searcher.setSimilarity(new NoIDFSimilarity());
                return searcher;
            }
        };
        TrackingIndexWriter trackingWriter = new TrackingIndexWriter(this.indexWriter);
        this.searcherManager = new SearcherManager(this.indexWriter, true, searcherFactory);
        this.searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, this.searcherManager,
                refresh, refresh);
        this.searcherReopener.start();

        // Register JMX MBean
        // mbean = new ObjectName(mbeanName);
        // ManagementFactory.getPlatformMBeanServer().registerMBean(service,
        // this.mbean);

    } catch (Exception e) {
        throw new FhirIndexException(e, "Error while creating index %s", name);
    }
}

From source file:ir.ac.ut.engine.FeaturedRetriever.java

/**
 * Searches {@code field} for the escaped, lower-cased {@code query} using Dirichlet-smoothed
 * language-model similarity and reports the hits through {@code reportInTREC}.
 *
 * <p>The analyzer is chosen from the field name: a {@code SimpleAnalyzer} for the real-id field,
 * the n-gram analyzer for the named-entity, sorted bigram/trigram, stopword 3-gram and POS 3-gram
 * fields, and the default analyzer for everything else.
 *
 * <p>NOTE: this method changes the global {@code BooleanQuery} max clause count as a side effect.
 *
 * @param query the raw query text
 * @param qId   the query identifier, used for reporting
 * @param field the index field to search
 * @return all hits sorted by relevance, or an empty array if the query cannot be parsed
 * @throws IOException if the index cannot be read
 */
public static ScoreDoc[] search(String query, String qId, String field) throws IOException {
    final float mu = 1000f;
    query = query.toLowerCase();
    // setMaxClauseCount rejects values below 1, so an empty query must not pass 0 through.
    BooleanQuery.setMaxClauseCount(Math.max(1, query.length()));

    Analyzer analyzer;
    if (field.equals(IndexedDocument.FIELD_REAL_ID)) {
        analyzer = new SimpleAnalyzer(Version.LUCENE_CURRENT);
    } else if (field.equals(IndexedDocument.FIELD_NAMED_ENTITIES)
            || field.equals(IndexedDocument.FIELD_SORTED_BIGRAMS)
            || field.equals(IndexedDocument.FIELD_SORTED_TRIGRAMS)
            || field.equals(IndexedDocument.FIELD_STOPWORDS3Gram)
            || field.equals(IndexedDocument.FIELD_POS3GRAM)) {
        analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer();
    } else {
        analyzer = (new MyAnalyzer(false)).MyDefaultAnalyzer();
    }

    QueryParser qParser = new QueryParser(Version.LUCENE_47, field, analyzer);
    Query q;
    try {
        // The text is escaped, so query-syntax characters in it are treated literally.
        q = qParser.parse(QueryParser.escape(query));
    } catch (org.apache.lucene.queryparser.classic.ParseException e) {
        e.printStackTrace();
        System.out.println("Exceptional Query:" + qId);
        return new ScoreDoc[0];
    }

    Similarity simFunction = new LMDirichletSimilarity(mu);
    IndexSearcher isearcher = new IndexSearcher(ireader);
    isearcher.setSimilarity(simFunction);

    // Collect every document in the index. The collector requires numHits > 0, so an empty
    // index is clamped to 1 instead of triggering an IllegalArgumentException.
    int numHits = Math.max(1, ireader.numDocs());
    TopFieldCollector tfc = TopFieldCollector.create(Sort.RELEVANCE, numHits, true, true, true, false);
    isearcher.search(q, tfc);

    ScoreDoc[] hits = tfc.topDocs().scoreDocs;
    reportInTREC(hits, qId);
    return hits;
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval.java

License:Apache License

/**
 * Runs a batch retrieval experiment configured through properties and writes the run file.
 *
 * <p>Properties are loaded from the file named by the {@code properties.path} system property,
 * or from {@code lucene-clef.properties} on the classpath when that property is not set; system
 * properties are then layered on top. Only the {@code BM25} model is supported.
 *
 * @param args unused
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    String propertiesPath = System.getProperty("properties.path");
    try {
        if (propertiesPath != null) {
            try (InputStream input = new FileInputStream(propertiesPath)) {
                properties.load(input);
            }
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            try (InputStream input = loader.getResourceAsStream("lucene-clef.properties")) {
                // getResourceAsStream returns null when the resource is missing; loading null
                // would fail with a NullPointerException.
                if (input == null) {
                    System.err.println("Default property file lucene-clef.properties not found on the classpath");
                } else {
                    properties.load(input);
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }

    // System properties override values from the property file.
    properties.putAll(System.getProperties());

    Path indexPath = new File(properties.getProperty("index.path")).toPath();
    Path runPath = new File(properties.getProperty("run.path")).toPath();
    String language = properties.getProperty("language");
    String stemmer = properties.getProperty("stemmer");
    String stopsetType = properties.getProperty("stopset.type");
    String stopsetPath = properties.getProperty("stopset.path");

    // Resources are closed in reverse order (reader, then directory), also when an exception occurs.
    try (Directory directory = new SimpleFSDirectory(indexPath);
            IndexReader reader = DirectoryReader.open(directory)) {

        IndexSearcher searcher = new IndexSearcher(reader);

        String model = properties.getProperty("run.model").toUpperCase();

        Similarity similarity;

        switch (model) {
        case "BM25":
            similarity = new BM25Similarity(Float.parseFloat(properties.getProperty("bm25.k1", "1.2f")),
                    Float.parseFloat(properties.getProperty("bm25.b", "0.75f")));
            break;
        default:
            throw new UnsupportedOperationException("Model " + model + " not supported yet");

        }

        searcher.setSimilarity(similarity);

        int maxResults = Integer.parseInt(properties.getProperty("maxresults", "1000"));

        // The run file is opened only after the model has been validated, and is always closed.
        try (PrintWriter runWriter = new PrintWriter(Files.newBufferedWriter(runPath, StandardCharsets.UTF_8))) {

            SubmissionReport runfile = new SubmissionReport(runWriter, model);

            String topicPath = properties.getProperty("topics.path");

            String[] topicFields = properties.getProperty("topics.fields").split(";");

            CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

            QualityQuery qqs[] = getQualityQueries(topicPath, topicFields);

            QualityQueryParser qqParser = new ClefQQParser(topicFields, BuildIndex.BODY_FIELD_NAME, language,
                    stemmer, stopset);

            // run the benchmark
            QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, BuildIndex.ID_FIELD_NAME);

            qrun.setMaxResults(maxResults);

            qrun.execute(null, runfile, null);
        }

    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:lucenesearch.RelevantPostFinder.java

/**
 * For every row of {@code java_test_data}, searches the post index for matching questions and
 * writes the ids of the hits to {@code ./data/rel_posts/<PostId>.txt}.
 *
 * <p>The query requires {@code PostTypeId == 1}, the tag {@code java}, and a match on the first
 * 600 analyzed terms of the post body. When the row's {@code OriginalPostId} is not among the
 * hits, the post id is appended to {@code ./data/dup_not_exist.txt}.
 *
 * @throws SQLException   if the database cannot be reached or queried
 * @throws IOException    if the index or an output file cannot be accessed
 * @throws ParseException if a tag or body query cannot be parsed
 */
public void saveRelevantPost() throws SQLException, IOException, ParseException {
    // NOTE(review): connection settings and credentials are hard-coded; consider moving them
    // to configuration.
    String url = "jdbc:mysql://localhost:3306/sof17";
    String username = "root";
    String password = "root";
    String folderPath = "./data/rel_posts/";
    String dupNotFound = "./data/dup_not_exist.txt";
    int hitsPerPage = 10000;
    int maxBodyTerms = 600;
    String query = "select PostId,PostBody,OriginalPostId from java_test_data";

    System.out.println("Connecting database...");

    // try-with-resources guarantees the JDBC and index resources are released on every path.
    try (Connection conn = DriverManager.getConnection(url, username, password)) {
        System.out.println("Database connected!");

        try (Statement stmt = conn.createStatement();
                ResultSet rs = stmt.executeQuery(query);
                FSDirectory directory = FSDirectory.open(Paths.get(new Searcher().getPostIndexPath()));
                IndexReader reader = DirectoryReader.open(directory);
                Analyzer analyzer = new StandardAnalyzer()) {

            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity());

            int cnt = 0;

            while (rs.next()) {
                System.out.println("Processing post " + (++cnt));

                int postid = rs.getInt("PostId");
                int dupId = rs.getInt("OriginalPostId");
                // Colons are replaced by spaces before the body is analyzed.
                ArrayList<String> bd = LuceneUtils
                        .getAnalyzedRemoveHtml(rs.getString("PostBody").replace(':', ' '));

                // Only the first maxBodyTerms terms are used for the body query.
                StringBuilder sb = new StringBuilder();
                int j = 0;
                for (String b : bd) {
                    if (++j > maxBodyTerms) {
                        break;
                    }
                    sb.append(b).append(" ");
                }
                String body = sb.toString();

                BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
                booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 1), BooleanClause.Occur.MUST);
                booleanQuery.add(new QueryParser("Tags", analyzer).parse("java"), BooleanClause.Occur.MUST);
                booleanQuery.add(new QueryParser("Body", analyzer).parse(body), BooleanClause.Occur.MUST);

                TopDocs results = searcher.search(booleanQuery.build(), hitsPerPage);
                ScoreDoc[] hits = results.scoreDocs;

                int numTotalHits = results.totalHits;
                System.out.println(numTotalHits + " total matching documents");

                boolean isFound = false;

                try (PrintWriter out = new PrintWriter(folderPath + postid + ".txt")) {
                    for (ScoreDoc hit : hits) {
                        Document doc = searcher.doc(hit.doc);
                        int id = Integer.parseInt(doc.get("SId"));
                        if (id == dupId) {
                            isFound = true;
                        }
                        out.println(id);
                    }
                }

                if (!isFound) {
                    System.out.println("Duplicate not found");
                    try (PrintWriter out2 = new PrintWriter(
                            new FileOutputStream(new File(dupNotFound), true /* append = true */))) {
                        out2.println(postid);
                    }
                }
            }
        }
    }
}

From source file:main.java.run.SearchFiles.java

License:Apache License

/**
 * Simple command-line based search demo.
 *
 * <p>Queries come from the {@code -query} option, from the file given with {@code -queries}, or
 * interactively from standard input. The loop stops at end of input, at the first blank line,
 * or after the single {@code -query} string has been processed. Hits are scored with BM25.
 *
 * @param args command-line options, see the usage string
 * @throws Exception if the index cannot be opened or a query cannot be parsed or executed
 */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

        BufferedReader in;
        if (queries != null) {
            in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
        }
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer);
            while (true) {
                if (queries == null && queryString == null) { // prompt the user
                    System.out.println("Enter query: ");
                }

                String line = queryString != null ? queryString : in.readLine();

                // End of input. (A length of -1 can never occur, so only null is checked.)
                if (line == null) {
                    break;
                }

                line = line.trim();
                if (line.length() == 0) {
                    break;
                }

                Query query = parser.parse(line);
                System.out.println("Searching for: " + query.toString(field));

                if (repeat > 0) { // repeat & time as benchmark
                    long start = System.currentTimeMillis();
                    for (int i = 0; i < repeat; i++) {
                        searcher.search(query, null, 100);
                    }
                    long end = System.currentTimeMillis();
                    System.out.println("Time: " + (end - start) + "ms");
                }

                doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

                if (queryString != null) {
                    break;
                }
            }
        } finally {
            // Only the query-file reader is owned here; standard input is left open.
            if (queries != null) {
                in.close();
            }
        }
    } finally {
        // Release the index even when parsing or searching fails.
        reader.close();
    }
}

From source file:net.semanticmetadata.lire.impl.BitSamplingImageSearcher.java

License:Open Source License

/**
 * Two-stage search: candidates are retrieved by hash terms, then re-ranked by the distance
 * between {@code queryFeature} and each candidate's stored feature.
 *
 * @param hashes       the hash values of the query, each added as an optional term clause
 * @param queryFeature the feature of the query image used for re-ranking
 * @param reader       the index to search
 * @return at most {@code maximumHits} results together with the largest distance tracked
 * @throws IOException if the index cannot be read
 */
private ImageSearchHits search(String[] hashes, LireFeature queryFeature, IndexReader reader)
        throws IOException {
    // Stage 1: text search over the hash field.
    IndexSearcher hashSearcher = new IndexSearcher(reader);
    hashSearcher.setSimilarity(new BaseSimilarity());
    BooleanQuery hashQuery = new BooleanQuery();
    for (String hash : hashes) {
        // The field name must match the one the hashes were indexed under.
        // With partialHashes enabled each hash is kept with probability 0.5; otherwise all are used.
        if (!partialHashes || Math.random() < 0.5) {
            hashQuery.add(new BooleanClause(new TermQuery(new Term(hashesFieldName, hash + "")),
                    BooleanClause.Occur.SHOULD));
        }
    }
    TopDocs candidates = hashSearcher.search(hashQuery, maxResultsHashBased);

    // Stage 2: re-rank the candidates by feature distance.
    TreeSet<SimpleResult> ranked = new TreeSet<SimpleResult>();
    float worstKept = -1f;
    for (int pos = 0; pos < candidates.scoreDocs.length; pos++) {
        int docId = candidates.scoreDocs[pos].doc;
        // NOTE(review): the stored document is loaded once per accessed attribute here and once
        // more when a result is created; loading it a single time per hit looks sufficient.
        feature.setByteArrayRepresentation(
                reader.document(docId).getBinaryValue(featureFieldName).bytes,
                reader.document(docId).getBinaryValue(featureFieldName).offset,
                reader.document(docId).getBinaryValue(featureFieldName).length);
        float distance = queryFeature.getDistance(feature);
        assert (distance >= 0);
        if (ranked.size() < maximumHits) {
            // Still room: keep the candidate and widen the distance border if needed.
            ranked.add(new SimpleResult(distance, reader.document(docId), docId));
            worstKept = Math.max(worstKept, distance);
        } else if (distance < worstKept) {
            // Nearer than the current border: replace the farthest result and move the border.
            ranked.remove(ranked.last());
            ranked.add(new SimpleResult(distance, reader.document(docId), docId));
            worstKept = ranked.last().getDistance();
        }
    }
    assert (ranked.size() <= maximumHits);
    return new SimpleImageSearchHits(ranked, worstKept);
}

From source file:net.semanticmetadata.lire.impl.LshImageSearcher.java

License:Open Source License

/**
 * Two-stage search: candidates are retrieved by hash terms with a similarity whose scoring
 * factors are all 1, then re-ranked by the distance between {@code queryFeature} and each
 * candidate's stored feature.
 *
 * @param hashes       the hash values of the query, each added as an optional term clause
 * @param queryFeature the feature of the query image used for re-ranking
 * @param reader       the index to search
 * @return at most {@code maximumHits} results together with the largest distance tracked
 * @throws IOException if the index cannot be read
 */
private ImageSearchHits search(String[] hashes, LireFeature queryFeature, IndexReader reader)
        throws IOException {
    // Stage 1: text search over the hash field. Every scoring factor is overridden to return 1.
    IndexSearcher hashSearcher = new IndexSearcher(reader);
    hashSearcher.setSimilarity(new DefaultSimilarity() {
        @Override
        public float tf(float freq) {
            return 1;
        }

        @Override
        public float idf(long docFreq, long numDocs) {
            return 1;
        }

        @Override
        public float coord(int overlap, int maxOverlap) {
            return 1;
        }

        @Override
        public float queryNorm(float sumOfSquaredWeights) {
            return 1;
        }

        @Override
        public float sloppyFreq(int distance) {
            return 1;
        }

        @Override
        public float lengthNorm(FieldInvertState state) {
            return 1;
        }
    });
    BooleanQuery hashQuery = new BooleanQuery();
    for (String hash : hashes) {
        // The field name must match the one the hashes were indexed under.
        hashQuery.add(new BooleanClause(new TermQuery(new Term(hashesFieldName, hash + "")),
                BooleanClause.Occur.SHOULD));
    }
    TopDocs candidates = hashSearcher.search(hashQuery, maxResultsHashBased);

    // Stage 2: re-rank the candidates by feature distance.
    TreeSet<SimpleResult> ranked = new TreeSet<SimpleResult>();
    float worstKept = 0f;
    float distance = 0f;
    for (int pos = 0; pos < candidates.scoreDocs.length; pos++) {
        int docId = candidates.scoreDocs[pos].doc;
        // NOTE(review): the stored document is loaded once per accessed attribute here and once
        // more when a result is created; loading it a single time per hit looks sufficient.
        feature.setByteArrayRepresentation(
                reader.document(docId).getBinaryValue(featureFieldName).bytes,
                reader.document(docId).getBinaryValue(featureFieldName).offset,
                reader.document(docId).getBinaryValue(featureFieldName).length);
        distance = queryFeature.getDistance(feature);
        if (ranked.size() < maximumHits) {
            ranked.add(new SimpleResult(distance, reader.document(docId), docId));
            worstKept = Math.max(worstKept, distance);
        } else if (distance < worstKept) {
            ranked.add(new SimpleResult(distance, reader.document(docId), docId));
        }
        // Trim back to the size limit, moving the distance border to the new farthest result.
        while (ranked.size() > maximumHits) {
            ranked.remove(ranked.last());
            worstKept = ranked.last().getDistance();
        }
    }
    return new SimpleImageSearchHits(ranked, worstKept);
}

From source file:net.semanticmetadata.lire.impl.searcher.VisualWordsImageSearcher.java

License:Open Source License

/**
 * Searches the index with the visual-words string stored in the first value of
 * {@code fieldName} of the given document.
 *
 * <p>Each hit's distance is the reciprocal of its score.
 *
 * @param doc    the query document
 * @param reader the index to search
 * @return the hits, or {@code null} when the query string cannot be parsed
 * @throws IOException if the index cannot be read
 */
public ImageSearchHits search(Document doc, IndexReader reader) throws IOException {
    IndexSearcher isearcher = new IndexSearcher(reader);
    isearcher.setSimilarity(similarity);
    String queryString = doc.getValues(fieldName)[0];
    try {
        Query parsed = qp.parse(queryString);
        TopDocs top = isearcher.search(parsed, numMaxHits);

        LinkedList<SimpleResult> ranked = new LinkedList<SimpleResult>();
        float farthest = 0;
        int rank = 0;
        while (rank < top.scoreDocs.length) {
            float distance = 1f / top.scoreDocs[rank].score;
            farthest = Math.max(distance, farthest);
            // NOTE(review): the third argument is the rank within the hit list, not the Lucene
            // document id -- confirm this is what SimpleResult expects.
            ranked.add(new SimpleResult(distance, reader.document(top.scoreDocs[rank].doc), rank));
            rank++;
        }
        return new SimpleImageSearchHits(ranked, farthest);
    } catch (ParseException e) {
        // Best effort: report the offending query and signal failure with null.
        System.err.println(queryString);
        e.printStackTrace();
        return null;
    }
}