List of usage examples for org.apache.lucene.search.IndexSearcher.setSimilarity
public void setSimilarity(Similarity similarity)
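Sets the Similarity implementation this searcher uses to score matching documents, replacing the default. Before the examples below, here is a minimal sketch of the call in isolation; the index path and the BM25 parameters are placeholders, not values from any of the source files:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

IndexSearcher openSearcher(String indexPath) throws Exception {
    Directory dir = FSDirectory.open(Paths.get(indexPath)); // hypothetical path
    DirectoryReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Score queries with BM25 and explicit k1/b parameters instead of the default similarity.
    searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));
    return searcher;
}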
From source file:org.hibernate.search.query.engine.impl.HSQueryImpl.java
License:Open Source License
/**
 * Build the index searcher for this fulltext query.
 *
 * @param searchFactoryImplementor the search factory.
 * @param forceScoring if true, force SCORE computation; if false, force not to compute score; if null, use best choice
 *
 * @return the <code>IndexSearcher</code> for this query (can be <code>null</code>).
 * TODO change classesAndSubclasses by side effect, which is a mismatch with the Searcher return, fix that.
 */
private IndexSearcherWithPayload buildSearcher(SearchFactoryImplementor searchFactoryImplementor,
        Boolean forceScoring) {
    Map<Class<?>, EntityIndexBinder<?>> builders = searchFactoryImplementor.getIndexBindingForEntity();
    List<IndexManager> targetedIndexes = new ArrayList<IndexManager>();
    Set<String> idFieldNames = new HashSet<String>();
    Similarity searcherSimilarity = null;

    //TODO check if caching this work for the last n list of indexedTargetedEntities makes a perf boost
    if (indexedTargetedEntities.size() == 0) {
        // empty indexedTargetedEntities array means search over all indexed entities,
        // but we have to make sure there is at least one
        if (builders.isEmpty()) {
            throw new SearchException(
                    "There are no mapped entities. Don't forget to add @Indexed to at least one class.");
        }
        for (EntityIndexBinder indexBinder : builders.values()) {
            DocumentBuilderIndexedEntity<?> builder = indexBinder.getDocumentBuilder();
            searcherSimilarity = checkSimilarity(searcherSimilarity, builder);
            if (builder.getIdKeywordName() != null) {
                idFieldNames.add(builder.getIdKeywordName());
                allowFieldSelectionInProjection = allowFieldSelectionInProjection
                        && builder.allowFieldSelectionInProjection();
            }
            useFieldCacheOnClassTypes = useFieldCacheOnClassTypes
                    || builder.getFieldCacheOption().contains(FieldCacheType.CLASS);
            populateIndexManagers(targetedIndexes, indexBinder.getSelectionStrategy());
        }
        classesAndSubclasses = null;
    } else {
        Set<Class<?>> involvedClasses = new HashSet<Class<?>>(indexedTargetedEntities.size());
        involvedClasses.addAll(indexedTargetedEntities);
        for (Class<?> clazz : indexedTargetedEntities) {
            EntityIndexBinder<?> indexBinder = builders.get(clazz);
            if (indexBinder != null) {
                DocumentBuilderIndexedEntity<?> builder = indexBinder.getDocumentBuilder();
                involvedClasses.addAll(builder.getMappedSubclasses());
            }
        }

        for (Class clazz : involvedClasses) {
            EntityIndexBinder indexBinder = builders.get(clazz);
            //TODO should we rather choose a polymorphic path and allow non mapped entities
            if (indexBinder == null) {
                throw new SearchException("Not a mapped entity (don't forget to add @Indexed): " + clazz);
            }
            DocumentBuilderIndexedEntity<?> builder = indexBinder.getDocumentBuilder();
            if (builder.getIdKeywordName() != null) {
                idFieldNames.add(builder.getIdKeywordName());
                allowFieldSelectionInProjection = allowFieldSelectionInProjection
                        && builder.allowFieldSelectionInProjection();
            }
            searcherSimilarity = checkSimilarity(searcherSimilarity, builder);
            useFieldCacheOnClassTypes = useFieldCacheOnClassTypes
                    || builder.getFieldCacheOption().contains(FieldCacheType.CLASS);
            populateIndexManagers(targetedIndexes, indexBinder.getSelectionStrategy());
        }
        this.classesAndSubclasses = involvedClasses;
    }
    this.idFieldNames = idFieldNames;

    //compute optimization needClassFilterClause
    //if at least one DP contains one class that is not part of the targeted classesAndSubclasses we can't optimize
    if (classesAndSubclasses != null) {
        for (IndexManager indexManager : targetedIndexes) {
            final Set<Class<?>> classesInIndexManager = indexManager.getContainedTypes();
            // if an IndexManager contains only one class, we know for sure it's part of classesAndSubclasses
            if (classesInIndexManager.size() > 1) {
                //risk of needClassFilterClause
                for (Class clazz : classesInIndexManager) {
                    if (!classesAndSubclasses.contains(clazz)) {
                        this.needClassFilterClause = true;
                        break;
                    }
                }
            }
            if (this.needClassFilterClause) {
                break;
            }
        }
    } else {
        Map<Class<?>, EntityIndexBinder<?>> documentBuildersIndexedEntities = searchFactoryImplementor
                .getIndexBindingForEntity();
        this.classesAndSubclasses = documentBuildersIndexedEntities.keySet();
    }

    //set up the searcher
    final IndexManager[] indexManagers = targetedIndexes.toArray(new IndexManager[targetedIndexes.size()]);
    IndexSearcher is = new IndexSearcher(MultiReaderFactory.openReader(indexManagers));
    is.setSimilarity(searcherSimilarity);

    //handle the sort and projection
    final String[] projection = this.projectedFields;
    if (Boolean.TRUE.equals(forceScoring)) {
        return new IndexSearcherWithPayload(is, true, true);
    } else if (Boolean.FALSE.equals(forceScoring)) {
        return new IndexSearcherWithPayload(is, false, false);
    } else if (this.sort != null && projection != null) {
        boolean activate = false;
        for (String field : projection) {
            if (SCORE.equals(field)) {
                activate = true;
                break;
            }
        }
        if (activate) {
            return new IndexSearcherWithPayload(is, true, false);
        }
    }
    //default
    return new IndexSearcherWithPayload(is, false, false);
}
From source file:org.neo4j.index.impl.lucene.legacy.IndexReferenceFactory.java
License:Open Source License
IndexSearcher newIndexSearcher(IndexIdentifier identifier, IndexReader reader) {
    IndexSearcher searcher = new IndexSearcher(reader);
    IndexType type = getType(identifier);
    if (type.getSimilarity() != null) {
        searcher.setSimilarity(type.getSimilarity());
    }
    return searcher;
}
From source file:org.neo4j.index.impl.lucene.LuceneDataSource.java
License:Open Source License
private IndexSearcher newIndexSearcher(IndexIdentifier identifier, IndexReader reader) {
    IndexSearcher searcher = new IndexSearcher(reader);
    IndexType type = getType(identifier, false);
    if (type.getSimilarity() != null) {
        searcher.setSimilarity(type.getSimilarity());
    }
    return searcher;
}
From source file:org.waveprotocol.box.server.search.LuceneSearchImpl.java
License:Apache License
@Override
public SearchResult search(String query, int startAt, int numResults, ParticipantId viewer) {
    LOG.fine("Search query '" + query + "' from user: " + viewer + " [" + startAt + ", "
            + (startAt + numResults - 1) + "]");
    SearchResult result = new SearchResult(query);
    SearchQuery queryParams = queryParser.parseQuery(query);
    List<IndexCondition> indexConditions = SearchQueryHelper.convertToIndexQuery(queryParams, viewer,
            waveDomain);
    try {
        BooleanQuery allQuery = new BooleanQuery();
        String fromParticipant = viewer.getAddress();
        Query userQuery = null;
        if (!indexConditions.isEmpty()) {
            try {
                userQuery = makeQuery(indexConditions);
            } catch (ParseException ex) {
                LOG.log(Level.SEVERE, "Invalid query: " + query, ex);
                return result;
            }
        }
        if (userQuery == null
                || !SearchQueryHelper.withParticipant(indexConditions, sharedDomainParticipant)) {
            TermQuery participantQuery = new TermQuery(
                    new Term(IndexCondition.Field.PARTICIPANTS.toString(), fromParticipant));
            participantQuery.setBoost(0);
            allQuery.add(participantQuery, Occur.MUST);
        }
        if (userQuery != null) {
            userQuery.setBoost(1);
            allQuery.add(userQuery, Occur.MUST);
            for (IndexCondition condition : indexConditions) {
                if (condition.getField() == IndexCondition.Field.CONTENT) {
                    IndexCondition titleCondition = new IndexCondition(IndexCondition.Field.TITLE, null,
                            condition.getValue(), condition.isPhrase(), condition.isNot());
                    Query titleQuery = makeQuery(titleCondition);
                    titleQuery.setBoost(2);
                    allQuery.add(titleQuery, titleCondition.isNot() ? Occur.MUST_NOT : Occur.SHOULD);
                    IndexCondition tagCondition = new IndexCondition(IndexCondition.Field.TAG, null,
                            condition.getValue(), condition.isPhrase(), condition.isNot());
                    Query tagQuery = makeQuery(tagCondition);
                    tagQuery.setBoost(3);
                    allQuery.add(tagQuery, tagCondition.isNot() ? Occur.MUST_NOT : Occur.SHOULD);
                }
            }
        }
        LOG.fine("Search query " + allQuery.toString());
        List<SortField> sortFields = new LinkedList<SortField>();
        sortFields.add(SortField.FIELD_SCORE);
        sortFields.add(new SortField(IndexCondition.Field.LAST_MODIFIED.toString(), longParser, true));
        SearcherManager searcherManager = nrtManager.getSearcherManager(true);
        IndexSearcher indexSearcher = searcherManager.acquire();
        try {
            indexSearcher.setSimilarity(similarity);
            TopDocs hints = indexSearcher.search(allQuery, startAt + numResults,
                    new Sort(sortFields.toArray(new SortField[sortFields.size()])));
            for (int i = startAt; i < hints.scoreDocs.length; i++) {
                try {
                    ScoreDoc hint = hints.scoreDocs[i];
                    Document doc = indexSearcher.doc(hint.doc);
                    result.addDigest(parseDigest(doc, viewer));
                } catch (IOException ex) {
                    LOG.log(Level.SEVERE, "Get digest from index", ex);
                }
            }
        } finally {
            searcherManager.release(indexSearcher);
        }
    } catch (ParseException ex) {
        LOG.log(Level.SEVERE, "Search failed: " + query, ex);
    } catch (IOException ex) {
        LOG.log(Level.SEVERE, "Search failed: " + query, ex);
    }
    return result;
}
From source file:perf.SearchPerfTest.java
License:Apache License
private static void _main(String[] clArgs) throws Exception {

    // args: dirImpl indexPath numThread numIterPerThread
    // eg java SearchPerfTest /path/to/index 4 100
    final Args args = new Args(clArgs);

    Directory dir0;
    final String dirPath = args.getString("-indexPath") + "/index";
    final String dirImpl = args.getString("-dirImpl");

    OpenDirectory od = OpenDirectory.get(dirImpl);

    /*
    } else if (dirImpl.equals("NativePosixMMapDirectory")) {
      dir0 = new NativePosixMMapDirectory(new File(dirPath));
      ramDir = null;
      if (doFacets) {
        facetsDir = new NativePosixMMapDirectory(new File(facetsDirPath));
      }
    } else if (dirImpl.equals("CachingDirWrapper")) {
      dir0 = new CachingRAMDirectory(new MMapDirectory(new File(dirPath)));
      ramDir = null;
    } else if (dirImpl.equals("RAMExceptDirectPostingsDirectory")) {
      // Load only non-postings files into RAMDir (assumes
      // Lucene40PF is the wrapped PF):
      Set<String> postingsExtensions = new HashSet<String>();
      postingsExtensions.add("frq");
      postingsExtensions.add("prx");
      postingsExtensions.add("tip");
      postingsExtensions.add("tim");
      ramDir = new RAMDirectory();
      Directory fsDir = new MMapDirectory(new File(dirPath));
      for (String file : fsDir.listAll()) {
        int idx = file.indexOf('.');
        if (idx != -1 && postingsExtensions.contains(file.substring(idx + 1, file.length()))) {
          continue;
        }
        fsDir.copy(ramDir, file, file, IOContext.READ);
      }
      dir0 = new FileSwitchDirectory(postingsExtensions, fsDir, ramDir, true);
      if (doFacets) {
        facetsDir = new RAMDirectory(new SimpleFSDirectory(new File(facetsDirPath)), IOContext.READ);
      }
    */

    final RAMDirectory ramDir;
    dir0 = od.open(Paths.get(dirPath));
    if (dir0 instanceof RAMDirectory) {
        ramDir = (RAMDirectory) dir0;
    } else {
        ramDir = null;
    }

    // TODO: NativeUnixDir?

    final String analyzer = args.getString("-analyzer");
    final String tasksFile = args.getString("-taskSource");
    final int searchThreadCount = args.getInt("-searchThreadCount");
    final String fieldName = args.getString("-field");
    final boolean printHeap = args.getFlag("-printHeap");
    final boolean doPKLookup = args.getFlag("-pk");
    final int topN = args.getInt("-topN");
    final boolean doStoredLoads = args.getFlag("-loadStoredFields");

    // Used to choose which random subset of tasks we will
    // run, to generate the PKLookup tasks, and to generate
    // any random pct filters:
    final long staticRandomSeed = args.getLong("-staticSeed");

    // Used to shuffle the random subset of tasks:
    final long randomSeed = args.getLong("-seed");

    // TODO: this could be way better.
    final String similarity = args.getString("-similarity");
    // now reflect
    final Class<? extends Similarity> simClazz = Class
            .forName("org.apache.lucene.search.similarities." + similarity).asSubclass(Similarity.class);
    final Similarity sim = simClazz.newInstance();

    System.out.println("Using dir impl " + dir0.getClass().getName());
    System.out.println("Analyzer " + analyzer);
    System.out.println("Similarity " + similarity);
    System.out.println("Search thread count " + searchThreadCount);
    System.out.println("topN " + topN);
    System.out.println("JVM " + (Constants.JRE_IS_64BIT ? "is" : "is not") + " 64bit");
    System.out.println("Pointer is " + RamUsageEstimator.NUM_BYTES_OBJECT_REF + " bytes");

    final Analyzer a;
    if (analyzer.equals("EnglishAnalyzer")) {
        a = new EnglishAnalyzer();
    } else if (analyzer.equals("ClassicAnalyzer")) {
        a = new ClassicAnalyzer();
    } else if (analyzer.equals("StandardAnalyzer")) {
        a = new StandardAnalyzer();
    } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
        a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    } else if (analyzer.equals("ShingleStandardAnalyzer")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2,
                ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, true, ShingleFilter.DEFAULT_FILLER_TOKEN);
    } else {
        throw new RuntimeException("unknown analyzer " + analyzer);
    }

    final ReferenceManager<IndexSearcher> mgr;
    final IndexWriter writer;
    final Directory dir;

    final String commit = args.getString("-commit");
    final String hiliteImpl = args.getString("-hiliteImpl");
    final String logFile = args.getString("-log");

    final long tSearcherStart = System.currentTimeMillis();

    final boolean verifyCheckSum = !args.getFlag("-skipVerifyChecksum");

    final boolean recacheFilterDeletes = args.getFlag("-recacheFilterDeletes");
    if (recacheFilterDeletes) {
        throw new UnsupportedOperationException("recacheFilterDeletes was deprecated");
    }

    if (args.getFlag("-nrt")) {
        // TODO: get taxoReader working here too
        // TODO: factor out & share this CL processing w/ Indexer
        final int indexThreadCount = args.getInt("-indexThreadCount");
        final String lineDocsFile = args.getString("-lineDocsFile");
        final float docsPerSecPerThread = args.getFloat("-docsPerSecPerThread");
        final float reopenEverySec = args.getFloat("-reopenEverySec");
        final boolean storeBody = args.getFlag("-store");
        final boolean tvsBody = args.getFlag("-tvs");
        final boolean useCFS = args.getFlag("-cfs");
        final String defaultPostingsFormat = args.getString("-postingsFormat");
        final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
        final boolean verbose = args.getFlag("-verbose");
        final boolean cloneDocs = args.getFlag("-cloneDocs");
        final Mode mode = Mode.valueOf(args.getString("-mode", "update").toUpperCase(Locale.ROOT));

        final long reopenEveryMS = (long) (1000 * reopenEverySec);

        if (verbose) {
            InfoStream.setDefault(new PrintStreamInfoStream(System.out));
        }

        if (!dirImpl.equals("RAMDirectory") && !dirImpl.equals("RAMExceptDirectPostingsDirectory")) {
            System.out.println("Wrap NRTCachingDirectory");
            dir0 = new NRTCachingDirectory(dir0, 20, 400.0);
        }

        dir = dir0;

        final IndexWriterConfig iwc = new IndexWriterConfig(a);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        iwc.setRAMBufferSizeMB(256.0);
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);

        // TODO: also RAMDirExceptDirect...? need to
        // ... block deletes against wrapped FSDir?
        if (dirImpl.equals("RAMDirectory")) {
            // Let IW remove files only referenced by starting commit:
            iwc.setIndexDeletionPolicy(new KeepNoCommitsDeletionPolicy());
        }

        if (commit != null && commit.length() > 0) {
            System.out.println("Opening writer on commit=" + commit);
            iwc.setIndexCommit(PerfUtils.findCommitPoint(commit, dir));
        }

        ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(useCFS ? 1.0 : 0.0);
        //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(1024);
        //((TieredMergePolicy) iwc.getMergePolicy()).setReclaimDeletesWeight(3.0);
        //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergeAtOnce(4);

        final Codec codec = new Lucene62Codec() {
            @Override
            public PostingsFormat getPostingsFormatForField(String field) {
                return PostingsFormat
                        .forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
            }
        };
        iwc.setCodec(codec);

        final ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
        // Only let one merge run at a time...
        // ... but queue up up to 4, before index thread is stalled:
        cms.setMaxMergesAndThreads(4, 1);

        iwc.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
            @Override
            public void warm(LeafReader reader) throws IOException {
                final long t0 = System.currentTimeMillis();
                //System.out.println("DO WARM: " + reader);
                IndexSearcher s = new IndexSearcher(reader);
                s.setQueryCache(null); // don't bench the cache
                s.search(new TermQuery(new Term(fieldName, "united")), 10);
                final long t1 = System.currentTimeMillis();
                System.out.println("warm segment=" + reader + " numDocs=" + reader.numDocs() + ": took "
                        + (t1 - t0) + " msec");
            }
        });

        writer = new IndexWriter(dir, iwc);
        System.out.println("Initial writer.maxDoc()=" + writer.maxDoc());

        // TODO: add -nrtBodyPostingsOffsets instead of
        // hardwired false:
        boolean addDVFields = mode == Mode.BDV_UPDATE || mode == Mode.NDV_UPDATE;
        LineFileDocs lineFileDocs = new LineFileDocs(lineDocsFile, false, storeBody, tvsBody, false,
                cloneDocs, null, null, null, addDVFields);
        IndexThreads threads = new IndexThreads(new Random(17), writer, new AtomicBoolean(false),
                lineFileDocs, indexThreadCount, -1, false, false, mode, docsPerSecPerThread, null, -1.0, -1);
        threads.start();

        mgr = new SearcherManager(writer, new SearcherFactory() {
            @Override
            public IndexSearcher newSearcher(IndexReader reader, IndexReader previous) {
                IndexSearcher s = new IndexSearcher(reader);
                s.setQueryCache(null); // don't bench the cache
                s.setSimilarity(sim);
                return s;
            }
        });

        System.out.println("reopen every " + reopenEverySec);

        Thread reopenThread = new Thread() {
            @Override
            public void run() {
                try {
                    final long startMS = System.currentTimeMillis();

                    int reopenCount = 1;
                    while (true) {
                        final long sleepMS = startMS + (reopenCount * reopenEveryMS)
                                - System.currentTimeMillis();
                        if (sleepMS < 0) {
                            System.out.println("WARNING: reopen fell behind by " + Math.abs(sleepMS) + " ms");
                        } else {
                            Thread.sleep(sleepMS);
                        }

                        mgr.maybeRefresh();
                        reopenCount++;
                        IndexSearcher s = mgr.acquire();
                        try {
                            if (ramDir != null) {
                                System.out.println(String.format(Locale.ENGLISH,
                                        "%.1fs: index: %d bytes in RAMDir; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d",
                                        (System.currentTimeMillis() - startMS) / 1000.0,
                                        ramDir.ramBytesUsed(), writer.maxDoc(), s.getIndexReader().maxDoc(),
                                        s.getIndexReader().numDocs()));
                                //String[] l = ramDir.listAll();
                                //Arrays.sort(l);
                                //for(String f : l) {
                                //  System.out.println("  " + f + ": " + ramDir.fileLength(f));
                                //}
                            } else {
                                System.out.println(String.format(Locale.ENGLISH,
                                        "%.1fs: done reopen; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d",
                                        (System.currentTimeMillis() - startMS) / 1000.0, writer.maxDoc(),
                                        s.getIndexReader().maxDoc(), s.getIndexReader().numDocs()));
                            }
                        } finally {
                            mgr.release(s);
                        }
                    }
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        };
        reopenThread.setName("ReopenThread");
        reopenThread.setPriority(4 + Thread.currentThread().getPriority());
        reopenThread.start();

    } else {
        dir = dir0;
        writer = null;
        final DirectoryReader reader;
        if (commit != null && commit.length() > 0) {
            System.out.println("Opening searcher on commit=" + commit);
            reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit, dir));
        } else {
            // open last commit
            reader = DirectoryReader.open(dir);
        }
        IndexSearcher s = new IndexSearcher(reader);
        s.setQueryCache(null); // don't bench the cache
        s.setSimilarity(sim);
        System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs() + " %tg deletes="
                + (100. * reader.maxDoc() / reader.numDocs()));

        mgr = new SingleIndexSearcher(s);
    }

    System.out.println((System.currentTimeMillis() - tSearcherStart) + " msec to init searcher/NRT");

    {
        IndexSearcher s = mgr.acquire();
        try {
            System.out.println("Searcher: numDocs=" + s.getIndexReader().numDocs() + " maxDoc="
                    + s.getIndexReader().maxDoc() + ": " + s);
        } finally {
            mgr.release(s);
        }
    }

    //System.out.println("searcher=" + searcher);

    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setHierarchical("Date", true);

    TaxonomyReader taxoReader;
    Path taxoPath = Paths.get(args.getString("-indexPath"), "facets");
    Directory taxoDir = od.open(taxoPath);
    if (DirectoryReader.indexExists(taxoDir)) {
        taxoReader = new DirectoryTaxonomyReader(taxoDir);
        System.out.println("Taxonomy has " + taxoReader.getSize() + " ords");
    } else {
        taxoReader = null;
    }

    final Random staticRandom = new Random(staticRandomSeed);
    final Random random = new Random(randomSeed);

    final DirectSpellChecker spellChecker = new DirectSpellChecker();
    final IndexState indexState = new IndexState(mgr, taxoReader, fieldName, spellChecker, hiliteImpl,
            facetsConfig);

    final QueryParser queryParser = new QueryParser("body", a);
    TaskParser taskParser = new TaskParser(indexState, queryParser, fieldName, topN, staticRandom,
            doStoredLoads);

    final TaskSource tasks;

    if (tasksFile.startsWith("server:")) {
        int idx = tasksFile.indexOf(':', 8);
        if (idx == -1) {
            throw new RuntimeException(
                    "server is missing the port; should be server:interface:port (got: " + tasksFile + ")");
        }
        String iface = tasksFile.substring(7, idx);
        int port = Integer.valueOf(tasksFile.substring(1 + idx));
        RemoteTaskSource remoteTasks = new RemoteTaskSource(iface, port, searchThreadCount, taskParser);

        // nocommit must stop thread?
        tasks = remoteTasks;
    } else {
        // Load the tasks from a file:
        final int taskRepeatCount = args.getInt("-taskRepeatCount");
        final int numTaskPerCat = args.getInt("-tasksPerCat");
        tasks = new LocalTaskSource(indexState, taskParser, tasksFile, staticRandom, random, numTaskPerCat,
                taskRepeatCount, doPKLookup);
        System.out.println("Task repeat count " + taskRepeatCount);
        System.out.println("Tasks file " + tasksFile);
        System.out.println("Num task per cat " + numTaskPerCat);
    }

    args.check();

    // Evil respeller:
    //spellChecker.setMinPrefix(0);
    //spellChecker.setMaxInspections(1024);

    final TaskThreads taskThreads = new TaskThreads(tasks, indexState, searchThreadCount);
    Thread.sleep(10);

    final long startNanos = System.nanoTime();
    taskThreads.start();
    taskThreads.finish();
    final long endNanos = System.nanoTime();

    System.out.println("\n" + ((endNanos - startNanos) / 1000000.0) + " msec total");

    final List<Task> allTasks = tasks.getAllTasks();

    PrintStream out = new PrintStream(logFile);

    if (allTasks != null) {
        // Tasks were local: verify checksums:

        // indexState.setDocIDToID();

        final Map<Task, Task> tasksSeen = new HashMap<Task, Task>();

        out.println("\nResults for " + allTasks.size() + " tasks:");

        boolean fail = false;
        for (final Task task : allTasks) {
            if (verifyCheckSum) {
                final Task other = tasksSeen.get(task);
                if (other != null) {
                    if (task.checksum() != other.checksum()) {
                        System.out.println("\nTASK:");
                        task.printResults(System.out, indexState);
                        System.out.println("\nOTHER TASK:");
                        other.printResults(System.out, indexState);
                        fail = true;
                        //throw new RuntimeException("task " + task + " hit different checksums: " + task.checksum() + " vs " + other.checksum() + " other=" + other);
                    }
                } else {
                    tasksSeen.put(task, task);
                }
            }
            out.println("\nTASK: " + task);
            out.println("  " + (task.runTimeNanos / 1000000.0) + " msec");
            out.println("  thread " + task.threadID);
            task.printResults(out, indexState);
        }
        if (fail) {
            throw new RuntimeException("some tasks got different results across different threads");
        }

        allTasks.clear();
    }

    mgr.close();

    if (taxoReader != null) {
        taxoReader.close();
    }

    if (writer != null) {
        // Don't actually commit any index changes:
        writer.rollback();
    }

    dir.close();

    if (printHeap) {
        // Try to get RAM usage -- some ideas poached from http://www.javaworld.com/javaworld/javatips/jw-javatip130.html
        final Runtime runtime = Runtime.getRuntime();
        long usedMem1 = PerfUtils.usedMemory(runtime);
        long usedMem2 = Long.MAX_VALUE;
        for (int iter = 0; iter < 10; iter++) {
            runtime.runFinalization();
            runtime.gc();
            Thread.yield();
            Thread.sleep(100);
            usedMem2 = usedMem1;
            usedMem1 = PerfUtils.usedMemory(runtime);
        }
        out.println("\nHEAP: " + PerfUtils.usedMemory(runtime));
    }

    out.close();
}
From source file:pretraga.IsolationSimilarity.java
public List<String> searchByCategory(String searchingTerm, String category, boolean veryPrecision) {
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new ClassicSimilarity());

        QueryParser parser = new QueryParser(category, analyzer);
        String queryText = searchingTerm.toLowerCase();
        if (!veryPrecision) {
            queryText += "*";
        }
        Query q = parser.parse(queryText);
        TopScoreDocCollector collector = TopScoreDocCollector.create(10);
        searcher.search(q, collector);
        TopDocs docs = collector.topDocs();

        List<String> ret = new ArrayList<>();
        // Iterate over the hits actually collected; totalHits may exceed the
        // collector's limit of 10, so bound the loop by scoreDocs.length.
        for (int i = 0; i < docs.scoreDocs.length; i++) {
            Document d = reader.document(docs.scoreDocs[i].doc);
            ret.add(d.get(category) + ", " + d.get(SIZE) + ", score: " + docs.scoreDocs[i].score);
        }
        reader.close();
        dir.close();
        return ret;
    } catch (Exception e) {
        System.err.println(e.toString());
        return new ArrayList<>();
    }
}
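The example above sets ClassicSimilarity only at search time. Because a Similarity also controls how norms are encoded at index time, index-time and search-time choices should generally agree; the reviewclassification examples below follow that pattern. A minimal sketch of the pairing, assuming an in-memory index and an addDocuments step elided for brevity:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

Similarity sim = new ClassicSimilarity();

Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
config.setSimilarity(sim); // consulted when norms are encoded at index time
IndexWriter writer = new IndexWriter(index, config);
// ... add documents here ...
writer.close();

IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(index));
searcher.setSimilarity(sim); // consulted when hits are scored at query time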
From source file:retriever.TermStats.java
IndexSearcher buildTemporalIndexSearcher(IndexReader reader) throws Exception {
    IndexSearcher searcher = new IndexSearcher(reader);
    if (queryTranslation) {
        //searcher.setSimilarity(new BM25Similarity());
        //searcher.setSimilarity(new LMDirichletSimilarity());
        searcher.setSimilarity(new LMJelinekMercerSimilarity(0.4f));
    } else {
        searcher.setSimilarity(new BM25PayloadSimilarity(1.2f, 0.75f));
    }
    return searcher;
}
From source file:reviewclassification.ReviewClassification.java
/**
 * Makes predictions for a test set. Saves them in a hash map that is later written to a file,
 * in ascending file name order. Uses cos_score by default.
 * @param training_set The training set
 * @param query_set The test set
 * @param threshold The threshold used for queries and predictions
 * @param filename The name of the file that holds the results
 * @throws org.apache.lucene.queryparser.classic.ParseException
 * @throws IOException
 */
public static void predictTestSet(ArrayList<Document> training_set, ArrayList<Document> query_set,
        int threshold, String filename)
        throws org.apache.lucene.queryparser.classic.ParseException, IOException {
    Similarity cos_sim = new ClassicSimilarity();
    FileWriter outfile = new FileWriter(filename);
    HashMap<String, Boolean> predictions = new HashMap<>();

    tlog(ft, "Building document index.");

    // Lucene stuff: build an analyzer and an index, create a configuration, make an index writer,
    // write documents to the index, then make a reader and a searcher from the reader.
    // Cosine similarity is set as the similarity method in both the configuration and the searcher.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(cos_sim);
    IndexWriter w = new IndexWriter(index, config);
    addDocuments(w, training_set);
    w.close();
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(cos_sim);

    tlog(ft, "Done building index. Predicting the test set.");

    // For each review in the test set, query the index, get the results, and predict with a given threshold.
    // Then add the prediction to the hash map. The key is the name of the file. We only have the path,
    // so we split it, get the filename, and remove the extension.
    for (Document doc : query_set) {
        ArrayList<Result> results = query(doc, analyzer, searcher, threshold);
        boolean cos_pred = predict(results, doc, threshold, COS_SCORE);
        String[] str = doc.get("path").split("/");
        predictions.put(str[str.length - 1].split("\\.")[0], cos_pred);
    }

    // Sort files in file name ascending order
    tlog(ft, "Done predicting test set. Sorting files.");
    ArrayList<String> files = new ArrayList<>(predictions.keySet());
    files.sort(new Comparator<String>() {
        @Override
        public int compare(String s1, String s2) {
            return s1.compareTo(s2);
        }
    });

    tlog(ft, "Done sorting files. Writing to disk.");

    // Write results to disk
    for (String s : files) {
        outfile.write(s + " " + boolToInt(predictions.get(s)) + System.lineSeparator());
    }
    outfile.close();

    tlog(ft, "Done writing to disk. Results in: " + filename);
}
From source file:reviewclassification.ReviewClassification.java
/**
 * Uses 9/10 of the training set as the training set, and 1/10 as the test set, chosen randomly.
 * Makes predictions for all 3 scoring methods and for multiple thresholds, to decide the best
 * scoring method and the best threshold to use.
 * @param documents The training set, which will be divided to create a test set
 * @param threshold_start The minimum threshold
 * @param threshold_end The maximum threshold
 * @param filename The name of the file that holds the results
 * @throws IOException
 * @throws org.apache.lucene.queryparser.classic.ParseException
 */
public static void accuracyTest(ArrayList<Document> documents, int threshold_start, int threshold_end,
        String filename) throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    long seed = System.nanoTime();
    Collections.shuffle(documents, new Random(seed));
    FileWriter outfile = new FileWriter(filename);

    // 9/10 of the training set is used for training.
    // The remaining 1/10 is used for testing.
    ArrayList<Document> training_set = new ArrayList<>(documents.subList(0, documents.size() * 9 / 10));
    ArrayList<Document> test_set = new ArrayList<>(
            documents.subList(documents.size() * 9 / 10, documents.size()));

    // Metrics objects hold tp, fp, tn, and fn counters. We keep one for each threshold.
    // We are testing with 3 scoring methods, so we need 3 lists of objects,
    // each containing an object per threshold.
    ArrayList<Integer> threshold_list = new ArrayList<>();
    ArrayList<Metrics> metrics_list_knn = new ArrayList<>();
    ArrayList<Metrics> metrics_list_knn_sentiment = new ArrayList<>();
    ArrayList<Metrics> metrics_list_cos_score = new ArrayList<>();

    // Initializing the metrics objects.
    for (int i = threshold_start; i <= threshold_end; i++) {
        threshold_list.add(i);
        metrics_list_knn.add(new Metrics());
        metrics_list_knn_sentiment.add(new Metrics());
        metrics_list_cos_score.add(new Metrics());
    }

    // Built-in cosine similarity method.
    Similarity cos_sim = new ClassicSimilarity();

    tlog(ft, "Building document index.");

    // Lucene stuff: build an analyzer and an index, create a configuration, make an index writer,
    // write documents to the index, then make a reader and a searcher from the reader.
    // Cosine similarity is set as the similarity method in both the configuration and the searcher.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(cos_sim);
    IndexWriter w = new IndexWriter(index, config);
    addDocuments(w, training_set);
    w.close();
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(cos_sim);

    tlog(ft, "Done building index. Predicting the test set.");

    // For each review in the test set, query the index, get the results, then predict with a given threshold.
    // Testing multiple thresholds to find which one to use.
    for (Document doc : test_set) {
        ArrayList<Result> results = query(doc, analyzer, searcher,
                threshold_list.get(threshold_list.size() - 1));
        boolean query_class = doc.get("path").contains("pos");

        // We execute the query only once, then for each threshold count the results
        // with the appropriate metrics object.
        for (int i = 0; i < threshold_list.size(); i++) {
            boolean knn_pred = predict(results, doc, threshold_list.get(i), KNN);
            boolean knn_senti_pred = predict(results, doc, threshold_list.get(i), KNN_SENTIMENT);
            boolean cos_pred = predict(results, doc, threshold_list.get(i), COS_SCORE);

            update_metrics(metrics_list_knn.get(i), query_class, knn_pred);
            update_metrics(metrics_list_knn_sentiment.get(i), query_class, knn_senti_pred);
            update_metrics(metrics_list_cos_score.get(i), query_class, cos_pred);
        }
    }

    tlog(ft, "Done predicting test set. Calculating accuracies and writing to file.");

    // For each metrics object we call calculate(), which computes the accuracy, then write it to file.
    for (int i = 0; i < threshold_list.size(); i++) {
        metrics_list_knn.get(i).calculate();
        metrics_list_knn_sentiment.get(i).calculate();
        metrics_list_cos_score.get(i).calculate();
        outfile.write(threshold_list.get(i) + " " + metrics_list_knn.get(i).getAccuracy() + " "
                + metrics_list_knn_sentiment.get(i).getAccuracy() + " "
                + metrics_list_cos_score.get(i).getAccuracy() + System.lineSeparator());
    }
    outfile.close();

    tlog(ft, "Done writing to file. Results in: " + filename);
}
From source file:searcher.CollStat.java
IndexSearcher initSearcher(IndexReader reader) throws Exception {
    IndexSearcher searcher = new IndexSearcher(reader);
    float lambda = Float.parseFloat(prop.getProperty("lm.lambda", "0.6"));
    searcher.setSimilarity(new LMJelinekMercerSimilarity(lambda));
    return searcher;
}
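As the examples show, setSimilarity only affects the searcher it is called on, so a single reader can back several searchers with different scoring models, which is convenient for retrieval experiments like the ones above. A minimal sketch, assuming the reader and query already exist; the 0.6 lambda is a placeholder:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;

TopDocs[] compareSimilarities(IndexReader reader, Query query) throws Exception {
    // Two searchers over the same reader: the set of matching documents is the
    // same, but the scores (and therefore the ranking) can differ.
    IndexSearcher bm25 = new IndexSearcher(reader);
    bm25.setSimilarity(new BM25Similarity());

    IndexSearcher lm = new IndexSearcher(reader);
    lm.setSimilarity(new LMJelinekMercerSimilarity(0.6f));

    return new TopDocs[] { bm25.search(query, 10), lm.search(query, 10) };
}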