List of usage examples for org.apache.lucene.index.IndexWriterConfig.setSimilarity
public IndexWriterConfig setSimilarity(Similarity similarity)
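setSimilarity tells the IndexWriter which Similarity implementation to use for index-time statistics (most visibly the field norms), so the same Similarity is normally set on the IndexSearcher at query time as well. The examples below come from real projects and mix the Lucene 4.x configuration style (constructor with a Version argument) with the newer one. For orientation, here is a minimal self-contained sketch against the Lucene 5+ API; the index path, field name, and explicit BM25 parameters are illustrative assumptions, not taken from the examples.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; adjust to your environment.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));

        // Configure the writer to score with BM25; 1.2f/0.75f are the usual defaults,
        // so new BM25Similarity() would behave the same.
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setSimilarity(new BM25Similarity(1.2f, 0.75f));
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            Document doc = new Document();
            doc.add(new TextField("body", "hello lucene similarity", Field.Store.YES));
            writer.addDocument(doc);
        }

        // Use the same Similarity at query time so index-time norms and
        // query-time scoring stay consistent.
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));
            // ... build a Query and call searcher.search(query, 10) here
        }
    }
}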
From source file:edu.rpi.tw.linkipedia.search.indexing.InMemEntityIndexer.java
License:Open Source License
public void createIndex() {
    try {
        Analyzer analyzer = DefaultAnalyzer.getAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.ParallelEntityIndexer.java
License:Open Source License
public void createIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("related_object", entropyAnalyzer);
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexer.java
License:Open Source License
public void createIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexUpdater.java
License:Open Source License
public void updateIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
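The four examples above pair a single custom Similarity (MySimilarity) with per-field analyzers via PerFieldAnalyzerWrapper. When scoring itself needs to differ per field, Lucene's PerFieldSimilarityWrapper can be passed to setSimilarity instead. The sketch below is only illustrative: the "label" field name and the BM25/ClassicSimilarity pairing are assumptions, and ClassicSimilarity is the newer name for the DefaultSimilarity seen in the older examples.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;

public class PerFieldSimilarityExample {

    /** Builds a writer config whose Similarity depends on the field being scored. */
    public static IndexWriterConfig newConfig() {
        Similarity perFieldSimilarity = new PerFieldSimilarityWrapper() {
            // BM25 everywhere except "label", which keeps classic TF-IDF scoring
            // (field name and similarity choices are hypothetical).
            private final Similarity defaultSim = new BM25Similarity();
            private final Similarity labelSim = new ClassicSimilarity();

            @Override
            public Similarity get(String field) {
                return "label".equals(field) ? labelSim : defaultSim;
            }
        };

        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setSimilarity(perFieldSimilarity);
        return iwc;
    }
}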
From source file:fr.lipn.yasemir.indexing.YasemirIndexBuilder.java
License:Open Source License
/**
 * Method that starts the actual indexing.
 */
public void run() {
    Date start = new Date();
    try {
        System.err.println("[YaSemIR] Indexing to directory '" + indexPath + "'...");
        Directory dir = FSDirectory.open(new File(indexPath));
        // IndexWriter configuration
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, Yasemir.analyzer);
        if (Yasemir.SCORE.equals("BM25"))
            iwc.setSimilarity(new BM25Similarity());
        else
            iwc.setSimilarity(new DefaultSimilarity());
        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }
        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);
        writer.close();
        Date end = new Date();
        System.err.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.err.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:fr.lipn.yasemir.ontology.KnowledgeBattery.java
License:Open Source License
/**
 * Indexes the terminology. The index is created at the location indicated in the
 * configuration file. The terminology is analyzed using a StandardAnalyzer provided
 * by Lucene. This method is called only if Yasemir is set to indexing mode.
 */
public static void createTermIndex() {
    try {
        String termIndexPath = Yasemir.TERM_DIR;
        Directory dir = FSDirectory.open(new File(termIndexPath));
        /*
        if (DirectoryReader.indexExists(dir)) {
            System.err.println("[KnowledgeBattery] term index exists, skipping");
            dir.close();
            return;
        }
        */
        //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, Yasemir.analyzer);
        iwc.setSimilarity(new BM25Similarity()); // NEW! set BM25 as default similarity for term index
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.err.println("[KnowledgeBattery] indexing labels to " + termIndexPath);
        for (int i = 0; i < terminologies.size(); i++) {
            //Ontology o = ontologies.elementAt(i);
            SKOSTerminology t = terminologies.elementAt(i);
            t.resetIterator();
            while (t.hasMoreLabels()) {
                Document doc = new Document();
                Vector<String> items = t.getNextLabels();
                String classIRI = items.elementAt(0);
                String labels = items.elementAt(1);
                Field pathField = new StringField("id", classIRI, Field.Store.YES);
                //System.err.println("[KnowledgeBattery] indexing " + classIRI + " labels: " + labels);
                //doc.add(new Field("id", classIRI, Field.Store.YES, Field.Index.NOT_ANALYZED)); // old Lucene versions
                doc.add(pathField);
                doc.add(new TextField("labels", labels, Field.Store.YES));
                /*
                if (!t.isStemmed()) {
                    doc.add(new TextField("labels", labels, Field.Store.YES));
                } else {
                    doc.add(new StringField("labels", labels, Field.Store.YES));
                    //doc.add(new Field("labels", labels, Field.Store.YES, Field.Index.NOT_ANALYZED));
                }
                */
                writer.addDocument(doc);
            }
        }
        writer.close();
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("[YaSemIR] Term Index could not be created");
        System.exit(-1);
    }
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    System.out.println("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);
    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());
    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);
    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        warcFiles = warcFiles.subList(0, doclimit);

    for (Path f : warcFiles)
        executor.execute(new IndexerThread(writer, f));

    // add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}
From source file:io.anserini.index.IndexCollection.java
License:Apache License
public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords
            ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();

    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);

    final List<Path> segmentPaths = collection.getFileSegmentPaths();
    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());

    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt
                + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
From source file:io.anserini.index.IndexWebCollection.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);
    final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());
    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(512);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);

    final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
    final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        for (int i = doclimit; i < warcFiles.size(); i++)
            warcFiles.removeFirst();

    long totalWarcFiles = warcFiles.size();
    LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString());

    for (int i = 0; i < 2000; i++) {
        if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
        else {
            if (!executor.isShutdown()) {
                Thread.sleep(30000);
                executor.shutdown();
            }
            break;
        }
    }

    long first = 0;
    // add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            final long completedTaskCount = executor.getCompletedTaskCount();
            LOG.info(String.format("%.2f percentage completed",
                    (double) completedTaskCount / totalWarcFiles * 100.0d));
            if (!warcFiles.isEmpty())
                for (long i = first; i < completedTaskCount; i++) {
                    if (!warcFiles.isEmpty())
                        executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
                    else {
                        if (!executor.isShutdown())
                            executor.shutdown();
                    }
                }
            first = completedTaskCount;
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (totalWarcFiles != executor.getCompletedTaskCount())
        throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
                + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}
From source file:io.anserini.IndexerCW09B.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    System.out.println("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);
    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());
    iwc.setSimilarity(new BM25Similarity());
    iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);
    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    for (Path f : discoverWarcFiles(docDir))
        executor.execute(new IndexerThread(writer, f));

    // add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
    } finally {
        writer.close();
    }

    return numIndexed;
}