Example usage for org.apache.lucene.index IndexWriterConfig setSimilarity

Introduction

On this page you can find example usage of org.apache.lucene.index IndexWriterConfig setSimilarity.

Prototype

public IndexWriterConfig setSimilarity(Similarity similarity) 

Document

Expert: set the Similarity implementation used by this IndexWriter.
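
A minimal sketch (not taken from the examples below) of how setSimilarity is typically used: the Similarity is set on the IndexWriterConfig before the IndexWriter is constructed, and the same Similarity should also be set on the IndexSearcher at query time so scoring is consistent. The one-argument IndexWriterConfig constructor shown here is the Lucene 5+ form; several examples below use the older form that also takes a Version argument. The index path is a placeholder.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // placeholder path
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setSimilarity(new BM25Similarity()); // must be set before the IndexWriter is opened
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            // add documents here; use the same Similarity on the IndexSearcher
            // when querying this index so that scoring is consistent
        }
    }
}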

Usage

From source file: edu.rpi.tw.linkipedia.search.indexing.InMemEntityIndexer.java

License: Open Source License

public void createIndex() {
    try {

        Analyzer analyzer = DefaultAnalyzer.getAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: edu.rpi.tw.linkipedia.search.indexing.ParallelEntityIndexer.java

License: Open Source License

public void createIndex() {
    try {

        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("related_object", entropyAnalyzer);
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexer.java

License: Open Source License

public void createIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);

        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");

        writer.close();
        System.out.println("Finished Indexing");

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexUpdater.java

License: Open Source License

public void updateIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);

        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");

        writer.close();
        System.out.println("Finished Indexing");

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: fr.lipn.yasemir.indexing.YasemirIndexBuilder.java

License: Open Source License

/**
 * Method that starts the actual indexing
 */
public void run() {
    Date start = new Date();
    try {
        System.err.println("[YaSemIR] Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(new File(indexPath));

        //IndexWriter Configuration
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, Yasemir.analyzer);
        if (Yasemir.SCORE.equals("BM25"))
            iwc.setSimilarity(new BM25Similarity());
        else
            iwc.setSimilarity(new DefaultSimilarity());

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        writer.close();

        Date end = new Date();
        System.err.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.err.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file: fr.lipn.yasemir.ontology.KnowledgeBattery.java

License: Open Source License

/**
 * Method used to index terminology. The index is created at the position indicated in the configuration file
 * Terminology is analyzed using a StandardAnalyzer provided by Lucene
 * This method is called only if Yasemir is set in indexing mode
 */
public static void createTermIndex() {
    try {
        String termIndexPath = Yasemir.TERM_DIR;

        Directory dir = FSDirectory.open(new File(termIndexPath));
        /*
        if(DirectoryReader.indexExists(dir)) {
                 
           System.err.println("[KnowledgeBattery] term index exists, skipping");
           dir.close();
           return;
        }
        */

        //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, Yasemir.analyzer);
        iwc.setSimilarity(new BM25Similarity()); //NEW! set BM25 as default similarity for term index

        IndexWriter writer = new IndexWriter(dir, iwc);

        System.err.println("[KnowledgeBattery] indexing labels to " + termIndexPath);

        for (int i = 0; i < terminologies.size(); i++) {
            //Ontology o = ontologies.elementAt(i);
            SKOSTerminology t = terminologies.elementAt(i);

            t.resetIterator();
            while (t.hasMoreLabels()) {
                Document doc = new Document();
                Vector<String> items = t.getNextLabels();
                String classIRI = items.elementAt(0);
                String labels = items.elementAt(1);

                Field pathField = new StringField("id", classIRI, Field.Store.YES);
                //System.err.println("[KnowledgeBattery] indexing "+classIRI+" labels: "+labels);
                //doc.add(new Field("id", classIRI, Field.Store.YES, Field.Index.NOT_ANALYZED)); //old Lucene versions
                doc.add(pathField);
                doc.add(new TextField("labels", labels, Field.Store.YES));
                /*
                if(!t.isStemmed()) {
                   doc.add(new TextField("labels", labels, Field.Store.YES));
                } else {
                   doc.add(new StringField("labels", labels, Field.Store.YES));
                   //doc.add(new Field("labels", labels, Field.Store.YES, Field.Index.NOT_ANALYZED));
                }
                */
                writer.addDocument(doc);
            }
        }
        writer.close();

    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("[YaSemIR] Term Index could not be created");
        System.exit(-1);
    }
}

From source file: io.anserini.index.IndexClueWeb09b.java

License: Apache License

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    System.out.println(
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        warcFiles = warcFiles.subList(0, doclimit);

    for (Path f : warcFiles)
        executor.execute(new IndexerThread(writer, f));

    // add a short delay to let the scheduler spawn some threads
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file: io.anserini.index.IndexCollection.java

License: Apache License

public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();

    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());
    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}

From source file: io.anserini.index.IndexWebCollection.java

License: Apache License

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(512);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
    final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);

    if (doclimit > 0 && warcFiles.size() < doclimit)
        for (int i = doclimit; i < warcFiles.size(); i++)
            warcFiles.removeFirst();

    long totalWarcFiles = warcFiles.size();
    LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString());

    for (int i = 0; i < 2000; i++) {
        if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
        else {
            if (!executor.isShutdown()) {
                Thread.sleep(30000);
                executor.shutdown();
            }
            break;
        }
    }

    long first = 0;
    // add a short delay to let the scheduler spawn some threads
    Thread.sleep(30000);

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {

            final long completedTaskCount = executor.getCompletedTaskCount();

            LOG.info(String.format("%.2f percentage completed",
                    (double) completedTaskCount / totalWarcFiles * 100.0d));

            if (!warcFiles.isEmpty())
                for (long i = first; i < completedTaskCount; i++) {
                    if (!warcFiles.isEmpty())
                        executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
                    else {
                        if (!executor.isShutdown())
                            executor.shutdown();
                    }
                }

            first = completedTaskCount;
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (totalWarcFiles != executor.getCompletedTaskCount())
        throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
                + " is not equal to completedTaskCount =  " + executor.getCompletedTaskCount());

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file: io.anserini.IndexerCW09B.java

License: Apache License

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    System.out.println(
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    for (Path f : discoverWarcFiles(docDir))
        executor.execute(new IndexerThread(writer, f));

    // add a short delay to let the scheduler spawn some threads
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
    } finally {
        writer.close();
    }

    return numIndexed;
}