List of usage examples for org.apache.lucene.index.IndexWriterConfig.setSimilarity
public IndexWriterConfig setSimilarity(Similarity similarity)
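setSimilarity tells the IndexWriter which Similarity implementation to use for index-time statistics (most visibly the field norms), so the same Similarity is normally set on the IndexSearcher at query time as well. The examples below come from real projects and mix the Lucene 4.x configuration style (constructor with a Version argument) with the newer one. For orientation, here is a minimal self-contained sketch against the Lucene 5+ API; the index path, field name, and explicit BM25 parameters are illustrative assumptions, not taken from the examples.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; adjust to your environment.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));

        // Configure the writer to score with BM25; 1.2f/0.75f are the usual defaults,
        // so new BM25Similarity() would behave the same.
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setSimilarity(new BM25Similarity(1.2f, 0.75f));
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            Document doc = new Document();
            doc.add(new TextField("body", "hello lucene similarity", Field.Store.YES));
            writer.addDocument(doc);
        }

        // Use the same Similarity at query time so index-time norms and
        // query-time scoring stay consistent.
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));
            // ... build a Query and call searcher.search(query, 10) here
        }
    }
}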
From source file:edu.rpi.tw.linkipedia.search.indexing.InMemEntityIndexer.java
License:Open Source License
public void createIndex() {
    try {
        Analyzer analyzer = DefaultAnalyzer.getAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.ParallelEntityIndexer.java
License:Open Source License
public void createIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("related_object", entropyAnalyzer);
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexer.java
License:Open Source License
public void createIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexUpdater.java
License:Open Source License
public void updateIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);
        Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
        myAnalyzerMap.put("label", entropyAnalyzer);
        myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer);
        iwc.setSimilarity(new MySimilarity());
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
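The four examples above pair a single custom Similarity (MySimilarity) with per-field analyzers via PerFieldAnalyzerWrapper. When scoring itself needs to differ per field, Lucene's PerFieldSimilarityWrapper can be passed to setSimilarity instead. The sketch below is only illustrative: the "label" field name and the BM25/ClassicSimilarity pairing are assumptions, and ClassicSimilarity is the newer name for the DefaultSimilarity seen in the older examples.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;

public class PerFieldSimilarityExample {

    /** Builds a writer config whose Similarity depends on the field being scored. */
    public static IndexWriterConfig newConfig() {
        Similarity perFieldSimilarity = new PerFieldSimilarityWrapper() {
            // BM25 everywhere except "label", which keeps classic TF-IDF scoring
            // (field name and similarity choices are hypothetical).
            private final Similarity defaultSim = new BM25Similarity();
            private final Similarity labelSim = new ClassicSimilarity();

            @Override
            public Similarity get(String field) {
                return "label".equals(field) ? labelSim : defaultSim;
            }
        };

        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setSimilarity(perFieldSimilarity);
        return iwc;
    }
}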
From source file:fr.lipn.yasemir.indexing.YasemirIndexBuilder.java
License:Open Source License
/**
 * Method that starts the actual indexing.
 */
public void run() {
    Date start = new Date();
    try {
        System.err.println("[YaSemIR] Indexing to directory '" + indexPath + "'...");
        Directory dir = FSDirectory.open(new File(indexPath));
        // IndexWriter configuration
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, Yasemir.analyzer);
        if (Yasemir.SCORE.equals("BM25"))
            iwc.setSimilarity(new BM25Similarity());
        else
            iwc.setSimilarity(new DefaultSimilarity());
        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }
        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);
        writer.close();
        Date end = new Date();
        System.err.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.err.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:fr.lipn.yasemir.ontology.KnowledgeBattery.java
License:Open Source License
/**
 * Indexes the terminology. The index is created at the location indicated in the
 * configuration file. The terminology is analyzed using a StandardAnalyzer provided
 * by Lucene. This method is called only if Yasemir is set to indexing mode.
 */
public static void createTermIndex() {
    try {
        String termIndexPath = Yasemir.TERM_DIR;
        Directory dir = FSDirectory.open(new File(termIndexPath));
        /*
        if (DirectoryReader.indexExists(dir)) {
            System.err.println("[KnowledgeBattery] term index exists, skipping");
            dir.close();
            return;
        }
        */
        //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, Yasemir.analyzer);
        iwc.setSimilarity(new BM25Similarity()); // NEW! set BM25 as default similarity for term index
        IndexWriter writer = new IndexWriter(dir, iwc);
        System.err.println("[KnowledgeBattery] indexing labels to " + termIndexPath);
        for (int i = 0; i < terminologies.size(); i++) {
            //Ontology o = ontologies.elementAt(i);
            SKOSTerminology t = terminologies.elementAt(i);
            t.resetIterator();
            while (t.hasMoreLabels()) {
                Document doc = new Document();
                Vector<String> items = t.getNextLabels();
                String classIRI = items.elementAt(0);
                String labels = items.elementAt(1);
                Field pathField = new StringField("id", classIRI, Field.Store.YES);
                //System.err.println("[KnowledgeBattery] indexing " + classIRI + " labels: " + labels);
                //doc.add(new Field("id", classIRI, Field.Store.YES, Field.Index.NOT_ANALYZED)); // old Lucene versions
                doc.add(pathField);
                doc.add(new TextField("labels", labels, Field.Store.YES));
                /*
                if (!t.isStemmed()) {
                    doc.add(new TextField("labels", labels, Field.Store.YES));
                } else {
                    doc.add(new StringField("labels", labels, Field.Store.YES));
                    //doc.add(new Field("labels", labels, Field.Store.YES, Field.Index.NOT_ANALYZED));
                }
                */
                writer.addDocument(doc);
            }
        }
        writer.close();
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("[YaSemIR] Term Index could not be created");
        System.exit(-1);
    }
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    System.out.println("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);
    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());
    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);
    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    List<Path> warcFiles = discoverWarcFiles(docDir);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        warcFiles = warcFiles.subList(0, doclimit);

    for (Path f : warcFiles)
        executor.execute(new IndexerThread(writer, f));

    // add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}
From source file:io.anserini.index.IndexCollection.java
License:Apache License
public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords
            ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();

    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);

    final List<Path> segmentPaths = collection.getFileSegmentPaths();
    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());

    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt
                + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
From source file:io.anserini.index.IndexWebCollection.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);
    final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());
    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(512);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);

    final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
    final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);
    if (doclimit > 0 && warcFiles.size() < doclimit)
        for (int i = doclimit; i < warcFiles.size(); i++)
            warcFiles.removeFirst();

    long totalWarcFiles = warcFiles.size();
    LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString());

    for (int i = 0; i < 2000; i++) {
        if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
        else {
            if (!executor.isShutdown()) {
                Thread.sleep(30000);
                executor.shutdown();
            }
            break;
        }
    }

    long first = 0;
    // add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            final long completedTaskCount = executor.getCompletedTaskCount();
            LOG.info(String.format("%.2f percentage completed",
                    (double) completedTaskCount / totalWarcFiles * 100.0d));
            if (!warcFiles.isEmpty())
                for (long i = first; i < completedTaskCount; i++) {
                    if (!warcFiles.isEmpty())
                        executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
                    else {
                        if (!executor.isShutdown())
                            executor.shutdown();
                    }
                }
            first = completedTaskCount;
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (totalWarcFiles != executor.getCompletedTaskCount())
        throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
                + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}
From source file:io.anserini.IndexerCW09B.java
License:Apache License
public int indexWithThreads(int numThreads) throws IOException, InterruptedException {
    System.out.println("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);
    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());
    iwc.setSimilarity(new BM25Similarity());
    iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);
    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    for (Path f : discoverWarcFiles(docDir))
        executor.execute(new IndexerThread(writer, f));

    // add some delay to let some threads spawn by scheduler
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
    } finally {
        writer.close();
    }

    return numIndexed;
}