List of usage examples for the org.apache.lucene.index.IndexWriter#forceMerge method
public void forceMerge(int maxNumSegments) throws IOException
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Index the common names CSV file supplied. * * CSV header need to be txaonId, taxonLsid, scientificName, vernacularName, languageCode, countryCode * * The languageCode and countryCode are not necessary as they are not used. * * @param iw//ww w . java2s. c o m * @param file * @throws Exception */ private void indexCommonNames(IndexWriter iw, String file) throws Exception { //assumes that the quoted TSV file is in the following format //taxon id, taxon lsid, scientific name, vernacular name, language code, country code log.info("Starting to load the common names"); int i = 0, count = 0; au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 0); for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) { i++; if (values.length == 6) { //relies on having the same lsid supplied as the DWCA file String lsid = StringUtils.isNotEmpty(values[1]) ? values[1] : values[0]; //check to see if it exists TopDocs result = getLoadIdxResults("lsid", lsid, 1); if (result.totalHits > 0) { //we can add the common name Document doc = getCommonNameDocument(values[3], values[2], lsid, 1.0f, false); iw.addDocument(doc); count++; } } else { log.info("Issue on line " + i + " " + values[0]); } if (i % 1000 == 0) { log.info("Finished processing " + i + " common names with " + count + " added to index "); } } log.info("Finished processing " + i + " common names with " + count + " added to index "); iw.commit(); iw.forceMerge(1); iw.close(); }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Creates a loading index to use to generate the hierarchy including the left right values. * * @param tmpIndexDir/* ww w .j a v a2s.com*/ * @param archiveDirectory * @throws Exception */ private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception { log.info("Starting to create the temporary loading index."); File indexDir = new File(tmpIndexDir); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); //create the loading index so that left right values and classifications can be generated Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory)); Iterator<DarwinCoreRecord> it = archive.iteratorDwc(); int i = 0; long start = System.currentTimeMillis(); while (it.hasNext()) { Document doc = new Document(); DarwinCoreRecord dwcr = it.next(); String id = dwcr.getId(); String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID(); String acceptedLsid = dwcr.getAcceptedNameUsageID(); //add and store the identifier for the record doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES)); if (StringUtils.isNotBlank(lsid)) { doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES)); } else { System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid); } if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) { doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getScientificName())) { //stored no need to search on doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName())); } if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) { //stored no need to search on doc.add(new StoredField(NameIndexField.AUTHOR.toString(), 
dwcr.getScientificNameAuthorship())); } if (StringUtils.isNotBlank(dwcr.getGenus())) { //stored no need to search on doc.add(new StoredField("genus", dwcr.getGenus())); } if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getTaxonRank())) { //match the supplied rank RankType rt = RankType.getForStrRank(dwcr.getTaxonRank()); if (rt != null) { doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); } else { doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid) || acceptedLsid == null) { //mark this one as an accepted concept doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES)); if (StringUtils.isBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("root", "T", Field.Store.YES)); } } else { doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES)); } iw.addDocument(doc); i++; if (i % 1000 == 0) { long finish = System.currentTimeMillis(); log.debug("Loading index: " + i + " records per sec: " + (1000 / (((float) (finish / start)) / 1000))); start = finish; 
} } log.info("Finished creating the temporary load index with " + i + " concepts"); iw.commit(); iw.forceMerge(1); iw.close(); lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:br.bireme.ngrams.NGrams.java
public static void index(final NGIndex index, final NGSchema schema, final String inFile, final String inFileEncoding) throws IOException, ParseException { if (index == null) { throw new NullPointerException("index"); }/* www .j av a 2 s. c o m*/ if (schema == null) { throw new NullPointerException("schema"); } if (inFile == null) { throw new NullPointerException("inFile"); } if (inFileEncoding == null) { throw new NullPointerException("inFileEncoding"); } final Charset charset = Charset.forName(inFileEncoding); final IndexWriter writer = index.getIndexWriter(false); int cur = 0; try (BufferedReader reader = Files.newBufferedReader(new File(inFile).toPath(), charset)) { writer.deleteAll(); while (true) { final String line; try { line = reader.readLine(); } catch (MalformedInputException mie) { System.err.println("Line with another encoding. Line number:" + (++cur)); continue; } if (line == null) { break; } final boolean ret = indexDocument(index, writer, schema, line, false); if (ret && (++cur % 100000 == 0)) { System.out.println(">>> " + cur); } } writer.forceMerge(1); // optimize index writer.close(); } }
From source file:cc.twittertools.index.IndexStatuses.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(/* w ww.ja v a2 s .c om*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexStatuses.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexed(true); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new 
File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total 
elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file:cc.wikitools.lucene.IndexWikipediaDump.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*w w w .j a va 2s . co m*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? 
Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build(); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); try { WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String page; while ((page = stream.readNext()) != null) { String title = cleaner.getTitle(page); // These are heuristic specifically for filtering out non-articles in enwiki-20120104. if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) { continue; } if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) { continue; } Runnable worker = new AddDocumentRunnable(writer, cleaner, page); executor.execute(worker); cnt++; if (cnt % 10000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); } }
From source file:cn.hbu.cs.esearch.index.DiskLuceneIndexDataLoader.java
License:Apache License
public void optimize(int numSegs) throws IOException { long t0 = System.currentTimeMillis(); if (numSegs <= 1) { numSegs = 1;// w ww . j a v a 2 s . com } log.info("optmizing, numSegs: " + numSegs + " ..."); // we should optimize synchronized (optimizeMonitor) { BaseSearchIndex<R> idx = getSearchIndex(); IndexWriter writer = null; try { writer = idx.openIndexWriter(_analyzer, _similarity); writer.forceMerge(numSegs); } finally { if (writer != null) { idx.closeIndexWriter(); } } _idxMgr.refreshDiskReader(); } log.info("index optimized in " + (System.currentTimeMillis() - t0) + "ms"); }
From source file:collene.TestIndexing.java
License:Apache License
@Test public void test() throws IOException, ParseException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9); // write it out. IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); IndexWriter writer = new IndexWriter(directory, config); for (int i = 0; i < 100; i++) { Collection<Document> documents = new ArrayList<Document>(); Document doc = new Document(); doc.add(new Field("key", "aaa_" + i, TextField.TYPE_STORED)); doc.add(new Field("not", "notaaa", TextField.TYPE_NOT_STORED)); doc.add(new Field("meta", "aaa_meta_aaa_" + i, TextField.TYPE_STORED)); documents.add(doc);/*from www . java 2s .c o m*/ writer.addDocuments(documents); writer.commit(); writer.forceMerge(1); writer.forceMergeDeletes(true); } // now read it back. IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, false)); QueryParser parser = new QueryParser(Version.LUCENE_4_9, "key", analyzer); Query query = parser.parse("aaa_4"); TopDocs docs = searcher.search(query, 1); int idToDelete = docs.scoreDocs[0].doc; Assert.assertTrue(docs.totalHits > 0); query = parser.parse("fersoius"); docs = searcher.search(query, 1); Assert.assertFalse(docs.totalHits > 0); // delete that document. 
DirectoryReader reader = DirectoryReader.open(writer, true); writer.tryDeleteDocument(reader, idToDelete); reader.close(); writer.close(); // list files Set<String> files = new HashSet<String>(); System.out.println("Listing files for " + directory.toString()); for (String file : directory.listAll()) { files.add(file); System.out.println(" " + file); } if (strictFileChecking) { System.out.println("String file checking..."); Sets.SetView<String> difference = Sets.difference(expectedFiles, files); Assert.assertEquals(Joiner.on(",").join(difference), 0, difference.size()); } reader = DirectoryReader.open(directory); searcher = new IndexSearcher(reader); query = parser.parse("aaa_4"); docs = searcher.search(query, 1); reader.close(); Assert.assertFalse(docs.totalHits > 0); directory.close(); }
From source file:com.aliasi.lingmed.medline.IndexMedline.java
License:Lingpipe license
/** * Run the command. See class documentation above for details on * arguments and behavior.//from w ww . ja v a 2 s . co m */ public void run() { System.out.println("start run"); try { File[] files = getLaterFiles(mDistDir); System.out.println("Total files to process: " + files.length); System.out.println("File names: " + java.util.Arrays.asList(files)); // if (mLogger.isDebugEnabled()) // mLogger.debug("File names: " + java.util.Arrays.asList(files)); if (files.length > 0) { MedlineParser parser = new MedlineParser(true); // true = save raw XML Directory fsDir = FSDirectory.open(mIndex); IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_36, mCodec.getAnalyzer()); iwConf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); iwConf.setRAMBufferSizeMB(RAM_BUF_SIZE); if (sIsBaseline) { LogDocMergePolicy ldmp = new LogDocMergePolicy(); ldmp.setMergeFactor(MERGE_FACTOR_HI); iwConf.setMergePolicy(ldmp); } IndexWriter indexWriter = new IndexWriter(fsDir, iwConf); for (File file : files) { System.out.println("processing file: " + file); MedlineIndexer indexer = new MedlineIndexer(indexWriter, mCodec); parser.setHandler(indexer); parseFile(parser, file); indexer.close(); recordFile(indexWriter, file.getName()); System.out.println("completed processing file: " + file); } System.out.println("All files parsed, now optimize index"); indexWriter.forceMerge(1); indexWriter.commit(); indexWriter.close(); } System.out.println("Processing complete."); } catch (Exception e) { // mLogger.warn("Unexpected Exception: "+e.getMessage()); // mLogger.warn("stack trace: "+Logging.logStackTrace(e)); // mLogger.warn("Aborting this run"); IllegalStateException e2 = new IllegalStateException(e.getMessage()); e2.setStackTrace(e.getStackTrace()); throw e2; } }
From source file:com.aliasi.lingmed.medline.OptimizeMedline.java
License:Lingpipe license
/** * Run the command. See class documentation above for details on * arguments and behavior.//from ww w . j a va2 s .c o m */ public void run() { // mLogger.info("start run"); try { Directory fsDir = FSDirectory.open(mIndex); IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)); iwConf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); IndexWriter indexWriter = new IndexWriter(fsDir, iwConf); indexWriter.forceMerge(1); indexWriter.commit(); // mLogger.info("Processing complete."); } catch (Exception e) { // mLogger.warn("Unexpected Exception: "+e.getMessage()); // mLogger.warn("stack trace: "+Logging.logStackTrace(e)); // mLogger.warn("Aborting this run"); IllegalStateException e2 = new IllegalStateException(e.getMessage()); e2.setStackTrace(e.getStackTrace()); throw e2; } }
From source file:com.dreamerpartner.codereview.lucene.IndexHelper.java
License:Apache License
/** * ??/*ww w . j av a 2 s . c om*/ * @param module ? * @throws IOException */ @SuppressWarnings("deprecation") public static void merge(String module) throws IOException { long beginTime = System.currentTimeMillis(); IndexWriter writer = null; try { Directory dir = FSDirectory.open(new File(LuceneUtil.getIndexPath(module))); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_0); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); writer = new IndexWriter(dir, iwc); //?? writer.forceMerge(1); writer.commit(); } finally { long endTime = System.currentTimeMillis(); logger.debug("merge consume " + (endTime - beginTime) + " milliseconds."); if (writer != null) writer.close(); } }