List of usage examples for org.apache.lucene.index.IndexWriter.commit()
@Override public final long commit() throws IOException
Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
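The examples below come from real projects and mix several Lucene versions. As a point of reference, here is a minimal self-contained sketch of the add/commit/search lifecycle, assuming Lucene 5+ (where FSDirectory.open takes a java.nio.file.Path and IndexWriterConfig takes only an Analyzer); the index path and field name are illustrative and not taken from any project below.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CommitExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/tmp/commit-example"));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        writer.addDocument(doc);

        // Until commit() returns, a newly opened reader will not see the document,
        // and the change is not guaranteed to survive a crash or power loss.
        writer.commit();

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            System.out.println("docs visible after commit: " + reader.numDocs()); // prints 1
        }
        writer.close();
        dir.close();
    }
}

In recent Lucene versions close() also commits any pending changes by default (see IndexWriterConfig.setCommitOnClose), so the explicit commit() matters mainly when the writer stays open and readers need to see the changes, or when durability is required before further work.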
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Indexes common names from CoL and ANBG for use in the common name search.
 *
 * @param iw        The index writer to write the common name documents to
 * @param exportDir The directory that contains the common name export files.
 * @param indexDir  The directory that contains the existing name indexes.
 * @throws Exception
 */
private void indexCommonNames(IndexWriter iw, String exportDir, String indexDir) throws Exception {
    log.info("Creating Common Names Index ...");
    //TODO think about adding additional sources for common names
    IndexSearcher currentNameSearcher = new IndexSearcher(
            DirectoryReader.open(FSDirectory.open(new File(indexDir + File.separator + "cb"))));
    IndexSearcher extraSearcher = new IndexSearcher(
            DirectoryReader.open(FSDirectory.open(new File(indexDir + File.separator + "id"))));
    addCoLCommonNames(iw, currentNameSearcher);
    addAnbgCommonNames(afdFile, iw, currentNameSearcher, extraSearcher, '\t');
    addAnbgCommonNames(apniFile, iw, currentNameSearcher, extraSearcher, ',');
    iw.commit();
    iw.forceMerge(1);
    iw.close();
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Creates a temporary index that provides a lookup from lsid to "real lsid".
 * <p/>
 * This deals with the following situations:
 * - common names that are sourced from CoL (LSIDs will be mapped to the corresponding ANBG LSID)
 * - multiple ANBG LSIDs exist for the same scientific name and more than one is mapped to the same common name.
 *
 * @param idFile
 * @throws Exception
 */
private void createExtraIdIndex(String idxLocation, File idFile) throws Exception {
    CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~'); //CSVReader.build(idFile, "UTF-8", "\t", '"', 0);
    File indexDir = new File(idxLocation);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); //new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED);
    String[] values = null;
    while ((values = reader.readNext()) != null) {
        if (values != null && values.length >= 3) {
            Document doc = new Document();
            //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED));
            doc.add(new StringField("lsid", values[2], Store.NO));
            //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO));
            doc.add(new StoredField("reallsid", values[1]));
            iw.addDocument(doc);
        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Creates a temporary index that stores the taxon concept LSIDs that were
 * included in the last ANBG exports.
 *
 * @param tcFileName
 * @return
 * @throws Exception
 */
private IndexSearcher createTmpIndex(String tcFileName) throws Exception {
    //creating the tmp index in the /tmp/taxonConcept directory
    CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~');
    File indexDir = new File("/tmp/taxonConcept");
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    String[] values = null;
    while ((values = reader.readNext()) != null) {
        if (values != null && values.length > 1) {
            //just add the LSID to the index
            Document doc = new Document();
            doc.add(new StringField("lsid", values[0], Store.NO));
            iw.addDocument(doc);
        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}
From source file:au.org.ala.names.search.ALANameSearcher.java
License:Open Source License
private File createIfNotExist(String indexDirectory) throws IOException {
    File idxFile = new File(indexDirectory);
    if (!idxFile.exists()) {
        FileUtils.forceMkdir(idxFile);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        IndexWriter iw = new IndexWriter(FSDirectory.open(idxFile), conf);
        iw.commit();
        iw.close();
    }
    return idxFile;
}
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/**
 * Index the common names CSV file supplied.
 *
 * The CSV header needs to be taxonId, taxonLsid, scientificName, vernacularName, languageCode, countryCode.
 *
 * The languageCode and countryCode are not necessary as they are not used.
 *
 * @param iw
 * @param file
 * @throws Exception
 */
private void indexCommonNames(IndexWriter iw, String file) throws Exception {
    //assumes that the quoted TSV file is in the following format
    //taxon id, taxon lsid, scientific name, vernacular name, language code, country code
    log.info("Starting to load the common names");
    int i = 0, count = 0;
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 0);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        i++;
        if (values.length == 6) {
            //relies on having the same lsid supplied as the DWCA file
            String lsid = StringUtils.isNotEmpty(values[1]) ? values[1] : values[0];
            //check to see if it exists
            TopDocs result = getLoadIdxResults("lsid", lsid, 1);
            if (result.totalHits > 0) {
                //we can add the common name
                Document doc = getCommonNameDocument(values[3], values[2], lsid, 1.0f, false);
                iw.addDocument(doc);
                count++;
            }
        } else {
            log.info("Issue on line " + i + " " + values[0]);
        }
        if (i % 1000 == 0) {
            log.info("Finished processing " + i + " common names with " + count + " added to index ");
        }
    }
    log.info("Finished processing " + i + " common names with " + count + " added to index ");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
}
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/**
 * Creates a loading index used to generate the hierarchy, including the left/right values.
 *
 * @param tmpIndexDir
 * @param archiveDirectory
 * @throws Exception
 */
private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception {
    log.info("Starting to create the temporary loading index.");
    File indexDir = new File(tmpIndexDir);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    //create the loading index so that left right values and classifications can be generated
    Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory));
    Iterator<DarwinCoreRecord> it = archive.iteratorDwc();
    int i = 0;
    long start = System.currentTimeMillis();
    while (it.hasNext()) {
        Document doc = new Document();
        DarwinCoreRecord dwcr = it.next();
        String id = dwcr.getId();
        String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID();
        String acceptedLsid = dwcr.getAcceptedNameUsageID();
        //add and store the identifier for the record
        doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES));
        if (StringUtils.isNotBlank(lsid)) {
            doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES));
        } else {
            System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid);
        }
        if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) {
            doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) {
            doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(), Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificName())) {
            //stored, no need to search on
            doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName()));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) {
            //stored, no need to search on
            doc.add(new StoredField(NameIndexField.AUTHOR.toString(), dwcr.getScientificNameAuthorship()));
        }
        if (StringUtils.isNotBlank(dwcr.getGenus())) {
            //stored, no need to search on
            doc.add(new StoredField("genus", dwcr.getGenus()));
        }
        if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) {
            //stored, no need to search on
            doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) {
            //stored, no need to search on
            doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getTaxonRank())) {
            //match the supplied rank
            RankType rt = RankType.getForStrRank(dwcr.getTaxonRank());
            if (rt != null) {
                doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES));
            } else {
                doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES));
            }
        } else {
            //put in unknown rank
            doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES));
            doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES));
        }
        if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid) || acceptedLsid == null) {
            //mark this one as an accepted concept
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES));
            if (StringUtils.isBlank(dwcr.getParentNameUsageID())) {
                doc.add(new StringField("root", "T", Field.Store.YES));
            }
        } else {
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES));
        }
        iw.addDocument(doc);
        i++;
        if (i % 1000 == 0) {
            long finish = System.currentTimeMillis();
            //records per second over the last batch of 1000
            log.debug("Loading index: " + i + " records per sec: " + (1000 / (((float) (finish - start)) / 1000)));
            start = finish;
        }
    }
    log.info("Finished creating the temporary load index with " + i + " concepts");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}
From source file:br.bireme.ngrams.NGrams.java
public static boolean indexDocument(final NGIndex index, final IndexWriter writer, final NGSchema schema,
        final String pipedDoc, final boolean allowDocUpdate) throws IOException, ParseException {
    if (index == null) {
        throw new NullPointerException("index");
    }
    if (writer == null) {
        throw new NullPointerException("writer");
    }
    if (schema == null) {
        throw new NullPointerException("schema");
    }
    if (pipedDoc == null) {
        throw new NullPointerException("pipedDoc");
    }
    boolean ret = false;
    final String pipedDocT = pipedDoc.trim();
    if (!isUtf8Encoding(pipedDocT)) {
        throw new IOException("Invalid encoded string");
    }
    if (!pipedDocT.isEmpty()) {
        final Parameters parameters = schema.getParameters();
        if (Tools.countOccurrences(pipedDoc, '|') < parameters.maxIdxFieldPos) {
            throw new IOException("invalid number of fields: [" + pipedDoc + "]");
        }
        final String pipedDoc2 = StringEscapeUtils.unescapeHtml4(pipedDoc);
        final String[] split = pipedDoc2.replace(':', ' ').trim().split(" *\\| *", Integer.MAX_VALUE);
        final String id = split[parameters.id.pos];
        if (id.isEmpty()) {
            throw new IOException("id");
        }
        final String dbName = split[parameters.db.pos];
        if (dbName.isEmpty()) {
            throw new IOException("dbName");
        }
        final Map<String, br.bireme.ngrams.Field> flds = parameters.nameFields;
        final Document doc = createDocument(flds, split);
        if (doc != null) {
            if (allowDocUpdate) {
                writer.updateDocument(new Term("id", id), doc);
                writer.commit();
            } else {
                writer.addDocument(doc);
            }
            ret = true;
        }
    }
    return ret;
}
From source file:cn.codepub.redis.directory.Main.java
License:Apache License
public static void testRedisDirectoryWithShardedJedisPool() throws IOException {
    long start = System.currentTimeMillis();
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new WhitespaceAnalyzer())
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    //indexWriterConfig.setInfoStream(System.out);
    //indexWriterConfig.setRAMBufferSizeMB(2048);
    //LogByteSizeMergePolicy logByteSizeMergePolicy = new LogByteSizeMergePolicy();
    //logByteSizeMergePolicy.setMinMergeMB(1);
    //logByteSizeMergePolicy.setMaxMergeMB(64);
    //logByteSizeMergePolicy.setMaxCFSSegmentSizeMB(64);
    //indexWriterConfig.setRAMBufferSizeMB(1024).setMergePolicy(logByteSizeMergePolicy).setUseCompoundFile(false);
    //GenericObjectPoolConfig genericObjectPoolConfig = new GenericObjectPoolConfig();
    //genericObjectPoolConfig.setMaxWaitMillis(3000); //10s
    List<JedisShardInfo> shards = new ArrayList<>();
    JedisShardInfo si = new JedisShardInfo("localhost", 6379, Constants.TIME_OUT);
    //JedisShardInfo si2 = new JedisShardInfo("localhost", 6380);
    shards.add(si);
    //shards.add(si2);
    JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
    ShardedJedisPool shardedJedisPool = new ShardedJedisPool(jedisPoolConfig, shards);
    RedisDirectory redisDirectory = new RedisDirectory(new ShardedJedisPoolStream(shardedJedisPool));
    IndexWriter indexWriter = new IndexWriter(redisDirectory, indexWriterConfig);
    for (int i = 0; i < 10000000; i++) {
        indexWriter.addDocument(addDocument(i));
    }
    indexWriter.commit();
    indexWriter.close();
    redisDirectory.close();
    long end = System.currentTimeMillis();
    log.error("RedisDirectoryWithShardedJedisPool consumes {}s!", (end - start) / 1000);
    shardedJedisPool = new ShardedJedisPool(jedisPoolConfig, shards);
    start = System.currentTimeMillis();
    IndexSearcher indexSearcher = new IndexSearcher(
            DirectoryReader.open(new RedisDirectory(new ShardedJedisPoolStream(shardedJedisPool))));
    int total = 0;
    for (int i = 0; i < 10000000; i++) {
        TermQuery key1 = new TermQuery(new Term("key1", "key" + i));
        TopDocs search = indexSearcher.search(key1, 10);
        total += search.totalHits;
    }
    System.out.println(total);
    end = System.currentTimeMillis();
    log.error("RedisDirectoryWithShardedJedisPool search consumes {}ms!", (end - start));
}
From source file:cn.hbu.cs.esearch.index.BaseSearchIndex.java
License:Apache License
public void updateIndex(LongSet delDocs, List<EsearchIndexable.IndexingReq> insertDocs,
        Analyzer defaultAnalyzer, Similarity similarity) throws IOException {
    if (delDocs != null && delDocs.size() > 0) {
        deleteDocs(delDocs);
    }
    if (insertDocs == null || insertDocs.size() == 0) {
        return;
    }
    IndexWriter idxMod = null;
    try {
        idxMod = openIndexWriter(defaultAnalyzer, similarity);
        if (idxMod != null) {
            for (EsearchIndexable.IndexingReq idxPair : insertDocs) {
                Analyzer analyzer = idxPair.getAnalyzer();
                Document doc = idxPair.getDocument();
                if (analyzer == null) {
                    idxMod.addDocument(doc);
                } else {
                    idxMod.addDocument(doc, analyzer);
                }
            }
        }
    } finally {
        if (idxMod != null) {
            idxMod.commit();
            if (_closeWriterAfterUpdate) {
                closeIndexWriter();
            }
        }
    }
}
From source file:cn.hbu.cs.esearch.index.BaseSearchIndex.java
License:Apache License
private void deleteDocs(LongSet delDocs) throws IOException {
    if (delDocs == null || delDocs.size() == 0) {
        return;
    }
    EsearchMultiReader<R> reader = openIndexReader();
    if (reader == null) {
        return;
    }
    UIDFilter uidFilter = new UIDFilter(delDocs.toLongArray(), reader);
    IndexWriter writer = null;
    try {
        writer = openIndexWriter(null, null);
        writer.deleteDocuments(new ConstantScoreQuery(uidFilter));
        writer.commit();
    } finally {
        closeIndexWriter();
    }
}