List of usage examples for org.apache.lucene.index IndexWriter close
@Override public void close() throws IOException
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates the temporary index that provides a lookup of checklist bank id to * GUID/*from www.java 2 s . c o m*/ */ private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception { System.out.println("Starting to create the tmp guid index..."); IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true); au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader( new FileReader(cbExportFile), '\t', '"', '/', 1); for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) { Document doc = new Document(); String id = values[POS_ID]; String guid = values[POS_LSID]; doc.add(new StringField("id", id, Store.YES)); if (StringUtils.isEmpty(id)) guid = id; doc.add(new StoredField("guid", guid)); iw.addDocument(doc); } System.out.println("Finished writing the tmp guid index..."); iw.commit(); iw.forceMerge(1); iw.close(); //As of lucene 4.0 all IndexReaders are read only return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid")))); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Loads ALA taxon concepts from the supplied tab-delimited export file into the
 * name-matching index, then appends the extra concepts and synonyms.
 *
 * @param iw          the index writer to add the taxon documents to (committed,
 *                    force-merged and closed by this method)
 * @param file        the tab-delimited taxon concept export file
 * @param synonymFile the synonym export file, forwarded to addALASyonyms
 * @throws Exception if the file cannot be read or the index cannot be written
 */
private void indexALA(IndexWriter iw, String file, String synonymFile) throws Exception {
    int records = 0;
    long time = System.currentTimeMillis();
    au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file),
            '\t', '"', '\\', 1);
    for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {
        String lsid = values[POS_LSID];
        String id = values[POS_ID];
        // NOTE(review): 'rank' is read but never used below; candidate for removal.
        String rank = values[POS_RANK];
        int rankId = -1;
        try {
            rankId = Integer.parseInt(values[POS_RANK_ID]);
        } catch (Exception e) {
            // rank id is optional; -1 marks "unknown" and skips the major-rank boost below
        }
        String acceptedValues = values[POS_ACC_LSID];
        float boost = 1.0f;
        //give the major ranks a larger boost
        if (rankId % 1000 == 0) {
            boost = 5.0f;
        }
        //give non-col concepts a higher boost
        String source = values[POS_SRC];
        if (!source.trim().equals("") && !source.equalsIgnoreCase("CoL")) {
            boost = boost * 2;
        }
        // Build the full taxon document from the positional columns of the export row.
        Document doc = createALAIndexDocument(values[POS_SCI_NAME], id, lsid, values[POS_RANK_ID],
                values[POS_RANK], values[POS_K], values[POS_KID], values[POS_P], values[POS_PID],
                values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F],
                values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID],
                values[POS_LFT], values[POS_RGT], acceptedValues, values[POS_SP_EPITHET],
                values[POS_INFRA_EPITHET], values[POS_AUTHOR], boost);
        //add the excluded information if applicable
        if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) {
            // NOTE(review): doc is dereferenced here BEFORE the null check below — if
            // createALAIndexDocument can return null for an excluded row this throws NPE.
            doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(),
                    SynonymType.EXCLUDES.getId().toString(), Store.YES));
        }
        if (doc != null) {
            iw.addDocument(doc);
            records++;
            if (records % 100000 == 0) {
                log.info("Processed " + records + " in " + (System.currentTimeMillis() - time) + " msecs");
            }
        }
    }
    addExtraALAConcept(iw, extraALAConcepts);
    //add the synonyms
    addALASyonyms(iw, synonymFile);
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    log.info("Lucene index created - processed a total of " + records + " records in " +
            (System.currentTimeMillis() - time) + " msecs ");
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Indexes common names from CoL and ANBG for use in the Common name search. * * @param iw The index writer to write the common documents to * @param exportDir The directory that contains the common name export files. * @param indexDir The directory in which to create the index. * @throws Exception/*from w ww .j a v a2 s. c om*/ */ private void indexCommonNames(IndexWriter iw, String exportDir, String indexDir) throws Exception { log.info("Creating Common Names Index ..."); //TODO think about adding additional sources for common names IndexSearcher currentNameSearcher = new IndexSearcher( DirectoryReader.open(FSDirectory.open(new File(indexDir + File.separator + "cb")))); IndexSearcher extraSearcher = new IndexSearcher( DirectoryReader.open(FSDirectory.open(new File(indexDir + File.separator + "id")))); addCoLCommonNames(iw, currentNameSearcher); addAnbgCommonNames(afdFile, iw, currentNameSearcher, extraSearcher, '\t'); addAnbgCommonNames(apniFile, iw, currentNameSearcher, extraSearcher, ','); iw.commit(); iw.forceMerge(1); iw.close(); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates a temporary index that will provide a lookup up of lsid to "real lsid". * <p/>//from w ww. j a va 2 s . c o m * This deals with the following situations: * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID) * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name. * * @param idFile * @throws Exception */ private void createExtraIdIndex(String idxLocation, File idFile) throws Exception { CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');//CSVReader.build(idFile, "UTF-8", "\t", '"', 0); File indexDir = new File(idxLocation); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);//new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED); String[] values = null; while ((values = reader.readNext()) != null) { if (values != null && values.length >= 3) { Document doc = new Document(); //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED)); doc.add(new StringField("lsid", values[2], Store.NO)); //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO)); doc.add(new StoredField("reallsid", values[1])); iw.addDocument(doc); } } iw.commit(); iw.forceMerge(1); iw.close(); idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates a temporary index that stores the taxon concept LSIDs that were * included in the last ANBG exports./* ww w. jav a 2s. com*/ * * @param tcFileName * @return * @throws Exception */ private IndexSearcher createTmpIndex(String tcFileName) throws Exception { //creating the tmp index in the /tmp/taxonConcept directory CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~'); File indexDir = new File("/tmp/taxonConcept"); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); String[] values = null; while ((values = reader.readNext()) != null) { if (values != null && values.length > 1) { //just add the LSID to the index Document doc = new Document(); doc.add(new StringField("lsid", values[0], Store.NO)); iw.addDocument(doc); } } iw.commit(); iw.forceMerge(1); iw.close(); return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:au.org.ala.names.search.ALANameSearcher.java
License:Open Source License
private File createIfNotExist(String indexDirectory) throws IOException { File idxFile = new File(indexDirectory); if (!idxFile.exists()) { FileUtils.forceMkdir(idxFile);//www .j a va 2s. c o m Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_34, analyzer); IndexWriter iw = new IndexWriter(FSDirectory.open(idxFile), conf); iw.commit(); iw.close(); } return idxFile; }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/**
 * Creates the name matching index based on a complete list of names supplied in a single DwCA.
 *
 * @param loadingIndex True when the loading index should be created. This is necessary to
 *                     generate the index, but you may wish to skip this step if it has been
 *                     generated earlier
 * @param sciIndex True when the name matching index should be generated
 * @param indexDirectory The directory in which to create the name matching index
 * @param tmpLoadIndex The directory in which to create the temporary loading index
 * @param namesDwc The absolute path to the directory that contains the unzipped DWC archive to index
 * @param irmngDwc The absolute path to the directory that contains the unzipped IRMNG DWCA
 * @param commonNameFile the quoted-TSV common names file, or null to skip common names
 * @throws Exception if any phase of the index build fails
 */
public void create(boolean loadingIndex, boolean sciIndex, String indexDirectory, String tmpLoadIndex,
        String namesDwc, String irmngDwc, String commonNameFile) throws Exception {
    // remember the loading-index location for later lookups (used by getLoadIdxResults)
    dirTmpIndex = tmpLoadIndex;
    LowerCaseKeywordAnalyzer analyzer = new LowerCaseKeywordAnalyzer();
    // Phase 1: temporary loading index (hierarchy / left-right values)
    if (loadingIndex) {
        createLoadingIndex(tmpLoadIndex, namesDwc);
    }
    // Phase 2: the main scientific-name ("cb") index built from the loading index
    if (sciIndex) {
        writer = createIndexWriter(new File(indexDirectory + File.separator + "cb"), analyzer, true);
        generateIndex();
        addSynonymsToIndex(namesDwc);
        writer.commit();
        writer.forceMerge(1);
        writer.close();
    }
    // Phase 3: optional IRMNG homonym index
    // NOTE(review): unlike the other writers, irmngWriter is not explicitly committed
    // before close — close() flushes pending changes, but this is inconsistent; confirm.
    if (irmngDwc != null && new File(irmngDwc).exists()) {
        IndexWriter irmngWriter = createIndexWriter(new File(indexDirectory + File.separator + "irmng"),
                analyzer, true);
        this.indexIrmngDwcA(irmngWriter, irmngDwc);
        irmngWriter.forceMerge(1);
        irmngWriter.close();
    }
    // Phase 4: optional vernacular (common name) index
    if (commonNameFile != null && new File(commonNameFile).exists()) {
        //index the common names
        indexCommonNames(createIndexWriter(new File(indexDirectory + File.separator + "vernacular"),
                new KeywordAnalyzer(), true), commonNameFile);
    }
}
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Index the common names CSV file supplied. * * CSV header need to be txaonId, taxonLsid, scientificName, vernacularName, languageCode, countryCode * * The languageCode and countryCode are not necessary as they are not used. * * @param iw/*from www. ja va 2 s . c o m*/ * @param file * @throws Exception */ private void indexCommonNames(IndexWriter iw, String file) throws Exception { //assumes that the quoted TSV file is in the following format //taxon id, taxon lsid, scientific name, vernacular name, language code, country code log.info("Starting to load the common names"); int i = 0, count = 0; au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 0); for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) { i++; if (values.length == 6) { //relies on having the same lsid supplied as the DWCA file String lsid = StringUtils.isNotEmpty(values[1]) ? values[1] : values[0]; //check to see if it exists TopDocs result = getLoadIdxResults("lsid", lsid, 1); if (result.totalHits > 0) { //we can add the common name Document doc = getCommonNameDocument(values[3], values[2], lsid, 1.0f, false); iw.addDocument(doc); count++; } } else { log.info("Issue on line " + i + " " + values[0]); } if (i % 1000 == 0) { log.info("Finished processing " + i + " common names with " + count + " added to index "); } } log.info("Finished processing " + i + " common names with " + count + " added to index "); iw.commit(); iw.forceMerge(1); iw.close(); }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Creates a loading index to use to generate the hierarchy including the left right values. * * @param tmpIndexDir//w w w . j a v a 2s . com * @param archiveDirectory * @throws Exception */ private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception { log.info("Starting to create the temporary loading index."); File indexDir = new File(tmpIndexDir); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); //create the loading index so that left right values and classifications can be generated Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory)); Iterator<DarwinCoreRecord> it = archive.iteratorDwc(); int i = 0; long start = System.currentTimeMillis(); while (it.hasNext()) { Document doc = new Document(); DarwinCoreRecord dwcr = it.next(); String id = dwcr.getId(); String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID(); String acceptedLsid = dwcr.getAcceptedNameUsageID(); //add and store the identifier for the record doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES)); if (StringUtils.isNotBlank(lsid)) { doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES)); } else { System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid); } if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) { doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getScientificName())) { //stored no need to search on doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName())); } if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) { //stored no need to search on doc.add(new StoredField(NameIndexField.AUTHOR.toString(), 
dwcr.getScientificNameAuthorship())); } if (StringUtils.isNotBlank(dwcr.getGenus())) { //stored no need to search on doc.add(new StoredField("genus", dwcr.getGenus())); } if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getTaxonRank())) { //match the supplied rank RankType rt = RankType.getForStrRank(dwcr.getTaxonRank()); if (rt != null) { doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); } else { doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid) || acceptedLsid == null) { //mark this one as an accepted concept doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES)); if (StringUtils.isBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("root", "T", Field.Store.YES)); } } else { doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES)); } iw.addDocument(doc); i++; if (i % 1000 == 0) { long finish = System.currentTimeMillis(); log.debug("Loading index: " + i + " records per sec: " + (1000 / (((float) (finish / start)) / 1000))); start = finish; 
} } log.info("Finished creating the temporary load index with " + i + " concepts"); iw.commit(); iw.forceMerge(1); iw.close(); lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:axiom.db.utils.LuceneManipulator.java
License:Open Source License
/**
 * Compacts a transactional Lucene database directory by copying the latest live
 * version of every document into a fresh "_tmp" index, committing the new segments
 * through the backing SQL store, and finally replacing the old index files with
 * the compacted ones.
 *
 * Order of operations matters throughout: the searcher/writer copy phase, the
 * JDBC-backed segment commit, and the file swap each rely on the previous phase
 * having fully released its resources.
 *
 * @param dbDir path of the database directory to compress
 * @throws Exception if the copy, commit, or final directory swap fails
 */
public void compress(String dbDir) throws Exception {
    // Force the transactional FSDirectory implementation before any directory is opened.
    System.setProperty("org.apache.lucene.FSDirectory.class",
            "org.apache.lucene.store.TransFSDirectory");
    File dbhome = new File(dbDir);
    String url = getUrl(dbhome);
    FSDirectory indexDir = FSDirectory.getDirectory(dbhome, false);
    if (indexDir instanceof TransFSDirectory) {
        // transactional directory: disable lock files and wire up the JDBC connection info
        FSDirectory.setDisableLocks(true);
        TransFSDirectory d = (TransFSDirectory) indexDir;
        d.setDriverClass(DRIVER_CLASS);
        d.setUrl(url);
        d.setUser(null);
        d.setPassword(null);
    }
    // The compacted copy is built in a sibling "_tmp" directory.
    File ndbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_tmp");
    // NOTE(review): olddbhome is created but never used in this method.
    File olddbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_old");
    FSDirectory nindexDir = FSDirectory.getDirectory(ndbhome, true);
    if (nindexDir instanceof TransFSDirectory) {
        FSDirectory.setDisableLocks(true);
        TransFSDirectory d = (TransFSDirectory) nindexDir;
        d.setDriverClass(DRIVER_CLASS);
        d.setUrl(url);
        d.setUser(null);
        d.setPassword(null);
    }
    IndexSearcher searcher = null;
    IndexWriter writer = null;
    // NOTE(review): lmgr is never assigned, so the shutdown branch below is dead code.
    LuceneManager lmgr = null;
    try {
        searcher = new IndexSearcher(indexDir);
        PerFieldAnalyzerWrapper a = LuceneManager.buildAnalyzer();
        writer = IndexWriterManager.getWriter(nindexDir, a, true);
        final int numDocs = searcher.getIndexReader().numDocs();
        // Pass 1: collect ids of deleted docs, and for each live id the index of its
        // most recently modified version.
        HashSet deldocs = new HashSet();
        HashMap infos = new HashMap();
        for (int i = 0; i < numDocs; i++) {
            Document doc = searcher.doc(i);
            String delprop = doc.get(DeletedInfos.DELETED);
            // composite key: object id + layer of save
            final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR
                    + doc.get(LuceneManager.LAYER_OF_SAVE);
            if (delprop != null && "true".equals(delprop)) {
                deldocs.add(id);
            } else {
                Object v;
                if ((v = infos.get(id)) == null) {
                    infos.put(id, new Integer(i));
                } else {
                    // keep only the doc with the greatest last-modified timestamp
                    final String lmod = doc.get(LuceneManager.LASTMODIFIED);
                    final String lmod_prev = searcher.doc(((Integer) v).intValue())
                            .get("_lastmodified");
                    if (lmod_prev == null || (lmod != null && lmod.compareTo(lmod_prev) > 0)) {
                        infos.put(id, new Integer(i));
                    }
                }
            }
        }
        // NOTE(review): listOfMaps is created but never used.
        ArrayList listOfMaps = new ArrayList();
        // Pass 2: copy every surviving, most-recent document into the new index.
        for (int i = 0; i < numDocs; i++) {
            Document doc = searcher.doc(i);
            String delprop = doc.get(DeletedInfos.DELETED);
            String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE);
            // NOTE(review): layer is parsed but never used afterwards.
            int layer = -1;
            try {
                layer = Integer.parseInt(layerStr);
            } catch (Exception ex) {
                layer = -1;
            }
            final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR
                    + doc.get(LuceneManager.LAYER_OF_SAVE);
            if (delprop != null && "true".equals(delprop)) {
                continue; // explicitly deleted
            } else if (id != null && deldocs.contains(id)) {
                continue; // some version of this id was deleted
            }
            Integer idx = (Integer) infos.get(id);
            if (idx != null && i != idx.intValue()) {
                continue; // superseded by a newer version of the same id
            }
            Document ndoc = convertDocument(doc);
            if (ndoc != null) {
                writer.addDocument(ndoc);
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        // Release the source index and wipe its cached segment bookkeeping.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception ex) {
            }
        }
        if (lmgr != null) {
            lmgr.shutdown();
            lmgr = null;
        }
        indexDir.close();
        SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(indexDir);
        sinfos.clear();
        IndexObjectsFactory.removeDeletedInfos(indexDir);
    }
    // Commit the new index's segments through the backing SQL store, inside a
    // manually managed JDBC transaction.
    Connection conn = null;
    boolean exceptionOccured = false;
    try {
        if (writer != null) {
            conn = DriverManager.getConnection(url);
            conn.setAutoCommit(false);
            writer.close();
            writer.flushCache();
            LuceneManager.commitSegments(null, conn, dbhome, writer.getDirectory());
            writer.finalizeTrans();
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        exceptionOccured = true;
        throw new RuntimeException(ex);
    } finally {
        if (conn != null) {
            try {
                // commit on success, roll back if the segment commit failed
                if (!conn.getAutoCommit()) {
                    if (!exceptionOccured) {
                        conn.commit();
                    } else {
                        conn.rollback();
                    }
                }
                conn.close();
            } catch (Exception ex) {
                ex.printStackTrace();
            }
            conn = null;
        }
        nindexDir.close();
        SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(nindexDir);
        sinfos.clear();
        IndexObjectsFactory.removeDeletedInfos(nindexDir);
    }
    // Swap phase: delete the old index files, then move the compacted files in
    // and remove the temporary directory.
    File[] files = dbhome.listFiles();
    for (int i = 0; i < files.length; i++) {
        if (!files[i].isDirectory()) {
            files[i].delete();
        }
    }
    files = ndbhome.listFiles();
    for (int i = 0; i < files.length; i++) {
        if (!files[i].isDirectory()) {
            File nfile = new File(dbhome, files[i].getName());
            files[i].renameTo(nfile);
        }
    }
    if (!FileUtils.deleteDir(ndbhome)) {
        throw new Exception("Could not delete " + ndbhome);
    }
}