List of usage examples for org.apache.lucene.index.IndexWriter#addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
private void addALASyonyms(IndexWriter iw, String file) throws Exception { au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 1); for (String[] values = reader.readNext(); values != null; values = reader.readNext()) { String source = values[11]; //give CoL synonyms a lower boost than NSL float boost = source.trim().equals("") || source.equalsIgnoreCase("CoL") ? 0.75f : 1.0f; Document doc = createALASynonymDocument(values[5], values[6], values[0], values[1], values[2], values[3], values[4], boost, values[9]); if (doc != null) iw.addDocument(doc); }/*from w w w .jav a2 s. c o m*/ }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
private void indexALA(IndexWriter iw, String file, String synonymFile) throws Exception { int records = 0; long time = System.currentTimeMillis(); au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 1); for (String[] values = reader.readNext(); values != null; values = reader.readNext()) { String lsid = values[POS_LSID]; String id = values[POS_ID]; String rank = values[POS_RANK]; int rankId = -1; try {//from w w w . j av a2s.co m rankId = Integer.parseInt(values[POS_RANK_ID]); } catch (Exception e) { } String acceptedValues = values[POS_ACC_LSID]; float boost = 1.0f; //give the major ranks a larger boost if (rankId % 1000 == 0) { boost = 5.0f; } //give non-col concepts a higher boost String source = values[POS_SRC]; if (!source.trim().equals("") && !source.equalsIgnoreCase("CoL")) { boost = boost * 2; } Document doc = createALAIndexDocument(values[POS_SCI_NAME], id, lsid, values[POS_RANK_ID], values[POS_RANK], values[POS_K], values[POS_KID], values[POS_P], values[POS_PID], values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID], values[POS_LFT], values[POS_RGT], acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], boost); //add the excluded information if applicable if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) { doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), SynonymType.EXCLUDES.getId().toString(), Store.YES)); } if (doc != null) { iw.addDocument(doc); records++; if (records % 100000 == 0) { log.info("Processed " + records + " in " + (System.currentTimeMillis() - time) + " msecs"); } } } addExtraALAConcept(iw, extraALAConcepts); //add the synonyms addALASyonyms(iw, synonymFile); iw.commit(); iw.forceMerge(1); iw.close(); log.info("Lucene index created - processed a total of " + records + " records in " + 
(System.currentTimeMillis() - time) + " msecs "); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Indexes the IRMNG homonyms from the supplied DWCA direcory * @param iw The index writer to write the lucene docs to * @param archiveDirectory The directory in which the IRMNG DWCA has been unzipped. * @throws Exception/*from w w w . j ava 2 s . co m*/ */ protected void indexIrmngDwcA(IndexWriter iw, String archiveDirectory) throws Exception { log.info("Creating the IRMNG index from the DWCA " + archiveDirectory); //open the archive to extract the required information Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory)); Iterator<DarwinCoreRecord> it = archive.iteratorDwc(); while (it.hasNext()) { Document doc = new Document(); DarwinCoreRecord dwcr = it.next(); String kingdom = dwcr.getKingdom(); if (StringUtils.isNotEmpty(kingdom)) { doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); } String phylum = dwcr.getPhylum(); if (StringUtils.isNotEmpty(phylum)) { doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); } String classs = dwcr.getClasss(); if (StringUtils.isNotEmpty(classs)) { doc.add(new TextField(RankType.CLASS.getRank(), classs, Store.YES)); } String order = dwcr.getOrder(); if (StringUtils.isNotEmpty(order)) { doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); } String family = dwcr.getFamily(); if (StringUtils.isNotEmpty(family)) { doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); } String genus = dwcr.getGenus(); String calculatedRank = "genus"; if (StringUtils.isNotEmpty(genus)) { doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); String specificEpithet = dwcr.getSpecificEpithet(); if (StringUtils.isNotEmpty(specificEpithet)) { calculatedRank = "species"; doc.add(new TextField(RankType.SPECIES.getRank(), genus + " " + specificEpithet, Store.YES)); } } String rank = dwcr.getTaxonRank() != null ? 
dwcr.getTaxonRank() : calculatedRank; doc.add(new TextField(IndexField.RANK.toString(), rank, Store.YES)); //now add the author - we don't do anything about this on homonym resolution yet //Add the author information String author = dwcr.getScientificNameAuthorship(); if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); } //now add it to the index iw.addDocument(doc); } }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Indexes an IRMNG export for use in homonym resolution. * * @param iw//w w w . ja v a 2 s . co m * @param irmngExport * @throws Exception */ void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Exception { log.info("Creating IRMNG index ..."); File file = new File(irmngExport); if (file.exists()) { CSVReader reader = new CSVReader(new FileReader(file), '\t', '"', '~');// CSVReader.build(file,"UTF-8", "\t", 0); int count = 0; String[] values = null; while ((values = reader.readNext()) != null) { Document doc = new Document(); if (values != null && values.length >= 7) { doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES)); doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES)); doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES)); doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES)); doc.add(new TextField(RankType.FAMILY.getRank(), values[4], Store.YES)); doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES)); if (rank == RankType.GENUS) { doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES)); doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES)); doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES)); } else if (rank == RankType.SPECIES) { doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES)); } doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES)); iw.addDocument(doc); count++; } } iw.commit(); log.info("Finished indexing " + count + " IRMNG " + rank + " taxa."); } else log.warn("Unable to create IRMNG index. Can't locate " + irmngExport); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Adds the CoL common names to the common name index.
 * <p>
 * Only names whose taxon concept LSID exists in the current dump (as determined by
 * {@code doesTaxonConceptExist}) are added; CoL names receive the base boost of 1.0.
 *
 * @param iw the index writer to add the common name documents to
 * @param currentSearcher the searcher used to verify that the taxon concept LSID exists
 * @throws Exception if the CoL file cannot be read or a document cannot be indexed
 */
private void addCoLCommonNames(IndexWriter iw, IndexSearcher currentSearcher) throws Exception {
    File fileCol = new File(colFile);
    if (fileCol.exists()) {
        CSVReader reader = new CSVReader(new FileReader(fileCol), ',', '"', '~');
        int count = 0;
        try {
            String[] values;
            while ((values = reader.readNext()) != null) {
                // expect exactly: common name, language(?), taxon concept lsid
                if (values.length == 3) {
                    if (doesTaxonConceptExist(currentSearcher, values[2])) {
                        iw.addDocument(getCommonNameDocument(values[0], values[1], values[2], 1.0f));
                        count++;
                    } else {
                        // log rather than System.out for consistency with the rest of the indexer
                        log.warn("Unable to locate LSID " + values[2] + " in current dump");
                    }
                }
            }
        } finally {
            // ensure the reader is released even if indexing fails part way through
            reader.close();
        }
        log.info("Finished indexing " + count + " common names from " + fileCol);
    } else {
        log.warn("Unable to index common names. Unable to locate : " + fileCol);
    }
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Adds an ANBG CSV file of common names to the common name index. * * @param fileName The file name to add to the common name index * @param iw The index writer to write the common documents to * @param currentSearcher The searcher to find a scientific name * @param idSearcher The searcher to find an lsid * @param recordSep The record separator for the CSV file * @throws Exception//from w w w .j a v a 2 s . co m */ private void addAnbgCommonNames(String fileName, IndexWriter iw, IndexSearcher currentSearcher, IndexSearcher idSearcher, char recordSep) throws Exception { File namesFile = new File(fileName); Pattern p = Pattern.compile(","); if (namesFile.exists()) { CSVReader reader = new CSVReader(new FileReader(namesFile), recordSep, '"', '\\');//CSVReader.build(namesFile,"UTF-8","\t", '"' , 1); int count = 0; String[] values = reader.readNext(); while ((values = reader.readNext()) != null) { if (values != null && values.length >= 4) { //all ANBG records should have the highest boost as they are our authoritive source //we only want to add an ANBG record if the taxon concept LSID exists in the taxonConcepts.txt export if (doesTaxonConceptExist(currentSearcher, values[3]) || doesTaxonConceptExist(idSearcher, values[3])) { //each common name could be a comma separated list if (!values[2].contains(",") || values[2].toLowerCase().contains(" and ")) { iw.addDocument(getCommonNameDocument(values[2], null, values[3], 2.0f)); count++; } else { //we need to process each common name in the list String[] names = p.split(values[2]); for (String name : names) { iw.addDocument(getCommonNameDocument(name, null, values[3], 2.0f)); count++; } } } else { System.out.println("Unable to locate LSID " + values[3] + " in current dump"); } } } log.info("Finished indexing " + count + " common names from " + fileName); } else log.warn("Unable to index common names. Unable to locate : " + fileName); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates a temporary index that will provide a lookup up of lsid to "real lsid". * <p/>/*from w w w. j a v a 2 s . com*/ * This deals with the following situations: * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID) * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name. * * @param idFile * @throws Exception */ private void createExtraIdIndex(String idxLocation, File idFile) throws Exception { CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');//CSVReader.build(idFile, "UTF-8", "\t", '"', 0); File indexDir = new File(idxLocation); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);//new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED); String[] values = null; while ((values = reader.readNext()) != null) { if (values != null && values.length >= 3) { Document doc = new Document(); //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED)); doc.add(new StringField("lsid", values[2], Store.NO)); //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO)); doc.add(new StoredField("reallsid", values[1])); iw.addDocument(doc); } } iw.commit(); iw.forceMerge(1); iw.close(); idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates a temporary index that stores the taxon concept LSIDs that were * included in the last ANBG exports./*from w w w .ja va 2s . com*/ * * @param tcFileName * @return * @throws Exception */ private IndexSearcher createTmpIndex(String tcFileName) throws Exception { //creating the tmp index in the /tmp/taxonConcept directory CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~'); File indexDir = new File("/tmp/taxonConcept"); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); String[] values = null; while ((values = reader.readNext()) != null) { if (values != null && values.length > 1) { //just add the LSID to the index Document doc = new Document(); doc.add(new StringField("lsid", values[0], Store.NO)); iw.addDocument(doc); } } iw.commit(); iw.forceMerge(1); iw.close(); return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Index the common names CSV file supplied. * * CSV header need to be txaonId, taxonLsid, scientificName, vernacularName, languageCode, countryCode * * The languageCode and countryCode are not necessary as they are not used. * * @param iw/*from www.j av a2s . c o m*/ * @param file * @throws Exception */ private void indexCommonNames(IndexWriter iw, String file) throws Exception { //assumes that the quoted TSV file is in the following format //taxon id, taxon lsid, scientific name, vernacular name, language code, country code log.info("Starting to load the common names"); int i = 0, count = 0; au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 0); for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) { i++; if (values.length == 6) { //relies on having the same lsid supplied as the DWCA file String lsid = StringUtils.isNotEmpty(values[1]) ? values[1] : values[0]; //check to see if it exists TopDocs result = getLoadIdxResults("lsid", lsid, 1); if (result.totalHits > 0) { //we can add the common name Document doc = getCommonNameDocument(values[3], values[2], lsid, 1.0f, false); iw.addDocument(doc); count++; } } else { log.info("Issue on line " + i + " " + values[0]); } if (i % 1000 == 0) { log.info("Finished processing " + i + " common names with " + count + " added to index "); } } log.info("Finished processing " + i + " common names with " + count + " added to index "); iw.commit(); iw.forceMerge(1); iw.close(); }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Creates a loading index to use to generate the hierarchy including the left right values. * * @param tmpIndexDir/*w w w . j av a 2 s .c om*/ * @param archiveDirectory * @throws Exception */ private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception { log.info("Starting to create the temporary loading index."); File indexDir = new File(tmpIndexDir); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); //create the loading index so that left right values and classifications can be generated Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory)); Iterator<DarwinCoreRecord> it = archive.iteratorDwc(); int i = 0; long start = System.currentTimeMillis(); while (it.hasNext()) { Document doc = new Document(); DarwinCoreRecord dwcr = it.next(); String id = dwcr.getId(); String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID(); String acceptedLsid = dwcr.getAcceptedNameUsageID(); //add and store the identifier for the record doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES)); if (StringUtils.isNotBlank(lsid)) { doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES)); } else { System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid); } if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) { doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getScientificName())) { //stored no need to search on doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName())); } if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) { //stored no need to search on doc.add(new StoredField(NameIndexField.AUTHOR.toString(), 
dwcr.getScientificNameAuthorship())); } if (StringUtils.isNotBlank(dwcr.getGenus())) { //stored no need to search on doc.add(new StoredField("genus", dwcr.getGenus())); } if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getTaxonRank())) { //match the supplied rank RankType rt = RankType.getForStrRank(dwcr.getTaxonRank()); if (rt != null) { doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); } else { doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid) || acceptedLsid == null) { //mark this one as an accepted concept doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES)); if (StringUtils.isBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("root", "T", Field.Store.YES)); } } else { doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES)); } iw.addDocument(doc); i++; if (i % 1000 == 0) { long finish = System.currentTimeMillis(); log.debug("Loading index: " + i + " records per sec: " + (1000 / (((float) (finish / start)) / 1000))); start = finish; 
} } log.info("Finished creating the temporary load index with " + i + " concepts"); iw.commit(); iw.forceMerge(1); iw.close(); lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }