Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriter#addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
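
Before the project-specific examples below, here is a minimal, self-contained sketch of the call in isolation. It is not taken from any of the source files on this page: the index location and the field names ("id", "title") are illustrative assumptions, and it assumes a recent Lucene API in which FSDirectory.open takes a java.nio.file.Path and addDocument returns a long sequence number.

import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws Exception {
        //illustrative temporary location; any Directory implementation will do
        Path indexPath = Files.createTempDirectory("lucene-adddocument-example");
        try (Directory dir = FSDirectory.open(indexPath);
                IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            //"id" and "title" are made-up field names for this sketch
            doc.add(new StringField("id", "doc-1", Store.YES));
            doc.add(new TextField("title", "Adding a document to a Lucene index", Store.YES));
            //addDocument buffers the document; commit() makes it visible to new readers
            long seqNo = iw.addDocument(doc);
            iw.commit();
            System.out.println("Added document with sequence number " + seqNo);
        }
    }
}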

Usage

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

private void addALASyonyms(IndexWriter iw, String file) throws Exception {
    au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t',
            '"', '\\', 1);
    for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {

        String source = values[11];
        //give CoL synonyms a lower boost than NSL
        float boost = source.trim().equals("") || source.equalsIgnoreCase("CoL") ? 0.75f : 1.0f;
        Document doc = createALASynonymDocument(values[5], values[6], values[0], values[1], values[2],
                values[3], values[4], boost, values[9]);
        if (doc != null)
            iw.addDocument(doc);
    }
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

private void indexALA(IndexWriter iw, String file, String synonymFile) throws Exception {
    int records = 0;
    long time = System.currentTimeMillis();
    au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t',
            '"', '\\', 1);
    for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {

        String lsid = values[POS_LSID];
        String id = values[POS_ID];
        String rank = values[POS_RANK];
        int rankId = -1;
        try {
            rankId = Integer.parseInt(values[POS_RANK_ID]);
        } catch (Exception e) {
            //leave rankId as -1 when the rank id is missing or not numeric
        }

        String acceptedValues = values[POS_ACC_LSID];
        float boost = 1.0f;
        //give the major ranks a larger boost
        if (rankId % 1000 == 0) {
            boost = 5.0f;
        }
        //give non-col concepts a higher boost
        String source = values[POS_SRC];
        if (!source.trim().equals("") && !source.equalsIgnoreCase("CoL")) {
            boost = boost * 2;
        }

        Document doc = createALAIndexDocument(values[POS_SCI_NAME], id, lsid, values[POS_RANK_ID],
                values[POS_RANK], values[POS_K], values[POS_KID], values[POS_P], values[POS_PID], values[POS_C],
                values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G],
                values[POS_GID], values[POS_S], values[POS_SID], values[POS_LFT], values[POS_RGT],
                acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], boost);

        if (doc != null) {
            //add the excluded information if applicable
            if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) {
                doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(),
                        SynonymType.EXCLUDES.getId().toString(), Store.YES));
            }
            iw.addDocument(doc);
            records++;
            if (records % 100000 == 0) {
                log.info("Processed " + records + " in " + (System.currentTimeMillis() - time) + " msecs");
            }
        }
    }
    addExtraALAConcept(iw, extraALAConcepts);
    //add the synonyms
    addALASyonyms(iw, synonymFile);
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    log.info("Lucene index created - processed a total of " + records + " records in "
            + (System.currentTimeMillis() - time) + " msecs ");
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

/**
 * Indexes the IRMNG homonyms from the supplied DWCA directory
 * @param iw The index writer to write the lucene docs to
 * @param archiveDirectory  The directory in which the IRMNG DWCA has been unzipped.
 * @throws Exception
 */
protected void indexIrmngDwcA(IndexWriter iw, String archiveDirectory) throws Exception {
    log.info("Creating the IRMNG index from the DWCA " + archiveDirectory);
    //open the archive to extract the required information
    Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory));
    Iterator<DarwinCoreRecord> it = archive.iteratorDwc();
    while (it.hasNext()) {
        Document doc = new Document();
        DarwinCoreRecord dwcr = it.next();
        String kingdom = dwcr.getKingdom();
        if (StringUtils.isNotEmpty(kingdom)) {
            doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES));
        }
        String phylum = dwcr.getPhylum();
        if (StringUtils.isNotEmpty(phylum)) {
            doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES));
        }
        String classs = dwcr.getClasss(); //"classs" avoids the Java keyword "class"
        if (StringUtils.isNotEmpty(classs)) {
            doc.add(new TextField(RankType.CLASS.getRank(), classs, Store.YES));
        }
        String order = dwcr.getOrder();
        if (StringUtils.isNotEmpty(order)) {
            doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES));
        }
        String family = dwcr.getFamily();
        if (StringUtils.isNotEmpty(family)) {
            doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES));
        }
        String genus = dwcr.getGenus();
        String calculatedRank = "genus";
        if (StringUtils.isNotEmpty(genus)) {
            doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES));
            String specificEpithet = dwcr.getSpecificEpithet();
            if (StringUtils.isNotEmpty(specificEpithet)) {
                calculatedRank = "species";
                doc.add(new TextField(RankType.SPECIES.getRank(), genus + " " + specificEpithet, Store.YES));
            }
        }
        String rank = dwcr.getTaxonRank() != null ? dwcr.getTaxonRank() : calculatedRank;
        doc.add(new TextField(IndexField.RANK.toString(), rank, Store.YES));
        //now add the author - we don't do anything about this on homonym resolution yet
        //Add the author information
        String author = dwcr.getScientificNameAuthorship();
        if (StringUtils.isNotEmpty(author)) {
            //TODO think about whether we need to treat the author string with the taxamatch
            doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES));
        }
        //now add it to the index
        iw.addDocument(doc);

    }
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

/**
 * Indexes an IRMNG export for use in homonym resolution.
 *
 * @param iw The index writer to add the IRMNG documents to
 * @param irmngExport The path to the IRMNG export file
 * @param rank The rank of the taxa contained in the export
 * @throws Exception
 */
void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Exception {
    log.info("Creating IRMNG index ...");
    File file = new File(irmngExport);
    if (file.exists()) {
        CSVReader reader = new CSVReader(new FileReader(file), '\t', '"', '~');// CSVReader.build(file,"UTF-8", "\t", 0);
        int count = 0;
        String[] values = null;
        while ((values = reader.readNext()) != null) {
            Document doc = new Document();
            if (values != null && values.length >= 7) {
                doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES));
                doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES));
                doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES));
                doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES));
                doc.add(new TextField(RankType.FAMILY.getRank(), values[4], Store.YES));
                doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES));
                if (rank == RankType.GENUS) {
                    doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES));
                    doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES));
                    doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES));
                } else if (rank == RankType.SPECIES) {
                    doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES));
                }
                doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES));
                iw.addDocument(doc);
                count++;
            }

        }
        iw.commit();

        log.info("Finished indexing " + count + " IRMNG " + rank + " taxa.");
    } else
        log.warn("Unable to create IRMNG index.  Can't locate " + irmngExport);
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

/**
 * Adds the CoL common names to the common name index.
 * @param iw The index writer for the common name index
 * @param currentSearcher The searcher used to check that the taxon concept LSID exists
 * @throws Exception
 *
 */
private void addCoLCommonNames(IndexWriter iw, IndexSearcher currentSearcher) throws Exception {
    File fileCol = new File(colFile);
    if (fileCol.exists()) {
        CSVReader reader = new CSVReader(new FileReader(fileCol), ',', '"', '~');
        int count = 0;
        String[] values = null;
        while ((values = reader.readNext()) != null) {
            if (values.length == 3) {
                if (doesTaxonConceptExist(currentSearcher, values[2])) {
                    iw.addDocument(getCommonNameDocument(values[0], values[1], values[2], 1.0f));
                    count++;
                } else {
                    System.out.println("Unable to locate LSID " + values[2] + " in current dump");
                }
            }

        }
        log.info("Finished indexing " + count + " common names from " + fileCol);
    } else
        log.warn("Unable to index common names. Unable to locate : " + fileCol);
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

/**
 * Adds an ANBG CSV file of common names to the common name index.
 *
 * @param fileName The file name to add to the common name index
 * @param iw  The index writer to write the common documents to
 * @param currentSearcher The searcher to find a scientific name
 * @param idSearcher  The searcher to find an lsid
 * @param recordSep The record separator for the CSV file
 * @throws Exception
 */
private void addAnbgCommonNames(String fileName, IndexWriter iw, IndexSearcher currentSearcher,
        IndexSearcher idSearcher, char recordSep) throws Exception {
    File namesFile = new File(fileName);
    Pattern p = Pattern.compile(",");
    if (namesFile.exists()) {
        CSVReader reader = new CSVReader(new FileReader(namesFile), recordSep, '"', '\\');//CSVReader.build(namesFile,"UTF-8","\t", '"' , 1);
        int count = 0;
        String[] values = reader.readNext(); //read and discard the header row
        while ((values = reader.readNext()) != null) {
            if (values != null && values.length >= 4) {
                //all ANBG records should have the highest boost as they are our authoritative source
                //we only want to add an ANBG record if the taxon concept LSID exists in the taxonConcepts.txt export
                if (doesTaxonConceptExist(currentSearcher, values[3])
                        || doesTaxonConceptExist(idSearcher, values[3])) {
                    //each common name could be a comma separated list
                    if (!values[2].contains(",") || values[2].toLowerCase().contains(" and ")) {
                        iw.addDocument(getCommonNameDocument(values[2], null, values[3], 2.0f));
                        count++;
                    } else {
                        //we need to process each common name in the list
                        String[] names = p.split(values[2]);
                        for (String name : names) {
                            iw.addDocument(getCommonNameDocument(name, null, values[3], 2.0f));
                            count++;
                        }
                    }
                } else {
                    System.out.println("Unable to locate LSID " + values[3] + " in current dump");
                }
            }

        }
        log.info("Finished indexing " + count + " common names from " + fileName);
    } else
        log.warn("Unable to index common names. Unable to locate : " + fileName);
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

/**
 * Creates a temporary index that provides a lookup of lsid to "real lsid".
 * <p/>
 * This deals with the following situations:
 * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID)
 * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name.
 *
 * @param idxLocation The directory in which to create the temporary index
 * @param idFile The file containing the lsid to "real lsid" mappings
 * @throws Exception
 */
private void createExtraIdIndex(String idxLocation, File idFile) throws Exception {
    CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');//CSVReader.build(idFile, "UTF-8", "\t", '"', 0);
    File indexDir = new File(idxLocation);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);//new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED);
    String[] values = null;
    while ((values = reader.readNext()) != null) {

        if (values != null && values.length >= 3) {
            Document doc = new Document();
            //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED));
            doc.add(new StringField("lsid", values[2], Store.NO));
            //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO));
            doc.add(new StoredField("reallsid", values[1]));
            iw.addDocument(doc);
        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License: Open Source License

/**
 * Creates a temporary index that stores the taxon concept LSIDs that were
 * included in the last ANBG exports.
 *
 * @param tcFileName
 * @return
 * @throws Exception
 */
private IndexSearcher createTmpIndex(String tcFileName) throws Exception {
    //creating the tmp index in the /tmp/taxonConcept directory
    CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~');
    File indexDir = new File("/tmp/taxonConcept");
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    String[] values = null;
    while ((values = reader.readNext()) != null) {
        if (values != null && values.length > 1) {
            //just add the LSID to the index
            Document doc = new Document();

            doc.add(new StringField("lsid", values[0], Store.NO));
            iw.addDocument(doc);

        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.DwcaNameIndexer.java

License: Open Source License

/**
 * Index the common names CSV file supplied.
 *
 * The CSV header needs to be taxonId, taxonLsid, scientificName, vernacularName, languageCode, countryCode
 *
 * The languageCode and countryCode are not necessary as they are not used.
 *
 * @param iw The index writer for the common name index
 * @param file The common names CSV file
 * @throws Exception
 */
private void indexCommonNames(IndexWriter iw, String file) throws Exception {
    //assumes that the quoted TSV file is in the following format
    //taxon id, taxon lsid, scientific name, vernacular name, language code, country code
    log.info("Starting to load the common names");
    int i = 0, count = 0;
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file),
            '\t', '"', '\\', 0);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        i++;
        if (values.length == 6) {
            //relies on having the same lsid supplied as the DWCA file
            String lsid = StringUtils.isNotEmpty(values[1]) ? values[1] : values[0];
            //check to see if it exists
            TopDocs result = getLoadIdxResults("lsid", lsid, 1);
            if (result.totalHits > 0) {
                //we can add the common name
                Document doc = getCommonNameDocument(values[3], values[2], lsid, 1.0f, false);
                iw.addDocument(doc);
                count++;
            }
        } else {
            log.info("Issue on line " + i + "  " + values[0]);
        }
        if (i % 1000 == 0) {
            log.info("Finished processing " + i + " common names with " + count + " added to index ");
        }
    }
    log.info("Finished processing " + i + " common names with " + count + " added to index ");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
}

From source file:au.org.ala.names.search.DwcaNameIndexer.java

License: Open Source License

/**
 * Creates a loading index to use to generate the hierarchy including the left right values.
 *
 * @param tmpIndexDir The directory in which to create the temporary loading index
 * @param archiveDirectory The directory in which the DWCA has been unzipped
 * @throws Exception
 */
private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception {
    log.info("Starting to create the temporary loading index.");
    File indexDir = new File(tmpIndexDir);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    //create the loading index so that left right values and classifications can be generated
    Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory));
    Iterator<DarwinCoreRecord> it = archive.iteratorDwc();
    int i = 0;
    long start = System.currentTimeMillis();
    while (it.hasNext()) {
        Document doc = new Document();
        DarwinCoreRecord dwcr = it.next();
        String id = dwcr.getId();
        String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID();
        String acceptedLsid = dwcr.getAcceptedNameUsageID();
        //add and store the identifier for the record
        doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES));
        if (StringUtils.isNotBlank(lsid)) {
            doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES));
        } else {
            System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid);
        }
        if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) {
            doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) {
            doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(),
                    Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificName())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName()));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.AUTHOR.toString(), dwcr.getScientificNameAuthorship()));
        }
        if (StringUtils.isNotBlank(dwcr.getGenus())) {
            //stored no need to search on
            doc.add(new StoredField("genus", dwcr.getGenus()));
        }
        if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getTaxonRank())) {
            //match the supplied rank
            RankType rt = RankType.getForStrRank(dwcr.getTaxonRank());
            if (rt != null) {
                doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(),
                        Field.Store.YES));
            } else {
                doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(),
                        Field.Store.YES));
            }
        } else {
            //put in unknown rank
            doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES));
            doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(),
                    Field.Store.YES));
        }
        if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid)
                || acceptedLsid == null) {
            //mark this one as an accepted concept
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES));
            if (StringUtils.isBlank(dwcr.getParentNameUsageID())) {
                doc.add(new StringField("root", "T", Field.Store.YES));
            }
        } else {
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES));
        }
        iw.addDocument(doc);
        i++;
        if (i % 1000 == 0) {
            long finish = System.currentTimeMillis();
            log.debug("Loading index: " + i + " records per sec: "
                    + (1000 / (((float) (finish / start)) / 1000)));
            start = finish;
        }
    }
    log.info("Finished creating the temporary load index with " + i + " concepts");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}