List of usage examples for org.apache.lucene.index.IndexWriter#addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
private void addALASyonyms(IndexWriter iw, String file) throws Exception { au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 1); for (String[] values = reader.readNext(); values != null; values = reader.readNext()) { String source = values[11]; //give CoL synonyms a lower boost than NSL float boost = source.trim().equals("") || source.equalsIgnoreCase("CoL") ? 0.75f : 1.0f; Document doc = createALASynonymDocument(values[5], values[6], values[0], values[1], values[2], values[3], values[4], boost, values[9]); if (doc != null) iw.addDocument(doc); }/*from w w w .jav a2 s. c o m*/ }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
private void indexALA(IndexWriter iw, String file, String synonymFile) throws Exception { int records = 0; long time = System.currentTimeMillis(); au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 1); for (String[] values = reader.readNext(); values != null; values = reader.readNext()) { String lsid = values[POS_LSID]; String id = values[POS_ID]; String rank = values[POS_RANK]; int rankId = -1; try {//from w w w . j av a2s.co m rankId = Integer.parseInt(values[POS_RANK_ID]); } catch (Exception e) { } String acceptedValues = values[POS_ACC_LSID]; float boost = 1.0f; //give the major ranks a larger boost if (rankId % 1000 == 0) { boost = 5.0f; } //give non-col concepts a higher boost String source = values[POS_SRC]; if (!source.trim().equals("") && !source.equalsIgnoreCase("CoL")) { boost = boost * 2; } Document doc = createALAIndexDocument(values[POS_SCI_NAME], id, lsid, values[POS_RANK_ID], values[POS_RANK], values[POS_K], values[POS_KID], values[POS_P], values[POS_PID], values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID], values[POS_LFT], values[POS_RGT], acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], boost); //add the excluded information if applicable if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) { doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), SynonymType.EXCLUDES.getId().toString(), Store.YES)); } if (doc != null) { iw.addDocument(doc); records++; if (records % 100000 == 0) { log.info("Processed " + records + " in " + (System.currentTimeMillis() - time) + " msecs"); } } } addExtraALAConcept(iw, extraALAConcepts); //add the synonyms addALASyonyms(iw, synonymFile); iw.commit(); iw.forceMerge(1); iw.close(); log.info("Lucene index created - processed a total of " + records + " records in " + 
(System.currentTimeMillis() - time) + " msecs "); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Indexes the IRMNG homonyms from the supplied DWCA direcory * @param iw The index writer to write the lucene docs to * @param archiveDirectory The directory in which the IRMNG DWCA has been unzipped. * @throws Exception/*from w w w . j ava 2 s . co m*/ */ protected void indexIrmngDwcA(IndexWriter iw, String archiveDirectory) throws Exception { log.info("Creating the IRMNG index from the DWCA " + archiveDirectory); //open the archive to extract the required information Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory)); Iterator<DarwinCoreRecord> it = archive.iteratorDwc(); while (it.hasNext()) { Document doc = new Document(); DarwinCoreRecord dwcr = it.next(); String kingdom = dwcr.getKingdom(); if (StringUtils.isNotEmpty(kingdom)) { doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); } String phylum = dwcr.getPhylum(); if (StringUtils.isNotEmpty(phylum)) { doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); } String classs = dwcr.getClasss(); if (StringUtils.isNotEmpty(classs)) { doc.add(new TextField(RankType.CLASS.getRank(), classs, Store.YES)); } String order = dwcr.getOrder(); if (StringUtils.isNotEmpty(order)) { doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); } String family = dwcr.getFamily(); if (StringUtils.isNotEmpty(family)) { doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); } String genus = dwcr.getGenus(); String calculatedRank = "genus"; if (StringUtils.isNotEmpty(genus)) { doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); String specificEpithet = dwcr.getSpecificEpithet(); if (StringUtils.isNotEmpty(specificEpithet)) { calculatedRank = "species"; doc.add(new TextField(RankType.SPECIES.getRank(), genus + " " + specificEpithet, Store.YES)); } } String rank = dwcr.getTaxonRank() != null ? 
dwcr.getTaxonRank() : calculatedRank; doc.add(new TextField(IndexField.RANK.toString(), rank, Store.YES)); //now add the author - we don't do anything about this on homonym resolution yet //Add the author information String author = dwcr.getScientificNameAuthorship(); if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); } //now add it to the index iw.addDocument(doc); } }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Indexes an IRMNG export for use in homonym resolution. * * @param iw//w w w . ja v a 2 s . co m * @param irmngExport * @throws Exception */ void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Exception { log.info("Creating IRMNG index ..."); File file = new File(irmngExport); if (file.exists()) { CSVReader reader = new CSVReader(new FileReader(file), '\t', '"', '~');// CSVReader.build(file,"UTF-8", "\t", 0); int count = 0; String[] values = null; while ((values = reader.readNext()) != null) { Document doc = new Document(); if (values != null && values.length >= 7) { doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES)); doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES)); doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES)); doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES)); doc.add(new TextField(RankType.FAMILY.getRank(), values[4], Store.YES)); doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES)); if (rank == RankType.GENUS) { doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES)); doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES)); doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES)); } else if (rank == RankType.SPECIES) { doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES)); } doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES)); iw.addDocument(doc); count++; } } iw.commit(); log.info("Finished indexing " + count + " IRMNG " + rank + " taxa."); } else log.warn("Unable to create IRMNG index. Can't locate " + irmngExport); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Adds the CoL common names to the common name index.
 * <p>
 * Only names whose taxon concept LSID exists in the current dump (as determined by
 * {@code doesTaxonConceptExist}) are added; CoL names receive the base boost of 1.0.
 *
 * @param iw the index writer to add the common name documents to
 * @param currentSearcher the searcher used to verify that the taxon concept LSID exists
 * @throws Exception if the CoL file cannot be read or a document cannot be indexed
 */
private void addCoLCommonNames(IndexWriter iw, IndexSearcher currentSearcher) throws Exception {
    File fileCol = new File(colFile);
    if (fileCol.exists()) {
        CSVReader reader = new CSVReader(new FileReader(fileCol), ',', '"', '~');
        int count = 0;
        try {
            String[] values;
            while ((values = reader.readNext()) != null) {
                // expect exactly: common name, language(?), taxon concept lsid
                if (values.length == 3) {
                    if (doesTaxonConceptExist(currentSearcher, values[2])) {
                        iw.addDocument(getCommonNameDocument(values[0], values[1], values[2], 1.0f));
                        count++;
                    } else {
                        // log rather than System.out for consistency with the rest of the indexer
                        log.warn("Unable to locate LSID " + values[2] + " in current dump");
                    }
                }
            }
        } finally {
            // ensure the reader is released even if indexing fails part way through
            reader.close();
        }
        log.info("Finished indexing " + count + " common names from " + fileCol);
    } else {
        log.warn("Unable to index common names. Unable to locate : " + fileCol);
    }
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Adds an ANBG CSV file of common names to the common name index. * * @param fileName The file name to add to the common name index * @param iw The index writer to write the common documents to * @param currentSearcher The searcher to find a scientific name * @param idSearcher The searcher to find an lsid * @param recordSep The record separator for the CSV file * @throws Exception//from w w w .j a v a 2 s . co m */ private void addAnbgCommonNames(String fileName, IndexWriter iw, IndexSearcher currentSearcher, IndexSearcher idSearcher, char recordSep) throws Exception { File namesFile = new File(fileName); Pattern p = Pattern.compile(","); if (namesFile.exists()) { CSVReader reader = new CSVReader(new FileReader(namesFile), recordSep, '"', '\\');//CSVReader.build(namesFile,"UTF-8","\t", '"' , 1); int count = 0; String[] values = reader.readNext(); while ((values = reader.readNext()) != null) { if (values != null && values.length >= 4) { //all ANBG records should have the highest boost as they are our authoritive source //we only want to add an ANBG record if the taxon concept LSID exists in the taxonConcepts.txt export if (doesTaxonConceptExist(currentSearcher, values[3]) || doesTaxonConceptExist(idSearcher, values[3])) { //each common name could be a comma separated list if (!values[2].contains(",") || values[2].toLowerCase().contains(" and ")) { iw.addDocument(getCommonNameDocument(values[2], null, values[3], 2.0f)); count++; } else { //we need to process each common name in the list String[] names = p.split(values[2]); for (String name : names) { iw.addDocument(getCommonNameDocument(name, null, values[3], 2.0f)); count++; } } } else { System.out.println("Unable to locate LSID " + values[3] + " in current dump"); } } } log.info("Finished indexing " + count + " common names from " + fileName); } else log.warn("Unable to index common names. Unable to locate : " + fileName); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates a temporary index that will provide a lookup up of lsid to "real lsid". * <p/>/*from w w w. j a v a 2 s . com*/ * This deals with the following situations: * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID) * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name. * * @param idFile * @throws Exception */ private void createExtraIdIndex(String idxLocation, File idFile) throws Exception { CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');//CSVReader.build(idFile, "UTF-8", "\t", '"', 0); File indexDir = new File(idxLocation); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);//new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED); String[] values = null; while ((values = reader.readNext()) != null) { if (values != null && values.length >= 3) { Document doc = new Document(); //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED)); doc.add(new StringField("lsid", values[2], Store.NO)); //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO)); doc.add(new StoredField("reallsid", values[1])); iw.addDocument(doc); } } iw.commit(); iw.forceMerge(1); iw.close(); idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/** * Creates a temporary index that stores the taxon concept LSIDs that were * included in the last ANBG exports./*from w w w .ja va 2s . com*/ * * @param tcFileName * @return * @throws Exception */ private IndexSearcher createTmpIndex(String tcFileName) throws Exception { //creating the tmp index in the /tmp/taxonConcept directory CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~'); File indexDir = new File("/tmp/taxonConcept"); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); String[] values = null; while ((values = reader.readNext()) != null) { if (values != null && values.length > 1) { //just add the LSID to the index Document doc = new Document(); doc.add(new StringField("lsid", values[0], Store.NO)); iw.addDocument(doc); } } iw.commit(); iw.forceMerge(1); iw.close(); return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Index the common names CSV file supplied. * * CSV header need to be txaonId, taxonLsid, scientificName, vernacularName, languageCode, countryCode * * The languageCode and countryCode are not necessary as they are not used. * * @param iw/*from www.j av a2s . c o m*/ * @param file * @throws Exception */ private void indexCommonNames(IndexWriter iw, String file) throws Exception { //assumes that the quoted TSV file is in the following format //taxon id, taxon lsid, scientific name, vernacular name, language code, country code log.info("Starting to load the common names"); int i = 0, count = 0; au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t', '"', '\\', 0); for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) { i++; if (values.length == 6) { //relies on having the same lsid supplied as the DWCA file String lsid = StringUtils.isNotEmpty(values[1]) ? values[1] : values[0]; //check to see if it exists TopDocs result = getLoadIdxResults("lsid", lsid, 1); if (result.totalHits > 0) { //we can add the common name Document doc = getCommonNameDocument(values[3], values[2], lsid, 1.0f, false); iw.addDocument(doc); count++; } } else { log.info("Issue on line " + i + " " + values[0]); } if (i % 1000 == 0) { log.info("Finished processing " + i + " common names with " + count + " added to index "); } } log.info("Finished processing " + i + " common names with " + count + " added to index "); iw.commit(); iw.forceMerge(1); iw.close(); }
From source file:au.org.ala.names.search.DwcaNameIndexer.java
License:Open Source License
/** * Creates a loading index to use to generate the hierarchy including the left right values. * * @param tmpIndexDir/*w w w . j av a 2 s .c om*/ * @param archiveDirectory * @throws Exception */ private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception { log.info("Starting to create the temporary loading index."); File indexDir = new File(tmpIndexDir); IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true); //create the loading index so that left right values and classifications can be generated Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory)); Iterator<DarwinCoreRecord> it = archive.iteratorDwc(); int i = 0; long start = System.currentTimeMillis(); while (it.hasNext()) { Document doc = new Document(); DarwinCoreRecord dwcr = it.next(); String id = dwcr.getId(); String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID(); String acceptedLsid = dwcr.getAcceptedNameUsageID(); //add and store the identifier for the record doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES)); if (StringUtils.isNotBlank(lsid)) { doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES)); } else { System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid); } if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) { doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(), Field.Store.YES)); } if (StringUtils.isNotBlank(dwcr.getScientificName())) { //stored no need to search on doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName())); } if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) { //stored no need to search on doc.add(new StoredField(NameIndexField.AUTHOR.toString(), 
dwcr.getScientificNameAuthorship())); } if (StringUtils.isNotBlank(dwcr.getGenus())) { //stored no need to search on doc.add(new StoredField("genus", dwcr.getGenus())); } if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) { //stored no need to search on doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet())); } if (StringUtils.isNotBlank(dwcr.getTaxonRank())) { //match the supplied rank RankType rt = RankType.getForStrRank(dwcr.getTaxonRank()); if (rt != null) { doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); } else { doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); } if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid) || acceptedLsid == null) { //mark this one as an accepted concept doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES)); if (StringUtils.isBlank(dwcr.getParentNameUsageID())) { doc.add(new StringField("root", "T", Field.Store.YES)); } } else { doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES)); } iw.addDocument(doc); i++; if (i % 1000 == 0) { long finish = System.currentTimeMillis(); log.debug("Loading index: " + i + " records per sec: " + (1000 / (((float) (finish / start)) / 1000))); start = finish; 
} } log.info("Finished creating the temporary load index with " + i + " concepts"); iw.commit(); iw.forceMerge(1); iw.close(); lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir))); }