Example usage for the org.apache.lucene.document.StoredField constructor

Introduction

On this page you can find example usages of the org.apache.lucene.document.StoredField constructor.

Prototype

public StoredField(String name, double value)

Source Link

Document

Create a stored-only field with the given double value.

Usage

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates the temporary index that provides a lookup of checklist bank id to
 * GUID/*from w  w  w  .j  av  a  2 s  .  c  o m*/
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        Document doc = new Document();
        String id = values[POS_ID];
        String guid = values[POS_LSID];
        doc.add(new StringField("id", id, Store.YES));
        if (StringUtils.isEmpty(id))
            guid = id;

        doc.add(new StoredField("guid", guid));
        iw.addDocument(doc);
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    //As of lucene 4.0 all IndexReaders are read only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates a temporary index that will provide a lookup up of lsid to "real lsid".
 * <p/>/*from   w  w w .j a  v  a 2  s  . c  o  m*/
 * This deals with the following situations:
 * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID)
 * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name.
 *
 * @param idFile
 * @throws Exception
 */
private void createExtraIdIndex(String idxLocation, File idFile) throws Exception {
    CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');//CSVReader.build(idFile, "UTF-8", "\t", '"', 0);
    File indexDir = new File(idxLocation);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);//new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED);
    String[] values = null;
    while ((values = reader.readNext()) != null) {

        if (values != null && values.length >= 3) {
            Document doc = new Document();
            //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED));
            doc.add(new StringField("lsid", values[2], Store.NO));
            //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO));
            doc.add(new StoredField("reallsid", values[1]));
            iw.addDocument(doc);
        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Builds the index document for a single common name.
 * <p>
 * The document holds the normalised common name (boosted), optionally the
 * scientific name, and a stored-only LSID obtained from
 * {@code getAcceptedLSID(lsid)}.
 *
 * @param cn            the common name; must not be null
 * @param sn            the scientific name, or null to leave it out of the document
 * @param lsid          the LSID that is passed through {@code getAcceptedLSID}
 * @param boost         the boost applied to the common name field
 * @param checkAccepted not read by this method
 * @return the populated document
 */
protected Document getCommonNameDocument(String cn, String sn, String lsid, float boost,
        boolean checkAccepted) {
    // Only the characters allowed by the pattern below are kept from the upper-cased
    // common name; the search string has to go through the same operations.
    String normalisedCommonName = cn.toUpperCase().replaceAll("[^A-Z0-9??]", "");
    TextField commonNameField = new TextField(IndexField.COMMON_NAME.toString(), normalisedCommonName,
            Store.YES);
    commonNameField.setBoost(boost);

    Document result = new Document();
    result.add(commonNameField);
    if (sn != null) {
        result.add(new TextField(IndexField.NAME.toString(), sn, Store.YES));
    }
    result.add(new StoredField(IndexField.LSID.toString(), getAcceptedLSID(lsid)));
    return result;
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Builds the index document for one scientific name.
 * <p>
 * The document receives, in this order: the identifiers, the boosted name,
 * the rank, the synonym marker, the classification (rank name plus a
 * stored-only id per rank), the left/right values, the author, the canonical
 * and phrase-name fields derived from parsing the name, and finally the sound
 * expressions for genus, specific epithet and infraspecific epithet.
 *
 * @param name                 the scientific name
 * @param id                   the record identifier
 * @param lsid                 the LSID; must not be null
 * @param rank                 the rank identifier, added when not empty
 * @param rankString           the rank name, added when not empty
 * @param kingdom              kingdom name, added when not blank
 * @param kid                  kingdom id, stored when the kingdom is present and the id is not blank
 * @param phylum               phylum name, added when not blank
 * @param pid                  phylum id, stored under the same rule as {@code kid}
 * @param clazz                class name, added when not blank
 * @param cid                  class id, stored under the same rule as {@code kid}
 * @param order                order name, added when not blank
 * @param oid                  order id, stored under the same rule as {@code kid}
 * @param family               family name, added when not blank
 * @param fid                  family id, stored under the same rule as {@code kid}
 * @param genus                genus name, added when not blank
 * @param gid                  genus id, stored under the same rule as {@code kid}
 * @param species              species name, added when not blank
 * @param sid                  species id, stored under the same rule as {@code kid}
 * @param left                 left value, added when not blank
 * @param right                right value, added when not blank
 * @param acceptedConcept      accepted concept identifier; a non-empty value marks the name as a synonym
 * @param specificEpithet      specific epithet, or null to take it from the parsed name when that is a binomial
 * @param infraspecificEpithet infraspecific epithet, or null to take it from the parsed name in the same case
 * @param author               author string, added when not empty
 * @param boost                boost applied to the name field and to the canonical name field
 * @return the populated document, or null when the name is blacklisted
 */
private Document createALAIndexDocument(String name, String id, String lsid, String rank, String rankString,
        String kingdom, String kid, String phylum, String pid, String clazz, String cid, String order,
        String oid, String family, String fid, String genus, String gid, String species, String sid,
        String left, String right, String acceptedConcept, String specificEpithet, String infraspecificEpithet,
        String author, float boost) {
    if (isBlacklisted(name)) {
        System.out.println(name + " has been blacklisted");
        return null;
    }
    Document doc = new Document();

    //Add the ids
    doc.add(new StringField(NameIndexField.ID.toString(), id, Store.YES));

    doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Store.YES));
    if (lsid.startsWith("ALA")) {
        doc.add(new TextField(NameIndexField.ALA.toString(), "T", Store.NO));
    }

    //Add the scientific name information
    Field f = new TextField(NameIndexField.NAME.toString(), name, Store.YES);
    f.setBoost(boost);
    doc.add(f);

    //rank information
    if (StringUtils.isNotEmpty(rank)) {
        doc.add(new StringField(NameIndexField.RANK_ID.toString(), rank, Store.YES));
    }
    if (StringUtils.isNotEmpty(rankString)) {
        doc.add(new StringField(NameIndexField.RANK.toString(), rankString, Store.YES));
    }

    //handle the synonyms
    if (StringUtils.isNotEmpty(acceptedConcept)) {
        doc.add(new StringField(NameIndexField.ACCEPTED.toString(), acceptedConcept, Store.YES));
        doc.add(new TextField(NameIndexField.iS_SYNONYM.toString(), "T", Store.NO));
    } else {
        doc.add(new TextField(NameIndexField.iS_SYNONYM.toString(), "F", Store.NO));
    }

    //Add the classification information
    addRankClassification(doc, RankType.KINGDOM, kingdom, "kid", kid);
    addRankClassification(doc, RankType.PHYLUM, phylum, "pid", pid);
    addRankClassification(doc, RankType.CLASS, clazz, "cid", cid);
    addRankClassification(doc, RankType.ORDER, order, "oid", oid);
    addRankClassification(doc, RankType.FAMILY, family, "fid", fid);
    addRankClassification(doc, RankType.GENUS, genus, "gid", gid);
    addRankClassification(doc, RankType.SPECIES, species, "sid", sid);
    if (StringUtils.trimToNull(left) != null) {
        doc.add(new StringField("left", left, Store.YES));
    }
    if (StringUtils.trimToNull(right) != null) {
        doc.add(new StringField("right", right, Store.YES));
    }

    //Add the author information
    if (StringUtils.isNotEmpty(author)) {
        //TODO think about whether we need to treat the author string with the taxamatch
        doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES));
    }

    //Generate the canonical
    //add the canonical form of the name
    try {
        ParsedName cn = parser.parse(name);
        // Skip a scientific name with some informal addition like "cf." or indetermined like Abies spec.
        // ALSO prevent subgenus (rank "6500") because they parse down to genus plus author
        if (cn != null && cn.isParsableType() && !cn.isIndetermined() && cn.getType() != NameType.informal
                && !"6500".equals(rank) && cn.getType() != NameType.doubtful) {

            Field f2 = new TextField(NameIndexField.NAME.toString(), cn.canonicalName(), Store.YES);
            f2.setBoost(boost);
            doc.add(f2);
            if (specificEpithet == null && cn.isBinomial()) {
                //no epithet was supplied, so take the epithets (and the genus) from the parse
                genus = cn.getGenusOrAbove();
                specificEpithet = cn.getSpecificEpithet();
                if (infraspecificEpithet == null) {
                    infraspecificEpithet = cn.getInfraSpecificEpithet();
                }
            }
        }
        //check to see if the concept represents a phrase name
        if (cn instanceof ALAParsedName) {
            //set up the field type that is stored and Index.ANALYZED_NO_NORMS
            FieldType ft = new FieldType(TextField.TYPE_STORED);
            ft.setOmitNorms(true);
            ALAParsedName alapn = (ALAParsedName) cn;
            if ((!"sp.".equals(alapn.rank)) && alapn.specificEpithet != null) {
                doc.add(new Field(NameIndexField.SPECIFIC.toString(), alapn.getSpecificEpithet(), ft));
            } else if ((!"sp.".equals(alapn.rank)) && alapn.specificEpithet == null) {
                log.warn(lsid + " " + name + " has an empty specific for non sp. phrase");
            }
            if (StringUtils.trimToNull(alapn.getLocationPhraseDesciption()) != null) {
                doc.add(new Field(NameIndexField.PHRASE.toString(), alapn.cleanPhrase, ft));
            }
            if (alapn.getPhraseVoucher() != null) {
                doc.add(new Field(NameIndexField.VOUCHER.toString(), alapn.cleanVoucher, ft));
            }
            if (StringUtils.isBlank(genus) && StringUtils.isNotBlank(alapn.getGenusOrAbove())) {
                //add the genus to the index as it is necessary to match on the phrase name.
                doc.add(new TextField(RankType.GENUS.getRank(), alapn.getGenusOrAbove(), Store.YES));
            }

        }
    } catch (org.gbif.ecat.parser.UnparsableException e) {
        //check to see if the name is a virus in which case an extra name is added without the virus key word
        if (e.type == NameType.virus) {
            doc.add(new TextField(NameIndexField.NAME.toString(),
                    ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), Store.YES));
        }

    } catch (Exception e) {
        // Best effort: the document is still returned without the canonical fields, but the
        // failure goes to the log (with its cause) instead of being dumped on stderr.
        log.warn(lsid + " " + name + " has issues generating the canonical name: " + e.getMessage(), e);
    }

    //add the sound expressions for the name if required
    try {
        if (StringUtils.isNotBlank(genus)) {
            doc.add(new TextField(NameIndexField.GENUS_EX.toString(),
                    TaxonNameSoundEx.treatWord(genus, "genus"), Store.YES));
        }
        if (StringUtils.isNotBlank(specificEpithet)) {
            doc.add(new TextField(NameIndexField.SPECIES_EX.toString(),
                    TaxonNameSoundEx.treatWord(specificEpithet, "species"), Store.YES));
        } else if (StringUtils.isNotBlank(genus)) {
            doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), "<null>", Store.YES));
        }
        if (StringUtils.isNotBlank(infraspecificEpithet)) {
            doc.add(new TextField(NameIndexField.INFRA_EX.toString(),
                    TaxonNameSoundEx.treatWord(infraspecificEpithet, "species"), Store.YES));
        } else if (StringUtils.isNotBlank(specificEpithet)) {
            //make searching for an empty infraspecific soundex easier
            doc.add(new TextField(NameIndexField.INFRA_EX.toString(), "<null>", Store.YES));
        }
    } catch (Exception e) {
        log.warn(lsid + " " + name + " has issues creating a soundex: " + e.getMessage());
    }

    return doc;

}

/**
 * Adds one level of the classification to the document: the rank name as a stored text
 * field and, when present, its identifier as a stored-only field. Nothing is added when
 * the rank name is blank.
 */
private void addRankClassification(Document doc, RankType rankType, String rankName, String idField,
        String rankId) {
    if (StringUtils.trimToNull(rankName) != null) {
        doc.add(new TextField(rankType.getRank(), rankName, Store.YES));
        if (StringUtils.isNotBlank(rankId)) {
            doc.add(new StoredField(idField, rankId));
        }
    }
}

From source file:au.org.ala.names.search.DwcaNameIndexer.java

License:Open Source License

/**
 * Creates a loading index to use to generate the hierarchy including the left right values.
 * <p>
 * Every record of the Darwin Core archive becomes one document. Identifiers, parent,
 * accepted concept, rank and the synonym/root markers are indexed and stored; name,
 * author, genus and the epithets are stored only. Once the index is written the
 * {@code lsearcher} field is pointed at it.
 *
 * @param tmpIndexDir      directory in which the temporary index is written
 * @param archiveDirectory directory of the Darwin Core archive to load
 * @throws Exception if the archive cannot be read or the index cannot be written
 */
private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception {
    log.info("Starting to create the temporary loading index.");
    File indexDir = new File(tmpIndexDir);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    //create the loading index so that left right values and classifications can be generated
    Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory));
    Iterator<DarwinCoreRecord> it = archive.iteratorDwc();
    int i = 0;
    long start = System.currentTimeMillis();
    while (it.hasNext()) {
        Document doc = new Document();
        DarwinCoreRecord dwcr = it.next();
        String id = dwcr.getId();
        String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID();
        String acceptedLsid = dwcr.getAcceptedNameUsageID();
        //add and store the identifier for the record
        doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES));
        if (StringUtils.isNotBlank(lsid)) {
            doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES));
        } else {
            System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid);
        }
        if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) {
            doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) {
            doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(),
                    Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificName())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName()));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.AUTHOR.toString(), dwcr.getScientificNameAuthorship()));
        }
        if (StringUtils.isNotBlank(dwcr.getGenus())) {
            //stored no need to search on
            doc.add(new StoredField("genus", dwcr.getGenus()));
        }
        if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getTaxonRank())) {
            //match the supplied rank
            RankType rt = RankType.getForStrRank(dwcr.getTaxonRank());
            if (rt != null) {
                doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(),
                        Field.Store.YES));
            } else {
                //unrecognised rank: keep the supplied text but file it under the unranked id
                doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(),
                        Field.Store.YES));
            }
        } else {
            //put in unknown rank
            doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES));
            doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(),
                    Field.Store.YES));
        }
        if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid)
                || acceptedLsid == null) {
            //mark this one as an accepted concept
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES));
            if (StringUtils.isBlank(dwcr.getParentNameUsageID())) {
                //an accepted concept without a parent is a root of the hierarchy
                doc.add(new StringField("root", "T", Field.Store.YES));
            }
        } else {
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES));
        }
        iw.addDocument(doc);
        i++;
        if (i % 1000 == 0) {
            long finish = System.currentTimeMillis();
            // Throughput of the last 1000 records. The elapsed time is the DIFFERENCE of the
            // two timestamps; the previous code divided them, which reported a constant
            // meaningless figure.
            float elapsedSeconds = (finish - start) / 1000f;
            log.debug("Loading index: " + i + " records per sec: " + (1000 / elapsedSeconds));
            start = finish;
        }
    }
    log.info("Finished creating the temporary load index with " + i + " concepts");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.utils.NewsItemLuceneDocConverter.java

License:Apache License

/**
 * Converts a NewsItem to a Lucene Document.
 *
 * @param item/*from www .  ja  va  2  s  .co  m*/
 * @return
 */
public static Document newsItemToDocument(NewsItem item) {
    Document doc = new Document();
    FieldType ftype = getTextType();

    if (item.getTitle() != null) {
        doc.add(new Field("title", item.getTitle(), ftype));
    }
    if (item.getFulltext() != null) {
        doc.add(new Field("text", item.getFulltext(), ftype));
    }
    if (item.getDescription() != null) {
        doc.add(new Field("description", item.getDescription(), ftype));
    }

    doc.add(new StringField("id", item.getId(), Field.Store.YES));

    if (item.getUrl() != null) {
        doc.add(new StringField("url", item.getUrl().toString(), Field.Store.YES));
    }
    if (item.getImageUrl() != null) {
        System.out.println("image : " + item.getImageUrl());
        doc.add(new StringField("imageUrl", item.getImageUrl().toString(), Field.Store.YES));
    }
    if (item.getLocale() != null) {
        doc.add(new StringField("locale", item.getLocale().getISO3Language(), Field.Store.YES));
    }
    if (item.getSource() != null) {
        doc.add(new StringField("source", item.getSource(), Field.Store.YES));
    }
    if (item.getTimestamp() != null) {
        doc.add(new LongField("timestamp", item.getTimestamp().getTime(), Field.Store.YES));
    }
    for (String author : item.getAuthors()) {
        doc.add(new StringField("author", author, Field.Store.YES));
    }

    Map<String, Double> terms = item.getTerms();
    String termsJson = gson.toJson(terms);
    doc.add(new StoredField("terms", termsJson));

    return doc;
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

/**
 * Adds one document per entity profile to the given index.
 * <p>
 * Each document stores a running number (starting at 0) under {@code DOC_ID}
 * and one {@code VALUE_LABEL} field per non-empty, trimmed blocking key of
 * every attribute value. An {@link IOException} stops the indexing and is
 * logged at SEVERE level; it is not rethrown.
 *
 * @param index    the writer that receives the documents
 * @param entities the profiles to index, in document-id order
 */
protected void indexEntities(IndexWriter index, List<EntityProfile> entities) {
    int nextDocId = 0;
    try {
        for (EntityProfile entity : entities) {
            final Document document = new Document();
            document.add(new StoredField(DOC_ID, nextDocId++));
            for (Attribute attribute : entity.getAttributes()) {
                getBlockingKeys(attribute.getValue()).stream()
                        .map(String::trim)
                        .filter(blockingKey -> !blockingKey.isEmpty())
                        .forEach(blockingKey -> document
                                .add(new StringField(VALUE_LABEL, blockingKey, Field.Store.YES)));
            }
            index.addDocument(document);
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file:BlockBuilding.AbstractIndexBasedMethod.java

License:Open Source License

/**
 * Adds one document per entity profile to the given index and counts the
 * indexed keys.
 * <p>
 * Each document stores a running number (starting at 0) under {@code DOC_ID}
 * and one {@code VALUE_LABEL} field per non-empty, trimmed blocking key of
 * every attribute value; {@code totalWords} is incremented once per added
 * key. An {@link IOException} stops the indexing and its stack trace is
 * printed; it is not rethrown.
 *
 * @param index    the writer that receives the documents
 * @param entities the profiles to index, in document-id order
 */
protected void indexEntities(IndexWriter index, List<EntityProfile> entities) {
    int nextDocId = 0;
    try {
        for (EntityProfile entity : entities) {
            final Document document = new Document();
            document.add(new StoredField(DOC_ID, nextDocId++));
            for (Attribute attribute : entity.getAttributes()) {
                getBlockingKeys(attribute.getValue()).stream()
                        .map(String::trim)
                        .filter(blockingKey -> !blockingKey.isEmpty())
                        .forEach(blockingKey -> {
                            document.add(new StringField(VALUE_LABEL, blockingKey, Field.Store.YES));
                            totalWords++;
                        });
            }
            index.addDocument(document);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

From source file:BlockBuilding.AbstractTYPiMatch.java

License:Open Source License

/**
 * Adds one document per entity profile to the given index.
 * <p>
 * Each document stores a running number (starting at 0) under {@code DOC_ID}
 * and one {@code VALUE_LABEL} field per non-empty, trimmed token of every
 * attribute value. Unless {@code firstPass} is set, every token is suffixed
 * with {@code CLUSTER_PREFIX + entityTypes[entityCounter] + CLUSTER_SUFFIX},
 * and {@code entityCounter} advances by one per profile. An
 * {@link IOException} stops the indexing and its stack trace is printed; it is
 * not rethrown.
 *
 * @param index    the writer that receives the documents
 * @param entities the profiles to index, in document-id order
 */
@Override
protected void indexEntities(IndexWriter index, List<EntityProfile> entities) {
    int nextDocId = 0;
    try {
        for (EntityProfile entity : entities) {
            Document document = new Document();
            document.add(new StoredField(DOC_ID, nextDocId++));

            // the type suffix is only known after the first pass
            String typeSuffix = firstPass ? ""
                    : CLUSTER_PREFIX + entityTypes[entityCounter++] + CLUSTER_SUFFIX;

            for (Attribute attribute : entity.getAttributes()) {
                for (String rawToken : getTokens(attribute.getValue())) {
                    String token = rawToken.trim();
                    if (token.isEmpty()) {
                        continue;
                    }
                    document.add(new StringField(VALUE_LABEL, token + typeSuffix, Field.Store.YES));
                }
            }

            index.addDocument(document);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

From source file:BlockBuilding.AttributeClusteringBlocking.java

License:Apache License

/**
 * Adds one document per entity profile of the given source to the index.
 * <p>
 * Each document stores a running number (starting at 0) under {@code DOC_ID}
 * and one {@code VALUE_LABEL} field per non-empty, trimmed token of every
 * attribute value, suffixed with
 * {@code CLUSTER_PREFIX + clusterId + CLUSTER_SUFFIX}. The cluster id is
 * looked up by attribute name in {@code attributeClusters[sourceId]}; an
 * attribute without a cluster id is skipped with a warning. An
 * {@link IOException} stops the indexing and is logged at SEVERE level; it is
 * not rethrown.
 *
 * @param sourceId position of the source in {@code attributeClusters}
 * @param index    the writer that receives the documents
 * @param entities the profiles to index, in document-id order
 */
protected void indexEntities(int sourceId, IndexWriter index, List<EntityProfile> entities) {
    int nextDocId = 0;
    try {
        for (EntityProfile entity : entities) {
            Document document = new Document();
            document.add(new StoredField(DOC_ID, nextDocId++));
            for (Attribute attribute : entity.getAttributes()) {
                Integer clusterId = attributeClusters[sourceId].get(attribute.getName());
                if (clusterId != null) {
                    String clusterSuffix = CLUSTER_PREFIX + clusterId + CLUSTER_SUFFIX;
                    for (String rawToken : getTokens(attribute.getValue())) {
                        String token = rawToken.trim();
                        if (!token.isEmpty()) {
                            document.add(new StringField(VALUE_LABEL, token + clusterSuffix, Field.Store.YES));
                        }
                    }
                } else {
                    LOGGER.log(Level.WARNING,
                            "No cluster id found for attribute name\t:\t{0}"
                                    + ".\nCorresponding attribute value\t:\t{1}",
                            new Object[] { attribute.getName(), attribute.getValue() });
                }
            }

            index.addDocument(document);
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}