Example usage for org.apache.lucene.analysis.core KeywordAnalyzer KeywordAnalyzer

List of usage examples for org.apache.lucene.analysis.core KeywordAnalyzer KeywordAnalyzer

Introduction

On this page you can find an example usage for org.apache.lucene.analysis.core KeywordAnalyzer KeywordAnalyzer.

Prototype

public KeywordAnalyzer() 

Source Link

Usage

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Builds the full set of name-matching indexes under {@code indexDir}.
 *
 * @param exportsDir          directory holding the export files (identifiers, homonyms, ...)
 * @param indexDir            root directory into which the sub-indexes are written
 * @param acceptedFile        accepted-concepts dump used for the scientific name index
 * @param synonymFile         synonyms dump used for the scientific name index
 * @param irmngDwca           path to the IRMNG DwC archive
 * @param generateSciNames    when true, build the scientific name and IRMNG indexes
 * @param generateCommonNames when true, build the vernacular (common name) index
 * @throws Exception on any read or index-write failure
 */
public void createIndex(String exportsDir, String indexDir, String acceptedFile, String synonymFile,
        String irmngDwca, boolean generateSciNames, boolean generateCommonNames) throws Exception {

    Analyzer sciNameAnalyzer = new LowerCaseKeywordAnalyzer();

    // Always build the supplementary identifier index first.
    File identifiersFile = new File(exportsDir + File.separator + "identifiers.txt");
    createExtraIdIndex(indexDir + File.separator + "id", identifiersFile);

    if (generateSciNames) {
        // Scientific-name index: accepted concepts plus synonyms.
        IndexWriter cbWriter = createIndexWriter(new File(indexDir + File.separator + "cb"),
                sciNameAnalyzer, true);
        indexALA(cbWriter, acceptedFile, synonymFile);

        // IRMNG index, used downstream to resolve homonyms.
        IndexWriter irmngWriter = createIndexWriter(new File(indexDir + File.separator + "irmng"),
                sciNameAnalyzer, true);
        indexIrmngDwcA(irmngWriter, irmngDwca);
        indexIRMNG(irmngWriter, exportsDir + File.separator + "ala-species-homonyms.txt", RankType.SPECIES);
        irmngWriter.forceMerge(1);
        irmngWriter.close();
    }

    if (generateCommonNames) {
        // Vernacular index: exact-match terms, so the plain KeywordAnalyzer suffices.
        IndexWriter vernacularWriter = createIndexWriter(new File(indexDir + File.separator + "vernacular"),
                new KeywordAnalyzer(), true);
        indexCommonNames(vernacularWriter, exportsDir, indexDir);
    }
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates the temporary index that provides a lookup of checklist bank id to
 * GUID//  www  . j a  v  a 2  s.  c  o m
 */
/**
 * Creates the temporary index that provides a lookup of checklist bank id to GUID.
 *
 * @param cbExportFile tab-separated checklist bank export; column {@code POS_ID} holds the
 *                     record id and column {@code POS_LSID} the GUID/LSID
 * @return a searcher over the freshly written index at /data/tmp/guid
 * @throws Exception if the export cannot be read or the index cannot be written
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    try {
        for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
            Document doc = new Document();
            String id = values[POS_ID];
            String guid = values[POS_LSID];
            doc.add(new StringField("id", id, Store.YES));
            // BUG FIX: the original tested isEmpty(id), which replaced a valid guid with an
            // empty id. The intended fallback is to use the id when no GUID was supplied.
            if (StringUtils.isEmpty(guid))
                guid = id;

            doc.add(new StoredField("guid", guid));
            iw.addDocument(doc);
        }
    } finally {
        cbreader.close(); // avoid leaking the underlying FileReader
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    //As of lucene 4.0 all IndexReaders are read only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates a temporary index that will provide a lookup up of lsid to "real lsid".
 * <p/>/*w  w  w .j av a  2 s .c  o  m*/
 * This deals with the following situations:
 * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID)
 * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name.
 *
 * @param idFile
 * @throws Exception
 */
/**
 * Creates a temporary index that will provide a lookup of lsid to "real lsid".
 * <p/>
 * This deals with the following situations:
 * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID)
 * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name.
 *
 * @param idxLocation directory in which the lookup index is written
 * @param idFile      tab-separated file; column 2 is the lsid, column 1 the "real" lsid
 * @throws Exception if the file cannot be read or the index cannot be written
 */
private void createExtraIdIndex(String idxLocation, File idFile) throws Exception {
    CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');
    File indexDir = new File(idxLocation);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    try {
        String[] values;
        while ((values = reader.readNext()) != null) {
            // Skip short/malformed rows; we need at least columns 1 and 2.
            if (values.length >= 3) {
                Document doc = new Document();
                // Searchable but not stored: lookups are by lsid.
                doc.add(new StringField("lsid", values[2], Store.NO));
                // Stored only: returned as the mapping target.
                doc.add(new StoredField("reallsid", values[1]));
                iw.addDocument(doc);
            }
        }
    } finally {
        reader.close(); // FIX: the reader (and its FileReader) was previously leaked
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates a temporary index that stores the taxon concept LSIDs that were
 * included in the last ANBG exports.//from   www. j  a va2  s. c o m
 *
 * @param tcFileName
 * @return
 * @throws Exception
 */
/**
 * Creates a temporary index that stores the taxon concept LSIDs that were
 * included in the last ANBG exports.
 *
 * @param tcFileName tab-separated taxon concept export; column 0 holds the LSID
 * @return a searcher over the temporary index at /tmp/taxonConcept
 * @throws Exception if the file cannot be read or the index cannot be written
 */
private IndexSearcher createTmpIndex(String tcFileName) throws Exception {
    //creating the tmp index in the /tmp/taxonConcept directory
    CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~');
    File indexDir = new File("/tmp/taxonConcept");
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    try {
        String[] values;
        while ((values = reader.readNext()) != null) {
            if (values.length > 1) {
                //just add the LSID to the index
                Document doc = new Document();
                doc.add(new StringField("lsid", values[0], Store.NO));
                iw.addDocument(doc);
            }
        }
    } finally {
        reader.close(); // FIX: the reader (and its FileReader) was previously leaked
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.DwcaNameIndexer.java

License:Open Source License

/**
 * Creates the name matching index based on a complete list of names supplied in a single DwCA
 *
 * @param loadingIndex True when the loading index should be created. This is necessary to generate the index, but you may wish to skip this step if it has be generated earlier
 * @param sciIndex True when the name matching index should be generated
 * @param indexDirectory The directory in which to create the name matching index
 * @param tmpLoadIndex The directory in which to create the temporary loading index
 * @param namesDwc The absolute path to the directory that contains the unzipped DWC archive to index
 * @param irmngDwc The absolute path to the directory that contains the unzipped IRMNG DWCA
 * @param commonNameFile/*from w  ww  . j  a v a2  s.co  m*/
 * @throws Exception
 */
/**
 * Creates the name matching index based on a complete list of names supplied in a single DwCA.
 *
 * @param loadingIndex   true when the temporary loading index should be (re)created; skip if
 *                       it was generated earlier
 * @param sciIndex       true when the scientific name matching index should be generated
 * @param indexDirectory directory in which to create the name matching index
 * @param tmpLoadIndex   directory in which to create the temporary loading index
 * @param namesDwc       absolute path to the unzipped DwC archive to index
 * @param irmngDwc       absolute path to the unzipped IRMNG DwCA (may be missing)
 * @param commonNameFile common names file (may be missing)
 * @throws Exception on any read or index-write failure
 */
public void create(boolean loadingIndex, boolean sciIndex, String indexDirectory, String tmpLoadIndex,
        String namesDwc, String irmngDwc, String commonNameFile) throws Exception {
    dirTmpIndex = tmpLoadIndex;
    LowerCaseKeywordAnalyzer nameAnalyzer = new LowerCaseKeywordAnalyzer();

    if (loadingIndex) {
        createLoadingIndex(tmpLoadIndex, namesDwc);
    }

    if (sciIndex) {
        // Scientific-name index: base concepts first, then synonyms.
        writer = createIndexWriter(new File(indexDirectory + File.separator + "cb"), nameAnalyzer, true);
        generateIndex();
        addSynonymsToIndex(namesDwc);
        writer.commit();
        writer.forceMerge(1);
        writer.close();
    }

    // IRMNG is optional; only index it when the archive directory is present.
    boolean haveIrmng = irmngDwc != null && new File(irmngDwc).exists();
    if (haveIrmng) {
        IndexWriter irmngWriter = createIndexWriter(new File(indexDirectory + File.separator + "irmng"),
                nameAnalyzer, true);
        this.indexIrmngDwcA(irmngWriter, irmngDwc);
        irmngWriter.forceMerge(1);
        irmngWriter.close();
    }

    // Common names are optional too; exact-match terms, so KeywordAnalyzer suffices.
    boolean haveCommonNames = commonNameFile != null && new File(commonNameFile).exists();
    if (haveCommonNames) {
        IndexWriter vernacularWriter = createIndexWriter(
                new File(indexDirectory + File.separator + "vernacular"), new KeywordAnalyzer(), true);
        indexCommonNames(vernacularWriter, commonNameFile);
    }
}

From source file:au.org.ala.names.search.DwcaNameIndexer.java

License:Open Source License

/**
 * Creates a loading index to use to generate the hierarchy including the left right values.
 *
 * @param tmpIndexDir//from ww  w .j  ava 2  s .  c  o  m
 * @param archiveDirectory
 * @throws Exception
 */
/**
 * Creates a loading index to use to generate the hierarchy including the left right values.
 * <p>
 * Each DwC record becomes one Lucene document carrying the identifiers, name parts, rank,
 * and a synonym flag, so later passes can walk the taxonomy without re-reading the archive.
 *
 * @param tmpIndexDir      directory in which the temporary loading index is written
 * @param archiveDirectory directory holding the unzipped DwC archive
 * @throws Exception if the archive cannot be read or the index cannot be written
 */
private void createLoadingIndex(String tmpIndexDir, String archiveDirectory) throws Exception {
    log.info("Starting to create the temporary loading index.");
    File indexDir = new File(tmpIndexDir);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    //create the loading index so that left right values and classifications can be generated
    Archive archive = ArchiveFactory.openArchive(new File(archiveDirectory));
    Iterator<DarwinCoreRecord> it = archive.iteratorDwc();
    int i = 0;
    long start = System.currentTimeMillis();
    while (it.hasNext()) {
        Document doc = new Document();
        DarwinCoreRecord dwcr = it.next();
        String id = dwcr.getId();
        // Prefer the explicit taxonID; fall back to the core record id.
        String lsid = dwcr.getTaxonID() == null ? id : dwcr.getTaxonID();
        String acceptedLsid = dwcr.getAcceptedNameUsageID();
        //add and store the identifier for the record
        doc.add(new StringField(NameIndexField.ID.toString(), dwcr.getId(), Field.Store.YES));
        if (StringUtils.isNotBlank(lsid)) {
            doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Field.Store.YES));
        } else {
            // NOTE(review): lsid is printed twice here; one occurrence was possibly meant
            // to be another field — confirm against the original intent before changing.
            System.out.println("LSID is null for " + id + " " + lsid + " " + lsid + " " + acceptedLsid);
        }
        if (StringUtils.isNotBlank(dwcr.getParentNameUsageID())) {
            doc.add(new StringField("parent_id", dwcr.getParentNameUsageID(), Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getAcceptedNameUsageID())) {
            doc.add(new StringField(NameIndexField.ACCEPTED.toString(), dwcr.getAcceptedNameUsageID(),
                    Field.Store.YES));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificName())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.NAME.toString(), dwcr.getScientificName()));
        }
        if (StringUtils.isNotBlank(dwcr.getScientificNameAuthorship())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.AUTHOR.toString(), dwcr.getScientificNameAuthorship()));
        }
        if (StringUtils.isNotBlank(dwcr.getGenus())) {
            //stored no need to search on
            doc.add(new StoredField("genus", dwcr.getGenus()));
        }
        if (StringUtils.isNotBlank(dwcr.getSpecificEpithet())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.SPECIFIC.toString(), dwcr.getSpecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getInfraspecificEpithet())) {
            //stored no need to search on
            doc.add(new StoredField(NameIndexField.INFRA_SPECIFIC.toString(), dwcr.getInfraspecificEpithet()));
        }
        if (StringUtils.isNotBlank(dwcr.getTaxonRank())) {
            //match the supplied rank; fall back to UNRANKED id for unknown rank strings
            RankType rt = RankType.getForStrRank(dwcr.getTaxonRank());
            if (rt != null) {
                doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(),
                        Field.Store.YES));
            } else {
                doc.add(new StringField(NameIndexField.RANK.toString(), dwcr.getTaxonRank(), Field.Store.YES));
                doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(),
                        Field.Store.YES));
            }
        } else {
            //put in unknown rank
            doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES));
            doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(),
                    Field.Store.YES));
        }
        // A record is accepted when it points at itself (or nothing) as the accepted usage.
        if (StringUtils.equals(lsid, acceptedLsid) || StringUtils.equals(id, acceptedLsid)
                || acceptedLsid == null) {
            //mark this one as an accepted concept
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Field.Store.YES));
            if (StringUtils.isBlank(dwcr.getParentNameUsageID())) {
                doc.add(new StringField("root", "T", Field.Store.YES));
            }
        } else {
            doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Field.Store.YES));
        }
        iw.addDocument(doc);
        i++;
        if (i % 1000 == 0) {
            long finish = System.currentTimeMillis();
            // BUG FIX: the original computed (finish / start) — a ratio of absolute
            // timestamps — instead of the elapsed time (finish - start), producing a
            // meaningless rate. records/sec = 1000 records / elapsed seconds.
            log.debug("Loading index: " + i + " records per sec: "
                    + (1000 / ((finish - start) / 1000f)));
            start = finish;
        }
    }
    log.info("Finished creating the temporary load index with " + i + " concepts");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    lsearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:com.bah.lucene.BaseDirectoryTestSuite.java

License:Apache License

@Test
public void testCreateIndex() throws IOException {
    long startNs = System.nanoTime();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
    FSDirectory control = FSDirectory.open(fileControl);
    Directory testDir = getControlDir(control, directory);
    // The serial merge scheduler can be useful for debugging.
    // config.setMergeScheduler(new SerialMergeScheduler());
    IndexWriter writer = new IndexWriter(testDir, config);
    int batchSize = 10000;
    DirectoryReader reader = null;
    for (int round = 0; round < 100; round++) {
        if (reader == null) {
            // First round: open an NRT reader directly from the writer.
            reader = DirectoryReader.open(writer, true);
        } else {
            // Subsequent rounds: refresh, keeping the old reader if nothing changed.
            DirectoryReader previous = reader;
            DirectoryReader refreshed = DirectoryReader.openIfChanged(previous, writer, true);
            if (refreshed != null) {
                previous.close();
                reader = refreshed;
            }
        }
        // Each earlier round added one full batch.
        assertEquals(round * batchSize, reader.numDocs());
        IndexSearcher searcher = new IndexSearcher(reader);
        // Every batch contains exactly one doc with id == 42.
        NumericRangeQuery<Integer> idQuery = NumericRangeQuery.newIntRange("id", 42, 42, true, true);
        TopDocs hits = searcher.search(idQuery, 10);
        assertEquals(round, hits.totalHits);
        addDocuments(writer, batchSize);
    }
    writer.close(false);
    reader.close();
    long endNs = System.nanoTime();
    System.out.println("Total time [" + (endNs - startNs) / 1000000.0 + " ms]");
}

From source file:com.bah.lucene.blockcache_v2.CacheDirectoryTest.java

License:Apache License

@Test
public void test3() throws IOException, InterruptedException {
    // Thread.sleep(30000);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
    IndexWriter writer = new IndexWriter(_cacheDirectory, config);
    int totalDocs = 100000;
    for (int n = 0; n < totalDocs; n++) {
        // Progress marker every 500 documents.
        if (n % 500 == 0) {
            System.out.println(n);
        }
        writer.addDocument(newDoc());
        // Thread.sleep(1);
    }
    writer.close();
    System.out.println("done writing");

    // Reopen through the cache directory and verify everything made it in.
    DirectoryReader reader = DirectoryReader.open(_cacheDirectory);
    System.out.println("done opening");
    assertEquals(totalDocs, reader.numDocs());

    Document first = reader.document(0);
    System.out.println("done fetching");
    System.out.println(first);

    // Every doc carries test=test, so the query should match them all.
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(new TermQuery(new Term("test", "test")), 10);
    System.out.println("done searching");
    assertEquals(totalDocs, hits.totalHits);

    reader.close();
}

From source file:com.basistech.lucene.tools.LuceneQueryTool.java

License:Apache License

/**
 * Builds a query tool over the given reader, collecting every field name that
 * appears in any leaf segment. Defaults: KeywordAnalyzer, unlimited output,
 * multiline formatting.
 *
 * @param reader the index to query
 * @param out    destination stream for results
 * @throws IOException if the reader's segments cannot be inspected
 */
LuceneQueryTool(IndexReader reader, PrintStream out) throws IOException {
    this.indexReader = reader;
    this.defaultOut = out;
    this.outputLimit = Integer.MAX_VALUE;
    this.analyzer = new KeywordAnalyzer();
    this.fieldNames = Lists.newArrayList();
    this.formatter = Formatter.newInstance(Formatter.Format.MULTILINE, false);
    // Gather the union of field names across all segments, sorted.
    allFieldNames = Sets.newTreeSet();
    for (LeafReaderContext leafContext : reader.leaves()) {
        for (FieldInfo info : leafContext.reader().getFieldInfos()) {
            allFieldNames.add(info.name);
        }
    }
}

From source file:com.basistech.lucene.tools.LuceneQueryTool.java

License:Apache License

/**
 * Selects the query analyzer by name.
 *
 * @param analyzerString either "KeywordAnalyzer" or "StandardAnalyzer"
 * @throws RuntimeException for any other value (including null)
 */
void setAnalyzer(String analyzerString) {
    // Constant-on-the-left equals keeps this null-safe.
    if ("KeywordAnalyzer".equals(analyzerString)) {
        this.analyzer = new KeywordAnalyzer();
        return;
    }
    if ("StandardAnalyzer".equals(analyzerString)) {
        this.analyzer = new StandardAnalyzer();
        return;
    }
    throw new RuntimeException(String.format("Invalid analyzer %s: %s", analyzerString,
            "Only KeywordAnalyzer and StandardAnalyzer currently supported"));
}