Example usage for org.apache.lucene.document TextField setStringValue

List of usage examples for org.apache.lucene.document TextField setStringValue

Introduction

On this page you can find example usages of the setStringValue method of org.apache.lucene.document.TextField.

Prototype

public void setStringValue(String value) 

Source Link

Document

Expert: change the value of this field.

Usage

From source file:com.bericotech.clavin.index.IndexDirectoryBuilder.java

License:Apache License

/**
 * Indexes one GeoName as a group of Lucene documents: one document per
 * distinct name the GeoName is known by.  Every document in the group
 * carries the same gazetteer record, IDs, ancestry, population and feature
 * data; only the indexed name differs.  See {@link IndexField} for
 * descriptions of the fields indexed for each document.
 *
 * @param geoName       the GeoName to index
 * @throws IOException  if an error occurs while indexing
 */
private void indexGeoName(final GeoName geoName) throws IOException {
    indexCount++;

    // collect the distinct names: primary, ASCII and all alternates
    String primaryName = geoName.getName();
    String asciiName = geoName.getAsciiName();
    Set<String> uniqueNames = new HashSet<String>();
    uniqueNames.add(primaryName);
    uniqueNames.add(asciiName);
    uniqueNames.addAll(geoName.getAlternateNames());

    // top-level administrative divisions are also searchable by their
    // primary and alternate country codes
    if (geoName.isTopLevelAdminDivision()) {
        if (geoName.getPrimaryCountryCode() != null) {
            uniqueNames.add(geoName.getPrimaryCountryCode().name());
        }
        for (CountryCode alternateCode : geoName.getAlternateCountryCodes()) {
            uniqueNames.add(alternateCode.name());
        }
    }

    // the preferred name (if one was loaded) must be among the indexed names
    AlternateName preferred = alternateNameMap.get(geoName.getGeonameID());
    boolean hasPreferred = preferred != null;
    if (hasPreferred) {
        uniqueNames.add(preferred.name);
    }

    // null and empty names are never indexed
    uniqueNames.remove(null);
    uniqueNames.remove("");

    // a single Document (and a single name field, below) is reused for every name
    Document sharedDoc = new Document();
    sharedDoc.add(new StoredField(GEONAME.key(),
            fullAncestry ? geoName.getGazetteerRecordWithAncestry() : geoName.getGazetteerRecord()));
    sharedDoc.add(new IntField(GEONAME_ID.key(), geoName.getGeonameID(), Field.Store.YES));
    if (hasPreferred) {
        sharedDoc.add(new StoredField(PREFERRED_NAME.key(), preferred.name));
    }

    // PARENT_ID holds only the direct parent ...
    GeoName directParent = geoName.getParent();
    if (directParent != null) {
        sharedDoc.add(new IntField(PARENT_ID.key(), directParent.getGeonameID(), Field.Store.YES));
    }
    // ... while ANCESTOR_IDS holds every ancestor (direct parent included), so
    // it can restrict searches and PARENT_ID can be used for ancestor resolution
    for (GeoName ancestor = directParent; ancestor != null; ancestor = ancestor.getParent()) {
        sharedDoc.add(new IntField(ANCESTOR_IDS.key(), ancestor.getGeonameID(), Field.Store.YES));
    }

    sharedDoc.add(new LongField(POPULATION.key(), geoName.getPopulation(), Field.Store.YES));

    // sort field: cities and countries are boosted when sorting results by population
    boolean cityOrCountry = geoName.getFeatureClass().equals(FeatureClass.P)
            || geoName.getFeatureCode().name().startsWith("PCL");
    if (!cityOrCountry) {
        // don't boost anything else, because people rarely talk about other stuff
        // (e.g., Washington State's population is more than 10x that of Washington, DC
        // but Washington, DC is mentioned far more frequently than Washington State)
        sharedDoc.add(new LongField(SORT_POP.key(), geoName.getPopulation(), Field.Store.YES));
    } else if (geoName.getGeonameID() != 2643741) {
        // todo: temporary hack until GeoNames.org fixes the population for City of London;
        // GeoName 2643741 gets no SORT_POP field at all
        sharedDoc.add(new LongField(SORT_POP.key(), geoName.getPopulation() * 11, Field.Store.YES));
    }

    sharedDoc.add(new IntField(HISTORICAL.key(),
            IndexField.getBooleanIndexValue(geoName.getFeatureCode().isHistorical()), Field.Store.NO));
    sharedDoc.add(new StringField(FEATURE_CODE.key(), geoName.getFeatureCode().name(), Field.Store.NO));

    // write the shared document once per name, swapping only the name field's value
    TextField indexedName = new TextField(INDEX_NAME.key(), "", Field.Store.YES);
    sharedDoc.add(indexedName);
    for (String currentName : uniqueNames) {
        indexedName.setStringValue(currentName);
        indexWriter.addDocument(sharedDoc);
    }
}

From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAllCountries.java

License:Apache License

/**
 * Indexes a tab-separated gazetteer file, one Lucene document per name.
 *
 * For every input line, column 0 is used as the ID and column 1 as the
 * primary phrase.  Column 3 is split on commas and every non-empty piece is
 * indexed as an additional document with the same ID.  Each phrase is
 * lower-cased before it is stored, and its n-gram fields are filled from
 * {@code getIndexFeatures} / {@code getUnigram} / {@code getBigram} /
 * {@code getTrigram}.  The LANG field is always the empty string here.
 *
 * A progress line is printed to standard output every 10000 input lines.
 *
 * @param br  reader over the gazetteer file; it is read to the end but not closed
 * @param iw  writer that receives the documents
 * @throws IOException           if reading the input or writing the index fails
 * @throws InterruptedException  declared for callers; nothing in this method throws it directly
 */
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {

    // One Document and one set of Field instances are reused for every name;
    // only the field values change between addDocument calls.
    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    // LANG keeps its initial empty value for every document written by this method.
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);
    // A field that is not attached to the document is never written to the index.
    d.add(lang);

    String line;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0) {
            System.out.println(linen + "\n" + line);
        }
        String[] column = line.trim().split("\t");

        // get other columns except for the location words
        String id = column[0];
        String phrase = column[1].toLowerCase();
        String otherlang = column[3];

        // the ID is shared by the primary phrase and all of its other-language names
        nfid.setStringValue(id);

        sforigin.setStringValue(phrase);
        normnws.setStringValue(phrase.replace(" ", ""));

        // getIndexFeatures prepares the values returned by the n-gram getters
        getIndexFeatures(phrase);
        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());

        iw.addDocument(d);

        if (otherlang.length() == 0) {
            continue;
        }
        for (String rawOther : otherlang.split(",")) {
            if (rawOther.length() == 0) {
                continue;
            }
            // id does not change, change the strings to index.
            String oneOtherlang = rawOther.toLowerCase();
            sforigin.setStringValue(oneOtherlang);
            normnws.setStringValue(oneOtherlang.replace(" ", ""));

            getIndexFeatures(oneOtherlang);
            sfunigram.setStringValue(getUnigram());
            sfbigram.setStringValue(getBigram());
            sftrigram.setStringValue(getTrigram());

            iw.addDocument(d);
        }
    }
}

From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAltNames.java

License:Apache License

/**
 * Indexes a tab-separated alternate-names file, one Lucene document per line.
 *
 * For every input line, column 1 is used as the ID, column 2 as the LANG
 * value and column 3 as the phrase.  The phrase is lower-cased before it is
 * stored, and its n-gram fields are filled from {@code getIndexFeatures} /
 * {@code getUnigram} / {@code getBigram} / {@code getTrigram}.
 *
 * A progress line is printed to standard output every 10000 input lines.
 *
 * @param br  reader over the alternate-names file; it is read to the end but not closed
 * @param iw  writer that receives the documents
 * @throws IOException           if reading the input or writing the index fails
 * @throws InterruptedException  declared for callers; nothing in this method throws it directly
 */
void indexAlterNames(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {

    // One Document and one set of Field instances are reused for every line;
    // only the field values change between addDocument calls.
    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);
    // LANG must be attached like the other fields; a field that is not part of
    // the document is never written, and the value set below would be lost.
    d.add(lang);

    String line;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0) {
            System.out.println(linen + "\n" + line);
        }
        String[] column = line.trim().split("\t");

        // get other columns except for the location words
        String id = column[1];
        String langOrLink = column[2];
        String phrase = column[3].toLowerCase();

        nfid.setStringValue(id);
        sforigin.setStringValue(phrase);
        normnws.setStringValue(phrase.replace(" ", ""));

        // getIndexFeatures prepares the values returned by the n-gram getters
        getIndexFeatures(phrase);
        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());

        lang.setStringValue(langOrLink);

        iw.addDocument(d);
    }
}

From source file:perf.IndexGeoNames.java

License:Apache License

/**
 * Indexing benchmark: reads a tab-separated GeoNames file with several
 * threads and indexes one document per line, printing elapsed time every
 * 200000 documents and once more at the end.  While the worker threads
 * run, the main thread repeatedly reopens a reader obtained from the writer.
 *
 * Arguments:
 * <ol>
 *   <li>{@code args[0]} - path of the GeoNames input file (read as UTF-8)</li>
 *   <li>{@code args[1]} - index directory; must not exist yet</li>
 *   <li>{@code args[2]} - number of indexing threads</li>
 *   <li>{@code args[3]} - numeric precision step for the int/long/double field types</li>
 * </ol>
 *
 * The writer configuration depends on {@code normal}, which is defined
 * outside this method.
 *
 * @param args  command-line arguments as described above
 * @throws IllegalArgumentException  if the index directory already exists
 * @throws Exception                 if opening, reading, or indexing fails on the main thread
 */
public static void main(String[] args) throws Exception {
    String geoNamesFile = args[0];
    File indexPath = new File(args[1]);
    int numThreads = Integer.parseInt(args[2]);
    int precStep = Integer.parseInt(args[3]);
    if (indexPath.exists()) {
        throw new IllegalArgumentException("please remove indexPath \"" + indexPath + "\" before running");
    }

    Directory dir = FSDirectory.open(indexPath);
    //IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48));
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    //iwc.setRAMBufferSizeMB(350);
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    if (!normal) {
        iwc.setRAMBufferSizeMB(1024);
        iwc.setMergePolicy(NoMergePolicy.INSTANCE);
        //iwc.setMergePolicy(NoMergePolicy.NO_COMPOUND_FILES);
    } else {
        // 5/5 segments:
        iwc.setMaxBufferedDocs(157234);
        iwc.setRAMBufferSizeMB(-1);
    }
    //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(3, 1);
    final IndexWriter w = new IndexWriter(dir, iwc);

    final Field.Store store = Field.Store.NO;

    // Numeric field types: copy the stock type matching `store`, then apply the
    // requested precision step and freeze so the type cannot change afterwards.
    final FieldType doubleFieldType = new FieldType(
            store == Field.Store.NO ? DoubleField.TYPE_NOT_STORED : DoubleField.TYPE_STORED);
    doubleFieldType.setNumericPrecisionStep(precStep);
    doubleFieldType.freeze();

    final FieldType longFieldType = new FieldType(
            store == Field.Store.NO ? LongField.TYPE_NOT_STORED : LongField.TYPE_STORED);
    longFieldType.setNumericPrecisionStep(precStep);
    longFieldType.freeze();

    final FieldType intFieldType = new FieldType(
            store == Field.Store.NO ? IntField.TYPE_NOT_STORED : IntField.TYPE_STORED);
    intFieldType.setNumericPrecisionStep(precStep);
    intFieldType.freeze();

    // 64K buffer:
    InputStream is = new FileInputStream(geoNamesFile);
    final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
    final AtomicInteger docsIndexed = new AtomicInteger();

    final long startMS = System.currentTimeMillis();
    Thread[] threads = new Thread[numThreads];

    // With reuse it's ~ 38% faster (41.8 sec vs 67.0 sec):
    final boolean reuseDocAndFields = false;

    // Every worker pulls lines from the one shared reader until it is exhausted.
    for (int i = 0; i < numThreads; i++) {
        threads[i] = new Thread() {
            @Override
            public void run() {
                // per-thread date parsing state, used only by the non-reuse branch
                ParsePosition datePos = new ParsePosition(0);
                SimpleDateFormat dateParser = new SimpleDateFormat("yyyy-MM-dd", Locale.US);

                if (reuseDocAndFields) {
                    // Reuse mode: one Document and one Field per column for the whole
                    // thread; only the values are replaced for each line.
                    Document doc = new Document();
                    IntField geoNameID = new IntField("geoNameID", 0, intFieldType);
                    doc.add(geoNameID);
                    TextField nameField = new TextField("name", "", store);
                    doc.add(nameField);
                    TextField asciiNameField = new TextField("asciiName", "", store);
                    doc.add(asciiNameField);
                    TextField alternateNameField = new TextField("alternateNames", "", store);
                    doc.add(alternateNameField);
                    StringField featureClassField = new StringField("featureClass", "", store);
                    doc.add(featureClassField);
                    StringField featureCodeField = new StringField("featureCode", "", store);
                    doc.add(featureCodeField);
                    StringField countryCodeField = new StringField("countryCode", "", store);
                    doc.add(countryCodeField);
                    StringField cc2Field = new StringField("cc2", "", store);
                    doc.add(cc2Field);
                    StringField admin1Field = new StringField("admin1", "", store);
                    doc.add(admin1Field);
                    StringField admin2Field = new StringField("admin2", "", store);
                    doc.add(admin2Field);
                    StringField admin3Field = new StringField("admin3", "", store);
                    doc.add(admin3Field);
                    StringField admin4Field = new StringField("admin4", "", store);
                    doc.add(admin4Field);
                    StringField tzField = new StringField("timezone", "", store);
                    doc.add(tzField);

                    while (true) {
                        try {

                            // Curiously BufferedReader.readLine seems to be thread-safe...
                            String line = reader.readLine();
                            if (line == null) {
                                break;
                            }
                            String[] values = line.split("\t");

                            geoNameID.setIntValue(Integer.parseInt(values[0]));
                            nameField.setStringValue(values[1]);
                            asciiNameField.setStringValue(values[2]);
                            alternateNameField.setStringValue(values[3]);

                            /*
                            if (values[4].isEmpty() == false) {
                              double v = Double.parseDouble(values[4]);
                              doc.add(new DoubleField("latitude", v, doubleFieldType));
                              doc.add(new DoubleDocValuesField("latitude", v));
                            }
                            if (values[5].isEmpty() == false) {
                              double v = Double.parseDouble(values[5]);
                              doc.add(new DoubleField("longitude", v, doubleFieldType));
                              doc.add(new DoubleDocValuesField("longitude", v));
                            }
                            */

                            featureClassField.setStringValue(values[6]);
                            featureCodeField.setStringValue(values[7]);
                            countryCodeField.setStringValue(values[8]);
                            cc2Field.setStringValue(values[9]);
                            admin1Field.setStringValue(values[10]);
                            admin2Field.setStringValue(values[11]);
                            admin3Field.setStringValue(values[12]);
                            admin4Field.setStringValue(values[13]);

                            /*
                            if (values[14].isEmpty() == false) {
                              long v = Long.parseLong(values[14]);
                              doc.add(new LongField("population", v, longFieldType));
                              doc.add(new NumericDocValuesField("population", v));
                            }
                            if (values[15].isEmpty() == false) {
                              long v = Long.parseLong(values[15]);
                              doc.add(new LongField("elevation", v, longFieldType));
                              doc.add(new NumericDocValuesField("elevation", v));
                            }
                            if (values[16].isEmpty() == false) {
                              doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType));
                            }
                            */

                            tzField.setStringValue(values[17]);
                            /*
                            if (values[18].isEmpty() == false) {
                              datePos.setIndex(0);
                              Date date = dateParser.parse(values[18], datePos);
                              doc.add(new LongField("modified", date.getTime(), longFieldType));
                            }
                            */
                            w.addDocument(doc);
                            int count = docsIndexed.incrementAndGet();
                            if (count % 200000 == 0) {
                                long ms = System.currentTimeMillis();
                                System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec");
                            }
                        } catch (Exception e) {
                            // any failure ends this worker; the cause is preserved
                            throw new RuntimeException(e);
                        }
                    }
                } else {
                    // Non-reuse mode: a fresh Document and fresh Fields for every line.
                    while (true) {
                        try {

                            // Curiously BufferedReader.readLine seems to be thread-safe...
                            String line = reader.readLine();
                            if (line == null) {
                                break;
                            }
                            String[] values = line.split("\t");

                            Document doc = new Document();

                            doc.add(new IntField("geoNameID", Integer.parseInt(values[0]), intFieldType));
                            doc.add(new TextField("name", values[1], store));
                            doc.add(new TextField("asciiName", values[2], store));
                            doc.add(new TextField("alternateNames", values[3], store));

                            // optional numeric columns are indexed only when non-empty
                            if (!values[4].isEmpty()) {
                                double v = Double.parseDouble(values[4]);
                                doc.add(new DoubleField("latitude", v, doubleFieldType));
                                doc.add(new DoubleDocValuesField("latitude", v));
                            }
                            if (!values[5].isEmpty()) {
                                double v = Double.parseDouble(values[5]);
                                doc.add(new DoubleField("longitude", v, doubleFieldType));
                                doc.add(new DoubleDocValuesField("longitude", v));
                            }

                            doc.add(new StringField("featureClass", values[6], store));
                            doc.add(new StringField("featureCode", values[7], store));
                            doc.add(new StringField("countryCode", values[8], store));
                            doc.add(new StringField("cc2", values[9], store));
                            doc.add(new StringField("admin1Code", values[10], store));
                            doc.add(new StringField("admin2Code", values[11], store));
                            doc.add(new StringField("admin3Code", values[12], store));
                            doc.add(new StringField("admin4Code", values[13], store));

                            if (!values[14].isEmpty()) {
                                long v = Long.parseLong(values[14]);
                                doc.add(new LongField("population", v, longFieldType));
                                doc.add(new NumericDocValuesField("population", v));
                            }
                            if (!values[15].isEmpty()) {
                                long v = Long.parseLong(values[15]);
                                doc.add(new LongField("elevation", v, longFieldType));
                                doc.add(new NumericDocValuesField("elevation", v));
                            }
                            if (!values[16].isEmpty()) {
                                doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType));
                            }

                            doc.add(new StringField("timezone", values[17], store));

                            if (!values[18].isEmpty()) {
                                // the ParsePosition is reused, so rewind it before each parse
                                datePos.setIndex(0);
                                Date date = dateParser.parse(values[18], datePos);
                                doc.add(new LongField("modified", date.getTime(), longFieldType));
                            }
                            w.addDocument(doc);
                            int count = docsIndexed.incrementAndGet();
                            if (count % 200000 == 0) {
                                long ms = System.currentTimeMillis();
                                System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec");
                            }
                        } catch (Exception e) {
                            // any failure ends this worker; the cause is preserved
                            throw new RuntimeException(e);
                        }
                    }
                }
            }
        };
        threads[i].start();
    }

    // While the workers index, reopen a reader obtained from the writer
    // 100 times, 500 ms apart, swapping in the new reader whenever one is returned.
    DirectoryReader r = DirectoryReader.open(w, true);
    for (int i = 0; i < 100; i++) {
        DirectoryReader r2 = DirectoryReader.openIfChanged(r);
        if (r2 != null) {
            r.close();
            r = r2;
        }
        Thread.sleep(500);
    }
    // r is only ever replaced by a non-null reader, so it can be closed unconditionally
    r.close();

    for (int i = 0; i < numThreads; i++) {
        threads[i].join();
    }

    // The workers share this reader, so it can only be closed once all of them
    // have finished; closing it also closes the underlying file stream.
    reader.close();

    long ms = System.currentTimeMillis();
    System.out.println(docsIndexed + ": " + ((ms - startMS) / 1000.0) + " sec");
    //System.out.println("tot conflicts: " + BytesRefHash.totConflict);
    //w.shutdown(normal);
    w.close();
    dir.close();
}