List of usage examples for org.apache.lucene.document.TextField#setStringValue
public void setStringValue(String value)
Expert: change the value of this field.
From source file:com.bericotech.clavin.index.IndexDirectoryBuilder.java
License:Apache License
/** * Builds a set of Lucene documents for the provided GeoName, indexing * each using all available names and storing the entire ancestry path * for each GeoName in the index. See {@link IndexField} for descriptions * of the fields indexed for each document. * * @param geoName the GeoName to index * @throws IOException if an error occurs while indexing */// www .j ava 2 s . com private void indexGeoName(final GeoName geoName) throws IOException { indexCount++; // find all unique names for this GeoName String nm = geoName.getName(); String asciiNm = geoName.getAsciiName(); Set<String> names = new HashSet<String>(); names.add(nm); names.add(asciiNm); names.addAll(geoName.getAlternateNames()); // if this is a top-level administrative division, add its primary and alternate country codes // if they are not already found in the name or alternate names if (geoName.isTopLevelAdminDivision()) { if (geoName.getPrimaryCountryCode() != null) { names.add(geoName.getPrimaryCountryCode().name()); } for (CountryCode cc : geoName.getAlternateCountryCodes()) { names.add(cc.name()); } } AlternateName preferredName = alternateNameMap.get(geoName.getGeonameID()); // ensure preferred name is found in alternate names if (preferredName != null) { names.add(preferredName.name); } names.remove(null); names.remove(""); // reuse a single Document and field instances Document doc = new Document(); doc.add(new StoredField(GEONAME.key(), fullAncestry ? 
geoName.getGazetteerRecordWithAncestry() : geoName.getGazetteerRecord())); doc.add(new IntField(GEONAME_ID.key(), geoName.getGeonameID(), Field.Store.YES)); // if the alternate names file was loaded and we found a preferred name for this GeoName, store it if (preferredName != null) { doc.add(new StoredField(PREFERRED_NAME.key(), preferredName.name)); } // index the direct parent ID in the PARENT_ID field GeoName parent = geoName.getParent(); if (parent != null) { doc.add(new IntField(PARENT_ID.key(), parent.getGeonameID(), Field.Store.YES)); } // index all ancestor IDs in the ANCESTOR_IDS field; this is a secondary field // so it can be used to restrict searches and PARENT_ID can be used for ancestor // resolution while (parent != null) { doc.add(new IntField(ANCESTOR_IDS.key(), parent.getGeonameID(), Field.Store.YES)); parent = parent.getParent(); } doc.add(new LongField(POPULATION.key(), geoName.getPopulation(), Field.Store.YES)); // set up sort field based on population and geographic feature type if (geoName.getFeatureClass().equals(FeatureClass.P) || geoName.getFeatureCode().name().startsWith("PCL")) { if (geoName.getGeonameID() != 2643741) // todo: temporary hack until GeoNames.org fixes the population for City of London // boost cities and countries when sorting results by population doc.add(new LongField(SORT_POP.key(), geoName.getPopulation() * 11, Field.Store.YES)); } else { // don't boost anything else, because people rarely talk about other stuff // (e.g., Washington State's population is more than 10x that of Washington, DC // but Washington, DC is mentioned far more frequently than Washington State) doc.add(new LongField(SORT_POP.key(), geoName.getPopulation(), Field.Store.YES)); } doc.add(new IntField(HISTORICAL.key(), IndexField.getBooleanIndexValue(geoName.getFeatureCode().isHistorical()), Field.Store.NO)); doc.add(new StringField(FEATURE_CODE.key(), geoName.getFeatureCode().name(), Field.Store.NO)); // create a unique Document for each name of 
this GeoName TextField nameField = new TextField(INDEX_NAME.key(), "", Field.Store.YES); doc.add(nameField); for (String name : names) { nameField.setStringValue(name); indexWriter.addDocument(doc); } }
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException { Document d = new Document(); StringField nfid = new StringField("ID", "", Field.Store.YES); StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES); StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES); TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES); TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES); TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES); StringField lang = new StringField("LANG", "", Field.Store.YES); d.add(nfid);/*from w w w . j av a 2 s. c o m*/ d.add(sforigin); d.add(normnws); d.add(sfunigram); d.add(sfbigram); d.add(sftrigram); String line; String[] column; String id, phrase, otherlang, oneOtherlang; int otherlangLength; int linen = 0; while ((line = br.readLine()) != null) { if (linen++ % 10000 == 0) System.out.println(linen + "\n" + line); column = line.trim().split("\t"); // get other columns except for the location words id = column[0]; phrase = column[1]; otherlang = column[3]; // To Do: set values to document d, and index it nfid.setStringValue(id);// 1 // id does not change, change the strings to index. phrase = phrase.toLowerCase(); sforigin.setStringValue(phrase);// 5 normnws.setStringValue(phrase.replaceAll(" ", "")); getIndexFeatures(phrase); sfunigram.setStringValue(getUnigram()); sfbigram.setStringValue(getBigram()); sftrigram.setStringValue(getTrigram()); lang.setStringValue(""); // add this new document. iw.addDocument(d); if (otherlang.length() == 0) continue; String otherlangs[] = otherlang.split(","); otherlangLength = otherlangs.length; for (int i = 0; i < otherlangLength; i++) { if (otherlangs[i].length() == 0) continue; // id does not change, change the strings to index. 
oneOtherlang = otherlangs[i].toLowerCase(); sforigin.setStringValue(oneOtherlang);// 5 normnws.setStringValue(oneOtherlang.replaceAll(" ", "")); getIndexFeatures(oneOtherlang); sfunigram.setStringValue(getUnigram()); sfbigram.setStringValue(getBigram()); sftrigram.setStringValue(getTrigram()); lang.setStringValue(""); // add this new document. iw.addDocument(d); } } }
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAltNames.java
License:Apache License
void indexAlterNames(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException { Document d = new Document(); StringField nfid = new StringField("ID", "", Field.Store.YES); StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES); StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES); TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES); TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES); TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES); StringField lang = new StringField("LANG", "", Field.Store.YES); d.add(nfid);/*from w ww .j a va2 s. c o m*/ d.add(sforigin); d.add(normnws); d.add(sfunigram); d.add(sfbigram); d.add(sftrigram); String line; String[] column; String id, langOrLink, phrase; int linen = 0; while ((line = br.readLine()) != null) { if (linen++ % 10000 == 0) System.out.println(linen + "\n" + line); column = line.trim().split("\t"); // get other columns except for the location words id = column[1]; langOrLink = column[2]; phrase = column[3]; // To Do: set values to document d, and index it nfid.setStringValue(id);// 1 phrase = phrase.toLowerCase(); sforigin.setStringValue(phrase);// 5 normnws.setStringValue(phrase.replaceAll(" ", "")); getIndexFeatures(phrase); sfunigram.setStringValue(getUnigram()); sfbigram.setStringValue(getBigram()); sftrigram.setStringValue(getTrigram()); lang.setStringValue(langOrLink); // add this new document. iw.addDocument(d); } }
From source file:perf.IndexGeoNames.java
License:Apache License
public static void main(String[] args) throws Exception { String geoNamesFile = args[0]; File indexPath = new File(args[1]); int numThreads = Integer.parseInt(args[2]); int precStep = Integer.parseInt(args[3]); if (indexPath.exists()) { throw new IllegalArgumentException("please remove indexPath \"" + indexPath + "\" before running"); }// w w w . ja va2 s. c o m Directory dir = FSDirectory.open(indexPath); //IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48)); IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer()); //iwc.setRAMBufferSizeMB(350); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); if (normal == false) { iwc.setRAMBufferSizeMB(1024); iwc.setMergePolicy(NoMergePolicy.INSTANCE); //iwc.setMergePolicy(NoMergePolicy.NO_COMPOUND_FILES); } else { // 5/5 segments: iwc.setMaxBufferedDocs(157234); iwc.setRAMBufferSizeMB(-1); } //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(3, 1); final IndexWriter w = new IndexWriter(dir, iwc); final Field.Store store = Field.Store.NO; final FieldType doubleFieldType = new FieldType( store == Field.Store.NO ? DoubleField.TYPE_NOT_STORED : DoubleField.TYPE_STORED); doubleFieldType.setNumericPrecisionStep(precStep); doubleFieldType.freeze(); final FieldType longFieldType = new FieldType( store == Field.Store.NO ? LongField.TYPE_NOT_STORED : LongField.TYPE_STORED); longFieldType.setNumericPrecisionStep(precStep); longFieldType.freeze(); final FieldType intFieldType = new FieldType( store == Field.Store.NO ? 
IntField.TYPE_NOT_STORED : IntField.TYPE_STORED); intFieldType.setNumericPrecisionStep(precStep); intFieldType.freeze(); // 64K buffer: InputStream is = new FileInputStream(geoNamesFile); final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16); final AtomicInteger docsIndexed = new AtomicInteger(); final long startMS = System.currentTimeMillis(); Thread[] threads = new Thread[numThreads]; // With reuse it's ~ 38% faster (41.8 sec vs 67.0 sec): final boolean reuseDocAndFields = false; for (int i = 0; i < numThreads; i++) { threads[i] = new Thread() { @Override public void run() { ParsePosition datePos = new ParsePosition(0); SimpleDateFormat dateParser = new SimpleDateFormat("yyyy-MM-dd", Locale.US); if (reuseDocAndFields) { Document doc = new Document(); IntField geoNameID = new IntField("geoNameID", 0, intFieldType); doc.add(geoNameID); TextField nameField = new TextField("name", "", store); doc.add(nameField); TextField asciiNameField = new TextField("asciiName", "", store); doc.add(asciiNameField); TextField alternateNameField = new TextField("alternateNames", "", store); doc.add(alternateNameField); StringField featureClassField = new StringField("featureClass", "", store); doc.add(featureClassField); StringField featureCodeField = new StringField("featureCode", "", store); doc.add(featureCodeField); StringField countryCodeField = new StringField("countryCode", "", store); doc.add(countryCodeField); StringField cc2Field = new StringField("cc2", "", store); doc.add(cc2Field); StringField admin1Field = new StringField("admin1", "", store); doc.add(admin1Field); StringField admin2Field = new StringField("admin2", "", store); doc.add(admin2Field); StringField admin3Field = new StringField("admin3", "", store); doc.add(admin3Field); StringField admin4Field = new StringField("admin4", "", store); doc.add(admin4Field); StringField tzField = new StringField("timezone", "", store); doc.add(tzField); while (true) { try { // Curiously 
BufferedReader.readLine seems to be thread-safe... String line = reader.readLine(); if (line == null) { break; } String[] values = line.split("\t"); geoNameID.setIntValue(Integer.parseInt(values[0])); nameField.setStringValue(values[1]); asciiNameField.setStringValue(values[2]); alternateNameField.setStringValue(values[3]); /* if (values[4].isEmpty() == false) { double v = Double.parseDouble(values[4]); doc.add(new DoubleField("latitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("latitude", v)); } if (values[5].isEmpty() == false) { double v = Double.parseDouble(values[5]); doc.add(new DoubleField("longitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("longitude", v)); } */ featureClassField.setStringValue(values[6]); featureCodeField.setStringValue(values[7]); countryCodeField.setStringValue(values[8]); cc2Field.setStringValue(values[9]); admin1Field.setStringValue(values[10]); admin2Field.setStringValue(values[11]); admin3Field.setStringValue(values[12]); admin4Field.setStringValue(values[13]); /* if (values[14].isEmpty() == false) { long v = Long.parseLong(values[14]); doc.add(new LongField("population", v, longFieldType)); doc.add(new NumericDocValuesField("population", v)); } if (values[15].isEmpty() == false) { long v = Long.parseLong(values[15]); doc.add(new LongField("elevation", v, longFieldType)); doc.add(new NumericDocValuesField("elevation", v)); } if (values[16].isEmpty() == false) { doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType)); } */ tzField.setStringValue(values[17]); /* if (values[18].isEmpty() == false) { datePos.setIndex(0); Date date = dateParser.parse(values[18], datePos); doc.add(new LongField("modified", date.getTime(), longFieldType)); } */ w.addDocument(doc); int count = docsIndexed.incrementAndGet(); if (count % 200000 == 0) { long ms = System.currentTimeMillis(); System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec"); } } catch (Exception e) { throw new 
RuntimeException(e); } } } else { while (true) { try { // Curiously BufferedReader.readLine seems to be thread-safe... String line = reader.readLine(); if (line == null) { break; } String[] values = line.split("\t"); Document doc = new Document(); doc.add(new IntField("geoNameID", Integer.parseInt(values[0]), intFieldType)); doc.add(new TextField("name", values[1], store)); doc.add(new TextField("asciiName", values[2], store)); doc.add(new TextField("alternateNames", values[3], store)); if (values[4].isEmpty() == false) { double v = Double.parseDouble(values[4]); doc.add(new DoubleField("latitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("latitude", v)); } if (values[5].isEmpty() == false) { double v = Double.parseDouble(values[5]); doc.add(new DoubleField("longitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("longitude", v)); } doc.add(new StringField("featureClass", values[6], store)); doc.add(new StringField("featureCode", values[7], store)); doc.add(new StringField("countryCode", values[8], store)); doc.add(new StringField("cc2", values[9], store)); doc.add(new StringField("admin1Code", values[10], store)); doc.add(new StringField("admin2Code", values[11], store)); doc.add(new StringField("admin3Code", values[12], store)); doc.add(new StringField("admin4Code", values[13], store)); if (values[14].isEmpty() == false) { long v = Long.parseLong(values[14]); doc.add(new LongField("population", v, longFieldType)); doc.add(new NumericDocValuesField("population", v)); } if (values[15].isEmpty() == false) { long v = Long.parseLong(values[15]); doc.add(new LongField("elevation", v, longFieldType)); doc.add(new NumericDocValuesField("elevation", v)); } if (values[16].isEmpty() == false) { doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType)); } doc.add(new StringField("timezone", values[17], store)); if (values[18].isEmpty() == false) { datePos.setIndex(0); Date date = dateParser.parse(values[18], datePos); doc.add(new 
LongField("modified", date.getTime(), longFieldType)); } w.addDocument(doc); int count = docsIndexed.incrementAndGet(); if (count % 200000 == 0) { long ms = System.currentTimeMillis(); System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec"); } } catch (Exception e) { throw new RuntimeException(e); } } } } }; threads[i].start(); } DirectoryReader r = DirectoryReader.open(w, true); for (int i = 0; i < 100; i++) { DirectoryReader r2 = DirectoryReader.openIfChanged(r); if (r2 != null) { r.close(); r = r2; } Thread.sleep(500); } if (r != null) { r.close(); r = null; } for (int i = 0; i < numThreads; i++) { threads[i].join(); } long ms = System.currentTimeMillis(); System.out.println(docsIndexed + ": " + ((ms - startMS) / 1000.0) + " sec"); //System.out.println("tot conflicts: " + BytesRefHash.totConflict); //w.shutdown(normal); w.close(); dir.close(); }