List of usage examples for org.apache.lucene.document StringField setStringValue
public void setStringValue(String value)
Expert: change the value of this field.
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazInfoIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException { Document d = new Document(); StringField nfid = new StringField("ID", "0", Field.Store.YES); StringField name = new StringField("ORIGINAL-NAME", "", Field.Store.YES); IntField nfaltnames = new IntField("ALTNAME-COUNT", 0, Field.Store.YES); DoubleField nflong = new DoubleField("LONGTITUDE", 0.0, Field.Store.YES); DoubleField nfla = new DoubleField("LATITUDE", 0.0, Field.Store.YES); LongField nfpop = new LongField("POPULATION", 0, Field.Store.YES); StringField sfcountry = new StringField("COUNTRY-CODE", "", Field.Store.YES); StringField sfadm1 = new StringField("ADM1-CODE", "", Field.Store.YES); StringField sfadm2 = new StringField("ADM2-CODE", "", Field.Store.YES); StringField sfadm3 = new StringField("ADM3-CODE", "", Field.Store.YES); StringField sfadm4 = new StringField("ADM4-CODE", "", Field.Store.YES); StringField sffeatureclass = new StringField("FEATURE-CLASS", "", Field.Store.YES); StringField sffeature = new StringField("FEATURE", "", Field.Store.YES); StringField sftimezone = new StringField("TIMEZONE", "", Field.Store.YES); d.add(nfid);//from w ww . j av a2 s. 
c om d.add(name); d.add(nfaltnames); d.add(nflong); d.add(nfla); d.add(nfpop); d.add(sfcountry); d.add(sfadm1); d.add(sfadm2); d.add(sfadm3); d.add(sfadm4); d.add(sffeatureclass); d.add(sffeature); d.add(sftimezone); String line; int linen = 0; while ((line = br.readLine()) != null) { if (linen++ % 10000 == 0) System.out.println(linen + "\n" + line); String[] column = line.trim().split("\t"); // get other columns except for the location words String id = column[0]; String utfname = column[1]; String altnames = column[3]; String latitude = column[4]; String longtitude = column[5]; double dlong, dla; if (latitude == null) { dlong = 999; dla = 999; } else { dlong = Double.parseDouble(longtitude); dla = Double.parseDouble(latitude); } String featureclass = column[6]; String feature = column[7]; String country = column[8]; String population = column[14]; long longpop; if (population == null) longpop = -1; longpop = Long.parseLong(population); String timezone = column[17]; // To Do: set values to document d, and index it nfid.setStringValue(id);// 1 name.setStringValue(utfname); nfaltnames.setIntValue(altnames.split(",").length); nflong.setDoubleValue(dlong); nfla.setDoubleValue(dla); nfpop.setLongValue(longpop); sfcountry.setStringValue(country.toLowerCase()); sfadm1.setStringValue(column[10].toLowerCase()); sfadm2.setStringValue(column[11].toLowerCase()); sfadm3.setStringValue(column[12].toLowerCase()); sfadm4.setStringValue(column[13].toLowerCase()); sffeatureclass.setStringValue(featureclass); sffeature.setStringValue(feature); sftimezone.setStringValue(timezone);// 13 // add this new document. iw.addDocument(d); } }
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException { Document d = new Document(); StringField nfid = new StringField("ID", "", Field.Store.YES); StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES); StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES); TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES); TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES); TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES); StringField lang = new StringField("LANG", "", Field.Store.YES); d.add(nfid);/*w w w. ja v a 2s . c o m*/ d.add(sforigin); d.add(normnws); d.add(sfunigram); d.add(sfbigram); d.add(sftrigram); String line; String[] column; String id, phrase, otherlang, oneOtherlang; int otherlangLength; int linen = 0; while ((line = br.readLine()) != null) { if (linen++ % 10000 == 0) System.out.println(linen + "\n" + line); column = line.trim().split("\t"); // get other columns except for the location words id = column[0]; phrase = column[1]; otherlang = column[3]; // To Do: set values to document d, and index it nfid.setStringValue(id);// 1 // id does not change, change the strings to index. phrase = phrase.toLowerCase(); sforigin.setStringValue(phrase);// 5 normnws.setStringValue(phrase.replaceAll(" ", "")); getIndexFeatures(phrase); sfunigram.setStringValue(getUnigram()); sfbigram.setStringValue(getBigram()); sftrigram.setStringValue(getTrigram()); lang.setStringValue(""); // add this new document. iw.addDocument(d); if (otherlang.length() == 0) continue; String otherlangs[] = otherlang.split(","); otherlangLength = otherlangs.length; for (int i = 0; i < otherlangLength; i++) { if (otherlangs[i].length() == 0) continue; // id does not change, change the strings to index. 
oneOtherlang = otherlangs[i].toLowerCase(); sforigin.setStringValue(oneOtherlang);// 5 normnws.setStringValue(oneOtherlang.replaceAll(" ", "")); getIndexFeatures(oneOtherlang); sfunigram.setStringValue(getUnigram()); sfbigram.setStringValue(getBigram()); sftrigram.setStringValue(getTrigram()); lang.setStringValue(""); // add this new document. iw.addDocument(d); } } }
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAltNames.java
License:Apache License
void indexAlterNames(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException { Document d = new Document(); StringField nfid = new StringField("ID", "", Field.Store.YES); StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES); StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES); TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES); TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES); TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES); StringField lang = new StringField("LANG", "", Field.Store.YES); d.add(nfid);/*from w ww. java2s .co m*/ d.add(sforigin); d.add(normnws); d.add(sfunigram); d.add(sfbigram); d.add(sftrigram); String line; String[] column; String id, langOrLink, phrase; int linen = 0; while ((line = br.readLine()) != null) { if (linen++ % 10000 == 0) System.out.println(linen + "\n" + line); column = line.trim().split("\t"); // get other columns except for the location words id = column[1]; langOrLink = column[2]; phrase = column[3]; // To Do: set values to document d, and index it nfid.setStringValue(id);// 1 phrase = phrase.toLowerCase(); sforigin.setStringValue(phrase);// 5 normnws.setStringValue(phrase.replaceAll(" ", "")); getIndexFeatures(phrase); sfunigram.setStringValue(getUnigram()); sfbigram.setStringValue(getBigram()); sftrigram.setStringValue(getTrigram()); lang.setStringValue(langOrLink); // add this new document. iw.addDocument(d); } }
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexExplodedXml(IndexWriter writer, File file) throws IOException { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { indexExplodedXml(writer, new File(file, files[i])); }// www . j av a2 s . c om } } else { FileInputStream fis = new FileInputStream(file); try { NumericFeatureGenerator nfg = new NumericFeatureGenerator(); if (this.xmlreader == null) { this.xmlreader = new XmlStAXReader(); } Article a = xmlreader.readArticleFromXml(file.getAbsolutePath()); for (Table t : a.tables) { for (Group g : t.groups) { for (Column col : g.columns) { // index columns Document coldoc = new Document(); ArrayList<Double> cfv = nfg.getFeatureVector(col.content); if (cfv.get(0) != null) { DoubleField intratio = new DoubleField("intratio", cfv.get(0), Field.Store.NO); coldoc.add(intratio); } if (cfv.get(1) != null) { DoubleField floatratio = new DoubleField("floatratio", cfv.get(1), Field.Store.NO); coldoc.add(floatratio); } if (cfv.get(3) != null) { DoubleField mean = new DoubleField("mean", cfv.get(3), Field.Store.NO); coldoc.add(mean); } if (cfv.get(4) != null) { DoubleField std = new DoubleField("std", cfv.get(4), Field.Store.NO); coldoc.add(std); } if (cfv.get(6) != null) { DoubleField min = new DoubleField("min", cfv.get(6), Field.Store.NO); coldoc.add(min); } if (cfv.get(7) != null) { DoubleField max = new DoubleField("max", cfv.get(7), Field.Store.NO); coldoc.add(max); } if (cfv.get(8) != null) { DoubleField acc = new DoubleField("acc", cfv.get(8), Field.Store.NO); coldoc.add(acc); } if (cfv.get(11) != null) { DoubleField colmag = new DoubleField("colmag", cfv.get(11), Field.Store.NO); coldoc.add(colmag); } StringField wholegroup = new StringField("wholegroup", g.toString(), Field.Store.YES); if (wholegroup.stringValue().getBytes().length > 32760) { wholegroup.setStringValue("Table too large..."); System.err.println( "table too large:" + 
wholegroup.stringValue().getBytes().length); } String headers = ""; if (col.headers != null) { for (Header hdr : col.headers) { headers += hdr.text.toLowerCase() + " "; } } TextField header = new TextField("headerkeywords", headers.trim(), Field.Store.NO); coldoc.add(header); coldoc.add(wholegroup); StringField fname = new StringField("filename", file.getAbsolutePath(), Field.Store.YES); coldoc.add(fname); StringField type = new StringField("type", "column", Field.Store.YES); coldoc.add(type); IntField bstart = new IntField("bytestart", col.content.get(0).byteStart, Field.Store.YES); IntField bend = new IntField("byteend", col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES); String content = ""; for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) content += f.text + "|"; StringField colcontent = new StringField("colcontent", content.substring(0, content.length() - 1), Field.Store.YES); coldoc.add(colcontent); coldoc.add(bstart); coldoc.add(bend); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { writer.addDocument(coldoc); totalDocAdded++; } else { writer.updateDocument(new Term("path", file.getPath()), coldoc); } for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) { // Index single cell Document celldoc = new Document(); ArrayList<Double> fv = nfg.field2Features(f); if (fv.get(0) == 1 || fv.get(0) == 2) { try { DoubleField df = new DoubleField("value", fv.get(1), Field.Store.YES); celldoc.add(df); StringField textf = new StringField("text", f.text, Field.Store.YES); celldoc.add(textf); if (fv.get(2) != null & fv.get(2) != Double.NaN) { DoubleField errf = new DoubleField("error", fv.get(2), Field.Store.NO); celldoc.add(errf); } if (fv.get(5) != Double.NaN) { DoubleField magf = new DoubleField("cellmag", fv.get(5), Field.Store.NO); celldoc.add(magf); } if (fv.get(4) != null) { DoubleField pvalue = new DoubleField("cellpvalue", fv.get(4), Field.Store.NO); celldoc.add(pvalue); } StringField sf = new StringField("filename", 
file.getAbsolutePath(), Field.Store.YES); celldoc.add(sf); StringField ctype = new StringField("type", "cell", Field.Store.YES); celldoc.add(ctype); //StringField cwholegroup=new StringField("wholegroup", g.toString(), Field.Store.YES); //celldoc.add(cwholegroup); IntField cbstart = new IntField("bytestart", f.byteStart, Field.Store.YES); IntField cbend = new IntField("byteend", f.byteEnd, Field.Store.YES); celldoc.add(cbstart); celldoc.add(cbend); } catch (NullPointerException e) { e.printStackTrace(); System.out.println(f.text); } if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { writer.addDocument(celldoc); totalDocAdded++; } else { writer.updateDocument(new Term("path", file.getPath()), celldoc); } } } } } } } finally { fis.close(); } } } }
From source file:org.elasticsearch.common.lucene.uid.VersionsTests.java
License:Apache License
/**
 * Writes three segments with different uid/version layouts, force-merges them through
 * {@code IndexUpgraderMergePolicy} and checks the version doc value of every document.
 */
@Test
public void testMergingOldIndices() throws Exception {
    final IndexWriterConfig iwConf = new IndexWriterConfig(Lucene.VERSION, new KeywordAnalyzer());
    iwConf.setMergePolicy(new IndexUpgraderMergePolicy(iwConf.getMergePolicy()));
    final Directory dir = newDirectory();
    final IndexWriter iw = new IndexWriter(dir, iwConf);

    // Segment 1: uid only, no _version.
    final Document plainDoc = new Document();
    // Add a dummy field (enough to trigger #3237)
    plainDoc.add(new StringField("a", "b", Store.NO));
    final StringField uid = new StringField(UidFieldMapper.NAME, "1", Store.YES);
    plainDoc.add(uid);
    iw.addDocument(plainDoc);
    uid.setStringValue("2");
    iw.addDocument(plainDoc);
    iw.commit();

    // Segment 2: old layout, uid and version carried by one UidField.
    final Document oldLayoutDoc = new Document();
    final UidField uidAndVersion = new UidField("3", 3L);
    oldLayoutDoc.add(uidAndVersion);
    iw.addDocument(oldLayoutDoc);
    uidAndVersion.uid = "4";
    uidAndVersion.version = 4L;
    iw.addDocument(oldLayoutDoc);
    iw.commit();

    // Segment 3: new layout, uid plus a numeric doc-values version field.
    final Document newLayoutDoc = new Document();
    uid.setStringValue("5");
    final Field version = new NumericDocValuesField(VersionFieldMapper.NAME, 5L);
    newLayoutDoc.add(uid);
    newLayoutDoc.add(version);
    iw.addDocument(newLayoutDoc);
    uid.setStringValue("6");
    version.setLongValue(6L);
    iw.addDocument(newLayoutDoc);
    iw.commit();

    final Map<String, Long> expectedVersions = ImmutableMap.<String, Long>builder()
            .put("1", 0L)
            .put("2", 0L)
            .put("3", 0L)
            .put("4", 4L)
            .put("5", 5L)
            .put("6", 6L)
            .build();

    // Force merge and check versions
    iw.forceMerge(1);
    final AtomicReader ir = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(iw.getDirectory()));
    final NumericDocValues versions = ir.getNumericDocValues(VersionFieldMapper.NAME);
    assertThat(versions, notNullValue());
    final int maxDoc = ir.maxDoc();
    for (int docId = 0; docId < maxDoc; ++docId) {
        final String uidValue = ir.document(docId).get(UidFieldMapper.NAME);
        final long expectedVersion = expectedVersions.get(uidValue);
        assertThat(versions.get(docId), equalTo(expectedVersion));
    }

    iw.close();
    assertThat(IndexWriter.isLocked(iw.getDirectory()), is(false));
    ir.close();
    dir.close();
}
From source file:org.elasticsearch.common.UUIDTests.java
License:Apache License
private static double testCompression(int numDocs, int numDocsPerSecond, int numNodes, Logger logger) throws Exception { final double intervalBetweenDocs = 1000. / numDocsPerSecond; // milliseconds final byte[][] macAddresses = new byte[numNodes][]; Random r = random();// www.j a va2s . co m for (int i = 0; i < macAddresses.length; ++i) { macAddresses[i] = new byte[6]; random().nextBytes(macAddresses[i]); } UUIDGenerator generator = new TimeBasedUUIDGenerator() { double currentTimeMillis = System.currentTimeMillis(); @Override protected long currentTimeMillis() { currentTimeMillis += intervalBetweenDocs * 2 * r.nextDouble(); return (long) currentTimeMillis; } @Override protected byte[] macAddress() { return RandomPicks.randomFrom(r, macAddresses); } }; // Avoid randomization which will slow down things without improving // the quality of this test Directory dir = newFSDirectory(createTempDir()); IndexWriterConfig config = new IndexWriterConfig().setMergeScheduler(new SerialMergeScheduler()); // for reproducibility IndexWriter w = new IndexWriter(dir, config); Document doc = new Document(); StringField id = new StringField("_id", "", Store.NO); doc.add(id); long start = System.nanoTime(); for (int i = 0; i < numDocs; ++i) { id.setStringValue(generator.getBase64UUID()); w.addDocument(doc); } w.forceMerge(1); long time = (System.nanoTime() - start) / 1000 / 1000; w.close(); long size = 0; for (String file : dir.listAll()) { size += dir.fileLength(file); } dir.close(); double bytesPerDoc = (double) size / numDocs; logger.info(numDocs + " docs indexed at " + numDocsPerSecond + " docs/s required " + new ByteSizeValue(size) + " bytes of disk space, or " + bytesPerDoc + " bytes per document. Took: " + new TimeValue(time) + "."); return bytesPerDoc; }
From source file:org.elasticsearch.index.fielddata.AbstractStringFieldDataTests.java
License:Apache License
public void testActualMissingValue(boolean reverse) throws IOException { // missing value is set to an actual value Document d = new Document(); final StringField s = new StringField("value", "", Field.Store.YES); d.add(s);// w w w . j a v a2 s . com final String[] values = new String[randomIntBetween(2, 30)]; for (int i = 1; i < values.length; ++i) { values[i] = _TestUtil.randomUnicodeString(getRandom()); } final int numDocs = atLeast(100); for (int i = 0; i < numDocs; ++i) { final String value = RandomPicks.randomFrom(getRandom(), values); if (value == null) { writer.addDocument(new Document()); } else { s.setStringValue(value); writer.addDocument(d); } if (randomInt(10) == 0) { writer.commit(); } } final IndexFieldData indexFieldData = getForField("value"); final String missingValue = values[1]; IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, true)); XFieldComparatorSource comparator = indexFieldData.comparatorSource(missingValue, SortMode.MIN); TopFieldDocs topDocs = searcher.search(new MatchAllDocsQuery(), randomBoolean() ? numDocs : randomIntBetween(10, numDocs), new Sort(new SortField("value", comparator, reverse))); assertEquals(numDocs, topDocs.totalHits); BytesRef previousValue = reverse ? UnicodeUtil.BIG_TERM : new BytesRef(); for (int i = 0; i < topDocs.scoreDocs.length; ++i) { final String docValue = searcher.doc(topDocs.scoreDocs[i].doc).get("value"); final BytesRef value = new BytesRef(docValue == null ? missingValue : docValue); if (reverse) { assertTrue(previousValue.compareTo(value) >= 0); } else { assertTrue(previousValue.compareTo(value) <= 0); } previousValue = value; } searcher.getIndexReader().close(); }
From source file:org.elasticsearch.index.fielddata.AbstractStringFieldDataTests.java
License:Apache License
public void testSortMissing(boolean first, boolean reverse) throws IOException { Document d = new Document(); final StringField s = new StringField("value", "", Field.Store.YES); d.add(s);//w ww . j av a2 s.c o m final String[] values = new String[randomIntBetween(2, 10)]; for (int i = 1; i < values.length; ++i) { values[i] = _TestUtil.randomUnicodeString(getRandom()); } final int numDocs = atLeast(100); for (int i = 0; i < numDocs; ++i) { final String value = RandomPicks.randomFrom(getRandom(), values); if (value == null) { writer.addDocument(new Document()); } else { s.setStringValue(value); writer.addDocument(d); } if (randomInt(10) == 0) { writer.commit(); } } final IndexFieldData indexFieldData = getForField("value"); IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, true)); XFieldComparatorSource comparator = indexFieldData.comparatorSource(first ? "_first" : "_last", SortMode.MIN); TopFieldDocs topDocs = searcher.search(new MatchAllDocsQuery(), randomBoolean() ? numDocs : randomIntBetween(10, numDocs), new Sort(new SortField("value", comparator, reverse))); assertEquals(numDocs, topDocs.totalHits); BytesRef previousValue = first ? null : reverse ? UnicodeUtil.BIG_TERM : new BytesRef(); for (int i = 0; i < topDocs.scoreDocs.length; ++i) { final String docValue = searcher.doc(topDocs.scoreDocs[i].doc).get("value"); if (first && docValue == null) { assertNull(previousValue); } else if (!first && docValue != null) { assertNotNull(previousValue); } final BytesRef value = docValue == null ? null : new BytesRef(docValue); if (previousValue != null && value != null) { if (reverse) { assertTrue(previousValue.compareTo(value) >= 0); } else { assertTrue(previousValue.compareTo(value) <= 0); } } previousValue = value; } searcher.getIndexReader().close(); }
From source file:org.elasticsearch.index.fielddata.LongFieldDataTests.java
License:Apache License
/**
 * Indexes one document per entry of {@code values}, each holding that entry's longs in the
 * "value" field, then checks that the loaded field data returns the same values per document,
 * both as longs and as strictly increasing doubles.
 *
 * @param values the expected set of long values for each document, in document order
 */
private void test(List<LongOpenHashSet> values) throws Exception {
    final int docCount = values.size();

    // The same id field instance is reused; each document gets its index as the id.
    StringField idField = new StringField("_id", "", Field.Store.NO);
    for (int docId = 0; docId < docCount; ++docId) {
        Document doc = new Document();
        idField.setStringValue(Integer.toString(docId));
        doc.add(idField);
        final LongOpenHashSet expected = values.get(docId);
        final boolean[] allocated = expected.allocated;
        final long[] slots = expected.keys;
        for (int s = 0; s < allocated.length; s++) {
            if (!allocated[s]) {
                continue;
            }
            doc.add(new LongField("value", slots[s], Field.Store.NO));
        }
        writer.addDocument(doc);
    }
    writer.forceMerge(1);

    final IndexNumericFieldData indexFieldData = getForField("value");
    final AtomicNumericFieldData atomicFieldData = indexFieldData.load(refreshReader());
    final LongValues longData = atomicFieldData.getLongValues();
    final DoubleValues doubleData = atomicFieldData.getDoubleValues();
    final LongOpenHashSet actualLongs = new LongOpenHashSet();
    final DoubleOpenHashSet actualDoubles = new DoubleOpenHashSet();

    for (int docId = 0; docId < docCount; ++docId) {
        final LongOpenHashSet expected = values.get(docId);
        assertThat(longData.setDocument(docId) > 0, equalTo(!expected.isEmpty()));
        assertThat(doubleData.setDocument(docId) > 0, equalTo(!expected.isEmpty()));

        // Long view: the values read back must equal the expected set.
        actualLongs.clear();
        int numValues = longData.setDocument(docId);
        for (int k = 0; k < numValues; k++) {
            actualLongs.add(longData.nextValue());
        }
        assertThat(actualLongs, equalTo(expected));

        // Double view: same values converted to double, returned in strictly increasing order.
        final DoubleOpenHashSet expectedDoubles = new DoubleOpenHashSet();
        final boolean[] allocated = expected.allocated;
        final long[] slots = expected.keys;
        for (int s = 0; s < allocated.length; s++) {
            if (allocated[s]) {
                expectedDoubles.add((double) slots[s]);
            }
        }
        actualDoubles.clear();
        numValues = doubleData.setDocument(docId);
        double previous = 0;
        for (int k = 0; k < numValues; k++) {
            final double current = doubleData.nextValue();
            actualDoubles.add(current);
            if (k > 0) {
                assertThat(previous, lessThan(current));
            }
            previous = current;
        }
        assertThat(actualDoubles, equalTo(expectedDoubles));
    }
}
From source file:org.elasticsearch.index.mapper.internal.TypeFieldTypeTests.java
License:Apache License
public void testTermQuery() throws Exception { Directory dir = newDirectory();/*from w w w. ja va 2s . c om*/ IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); StringField type = new StringField(TypeFieldMapper.NAME, "my_type", Store.NO); doc.add(type); w.addDocument(doc); w.addDocument(doc); IndexReader reader = DirectoryReader.open(w); TypeFieldMapper.TypeFieldType ft = new TypeFieldMapper.TypeFieldType(); ft.setName(TypeFieldMapper.NAME); Query query = ft.termQuery("my_type", null); assertEquals(new MatchAllDocsQuery(), query.rewrite(reader)); // Make sure that Lucene actually simplifies the query when there is a single type Query userQuery = new PhraseQuery("body", "quick", "fox"); Query filteredQuery = new BooleanQuery.Builder().add(userQuery, Occur.MUST).add(query, Occur.FILTER) .build(); Query rewritten = new IndexSearcher(reader).rewrite(filteredQuery); assertEquals(userQuery, rewritten); type.setStringValue("my_type2"); w.addDocument(doc); reader.close(); reader = DirectoryReader.open(w); assertEquals(new ConstantScoreQuery(new TermQuery(new Term(TypeFieldMapper.NAME, "my_type"))), query.rewrite(reader)); IOUtils.close(reader, w, dir); }