List of usage examples for org.apache.lucene.index RandomIndexWriter addDocument
public <T extends IndexableField> long addDocument(final Iterable<T> doc) throws IOException
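RandomIndexWriter is the lucene-test-framework wrapper around IndexWriter: it randomizes the writer configuration (codec, merge policy, buffer sizes) and may flush or commit at random points, so addDocument exercises many writer setups across test runs. A minimal, hedged sketch of the pattern used throughout the examples below; the class name, test method, and the "body" field are illustrative assumptions, not taken from any of the listed source files:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

// Hypothetical sketch of a LuceneTestCase-based test; names are illustrative.
public class AddDocumentSketch extends LuceneTestCase {
    public void testAddDocument() throws Exception {
        Directory dir = newDirectory();
        // Randomized writer: its IndexWriterConfig varies from run to run.
        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
        Document doc = new Document();
        doc.add(new TextField("body", "hello world", Field.Store.YES));
        writer.addDocument(doc); // the method documented above
        IndexReader reader = writer.getReader();
        writer.close();
        assertEquals(1, reader.numDocs());
        reader.close();
        dir.close();
    }
}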
From source file:brightsolid.solr.plugins.TestTargetPositionQueryFuzzy.java
License:Apache License
@Override
public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
    Document doc = new Document();
    FieldType newType = new FieldType(org.apache.lucene.document.TextField.TYPE_STORED);
    newType.setOmitNorms(true);
    Field field = newField("field", "", newType);
    field.fieldType().setOmitNorms(true);
    doc.add(field);
    // Note: "threx" and "twp" appear to be deliberate misspellings,
    // since this fixture exercises fuzzy matching.
    field.setStringValue("one two threx");
    iw.addDocument(doc);
    field.setStringValue("two threx one");
    iw.addDocument(doc);
    field.setStringValue("threx one twp");
    iw.addDocument(doc);
    reader = iw.getReader();
    iw.close();
    searcher = newSearcher(reader);
}
From source file:brightsolid.solr.plugins.TestTargetPositionQuerySimple.java
License:Apache License
@Override
public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
    Document doc = new Document();
    FieldType newType = new FieldType(org.apache.lucene.document.TextField.TYPE_STORED);
    newType.setOmitNorms(true);
    Field field = newField("field", "", newType);
    field.fieldType().setOmitNorms(true);
    doc.add(field);
    field.setStringValue("two");
    iw.addDocument(doc);
    field.setStringValue("one");
    iw.addDocument(doc);
    field.setStringValue("one two");
    iw.addDocument(doc);
    field.setStringValue("two one");
    iw.addDocument(doc);
    field.setStringValue("one two three");
    iw.addDocument(doc);
    field.setStringValue("three two one");
    iw.addDocument(doc);
    field.setStringValue("one one two");
    iw.addDocument(doc);
    reader = iw.getReader();
    iw.close();
    searcher = newSearcher(reader);
}
From source file:brightsolid.solr.plugins.TestTargetPositionQuerySynonyms.java
License:Apache License
@Override
public void setUp() throws Exception {
    super.setUp();
    String testFile = "one, uno, un\n" + "two, dos, too\n" + "three, free, tres";
    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
    parser.parse(new StringReader(testFile));
    final SynonymMap map = parser.build();
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
            return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
        }
    };
    directory = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, analyzer);
    Document doc = new Document();
    FieldType newType = new FieldType(org.apache.lucene.document.TextField.TYPE_STORED);
    newType.setOmitNorms(true);
    Field field = newField("field", "", newType);
    field.fieldType().setOmitNorms(true);
    doc.add(field);
    field.setStringValue("one two three");
    iw.addDocument(doc);
    field.setStringValue("two three one");
    iw.addDocument(doc);
    field.setStringValue("three one two");
    iw.addDocument(doc);
    reader = iw.getReader();
    iw.close();
    searcher = newSearcher(reader);
}
From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java
License:Apache License
@Override
@Before
public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    Document doc;
    for (int i = 0; i < 10; i++) {
        doc = new Document();
        doc.add(new Field("id", Integer.toString(i), ft));
        doc.add(new Field("text",
                random().nextInt(10) + " " + random().nextInt(10) + " " + random().nextInt(10), ft));
        indexWriter.addDocument(doc);
    }
    indexWriter.commit();
    index = indexWriter.getReader();
    indexWriter.close();
}
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();
    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()
    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;
            final Thread mainThread = Thread.currentThread();
            if (field.equals("body")) {
                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:
                return new PostingsFormat(defaultPostingsFormat.getName()) {
                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);
                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                fieldsConsumer.write(fields);
                                boolean isMerge = state.context.context == IOContext.Context.MERGE;
                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;
                                // We iterate the provided TermsEnum
                                // twice, so we exercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();
                                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;
                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }
                                    String termString = term.utf8ToString();
                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));
                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }
                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }
                                        if (isMerge == false && addOnSecondPass) {
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }
                                        //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
public void test() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();

    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);

    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);

    for (int i = 0; i < MAXDOC; i++) {
        String stringValue = Integer.toString(i) + " verycommon "
                + English.intToEnglish(i).replace('-', ' ') + " "
                + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge

    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
private void doTestMixedPostings(Codec codec) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setCodec(codec);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // turn on vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    Field idField = new Field("id", "", ft);
    Field dateField = new Field("date", "", ft);
    doc.add(idField);
    doc.add(dateField);
    for (int i = 0; i < 100; i++) {
        idField.setStringValue(Integer.toString(random().nextInt(50)));
        dateField.setStringValue(Integer.toString(random().nextInt(100)));
        iw.addDocument(doc);
    }
    iw.close();
    dir.close(); // checkindex
}
From source file:com.sindicetech.siren.util.SirenTestCase.java
License:Open Source License
protected static void addDocument(final RandomIndexWriter writer, final String data) throws IOException {
    final Document doc = new Document();
    doc.add(new Field(DEFAULT_TEST_FIELD, data, newStoredFieldType()));
    writer.addDocument(doc);
    writer.commit();
}
From source file:com.sindicetech.siren.util.SirenTestCase.java
License:Open Source License
protected void addDocumentNoNorms(final RandomIndexWriter writer, final String data) throws IOException {
    final Document doc = new Document();
    doc.add(new Field(DEFAULT_TEST_FIELD, data, this.newStoredNoNormFieldType()));
    writer.addDocument(doc);
    writer.commit();
}
From source file:io.yucca.lucene.FieldRemoverTestCase.java
License:Apache License
/**
 * Create a new random test index in a directory on the file system.
 *
 * @param directoryPath
 *            file path of the index directory
 * @throws IOException
 */
public void newFSIndex(File directoryPath) throws IOException {
    Analyzer analyzer = new MockAnalyzer(random());
    Directory directory = FSDirectory.open(directoryPath);
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, analyzer);
    Document doc = new Document();
    doc.add(newTextField("dc_title", title, Field.Store.YES));
    doc.add(newTextField("dc_content", content, Field.Store.YES));
    doc.add(newTextField("dc_identifier", url, Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    directory.close();
}